commit 8462c4a571cdf6f6008273bd633923417b9106e2 Author: ModelHub XC Date: Wed Jun 10 15:40:17 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: boradorish/llama3-1B-sft Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..53553f8 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,45 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-1012/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1104/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1196/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1288/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1380/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1472/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1557/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-828/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-920/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..3268f1f --- /dev/null +++ b/README.md @@ -0,0 +1,61 @@ +--- +library_name: transformers +license: other +base_model: meta-llama/Llama-3.2-1B-Instruct +tags: +- llama-factory +- full +- generated_from_trainer +model-index: +- name: sft + results: [] +--- + + + +# sft + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on the sunny_reasoning dataset. + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 4e-05 +- train_batch_size: 4 +- eval_batch_size: 8 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 2 +- gradient_accumulation_steps: 4 +- total_train_batch_size: 32 +- total_eval_batch_size: 16 +- optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 3.0 + +### Training results + + + +### Framework versions + +- Transformers 4.56.2 +- Pytorch 2.11.0+cu128 +- Datasets 3.0.0 +- Tokenizers 0.22.2 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..62f8703 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.0, + "total_flos": 2.043435500286509e+18, + "train_loss": 0.016654981696585226, + "train_runtime": 5294.7714, + "train_samples_per_second": 9.403, + "train_steps_per_second": 0.294 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..1bad6a0 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-1012/chat_template.jinja b/checkpoint-1012/chat_template.jinja new file mode 100644 index 0000000..1bad6a0 --- /dev/null +++ b/checkpoint-1012/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-1012/config.json b/checkpoint-1012/config.json new file mode 100644 index 0000000..f8bf41e --- /dev/null +++ b/checkpoint-1012/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128009, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.56.2", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-1012/generation_config.json b/checkpoint-1012/generation_config.json new file mode 100644 index 0000000..2152026 --- /dev/null +++ b/checkpoint-1012/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128009, + 128001, + 128008, + 128009 + ], + "pad_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.56.2" +} diff --git a/checkpoint-1012/model.safetensors b/checkpoint-1012/model.safetensors new file mode 100644 index 0000000..964f6ef --- /dev/null +++ b/checkpoint-1012/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b8b7e4043342178df69a50d2e922139bde2e350984afbed16a89e6cf92bfebe +size 2996982344 diff --git a/checkpoint-1012/special_tokens_map.json b/checkpoint-1012/special_tokens_map.json new file mode 100644 index 0000000..14daf45 --- /dev/null +++ b/checkpoint-1012/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/checkpoint-1012/tokenizer.json b/checkpoint-1012/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/checkpoint-1012/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-1012/tokenizer_config.json b/checkpoint-1012/tokenizer_config.json new file mode 100644 index 0000000..d1e1ea9 --- /dev/null +++ b/checkpoint-1012/tokenizer_config.json @@ -0,0 +1,2068 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-1012/trainer_state.json b/checkpoint-1012/trainer_state.json new file mode 100644 index 0000000..571f549 --- /dev/null +++ b/checkpoint-1012/trainer_state.json @@ -0,0 +1,7118 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9503614457831326, + "eval_steps": 500, + "global_step": 1012, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0019277108433734939, + "grad_norm": 2.8518834114074707, + "learning_rate": 0.0, + "loss": 0.0891, + "step": 1 + }, + { + "epoch": 0.0038554216867469878, + "grad_norm": 1.8441249132156372, + "learning_rate": 2.564102564102564e-07, + "loss": 0.0539, + "step": 2 + }, + { + "epoch": 0.005783132530120482, + "grad_norm": 2.8263237476348877, + "learning_rate": 5.128205128205128e-07, + "loss": 0.099, + "step": 3 + }, + { + "epoch": 0.0077108433734939755, + "grad_norm": 2.5051236152648926, + "learning_rate": 7.692307692307694e-07, + "loss": 0.0789, + "step": 4 + }, + { + "epoch": 0.00963855421686747, + "grad_norm": 2.6903438568115234, + "learning_rate": 1.0256410256410257e-06, + "loss": 0.0881, + "step": 5 + }, + { + "epoch": 0.011566265060240964, + "grad_norm": 2.6205761432647705, + "learning_rate": 1.282051282051282e-06, + "loss": 0.0776, + "step": 6 + }, + { + "epoch": 0.013493975903614458, + "grad_norm": 2.6309337615966797, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.0827, + "step": 7 + }, + { + "epoch": 0.015421686746987951, + "grad_norm": 1.5427855253219604, + "learning_rate": 1.794871794871795e-06, + "loss": 0.0577, + "step": 8 + }, + { + "epoch": 0.017349397590361446, + "grad_norm": 1.0973446369171143, + "learning_rate": 2.0512820512820513e-06, + "loss": 0.04, + "step": 9 + }, + { + "epoch": 0.01927710843373494, + "grad_norm": 1.3253350257873535, + "learning_rate": 2.307692307692308e-06, + "loss": 0.0506, + "step": 10 + }, + { + "epoch": 0.021204819277108433, + "grad_norm": 1.588739037513733, + "learning_rate": 2.564102564102564e-06, + "loss": 0.0874, + "step": 11 + }, + { + "epoch": 0.02313253012048193, + "grad_norm": 1.4987014532089233, + "learning_rate": 2.8205128205128207e-06, + "loss": 0.0597, + "step": 12 + }, + { + "epoch": 0.02506024096385542, + "grad_norm": 1.6571592092514038, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.0559, + "step": 13 + }, + { + "epoch": 0.026987951807228915, + "grad_norm": 1.8860628604888916, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0688, + "step": 14 + }, + { + "epoch": 0.02891566265060241, + "grad_norm": 1.3202295303344727, + "learning_rate": 3.58974358974359e-06, + "loss": 0.0433, + "step": 15 + }, + { + "epoch": 0.030843373493975902, + "grad_norm": 1.5870612859725952, + "learning_rate": 3.846153846153847e-06, + "loss": 0.0695, + "step": 16 + }, + { + "epoch": 0.0327710843373494, + "grad_norm": 0.9192284345626831, + "learning_rate": 4.102564102564103e-06, + "loss": 0.0392, + "step": 17 + }, + { + "epoch": 0.03469879518072289, + "grad_norm": 0.7950155735015869, + "learning_rate": 4.358974358974359e-06, + "loss": 0.0351, + "step": 18 + }, + { + "epoch": 0.03662650602409639, + "grad_norm": 0.8854314684867859, + "learning_rate": 4.615384615384616e-06, + "loss": 0.0356, + "step": 19 + }, + { + "epoch": 0.03855421686746988, + "grad_norm": 0.9546788930892944, + "learning_rate": 4.871794871794872e-06, + "loss": 0.0427, + "step": 20 + }, + { + "epoch": 0.04048192771084337, + "grad_norm": 0.6315903663635254, + "learning_rate": 5.128205128205128e-06, + "loss": 0.0397, + "step": 21 + }, + { + "epoch": 0.042409638554216866, + "grad_norm": 0.9230924844741821, + "learning_rate": 5.384615384615385e-06, + "loss": 0.0481, + "step": 22 + }, + { + "epoch": 0.04433734939759036, + "grad_norm": 0.711546003818512, + "learning_rate": 5.641025641025641e-06, + "loss": 0.0479, + "step": 23 + }, + { + "epoch": 0.04626506024096386, + "grad_norm": 0.5288046598434448, + "learning_rate": 5.897435897435898e-06, + "loss": 0.0182, + "step": 24 + }, + { + "epoch": 0.04819277108433735, + "grad_norm": 0.9420496225357056, + "learning_rate": 6.153846153846155e-06, + "loss": 0.0389, + "step": 25 + }, + { + "epoch": 0.05012048192771084, + "grad_norm": 0.5001983046531677, + "learning_rate": 6.410256410256412e-06, + "loss": 0.0268, + "step": 26 + }, + { + "epoch": 0.052048192771084335, + "grad_norm": 0.8084653615951538, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0367, + "step": 27 + }, + { + "epoch": 0.05397590361445783, + "grad_norm": 0.7195103764533997, + "learning_rate": 6.923076923076923e-06, + "loss": 0.0251, + "step": 28 + }, + { + "epoch": 0.055903614457831326, + "grad_norm": 0.529958963394165, + "learning_rate": 7.17948717948718e-06, + "loss": 0.0289, + "step": 29 + }, + { + "epoch": 0.05783132530120482, + "grad_norm": 0.795376181602478, + "learning_rate": 7.435897435897437e-06, + "loss": 0.043, + "step": 30 + }, + { + "epoch": 0.059759036144578316, + "grad_norm": 0.6366249918937683, + "learning_rate": 7.692307692307694e-06, + "loss": 0.029, + "step": 31 + }, + { + "epoch": 0.061686746987951804, + "grad_norm": 0.5414115190505981, + "learning_rate": 7.948717948717949e-06, + "loss": 0.0365, + "step": 32 + }, + { + "epoch": 0.0636144578313253, + "grad_norm": 0.9350972175598145, + "learning_rate": 8.205128205128205e-06, + "loss": 0.0283, + "step": 33 + }, + { + "epoch": 0.0655421686746988, + "grad_norm": 0.5660741925239563, + "learning_rate": 8.461538461538462e-06, + "loss": 0.0234, + "step": 34 + }, + { + "epoch": 0.06746987951807229, + "grad_norm": 0.5623988509178162, + "learning_rate": 8.717948717948719e-06, + "loss": 0.0307, + "step": 35 + }, + { + "epoch": 0.06939759036144579, + "grad_norm": 0.5260195732116699, + "learning_rate": 8.974358974358976e-06, + "loss": 0.0264, + "step": 36 + }, + { + "epoch": 0.07132530120481928, + "grad_norm": 0.4934785068035126, + "learning_rate": 9.230769230769232e-06, + "loss": 0.0224, + "step": 37 + }, + { + "epoch": 0.07325301204819278, + "grad_norm": 0.4797322154045105, + "learning_rate": 9.487179487179487e-06, + "loss": 0.0163, + "step": 38 + }, + { + "epoch": 0.07518072289156627, + "grad_norm": 0.4739217460155487, + "learning_rate": 9.743589743589744e-06, + "loss": 0.0165, + "step": 39 + }, + { + "epoch": 0.07710843373493977, + "grad_norm": 0.4527677595615387, + "learning_rate": 1e-05, + "loss": 0.0163, + "step": 40 + }, + { + "epoch": 0.07903614457831325, + "grad_norm": 0.6241316795349121, + "learning_rate": 1.0256410256410256e-05, + "loss": 0.0302, + "step": 41 + }, + { + "epoch": 0.08096385542168674, + "grad_norm": 0.639043927192688, + "learning_rate": 1.0512820512820514e-05, + "loss": 0.0312, + "step": 42 + }, + { + "epoch": 0.08289156626506024, + "grad_norm": 0.5121409296989441, + "learning_rate": 1.076923076923077e-05, + "loss": 0.0256, + "step": 43 + }, + { + "epoch": 0.08481927710843373, + "grad_norm": 0.6340477466583252, + "learning_rate": 1.1025641025641028e-05, + "loss": 0.04, + "step": 44 + }, + { + "epoch": 0.08674698795180723, + "grad_norm": 0.5260409712791443, + "learning_rate": 1.1282051282051283e-05, + "loss": 0.0282, + "step": 45 + }, + { + "epoch": 0.08867469879518072, + "grad_norm": 0.6390711069107056, + "learning_rate": 1.1538461538461538e-05, + "loss": 0.0243, + "step": 46 + }, + { + "epoch": 0.09060240963855422, + "grad_norm": 0.46469295024871826, + "learning_rate": 1.1794871794871796e-05, + "loss": 0.0208, + "step": 47 + }, + { + "epoch": 0.09253012048192771, + "grad_norm": 0.8711516857147217, + "learning_rate": 1.2051282051282051e-05, + "loss": 0.0291, + "step": 48 + }, + { + "epoch": 0.09445783132530121, + "grad_norm": 0.9164300560951233, + "learning_rate": 1.230769230769231e-05, + "loss": 0.0342, + "step": 49 + }, + { + "epoch": 0.0963855421686747, + "grad_norm": 0.5401139259338379, + "learning_rate": 1.2564102564102565e-05, + "loss": 0.0185, + "step": 50 + }, + { + "epoch": 0.0983132530120482, + "grad_norm": 0.44393008947372437, + "learning_rate": 1.2820512820512823e-05, + "loss": 0.0228, + "step": 51 + }, + { + "epoch": 0.10024096385542168, + "grad_norm": 0.3855767846107483, + "learning_rate": 1.3076923076923078e-05, + "loss": 0.0176, + "step": 52 + }, + { + "epoch": 0.10216867469879518, + "grad_norm": 0.8561235070228577, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.0433, + "step": 53 + }, + { + "epoch": 0.10409638554216867, + "grad_norm": 0.768002450466156, + "learning_rate": 1.3589743589743592e-05, + "loss": 0.0245, + "step": 54 + }, + { + "epoch": 0.10602409638554217, + "grad_norm": 0.4559759497642517, + "learning_rate": 1.3846153846153847e-05, + "loss": 0.0224, + "step": 55 + }, + { + "epoch": 0.10795180722891566, + "grad_norm": 0.6203847527503967, + "learning_rate": 1.4102564102564105e-05, + "loss": 0.0296, + "step": 56 + }, + { + "epoch": 0.10987951807228916, + "grad_norm": 0.6651368141174316, + "learning_rate": 1.435897435897436e-05, + "loss": 0.0336, + "step": 57 + }, + { + "epoch": 0.11180722891566265, + "grad_norm": 0.377734512090683, + "learning_rate": 1.4615384615384615e-05, + "loss": 0.0196, + "step": 58 + }, + { + "epoch": 0.11373493975903615, + "grad_norm": 0.687568724155426, + "learning_rate": 1.4871794871794874e-05, + "loss": 0.0207, + "step": 59 + }, + { + "epoch": 0.11566265060240964, + "grad_norm": 0.7905604243278503, + "learning_rate": 1.5128205128205129e-05, + "loss": 0.047, + "step": 60 + }, + { + "epoch": 0.11759036144578314, + "grad_norm": 0.7938196063041687, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.0198, + "step": 61 + }, + { + "epoch": 0.11951807228915663, + "grad_norm": 0.41340553760528564, + "learning_rate": 1.5641025641025644e-05, + "loss": 0.0161, + "step": 62 + }, + { + "epoch": 0.12144578313253013, + "grad_norm": 0.5668172240257263, + "learning_rate": 1.5897435897435897e-05, + "loss": 0.0275, + "step": 63 + }, + { + "epoch": 0.12337349397590361, + "grad_norm": 0.48333367705345154, + "learning_rate": 1.6153846153846154e-05, + "loss": 0.0137, + "step": 64 + }, + { + "epoch": 0.12530120481927712, + "grad_norm": 0.6843933463096619, + "learning_rate": 1.641025641025641e-05, + "loss": 0.0294, + "step": 65 + }, + { + "epoch": 0.1272289156626506, + "grad_norm": 0.7789272665977478, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.0401, + "step": 66 + }, + { + "epoch": 0.1291566265060241, + "grad_norm": 0.6203492879867554, + "learning_rate": 1.6923076923076924e-05, + "loss": 0.0292, + "step": 67 + }, + { + "epoch": 0.1310843373493976, + "grad_norm": 0.5940662622451782, + "learning_rate": 1.717948717948718e-05, + "loss": 0.0178, + "step": 68 + }, + { + "epoch": 0.13301204819277107, + "grad_norm": 0.35504868626594543, + "learning_rate": 1.7435897435897438e-05, + "loss": 0.0129, + "step": 69 + }, + { + "epoch": 0.13493975903614458, + "grad_norm": 0.8796699643135071, + "learning_rate": 1.7692307692307694e-05, + "loss": 0.034, + "step": 70 + }, + { + "epoch": 0.13686746987951806, + "grad_norm": 0.967444896697998, + "learning_rate": 1.794871794871795e-05, + "loss": 0.0266, + "step": 71 + }, + { + "epoch": 0.13879518072289157, + "grad_norm": 0.4428526759147644, + "learning_rate": 1.8205128205128208e-05, + "loss": 0.0223, + "step": 72 + }, + { + "epoch": 0.14072289156626505, + "grad_norm": 0.42897751927375793, + "learning_rate": 1.8461538461538465e-05, + "loss": 0.0187, + "step": 73 + }, + { + "epoch": 0.14265060240963856, + "grad_norm": 0.5100914835929871, + "learning_rate": 1.8717948717948718e-05, + "loss": 0.0164, + "step": 74 + }, + { + "epoch": 0.14457831325301204, + "grad_norm": 0.6028861999511719, + "learning_rate": 1.8974358974358975e-05, + "loss": 0.0164, + "step": 75 + }, + { + "epoch": 0.14650602409638555, + "grad_norm": 0.6187024116516113, + "learning_rate": 1.923076923076923e-05, + "loss": 0.0296, + "step": 76 + }, + { + "epoch": 0.14843373493975903, + "grad_norm": 0.4822489619255066, + "learning_rate": 1.9487179487179488e-05, + "loss": 0.0148, + "step": 77 + }, + { + "epoch": 0.15036144578313254, + "grad_norm": 0.7231149673461914, + "learning_rate": 1.9743589743589745e-05, + "loss": 0.0395, + "step": 78 + }, + { + "epoch": 0.15228915662650602, + "grad_norm": 0.8409642577171326, + "learning_rate": 2e-05, + "loss": 0.0446, + "step": 79 + }, + { + "epoch": 0.15421686746987953, + "grad_norm": 0.4883500039577484, + "learning_rate": 2.025641025641026e-05, + "loss": 0.0206, + "step": 80 + }, + { + "epoch": 0.156144578313253, + "grad_norm": 0.6287479400634766, + "learning_rate": 2.0512820512820512e-05, + "loss": 0.0333, + "step": 81 + }, + { + "epoch": 0.1580722891566265, + "grad_norm": 0.5041632652282715, + "learning_rate": 2.0769230769230772e-05, + "loss": 0.0414, + "step": 82 + }, + { + "epoch": 0.16, + "grad_norm": 0.5103405117988586, + "learning_rate": 2.102564102564103e-05, + "loss": 0.045, + "step": 83 + }, + { + "epoch": 0.16192771084337348, + "grad_norm": 0.493161678314209, + "learning_rate": 2.1282051282051285e-05, + "loss": 0.021, + "step": 84 + }, + { + "epoch": 0.163855421686747, + "grad_norm": 0.908843994140625, + "learning_rate": 2.153846153846154e-05, + "loss": 0.0389, + "step": 85 + }, + { + "epoch": 0.16578313253012048, + "grad_norm": 0.5067003965377808, + "learning_rate": 2.1794871794871795e-05, + "loss": 0.0272, + "step": 86 + }, + { + "epoch": 0.16771084337349398, + "grad_norm": 0.5791381597518921, + "learning_rate": 2.2051282051282056e-05, + "loss": 0.0368, + "step": 87 + }, + { + "epoch": 0.16963855421686747, + "grad_norm": 0.7056036591529846, + "learning_rate": 2.230769230769231e-05, + "loss": 0.0284, + "step": 88 + }, + { + "epoch": 0.17156626506024097, + "grad_norm": 0.6563822031021118, + "learning_rate": 2.2564102564102566e-05, + "loss": 0.0646, + "step": 89 + }, + { + "epoch": 0.17349397590361446, + "grad_norm": 0.9483286142349243, + "learning_rate": 2.2820512820512822e-05, + "loss": 0.0439, + "step": 90 + }, + { + "epoch": 0.17542168674698796, + "grad_norm": 0.370664119720459, + "learning_rate": 2.3076923076923076e-05, + "loss": 0.0109, + "step": 91 + }, + { + "epoch": 0.17734939759036145, + "grad_norm": 0.9776477813720703, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.0458, + "step": 92 + }, + { + "epoch": 0.17927710843373493, + "grad_norm": 0.45710092782974243, + "learning_rate": 2.3589743589743593e-05, + "loss": 0.0212, + "step": 93 + }, + { + "epoch": 0.18120481927710844, + "grad_norm": 0.8623896837234497, + "learning_rate": 2.384615384615385e-05, + "loss": 0.0215, + "step": 94 + }, + { + "epoch": 0.18313253012048192, + "grad_norm": 0.55814528465271, + "learning_rate": 2.4102564102564103e-05, + "loss": 0.0218, + "step": 95 + }, + { + "epoch": 0.18506024096385543, + "grad_norm": 0.49882641434669495, + "learning_rate": 2.435897435897436e-05, + "loss": 0.0268, + "step": 96 + }, + { + "epoch": 0.1869879518072289, + "grad_norm": 0.3508654534816742, + "learning_rate": 2.461538461538462e-05, + "loss": 0.0172, + "step": 97 + }, + { + "epoch": 0.18891566265060242, + "grad_norm": 0.601170003414154, + "learning_rate": 2.4871794871794873e-05, + "loss": 0.0208, + "step": 98 + }, + { + "epoch": 0.1908433734939759, + "grad_norm": 1.1748133897781372, + "learning_rate": 2.512820512820513e-05, + "loss": 0.0259, + "step": 99 + }, + { + "epoch": 0.1927710843373494, + "grad_norm": 0.46370384097099304, + "learning_rate": 2.5384615384615386e-05, + "loss": 0.0242, + "step": 100 + }, + { + "epoch": 0.1946987951807229, + "grad_norm": 0.525010883808136, + "learning_rate": 2.5641025641025646e-05, + "loss": 0.0188, + "step": 101 + }, + { + "epoch": 0.1966265060240964, + "grad_norm": 0.766501784324646, + "learning_rate": 2.58974358974359e-05, + "loss": 0.0584, + "step": 102 + }, + { + "epoch": 0.19855421686746988, + "grad_norm": 0.3572964370250702, + "learning_rate": 2.6153846153846157e-05, + "loss": 0.0131, + "step": 103 + }, + { + "epoch": 0.20048192771084336, + "grad_norm": 0.6467130780220032, + "learning_rate": 2.6410256410256413e-05, + "loss": 0.0231, + "step": 104 + }, + { + "epoch": 0.20240963855421687, + "grad_norm": 1.1852102279663086, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.027, + "step": 105 + }, + { + "epoch": 0.20433734939759035, + "grad_norm": 2.3659932613372803, + "learning_rate": 2.6923076923076927e-05, + "loss": 0.0224, + "step": 106 + }, + { + "epoch": 0.20626506024096386, + "grad_norm": 0.5343687534332275, + "learning_rate": 2.7179487179487183e-05, + "loss": 0.0198, + "step": 107 + }, + { + "epoch": 0.20819277108433734, + "grad_norm": 1.852160096168518, + "learning_rate": 2.7435897435897437e-05, + "loss": 0.032, + "step": 108 + }, + { + "epoch": 0.21012048192771085, + "grad_norm": 0.47291702032089233, + "learning_rate": 2.7692307692307694e-05, + "loss": 0.0117, + "step": 109 + }, + { + "epoch": 0.21204819277108433, + "grad_norm": 0.7623187899589539, + "learning_rate": 2.794871794871795e-05, + "loss": 0.0337, + "step": 110 + }, + { + "epoch": 0.21397590361445784, + "grad_norm": 0.5272570848464966, + "learning_rate": 2.820512820512821e-05, + "loss": 0.0131, + "step": 111 + }, + { + "epoch": 0.21590361445783132, + "grad_norm": 0.5568500757217407, + "learning_rate": 2.8461538461538464e-05, + "loss": 0.0233, + "step": 112 + }, + { + "epoch": 0.21783132530120483, + "grad_norm": 0.4008469879627228, + "learning_rate": 2.871794871794872e-05, + "loss": 0.0204, + "step": 113 + }, + { + "epoch": 0.2197590361445783, + "grad_norm": 0.4888612926006317, + "learning_rate": 2.8974358974358977e-05, + "loss": 0.016, + "step": 114 + }, + { + "epoch": 0.2216867469879518, + "grad_norm": 0.44903355836868286, + "learning_rate": 2.923076923076923e-05, + "loss": 0.0135, + "step": 115 + }, + { + "epoch": 0.2236144578313253, + "grad_norm": 0.9266762733459473, + "learning_rate": 2.948717948717949e-05, + "loss": 0.0233, + "step": 116 + }, + { + "epoch": 0.22554216867469878, + "grad_norm": 0.5352638959884644, + "learning_rate": 2.9743589743589747e-05, + "loss": 0.0198, + "step": 117 + }, + { + "epoch": 0.2274698795180723, + "grad_norm": 0.6051343679428101, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0246, + "step": 118 + }, + { + "epoch": 0.22939759036144577, + "grad_norm": 0.9971133470535278, + "learning_rate": 3.0256410256410257e-05, + "loss": 0.025, + "step": 119 + }, + { + "epoch": 0.23132530120481928, + "grad_norm": 0.704236626625061, + "learning_rate": 3.0512820512820514e-05, + "loss": 0.031, + "step": 120 + }, + { + "epoch": 0.23325301204819276, + "grad_norm": 0.6137097477912903, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.0519, + "step": 121 + }, + { + "epoch": 0.23518072289156627, + "grad_norm": 0.7396159768104553, + "learning_rate": 3.102564102564103e-05, + "loss": 0.0325, + "step": 122 + }, + { + "epoch": 0.23710843373493976, + "grad_norm": 1.3282053470611572, + "learning_rate": 3.128205128205129e-05, + "loss": 0.0252, + "step": 123 + }, + { + "epoch": 0.23903614457831326, + "grad_norm": 0.5220731496810913, + "learning_rate": 3.153846153846154e-05, + "loss": 0.0262, + "step": 124 + }, + { + "epoch": 0.24096385542168675, + "grad_norm": 0.5357242822647095, + "learning_rate": 3.1794871794871795e-05, + "loss": 0.0243, + "step": 125 + }, + { + "epoch": 0.24289156626506025, + "grad_norm": 0.48207753896713257, + "learning_rate": 3.205128205128206e-05, + "loss": 0.0178, + "step": 126 + }, + { + "epoch": 0.24481927710843374, + "grad_norm": 0.552988588809967, + "learning_rate": 3.230769230769231e-05, + "loss": 0.023, + "step": 127 + }, + { + "epoch": 0.24674698795180722, + "grad_norm": 1.7962840795516968, + "learning_rate": 3.2564102564102565e-05, + "loss": 0.032, + "step": 128 + }, + { + "epoch": 0.24867469879518073, + "grad_norm": 1.6404600143432617, + "learning_rate": 3.282051282051282e-05, + "loss": 0.0231, + "step": 129 + }, + { + "epoch": 0.25060240963855424, + "grad_norm": 0.39142486453056335, + "learning_rate": 3.307692307692308e-05, + "loss": 0.0147, + "step": 130 + }, + { + "epoch": 0.2525301204819277, + "grad_norm": 1.3272887468338013, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0439, + "step": 131 + }, + { + "epoch": 0.2544578313253012, + "grad_norm": 1.5122811794281006, + "learning_rate": 3.358974358974359e-05, + "loss": 0.0282, + "step": 132 + }, + { + "epoch": 0.2563855421686747, + "grad_norm": 1.8542430400848389, + "learning_rate": 3.384615384615385e-05, + "loss": 0.0515, + "step": 133 + }, + { + "epoch": 0.2583132530120482, + "grad_norm": 4.059277534484863, + "learning_rate": 3.4102564102564105e-05, + "loss": 0.0781, + "step": 134 + }, + { + "epoch": 0.26024096385542167, + "grad_norm": 0.6206214427947998, + "learning_rate": 3.435897435897436e-05, + "loss": 0.0306, + "step": 135 + }, + { + "epoch": 0.2621686746987952, + "grad_norm": 0.4575510323047638, + "learning_rate": 3.461538461538462e-05, + "loss": 0.0154, + "step": 136 + }, + { + "epoch": 0.2640963855421687, + "grad_norm": 1.1556978225708008, + "learning_rate": 3.4871794871794875e-05, + "loss": 0.0235, + "step": 137 + }, + { + "epoch": 0.26602409638554214, + "grad_norm": 0.6975051760673523, + "learning_rate": 3.512820512820513e-05, + "loss": 0.0453, + "step": 138 + }, + { + "epoch": 0.26795180722891565, + "grad_norm": 0.8686623573303223, + "learning_rate": 3.538461538461539e-05, + "loss": 0.0427, + "step": 139 + }, + { + "epoch": 0.26987951807228916, + "grad_norm": 2.0681848526000977, + "learning_rate": 3.5641025641025646e-05, + "loss": 0.04, + "step": 140 + }, + { + "epoch": 0.27180722891566267, + "grad_norm": 0.4397984445095062, + "learning_rate": 3.58974358974359e-05, + "loss": 0.0188, + "step": 141 + }, + { + "epoch": 0.2737349397590361, + "grad_norm": 0.5871334075927734, + "learning_rate": 3.615384615384616e-05, + "loss": 0.0253, + "step": 142 + }, + { + "epoch": 0.27566265060240963, + "grad_norm": 1.1078568696975708, + "learning_rate": 3.6410256410256416e-05, + "loss": 0.0316, + "step": 143 + }, + { + "epoch": 0.27759036144578314, + "grad_norm": 0.5691841840744019, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.0266, + "step": 144 + }, + { + "epoch": 0.27951807228915665, + "grad_norm": 0.7896255254745483, + "learning_rate": 3.692307692307693e-05, + "loss": 0.0281, + "step": 145 + }, + { + "epoch": 0.2814457831325301, + "grad_norm": 0.9988337159156799, + "learning_rate": 3.7179487179487186e-05, + "loss": 0.0295, + "step": 146 + }, + { + "epoch": 0.2833734939759036, + "grad_norm": 0.9811834692955017, + "learning_rate": 3.7435897435897436e-05, + "loss": 0.0322, + "step": 147 + }, + { + "epoch": 0.2853012048192771, + "grad_norm": 0.6503105759620667, + "learning_rate": 3.769230769230769e-05, + "loss": 0.0266, + "step": 148 + }, + { + "epoch": 0.28722891566265063, + "grad_norm": 1.9164355993270874, + "learning_rate": 3.794871794871795e-05, + "loss": 0.0677, + "step": 149 + }, + { + "epoch": 0.2891566265060241, + "grad_norm": 1.1724557876586914, + "learning_rate": 3.820512820512821e-05, + "loss": 0.0324, + "step": 150 + }, + { + "epoch": 0.2910843373493976, + "grad_norm": 0.8482469916343689, + "learning_rate": 3.846153846153846e-05, + "loss": 0.0259, + "step": 151 + }, + { + "epoch": 0.2930120481927711, + "grad_norm": 0.8572830557823181, + "learning_rate": 3.871794871794872e-05, + "loss": 0.0358, + "step": 152 + }, + { + "epoch": 0.29493975903614456, + "grad_norm": 0.6630825400352478, + "learning_rate": 3.8974358974358976e-05, + "loss": 0.0447, + "step": 153 + }, + { + "epoch": 0.29686746987951806, + "grad_norm": 0.9197093844413757, + "learning_rate": 3.923076923076923e-05, + "loss": 0.0409, + "step": 154 + }, + { + "epoch": 0.2987951807228916, + "grad_norm": 0.6976819634437561, + "learning_rate": 3.948717948717949e-05, + "loss": 0.0317, + "step": 155 + }, + { + "epoch": 0.3007228915662651, + "grad_norm": 0.7353514432907104, + "learning_rate": 3.9743589743589747e-05, + "loss": 0.0306, + "step": 156 + }, + { + "epoch": 0.30265060240963854, + "grad_norm": 0.5730232000350952, + "learning_rate": 4e-05, + "loss": 0.0324, + "step": 157 + }, + { + "epoch": 0.30457831325301205, + "grad_norm": 0.7852078676223755, + "learning_rate": 3.999994971675547e-05, + "loss": 0.0354, + "step": 158 + }, + { + "epoch": 0.30650602409638555, + "grad_norm": 0.5924715399742126, + "learning_rate": 3.999979886727471e-05, + "loss": 0.0366, + "step": 159 + }, + { + "epoch": 0.30843373493975906, + "grad_norm": 0.7359845638275146, + "learning_rate": 3.999954745231624e-05, + "loss": 0.0437, + "step": 160 + }, + { + "epoch": 0.3103614457831325, + "grad_norm": 0.7866976857185364, + "learning_rate": 3.999919547314426e-05, + "loss": 0.0363, + "step": 161 + }, + { + "epoch": 0.312289156626506, + "grad_norm": 0.7425745129585266, + "learning_rate": 3.999874293152863e-05, + "loss": 0.0259, + "step": 162 + }, + { + "epoch": 0.31421686746987953, + "grad_norm": 1.8922245502471924, + "learning_rate": 3.9998189829744885e-05, + "loss": 0.0341, + "step": 163 + }, + { + "epoch": 0.316144578313253, + "grad_norm": 0.7908634543418884, + "learning_rate": 3.99975361705742e-05, + "loss": 0.0424, + "step": 164 + }, + { + "epoch": 0.3180722891566265, + "grad_norm": 2.047368049621582, + "learning_rate": 3.999678195730337e-05, + "loss": 0.0535, + "step": 165 + }, + { + "epoch": 0.32, + "grad_norm": 0.5702639222145081, + "learning_rate": 3.999592719372484e-05, + "loss": 0.0284, + "step": 166 + }, + { + "epoch": 0.3219277108433735, + "grad_norm": 0.45015648007392883, + "learning_rate": 3.9994971884136636e-05, + "loss": 0.0313, + "step": 167 + }, + { + "epoch": 0.32385542168674697, + "grad_norm": 4.094679355621338, + "learning_rate": 3.9993916033342355e-05, + "loss": 0.0524, + "step": 168 + }, + { + "epoch": 0.3257831325301205, + "grad_norm": 0.800846517086029, + "learning_rate": 3.999275964665117e-05, + "loss": 0.0282, + "step": 169 + }, + { + "epoch": 0.327710843373494, + "grad_norm": 0.47881078720092773, + "learning_rate": 3.999150272987776e-05, + "loss": 0.0293, + "step": 170 + }, + { + "epoch": 0.3296385542168675, + "grad_norm": 0.5716657638549805, + "learning_rate": 3.999014528934232e-05, + "loss": 0.0221, + "step": 171 + }, + { + "epoch": 0.33156626506024095, + "grad_norm": 0.6333311200141907, + "learning_rate": 3.998868733187048e-05, + "loss": 0.0302, + "step": 172 + }, + { + "epoch": 0.33349397590361446, + "grad_norm": 6.642521858215332, + "learning_rate": 3.998712886479335e-05, + "loss": 0.0364, + "step": 173 + }, + { + "epoch": 0.33542168674698797, + "grad_norm": 0.7515506148338318, + "learning_rate": 3.998546989594739e-05, + "loss": 0.0296, + "step": 174 + }, + { + "epoch": 0.3373493975903614, + "grad_norm": 1.0728015899658203, + "learning_rate": 3.998371043367445e-05, + "loss": 0.0549, + "step": 175 + }, + { + "epoch": 0.33927710843373493, + "grad_norm": 1.3025579452514648, + "learning_rate": 3.998185048682166e-05, + "loss": 0.0577, + "step": 176 + }, + { + "epoch": 0.34120481927710844, + "grad_norm": 1.0962958335876465, + "learning_rate": 3.997989006474144e-05, + "loss": 0.0313, + "step": 177 + }, + { + "epoch": 0.34313253012048195, + "grad_norm": 0.7064313292503357, + "learning_rate": 3.997782917729143e-05, + "loss": 0.0309, + "step": 178 + }, + { + "epoch": 0.3450602409638554, + "grad_norm": 0.43374207615852356, + "learning_rate": 3.997566783483445e-05, + "loss": 0.0166, + "step": 179 + }, + { + "epoch": 0.3469879518072289, + "grad_norm": 0.7236390113830566, + "learning_rate": 3.9973406048238413e-05, + "loss": 0.0254, + "step": 180 + }, + { + "epoch": 0.3489156626506024, + "grad_norm": 0.5041500926017761, + "learning_rate": 3.9971043828876334e-05, + "loss": 0.0239, + "step": 181 + }, + { + "epoch": 0.35084337349397593, + "grad_norm": 1.2744532823562622, + "learning_rate": 3.9968581188626204e-05, + "loss": 0.0404, + "step": 182 + }, + { + "epoch": 0.3527710843373494, + "grad_norm": 0.45845362544059753, + "learning_rate": 3.996601813987098e-05, + "loss": 0.0127, + "step": 183 + }, + { + "epoch": 0.3546987951807229, + "grad_norm": 0.4426881968975067, + "learning_rate": 3.996335469549852e-05, + "loss": 0.0176, + "step": 184 + }, + { + "epoch": 0.3566265060240964, + "grad_norm": 1.0030732154846191, + "learning_rate": 3.9960590868901465e-05, + "loss": 0.0457, + "step": 185 + }, + { + "epoch": 0.35855421686746985, + "grad_norm": 0.6428582668304443, + "learning_rate": 3.995772667397725e-05, + "loss": 0.0271, + "step": 186 + }, + { + "epoch": 0.36048192771084336, + "grad_norm": 0.5335744619369507, + "learning_rate": 3.995476212512795e-05, + "loss": 0.0297, + "step": 187 + }, + { + "epoch": 0.3624096385542169, + "grad_norm": 0.6995761394500732, + "learning_rate": 3.99516972372603e-05, + "loss": 0.0322, + "step": 188 + }, + { + "epoch": 0.3643373493975904, + "grad_norm": 0.765511155128479, + "learning_rate": 3.9948532025785546e-05, + "loss": 0.0253, + "step": 189 + }, + { + "epoch": 0.36626506024096384, + "grad_norm": 0.6165828108787537, + "learning_rate": 3.9945266506619403e-05, + "loss": 0.0355, + "step": 190 + }, + { + "epoch": 0.36819277108433734, + "grad_norm": 0.851970911026001, + "learning_rate": 3.994190069618195e-05, + "loss": 0.056, + "step": 191 + }, + { + "epoch": 0.37012048192771085, + "grad_norm": 0.9850023984909058, + "learning_rate": 3.993843461139757e-05, + "loss": 0.0415, + "step": 192 + }, + { + "epoch": 0.37204819277108436, + "grad_norm": 0.7455295324325562, + "learning_rate": 3.9934868269694886e-05, + "loss": 0.0379, + "step": 193 + }, + { + "epoch": 0.3739759036144578, + "grad_norm": 1.159469723701477, + "learning_rate": 3.9931201689006595e-05, + "loss": 0.0237, + "step": 194 + }, + { + "epoch": 0.3759036144578313, + "grad_norm": 0.5490080118179321, + "learning_rate": 3.992743488776947e-05, + "loss": 0.024, + "step": 195 + }, + { + "epoch": 0.37783132530120483, + "grad_norm": 1.279831886291504, + "learning_rate": 3.992356788492421e-05, + "loss": 0.0273, + "step": 196 + }, + { + "epoch": 0.3797590361445783, + "grad_norm": 0.859104335308075, + "learning_rate": 3.9919600699915355e-05, + "loss": 0.0411, + "step": 197 + }, + { + "epoch": 0.3816867469879518, + "grad_norm": 1.2525300979614258, + "learning_rate": 3.991553335269119e-05, + "loss": 0.0857, + "step": 198 + }, + { + "epoch": 0.3836144578313253, + "grad_norm": 0.4924193024635315, + "learning_rate": 3.991136586370367e-05, + "loss": 0.0294, + "step": 199 + }, + { + "epoch": 0.3855421686746988, + "grad_norm": 1.417190670967102, + "learning_rate": 3.990709825390828e-05, + "loss": 0.0395, + "step": 200 + }, + { + "epoch": 0.38746987951807227, + "grad_norm": 0.6172056198120117, + "learning_rate": 3.9902730544763936e-05, + "loss": 0.0194, + "step": 201 + }, + { + "epoch": 0.3893975903614458, + "grad_norm": 0.7292149662971497, + "learning_rate": 3.989826275823291e-05, + "loss": 0.0381, + "step": 202 + }, + { + "epoch": 0.3913253012048193, + "grad_norm": 0.5949816107749939, + "learning_rate": 3.989369491678067e-05, + "loss": 0.0254, + "step": 203 + }, + { + "epoch": 0.3932530120481928, + "grad_norm": 0.6012582182884216, + "learning_rate": 3.988902704337582e-05, + "loss": 0.048, + "step": 204 + }, + { + "epoch": 0.39518072289156625, + "grad_norm": 0.6273590922355652, + "learning_rate": 3.9884259161489936e-05, + "loss": 0.0268, + "step": 205 + }, + { + "epoch": 0.39710843373493976, + "grad_norm": 0.9615244269371033, + "learning_rate": 3.987939129509746e-05, + "loss": 0.0192, + "step": 206 + }, + { + "epoch": 0.39903614457831327, + "grad_norm": 0.6009241342544556, + "learning_rate": 3.9874423468675624e-05, + "loss": 0.0362, + "step": 207 + }, + { + "epoch": 0.4009638554216867, + "grad_norm": 0.411335289478302, + "learning_rate": 3.9869355707204266e-05, + "loss": 0.017, + "step": 208 + }, + { + "epoch": 0.40289156626506023, + "grad_norm": 0.6151527166366577, + "learning_rate": 3.986418803616573e-05, + "loss": 0.0283, + "step": 209 + }, + { + "epoch": 0.40481927710843374, + "grad_norm": 0.33808204531669617, + "learning_rate": 3.985892048154474e-05, + "loss": 0.0158, + "step": 210 + }, + { + "epoch": 0.40674698795180725, + "grad_norm": 0.5464187860488892, + "learning_rate": 3.9853553069828284e-05, + "loss": 0.0292, + "step": 211 + }, + { + "epoch": 0.4086746987951807, + "grad_norm": 0.6658390760421753, + "learning_rate": 3.984808582800543e-05, + "loss": 0.0281, + "step": 212 + }, + { + "epoch": 0.4106024096385542, + "grad_norm": 0.4253764748573303, + "learning_rate": 3.984251878356726e-05, + "loss": 0.031, + "step": 213 + }, + { + "epoch": 0.4125301204819277, + "grad_norm": 0.32309481501579285, + "learning_rate": 3.983685196450667e-05, + "loss": 0.0166, + "step": 214 + }, + { + "epoch": 0.41445783132530123, + "grad_norm": 0.43756410479545593, + "learning_rate": 3.9831085399318265e-05, + "loss": 0.0326, + "step": 215 + }, + { + "epoch": 0.4163855421686747, + "grad_norm": 0.264046847820282, + "learning_rate": 3.982521911699822e-05, + "loss": 0.0118, + "step": 216 + }, + { + "epoch": 0.4183132530120482, + "grad_norm": 0.8630897402763367, + "learning_rate": 3.9819253147044084e-05, + "loss": 0.0246, + "step": 217 + }, + { + "epoch": 0.4202409638554217, + "grad_norm": 0.6923379898071289, + "learning_rate": 3.98131875194547e-05, + "loss": 0.036, + "step": 218 + }, + { + "epoch": 0.42216867469879515, + "grad_norm": 0.5874778628349304, + "learning_rate": 3.9807022264730024e-05, + "loss": 0.0255, + "step": 219 + }, + { + "epoch": 0.42409638554216866, + "grad_norm": 0.394336074590683, + "learning_rate": 3.980075741387094e-05, + "loss": 0.0187, + "step": 220 + }, + { + "epoch": 0.4260240963855422, + "grad_norm": 0.6300327777862549, + "learning_rate": 3.979439299837915e-05, + "loss": 0.0214, + "step": 221 + }, + { + "epoch": 0.4279518072289157, + "grad_norm": 0.5200467109680176, + "learning_rate": 3.978792905025702e-05, + "loss": 0.0628, + "step": 222 + }, + { + "epoch": 0.42987951807228914, + "grad_norm": 0.5713880062103271, + "learning_rate": 3.978136560200735e-05, + "loss": 0.0302, + "step": 223 + }, + { + "epoch": 0.43180722891566264, + "grad_norm": 0.5345383286476135, + "learning_rate": 3.977470268663331e-05, + "loss": 0.0125, + "step": 224 + }, + { + "epoch": 0.43373493975903615, + "grad_norm": 0.5378350019454956, + "learning_rate": 3.976794033763819e-05, + "loss": 0.0246, + "step": 225 + }, + { + "epoch": 0.43566265060240966, + "grad_norm": 0.5554935336112976, + "learning_rate": 3.9761078589025276e-05, + "loss": 0.0212, + "step": 226 + }, + { + "epoch": 0.4375903614457831, + "grad_norm": 0.2832634747028351, + "learning_rate": 3.9754117475297664e-05, + "loss": 0.0125, + "step": 227 + }, + { + "epoch": 0.4395180722891566, + "grad_norm": 1.2910150289535522, + "learning_rate": 3.97470570314581e-05, + "loss": 0.0364, + "step": 228 + }, + { + "epoch": 0.44144578313253013, + "grad_norm": 0.3731018602848053, + "learning_rate": 3.973989729300878e-05, + "loss": 0.0128, + "step": 229 + }, + { + "epoch": 0.4433734939759036, + "grad_norm": 0.9433871507644653, + "learning_rate": 3.9732638295951195e-05, + "loss": 0.0367, + "step": 230 + }, + { + "epoch": 0.4453012048192771, + "grad_norm": 1.0779197216033936, + "learning_rate": 3.972528007678594e-05, + "loss": 0.0667, + "step": 231 + }, + { + "epoch": 0.4472289156626506, + "grad_norm": 1.7009105682373047, + "learning_rate": 3.9717822672512516e-05, + "loss": 0.0655, + "step": 232 + }, + { + "epoch": 0.4491566265060241, + "grad_norm": 0.5646032094955444, + "learning_rate": 3.971026612062919e-05, + "loss": 0.064, + "step": 233 + }, + { + "epoch": 0.45108433734939757, + "grad_norm": 0.44474121928215027, + "learning_rate": 3.970261045913274e-05, + "loss": 0.0206, + "step": 234 + }, + { + "epoch": 0.4530120481927711, + "grad_norm": 1.3969277143478394, + "learning_rate": 3.969485572651833e-05, + "loss": 0.0486, + "step": 235 + }, + { + "epoch": 0.4549397590361446, + "grad_norm": 0.6401994228363037, + "learning_rate": 3.968700196177925e-05, + "loss": 0.0262, + "step": 236 + }, + { + "epoch": 0.4568674698795181, + "grad_norm": 0.7091913223266602, + "learning_rate": 3.96790492044068e-05, + "loss": 0.014, + "step": 237 + }, + { + "epoch": 0.45879518072289155, + "grad_norm": 0.6561547517776489, + "learning_rate": 3.967099749439002e-05, + "loss": 0.0482, + "step": 238 + }, + { + "epoch": 0.46072289156626506, + "grad_norm": 0.6924155354499817, + "learning_rate": 3.966284687221551e-05, + "loss": 0.0289, + "step": 239 + }, + { + "epoch": 0.46265060240963857, + "grad_norm": 0.5868663787841797, + "learning_rate": 3.9654597378867256e-05, + "loss": 0.0331, + "step": 240 + }, + { + "epoch": 0.464578313253012, + "grad_norm": 0.7930939793586731, + "learning_rate": 3.964624905582637e-05, + "loss": 0.0925, + "step": 241 + }, + { + "epoch": 0.46650602409638553, + "grad_norm": 0.4888836145401001, + "learning_rate": 3.9637801945070944e-05, + "loss": 0.015, + "step": 242 + }, + { + "epoch": 0.46843373493975904, + "grad_norm": 0.7820287346839905, + "learning_rate": 3.962925608907579e-05, + "loss": 0.0382, + "step": 243 + }, + { + "epoch": 0.47036144578313255, + "grad_norm": 0.4914316236972809, + "learning_rate": 3.962061153081224e-05, + "loss": 0.0257, + "step": 244 + }, + { + "epoch": 0.472289156626506, + "grad_norm": 0.5681505799293518, + "learning_rate": 3.961186831374793e-05, + "loss": 0.0551, + "step": 245 + }, + { + "epoch": 0.4742168674698795, + "grad_norm": 0.5049723386764526, + "learning_rate": 3.9603026481846616e-05, + "loss": 0.0186, + "step": 246 + }, + { + "epoch": 0.476144578313253, + "grad_norm": 0.5034119486808777, + "learning_rate": 3.959408607956787e-05, + "loss": 0.024, + "step": 247 + }, + { + "epoch": 0.47807228915662653, + "grad_norm": 0.4543336033821106, + "learning_rate": 3.958504715186695e-05, + "loss": 0.0256, + "step": 248 + }, + { + "epoch": 0.48, + "grad_norm": 0.5595743656158447, + "learning_rate": 3.957590974419452e-05, + "loss": 0.0222, + "step": 249 + }, + { + "epoch": 0.4819277108433735, + "grad_norm": 0.5701581239700317, + "learning_rate": 3.956667390249642e-05, + "loss": 0.0334, + "step": 250 + }, + { + "epoch": 0.483855421686747, + "grad_norm": 0.53755784034729, + "learning_rate": 3.9557339673213474e-05, + "loss": 0.0345, + "step": 251 + }, + { + "epoch": 0.4857831325301205, + "grad_norm": 0.4368877112865448, + "learning_rate": 3.95479071032812e-05, + "loss": 0.0183, + "step": 252 + }, + { + "epoch": 0.48771084337349396, + "grad_norm": 0.7972906827926636, + "learning_rate": 3.953837624012963e-05, + "loss": 0.0337, + "step": 253 + }, + { + "epoch": 0.48963855421686747, + "grad_norm": 0.6148451566696167, + "learning_rate": 3.9528747131683023e-05, + "loss": 0.0524, + "step": 254 + }, + { + "epoch": 0.491566265060241, + "grad_norm": 0.500840961933136, + "learning_rate": 3.9519019826359676e-05, + "loss": 0.0248, + "step": 255 + }, + { + "epoch": 0.49349397590361443, + "grad_norm": 0.5536255240440369, + "learning_rate": 3.9509194373071624e-05, + "loss": 0.0219, + "step": 256 + }, + { + "epoch": 0.49542168674698794, + "grad_norm": 0.6873176097869873, + "learning_rate": 3.9499270821224444e-05, + "loss": 0.0312, + "step": 257 + }, + { + "epoch": 0.49734939759036145, + "grad_norm": 0.37207168340682983, + "learning_rate": 3.9489249220716974e-05, + "loss": 0.0149, + "step": 258 + }, + { + "epoch": 0.49927710843373496, + "grad_norm": 0.4458799660205841, + "learning_rate": 3.947912962194107e-05, + "loss": 0.0214, + "step": 259 + }, + { + "epoch": 0.5012048192771085, + "grad_norm": 0.4272724390029907, + "learning_rate": 3.9468912075781345e-05, + "loss": 0.0263, + "step": 260 + }, + { + "epoch": 0.503132530120482, + "grad_norm": 0.5245792269706726, + "learning_rate": 3.945859663361496e-05, + "loss": 0.0103, + "step": 261 + }, + { + "epoch": 0.5050602409638554, + "grad_norm": 0.8799260854721069, + "learning_rate": 3.9448183347311284e-05, + "loss": 0.0292, + "step": 262 + }, + { + "epoch": 0.5069879518072289, + "grad_norm": 0.5996833443641663, + "learning_rate": 3.943767226923171e-05, + "loss": 0.0306, + "step": 263 + }, + { + "epoch": 0.5089156626506024, + "grad_norm": 0.6044682860374451, + "learning_rate": 3.942706345222935e-05, + "loss": 0.0218, + "step": 264 + }, + { + "epoch": 0.5108433734939759, + "grad_norm": 0.4770200848579407, + "learning_rate": 3.941635694964878e-05, + "loss": 0.0226, + "step": 265 + }, + { + "epoch": 0.5127710843373494, + "grad_norm": 0.5605704188346863, + "learning_rate": 3.940555281532576e-05, + "loss": 0.0354, + "step": 266 + }, + { + "epoch": 0.5146987951807229, + "grad_norm": 0.46532443165779114, + "learning_rate": 3.939465110358699e-05, + "loss": 0.0223, + "step": 267 + }, + { + "epoch": 0.5166265060240964, + "grad_norm": 0.5190595388412476, + "learning_rate": 3.93836518692498e-05, + "loss": 0.0219, + "step": 268 + }, + { + "epoch": 0.5185542168674698, + "grad_norm": 0.5767757892608643, + "learning_rate": 3.937255516762193e-05, + "loss": 0.0294, + "step": 269 + }, + { + "epoch": 0.5204819277108433, + "grad_norm": 0.4543164372444153, + "learning_rate": 3.936136105450119e-05, + "loss": 0.0244, + "step": 270 + }, + { + "epoch": 0.5224096385542168, + "grad_norm": 0.4155154526233673, + "learning_rate": 3.9350069586175195e-05, + "loss": 0.02, + "step": 271 + }, + { + "epoch": 0.5243373493975904, + "grad_norm": 0.5470768213272095, + "learning_rate": 3.933868081942113e-05, + "loss": 0.0187, + "step": 272 + }, + { + "epoch": 0.5262650602409639, + "grad_norm": 0.9491772651672363, + "learning_rate": 3.9327194811505406e-05, + "loss": 0.0337, + "step": 273 + }, + { + "epoch": 0.5281927710843374, + "grad_norm": 0.9313873052597046, + "learning_rate": 3.93156116201834e-05, + "loss": 0.0573, + "step": 274 + }, + { + "epoch": 0.5301204819277109, + "grad_norm": 0.7181005477905273, + "learning_rate": 3.930393130369915e-05, + "loss": 0.0405, + "step": 275 + }, + { + "epoch": 0.5320481927710843, + "grad_norm": 0.34231385588645935, + "learning_rate": 3.9292153920785076e-05, + "loss": 0.0153, + "step": 276 + }, + { + "epoch": 0.5339759036144578, + "grad_norm": 0.6899610161781311, + "learning_rate": 3.928027953066168e-05, + "loss": 0.0338, + "step": 277 + }, + { + "epoch": 0.5359036144578313, + "grad_norm": 0.7509781718254089, + "learning_rate": 3.926830819303726e-05, + "loss": 0.0416, + "step": 278 + }, + { + "epoch": 0.5378313253012048, + "grad_norm": 0.6326774954795837, + "learning_rate": 3.925623996810757e-05, + "loss": 0.0293, + "step": 279 + }, + { + "epoch": 0.5397590361445783, + "grad_norm": 0.5543203353881836, + "learning_rate": 3.924407491655557e-05, + "loss": 0.0263, + "step": 280 + }, + { + "epoch": 0.5416867469879518, + "grad_norm": 0.5367572903633118, + "learning_rate": 3.9231813099551086e-05, + "loss": 0.0276, + "step": 281 + }, + { + "epoch": 0.5436144578313253, + "grad_norm": 0.3143869638442993, + "learning_rate": 3.921945457875051e-05, + "loss": 0.0146, + "step": 282 + }, + { + "epoch": 0.5455421686746988, + "grad_norm": 0.47403043508529663, + "learning_rate": 3.920699941629649e-05, + "loss": 0.0267, + "step": 283 + }, + { + "epoch": 0.5474698795180722, + "grad_norm": 0.5082595348358154, + "learning_rate": 3.919444767481763e-05, + "loss": 0.0183, + "step": 284 + }, + { + "epoch": 0.5493975903614458, + "grad_norm": 0.747949481010437, + "learning_rate": 3.918179941742816e-05, + "loss": 0.0412, + "step": 285 + }, + { + "epoch": 0.5513253012048193, + "grad_norm": 0.6553886532783508, + "learning_rate": 3.916905470772762e-05, + "loss": 0.0505, + "step": 286 + }, + { + "epoch": 0.5532530120481928, + "grad_norm": 0.3838176131248474, + "learning_rate": 3.9156213609800545e-05, + "loss": 0.0156, + "step": 287 + }, + { + "epoch": 0.5551807228915663, + "grad_norm": 0.7427731156349182, + "learning_rate": 3.914327618821614e-05, + "loss": 0.0278, + "step": 288 + }, + { + "epoch": 0.5571084337349398, + "grad_norm": 0.2612821161746979, + "learning_rate": 3.913024250802796e-05, + "loss": 0.0101, + "step": 289 + }, + { + "epoch": 0.5590361445783133, + "grad_norm": 0.3799416124820709, + "learning_rate": 3.911711263477357e-05, + "loss": 0.0168, + "step": 290 + }, + { + "epoch": 0.5609638554216867, + "grad_norm": 0.5053854584693909, + "learning_rate": 3.910388663447425e-05, + "loss": 0.0249, + "step": 291 + }, + { + "epoch": 0.5628915662650602, + "grad_norm": 0.38095012307167053, + "learning_rate": 3.909056457363461e-05, + "loss": 0.0156, + "step": 292 + }, + { + "epoch": 0.5648192771084337, + "grad_norm": 0.4477892220020294, + "learning_rate": 3.907714651924229e-05, + "loss": 0.0309, + "step": 293 + }, + { + "epoch": 0.5667469879518072, + "grad_norm": 0.5875864624977112, + "learning_rate": 3.906363253876763e-05, + "loss": 0.0287, + "step": 294 + }, + { + "epoch": 0.5686746987951807, + "grad_norm": 0.522990882396698, + "learning_rate": 3.90500227001633e-05, + "loss": 0.0318, + "step": 295 + }, + { + "epoch": 0.5706024096385542, + "grad_norm": 0.4153876304626465, + "learning_rate": 3.9036317071863994e-05, + "loss": 0.0192, + "step": 296 + }, + { + "epoch": 0.5725301204819278, + "grad_norm": 0.4675769507884979, + "learning_rate": 3.902251572278605e-05, + "loss": 0.067, + "step": 297 + }, + { + "epoch": 0.5744578313253013, + "grad_norm": 0.35778650641441345, + "learning_rate": 3.900861872232713e-05, + "loss": 0.0197, + "step": 298 + }, + { + "epoch": 0.5763855421686747, + "grad_norm": 0.7382330894470215, + "learning_rate": 3.899462614036587e-05, + "loss": 0.0283, + "step": 299 + }, + { + "epoch": 0.5783132530120482, + "grad_norm": 0.41268599033355713, + "learning_rate": 3.89805380472615e-05, + "loss": 0.0207, + "step": 300 + }, + { + "epoch": 0.5802409638554217, + "grad_norm": 1.2013020515441895, + "learning_rate": 3.8966354513853535e-05, + "loss": 0.0301, + "step": 301 + }, + { + "epoch": 0.5821686746987952, + "grad_norm": 0.424757719039917, + "learning_rate": 3.895207561146137e-05, + "loss": 0.022, + "step": 302 + }, + { + "epoch": 0.5840963855421687, + "grad_norm": 0.4196677505970001, + "learning_rate": 3.893770141188396e-05, + "loss": 0.0424, + "step": 303 + }, + { + "epoch": 0.5860240963855422, + "grad_norm": 0.8644190430641174, + "learning_rate": 3.892323198739946e-05, + "loss": 0.08, + "step": 304 + }, + { + "epoch": 0.5879518072289157, + "grad_norm": 0.5645135045051575, + "learning_rate": 3.890866741076482e-05, + "loss": 0.0152, + "step": 305 + }, + { + "epoch": 0.5898795180722891, + "grad_norm": 0.5218387246131897, + "learning_rate": 3.889400775521545e-05, + "loss": 0.0205, + "step": 306 + }, + { + "epoch": 0.5918072289156626, + "grad_norm": 0.39709413051605225, + "learning_rate": 3.8879253094464865e-05, + "loss": 0.0233, + "step": 307 + }, + { + "epoch": 0.5937349397590361, + "grad_norm": 0.3572910726070404, + "learning_rate": 3.8864403502704285e-05, + "loss": 0.0198, + "step": 308 + }, + { + "epoch": 0.5956626506024096, + "grad_norm": 0.382709264755249, + "learning_rate": 3.8849459054602274e-05, + "loss": 0.0176, + "step": 309 + }, + { + "epoch": 0.5975903614457831, + "grad_norm": 3.4527227878570557, + "learning_rate": 3.883441982530436e-05, + "loss": 0.0239, + "step": 310 + }, + { + "epoch": 0.5995180722891567, + "grad_norm": 0.4467569589614868, + "learning_rate": 3.8819285890432674e-05, + "loss": 0.0284, + "step": 311 + }, + { + "epoch": 0.6014457831325302, + "grad_norm": 0.44513460993766785, + "learning_rate": 3.880405732608555e-05, + "loss": 0.0233, + "step": 312 + }, + { + "epoch": 0.6033734939759036, + "grad_norm": 0.8029689192771912, + "learning_rate": 3.8788734208837155e-05, + "loss": 0.0433, + "step": 313 + }, + { + "epoch": 0.6053012048192771, + "grad_norm": 0.7291454076766968, + "learning_rate": 3.877331661573709e-05, + "loss": 0.043, + "step": 314 + }, + { + "epoch": 0.6072289156626506, + "grad_norm": 0.6050467491149902, + "learning_rate": 3.8757804624310006e-05, + "loss": 0.0377, + "step": 315 + }, + { + "epoch": 0.6091566265060241, + "grad_norm": 0.6714366674423218, + "learning_rate": 3.874219831255524e-05, + "loss": 0.046, + "step": 316 + }, + { + "epoch": 0.6110843373493976, + "grad_norm": 0.336037278175354, + "learning_rate": 3.8726497758946394e-05, + "loss": 0.0149, + "step": 317 + }, + { + "epoch": 0.6130120481927711, + "grad_norm": 0.3057402968406677, + "learning_rate": 3.871070304243094e-05, + "loss": 0.014, + "step": 318 + }, + { + "epoch": 0.6149397590361446, + "grad_norm": 0.4537644684314728, + "learning_rate": 3.8694814242429834e-05, + "loss": 0.0503, + "step": 319 + }, + { + "epoch": 0.6168674698795181, + "grad_norm": 0.45573824644088745, + "learning_rate": 3.8678831438837116e-05, + "loss": 0.021, + "step": 320 + }, + { + "epoch": 0.6187951807228915, + "grad_norm": 0.30729591846466064, + "learning_rate": 3.866275471201952e-05, + "loss": 0.0163, + "step": 321 + }, + { + "epoch": 0.620722891566265, + "grad_norm": 0.7614850401878357, + "learning_rate": 3.8646584142816036e-05, + "loss": 0.0347, + "step": 322 + }, + { + "epoch": 0.6226506024096385, + "grad_norm": 0.5323611497879028, + "learning_rate": 3.863031981253754e-05, + "loss": 0.0201, + "step": 323 + }, + { + "epoch": 0.624578313253012, + "grad_norm": 0.34426453709602356, + "learning_rate": 3.861396180296635e-05, + "loss": 0.0243, + "step": 324 + }, + { + "epoch": 0.6265060240963856, + "grad_norm": 0.621636152267456, + "learning_rate": 3.859751019635585e-05, + "loss": 0.0166, + "step": 325 + }, + { + "epoch": 0.6284337349397591, + "grad_norm": 0.549324095249176, + "learning_rate": 3.858096507543006e-05, + "loss": 0.0274, + "step": 326 + }, + { + "epoch": 0.6303614457831326, + "grad_norm": 0.358426570892334, + "learning_rate": 3.8564326523383214e-05, + "loss": 0.0207, + "step": 327 + }, + { + "epoch": 0.632289156626506, + "grad_norm": 0.3639723062515259, + "learning_rate": 3.8547594623879346e-05, + "loss": 0.0297, + "step": 328 + }, + { + "epoch": 0.6342168674698795, + "grad_norm": 0.3402212858200073, + "learning_rate": 3.853076946105188e-05, + "loss": 0.0258, + "step": 329 + }, + { + "epoch": 0.636144578313253, + "grad_norm": 0.4083027243614197, + "learning_rate": 3.85138511195032e-05, + "loss": 0.0351, + "step": 330 + }, + { + "epoch": 0.6380722891566265, + "grad_norm": 0.43532121181488037, + "learning_rate": 3.84968396843042e-05, + "loss": 0.0388, + "step": 331 + }, + { + "epoch": 0.64, + "grad_norm": 0.35353463888168335, + "learning_rate": 3.8479735240993904e-05, + "loss": 0.0203, + "step": 332 + }, + { + "epoch": 0.6419277108433735, + "grad_norm": 0.350149929523468, + "learning_rate": 3.846253787557901e-05, + "loss": 0.0261, + "step": 333 + }, + { + "epoch": 0.643855421686747, + "grad_norm": 0.7665389180183411, + "learning_rate": 3.844524767453344e-05, + "loss": 0.0108, + "step": 334 + }, + { + "epoch": 0.6457831325301204, + "grad_norm": 0.44621360301971436, + "learning_rate": 3.842786472479795e-05, + "loss": 0.0282, + "step": 335 + }, + { + "epoch": 0.6477108433734939, + "grad_norm": 0.7787201404571533, + "learning_rate": 3.841038911377962e-05, + "loss": 0.0216, + "step": 336 + }, + { + "epoch": 0.6496385542168674, + "grad_norm": 0.48260653018951416, + "learning_rate": 3.839282092935153e-05, + "loss": 0.0234, + "step": 337 + }, + { + "epoch": 0.651566265060241, + "grad_norm": 0.4987852871417999, + "learning_rate": 3.837516025985219e-05, + "loss": 0.0515, + "step": 338 + }, + { + "epoch": 0.6534939759036145, + "grad_norm": 0.9030266404151917, + "learning_rate": 3.835740719408517e-05, + "loss": 0.0508, + "step": 339 + }, + { + "epoch": 0.655421686746988, + "grad_norm": 0.6381701231002808, + "learning_rate": 3.833956182131867e-05, + "loss": 0.0405, + "step": 340 + }, + { + "epoch": 0.6573493975903615, + "grad_norm": 0.42828986048698425, + "learning_rate": 3.832162423128499e-05, + "loss": 0.024, + "step": 341 + }, + { + "epoch": 0.659277108433735, + "grad_norm": 0.38725873827934265, + "learning_rate": 3.8303594514180164e-05, + "loss": 0.0199, + "step": 342 + }, + { + "epoch": 0.6612048192771084, + "grad_norm": 0.23280498385429382, + "learning_rate": 3.828547276066346e-05, + "loss": 0.0101, + "step": 343 + }, + { + "epoch": 0.6631325301204819, + "grad_norm": 0.7298216819763184, + "learning_rate": 3.8267259061856925e-05, + "loss": 0.0455, + "step": 344 + }, + { + "epoch": 0.6650602409638554, + "grad_norm": 0.5975687503814697, + "learning_rate": 3.824895350934496e-05, + "loss": 0.0372, + "step": 345 + }, + { + "epoch": 0.6669879518072289, + "grad_norm": 0.6295403242111206, + "learning_rate": 3.823055619517381e-05, + "loss": 0.0362, + "step": 346 + }, + { + "epoch": 0.6689156626506024, + "grad_norm": 0.5086020827293396, + "learning_rate": 3.821206721185115e-05, + "loss": 0.0368, + "step": 347 + }, + { + "epoch": 0.6708433734939759, + "grad_norm": 0.34506168961524963, + "learning_rate": 3.819348665234557e-05, + "loss": 0.0178, + "step": 348 + }, + { + "epoch": 0.6727710843373494, + "grad_norm": 1.309940218925476, + "learning_rate": 3.817481461008617e-05, + "loss": 0.024, + "step": 349 + }, + { + "epoch": 0.6746987951807228, + "grad_norm": 0.4074770510196686, + "learning_rate": 3.815605117896204e-05, + "loss": 0.0262, + "step": 350 + }, + { + "epoch": 0.6766265060240964, + "grad_norm": 0.48525840044021606, + "learning_rate": 3.8137196453321775e-05, + "loss": 0.0209, + "step": 351 + }, + { + "epoch": 0.6785542168674699, + "grad_norm": 0.7199739217758179, + "learning_rate": 3.811825052797308e-05, + "loss": 0.0396, + "step": 352 + }, + { + "epoch": 0.6804819277108434, + "grad_norm": 0.519540011882782, + "learning_rate": 3.8099213498182196e-05, + "loss": 0.0453, + "step": 353 + }, + { + "epoch": 0.6824096385542169, + "grad_norm": 0.9738391041755676, + "learning_rate": 3.808008545967349e-05, + "loss": 0.0317, + "step": 354 + }, + { + "epoch": 0.6843373493975904, + "grad_norm": 1.888344407081604, + "learning_rate": 3.8060866508628953e-05, + "loss": 0.0452, + "step": 355 + }, + { + "epoch": 0.6862650602409639, + "grad_norm": 0.48989811539649963, + "learning_rate": 3.8041556741687695e-05, + "loss": 0.0315, + "step": 356 + }, + { + "epoch": 0.6881927710843373, + "grad_norm": 0.3764645457267761, + "learning_rate": 3.8022156255945496e-05, + "loss": 0.0269, + "step": 357 + }, + { + "epoch": 0.6901204819277108, + "grad_norm": 0.46409738063812256, + "learning_rate": 3.800266514895429e-05, + "loss": 0.0171, + "step": 358 + }, + { + "epoch": 0.6920481927710843, + "grad_norm": 0.41091030836105347, + "learning_rate": 3.7983083518721695e-05, + "loss": 0.0167, + "step": 359 + }, + { + "epoch": 0.6939759036144578, + "grad_norm": 0.8375523090362549, + "learning_rate": 3.79634114637105e-05, + "loss": 0.0342, + "step": 360 + }, + { + "epoch": 0.6959036144578313, + "grad_norm": 1.7053394317626953, + "learning_rate": 3.794364908283817e-05, + "loss": 0.02, + "step": 361 + }, + { + "epoch": 0.6978313253012048, + "grad_norm": 0.4163115918636322, + "learning_rate": 3.792379647547637e-05, + "loss": 0.0138, + "step": 362 + }, + { + "epoch": 0.6997590361445784, + "grad_norm": 0.388751745223999, + "learning_rate": 3.790385374145046e-05, + "loss": 0.0172, + "step": 363 + }, + { + "epoch": 0.7016867469879519, + "grad_norm": 0.5584064722061157, + "learning_rate": 3.7883820981038966e-05, + "loss": 0.0254, + "step": 364 + }, + { + "epoch": 0.7036144578313253, + "grad_norm": 1.394264817237854, + "learning_rate": 3.7863698294973114e-05, + "loss": 0.037, + "step": 365 + }, + { + "epoch": 0.7055421686746988, + "grad_norm": 0.46280744671821594, + "learning_rate": 3.78434857844363e-05, + "loss": 0.0234, + "step": 366 + }, + { + "epoch": 0.7074698795180723, + "grad_norm": 0.39548924565315247, + "learning_rate": 3.782318355106358e-05, + "loss": 0.0164, + "step": 367 + }, + { + "epoch": 0.7093975903614458, + "grad_norm": 0.7307773232460022, + "learning_rate": 3.780279169694118e-05, + "loss": 0.0192, + "step": 368 + }, + { + "epoch": 0.7113253012048193, + "grad_norm": 0.28035807609558105, + "learning_rate": 3.778231032460594e-05, + "loss": 0.0131, + "step": 369 + }, + { + "epoch": 0.7132530120481928, + "grad_norm": 0.8376953601837158, + "learning_rate": 3.776173953704486e-05, + "loss": 0.0291, + "step": 370 + }, + { + "epoch": 0.7151807228915663, + "grad_norm": 0.7356843948364258, + "learning_rate": 3.774107943769454e-05, + "loss": 0.0214, + "step": 371 + }, + { + "epoch": 0.7171084337349397, + "grad_norm": 0.41503390669822693, + "learning_rate": 3.772033013044064e-05, + "loss": 0.0221, + "step": 372 + }, + { + "epoch": 0.7190361445783132, + "grad_norm": 0.35732385516166687, + "learning_rate": 3.7699491719617436e-05, + "loss": 0.015, + "step": 373 + }, + { + "epoch": 0.7209638554216867, + "grad_norm": 0.283778578042984, + "learning_rate": 3.76785643100072e-05, + "loss": 0.0146, + "step": 374 + }, + { + "epoch": 0.7228915662650602, + "grad_norm": 0.3219413459300995, + "learning_rate": 3.765754800683974e-05, + "loss": 0.015, + "step": 375 + }, + { + "epoch": 0.7248192771084337, + "grad_norm": 0.610431432723999, + "learning_rate": 3.7636442915791856e-05, + "loss": 0.0326, + "step": 376 + }, + { + "epoch": 0.7267469879518073, + "grad_norm": 4.944870948791504, + "learning_rate": 3.7615249142986784e-05, + "loss": 0.0432, + "step": 377 + }, + { + "epoch": 0.7286746987951808, + "grad_norm": 0.4894593060016632, + "learning_rate": 3.7593966794993696e-05, + "loss": 0.0174, + "step": 378 + }, + { + "epoch": 0.7306024096385542, + "grad_norm": 0.4211325943470001, + "learning_rate": 3.757259597882714e-05, + "loss": 0.023, + "step": 379 + }, + { + "epoch": 0.7325301204819277, + "grad_norm": 0.33621737360954285, + "learning_rate": 3.755113680194651e-05, + "loss": 0.0201, + "step": 380 + }, + { + "epoch": 0.7344578313253012, + "grad_norm": 0.5799694657325745, + "learning_rate": 3.7529589372255514e-05, + "loss": 0.0173, + "step": 381 + }, + { + "epoch": 0.7363855421686747, + "grad_norm": 0.5172572731971741, + "learning_rate": 3.750795379810162e-05, + "loss": 0.0284, + "step": 382 + }, + { + "epoch": 0.7383132530120482, + "grad_norm": 0.5715453028678894, + "learning_rate": 3.748623018827552e-05, + "loss": 0.0194, + "step": 383 + }, + { + "epoch": 0.7402409638554217, + "grad_norm": 0.5284178256988525, + "learning_rate": 3.746441865201056e-05, + "loss": 0.0247, + "step": 384 + }, + { + "epoch": 0.7421686746987952, + "grad_norm": 0.37828654050827026, + "learning_rate": 3.744251929898223e-05, + "loss": 0.0097, + "step": 385 + }, + { + "epoch": 0.7440963855421687, + "grad_norm": 0.3252779543399811, + "learning_rate": 3.742053223930758e-05, + "loss": 0.0238, + "step": 386 + }, + { + "epoch": 0.7460240963855421, + "grad_norm": 0.6031543612480164, + "learning_rate": 3.7398457583544674e-05, + "loss": 0.0332, + "step": 387 + }, + { + "epoch": 0.7479518072289156, + "grad_norm": 0.23846614360809326, + "learning_rate": 3.737629544269206e-05, + "loss": 0.0122, + "step": 388 + }, + { + "epoch": 0.7498795180722891, + "grad_norm": 0.5274029970169067, + "learning_rate": 3.7354045928188155e-05, + "loss": 0.0324, + "step": 389 + }, + { + "epoch": 0.7518072289156627, + "grad_norm": 0.4672217071056366, + "learning_rate": 3.733170915191075e-05, + "loss": 0.0196, + "step": 390 + }, + { + "epoch": 0.7537349397590362, + "grad_norm": 0.29819396138191223, + "learning_rate": 3.730928522617639e-05, + "loss": 0.0131, + "step": 391 + }, + { + "epoch": 0.7556626506024097, + "grad_norm": 0.43824997544288635, + "learning_rate": 3.7286774263739855e-05, + "loss": 0.0238, + "step": 392 + }, + { + "epoch": 0.7575903614457832, + "grad_norm": 0.2822072505950928, + "learning_rate": 3.726417637779357e-05, + "loss": 0.0314, + "step": 393 + }, + { + "epoch": 0.7595180722891566, + "grad_norm": 0.43815648555755615, + "learning_rate": 3.7241491681967044e-05, + "loss": 0.0144, + "step": 394 + }, + { + "epoch": 0.7614457831325301, + "grad_norm": 0.37194815278053284, + "learning_rate": 3.721872029032628e-05, + "loss": 0.0286, + "step": 395 + }, + { + "epoch": 0.7633734939759036, + "grad_norm": 0.7319737672805786, + "learning_rate": 3.719586231737322e-05, + "loss": 0.0427, + "step": 396 + }, + { + "epoch": 0.7653012048192771, + "grad_norm": 0.5870066285133362, + "learning_rate": 3.717291787804517e-05, + "loss": 0.0138, + "step": 397 + }, + { + "epoch": 0.7672289156626506, + "grad_norm": 0.6574277281761169, + "learning_rate": 3.7149887087714225e-05, + "loss": 0.061, + "step": 398 + }, + { + "epoch": 0.7691566265060241, + "grad_norm": 0.5467348694801331, + "learning_rate": 3.712677006218666e-05, + "loss": 0.022, + "step": 399 + }, + { + "epoch": 0.7710843373493976, + "grad_norm": 0.3589288890361786, + "learning_rate": 3.710356691770238e-05, + "loss": 0.0161, + "step": 400 + }, + { + "epoch": 0.7730120481927711, + "grad_norm": 0.574630618095398, + "learning_rate": 3.708027777093433e-05, + "loss": 0.0285, + "step": 401 + }, + { + "epoch": 0.7749397590361445, + "grad_norm": 0.39048445224761963, + "learning_rate": 3.70569027389879e-05, + "loss": 0.012, + "step": 402 + }, + { + "epoch": 0.776867469879518, + "grad_norm": 0.34803536534309387, + "learning_rate": 3.703344193940032e-05, + "loss": 0.0155, + "step": 403 + }, + { + "epoch": 0.7787951807228916, + "grad_norm": 1.188948392868042, + "learning_rate": 3.700989549014011e-05, + "loss": 0.0617, + "step": 404 + }, + { + "epoch": 0.7807228915662651, + "grad_norm": 0.473157674074173, + "learning_rate": 3.698626350960646e-05, + "loss": 0.0298, + "step": 405 + }, + { + "epoch": 0.7826506024096386, + "grad_norm": 0.42009076476097107, + "learning_rate": 3.6962546116628634e-05, + "loss": 0.03, + "step": 406 + }, + { + "epoch": 0.7845783132530121, + "grad_norm": 0.6334308981895447, + "learning_rate": 3.693874343046537e-05, + "loss": 0.0107, + "step": 407 + }, + { + "epoch": 0.7865060240963856, + "grad_norm": 0.35594677925109863, + "learning_rate": 3.6914855570804314e-05, + "loss": 0.0174, + "step": 408 + }, + { + "epoch": 0.788433734939759, + "grad_norm": 0.28985708951950073, + "learning_rate": 3.689088265776136e-05, + "loss": 0.0149, + "step": 409 + }, + { + "epoch": 0.7903614457831325, + "grad_norm": 0.3981950581073761, + "learning_rate": 3.686682481188011e-05, + "loss": 0.019, + "step": 410 + }, + { + "epoch": 0.792289156626506, + "grad_norm": 0.48819583654403687, + "learning_rate": 3.6842682154131193e-05, + "loss": 0.0217, + "step": 411 + }, + { + "epoch": 0.7942168674698795, + "grad_norm": 0.42819952964782715, + "learning_rate": 3.681845480591174e-05, + "loss": 0.0198, + "step": 412 + }, + { + "epoch": 0.796144578313253, + "grad_norm": 0.48591694235801697, + "learning_rate": 3.6794142889044727e-05, + "loss": 0.0253, + "step": 413 + }, + { + "epoch": 0.7980722891566265, + "grad_norm": 0.4730607271194458, + "learning_rate": 3.676974652577835e-05, + "loss": 0.0329, + "step": 414 + }, + { + "epoch": 0.8, + "grad_norm": 0.5390865802764893, + "learning_rate": 3.6745265838785434e-05, + "loss": 0.0479, + "step": 415 + }, + { + "epoch": 0.8019277108433734, + "grad_norm": 0.6377891302108765, + "learning_rate": 3.672070095116283e-05, + "loss": 0.019, + "step": 416 + }, + { + "epoch": 0.803855421686747, + "grad_norm": 0.8984615206718445, + "learning_rate": 3.669605198643075e-05, + "loss": 0.0444, + "step": 417 + }, + { + "epoch": 0.8057831325301205, + "grad_norm": 0.4913877546787262, + "learning_rate": 3.667131906853219e-05, + "loss": 0.031, + "step": 418 + }, + { + "epoch": 0.807710843373494, + "grad_norm": 0.37894028425216675, + "learning_rate": 3.664650232183229e-05, + "loss": 0.0195, + "step": 419 + }, + { + "epoch": 0.8096385542168675, + "grad_norm": 0.3644949495792389, + "learning_rate": 3.66216018711177e-05, + "loss": 0.018, + "step": 420 + }, + { + "epoch": 0.811566265060241, + "grad_norm": 0.414440393447876, + "learning_rate": 3.659661784159597e-05, + "loss": 0.0188, + "step": 421 + }, + { + "epoch": 0.8134939759036145, + "grad_norm": 0.49220341444015503, + "learning_rate": 3.65715503588949e-05, + "loss": 0.016, + "step": 422 + }, + { + "epoch": 0.815421686746988, + "grad_norm": 1.0939836502075195, + "learning_rate": 3.654639954906193e-05, + "loss": 0.0758, + "step": 423 + }, + { + "epoch": 0.8173493975903614, + "grad_norm": 0.43222442269325256, + "learning_rate": 3.652116553856349e-05, + "loss": 0.0308, + "step": 424 + }, + { + "epoch": 0.8192771084337349, + "grad_norm": 0.5081896185874939, + "learning_rate": 3.649584845428438e-05, + "loss": 0.0493, + "step": 425 + }, + { + "epoch": 0.8212048192771084, + "grad_norm": 0.9811948537826538, + "learning_rate": 3.64704484235271e-05, + "loss": 0.019, + "step": 426 + }, + { + "epoch": 0.8231325301204819, + "grad_norm": 0.31656572222709656, + "learning_rate": 3.6444965574011255e-05, + "loss": 0.0135, + "step": 427 + }, + { + "epoch": 0.8250602409638554, + "grad_norm": 0.7844433188438416, + "learning_rate": 3.641940003387289e-05, + "loss": 0.0402, + "step": 428 + }, + { + "epoch": 0.826987951807229, + "grad_norm": 0.3353273570537567, + "learning_rate": 3.6393751931663814e-05, + "loss": 0.0132, + "step": 429 + }, + { + "epoch": 0.8289156626506025, + "grad_norm": 0.7253058552742004, + "learning_rate": 3.6368021396351015e-05, + "loss": 0.0296, + "step": 430 + }, + { + "epoch": 0.8308433734939759, + "grad_norm": 0.45300304889678955, + "learning_rate": 3.634220855731598e-05, + "loss": 0.0258, + "step": 431 + }, + { + "epoch": 0.8327710843373494, + "grad_norm": 0.3480473458766937, + "learning_rate": 3.631631354435403e-05, + "loss": 0.0099, + "step": 432 + }, + { + "epoch": 0.8346987951807229, + "grad_norm": 2.1114516258239746, + "learning_rate": 3.62903364876737e-05, + "loss": 0.0457, + "step": 433 + }, + { + "epoch": 0.8366265060240964, + "grad_norm": 0.5649561882019043, + "learning_rate": 3.626427751789606e-05, + "loss": 0.0444, + "step": 434 + }, + { + "epoch": 0.8385542168674699, + "grad_norm": 0.3864995539188385, + "learning_rate": 3.623813676605405e-05, + "loss": 0.0223, + "step": 435 + }, + { + "epoch": 0.8404819277108434, + "grad_norm": 1.2134298086166382, + "learning_rate": 3.621191436359186e-05, + "loss": 0.0353, + "step": 436 + }, + { + "epoch": 0.8424096385542169, + "grad_norm": 0.4403415024280548, + "learning_rate": 3.6185610442364246e-05, + "loss": 0.0216, + "step": 437 + }, + { + "epoch": 0.8443373493975903, + "grad_norm": 0.6050297021865845, + "learning_rate": 3.6159225134635846e-05, + "loss": 0.0433, + "step": 438 + }, + { + "epoch": 0.8462650602409638, + "grad_norm": 0.7951678037643433, + "learning_rate": 3.6132758573080556e-05, + "loss": 0.031, + "step": 439 + }, + { + "epoch": 0.8481927710843373, + "grad_norm": 0.4991949796676636, + "learning_rate": 3.6106210890780834e-05, + "loss": 0.0313, + "step": 440 + }, + { + "epoch": 0.8501204819277108, + "grad_norm": 0.47951385378837585, + "learning_rate": 3.607958222122704e-05, + "loss": 0.0218, + "step": 441 + }, + { + "epoch": 0.8520481927710843, + "grad_norm": 0.7345194220542908, + "learning_rate": 3.6052872698316755e-05, + "loss": 0.0239, + "step": 442 + }, + { + "epoch": 0.8539759036144579, + "grad_norm": 1.4814884662628174, + "learning_rate": 3.602608245635414e-05, + "loss": 0.0127, + "step": 443 + }, + { + "epoch": 0.8559036144578314, + "grad_norm": 2.4240877628326416, + "learning_rate": 3.599921163004922e-05, + "loss": 0.0618, + "step": 444 + }, + { + "epoch": 0.8578313253012049, + "grad_norm": 0.41523510217666626, + "learning_rate": 3.5972260354517216e-05, + "loss": 0.0283, + "step": 445 + }, + { + "epoch": 0.8597590361445783, + "grad_norm": 0.5577677488327026, + "learning_rate": 3.594522876527791e-05, + "loss": 0.0271, + "step": 446 + }, + { + "epoch": 0.8616867469879518, + "grad_norm": 0.5829064846038818, + "learning_rate": 3.591811699825487e-05, + "loss": 0.0169, + "step": 447 + }, + { + "epoch": 0.8636144578313253, + "grad_norm": 0.4478822350502014, + "learning_rate": 3.5890925189774886e-05, + "loss": 0.0239, + "step": 448 + }, + { + "epoch": 0.8655421686746988, + "grad_norm": 0.3498048782348633, + "learning_rate": 3.586365347656718e-05, + "loss": 0.0137, + "step": 449 + }, + { + "epoch": 0.8674698795180723, + "grad_norm": 0.6571130156517029, + "learning_rate": 3.583630199576278e-05, + "loss": 0.027, + "step": 450 + }, + { + "epoch": 0.8693975903614458, + "grad_norm": 0.344970166683197, + "learning_rate": 3.58088708848938e-05, + "loss": 0.0167, + "step": 451 + }, + { + "epoch": 0.8713253012048193, + "grad_norm": 0.34611570835113525, + "learning_rate": 3.5781360281892775e-05, + "loss": 0.0468, + "step": 452 + }, + { + "epoch": 0.8732530120481927, + "grad_norm": 0.66157066822052, + "learning_rate": 3.575377032509194e-05, + "loss": 0.0344, + "step": 453 + }, + { + "epoch": 0.8751807228915662, + "grad_norm": 0.3676326870918274, + "learning_rate": 3.5726101153222534e-05, + "loss": 0.0366, + "step": 454 + }, + { + "epoch": 0.8771084337349397, + "grad_norm": 0.5958423018455505, + "learning_rate": 3.569835290541414e-05, + "loss": 0.0382, + "step": 455 + }, + { + "epoch": 0.8790361445783132, + "grad_norm": 0.36787471175193787, + "learning_rate": 3.567052572119397e-05, + "loss": 0.018, + "step": 456 + }, + { + "epoch": 0.8809638554216868, + "grad_norm": 0.9478234052658081, + "learning_rate": 3.564261974048611e-05, + "loss": 0.0179, + "step": 457 + }, + { + "epoch": 0.8828915662650603, + "grad_norm": 0.3337579369544983, + "learning_rate": 3.56146351036109e-05, + "loss": 0.0147, + "step": 458 + }, + { + "epoch": 0.8848192771084338, + "grad_norm": 0.4984932243824005, + "learning_rate": 3.558657195128416e-05, + "loss": 0.0224, + "step": 459 + }, + { + "epoch": 0.8867469879518072, + "grad_norm": 0.36718735098838806, + "learning_rate": 3.555843042461653e-05, + "loss": 0.0202, + "step": 460 + }, + { + "epoch": 0.8886746987951807, + "grad_norm": 0.4081745445728302, + "learning_rate": 3.553021066511274e-05, + "loss": 0.0288, + "step": 461 + }, + { + "epoch": 0.8906024096385542, + "grad_norm": 0.3233242332935333, + "learning_rate": 3.55019128146709e-05, + "loss": 0.0362, + "step": 462 + }, + { + "epoch": 0.8925301204819277, + "grad_norm": 0.6560158729553223, + "learning_rate": 3.547353701558178e-05, + "loss": 0.038, + "step": 463 + }, + { + "epoch": 0.8944578313253012, + "grad_norm": 0.47668641805648804, + "learning_rate": 3.544508341052811e-05, + "loss": 0.0399, + "step": 464 + }, + { + "epoch": 0.8963855421686747, + "grad_norm": 0.45512664318084717, + "learning_rate": 3.541655214258383e-05, + "loss": 0.022, + "step": 465 + }, + { + "epoch": 0.8983132530120482, + "grad_norm": 0.8410730361938477, + "learning_rate": 3.538794335521343e-05, + "loss": 0.0315, + "step": 466 + }, + { + "epoch": 0.9002409638554217, + "grad_norm": 0.4872909486293793, + "learning_rate": 3.535925719227117e-05, + "loss": 0.0152, + "step": 467 + }, + { + "epoch": 0.9021686746987951, + "grad_norm": 0.45623311400413513, + "learning_rate": 3.533049379800038e-05, + "loss": 0.0305, + "step": 468 + }, + { + "epoch": 0.9040963855421686, + "grad_norm": 0.43087029457092285, + "learning_rate": 3.530165331703275e-05, + "loss": 0.0131, + "step": 469 + }, + { + "epoch": 0.9060240963855422, + "grad_norm": 0.4610525369644165, + "learning_rate": 3.527273589438756e-05, + "loss": 0.0187, + "step": 470 + }, + { + "epoch": 0.9079518072289157, + "grad_norm": 0.3356114327907562, + "learning_rate": 3.5243741675471006e-05, + "loss": 0.0185, + "step": 471 + }, + { + "epoch": 0.9098795180722892, + "grad_norm": 0.9065960049629211, + "learning_rate": 3.5214670806075426e-05, + "loss": 0.0433, + "step": 472 + }, + { + "epoch": 0.9118072289156627, + "grad_norm": 0.3652578294277191, + "learning_rate": 3.518552343237858e-05, + "loss": 0.02, + "step": 473 + }, + { + "epoch": 0.9137349397590362, + "grad_norm": 0.32377883791923523, + "learning_rate": 3.5156299700942916e-05, + "loss": 0.0165, + "step": 474 + }, + { + "epoch": 0.9156626506024096, + "grad_norm": 0.2431817352771759, + "learning_rate": 3.512699975871485e-05, + "loss": 0.0172, + "step": 475 + }, + { + "epoch": 0.9175903614457831, + "grad_norm": 0.6390707492828369, + "learning_rate": 3.509762375302399e-05, + "loss": 0.0356, + "step": 476 + }, + { + "epoch": 0.9195180722891566, + "grad_norm": 0.2283092886209488, + "learning_rate": 3.506817183158243e-05, + "loss": 0.0088, + "step": 477 + }, + { + "epoch": 0.9214457831325301, + "grad_norm": 0.5053914189338684, + "learning_rate": 3.5038644142483966e-05, + "loss": 0.0389, + "step": 478 + }, + { + "epoch": 0.9233734939759036, + "grad_norm": 0.2567576467990875, + "learning_rate": 3.500904083420342e-05, + "loss": 0.0155, + "step": 479 + }, + { + "epoch": 0.9253012048192771, + "grad_norm": 0.6852384209632874, + "learning_rate": 3.497936205559583e-05, + "loss": 0.0247, + "step": 480 + }, + { + "epoch": 0.9272289156626506, + "grad_norm": 0.36403414607048035, + "learning_rate": 3.494960795589572e-05, + "loss": 0.023, + "step": 481 + }, + { + "epoch": 0.929156626506024, + "grad_norm": 0.506554901599884, + "learning_rate": 3.491977868471635e-05, + "loss": 0.0273, + "step": 482 + }, + { + "epoch": 0.9310843373493976, + "grad_norm": 0.38329923152923584, + "learning_rate": 3.4889874392048985e-05, + "loss": 0.0169, + "step": 483 + }, + { + "epoch": 0.9330120481927711, + "grad_norm": 0.2805836498737335, + "learning_rate": 3.48598952282621e-05, + "loss": 0.0105, + "step": 484 + }, + { + "epoch": 0.9349397590361446, + "grad_norm": 0.6315302848815918, + "learning_rate": 3.482984134410067e-05, + "loss": 0.0289, + "step": 485 + }, + { + "epoch": 0.9368674698795181, + "grad_norm": 0.6431388854980469, + "learning_rate": 3.479971289068537e-05, + "loss": 0.0311, + "step": 486 + }, + { + "epoch": 0.9387951807228916, + "grad_norm": 0.9794723391532898, + "learning_rate": 3.476951001951184e-05, + "loss": 0.0452, + "step": 487 + }, + { + "epoch": 0.9407228915662651, + "grad_norm": 0.7984824180603027, + "learning_rate": 3.473923288244991e-05, + "loss": 0.0689, + "step": 488 + }, + { + "epoch": 0.9426506024096386, + "grad_norm": 0.46362006664276123, + "learning_rate": 3.470888163174286e-05, + "loss": 0.0241, + "step": 489 + }, + { + "epoch": 0.944578313253012, + "grad_norm": 0.5051195025444031, + "learning_rate": 3.467845642000661e-05, + "loss": 0.0228, + "step": 490 + }, + { + "epoch": 0.9465060240963855, + "grad_norm": 0.3082812428474426, + "learning_rate": 3.4647957400229004e-05, + "loss": 0.0144, + "step": 491 + }, + { + "epoch": 0.948433734939759, + "grad_norm": 0.2691391110420227, + "learning_rate": 3.461738472576902e-05, + "loss": 0.0167, + "step": 492 + }, + { + "epoch": 0.9503614457831325, + "grad_norm": 0.5627671480178833, + "learning_rate": 3.458673855035597e-05, + "loss": 0.031, + "step": 493 + }, + { + "epoch": 0.952289156626506, + "grad_norm": 0.4571435749530792, + "learning_rate": 3.455601902808876e-05, + "loss": 0.0191, + "step": 494 + }, + { + "epoch": 0.9542168674698795, + "grad_norm": 1.0117709636688232, + "learning_rate": 3.452522631343515e-05, + "loss": 0.0192, + "step": 495 + }, + { + "epoch": 0.9561445783132531, + "grad_norm": 0.28375712037086487, + "learning_rate": 3.449436056123086e-05, + "loss": 0.0159, + "step": 496 + }, + { + "epoch": 0.9580722891566265, + "grad_norm": 0.26381856203079224, + "learning_rate": 3.446342192667893e-05, + "loss": 0.0113, + "step": 497 + }, + { + "epoch": 0.96, + "grad_norm": 0.49317577481269836, + "learning_rate": 3.443241056534884e-05, + "loss": 0.0332, + "step": 498 + }, + { + "epoch": 0.9619277108433735, + "grad_norm": 0.28884485363960266, + "learning_rate": 3.440132663317579e-05, + "loss": 0.0117, + "step": 499 + }, + { + "epoch": 0.963855421686747, + "grad_norm": 0.36255285143852234, + "learning_rate": 3.4370170286459864e-05, + "loss": 0.0169, + "step": 500 + }, + { + "epoch": 0.9657831325301205, + "grad_norm": 0.4265049993991852, + "learning_rate": 3.433894168186529e-05, + "loss": 0.0217, + "step": 501 + }, + { + "epoch": 0.967710843373494, + "grad_norm": 0.8169426321983337, + "learning_rate": 3.430764097641962e-05, + "loss": 0.0207, + "step": 502 + }, + { + "epoch": 0.9696385542168675, + "grad_norm": 1.866077184677124, + "learning_rate": 3.427626832751296e-05, + "loss": 0.0381, + "step": 503 + }, + { + "epoch": 0.971566265060241, + "grad_norm": 0.33124980330467224, + "learning_rate": 3.424482389289716e-05, + "loss": 0.0245, + "step": 504 + }, + { + "epoch": 0.9734939759036144, + "grad_norm": 0.37479540705680847, + "learning_rate": 3.4213307830685055e-05, + "loss": 0.0164, + "step": 505 + }, + { + "epoch": 0.9754216867469879, + "grad_norm": 0.39738863706588745, + "learning_rate": 3.4181720299349615e-05, + "loss": 0.0297, + "step": 506 + }, + { + "epoch": 0.9773493975903614, + "grad_norm": 0.2567287087440491, + "learning_rate": 3.4150061457723205e-05, + "loss": 0.0102, + "step": 507 + }, + { + "epoch": 0.9792771084337349, + "grad_norm": 0.6230517029762268, + "learning_rate": 3.411833146499675e-05, + "loss": 0.0243, + "step": 508 + }, + { + "epoch": 0.9812048192771085, + "grad_norm": 0.44843971729278564, + "learning_rate": 3.408653048071894e-05, + "loss": 0.0357, + "step": 509 + }, + { + "epoch": 0.983132530120482, + "grad_norm": 1.0569655895233154, + "learning_rate": 3.405465866479546e-05, + "loss": 0.037, + "step": 510 + }, + { + "epoch": 0.9850602409638555, + "grad_norm": 0.29000964760780334, + "learning_rate": 3.402271617748812e-05, + "loss": 0.0129, + "step": 511 + }, + { + "epoch": 0.9869879518072289, + "grad_norm": 2.1627447605133057, + "learning_rate": 3.399070317941413e-05, + "loss": 0.0442, + "step": 512 + }, + { + "epoch": 0.9889156626506024, + "grad_norm": 0.27371272444725037, + "learning_rate": 3.395861983154522e-05, + "loss": 0.0119, + "step": 513 + }, + { + "epoch": 0.9908433734939759, + "grad_norm": 0.4117226302623749, + "learning_rate": 3.392646629520688e-05, + "loss": 0.0455, + "step": 514 + }, + { + "epoch": 0.9927710843373494, + "grad_norm": 0.5098996758460999, + "learning_rate": 3.389424273207752e-05, + "loss": 0.0203, + "step": 515 + }, + { + "epoch": 0.9946987951807229, + "grad_norm": 0.5192157626152039, + "learning_rate": 3.386194930418767e-05, + "loss": 0.0329, + "step": 516 + }, + { + "epoch": 0.9966265060240964, + "grad_norm": 0.18757697939872742, + "learning_rate": 3.382958617391915e-05, + "loss": 0.0065, + "step": 517 + }, + { + "epoch": 0.9985542168674699, + "grad_norm": 0.3334413170814514, + "learning_rate": 3.3797153504004296e-05, + "loss": 0.0266, + "step": 518 + }, + { + "epoch": 1.0, + "grad_norm": 0.4152225852012634, + "learning_rate": 3.3764651457525095e-05, + "loss": 0.0169, + "step": 519 + }, + { + "epoch": 1.0019277108433735, + "grad_norm": 0.43535247445106506, + "learning_rate": 3.373208019791237e-05, + "loss": 0.0221, + "step": 520 + }, + { + "epoch": 1.003855421686747, + "grad_norm": 0.39292722940444946, + "learning_rate": 3.3699439888945e-05, + "loss": 0.0211, + "step": 521 + }, + { + "epoch": 1.0057831325301205, + "grad_norm": 0.19566713273525238, + "learning_rate": 3.366673069474904e-05, + "loss": 0.0069, + "step": 522 + }, + { + "epoch": 1.007710843373494, + "grad_norm": 0.5101853609085083, + "learning_rate": 3.3633952779796914e-05, + "loss": 0.0191, + "step": 523 + }, + { + "epoch": 1.0096385542168675, + "grad_norm": 0.999434769153595, + "learning_rate": 3.360110630890664e-05, + "loss": 0.0196, + "step": 524 + }, + { + "epoch": 1.011566265060241, + "grad_norm": 0.4646223783493042, + "learning_rate": 3.356819144724092e-05, + "loss": 0.0328, + "step": 525 + }, + { + "epoch": 1.0134939759036146, + "grad_norm": 0.3132480978965759, + "learning_rate": 3.3535208360306354e-05, + "loss": 0.0203, + "step": 526 + }, + { + "epoch": 1.0154216867469879, + "grad_norm": 0.3038032352924347, + "learning_rate": 3.350215721395261e-05, + "loss": 0.0122, + "step": 527 + }, + { + "epoch": 1.0173493975903614, + "grad_norm": 0.45082882046699524, + "learning_rate": 3.346903817437157e-05, + "loss": 0.0437, + "step": 528 + }, + { + "epoch": 1.0192771084337349, + "grad_norm": 0.26917046308517456, + "learning_rate": 3.343585140809651e-05, + "loss": 0.013, + "step": 529 + }, + { + "epoch": 1.0212048192771084, + "grad_norm": 0.23869264125823975, + "learning_rate": 3.3402597082001276e-05, + "loss": 0.008, + "step": 530 + }, + { + "epoch": 1.0231325301204819, + "grad_norm": 0.31315353512763977, + "learning_rate": 3.3369275363299394e-05, + "loss": 0.0078, + "step": 531 + }, + { + "epoch": 1.0250602409638554, + "grad_norm": 0.4780346751213074, + "learning_rate": 3.333588641954327e-05, + "loss": 0.0225, + "step": 532 + }, + { + "epoch": 1.026987951807229, + "grad_norm": 0.2920368015766144, + "learning_rate": 3.330243041862336e-05, + "loss": 0.0118, + "step": 533 + }, + { + "epoch": 1.0289156626506024, + "grad_norm": 0.543669581413269, + "learning_rate": 3.326890752876728e-05, + "loss": 0.0338, + "step": 534 + }, + { + "epoch": 1.030843373493976, + "grad_norm": 0.4288000464439392, + "learning_rate": 3.323531791853901e-05, + "loss": 0.0341, + "step": 535 + }, + { + "epoch": 1.0327710843373494, + "grad_norm": 0.26600322127342224, + "learning_rate": 3.3201661756838e-05, + "loss": 0.0184, + "step": 536 + }, + { + "epoch": 1.034698795180723, + "grad_norm": 0.290937602519989, + "learning_rate": 3.316793921289835e-05, + "loss": 0.0152, + "step": 537 + }, + { + "epoch": 1.0366265060240965, + "grad_norm": 0.7621443271636963, + "learning_rate": 3.313415045628795e-05, + "loss": 0.0326, + "step": 538 + }, + { + "epoch": 1.03855421686747, + "grad_norm": 0.5581283569335938, + "learning_rate": 3.3100295656907646e-05, + "loss": 0.0164, + "step": 539 + }, + { + "epoch": 1.0404819277108435, + "grad_norm": 0.20930901169776917, + "learning_rate": 3.306637498499034e-05, + "loss": 0.0091, + "step": 540 + }, + { + "epoch": 1.0424096385542168, + "grad_norm": 0.46212059259414673, + "learning_rate": 3.303238861110018e-05, + "loss": 0.0118, + "step": 541 + }, + { + "epoch": 1.0443373493975903, + "grad_norm": 0.38259151577949524, + "learning_rate": 3.299833670613168e-05, + "loss": 0.0081, + "step": 542 + }, + { + "epoch": 1.0462650602409638, + "grad_norm": 0.4888618290424347, + "learning_rate": 3.2964219441308865e-05, + "loss": 0.0138, + "step": 543 + }, + { + "epoch": 1.0481927710843373, + "grad_norm": 0.32103127241134644, + "learning_rate": 3.2930036988184425e-05, + "loss": 0.0171, + "step": 544 + }, + { + "epoch": 1.0501204819277108, + "grad_norm": 0.27787327766418457, + "learning_rate": 3.28957895186388e-05, + "loss": 0.0106, + "step": 545 + }, + { + "epoch": 1.0520481927710843, + "grad_norm": 0.35597777366638184, + "learning_rate": 3.2861477204879395e-05, + "loss": 0.0123, + "step": 546 + }, + { + "epoch": 1.0539759036144578, + "grad_norm": 0.3619804084300995, + "learning_rate": 3.2827100219439656e-05, + "loss": 0.0088, + "step": 547 + }, + { + "epoch": 1.0559036144578313, + "grad_norm": 0.2525513470172882, + "learning_rate": 3.279265873517822e-05, + "loss": 0.0179, + "step": 548 + }, + { + "epoch": 1.0578313253012048, + "grad_norm": 0.3910020887851715, + "learning_rate": 3.275815292527804e-05, + "loss": 0.0142, + "step": 549 + }, + { + "epoch": 1.0597590361445783, + "grad_norm": 0.30515050888061523, + "learning_rate": 3.2723582963245526e-05, + "loss": 0.0123, + "step": 550 + }, + { + "epoch": 1.0616867469879518, + "grad_norm": 0.21708644926548004, + "learning_rate": 3.2688949022909665e-05, + "loss": 0.0098, + "step": 551 + }, + { + "epoch": 1.0636144578313254, + "grad_norm": 0.23307719826698303, + "learning_rate": 3.265425127842114e-05, + "loss": 0.0097, + "step": 552 + }, + { + "epoch": 1.0655421686746989, + "grad_norm": 0.676654577255249, + "learning_rate": 3.261948990425147e-05, + "loss": 0.0227, + "step": 553 + }, + { + "epoch": 1.0674698795180724, + "grad_norm": 0.4593975841999054, + "learning_rate": 3.258466507519213e-05, + "loss": 0.047, + "step": 554 + }, + { + "epoch": 1.0693975903614459, + "grad_norm": 0.19405829906463623, + "learning_rate": 3.254977696635366e-05, + "loss": 0.0314, + "step": 555 + }, + { + "epoch": 1.0713253012048192, + "grad_norm": 0.14563389122486115, + "learning_rate": 3.2514825753164774e-05, + "loss": 0.0046, + "step": 556 + }, + { + "epoch": 1.0732530120481927, + "grad_norm": 0.2642340064048767, + "learning_rate": 3.247981161137153e-05, + "loss": 0.022, + "step": 557 + }, + { + "epoch": 1.0751807228915662, + "grad_norm": 0.17274761199951172, + "learning_rate": 3.2444734717036386e-05, + "loss": 0.0134, + "step": 558 + }, + { + "epoch": 1.0771084337349397, + "grad_norm": 0.44354626536369324, + "learning_rate": 3.240959524653735e-05, + "loss": 0.0211, + "step": 559 + }, + { + "epoch": 1.0790361445783132, + "grad_norm": 0.2806888818740845, + "learning_rate": 3.237439337656708e-05, + "loss": 0.0141, + "step": 560 + }, + { + "epoch": 1.0809638554216867, + "grad_norm": 0.21679501235485077, + "learning_rate": 3.2339129284131994e-05, + "loss": 0.019, + "step": 561 + }, + { + "epoch": 1.0828915662650602, + "grad_norm": 0.3040260076522827, + "learning_rate": 3.2303803146551386e-05, + "loss": 0.0249, + "step": 562 + }, + { + "epoch": 1.0848192771084337, + "grad_norm": 0.2793775200843811, + "learning_rate": 3.226841514145656e-05, + "loss": 0.0088, + "step": 563 + }, + { + "epoch": 1.0867469879518072, + "grad_norm": 0.149955615401268, + "learning_rate": 3.223296544678987e-05, + "loss": 0.0054, + "step": 564 + }, + { + "epoch": 1.0886746987951808, + "grad_norm": 0.22166767716407776, + "learning_rate": 3.219745424080389e-05, + "loss": 0.0109, + "step": 565 + }, + { + "epoch": 1.0906024096385543, + "grad_norm": 0.22399431467056274, + "learning_rate": 3.2161881702060476e-05, + "loss": 0.0106, + "step": 566 + }, + { + "epoch": 1.0925301204819278, + "grad_norm": 0.18537986278533936, + "learning_rate": 3.2126248009429905e-05, + "loss": 0.0077, + "step": 567 + }, + { + "epoch": 1.0944578313253013, + "grad_norm": 0.24511495232582092, + "learning_rate": 3.2090553342089935e-05, + "loss": 0.0093, + "step": 568 + }, + { + "epoch": 1.0963855421686748, + "grad_norm": 0.4766045808792114, + "learning_rate": 3.205479787952494e-05, + "loss": 0.036, + "step": 569 + }, + { + "epoch": 1.0983132530120483, + "grad_norm": 0.1425715535879135, + "learning_rate": 3.201898180152499e-05, + "loss": 0.0085, + "step": 570 + }, + { + "epoch": 1.1002409638554216, + "grad_norm": 0.1909666359424591, + "learning_rate": 3.1983105288184945e-05, + "loss": 0.0081, + "step": 571 + }, + { + "epoch": 1.102168674698795, + "grad_norm": 0.44077104330062866, + "learning_rate": 3.194716851990355e-05, + "loss": 0.017, + "step": 572 + }, + { + "epoch": 1.1040963855421686, + "grad_norm": 0.5757400989532471, + "learning_rate": 3.191117167738253e-05, + "loss": 0.021, + "step": 573 + }, + { + "epoch": 1.106024096385542, + "grad_norm": 0.1977701038122177, + "learning_rate": 3.1875114941625705e-05, + "loss": 0.0096, + "step": 574 + }, + { + "epoch": 1.1079518072289156, + "grad_norm": 0.3524581491947174, + "learning_rate": 3.1838998493938026e-05, + "loss": 0.0118, + "step": 575 + }, + { + "epoch": 1.1098795180722891, + "grad_norm": 0.3301331698894501, + "learning_rate": 3.180282251592472e-05, + "loss": 0.0094, + "step": 576 + }, + { + "epoch": 1.1118072289156626, + "grad_norm": 0.2774488925933838, + "learning_rate": 3.1766587189490336e-05, + "loss": 0.0131, + "step": 577 + }, + { + "epoch": 1.1137349397590361, + "grad_norm": 1.732595443725586, + "learning_rate": 3.173029269683785e-05, + "loss": 0.0445, + "step": 578 + }, + { + "epoch": 1.1156626506024097, + "grad_norm": 0.28746843338012695, + "learning_rate": 3.169393922046776e-05, + "loss": 0.0116, + "step": 579 + }, + { + "epoch": 1.1175903614457832, + "grad_norm": 0.2952995002269745, + "learning_rate": 3.165752694317713e-05, + "loss": 0.0116, + "step": 580 + }, + { + "epoch": 1.1195180722891567, + "grad_norm": 0.2938575744628906, + "learning_rate": 3.16210560480587e-05, + "loss": 0.013, + "step": 581 + }, + { + "epoch": 1.1214457831325302, + "grad_norm": 0.22283495962619781, + "learning_rate": 3.158452671849998e-05, + "loss": 0.0052, + "step": 582 + }, + { + "epoch": 1.1233734939759037, + "grad_norm": 0.6272858381271362, + "learning_rate": 3.154793913818226e-05, + "loss": 0.0182, + "step": 583 + }, + { + "epoch": 1.1253012048192772, + "grad_norm": 0.479753702878952, + "learning_rate": 3.1511293491079804e-05, + "loss": 0.0146, + "step": 584 + }, + { + "epoch": 1.1272289156626507, + "grad_norm": 0.31104400753974915, + "learning_rate": 3.1474589961458786e-05, + "loss": 0.0139, + "step": 585 + }, + { + "epoch": 1.129156626506024, + "grad_norm": 0.4932832419872284, + "learning_rate": 3.1437828733876477e-05, + "loss": 0.0236, + "step": 586 + }, + { + "epoch": 1.1310843373493975, + "grad_norm": 0.222808837890625, + "learning_rate": 3.140100999318025e-05, + "loss": 0.0084, + "step": 587 + }, + { + "epoch": 1.133012048192771, + "grad_norm": 0.4515356719493866, + "learning_rate": 3.136413392450668e-05, + "loss": 0.0215, + "step": 588 + }, + { + "epoch": 1.1349397590361445, + "grad_norm": 0.39302268624305725, + "learning_rate": 3.132720071328061e-05, + "loss": 0.0154, + "step": 589 + }, + { + "epoch": 1.136867469879518, + "grad_norm": 0.43382835388183594, + "learning_rate": 3.1290210545214205e-05, + "loss": 0.0088, + "step": 590 + }, + { + "epoch": 1.1387951807228915, + "grad_norm": 0.18707136809825897, + "learning_rate": 3.125316360630602e-05, + "loss": 0.0126, + "step": 591 + }, + { + "epoch": 1.140722891566265, + "grad_norm": 0.5688219666481018, + "learning_rate": 3.121606008284011e-05, + "loss": 0.0147, + "step": 592 + }, + { + "epoch": 1.1426506024096386, + "grad_norm": 0.3321833312511444, + "learning_rate": 3.1178900161385005e-05, + "loss": 0.0119, + "step": 593 + }, + { + "epoch": 1.144578313253012, + "grad_norm": 0.3738424777984619, + "learning_rate": 3.114168402879286e-05, + "loss": 0.0158, + "step": 594 + }, + { + "epoch": 1.1465060240963856, + "grad_norm": 0.2386978417634964, + "learning_rate": 3.110441187219846e-05, + "loss": 0.0107, + "step": 595 + }, + { + "epoch": 1.148433734939759, + "grad_norm": 0.2165699452161789, + "learning_rate": 3.10670838790183e-05, + "loss": 0.0079, + "step": 596 + }, + { + "epoch": 1.1503614457831326, + "grad_norm": 0.25952696800231934, + "learning_rate": 3.102970023694965e-05, + "loss": 0.0147, + "step": 597 + }, + { + "epoch": 1.152289156626506, + "grad_norm": 0.21448305249214172, + "learning_rate": 3.099226113396959e-05, + "loss": 0.0099, + "step": 598 + }, + { + "epoch": 1.1542168674698796, + "grad_norm": 0.37226060032844543, + "learning_rate": 3.095476675833405e-05, + "loss": 0.0214, + "step": 599 + }, + { + "epoch": 1.1561445783132531, + "grad_norm": 0.29637983441352844, + "learning_rate": 3.0917217298576955e-05, + "loss": 0.0118, + "step": 600 + }, + { + "epoch": 1.1580722891566264, + "grad_norm": 0.18535609543323517, + "learning_rate": 3.0879612943509154e-05, + "loss": 0.0086, + "step": 601 + }, + { + "epoch": 1.16, + "grad_norm": 0.25874125957489014, + "learning_rate": 3.0841953882217536e-05, + "loss": 0.0088, + "step": 602 + }, + { + "epoch": 1.1619277108433734, + "grad_norm": 0.46092745661735535, + "learning_rate": 3.08042403040641e-05, + "loss": 0.0241, + "step": 603 + }, + { + "epoch": 1.163855421686747, + "grad_norm": 0.27023249864578247, + "learning_rate": 3.076647239868494e-05, + "loss": 0.0154, + "step": 604 + }, + { + "epoch": 1.1657831325301204, + "grad_norm": 0.445157527923584, + "learning_rate": 3.072865035598933e-05, + "loss": 0.0197, + "step": 605 + }, + { + "epoch": 1.167710843373494, + "grad_norm": 0.18097272515296936, + "learning_rate": 3.06907743661588e-05, + "loss": 0.0093, + "step": 606 + }, + { + "epoch": 1.1696385542168675, + "grad_norm": 0.22469942271709442, + "learning_rate": 3.065284461964609e-05, + "loss": 0.0171, + "step": 607 + }, + { + "epoch": 1.171566265060241, + "grad_norm": 0.20190906524658203, + "learning_rate": 3.061486130717428e-05, + "loss": 0.008, + "step": 608 + }, + { + "epoch": 1.1734939759036145, + "grad_norm": 0.18294145166873932, + "learning_rate": 3.057682461973579e-05, + "loss": 0.0155, + "step": 609 + }, + { + "epoch": 1.175421686746988, + "grad_norm": 0.34203943610191345, + "learning_rate": 3.053873474859143e-05, + "loss": 0.0212, + "step": 610 + }, + { + "epoch": 1.1773493975903615, + "grad_norm": 0.49073582887649536, + "learning_rate": 3.050059188526942e-05, + "loss": 0.019, + "step": 611 + }, + { + "epoch": 1.179277108433735, + "grad_norm": 0.3537680506706238, + "learning_rate": 3.046239622156446e-05, + "loss": 0.0147, + "step": 612 + }, + { + "epoch": 1.1812048192771085, + "grad_norm": 0.2584632635116577, + "learning_rate": 3.042414794953674e-05, + "loss": 0.0088, + "step": 613 + }, + { + "epoch": 1.1831325301204818, + "grad_norm": 0.3529360890388489, + "learning_rate": 3.0385847261510975e-05, + "loss": 0.0187, + "step": 614 + }, + { + "epoch": 1.1850602409638555, + "grad_norm": 0.3331570327281952, + "learning_rate": 3.0347494350075465e-05, + "loss": 0.0124, + "step": 615 + }, + { + "epoch": 1.1869879518072288, + "grad_norm": 0.2223527580499649, + "learning_rate": 3.0309089408081074e-05, + "loss": 0.01, + "step": 616 + }, + { + "epoch": 1.1889156626506023, + "grad_norm": 0.21985746920108795, + "learning_rate": 3.027063262864032e-05, + "loss": 0.0087, + "step": 617 + }, + { + "epoch": 1.1908433734939758, + "grad_norm": 0.2989653944969177, + "learning_rate": 3.023212420512637e-05, + "loss": 0.0137, + "step": 618 + }, + { + "epoch": 1.1927710843373494, + "grad_norm": 0.17423275113105774, + "learning_rate": 3.0193564331172074e-05, + "loss": 0.0056, + "step": 619 + }, + { + "epoch": 1.1946987951807229, + "grad_norm": 1.0992127656936646, + "learning_rate": 3.0154953200668976e-05, + "loss": 0.0274, + "step": 620 + }, + { + "epoch": 1.1966265060240964, + "grad_norm": 0.21641989052295685, + "learning_rate": 3.011629100776638e-05, + "loss": 0.0151, + "step": 621 + }, + { + "epoch": 1.1985542168674699, + "grad_norm": 0.4558199644088745, + "learning_rate": 3.007757794687033e-05, + "loss": 0.0424, + "step": 622 + }, + { + "epoch": 1.2004819277108434, + "grad_norm": 0.42380189895629883, + "learning_rate": 3.003881421264266e-05, + "loss": 0.0079, + "step": 623 + }, + { + "epoch": 1.202409638554217, + "grad_norm": 0.28791171312332153, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0142, + "step": 624 + }, + { + "epoch": 1.2043373493975904, + "grad_norm": 0.3906581997871399, + "learning_rate": 2.996113550411281e-05, + "loss": 0.0251, + "step": 625 + }, + { + "epoch": 1.206265060240964, + "grad_norm": 0.47848746180534363, + "learning_rate": 2.9922220920404375e-05, + "loss": 0.0137, + "step": 626 + }, + { + "epoch": 1.2081927710843374, + "grad_norm": 0.22666941583156586, + "learning_rate": 2.9883256444549862e-05, + "loss": 0.0105, + "step": 627 + }, + { + "epoch": 1.210120481927711, + "grad_norm": 0.18968136608600616, + "learning_rate": 2.984424227247529e-05, + "loss": 0.0089, + "step": 628 + }, + { + "epoch": 1.2120481927710842, + "grad_norm": 0.28732606768608093, + "learning_rate": 2.980517860035656e-05, + "loss": 0.0253, + "step": 629 + }, + { + "epoch": 1.213975903614458, + "grad_norm": 0.21131543815135956, + "learning_rate": 2.9766065624618518e-05, + "loss": 0.0134, + "step": 630 + }, + { + "epoch": 1.2159036144578312, + "grad_norm": 0.7594877481460571, + "learning_rate": 2.972690354193388e-05, + "loss": 0.0157, + "step": 631 + }, + { + "epoch": 1.2178313253012047, + "grad_norm": 0.730291485786438, + "learning_rate": 2.96876925492223e-05, + "loss": 0.0204, + "step": 632 + }, + { + "epoch": 1.2197590361445783, + "grad_norm": 0.20333674550056458, + "learning_rate": 2.9648432843649382e-05, + "loss": 0.0114, + "step": 633 + }, + { + "epoch": 1.2216867469879518, + "grad_norm": 0.5680793523788452, + "learning_rate": 2.960912462262566e-05, + "loss": 0.0146, + "step": 634 + }, + { + "epoch": 1.2236144578313253, + "grad_norm": 0.4591079354286194, + "learning_rate": 2.9569768083805618e-05, + "loss": 0.0112, + "step": 635 + }, + { + "epoch": 1.2255421686746988, + "grad_norm": 0.3793511390686035, + "learning_rate": 2.953036342508671e-05, + "loss": 0.0377, + "step": 636 + }, + { + "epoch": 1.2274698795180723, + "grad_norm": 1.118723750114441, + "learning_rate": 2.9490910844608346e-05, + "loss": 0.0432, + "step": 637 + }, + { + "epoch": 1.2293975903614458, + "grad_norm": 0.36990776658058167, + "learning_rate": 2.9451410540750887e-05, + "loss": 0.0203, + "step": 638 + }, + { + "epoch": 1.2313253012048193, + "grad_norm": 0.930397629737854, + "learning_rate": 2.94118627121347e-05, + "loss": 0.0311, + "step": 639 + }, + { + "epoch": 1.2332530120481928, + "grad_norm": 0.2347625195980072, + "learning_rate": 2.9372267557619075e-05, + "loss": 0.0168, + "step": 640 + }, + { + "epoch": 1.2351807228915663, + "grad_norm": 0.3720332384109497, + "learning_rate": 2.933262527630131e-05, + "loss": 0.0136, + "step": 641 + }, + { + "epoch": 1.2371084337349398, + "grad_norm": 0.4871984124183655, + "learning_rate": 2.929293606751565e-05, + "loss": 0.0339, + "step": 642 + }, + { + "epoch": 1.2390361445783133, + "grad_norm": 0.35853689908981323, + "learning_rate": 2.9253200130832322e-05, + "loss": 0.0095, + "step": 643 + }, + { + "epoch": 1.2409638554216866, + "grad_norm": 0.42003703117370605, + "learning_rate": 2.92134176660565e-05, + "loss": 0.0142, + "step": 644 + }, + { + "epoch": 1.2428915662650604, + "grad_norm": 0.3854500651359558, + "learning_rate": 2.9173588873227338e-05, + "loss": 0.0209, + "step": 645 + }, + { + "epoch": 1.2448192771084337, + "grad_norm": 0.24665917456150055, + "learning_rate": 2.913371395261691e-05, + "loss": 0.0087, + "step": 646 + }, + { + "epoch": 1.2467469879518072, + "grad_norm": 0.41571593284606934, + "learning_rate": 2.9093793104729268e-05, + "loss": 0.0164, + "step": 647 + }, + { + "epoch": 1.2486746987951807, + "grad_norm": 0.4597891569137573, + "learning_rate": 2.9053826530299377e-05, + "loss": 0.0138, + "step": 648 + }, + { + "epoch": 1.2506024096385542, + "grad_norm": 0.43345385789871216, + "learning_rate": 2.901381443029215e-05, + "loss": 0.0353, + "step": 649 + }, + { + "epoch": 1.2525301204819277, + "grad_norm": 0.3706768751144409, + "learning_rate": 2.897375700590141e-05, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 1.2544578313253012, + "grad_norm": 0.30305296182632446, + "learning_rate": 2.8933654458548873e-05, + "loss": 0.0123, + "step": 651 + }, + { + "epoch": 1.2563855421686747, + "grad_norm": 0.2042127549648285, + "learning_rate": 2.8893506989883167e-05, + "loss": 0.0099, + "step": 652 + }, + { + "epoch": 1.2583132530120482, + "grad_norm": 0.20524422824382782, + "learning_rate": 2.8853314801778784e-05, + "loss": 0.0097, + "step": 653 + }, + { + "epoch": 1.2602409638554217, + "grad_norm": 0.2351921945810318, + "learning_rate": 2.8813078096335093e-05, + "loss": 0.0091, + "step": 654 + }, + { + "epoch": 1.2621686746987952, + "grad_norm": 0.34547340869903564, + "learning_rate": 2.87727970758753e-05, + "loss": 0.0088, + "step": 655 + }, + { + "epoch": 1.2640963855421687, + "grad_norm": 0.35163217782974243, + "learning_rate": 2.8732471942945443e-05, + "loss": 0.0145, + "step": 656 + }, + { + "epoch": 1.266024096385542, + "grad_norm": 1.715137243270874, + "learning_rate": 2.8692102900313378e-05, + "loss": 0.0198, + "step": 657 + }, + { + "epoch": 1.2679518072289158, + "grad_norm": 0.2860178053379059, + "learning_rate": 2.8651690150967748e-05, + "loss": 0.0085, + "step": 658 + }, + { + "epoch": 1.269879518072289, + "grad_norm": 0.21175967156887054, + "learning_rate": 2.8611233898116967e-05, + "loss": 0.0071, + "step": 659 + }, + { + "epoch": 1.2718072289156628, + "grad_norm": 0.33726972341537476, + "learning_rate": 2.85707343451882e-05, + "loss": 0.012, + "step": 660 + }, + { + "epoch": 1.273734939759036, + "grad_norm": 0.2138456553220749, + "learning_rate": 2.853019169582635e-05, + "loss": 0.0092, + "step": 661 + }, + { + "epoch": 1.2756626506024096, + "grad_norm": 0.2304934412240982, + "learning_rate": 2.8489606153892997e-05, + "loss": 0.0144, + "step": 662 + }, + { + "epoch": 1.277590361445783, + "grad_norm": 0.2691061794757843, + "learning_rate": 2.8448977923465425e-05, + "loss": 0.0121, + "step": 663 + }, + { + "epoch": 1.2795180722891566, + "grad_norm": 0.35254305601119995, + "learning_rate": 2.840830720883555e-05, + "loss": 0.0125, + "step": 664 + }, + { + "epoch": 1.28144578313253, + "grad_norm": 0.36552608013153076, + "learning_rate": 2.836759421450893e-05, + "loss": 0.021, + "step": 665 + }, + { + "epoch": 1.2833734939759036, + "grad_norm": 0.37177154421806335, + "learning_rate": 2.83268391452037e-05, + "loss": 0.0216, + "step": 666 + }, + { + "epoch": 1.2853012048192771, + "grad_norm": 0.20932547748088837, + "learning_rate": 2.828604220584958e-05, + "loss": 0.0077, + "step": 667 + }, + { + "epoch": 1.2872289156626506, + "grad_norm": 0.5158557295799255, + "learning_rate": 2.824520360158681e-05, + "loss": 0.0394, + "step": 668 + }, + { + "epoch": 1.2891566265060241, + "grad_norm": 0.22623969614505768, + "learning_rate": 2.820432353776515e-05, + "loss": 0.0087, + "step": 669 + }, + { + "epoch": 1.2910843373493976, + "grad_norm": 0.2996046245098114, + "learning_rate": 2.8163402219942822e-05, + "loss": 0.01, + "step": 670 + }, + { + "epoch": 1.2930120481927712, + "grad_norm": 0.24957989156246185, + "learning_rate": 2.8122439853885488e-05, + "loss": 0.0127, + "step": 671 + }, + { + "epoch": 1.2949397590361444, + "grad_norm": 0.2636559307575226, + "learning_rate": 2.8081436645565216e-05, + "loss": 0.0128, + "step": 672 + }, + { + "epoch": 1.2968674698795182, + "grad_norm": 0.3531591296195984, + "learning_rate": 2.804039280115944e-05, + "loss": 0.0199, + "step": 673 + }, + { + "epoch": 1.2987951807228915, + "grad_norm": 0.3682299852371216, + "learning_rate": 2.7999308527049927e-05, + "loss": 0.0088, + "step": 674 + }, + { + "epoch": 1.3007228915662652, + "grad_norm": 0.19555217027664185, + "learning_rate": 2.795818402982174e-05, + "loss": 0.0084, + "step": 675 + }, + { + "epoch": 1.3026506024096385, + "grad_norm": 0.2864912450313568, + "learning_rate": 2.7917019516262186e-05, + "loss": 0.0154, + "step": 676 + }, + { + "epoch": 1.304578313253012, + "grad_norm": 0.2211237996816635, + "learning_rate": 2.78758151933598e-05, + "loss": 0.0078, + "step": 677 + }, + { + "epoch": 1.3065060240963855, + "grad_norm": 0.13646945357322693, + "learning_rate": 2.7834571268303294e-05, + "loss": 0.0058, + "step": 678 + }, + { + "epoch": 1.308433734939759, + "grad_norm": 0.16530285775661469, + "learning_rate": 2.779328794848049e-05, + "loss": 0.007, + "step": 679 + }, + { + "epoch": 1.3103614457831325, + "grad_norm": 0.2145693302154541, + "learning_rate": 2.7751965441477325e-05, + "loss": 0.0203, + "step": 680 + }, + { + "epoch": 1.312289156626506, + "grad_norm": 0.24273739755153656, + "learning_rate": 2.771060395507677e-05, + "loss": 0.0106, + "step": 681 + }, + { + "epoch": 1.3142168674698795, + "grad_norm": 0.20430618524551392, + "learning_rate": 2.7669203697257794e-05, + "loss": 0.0122, + "step": 682 + }, + { + "epoch": 1.316144578313253, + "grad_norm": 0.2502615749835968, + "learning_rate": 2.7627764876194335e-05, + "loss": 0.0101, + "step": 683 + }, + { + "epoch": 1.3180722891566266, + "grad_norm": 0.287239670753479, + "learning_rate": 2.7586287700254214e-05, + "loss": 0.0203, + "step": 684 + }, + { + "epoch": 1.32, + "grad_norm": 0.16239754855632782, + "learning_rate": 2.7544772377998147e-05, + "loss": 0.0084, + "step": 685 + }, + { + "epoch": 1.3219277108433736, + "grad_norm": 0.27174142003059387, + "learning_rate": 2.7503219118178636e-05, + "loss": 0.008, + "step": 686 + }, + { + "epoch": 1.3238554216867469, + "grad_norm": 0.12878240644931793, + "learning_rate": 2.7461628129738954e-05, + "loss": 0.0053, + "step": 687 + }, + { + "epoch": 1.3257831325301206, + "grad_norm": 0.16112515330314636, + "learning_rate": 2.7419999621812086e-05, + "loss": 0.0059, + "step": 688 + }, + { + "epoch": 1.3277108433734939, + "grad_norm": 0.2398834228515625, + "learning_rate": 2.7378333803719672e-05, + "loss": 0.0095, + "step": 689 + }, + { + "epoch": 1.3296385542168676, + "grad_norm": 0.18516193330287933, + "learning_rate": 2.733663088497097e-05, + "loss": 0.0071, + "step": 690 + }, + { + "epoch": 1.331566265060241, + "grad_norm": 0.2974924147129059, + "learning_rate": 2.7294891075261785e-05, + "loss": 0.0227, + "step": 691 + }, + { + "epoch": 1.3334939759036144, + "grad_norm": 0.12931054830551147, + "learning_rate": 2.7253114584473418e-05, + "loss": 0.0039, + "step": 692 + }, + { + "epoch": 1.335421686746988, + "grad_norm": 0.16319474577903748, + "learning_rate": 2.7211301622671623e-05, + "loss": 0.008, + "step": 693 + }, + { + "epoch": 1.3373493975903614, + "grad_norm": 0.27622169256210327, + "learning_rate": 2.7169452400105533e-05, + "loss": 0.0238, + "step": 694 + }, + { + "epoch": 1.339277108433735, + "grad_norm": 0.45309779047966003, + "learning_rate": 2.712756712720663e-05, + "loss": 0.0439, + "step": 695 + }, + { + "epoch": 1.3412048192771084, + "grad_norm": 0.2469855099916458, + "learning_rate": 2.708564601458765e-05, + "loss": 0.0085, + "step": 696 + }, + { + "epoch": 1.343132530120482, + "grad_norm": 0.4245856702327728, + "learning_rate": 2.7043689273041535e-05, + "loss": 0.0097, + "step": 697 + }, + { + "epoch": 1.3450602409638555, + "grad_norm": 0.26796087622642517, + "learning_rate": 2.7001697113540414e-05, + "loss": 0.0119, + "step": 698 + }, + { + "epoch": 1.346987951807229, + "grad_norm": 0.3569283187389374, + "learning_rate": 2.6959669747234482e-05, + "loss": 0.0096, + "step": 699 + }, + { + "epoch": 1.3489156626506025, + "grad_norm": 0.7038524150848389, + "learning_rate": 2.6917607385450973e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 1.350843373493976, + "grad_norm": 0.23568563163280487, + "learning_rate": 2.687551023969308e-05, + "loss": 0.0112, + "step": 701 + }, + { + "epoch": 1.3527710843373493, + "grad_norm": 0.20338499546051025, + "learning_rate": 2.6833378521638935e-05, + "loss": 0.0092, + "step": 702 + }, + { + "epoch": 1.354698795180723, + "grad_norm": 4.22187614440918, + "learning_rate": 2.679121244314046e-05, + "loss": 0.0314, + "step": 703 + }, + { + "epoch": 1.3566265060240963, + "grad_norm": 0.2542206048965454, + "learning_rate": 2.674901221622239e-05, + "loss": 0.0158, + "step": 704 + }, + { + "epoch": 1.3585542168674698, + "grad_norm": 0.49705010652542114, + "learning_rate": 2.670677805308116e-05, + "loss": 0.0162, + "step": 705 + }, + { + "epoch": 1.3604819277108433, + "grad_norm": 0.17502115666866302, + "learning_rate": 2.666451016608383e-05, + "loss": 0.0074, + "step": 706 + }, + { + "epoch": 1.3624096385542168, + "grad_norm": 0.21738742291927338, + "learning_rate": 2.6622208767767075e-05, + "loss": 0.0135, + "step": 707 + }, + { + "epoch": 1.3643373493975903, + "grad_norm": 0.3309847414493561, + "learning_rate": 2.6579874070836032e-05, + "loss": 0.0107, + "step": 708 + }, + { + "epoch": 1.3662650602409638, + "grad_norm": 0.10706827789545059, + "learning_rate": 2.6537506288163303e-05, + "loss": 0.0043, + "step": 709 + }, + { + "epoch": 1.3681927710843373, + "grad_norm": 0.173640176653862, + "learning_rate": 2.6495105632787835e-05, + "loss": 0.0092, + "step": 710 + }, + { + "epoch": 1.3701204819277109, + "grad_norm": 0.2636397182941437, + "learning_rate": 2.6452672317913893e-05, + "loss": 0.0097, + "step": 711 + }, + { + "epoch": 1.3720481927710844, + "grad_norm": 0.28485360741615295, + "learning_rate": 2.6410206556909943e-05, + "loss": 0.0193, + "step": 712 + }, + { + "epoch": 1.3739759036144579, + "grad_norm": 0.23210027813911438, + "learning_rate": 2.636770856330761e-05, + "loss": 0.0229, + "step": 713 + }, + { + "epoch": 1.3759036144578314, + "grad_norm": 0.13388316333293915, + "learning_rate": 2.6325178550800596e-05, + "loss": 0.004, + "step": 714 + }, + { + "epoch": 1.377831325301205, + "grad_norm": 0.5131422877311707, + "learning_rate": 2.6282616733243603e-05, + "loss": 0.0137, + "step": 715 + }, + { + "epoch": 1.3797590361445784, + "grad_norm": 0.3243267834186554, + "learning_rate": 2.6240023324651258e-05, + "loss": 0.0153, + "step": 716 + }, + { + "epoch": 1.3816867469879517, + "grad_norm": 0.1440611034631729, + "learning_rate": 2.619739853919704e-05, + "loss": 0.0031, + "step": 717 + }, + { + "epoch": 1.3836144578313254, + "grad_norm": 0.30346596240997314, + "learning_rate": 2.6154742591212196e-05, + "loss": 0.0109, + "step": 718 + }, + { + "epoch": 1.3855421686746987, + "grad_norm": 0.19109240174293518, + "learning_rate": 2.611205569518468e-05, + "loss": 0.0094, + "step": 719 + }, + { + "epoch": 1.3874698795180722, + "grad_norm": 0.28636518120765686, + "learning_rate": 2.6069338065758056e-05, + "loss": 0.0123, + "step": 720 + }, + { + "epoch": 1.3893975903614457, + "grad_norm": 0.28083911538124084, + "learning_rate": 2.6026589917730416e-05, + "loss": 0.0104, + "step": 721 + }, + { + "epoch": 1.3913253012048192, + "grad_norm": 0.36553966999053955, + "learning_rate": 2.5983811466053327e-05, + "loss": 0.0143, + "step": 722 + }, + { + "epoch": 1.3932530120481927, + "grad_norm": 0.23317205905914307, + "learning_rate": 2.5941002925830708e-05, + "loss": 0.011, + "step": 723 + }, + { + "epoch": 1.3951807228915662, + "grad_norm": 0.3825171887874603, + "learning_rate": 2.589816451231781e-05, + "loss": 0.0098, + "step": 724 + }, + { + "epoch": 1.3971084337349398, + "grad_norm": 0.19916608929634094, + "learning_rate": 2.585529644092006e-05, + "loss": 0.0094, + "step": 725 + }, + { + "epoch": 1.3990361445783133, + "grad_norm": 0.19990523159503937, + "learning_rate": 2.5812398927192027e-05, + "loss": 0.0128, + "step": 726 + }, + { + "epoch": 1.4009638554216868, + "grad_norm": 0.34662899374961853, + "learning_rate": 2.5769472186836347e-05, + "loss": 0.0091, + "step": 727 + }, + { + "epoch": 1.4028915662650603, + "grad_norm": 0.23481112718582153, + "learning_rate": 2.5726516435702583e-05, + "loss": 0.0154, + "step": 728 + }, + { + "epoch": 1.4048192771084338, + "grad_norm": 0.1846667379140854, + "learning_rate": 2.5683531889786194e-05, + "loss": 0.0088, + "step": 729 + }, + { + "epoch": 1.4067469879518073, + "grad_norm": 0.16717663407325745, + "learning_rate": 2.564051876522742e-05, + "loss": 0.0083, + "step": 730 + }, + { + "epoch": 1.4086746987951808, + "grad_norm": 0.4116475284099579, + "learning_rate": 2.5597477278310202e-05, + "loss": 0.0179, + "step": 731 + }, + { + "epoch": 1.410602409638554, + "grad_norm": 0.171807661652565, + "learning_rate": 2.5554407645461115e-05, + "loss": 0.0063, + "step": 732 + }, + { + "epoch": 1.4125301204819278, + "grad_norm": 0.1954439878463745, + "learning_rate": 2.5511310083248243e-05, + "loss": 0.017, + "step": 733 + }, + { + "epoch": 1.4144578313253011, + "grad_norm": 0.37158989906311035, + "learning_rate": 2.5468184808380104e-05, + "loss": 0.0173, + "step": 734 + }, + { + "epoch": 1.4163855421686746, + "grad_norm": 0.2001633644104004, + "learning_rate": 2.542503203770458e-05, + "loss": 0.0165, + "step": 735 + }, + { + "epoch": 1.4183132530120481, + "grad_norm": 0.45673373341560364, + "learning_rate": 2.53818519882078e-05, + "loss": 0.0185, + "step": 736 + }, + { + "epoch": 1.4202409638554216, + "grad_norm": 0.3838701546192169, + "learning_rate": 2.5338644877013067e-05, + "loss": 0.0134, + "step": 737 + }, + { + "epoch": 1.4221686746987952, + "grad_norm": 0.32032477855682373, + "learning_rate": 2.5295410921379745e-05, + "loss": 0.0143, + "step": 738 + }, + { + "epoch": 1.4240963855421687, + "grad_norm": 0.4594039022922516, + "learning_rate": 2.52521503387022e-05, + "loss": 0.0193, + "step": 739 + }, + { + "epoch": 1.4260240963855422, + "grad_norm": 0.3889620900154114, + "learning_rate": 2.5208863346508667e-05, + "loss": 0.0114, + "step": 740 + }, + { + "epoch": 1.4279518072289157, + "grad_norm": 0.33153319358825684, + "learning_rate": 2.5165550162460203e-05, + "loss": 0.0102, + "step": 741 + }, + { + "epoch": 1.4298795180722892, + "grad_norm": 0.7269518375396729, + "learning_rate": 2.5122211004349536e-05, + "loss": 0.0215, + "step": 742 + }, + { + "epoch": 1.4318072289156627, + "grad_norm": 0.31653261184692383, + "learning_rate": 2.5078846090100023e-05, + "loss": 0.0115, + "step": 743 + }, + { + "epoch": 1.4337349397590362, + "grad_norm": 0.20620353519916534, + "learning_rate": 2.5035455637764518e-05, + "loss": 0.0153, + "step": 744 + }, + { + "epoch": 1.4356626506024097, + "grad_norm": 0.17266008257865906, + "learning_rate": 2.4992039865524297e-05, + "loss": 0.0069, + "step": 745 + }, + { + "epoch": 1.4375903614457832, + "grad_norm": 0.24760811030864716, + "learning_rate": 2.494859899168795e-05, + "loss": 0.0108, + "step": 746 + }, + { + "epoch": 1.4395180722891565, + "grad_norm": 0.2584865391254425, + "learning_rate": 2.4905133234690282e-05, + "loss": 0.0095, + "step": 747 + }, + { + "epoch": 1.4414457831325302, + "grad_norm": 0.48847514390945435, + "learning_rate": 2.486164281309122e-05, + "loss": 0.0181, + "step": 748 + }, + { + "epoch": 1.4433734939759035, + "grad_norm": 0.42942047119140625, + "learning_rate": 2.4818127945574717e-05, + "loss": 0.025, + "step": 749 + }, + { + "epoch": 1.445301204819277, + "grad_norm": 0.23713800311088562, + "learning_rate": 2.4774588850947648e-05, + "loss": 0.0085, + "step": 750 + }, + { + "epoch": 1.4472289156626506, + "grad_norm": 0.8797569870948792, + "learning_rate": 2.473102574813871e-05, + "loss": 0.0097, + "step": 751 + }, + { + "epoch": 1.449156626506024, + "grad_norm": 0.2744862735271454, + "learning_rate": 2.4687438856197302e-05, + "loss": 0.0122, + "step": 752 + }, + { + "epoch": 1.4510843373493976, + "grad_norm": 0.12747010588645935, + "learning_rate": 2.4643828394292478e-05, + "loss": 0.0056, + "step": 753 + }, + { + "epoch": 1.453012048192771, + "grad_norm": 0.37376829981803894, + "learning_rate": 2.4600194581711775e-05, + "loss": 0.0052, + "step": 754 + }, + { + "epoch": 1.4549397590361446, + "grad_norm": 0.2536911368370056, + "learning_rate": 2.4556537637860176e-05, + "loss": 0.0113, + "step": 755 + }, + { + "epoch": 1.456867469879518, + "grad_norm": 0.25950780510902405, + "learning_rate": 2.451285778225894e-05, + "loss": 0.0099, + "step": 756 + }, + { + "epoch": 1.4587951807228916, + "grad_norm": 0.19535955786705017, + "learning_rate": 2.4469155234544565e-05, + "loss": 0.0069, + "step": 757 + }, + { + "epoch": 1.4607228915662651, + "grad_norm": 0.22816115617752075, + "learning_rate": 2.442543021446764e-05, + "loss": 0.0088, + "step": 758 + }, + { + "epoch": 1.4626506024096386, + "grad_norm": 0.3363986313343048, + "learning_rate": 2.4381682941891755e-05, + "loss": 0.0182, + "step": 759 + }, + { + "epoch": 1.464578313253012, + "grad_norm": 0.21492891013622284, + "learning_rate": 2.4337913636792382e-05, + "loss": 0.0069, + "step": 760 + }, + { + "epoch": 1.4665060240963856, + "grad_norm": 0.6070862412452698, + "learning_rate": 2.429412251925579e-05, + "loss": 0.0406, + "step": 761 + }, + { + "epoch": 1.468433734939759, + "grad_norm": 2.6469690799713135, + "learning_rate": 2.425030980947793e-05, + "loss": 0.0205, + "step": 762 + }, + { + "epoch": 1.4703614457831327, + "grad_norm": 0.30909740924835205, + "learning_rate": 2.420647572776332e-05, + "loss": 0.0136, + "step": 763 + }, + { + "epoch": 1.472289156626506, + "grad_norm": 0.6639553904533386, + "learning_rate": 2.416262049452395e-05, + "loss": 0.011, + "step": 764 + }, + { + "epoch": 1.4742168674698795, + "grad_norm": 0.2919616997241974, + "learning_rate": 2.4118744330278147e-05, + "loss": 0.0131, + "step": 765 + }, + { + "epoch": 1.476144578313253, + "grad_norm": 0.5232429504394531, + "learning_rate": 2.4074847455649523e-05, + "loss": 0.0138, + "step": 766 + }, + { + "epoch": 1.4780722891566265, + "grad_norm": 5.630630970001221, + "learning_rate": 2.403093009136579e-05, + "loss": 0.0264, + "step": 767 + }, + { + "epoch": 1.48, + "grad_norm": 0.33234721422195435, + "learning_rate": 2.3986992458257707e-05, + "loss": 0.0111, + "step": 768 + }, + { + "epoch": 1.4819277108433735, + "grad_norm": 0.28444772958755493, + "learning_rate": 2.3943034777257945e-05, + "loss": 0.0144, + "step": 769 + }, + { + "epoch": 1.483855421686747, + "grad_norm": 0.16229979693889618, + "learning_rate": 2.38990572694e-05, + "loss": 0.0062, + "step": 770 + }, + { + "epoch": 1.4857831325301205, + "grad_norm": 0.27474716305732727, + "learning_rate": 2.385506015581704e-05, + "loss": 0.0172, + "step": 771 + }, + { + "epoch": 1.487710843373494, + "grad_norm": 0.246526300907135, + "learning_rate": 2.381104365774083e-05, + "loss": 0.012, + "step": 772 + }, + { + "epoch": 1.4896385542168675, + "grad_norm": 0.282047837972641, + "learning_rate": 2.37670079965006e-05, + "loss": 0.0116, + "step": 773 + }, + { + "epoch": 1.491566265060241, + "grad_norm": 0.2878139317035675, + "learning_rate": 2.3722953393521944e-05, + "loss": 0.0147, + "step": 774 + }, + { + "epoch": 1.4934939759036143, + "grad_norm": 0.5586277842521667, + "learning_rate": 2.367888007032571e-05, + "loss": 0.0111, + "step": 775 + }, + { + "epoch": 1.495421686746988, + "grad_norm": 0.562160313129425, + "learning_rate": 2.3634788248526846e-05, + "loss": 0.0061, + "step": 776 + }, + { + "epoch": 1.4973493975903613, + "grad_norm": 0.3452005982398987, + "learning_rate": 2.3590678149833356e-05, + "loss": 0.0205, + "step": 777 + }, + { + "epoch": 1.499277108433735, + "grad_norm": 0.7757686376571655, + "learning_rate": 2.3546549996045114e-05, + "loss": 0.0273, + "step": 778 + }, + { + "epoch": 1.5012048192771084, + "grad_norm": 0.19530551135540009, + "learning_rate": 2.3502404009052812e-05, + "loss": 0.0083, + "step": 779 + }, + { + "epoch": 1.503132530120482, + "grad_norm": 0.2586531639099121, + "learning_rate": 2.3458240410836775e-05, + "loss": 0.0122, + "step": 780 + }, + { + "epoch": 1.5050602409638554, + "grad_norm": 0.30063286423683167, + "learning_rate": 2.3414059423465924e-05, + "loss": 0.0083, + "step": 781 + }, + { + "epoch": 1.5069879518072289, + "grad_norm": 0.18663185834884644, + "learning_rate": 2.3369861269096575e-05, + "loss": 0.0104, + "step": 782 + }, + { + "epoch": 1.5089156626506024, + "grad_norm": 0.4405941069126129, + "learning_rate": 2.3325646169971416e-05, + "loss": 0.0264, + "step": 783 + }, + { + "epoch": 1.510843373493976, + "grad_norm": 0.2947913110256195, + "learning_rate": 2.3281414348418294e-05, + "loss": 0.0107, + "step": 784 + }, + { + "epoch": 1.5127710843373494, + "grad_norm": 0.23813778162002563, + "learning_rate": 2.3237166026849158e-05, + "loss": 0.0084, + "step": 785 + }, + { + "epoch": 1.514698795180723, + "grad_norm": 0.33380329608917236, + "learning_rate": 2.3192901427758932e-05, + "loss": 0.0111, + "step": 786 + }, + { + "epoch": 1.5166265060240964, + "grad_norm": 0.3736988306045532, + "learning_rate": 2.314862077372438e-05, + "loss": 0.0135, + "step": 787 + }, + { + "epoch": 1.5185542168674697, + "grad_norm": 0.3785395920276642, + "learning_rate": 2.3104324287402996e-05, + "loss": 0.0265, + "step": 788 + }, + { + "epoch": 1.5204819277108435, + "grad_norm": 0.3359154462814331, + "learning_rate": 2.3060012191531885e-05, + "loss": 0.0127, + "step": 789 + }, + { + "epoch": 1.5224096385542167, + "grad_norm": 0.720753014087677, + "learning_rate": 2.301568470892664e-05, + "loss": 0.0134, + "step": 790 + }, + { + "epoch": 1.5243373493975905, + "grad_norm": 0.36473193764686584, + "learning_rate": 2.297134206248024e-05, + "loss": 0.0318, + "step": 791 + }, + { + "epoch": 1.5262650602409638, + "grad_norm": 0.29987087845802307, + "learning_rate": 2.2926984475161884e-05, + "loss": 0.008, + "step": 792 + }, + { + "epoch": 1.5281927710843375, + "grad_norm": 0.2883112132549286, + "learning_rate": 2.2882612170015914e-05, + "loss": 0.0125, + "step": 793 + }, + { + "epoch": 1.5301204819277108, + "grad_norm": 0.28983229398727417, + "learning_rate": 2.2838225370160682e-05, + "loss": 0.0155, + "step": 794 + }, + { + "epoch": 1.5320481927710843, + "grad_norm": 0.47236886620521545, + "learning_rate": 2.2793824298787414e-05, + "loss": 0.0132, + "step": 795 + }, + { + "epoch": 1.5339759036144578, + "grad_norm": 0.8328865170478821, + "learning_rate": 2.2749409179159104e-05, + "loss": 0.026, + "step": 796 + }, + { + "epoch": 1.5359036144578313, + "grad_norm": 0.3129172623157501, + "learning_rate": 2.2704980234609396e-05, + "loss": 0.0099, + "step": 797 + }, + { + "epoch": 1.5378313253012048, + "grad_norm": 0.22284500300884247, + "learning_rate": 2.2660537688541416e-05, + "loss": 0.009, + "step": 798 + }, + { + "epoch": 1.5397590361445783, + "grad_norm": 0.3346405625343323, + "learning_rate": 2.2616081764426726e-05, + "loss": 0.0077, + "step": 799 + }, + { + "epoch": 1.5416867469879518, + "grad_norm": 0.2923565208911896, + "learning_rate": 2.2571612685804124e-05, + "loss": 0.0119, + "step": 800 + }, + { + "epoch": 1.5436144578313253, + "grad_norm": 0.1921311914920807, + "learning_rate": 2.252713067627857e-05, + "loss": 0.0083, + "step": 801 + }, + { + "epoch": 1.5455421686746988, + "grad_norm": 0.23221106827259064, + "learning_rate": 2.2482635959520044e-05, + "loss": 0.0049, + "step": 802 + }, + { + "epoch": 1.5474698795180721, + "grad_norm": 0.6340724229812622, + "learning_rate": 2.243812875926241e-05, + "loss": 0.0273, + "step": 803 + }, + { + "epoch": 1.5493975903614459, + "grad_norm": 0.2699439823627472, + "learning_rate": 2.2393609299302314e-05, + "loss": 0.0108, + "step": 804 + }, + { + "epoch": 1.5513253012048192, + "grad_norm": 0.2005189210176468, + "learning_rate": 2.2349077803498052e-05, + "loss": 0.0076, + "step": 805 + }, + { + "epoch": 1.5532530120481929, + "grad_norm": 0.39668548107147217, + "learning_rate": 2.230453449576842e-05, + "loss": 0.0135, + "step": 806 + }, + { + "epoch": 1.5551807228915662, + "grad_norm": 0.2406950294971466, + "learning_rate": 2.2259979600091635e-05, + "loss": 0.0094, + "step": 807 + }, + { + "epoch": 1.55710843373494, + "grad_norm": 0.30363157391548157, + "learning_rate": 2.2215413340504158e-05, + "loss": 0.0178, + "step": 808 + }, + { + "epoch": 1.5590361445783132, + "grad_norm": 0.19508181512355804, + "learning_rate": 2.2170835941099605e-05, + "loss": 0.0069, + "step": 809 + }, + { + "epoch": 1.5609638554216867, + "grad_norm": 0.734106719493866, + "learning_rate": 2.2126247626027615e-05, + "loss": 0.0319, + "step": 810 + }, + { + "epoch": 1.5628915662650602, + "grad_norm": 0.2591583728790283, + "learning_rate": 2.208164861949268e-05, + "loss": 0.0168, + "step": 811 + }, + { + "epoch": 1.5648192771084337, + "grad_norm": 0.2386734038591385, + "learning_rate": 2.20370391457531e-05, + "loss": 0.0041, + "step": 812 + }, + { + "epoch": 1.5667469879518072, + "grad_norm": 0.1675218939781189, + "learning_rate": 2.1992419429119764e-05, + "loss": 0.0078, + "step": 813 + }, + { + "epoch": 1.5686746987951807, + "grad_norm": 0.45591506361961365, + "learning_rate": 2.1947789693955097e-05, + "loss": 0.0166, + "step": 814 + }, + { + "epoch": 1.5706024096385542, + "grad_norm": 0.46940621733665466, + "learning_rate": 2.190315016467188e-05, + "loss": 0.0176, + "step": 815 + }, + { + "epoch": 1.5725301204819278, + "grad_norm": 0.2294205278158188, + "learning_rate": 2.1858501065732146e-05, + "loss": 0.0102, + "step": 816 + }, + { + "epoch": 1.5744578313253013, + "grad_norm": 0.28922322392463684, + "learning_rate": 2.181384262164606e-05, + "loss": 0.0111, + "step": 817 + }, + { + "epoch": 1.5763855421686745, + "grad_norm": 0.19650064408779144, + "learning_rate": 2.1769175056970765e-05, + "loss": 0.0076, + "step": 818 + }, + { + "epoch": 1.5783132530120483, + "grad_norm": 0.19538825750350952, + "learning_rate": 2.172449859630927e-05, + "loss": 0.0118, + "step": 819 + }, + { + "epoch": 1.5802409638554216, + "grad_norm": 0.1900389939546585, + "learning_rate": 2.167981346430931e-05, + "loss": 0.0066, + "step": 820 + }, + { + "epoch": 1.5821686746987953, + "grad_norm": 0.21593710780143738, + "learning_rate": 2.1635119885662235e-05, + "loss": 0.0101, + "step": 821 + }, + { + "epoch": 1.5840963855421686, + "grad_norm": 0.2699289321899414, + "learning_rate": 2.159041808510185e-05, + "loss": 0.0118, + "step": 822 + }, + { + "epoch": 1.5860240963855423, + "grad_norm": 0.31867673993110657, + "learning_rate": 2.1545708287403322e-05, + "loss": 0.0122, + "step": 823 + }, + { + "epoch": 1.5879518072289156, + "grad_norm": 0.2862400412559509, + "learning_rate": 2.1500990717382004e-05, + "loss": 0.0216, + "step": 824 + }, + { + "epoch": 1.589879518072289, + "grad_norm": 0.28482481837272644, + "learning_rate": 2.145626559989237e-05, + "loss": 0.0136, + "step": 825 + }, + { + "epoch": 1.5918072289156626, + "grad_norm": 0.2866958975791931, + "learning_rate": 2.1411533159826803e-05, + "loss": 0.0298, + "step": 826 + }, + { + "epoch": 1.5937349397590361, + "grad_norm": 0.39092838764190674, + "learning_rate": 2.1366793622114533e-05, + "loss": 0.0382, + "step": 827 + }, + { + "epoch": 1.5956626506024096, + "grad_norm": 0.16381537914276123, + "learning_rate": 2.1322047211720468e-05, + "loss": 0.0074, + "step": 828 + }, + { + "epoch": 1.5975903614457831, + "grad_norm": 0.22146940231323242, + "learning_rate": 2.1277294153644083e-05, + "loss": 0.0103, + "step": 829 + }, + { + "epoch": 1.5995180722891567, + "grad_norm": 0.2155209183692932, + "learning_rate": 2.123253467291827e-05, + "loss": 0.0095, + "step": 830 + }, + { + "epoch": 1.6014457831325302, + "grad_norm": 0.41510409116744995, + "learning_rate": 2.118776899460822e-05, + "loss": 0.0457, + "step": 831 + }, + { + "epoch": 1.6033734939759037, + "grad_norm": 0.19718150794506073, + "learning_rate": 2.1142997343810293e-05, + "loss": 0.0192, + "step": 832 + }, + { + "epoch": 1.605301204819277, + "grad_norm": 0.40924403071403503, + "learning_rate": 2.1098219945650865e-05, + "loss": 0.0278, + "step": 833 + }, + { + "epoch": 1.6072289156626507, + "grad_norm": 0.18657824397087097, + "learning_rate": 2.105343702528524e-05, + "loss": 0.0076, + "step": 834 + }, + { + "epoch": 1.609156626506024, + "grad_norm": 0.1727641075849533, + "learning_rate": 2.100864880789645e-05, + "loss": 0.0076, + "step": 835 + }, + { + "epoch": 1.6110843373493977, + "grad_norm": 0.18138745427131653, + "learning_rate": 2.0963855518694203e-05, + "loss": 0.005, + "step": 836 + }, + { + "epoch": 1.613012048192771, + "grad_norm": 0.19173955917358398, + "learning_rate": 2.0919057382913675e-05, + "loss": 0.0084, + "step": 837 + }, + { + "epoch": 1.6149397590361447, + "grad_norm": 0.3812403380870819, + "learning_rate": 2.0874254625814435e-05, + "loss": 0.009, + "step": 838 + }, + { + "epoch": 1.616867469879518, + "grad_norm": 0.2009759545326233, + "learning_rate": 2.0829447472679285e-05, + "loss": 0.0098, + "step": 839 + }, + { + "epoch": 1.6187951807228915, + "grad_norm": 0.48703446984291077, + "learning_rate": 2.0784636148813124e-05, + "loss": 0.0099, + "step": 840 + }, + { + "epoch": 1.620722891566265, + "grad_norm": 0.28995075821876526, + "learning_rate": 2.0739820879541827e-05, + "loss": 0.0075, + "step": 841 + }, + { + "epoch": 1.6226506024096385, + "grad_norm": 0.2130059450864792, + "learning_rate": 2.069500189021111e-05, + "loss": 0.007, + "step": 842 + }, + { + "epoch": 1.624578313253012, + "grad_norm": 0.252524733543396, + "learning_rate": 2.0650179406185397e-05, + "loss": 0.0249, + "step": 843 + }, + { + "epoch": 1.6265060240963856, + "grad_norm": 0.23069098591804504, + "learning_rate": 2.060535365284668e-05, + "loss": 0.0084, + "step": 844 + }, + { + "epoch": 1.628433734939759, + "grad_norm": 0.25051403045654297, + "learning_rate": 2.056052485559338e-05, + "loss": 0.0071, + "step": 845 + }, + { + "epoch": 1.6303614457831326, + "grad_norm": 0.27664798498153687, + "learning_rate": 2.051569323983924e-05, + "loss": 0.0198, + "step": 846 + }, + { + "epoch": 1.632289156626506, + "grad_norm": 0.2954922318458557, + "learning_rate": 2.047085903101218e-05, + "loss": 0.006, + "step": 847 + }, + { + "epoch": 1.6342168674698794, + "grad_norm": 0.28477591276168823, + "learning_rate": 2.0426022454553137e-05, + "loss": 0.0147, + "step": 848 + }, + { + "epoch": 1.636144578313253, + "grad_norm": 0.2785305678844452, + "learning_rate": 2.0381183735914968e-05, + "loss": 0.0117, + "step": 849 + }, + { + "epoch": 1.6380722891566264, + "grad_norm": 0.2500309348106384, + "learning_rate": 2.0336343100561295e-05, + "loss": 0.008, + "step": 850 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.18932047486305237, + "learning_rate": 2.0291500773965392e-05, + "loss": 0.0256, + "step": 851 + }, + { + "epoch": 1.6419277108433734, + "grad_norm": 0.6396257877349854, + "learning_rate": 2.0246656981609013e-05, + "loss": 0.0141, + "step": 852 + }, + { + "epoch": 1.6438554216867471, + "grad_norm": 0.5072891116142273, + "learning_rate": 2.02018119489813e-05, + "loss": 0.008, + "step": 853 + }, + { + "epoch": 1.6457831325301204, + "grad_norm": 0.2920839488506317, + "learning_rate": 2.0156965901577635e-05, + "loss": 0.0085, + "step": 854 + }, + { + "epoch": 1.647710843373494, + "grad_norm": 0.1391262263059616, + "learning_rate": 2.011211906489848e-05, + "loss": 0.0078, + "step": 855 + }, + { + "epoch": 1.6496385542168674, + "grad_norm": 0.29620468616485596, + "learning_rate": 2.00672716644483e-05, + "loss": 0.0109, + "step": 856 + }, + { + "epoch": 1.651566265060241, + "grad_norm": 0.13946573436260223, + "learning_rate": 2.002242392573436e-05, + "loss": 0.0076, + "step": 857 + }, + { + "epoch": 1.6534939759036145, + "grad_norm": 0.9766128659248352, + "learning_rate": 1.997757607426565e-05, + "loss": 0.0309, + "step": 858 + }, + { + "epoch": 1.655421686746988, + "grad_norm": 0.18002203106880188, + "learning_rate": 1.9932728335551702e-05, + "loss": 0.0072, + "step": 859 + }, + { + "epoch": 1.6573493975903615, + "grad_norm": 0.28073111176490784, + "learning_rate": 1.988788093510152e-05, + "loss": 0.0246, + "step": 860 + }, + { + "epoch": 1.659277108433735, + "grad_norm": 0.1919957399368286, + "learning_rate": 1.9843034098422375e-05, + "loss": 0.0087, + "step": 861 + }, + { + "epoch": 1.6612048192771085, + "grad_norm": 0.1825258433818817, + "learning_rate": 1.9798188051018705e-05, + "loss": 0.0092, + "step": 862 + }, + { + "epoch": 1.6631325301204818, + "grad_norm": 0.32412952184677124, + "learning_rate": 1.9753343018390997e-05, + "loss": 0.0118, + "step": 863 + }, + { + "epoch": 1.6650602409638555, + "grad_norm": 0.12828563153743744, + "learning_rate": 1.9708499226034618e-05, + "loss": 0.0056, + "step": 864 + }, + { + "epoch": 1.6669879518072288, + "grad_norm": 0.18647560477256775, + "learning_rate": 1.966365689943871e-05, + "loss": 0.0094, + "step": 865 + }, + { + "epoch": 1.6689156626506025, + "grad_norm": 0.19835828244686127, + "learning_rate": 1.9618816264085042e-05, + "loss": 0.0097, + "step": 866 + }, + { + "epoch": 1.6708433734939758, + "grad_norm": 0.22364282608032227, + "learning_rate": 1.957397754544687e-05, + "loss": 0.0062, + "step": 867 + }, + { + "epoch": 1.6727710843373496, + "grad_norm": 0.29420018196105957, + "learning_rate": 1.952914096898783e-05, + "loss": 0.0182, + "step": 868 + }, + { + "epoch": 1.6746987951807228, + "grad_norm": 0.2149929702281952, + "learning_rate": 1.9484306760160766e-05, + "loss": 0.0125, + "step": 869 + }, + { + "epoch": 1.6766265060240964, + "grad_norm": 0.16844330728054047, + "learning_rate": 1.9439475144406623e-05, + "loss": 0.0074, + "step": 870 + }, + { + "epoch": 1.6785542168674699, + "grad_norm": 0.5010282397270203, + "learning_rate": 1.9394646347153334e-05, + "loss": 0.0213, + "step": 871 + }, + { + "epoch": 1.6804819277108434, + "grad_norm": 0.29847195744514465, + "learning_rate": 1.9349820593814606e-05, + "loss": 0.0173, + "step": 872 + }, + { + "epoch": 1.6824096385542169, + "grad_norm": 0.23835812509059906, + "learning_rate": 1.930499810978889e-05, + "loss": 0.011, + "step": 873 + }, + { + "epoch": 1.6843373493975904, + "grad_norm": 0.3269020617008209, + "learning_rate": 1.9260179120458177e-05, + "loss": 0.0285, + "step": 874 + }, + { + "epoch": 1.686265060240964, + "grad_norm": 0.2142144739627838, + "learning_rate": 1.9215363851186883e-05, + "loss": 0.0146, + "step": 875 + }, + { + "epoch": 1.6881927710843372, + "grad_norm": 0.3098377585411072, + "learning_rate": 1.9170552527320725e-05, + "loss": 0.0104, + "step": 876 + }, + { + "epoch": 1.690120481927711, + "grad_norm": 0.22504115104675293, + "learning_rate": 1.9125745374185568e-05, + "loss": 0.0091, + "step": 877 + }, + { + "epoch": 1.6920481927710842, + "grad_norm": 0.20633333921432495, + "learning_rate": 1.908094261708633e-05, + "loss": 0.0097, + "step": 878 + }, + { + "epoch": 1.693975903614458, + "grad_norm": 1.179566502571106, + "learning_rate": 1.9036144481305807e-05, + "loss": 0.0143, + "step": 879 + }, + { + "epoch": 1.6959036144578312, + "grad_norm": 0.15525613725185394, + "learning_rate": 1.8991351192103554e-05, + "loss": 0.0062, + "step": 880 + }, + { + "epoch": 1.697831325301205, + "grad_norm": 0.15966367721557617, + "learning_rate": 1.8946562974714763e-05, + "loss": 0.0048, + "step": 881 + }, + { + "epoch": 1.6997590361445782, + "grad_norm": 0.18902607262134552, + "learning_rate": 1.890178005434914e-05, + "loss": 0.0124, + "step": 882 + }, + { + "epoch": 1.701686746987952, + "grad_norm": 0.21692413091659546, + "learning_rate": 1.885700265618971e-05, + "loss": 0.0135, + "step": 883 + }, + { + "epoch": 1.7036144578313253, + "grad_norm": 0.38948455452919006, + "learning_rate": 1.8812231005391786e-05, + "loss": 0.0365, + "step": 884 + }, + { + "epoch": 1.7055421686746988, + "grad_norm": 0.2483491599559784, + "learning_rate": 1.8767465327081736e-05, + "loss": 0.0202, + "step": 885 + }, + { + "epoch": 1.7074698795180723, + "grad_norm": 0.15305832028388977, + "learning_rate": 1.872270584635592e-05, + "loss": 0.0035, + "step": 886 + }, + { + "epoch": 1.7093975903614458, + "grad_norm": 0.17794466018676758, + "learning_rate": 1.867795278827954e-05, + "loss": 0.0157, + "step": 887 + }, + { + "epoch": 1.7113253012048193, + "grad_norm": 0.1938813328742981, + "learning_rate": 1.863320637788547e-05, + "loss": 0.0071, + "step": 888 + }, + { + "epoch": 1.7132530120481928, + "grad_norm": 0.27061617374420166, + "learning_rate": 1.8588466840173207e-05, + "loss": 0.0347, + "step": 889 + }, + { + "epoch": 1.7151807228915663, + "grad_norm": 0.1541014313697815, + "learning_rate": 1.8543734400107637e-05, + "loss": 0.006, + "step": 890 + }, + { + "epoch": 1.7171084337349396, + "grad_norm": 0.1436876654624939, + "learning_rate": 1.8499009282617996e-05, + "loss": 0.0059, + "step": 891 + }, + { + "epoch": 1.7190361445783133, + "grad_norm": 1.0573723316192627, + "learning_rate": 1.8454291712596688e-05, + "loss": 0.008, + "step": 892 + }, + { + "epoch": 1.7209638554216866, + "grad_norm": 0.15406259894371033, + "learning_rate": 1.8409581914898157e-05, + "loss": 0.0061, + "step": 893 + }, + { + "epoch": 1.7228915662650603, + "grad_norm": 0.24822913110256195, + "learning_rate": 1.836488011433777e-05, + "loss": 0.0085, + "step": 894 + }, + { + "epoch": 1.7248192771084336, + "grad_norm": 0.21049316227436066, + "learning_rate": 1.83201865356907e-05, + "loss": 0.0075, + "step": 895 + }, + { + "epoch": 1.7267469879518074, + "grad_norm": 0.24159866571426392, + "learning_rate": 1.8275501403690733e-05, + "loss": 0.0156, + "step": 896 + }, + { + "epoch": 1.7286746987951807, + "grad_norm": 0.3191063106060028, + "learning_rate": 1.823082494302924e-05, + "loss": 0.0218, + "step": 897 + }, + { + "epoch": 1.7306024096385542, + "grad_norm": 0.20296362042427063, + "learning_rate": 1.8186157378353945e-05, + "loss": 0.0126, + "step": 898 + }, + { + "epoch": 1.7325301204819277, + "grad_norm": 0.1905524581670761, + "learning_rate": 1.8141498934267858e-05, + "loss": 0.0131, + "step": 899 + }, + { + "epoch": 1.7344578313253012, + "grad_norm": 0.5350520610809326, + "learning_rate": 1.809684983532813e-05, + "loss": 0.0115, + "step": 900 + }, + { + "epoch": 1.7363855421686747, + "grad_norm": 0.17144092917442322, + "learning_rate": 1.8052210306044907e-05, + "loss": 0.0113, + "step": 901 + }, + { + "epoch": 1.7383132530120482, + "grad_norm": 0.11777982115745544, + "learning_rate": 1.8007580570880236e-05, + "loss": 0.0058, + "step": 902 + }, + { + "epoch": 1.7402409638554217, + "grad_norm": 0.2078275978565216, + "learning_rate": 1.7962960854246908e-05, + "loss": 0.0106, + "step": 903 + }, + { + "epoch": 1.7421686746987952, + "grad_norm": 0.2550877630710602, + "learning_rate": 1.791835138050732e-05, + "loss": 0.0076, + "step": 904 + }, + { + "epoch": 1.7440963855421687, + "grad_norm": 0.11553912609815598, + "learning_rate": 1.7873752373972395e-05, + "loss": 0.0038, + "step": 905 + }, + { + "epoch": 1.746024096385542, + "grad_norm": 0.10724586248397827, + "learning_rate": 1.7829164058900398e-05, + "loss": 0.0043, + "step": 906 + }, + { + "epoch": 1.7479518072289157, + "grad_norm": 0.30152231454849243, + "learning_rate": 1.7784586659495845e-05, + "loss": 0.0099, + "step": 907 + }, + { + "epoch": 1.749879518072289, + "grad_norm": 0.18372933566570282, + "learning_rate": 1.7740020399908372e-05, + "loss": 0.0074, + "step": 908 + }, + { + "epoch": 1.7518072289156628, + "grad_norm": 0.35184428095817566, + "learning_rate": 1.7695465504231586e-05, + "loss": 0.0184, + "step": 909 + }, + { + "epoch": 1.753734939759036, + "grad_norm": 0.15083615481853485, + "learning_rate": 1.765092219650196e-05, + "loss": 0.0061, + "step": 910 + }, + { + "epoch": 1.7556626506024098, + "grad_norm": 0.2599961459636688, + "learning_rate": 1.7606390700697693e-05, + "loss": 0.0101, + "step": 911 + }, + { + "epoch": 1.757590361445783, + "grad_norm": 0.10829206556081772, + "learning_rate": 1.7561871240737595e-05, + "loss": 0.0034, + "step": 912 + }, + { + "epoch": 1.7595180722891566, + "grad_norm": 0.38098782300949097, + "learning_rate": 1.7517364040479966e-05, + "loss": 0.0384, + "step": 913 + }, + { + "epoch": 1.76144578313253, + "grad_norm": 0.14975085854530334, + "learning_rate": 1.7472869323721432e-05, + "loss": 0.0055, + "step": 914 + }, + { + "epoch": 1.7633734939759036, + "grad_norm": 0.4151444733142853, + "learning_rate": 1.742838731419588e-05, + "loss": 0.0307, + "step": 915 + }, + { + "epoch": 1.765301204819277, + "grad_norm": 0.22238481044769287, + "learning_rate": 1.738391823557328e-05, + "loss": 0.0059, + "step": 916 + }, + { + "epoch": 1.7672289156626506, + "grad_norm": 0.23386356234550476, + "learning_rate": 1.7339462311458587e-05, + "loss": 0.0113, + "step": 917 + }, + { + "epoch": 1.7691566265060241, + "grad_norm": 0.21911191940307617, + "learning_rate": 1.7295019765390618e-05, + "loss": 0.0071, + "step": 918 + }, + { + "epoch": 1.7710843373493976, + "grad_norm": 0.343159943819046, + "learning_rate": 1.7250590820840903e-05, + "loss": 0.0144, + "step": 919 + }, + { + "epoch": 1.7730120481927711, + "grad_norm": 0.32204556465148926, + "learning_rate": 1.720617570121259e-05, + "loss": 0.0131, + "step": 920 + }, + { + "epoch": 1.7749397590361444, + "grad_norm": 0.4105585515499115, + "learning_rate": 1.7161774629839328e-05, + "loss": 0.0148, + "step": 921 + }, + { + "epoch": 1.7768674698795182, + "grad_norm": 0.16380974650382996, + "learning_rate": 1.7117387829984093e-05, + "loss": 0.0066, + "step": 922 + }, + { + "epoch": 1.7787951807228914, + "grad_norm": 0.22920913994312286, + "learning_rate": 1.707301552483813e-05, + "loss": 0.0105, + "step": 923 + }, + { + "epoch": 1.7807228915662652, + "grad_norm": 0.2075149267911911, + "learning_rate": 1.7028657937519767e-05, + "loss": 0.0104, + "step": 924 + }, + { + "epoch": 1.7826506024096385, + "grad_norm": 0.44439977407455444, + "learning_rate": 1.6984315291073355e-05, + "loss": 0.0134, + "step": 925 + }, + { + "epoch": 1.7845783132530122, + "grad_norm": 0.24068203568458557, + "learning_rate": 1.6939987808468125e-05, + "loss": 0.0078, + "step": 926 + }, + { + "epoch": 1.7865060240963855, + "grad_norm": 0.34044349193573, + "learning_rate": 1.689567571259701e-05, + "loss": 0.0108, + "step": 927 + }, + { + "epoch": 1.788433734939759, + "grad_norm": 0.34082743525505066, + "learning_rate": 1.6851379226275624e-05, + "loss": 0.0266, + "step": 928 + }, + { + "epoch": 1.7903614457831325, + "grad_norm": 0.19490115344524384, + "learning_rate": 1.6807098572241075e-05, + "loss": 0.0109, + "step": 929 + }, + { + "epoch": 1.792289156626506, + "grad_norm": 0.16208237409591675, + "learning_rate": 1.6762833973150846e-05, + "loss": 0.0113, + "step": 930 + }, + { + "epoch": 1.7942168674698795, + "grad_norm": 0.35555699467658997, + "learning_rate": 1.671858565158172e-05, + "loss": 0.0196, + "step": 931 + }, + { + "epoch": 1.796144578313253, + "grad_norm": 0.1600857824087143, + "learning_rate": 1.6674353830028587e-05, + "loss": 0.0089, + "step": 932 + }, + { + "epoch": 1.7980722891566265, + "grad_norm": 0.1699574887752533, + "learning_rate": 1.663013873090342e-05, + "loss": 0.0074, + "step": 933 + }, + { + "epoch": 1.8, + "grad_norm": 0.2472933828830719, + "learning_rate": 1.6585940576534086e-05, + "loss": 0.0063, + "step": 934 + }, + { + "epoch": 1.8019277108433736, + "grad_norm": 0.23491555452346802, + "learning_rate": 1.654175958916323e-05, + "loss": 0.0101, + "step": 935 + }, + { + "epoch": 1.8038554216867468, + "grad_norm": 0.28635191917419434, + "learning_rate": 1.6497595990947195e-05, + "loss": 0.0131, + "step": 936 + }, + { + "epoch": 1.8057831325301206, + "grad_norm": 0.15400712192058563, + "learning_rate": 1.645345000395489e-05, + "loss": 0.0068, + "step": 937 + }, + { + "epoch": 1.8077108433734939, + "grad_norm": 0.18223172426223755, + "learning_rate": 1.6409321850166647e-05, + "loss": 0.0094, + "step": 938 + }, + { + "epoch": 1.8096385542168676, + "grad_norm": 0.2789457142353058, + "learning_rate": 1.636521175147316e-05, + "loss": 0.0202, + "step": 939 + }, + { + "epoch": 1.8115662650602409, + "grad_norm": 0.4267627000808716, + "learning_rate": 1.6321119929674297e-05, + "loss": 0.0176, + "step": 940 + }, + { + "epoch": 1.8134939759036146, + "grad_norm": 0.3021615445613861, + "learning_rate": 1.6277046606478056e-05, + "loss": 0.0085, + "step": 941 + }, + { + "epoch": 1.815421686746988, + "grad_norm": 0.3724934756755829, + "learning_rate": 1.6232992003499405e-05, + "loss": 0.0474, + "step": 942 + }, + { + "epoch": 1.8173493975903614, + "grad_norm": 0.20904326438903809, + "learning_rate": 1.6188956342259177e-05, + "loss": 0.0078, + "step": 943 + }, + { + "epoch": 1.819277108433735, + "grad_norm": 0.31168171763420105, + "learning_rate": 1.614493984418297e-05, + "loss": 0.0174, + "step": 944 + }, + { + "epoch": 1.8212048192771084, + "grad_norm": 0.21273556351661682, + "learning_rate": 1.6100942730600003e-05, + "loss": 0.0054, + "step": 945 + }, + { + "epoch": 1.823132530120482, + "grad_norm": 0.16991695761680603, + "learning_rate": 1.6056965222742055e-05, + "loss": 0.0063, + "step": 946 + }, + { + "epoch": 1.8250602409638554, + "grad_norm": 0.22762684524059296, + "learning_rate": 1.6013007541742303e-05, + "loss": 0.0234, + "step": 947 + }, + { + "epoch": 1.826987951807229, + "grad_norm": 0.20128795504570007, + "learning_rate": 1.596906990863422e-05, + "loss": 0.0095, + "step": 948 + }, + { + "epoch": 1.8289156626506025, + "grad_norm": 0.30772027373313904, + "learning_rate": 1.592515254435048e-05, + "loss": 0.0356, + "step": 949 + }, + { + "epoch": 1.830843373493976, + "grad_norm": 0.12954631447792053, + "learning_rate": 1.5881255669721857e-05, + "loss": 0.008, + "step": 950 + }, + { + "epoch": 1.8327710843373493, + "grad_norm": 0.7787145972251892, + "learning_rate": 1.5837379505476054e-05, + "loss": 0.0108, + "step": 951 + }, + { + "epoch": 1.834698795180723, + "grad_norm": 0.1683879941701889, + "learning_rate": 1.5793524272236683e-05, + "loss": 0.006, + "step": 952 + }, + { + "epoch": 1.8366265060240963, + "grad_norm": 0.16475361585617065, + "learning_rate": 1.5749690190522076e-05, + "loss": 0.0065, + "step": 953 + }, + { + "epoch": 1.83855421686747, + "grad_norm": 0.211905375123024, + "learning_rate": 1.5705877480744214e-05, + "loss": 0.0092, + "step": 954 + }, + { + "epoch": 1.8404819277108433, + "grad_norm": 0.23850117623806, + "learning_rate": 1.5662086363207628e-05, + "loss": 0.012, + "step": 955 + }, + { + "epoch": 1.842409638554217, + "grad_norm": 0.19100065529346466, + "learning_rate": 1.561831705810825e-05, + "loss": 0.0113, + "step": 956 + }, + { + "epoch": 1.8443373493975903, + "grad_norm": 0.3635985255241394, + "learning_rate": 1.557456978553236e-05, + "loss": 0.0168, + "step": 957 + }, + { + "epoch": 1.8462650602409638, + "grad_norm": 0.16449116170406342, + "learning_rate": 1.553084476545544e-05, + "loss": 0.0042, + "step": 958 + }, + { + "epoch": 1.8481927710843373, + "grad_norm": 0.566093385219574, + "learning_rate": 1.5487142217741062e-05, + "loss": 0.0145, + "step": 959 + }, + { + "epoch": 1.8501204819277108, + "grad_norm": 0.15960252285003662, + "learning_rate": 1.5443462362139834e-05, + "loss": 0.0059, + "step": 960 + }, + { + "epoch": 1.8520481927710843, + "grad_norm": 0.40773797035217285, + "learning_rate": 1.539980541828823e-05, + "loss": 0.0257, + "step": 961 + }, + { + "epoch": 1.8539759036144579, + "grad_norm": 0.4802496135234833, + "learning_rate": 1.5356171605707522e-05, + "loss": 0.0111, + "step": 962 + }, + { + "epoch": 1.8559036144578314, + "grad_norm": 0.15745794773101807, + "learning_rate": 1.5312561143802704e-05, + "loss": 0.0049, + "step": 963 + }, + { + "epoch": 1.8578313253012049, + "grad_norm": 0.15139251947402954, + "learning_rate": 1.5268974251861298e-05, + "loss": 0.0077, + "step": 964 + }, + { + "epoch": 1.8597590361445784, + "grad_norm": 0.2188841849565506, + "learning_rate": 1.5225411149052356e-05, + "loss": 0.017, + "step": 965 + }, + { + "epoch": 1.8616867469879517, + "grad_norm": 0.10853131115436554, + "learning_rate": 1.5181872054425287e-05, + "loss": 0.0049, + "step": 966 + }, + { + "epoch": 1.8636144578313254, + "grad_norm": 0.8254880905151367, + "learning_rate": 1.5138357186908785e-05, + "loss": 0.0317, + "step": 967 + }, + { + "epoch": 1.8655421686746987, + "grad_norm": 0.2989620566368103, + "learning_rate": 1.5094866765309728e-05, + "loss": 0.0126, + "step": 968 + }, + { + "epoch": 1.8674698795180724, + "grad_norm": 0.16411150991916656, + "learning_rate": 1.5051401008312054e-05, + "loss": 0.0101, + "step": 969 + }, + { + "epoch": 1.8693975903614457, + "grad_norm": 0.2861763834953308, + "learning_rate": 1.5007960134475706e-05, + "loss": 0.0155, + "step": 970 + }, + { + "epoch": 1.8713253012048194, + "grad_norm": 0.24879588186740875, + "learning_rate": 1.4964544362235487e-05, + "loss": 0.0187, + "step": 971 + }, + { + "epoch": 1.8732530120481927, + "grad_norm": 0.2433672398328781, + "learning_rate": 1.4921153909899983e-05, + "loss": 0.0084, + "step": 972 + }, + { + "epoch": 1.8751807228915662, + "grad_norm": 0.15097154676914215, + "learning_rate": 1.487778899565047e-05, + "loss": 0.007, + "step": 973 + }, + { + "epoch": 1.8771084337349397, + "grad_norm": 0.1629047691822052, + "learning_rate": 1.4834449837539806e-05, + "loss": 0.0058, + "step": 974 + }, + { + "epoch": 1.8790361445783132, + "grad_norm": 0.9937071204185486, + "learning_rate": 1.4791136653491333e-05, + "loss": 0.0323, + "step": 975 + }, + { + "epoch": 1.8809638554216868, + "grad_norm": 0.19555562734603882, + "learning_rate": 1.4747849661297808e-05, + "loss": 0.0126, + "step": 976 + }, + { + "epoch": 1.8828915662650603, + "grad_norm": 0.16147711873054504, + "learning_rate": 1.470458907862026e-05, + "loss": 0.0067, + "step": 977 + }, + { + "epoch": 1.8848192771084338, + "grad_norm": 0.2730027735233307, + "learning_rate": 1.4661355122986945e-05, + "loss": 0.0147, + "step": 978 + }, + { + "epoch": 1.886746987951807, + "grad_norm": 0.13759832084178925, + "learning_rate": 1.4618148011792206e-05, + "loss": 0.0038, + "step": 979 + }, + { + "epoch": 1.8886746987951808, + "grad_norm": 0.33516690135002136, + "learning_rate": 1.4574967962295419e-05, + "loss": 0.0139, + "step": 980 + }, + { + "epoch": 1.890602409638554, + "grad_norm": 0.2345741093158722, + "learning_rate": 1.4531815191619903e-05, + "loss": 0.0094, + "step": 981 + }, + { + "epoch": 1.8925301204819278, + "grad_norm": 0.14681044220924377, + "learning_rate": 1.4488689916751762e-05, + "loss": 0.0065, + "step": 982 + }, + { + "epoch": 1.894457831325301, + "grad_norm": 0.21143914759159088, + "learning_rate": 1.4445592354538885e-05, + "loss": 0.0057, + "step": 983 + }, + { + "epoch": 1.8963855421686748, + "grad_norm": 0.3109160363674164, + "learning_rate": 1.44025227216898e-05, + "loss": 0.0142, + "step": 984 + }, + { + "epoch": 1.8983132530120481, + "grad_norm": 0.24301907420158386, + "learning_rate": 1.435948123477259e-05, + "loss": 0.012, + "step": 985 + }, + { + "epoch": 1.9002409638554218, + "grad_norm": 0.19817675650119781, + "learning_rate": 1.431646811021382e-05, + "loss": 0.0097, + "step": 986 + }, + { + "epoch": 1.9021686746987951, + "grad_norm": 0.13464932143688202, + "learning_rate": 1.4273483564297425e-05, + "loss": 0.0046, + "step": 987 + }, + { + "epoch": 1.9040963855421686, + "grad_norm": 0.1698642522096634, + "learning_rate": 1.4230527813163656e-05, + "loss": 0.0038, + "step": 988 + }, + { + "epoch": 1.9060240963855422, + "grad_norm": 0.19395388662815094, + "learning_rate": 1.4187601072807975e-05, + "loss": 0.0123, + "step": 989 + }, + { + "epoch": 1.9079518072289157, + "grad_norm": 0.2093188613653183, + "learning_rate": 1.4144703559079948e-05, + "loss": 0.0093, + "step": 990 + }, + { + "epoch": 1.9098795180722892, + "grad_norm": 0.1529311090707779, + "learning_rate": 1.4101835487682198e-05, + "loss": 0.0051, + "step": 991 + }, + { + "epoch": 1.9118072289156627, + "grad_norm": 0.18725350499153137, + "learning_rate": 1.4058997074169299e-05, + "loss": 0.0083, + "step": 992 + }, + { + "epoch": 1.9137349397590362, + "grad_norm": 0.15601560473442078, + "learning_rate": 1.401618853394668e-05, + "loss": 0.0086, + "step": 993 + }, + { + "epoch": 1.9156626506024095, + "grad_norm": 0.23890644311904907, + "learning_rate": 1.3973410082269591e-05, + "loss": 0.015, + "step": 994 + }, + { + "epoch": 1.9175903614457832, + "grad_norm": 0.2442619949579239, + "learning_rate": 1.3930661934241947e-05, + "loss": 0.0089, + "step": 995 + }, + { + "epoch": 1.9195180722891565, + "grad_norm": 0.1540212482213974, + "learning_rate": 1.388794430481532e-05, + "loss": 0.0072, + "step": 996 + }, + { + "epoch": 1.9214457831325302, + "grad_norm": 0.1359291970729828, + "learning_rate": 1.3845257408787807e-05, + "loss": 0.0131, + "step": 997 + }, + { + "epoch": 1.9233734939759035, + "grad_norm": 0.25486138463020325, + "learning_rate": 1.3802601460802967e-05, + "loss": 0.0198, + "step": 998 + }, + { + "epoch": 1.9253012048192772, + "grad_norm": 0.28815609216690063, + "learning_rate": 1.3759976675348754e-05, + "loss": 0.014, + "step": 999 + }, + { + "epoch": 1.9272289156626505, + "grad_norm": 0.15648497641086578, + "learning_rate": 1.3717383266756403e-05, + "loss": 0.0065, + "step": 1000 + }, + { + "epoch": 1.929156626506024, + "grad_norm": 0.16912540793418884, + "learning_rate": 1.367482144919941e-05, + "loss": 0.0059, + "step": 1001 + }, + { + "epoch": 1.9310843373493976, + "grad_norm": 0.16896723210811615, + "learning_rate": 1.3632291436692397e-05, + "loss": 0.0054, + "step": 1002 + }, + { + "epoch": 1.933012048192771, + "grad_norm": 0.20287497341632843, + "learning_rate": 1.3589793443090064e-05, + "loss": 0.0097, + "step": 1003 + }, + { + "epoch": 1.9349397590361446, + "grad_norm": 0.14804276823997498, + "learning_rate": 1.3547327682086114e-05, + "loss": 0.0125, + "step": 1004 + }, + { + "epoch": 1.936867469879518, + "grad_norm": 0.23820064961910248, + "learning_rate": 1.3504894367212171e-05, + "loss": 0.0131, + "step": 1005 + }, + { + "epoch": 1.9387951807228916, + "grad_norm": 0.25607362389564514, + "learning_rate": 1.34624937118367e-05, + "loss": 0.0115, + "step": 1006 + }, + { + "epoch": 1.940722891566265, + "grad_norm": 0.37233737111091614, + "learning_rate": 1.3420125929163976e-05, + "loss": 0.0309, + "step": 1007 + }, + { + "epoch": 1.9426506024096386, + "grad_norm": 0.19426730275154114, + "learning_rate": 1.3377791232232929e-05, + "loss": 0.0078, + "step": 1008 + }, + { + "epoch": 1.944578313253012, + "grad_norm": 0.2784160077571869, + "learning_rate": 1.333548983391617e-05, + "loss": 0.0142, + "step": 1009 + }, + { + "epoch": 1.9465060240963856, + "grad_norm": 0.11407195776700974, + "learning_rate": 1.3293221946918853e-05, + "loss": 0.0035, + "step": 1010 + }, + { + "epoch": 1.948433734939759, + "grad_norm": 0.3965436816215515, + "learning_rate": 1.325098778377762e-05, + "loss": 0.0242, + "step": 1011 + }, + { + "epoch": 1.9503614457831326, + "grad_norm": 0.18520519137382507, + "learning_rate": 1.3208787556859543e-05, + "loss": 0.0096, + "step": 1012 + } + ], + "logging_steps": 1, + "max_steps": 1557, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 92, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3284111481978225e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1012/training_args.bin b/checkpoint-1012/training_args.bin new file mode 100644 index 0000000..ecc7b6b --- /dev/null +++ b/checkpoint-1012/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342dfb3c86216e436950100c79812c54066d5572c4e9975b0f133c067f061bcf +size 7825 diff --git a/checkpoint-1104/chat_template.jinja b/checkpoint-1104/chat_template.jinja new file mode 100644 index 0000000..1bad6a0 --- /dev/null +++ b/checkpoint-1104/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-1104/config.json b/checkpoint-1104/config.json new file mode 100644 index 0000000..f8bf41e --- /dev/null +++ b/checkpoint-1104/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128009, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.56.2", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-1104/generation_config.json b/checkpoint-1104/generation_config.json new file mode 100644 index 0000000..2152026 --- /dev/null +++ b/checkpoint-1104/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128009, + 128001, + 128008, + 128009 + ], + "pad_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.56.2" +} diff --git a/checkpoint-1104/model.safetensors b/checkpoint-1104/model.safetensors new file mode 100644 index 0000000..0f33851 --- /dev/null +++ b/checkpoint-1104/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aa3e6311bb222d4c4402598e4ef6a9775854ddc1f7618789b532586a245f363 +size 2996982344 diff --git a/checkpoint-1104/special_tokens_map.json b/checkpoint-1104/special_tokens_map.json new file mode 100644 index 0000000..14daf45 --- /dev/null +++ b/checkpoint-1104/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/checkpoint-1104/tokenizer.json b/checkpoint-1104/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/checkpoint-1104/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-1104/tokenizer_config.json b/checkpoint-1104/tokenizer_config.json new file mode 100644 index 0000000..d1e1ea9 --- /dev/null +++ b/checkpoint-1104/tokenizer_config.json @@ -0,0 +1,2068 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-1104/trainer_state.json b/checkpoint-1104/trainer_state.json new file mode 100644 index 0000000..c967a1a --- /dev/null +++ b/checkpoint-1104/trainer_state.json @@ -0,0 +1,7762 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.1272289156626507, + "eval_steps": 500, + "global_step": 1104, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0019277108433734939, + "grad_norm": 2.8518834114074707, + "learning_rate": 0.0, + "loss": 0.0891, + "step": 1 + }, + { + "epoch": 0.0038554216867469878, + "grad_norm": 1.8441249132156372, + "learning_rate": 2.564102564102564e-07, + "loss": 0.0539, + "step": 2 + }, + { + "epoch": 0.005783132530120482, + "grad_norm": 2.8263237476348877, + "learning_rate": 5.128205128205128e-07, + "loss": 0.099, + "step": 3 + }, + { + "epoch": 0.0077108433734939755, + "grad_norm": 2.5051236152648926, + "learning_rate": 7.692307692307694e-07, + "loss": 0.0789, + "step": 4 + }, + { + "epoch": 0.00963855421686747, + "grad_norm": 2.6903438568115234, + "learning_rate": 1.0256410256410257e-06, + "loss": 0.0881, + "step": 5 + }, + { + "epoch": 0.011566265060240964, + "grad_norm": 2.6205761432647705, + "learning_rate": 1.282051282051282e-06, + "loss": 0.0776, + "step": 6 + }, + { + "epoch": 0.013493975903614458, + "grad_norm": 2.6309337615966797, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.0827, + "step": 7 + }, + { + "epoch": 0.015421686746987951, + "grad_norm": 1.5427855253219604, + "learning_rate": 1.794871794871795e-06, + "loss": 0.0577, + "step": 8 + }, + { + "epoch": 0.017349397590361446, + "grad_norm": 1.0973446369171143, + "learning_rate": 2.0512820512820513e-06, + "loss": 0.04, + "step": 9 + }, + { + "epoch": 0.01927710843373494, + "grad_norm": 1.3253350257873535, + "learning_rate": 2.307692307692308e-06, + "loss": 0.0506, + "step": 10 + }, + { + "epoch": 0.021204819277108433, + "grad_norm": 1.588739037513733, + "learning_rate": 2.564102564102564e-06, + "loss": 0.0874, + "step": 11 + }, + { + "epoch": 0.02313253012048193, + "grad_norm": 1.4987014532089233, + "learning_rate": 2.8205128205128207e-06, + "loss": 0.0597, + "step": 12 + }, + { + "epoch": 0.02506024096385542, + "grad_norm": 1.6571592092514038, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.0559, + "step": 13 + }, + { + "epoch": 0.026987951807228915, + "grad_norm": 1.8860628604888916, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0688, + "step": 14 + }, + { + "epoch": 0.02891566265060241, + "grad_norm": 1.3202295303344727, + "learning_rate": 3.58974358974359e-06, + "loss": 0.0433, + "step": 15 + }, + { + "epoch": 0.030843373493975902, + "grad_norm": 1.5870612859725952, + "learning_rate": 3.846153846153847e-06, + "loss": 0.0695, + "step": 16 + }, + { + "epoch": 0.0327710843373494, + "grad_norm": 0.9192284345626831, + "learning_rate": 4.102564102564103e-06, + "loss": 0.0392, + "step": 17 + }, + { + "epoch": 0.03469879518072289, + "grad_norm": 0.7950155735015869, + "learning_rate": 4.358974358974359e-06, + "loss": 0.0351, + "step": 18 + }, + { + "epoch": 0.03662650602409639, + "grad_norm": 0.8854314684867859, + "learning_rate": 4.615384615384616e-06, + "loss": 0.0356, + "step": 19 + }, + { + "epoch": 0.03855421686746988, + "grad_norm": 0.9546788930892944, + "learning_rate": 4.871794871794872e-06, + "loss": 0.0427, + "step": 20 + }, + { + "epoch": 0.04048192771084337, + "grad_norm": 0.6315903663635254, + "learning_rate": 5.128205128205128e-06, + "loss": 0.0397, + "step": 21 + }, + { + "epoch": 0.042409638554216866, + "grad_norm": 0.9230924844741821, + "learning_rate": 5.384615384615385e-06, + "loss": 0.0481, + "step": 22 + }, + { + "epoch": 0.04433734939759036, + "grad_norm": 0.711546003818512, + "learning_rate": 5.641025641025641e-06, + "loss": 0.0479, + "step": 23 + }, + { + "epoch": 0.04626506024096386, + "grad_norm": 0.5288046598434448, + "learning_rate": 5.897435897435898e-06, + "loss": 0.0182, + "step": 24 + }, + { + "epoch": 0.04819277108433735, + "grad_norm": 0.9420496225357056, + "learning_rate": 6.153846153846155e-06, + "loss": 0.0389, + "step": 25 + }, + { + "epoch": 0.05012048192771084, + "grad_norm": 0.5001983046531677, + "learning_rate": 6.410256410256412e-06, + "loss": 0.0268, + "step": 26 + }, + { + "epoch": 0.052048192771084335, + "grad_norm": 0.8084653615951538, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0367, + "step": 27 + }, + { + "epoch": 0.05397590361445783, + "grad_norm": 0.7195103764533997, + "learning_rate": 6.923076923076923e-06, + "loss": 0.0251, + "step": 28 + }, + { + "epoch": 0.055903614457831326, + "grad_norm": 0.529958963394165, + "learning_rate": 7.17948717948718e-06, + "loss": 0.0289, + "step": 29 + }, + { + "epoch": 0.05783132530120482, + "grad_norm": 0.795376181602478, + "learning_rate": 7.435897435897437e-06, + "loss": 0.043, + "step": 30 + }, + { + "epoch": 0.059759036144578316, + "grad_norm": 0.6366249918937683, + "learning_rate": 7.692307692307694e-06, + "loss": 0.029, + "step": 31 + }, + { + "epoch": 0.061686746987951804, + "grad_norm": 0.5414115190505981, + "learning_rate": 7.948717948717949e-06, + "loss": 0.0365, + "step": 32 + }, + { + "epoch": 0.0636144578313253, + "grad_norm": 0.9350972175598145, + "learning_rate": 8.205128205128205e-06, + "loss": 0.0283, + "step": 33 + }, + { + "epoch": 0.0655421686746988, + "grad_norm": 0.5660741925239563, + "learning_rate": 8.461538461538462e-06, + "loss": 0.0234, + "step": 34 + }, + { + "epoch": 0.06746987951807229, + "grad_norm": 0.5623988509178162, + "learning_rate": 8.717948717948719e-06, + "loss": 0.0307, + "step": 35 + }, + { + "epoch": 0.06939759036144579, + "grad_norm": 0.5260195732116699, + "learning_rate": 8.974358974358976e-06, + "loss": 0.0264, + "step": 36 + }, + { + "epoch": 0.07132530120481928, + "grad_norm": 0.4934785068035126, + "learning_rate": 9.230769230769232e-06, + "loss": 0.0224, + "step": 37 + }, + { + "epoch": 0.07325301204819278, + "grad_norm": 0.4797322154045105, + "learning_rate": 9.487179487179487e-06, + "loss": 0.0163, + "step": 38 + }, + { + "epoch": 0.07518072289156627, + "grad_norm": 0.4739217460155487, + "learning_rate": 9.743589743589744e-06, + "loss": 0.0165, + "step": 39 + }, + { + "epoch": 0.07710843373493977, + "grad_norm": 0.4527677595615387, + "learning_rate": 1e-05, + "loss": 0.0163, + "step": 40 + }, + { + "epoch": 0.07903614457831325, + "grad_norm": 0.6241316795349121, + "learning_rate": 1.0256410256410256e-05, + "loss": 0.0302, + "step": 41 + }, + { + "epoch": 0.08096385542168674, + "grad_norm": 0.639043927192688, + "learning_rate": 1.0512820512820514e-05, + "loss": 0.0312, + "step": 42 + }, + { + "epoch": 0.08289156626506024, + "grad_norm": 0.5121409296989441, + "learning_rate": 1.076923076923077e-05, + "loss": 0.0256, + "step": 43 + }, + { + "epoch": 0.08481927710843373, + "grad_norm": 0.6340477466583252, + "learning_rate": 1.1025641025641028e-05, + "loss": 0.04, + "step": 44 + }, + { + "epoch": 0.08674698795180723, + "grad_norm": 0.5260409712791443, + "learning_rate": 1.1282051282051283e-05, + "loss": 0.0282, + "step": 45 + }, + { + "epoch": 0.08867469879518072, + "grad_norm": 0.6390711069107056, + "learning_rate": 1.1538461538461538e-05, + "loss": 0.0243, + "step": 46 + }, + { + "epoch": 0.09060240963855422, + "grad_norm": 0.46469295024871826, + "learning_rate": 1.1794871794871796e-05, + "loss": 0.0208, + "step": 47 + }, + { + "epoch": 0.09253012048192771, + "grad_norm": 0.8711516857147217, + "learning_rate": 1.2051282051282051e-05, + "loss": 0.0291, + "step": 48 + }, + { + "epoch": 0.09445783132530121, + "grad_norm": 0.9164300560951233, + "learning_rate": 1.230769230769231e-05, + "loss": 0.0342, + "step": 49 + }, + { + "epoch": 0.0963855421686747, + "grad_norm": 0.5401139259338379, + "learning_rate": 1.2564102564102565e-05, + "loss": 0.0185, + "step": 50 + }, + { + "epoch": 0.0983132530120482, + "grad_norm": 0.44393008947372437, + "learning_rate": 1.2820512820512823e-05, + "loss": 0.0228, + "step": 51 + }, + { + "epoch": 0.10024096385542168, + "grad_norm": 0.3855767846107483, + "learning_rate": 1.3076923076923078e-05, + "loss": 0.0176, + "step": 52 + }, + { + "epoch": 0.10216867469879518, + "grad_norm": 0.8561235070228577, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.0433, + "step": 53 + }, + { + "epoch": 0.10409638554216867, + "grad_norm": 0.768002450466156, + "learning_rate": 1.3589743589743592e-05, + "loss": 0.0245, + "step": 54 + }, + { + "epoch": 0.10602409638554217, + "grad_norm": 0.4559759497642517, + "learning_rate": 1.3846153846153847e-05, + "loss": 0.0224, + "step": 55 + }, + { + "epoch": 0.10795180722891566, + "grad_norm": 0.6203847527503967, + "learning_rate": 1.4102564102564105e-05, + "loss": 0.0296, + "step": 56 + }, + { + "epoch": 0.10987951807228916, + "grad_norm": 0.6651368141174316, + "learning_rate": 1.435897435897436e-05, + "loss": 0.0336, + "step": 57 + }, + { + "epoch": 0.11180722891566265, + "grad_norm": 0.377734512090683, + "learning_rate": 1.4615384615384615e-05, + "loss": 0.0196, + "step": 58 + }, + { + "epoch": 0.11373493975903615, + "grad_norm": 0.687568724155426, + "learning_rate": 1.4871794871794874e-05, + "loss": 0.0207, + "step": 59 + }, + { + "epoch": 0.11566265060240964, + "grad_norm": 0.7905604243278503, + "learning_rate": 1.5128205128205129e-05, + "loss": 0.047, + "step": 60 + }, + { + "epoch": 0.11759036144578314, + "grad_norm": 0.7938196063041687, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.0198, + "step": 61 + }, + { + "epoch": 0.11951807228915663, + "grad_norm": 0.41340553760528564, + "learning_rate": 1.5641025641025644e-05, + "loss": 0.0161, + "step": 62 + }, + { + "epoch": 0.12144578313253013, + "grad_norm": 0.5668172240257263, + "learning_rate": 1.5897435897435897e-05, + "loss": 0.0275, + "step": 63 + }, + { + "epoch": 0.12337349397590361, + "grad_norm": 0.48333367705345154, + "learning_rate": 1.6153846153846154e-05, + "loss": 0.0137, + "step": 64 + }, + { + "epoch": 0.12530120481927712, + "grad_norm": 0.6843933463096619, + "learning_rate": 1.641025641025641e-05, + "loss": 0.0294, + "step": 65 + }, + { + "epoch": 0.1272289156626506, + "grad_norm": 0.7789272665977478, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.0401, + "step": 66 + }, + { + "epoch": 0.1291566265060241, + "grad_norm": 0.6203492879867554, + "learning_rate": 1.6923076923076924e-05, + "loss": 0.0292, + "step": 67 + }, + { + "epoch": 0.1310843373493976, + "grad_norm": 0.5940662622451782, + "learning_rate": 1.717948717948718e-05, + "loss": 0.0178, + "step": 68 + }, + { + "epoch": 0.13301204819277107, + "grad_norm": 0.35504868626594543, + "learning_rate": 1.7435897435897438e-05, + "loss": 0.0129, + "step": 69 + }, + { + "epoch": 0.13493975903614458, + "grad_norm": 0.8796699643135071, + "learning_rate": 1.7692307692307694e-05, + "loss": 0.034, + "step": 70 + }, + { + "epoch": 0.13686746987951806, + "grad_norm": 0.967444896697998, + "learning_rate": 1.794871794871795e-05, + "loss": 0.0266, + "step": 71 + }, + { + "epoch": 0.13879518072289157, + "grad_norm": 0.4428526759147644, + "learning_rate": 1.8205128205128208e-05, + "loss": 0.0223, + "step": 72 + }, + { + "epoch": 0.14072289156626505, + "grad_norm": 0.42897751927375793, + "learning_rate": 1.8461538461538465e-05, + "loss": 0.0187, + "step": 73 + }, + { + "epoch": 0.14265060240963856, + "grad_norm": 0.5100914835929871, + "learning_rate": 1.8717948717948718e-05, + "loss": 0.0164, + "step": 74 + }, + { + "epoch": 0.14457831325301204, + "grad_norm": 0.6028861999511719, + "learning_rate": 1.8974358974358975e-05, + "loss": 0.0164, + "step": 75 + }, + { + "epoch": 0.14650602409638555, + "grad_norm": 0.6187024116516113, + "learning_rate": 1.923076923076923e-05, + "loss": 0.0296, + "step": 76 + }, + { + "epoch": 0.14843373493975903, + "grad_norm": 0.4822489619255066, + "learning_rate": 1.9487179487179488e-05, + "loss": 0.0148, + "step": 77 + }, + { + "epoch": 0.15036144578313254, + "grad_norm": 0.7231149673461914, + "learning_rate": 1.9743589743589745e-05, + "loss": 0.0395, + "step": 78 + }, + { + "epoch": 0.15228915662650602, + "grad_norm": 0.8409642577171326, + "learning_rate": 2e-05, + "loss": 0.0446, + "step": 79 + }, + { + "epoch": 0.15421686746987953, + "grad_norm": 0.4883500039577484, + "learning_rate": 2.025641025641026e-05, + "loss": 0.0206, + "step": 80 + }, + { + "epoch": 0.156144578313253, + "grad_norm": 0.6287479400634766, + "learning_rate": 2.0512820512820512e-05, + "loss": 0.0333, + "step": 81 + }, + { + "epoch": 0.1580722891566265, + "grad_norm": 0.5041632652282715, + "learning_rate": 2.0769230769230772e-05, + "loss": 0.0414, + "step": 82 + }, + { + "epoch": 0.16, + "grad_norm": 0.5103405117988586, + "learning_rate": 2.102564102564103e-05, + "loss": 0.045, + "step": 83 + }, + { + "epoch": 0.16192771084337348, + "grad_norm": 0.493161678314209, + "learning_rate": 2.1282051282051285e-05, + "loss": 0.021, + "step": 84 + }, + { + "epoch": 0.163855421686747, + "grad_norm": 0.908843994140625, + "learning_rate": 2.153846153846154e-05, + "loss": 0.0389, + "step": 85 + }, + { + "epoch": 0.16578313253012048, + "grad_norm": 0.5067003965377808, + "learning_rate": 2.1794871794871795e-05, + "loss": 0.0272, + "step": 86 + }, + { + "epoch": 0.16771084337349398, + "grad_norm": 0.5791381597518921, + "learning_rate": 2.2051282051282056e-05, + "loss": 0.0368, + "step": 87 + }, + { + "epoch": 0.16963855421686747, + "grad_norm": 0.7056036591529846, + "learning_rate": 2.230769230769231e-05, + "loss": 0.0284, + "step": 88 + }, + { + "epoch": 0.17156626506024097, + "grad_norm": 0.6563822031021118, + "learning_rate": 2.2564102564102566e-05, + "loss": 0.0646, + "step": 89 + }, + { + "epoch": 0.17349397590361446, + "grad_norm": 0.9483286142349243, + "learning_rate": 2.2820512820512822e-05, + "loss": 0.0439, + "step": 90 + }, + { + "epoch": 0.17542168674698796, + "grad_norm": 0.370664119720459, + "learning_rate": 2.3076923076923076e-05, + "loss": 0.0109, + "step": 91 + }, + { + "epoch": 0.17734939759036145, + "grad_norm": 0.9776477813720703, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.0458, + "step": 92 + }, + { + "epoch": 0.17927710843373493, + "grad_norm": 0.45710092782974243, + "learning_rate": 2.3589743589743593e-05, + "loss": 0.0212, + "step": 93 + }, + { + "epoch": 0.18120481927710844, + "grad_norm": 0.8623896837234497, + "learning_rate": 2.384615384615385e-05, + "loss": 0.0215, + "step": 94 + }, + { + "epoch": 0.18313253012048192, + "grad_norm": 0.55814528465271, + "learning_rate": 2.4102564102564103e-05, + "loss": 0.0218, + "step": 95 + }, + { + "epoch": 0.18506024096385543, + "grad_norm": 0.49882641434669495, + "learning_rate": 2.435897435897436e-05, + "loss": 0.0268, + "step": 96 + }, + { + "epoch": 0.1869879518072289, + "grad_norm": 0.3508654534816742, + "learning_rate": 2.461538461538462e-05, + "loss": 0.0172, + "step": 97 + }, + { + "epoch": 0.18891566265060242, + "grad_norm": 0.601170003414154, + "learning_rate": 2.4871794871794873e-05, + "loss": 0.0208, + "step": 98 + }, + { + "epoch": 0.1908433734939759, + "grad_norm": 1.1748133897781372, + "learning_rate": 2.512820512820513e-05, + "loss": 0.0259, + "step": 99 + }, + { + "epoch": 0.1927710843373494, + "grad_norm": 0.46370384097099304, + "learning_rate": 2.5384615384615386e-05, + "loss": 0.0242, + "step": 100 + }, + { + "epoch": 0.1946987951807229, + "grad_norm": 0.525010883808136, + "learning_rate": 2.5641025641025646e-05, + "loss": 0.0188, + "step": 101 + }, + { + "epoch": 0.1966265060240964, + "grad_norm": 0.766501784324646, + "learning_rate": 2.58974358974359e-05, + "loss": 0.0584, + "step": 102 + }, + { + "epoch": 0.19855421686746988, + "grad_norm": 0.3572964370250702, + "learning_rate": 2.6153846153846157e-05, + "loss": 0.0131, + "step": 103 + }, + { + "epoch": 0.20048192771084336, + "grad_norm": 0.6467130780220032, + "learning_rate": 2.6410256410256413e-05, + "loss": 0.0231, + "step": 104 + }, + { + "epoch": 0.20240963855421687, + "grad_norm": 1.1852102279663086, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.027, + "step": 105 + }, + { + "epoch": 0.20433734939759035, + "grad_norm": 2.3659932613372803, + "learning_rate": 2.6923076923076927e-05, + "loss": 0.0224, + "step": 106 + }, + { + "epoch": 0.20626506024096386, + "grad_norm": 0.5343687534332275, + "learning_rate": 2.7179487179487183e-05, + "loss": 0.0198, + "step": 107 + }, + { + "epoch": 0.20819277108433734, + "grad_norm": 1.852160096168518, + "learning_rate": 2.7435897435897437e-05, + "loss": 0.032, + "step": 108 + }, + { + "epoch": 0.21012048192771085, + "grad_norm": 0.47291702032089233, + "learning_rate": 2.7692307692307694e-05, + "loss": 0.0117, + "step": 109 + }, + { + "epoch": 0.21204819277108433, + "grad_norm": 0.7623187899589539, + "learning_rate": 2.794871794871795e-05, + "loss": 0.0337, + "step": 110 + }, + { + "epoch": 0.21397590361445784, + "grad_norm": 0.5272570848464966, + "learning_rate": 2.820512820512821e-05, + "loss": 0.0131, + "step": 111 + }, + { + "epoch": 0.21590361445783132, + "grad_norm": 0.5568500757217407, + "learning_rate": 2.8461538461538464e-05, + "loss": 0.0233, + "step": 112 + }, + { + "epoch": 0.21783132530120483, + "grad_norm": 0.4008469879627228, + "learning_rate": 2.871794871794872e-05, + "loss": 0.0204, + "step": 113 + }, + { + "epoch": 0.2197590361445783, + "grad_norm": 0.4888612926006317, + "learning_rate": 2.8974358974358977e-05, + "loss": 0.016, + "step": 114 + }, + { + "epoch": 0.2216867469879518, + "grad_norm": 0.44903355836868286, + "learning_rate": 2.923076923076923e-05, + "loss": 0.0135, + "step": 115 + }, + { + "epoch": 0.2236144578313253, + "grad_norm": 0.9266762733459473, + "learning_rate": 2.948717948717949e-05, + "loss": 0.0233, + "step": 116 + }, + { + "epoch": 0.22554216867469878, + "grad_norm": 0.5352638959884644, + "learning_rate": 2.9743589743589747e-05, + "loss": 0.0198, + "step": 117 + }, + { + "epoch": 0.2274698795180723, + "grad_norm": 0.6051343679428101, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0246, + "step": 118 + }, + { + "epoch": 0.22939759036144577, + "grad_norm": 0.9971133470535278, + "learning_rate": 3.0256410256410257e-05, + "loss": 0.025, + "step": 119 + }, + { + "epoch": 0.23132530120481928, + "grad_norm": 0.704236626625061, + "learning_rate": 3.0512820512820514e-05, + "loss": 0.031, + "step": 120 + }, + { + "epoch": 0.23325301204819276, + "grad_norm": 0.6137097477912903, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.0519, + "step": 121 + }, + { + "epoch": 0.23518072289156627, + "grad_norm": 0.7396159768104553, + "learning_rate": 3.102564102564103e-05, + "loss": 0.0325, + "step": 122 + }, + { + "epoch": 0.23710843373493976, + "grad_norm": 1.3282053470611572, + "learning_rate": 3.128205128205129e-05, + "loss": 0.0252, + "step": 123 + }, + { + "epoch": 0.23903614457831326, + "grad_norm": 0.5220731496810913, + "learning_rate": 3.153846153846154e-05, + "loss": 0.0262, + "step": 124 + }, + { + "epoch": 0.24096385542168675, + "grad_norm": 0.5357242822647095, + "learning_rate": 3.1794871794871795e-05, + "loss": 0.0243, + "step": 125 + }, + { + "epoch": 0.24289156626506025, + "grad_norm": 0.48207753896713257, + "learning_rate": 3.205128205128206e-05, + "loss": 0.0178, + "step": 126 + }, + { + "epoch": 0.24481927710843374, + "grad_norm": 0.552988588809967, + "learning_rate": 3.230769230769231e-05, + "loss": 0.023, + "step": 127 + }, + { + "epoch": 0.24674698795180722, + "grad_norm": 1.7962840795516968, + "learning_rate": 3.2564102564102565e-05, + "loss": 0.032, + "step": 128 + }, + { + "epoch": 0.24867469879518073, + "grad_norm": 1.6404600143432617, + "learning_rate": 3.282051282051282e-05, + "loss": 0.0231, + "step": 129 + }, + { + "epoch": 0.25060240963855424, + "grad_norm": 0.39142486453056335, + "learning_rate": 3.307692307692308e-05, + "loss": 0.0147, + "step": 130 + }, + { + "epoch": 0.2525301204819277, + "grad_norm": 1.3272887468338013, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0439, + "step": 131 + }, + { + "epoch": 0.2544578313253012, + "grad_norm": 1.5122811794281006, + "learning_rate": 3.358974358974359e-05, + "loss": 0.0282, + "step": 132 + }, + { + "epoch": 0.2563855421686747, + "grad_norm": 1.8542430400848389, + "learning_rate": 3.384615384615385e-05, + "loss": 0.0515, + "step": 133 + }, + { + "epoch": 0.2583132530120482, + "grad_norm": 4.059277534484863, + "learning_rate": 3.4102564102564105e-05, + "loss": 0.0781, + "step": 134 + }, + { + "epoch": 0.26024096385542167, + "grad_norm": 0.6206214427947998, + "learning_rate": 3.435897435897436e-05, + "loss": 0.0306, + "step": 135 + }, + { + "epoch": 0.2621686746987952, + "grad_norm": 0.4575510323047638, + "learning_rate": 3.461538461538462e-05, + "loss": 0.0154, + "step": 136 + }, + { + "epoch": 0.2640963855421687, + "grad_norm": 1.1556978225708008, + "learning_rate": 3.4871794871794875e-05, + "loss": 0.0235, + "step": 137 + }, + { + "epoch": 0.26602409638554214, + "grad_norm": 0.6975051760673523, + "learning_rate": 3.512820512820513e-05, + "loss": 0.0453, + "step": 138 + }, + { + "epoch": 0.26795180722891565, + "grad_norm": 0.8686623573303223, + "learning_rate": 3.538461538461539e-05, + "loss": 0.0427, + "step": 139 + }, + { + "epoch": 0.26987951807228916, + "grad_norm": 2.0681848526000977, + "learning_rate": 3.5641025641025646e-05, + "loss": 0.04, + "step": 140 + }, + { + "epoch": 0.27180722891566267, + "grad_norm": 0.4397984445095062, + "learning_rate": 3.58974358974359e-05, + "loss": 0.0188, + "step": 141 + }, + { + "epoch": 0.2737349397590361, + "grad_norm": 0.5871334075927734, + "learning_rate": 3.615384615384616e-05, + "loss": 0.0253, + "step": 142 + }, + { + "epoch": 0.27566265060240963, + "grad_norm": 1.1078568696975708, + "learning_rate": 3.6410256410256416e-05, + "loss": 0.0316, + "step": 143 + }, + { + "epoch": 0.27759036144578314, + "grad_norm": 0.5691841840744019, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.0266, + "step": 144 + }, + { + "epoch": 0.27951807228915665, + "grad_norm": 0.7896255254745483, + "learning_rate": 3.692307692307693e-05, + "loss": 0.0281, + "step": 145 + }, + { + "epoch": 0.2814457831325301, + "grad_norm": 0.9988337159156799, + "learning_rate": 3.7179487179487186e-05, + "loss": 0.0295, + "step": 146 + }, + { + "epoch": 0.2833734939759036, + "grad_norm": 0.9811834692955017, + "learning_rate": 3.7435897435897436e-05, + "loss": 0.0322, + "step": 147 + }, + { + "epoch": 0.2853012048192771, + "grad_norm": 0.6503105759620667, + "learning_rate": 3.769230769230769e-05, + "loss": 0.0266, + "step": 148 + }, + { + "epoch": 0.28722891566265063, + "grad_norm": 1.9164355993270874, + "learning_rate": 3.794871794871795e-05, + "loss": 0.0677, + "step": 149 + }, + { + "epoch": 0.2891566265060241, + "grad_norm": 1.1724557876586914, + "learning_rate": 3.820512820512821e-05, + "loss": 0.0324, + "step": 150 + }, + { + "epoch": 0.2910843373493976, + "grad_norm": 0.8482469916343689, + "learning_rate": 3.846153846153846e-05, + "loss": 0.0259, + "step": 151 + }, + { + "epoch": 0.2930120481927711, + "grad_norm": 0.8572830557823181, + "learning_rate": 3.871794871794872e-05, + "loss": 0.0358, + "step": 152 + }, + { + "epoch": 0.29493975903614456, + "grad_norm": 0.6630825400352478, + "learning_rate": 3.8974358974358976e-05, + "loss": 0.0447, + "step": 153 + }, + { + "epoch": 0.29686746987951806, + "grad_norm": 0.9197093844413757, + "learning_rate": 3.923076923076923e-05, + "loss": 0.0409, + "step": 154 + }, + { + "epoch": 0.2987951807228916, + "grad_norm": 0.6976819634437561, + "learning_rate": 3.948717948717949e-05, + "loss": 0.0317, + "step": 155 + }, + { + "epoch": 0.3007228915662651, + "grad_norm": 0.7353514432907104, + "learning_rate": 3.9743589743589747e-05, + "loss": 0.0306, + "step": 156 + }, + { + "epoch": 0.30265060240963854, + "grad_norm": 0.5730232000350952, + "learning_rate": 4e-05, + "loss": 0.0324, + "step": 157 + }, + { + "epoch": 0.30457831325301205, + "grad_norm": 0.7852078676223755, + "learning_rate": 3.999994971675547e-05, + "loss": 0.0354, + "step": 158 + }, + { + "epoch": 0.30650602409638555, + "grad_norm": 0.5924715399742126, + "learning_rate": 3.999979886727471e-05, + "loss": 0.0366, + "step": 159 + }, + { + "epoch": 0.30843373493975906, + "grad_norm": 0.7359845638275146, + "learning_rate": 3.999954745231624e-05, + "loss": 0.0437, + "step": 160 + }, + { + "epoch": 0.3103614457831325, + "grad_norm": 0.7866976857185364, + "learning_rate": 3.999919547314426e-05, + "loss": 0.0363, + "step": 161 + }, + { + "epoch": 0.312289156626506, + "grad_norm": 0.7425745129585266, + "learning_rate": 3.999874293152863e-05, + "loss": 0.0259, + "step": 162 + }, + { + "epoch": 0.31421686746987953, + "grad_norm": 1.8922245502471924, + "learning_rate": 3.9998189829744885e-05, + "loss": 0.0341, + "step": 163 + }, + { + "epoch": 0.316144578313253, + "grad_norm": 0.7908634543418884, + "learning_rate": 3.99975361705742e-05, + "loss": 0.0424, + "step": 164 + }, + { + "epoch": 0.3180722891566265, + "grad_norm": 2.047368049621582, + "learning_rate": 3.999678195730337e-05, + "loss": 0.0535, + "step": 165 + }, + { + "epoch": 0.32, + "grad_norm": 0.5702639222145081, + "learning_rate": 3.999592719372484e-05, + "loss": 0.0284, + "step": 166 + }, + { + "epoch": 0.3219277108433735, + "grad_norm": 0.45015648007392883, + "learning_rate": 3.9994971884136636e-05, + "loss": 0.0313, + "step": 167 + }, + { + "epoch": 0.32385542168674697, + "grad_norm": 4.094679355621338, + "learning_rate": 3.9993916033342355e-05, + "loss": 0.0524, + "step": 168 + }, + { + "epoch": 0.3257831325301205, + "grad_norm": 0.800846517086029, + "learning_rate": 3.999275964665117e-05, + "loss": 0.0282, + "step": 169 + }, + { + "epoch": 0.327710843373494, + "grad_norm": 0.47881078720092773, + "learning_rate": 3.999150272987776e-05, + "loss": 0.0293, + "step": 170 + }, + { + "epoch": 0.3296385542168675, + "grad_norm": 0.5716657638549805, + "learning_rate": 3.999014528934232e-05, + "loss": 0.0221, + "step": 171 + }, + { + "epoch": 0.33156626506024095, + "grad_norm": 0.6333311200141907, + "learning_rate": 3.998868733187048e-05, + "loss": 0.0302, + "step": 172 + }, + { + "epoch": 0.33349397590361446, + "grad_norm": 6.642521858215332, + "learning_rate": 3.998712886479335e-05, + "loss": 0.0364, + "step": 173 + }, + { + "epoch": 0.33542168674698797, + "grad_norm": 0.7515506148338318, + "learning_rate": 3.998546989594739e-05, + "loss": 0.0296, + "step": 174 + }, + { + "epoch": 0.3373493975903614, + "grad_norm": 1.0728015899658203, + "learning_rate": 3.998371043367445e-05, + "loss": 0.0549, + "step": 175 + }, + { + "epoch": 0.33927710843373493, + "grad_norm": 1.3025579452514648, + "learning_rate": 3.998185048682166e-05, + "loss": 0.0577, + "step": 176 + }, + { + "epoch": 0.34120481927710844, + "grad_norm": 1.0962958335876465, + "learning_rate": 3.997989006474144e-05, + "loss": 0.0313, + "step": 177 + }, + { + "epoch": 0.34313253012048195, + "grad_norm": 0.7064313292503357, + "learning_rate": 3.997782917729143e-05, + "loss": 0.0309, + "step": 178 + }, + { + "epoch": 0.3450602409638554, + "grad_norm": 0.43374207615852356, + "learning_rate": 3.997566783483445e-05, + "loss": 0.0166, + "step": 179 + }, + { + "epoch": 0.3469879518072289, + "grad_norm": 0.7236390113830566, + "learning_rate": 3.9973406048238413e-05, + "loss": 0.0254, + "step": 180 + }, + { + "epoch": 0.3489156626506024, + "grad_norm": 0.5041500926017761, + "learning_rate": 3.9971043828876334e-05, + "loss": 0.0239, + "step": 181 + }, + { + "epoch": 0.35084337349397593, + "grad_norm": 1.2744532823562622, + "learning_rate": 3.9968581188626204e-05, + "loss": 0.0404, + "step": 182 + }, + { + "epoch": 0.3527710843373494, + "grad_norm": 0.45845362544059753, + "learning_rate": 3.996601813987098e-05, + "loss": 0.0127, + "step": 183 + }, + { + "epoch": 0.3546987951807229, + "grad_norm": 0.4426881968975067, + "learning_rate": 3.996335469549852e-05, + "loss": 0.0176, + "step": 184 + }, + { + "epoch": 0.3566265060240964, + "grad_norm": 1.0030732154846191, + "learning_rate": 3.9960590868901465e-05, + "loss": 0.0457, + "step": 185 + }, + { + "epoch": 0.35855421686746985, + "grad_norm": 0.6428582668304443, + "learning_rate": 3.995772667397725e-05, + "loss": 0.0271, + "step": 186 + }, + { + "epoch": 0.36048192771084336, + "grad_norm": 0.5335744619369507, + "learning_rate": 3.995476212512795e-05, + "loss": 0.0297, + "step": 187 + }, + { + "epoch": 0.3624096385542169, + "grad_norm": 0.6995761394500732, + "learning_rate": 3.99516972372603e-05, + "loss": 0.0322, + "step": 188 + }, + { + "epoch": 0.3643373493975904, + "grad_norm": 0.765511155128479, + "learning_rate": 3.9948532025785546e-05, + "loss": 0.0253, + "step": 189 + }, + { + "epoch": 0.36626506024096384, + "grad_norm": 0.6165828108787537, + "learning_rate": 3.9945266506619403e-05, + "loss": 0.0355, + "step": 190 + }, + { + "epoch": 0.36819277108433734, + "grad_norm": 0.851970911026001, + "learning_rate": 3.994190069618195e-05, + "loss": 0.056, + "step": 191 + }, + { + "epoch": 0.37012048192771085, + "grad_norm": 0.9850023984909058, + "learning_rate": 3.993843461139757e-05, + "loss": 0.0415, + "step": 192 + }, + { + "epoch": 0.37204819277108436, + "grad_norm": 0.7455295324325562, + "learning_rate": 3.9934868269694886e-05, + "loss": 0.0379, + "step": 193 + }, + { + "epoch": 0.3739759036144578, + "grad_norm": 1.159469723701477, + "learning_rate": 3.9931201689006595e-05, + "loss": 0.0237, + "step": 194 + }, + { + "epoch": 0.3759036144578313, + "grad_norm": 0.5490080118179321, + "learning_rate": 3.992743488776947e-05, + "loss": 0.024, + "step": 195 + }, + { + "epoch": 0.37783132530120483, + "grad_norm": 1.279831886291504, + "learning_rate": 3.992356788492421e-05, + "loss": 0.0273, + "step": 196 + }, + { + "epoch": 0.3797590361445783, + "grad_norm": 0.859104335308075, + "learning_rate": 3.9919600699915355e-05, + "loss": 0.0411, + "step": 197 + }, + { + "epoch": 0.3816867469879518, + "grad_norm": 1.2525300979614258, + "learning_rate": 3.991553335269119e-05, + "loss": 0.0857, + "step": 198 + }, + { + "epoch": 0.3836144578313253, + "grad_norm": 0.4924193024635315, + "learning_rate": 3.991136586370367e-05, + "loss": 0.0294, + "step": 199 + }, + { + "epoch": 0.3855421686746988, + "grad_norm": 1.417190670967102, + "learning_rate": 3.990709825390828e-05, + "loss": 0.0395, + "step": 200 + }, + { + "epoch": 0.38746987951807227, + "grad_norm": 0.6172056198120117, + "learning_rate": 3.9902730544763936e-05, + "loss": 0.0194, + "step": 201 + }, + { + "epoch": 0.3893975903614458, + "grad_norm": 0.7292149662971497, + "learning_rate": 3.989826275823291e-05, + "loss": 0.0381, + "step": 202 + }, + { + "epoch": 0.3913253012048193, + "grad_norm": 0.5949816107749939, + "learning_rate": 3.989369491678067e-05, + "loss": 0.0254, + "step": 203 + }, + { + "epoch": 0.3932530120481928, + "grad_norm": 0.6012582182884216, + "learning_rate": 3.988902704337582e-05, + "loss": 0.048, + "step": 204 + }, + { + "epoch": 0.39518072289156625, + "grad_norm": 0.6273590922355652, + "learning_rate": 3.9884259161489936e-05, + "loss": 0.0268, + "step": 205 + }, + { + "epoch": 0.39710843373493976, + "grad_norm": 0.9615244269371033, + "learning_rate": 3.987939129509746e-05, + "loss": 0.0192, + "step": 206 + }, + { + "epoch": 0.39903614457831327, + "grad_norm": 0.6009241342544556, + "learning_rate": 3.9874423468675624e-05, + "loss": 0.0362, + "step": 207 + }, + { + "epoch": 0.4009638554216867, + "grad_norm": 0.411335289478302, + "learning_rate": 3.9869355707204266e-05, + "loss": 0.017, + "step": 208 + }, + { + "epoch": 0.40289156626506023, + "grad_norm": 0.6151527166366577, + "learning_rate": 3.986418803616573e-05, + "loss": 0.0283, + "step": 209 + }, + { + "epoch": 0.40481927710843374, + "grad_norm": 0.33808204531669617, + "learning_rate": 3.985892048154474e-05, + "loss": 0.0158, + "step": 210 + }, + { + "epoch": 0.40674698795180725, + "grad_norm": 0.5464187860488892, + "learning_rate": 3.9853553069828284e-05, + "loss": 0.0292, + "step": 211 + }, + { + "epoch": 0.4086746987951807, + "grad_norm": 0.6658390760421753, + "learning_rate": 3.984808582800543e-05, + "loss": 0.0281, + "step": 212 + }, + { + "epoch": 0.4106024096385542, + "grad_norm": 0.4253764748573303, + "learning_rate": 3.984251878356726e-05, + "loss": 0.031, + "step": 213 + }, + { + "epoch": 0.4125301204819277, + "grad_norm": 0.32309481501579285, + "learning_rate": 3.983685196450667e-05, + "loss": 0.0166, + "step": 214 + }, + { + "epoch": 0.41445783132530123, + "grad_norm": 0.43756410479545593, + "learning_rate": 3.9831085399318265e-05, + "loss": 0.0326, + "step": 215 + }, + { + "epoch": 0.4163855421686747, + "grad_norm": 0.264046847820282, + "learning_rate": 3.982521911699822e-05, + "loss": 0.0118, + "step": 216 + }, + { + "epoch": 0.4183132530120482, + "grad_norm": 0.8630897402763367, + "learning_rate": 3.9819253147044084e-05, + "loss": 0.0246, + "step": 217 + }, + { + "epoch": 0.4202409638554217, + "grad_norm": 0.6923379898071289, + "learning_rate": 3.98131875194547e-05, + "loss": 0.036, + "step": 218 + }, + { + "epoch": 0.42216867469879515, + "grad_norm": 0.5874778628349304, + "learning_rate": 3.9807022264730024e-05, + "loss": 0.0255, + "step": 219 + }, + { + "epoch": 0.42409638554216866, + "grad_norm": 0.394336074590683, + "learning_rate": 3.980075741387094e-05, + "loss": 0.0187, + "step": 220 + }, + { + "epoch": 0.4260240963855422, + "grad_norm": 0.6300327777862549, + "learning_rate": 3.979439299837915e-05, + "loss": 0.0214, + "step": 221 + }, + { + "epoch": 0.4279518072289157, + "grad_norm": 0.5200467109680176, + "learning_rate": 3.978792905025702e-05, + "loss": 0.0628, + "step": 222 + }, + { + "epoch": 0.42987951807228914, + "grad_norm": 0.5713880062103271, + "learning_rate": 3.978136560200735e-05, + "loss": 0.0302, + "step": 223 + }, + { + "epoch": 0.43180722891566264, + "grad_norm": 0.5345383286476135, + "learning_rate": 3.977470268663331e-05, + "loss": 0.0125, + "step": 224 + }, + { + "epoch": 0.43373493975903615, + "grad_norm": 0.5378350019454956, + "learning_rate": 3.976794033763819e-05, + "loss": 0.0246, + "step": 225 + }, + { + "epoch": 0.43566265060240966, + "grad_norm": 0.5554935336112976, + "learning_rate": 3.9761078589025276e-05, + "loss": 0.0212, + "step": 226 + }, + { + "epoch": 0.4375903614457831, + "grad_norm": 0.2832634747028351, + "learning_rate": 3.9754117475297664e-05, + "loss": 0.0125, + "step": 227 + }, + { + "epoch": 0.4395180722891566, + "grad_norm": 1.2910150289535522, + "learning_rate": 3.97470570314581e-05, + "loss": 0.0364, + "step": 228 + }, + { + "epoch": 0.44144578313253013, + "grad_norm": 0.3731018602848053, + "learning_rate": 3.973989729300878e-05, + "loss": 0.0128, + "step": 229 + }, + { + "epoch": 0.4433734939759036, + "grad_norm": 0.9433871507644653, + "learning_rate": 3.9732638295951195e-05, + "loss": 0.0367, + "step": 230 + }, + { + "epoch": 0.4453012048192771, + "grad_norm": 1.0779197216033936, + "learning_rate": 3.972528007678594e-05, + "loss": 0.0667, + "step": 231 + }, + { + "epoch": 0.4472289156626506, + "grad_norm": 1.7009105682373047, + "learning_rate": 3.9717822672512516e-05, + "loss": 0.0655, + "step": 232 + }, + { + "epoch": 0.4491566265060241, + "grad_norm": 0.5646032094955444, + "learning_rate": 3.971026612062919e-05, + "loss": 0.064, + "step": 233 + }, + { + "epoch": 0.45108433734939757, + "grad_norm": 0.44474121928215027, + "learning_rate": 3.970261045913274e-05, + "loss": 0.0206, + "step": 234 + }, + { + "epoch": 0.4530120481927711, + "grad_norm": 1.3969277143478394, + "learning_rate": 3.969485572651833e-05, + "loss": 0.0486, + "step": 235 + }, + { + "epoch": 0.4549397590361446, + "grad_norm": 0.6401994228363037, + "learning_rate": 3.968700196177925e-05, + "loss": 0.0262, + "step": 236 + }, + { + "epoch": 0.4568674698795181, + "grad_norm": 0.7091913223266602, + "learning_rate": 3.96790492044068e-05, + "loss": 0.014, + "step": 237 + }, + { + "epoch": 0.45879518072289155, + "grad_norm": 0.6561547517776489, + "learning_rate": 3.967099749439002e-05, + "loss": 0.0482, + "step": 238 + }, + { + "epoch": 0.46072289156626506, + "grad_norm": 0.6924155354499817, + "learning_rate": 3.966284687221551e-05, + "loss": 0.0289, + "step": 239 + }, + { + "epoch": 0.46265060240963857, + "grad_norm": 0.5868663787841797, + "learning_rate": 3.9654597378867256e-05, + "loss": 0.0331, + "step": 240 + }, + { + "epoch": 0.464578313253012, + "grad_norm": 0.7930939793586731, + "learning_rate": 3.964624905582637e-05, + "loss": 0.0925, + "step": 241 + }, + { + "epoch": 0.46650602409638553, + "grad_norm": 0.4888836145401001, + "learning_rate": 3.9637801945070944e-05, + "loss": 0.015, + "step": 242 + }, + { + "epoch": 0.46843373493975904, + "grad_norm": 0.7820287346839905, + "learning_rate": 3.962925608907579e-05, + "loss": 0.0382, + "step": 243 + }, + { + "epoch": 0.47036144578313255, + "grad_norm": 0.4914316236972809, + "learning_rate": 3.962061153081224e-05, + "loss": 0.0257, + "step": 244 + }, + { + "epoch": 0.472289156626506, + "grad_norm": 0.5681505799293518, + "learning_rate": 3.961186831374793e-05, + "loss": 0.0551, + "step": 245 + }, + { + "epoch": 0.4742168674698795, + "grad_norm": 0.5049723386764526, + "learning_rate": 3.9603026481846616e-05, + "loss": 0.0186, + "step": 246 + }, + { + "epoch": 0.476144578313253, + "grad_norm": 0.5034119486808777, + "learning_rate": 3.959408607956787e-05, + "loss": 0.024, + "step": 247 + }, + { + "epoch": 0.47807228915662653, + "grad_norm": 0.4543336033821106, + "learning_rate": 3.958504715186695e-05, + "loss": 0.0256, + "step": 248 + }, + { + "epoch": 0.48, + "grad_norm": 0.5595743656158447, + "learning_rate": 3.957590974419452e-05, + "loss": 0.0222, + "step": 249 + }, + { + "epoch": 0.4819277108433735, + "grad_norm": 0.5701581239700317, + "learning_rate": 3.956667390249642e-05, + "loss": 0.0334, + "step": 250 + }, + { + "epoch": 0.483855421686747, + "grad_norm": 0.53755784034729, + "learning_rate": 3.9557339673213474e-05, + "loss": 0.0345, + "step": 251 + }, + { + "epoch": 0.4857831325301205, + "grad_norm": 0.4368877112865448, + "learning_rate": 3.95479071032812e-05, + "loss": 0.0183, + "step": 252 + }, + { + "epoch": 0.48771084337349396, + "grad_norm": 0.7972906827926636, + "learning_rate": 3.953837624012963e-05, + "loss": 0.0337, + "step": 253 + }, + { + "epoch": 0.48963855421686747, + "grad_norm": 0.6148451566696167, + "learning_rate": 3.9528747131683023e-05, + "loss": 0.0524, + "step": 254 + }, + { + "epoch": 0.491566265060241, + "grad_norm": 0.500840961933136, + "learning_rate": 3.9519019826359676e-05, + "loss": 0.0248, + "step": 255 + }, + { + "epoch": 0.49349397590361443, + "grad_norm": 0.5536255240440369, + "learning_rate": 3.9509194373071624e-05, + "loss": 0.0219, + "step": 256 + }, + { + "epoch": 0.49542168674698794, + "grad_norm": 0.6873176097869873, + "learning_rate": 3.9499270821224444e-05, + "loss": 0.0312, + "step": 257 + }, + { + "epoch": 0.49734939759036145, + "grad_norm": 0.37207168340682983, + "learning_rate": 3.9489249220716974e-05, + "loss": 0.0149, + "step": 258 + }, + { + "epoch": 0.49927710843373496, + "grad_norm": 0.4458799660205841, + "learning_rate": 3.947912962194107e-05, + "loss": 0.0214, + "step": 259 + }, + { + "epoch": 0.5012048192771085, + "grad_norm": 0.4272724390029907, + "learning_rate": 3.9468912075781345e-05, + "loss": 0.0263, + "step": 260 + }, + { + "epoch": 0.503132530120482, + "grad_norm": 0.5245792269706726, + "learning_rate": 3.945859663361496e-05, + "loss": 0.0103, + "step": 261 + }, + { + "epoch": 0.5050602409638554, + "grad_norm": 0.8799260854721069, + "learning_rate": 3.9448183347311284e-05, + "loss": 0.0292, + "step": 262 + }, + { + "epoch": 0.5069879518072289, + "grad_norm": 0.5996833443641663, + "learning_rate": 3.943767226923171e-05, + "loss": 0.0306, + "step": 263 + }, + { + "epoch": 0.5089156626506024, + "grad_norm": 0.6044682860374451, + "learning_rate": 3.942706345222935e-05, + "loss": 0.0218, + "step": 264 + }, + { + "epoch": 0.5108433734939759, + "grad_norm": 0.4770200848579407, + "learning_rate": 3.941635694964878e-05, + "loss": 0.0226, + "step": 265 + }, + { + "epoch": 0.5127710843373494, + "grad_norm": 0.5605704188346863, + "learning_rate": 3.940555281532576e-05, + "loss": 0.0354, + "step": 266 + }, + { + "epoch": 0.5146987951807229, + "grad_norm": 0.46532443165779114, + "learning_rate": 3.939465110358699e-05, + "loss": 0.0223, + "step": 267 + }, + { + "epoch": 0.5166265060240964, + "grad_norm": 0.5190595388412476, + "learning_rate": 3.93836518692498e-05, + "loss": 0.0219, + "step": 268 + }, + { + "epoch": 0.5185542168674698, + "grad_norm": 0.5767757892608643, + "learning_rate": 3.937255516762193e-05, + "loss": 0.0294, + "step": 269 + }, + { + "epoch": 0.5204819277108433, + "grad_norm": 0.4543164372444153, + "learning_rate": 3.936136105450119e-05, + "loss": 0.0244, + "step": 270 + }, + { + "epoch": 0.5224096385542168, + "grad_norm": 0.4155154526233673, + "learning_rate": 3.9350069586175195e-05, + "loss": 0.02, + "step": 271 + }, + { + "epoch": 0.5243373493975904, + "grad_norm": 0.5470768213272095, + "learning_rate": 3.933868081942113e-05, + "loss": 0.0187, + "step": 272 + }, + { + "epoch": 0.5262650602409639, + "grad_norm": 0.9491772651672363, + "learning_rate": 3.9327194811505406e-05, + "loss": 0.0337, + "step": 273 + }, + { + "epoch": 0.5281927710843374, + "grad_norm": 0.9313873052597046, + "learning_rate": 3.93156116201834e-05, + "loss": 0.0573, + "step": 274 + }, + { + "epoch": 0.5301204819277109, + "grad_norm": 0.7181005477905273, + "learning_rate": 3.930393130369915e-05, + "loss": 0.0405, + "step": 275 + }, + { + "epoch": 0.5320481927710843, + "grad_norm": 0.34231385588645935, + "learning_rate": 3.9292153920785076e-05, + "loss": 0.0153, + "step": 276 + }, + { + "epoch": 0.5339759036144578, + "grad_norm": 0.6899610161781311, + "learning_rate": 3.928027953066168e-05, + "loss": 0.0338, + "step": 277 + }, + { + "epoch": 0.5359036144578313, + "grad_norm": 0.7509781718254089, + "learning_rate": 3.926830819303726e-05, + "loss": 0.0416, + "step": 278 + }, + { + "epoch": 0.5378313253012048, + "grad_norm": 0.6326774954795837, + "learning_rate": 3.925623996810757e-05, + "loss": 0.0293, + "step": 279 + }, + { + "epoch": 0.5397590361445783, + "grad_norm": 0.5543203353881836, + "learning_rate": 3.924407491655557e-05, + "loss": 0.0263, + "step": 280 + }, + { + "epoch": 0.5416867469879518, + "grad_norm": 0.5367572903633118, + "learning_rate": 3.9231813099551086e-05, + "loss": 0.0276, + "step": 281 + }, + { + "epoch": 0.5436144578313253, + "grad_norm": 0.3143869638442993, + "learning_rate": 3.921945457875051e-05, + "loss": 0.0146, + "step": 282 + }, + { + "epoch": 0.5455421686746988, + "grad_norm": 0.47403043508529663, + "learning_rate": 3.920699941629649e-05, + "loss": 0.0267, + "step": 283 + }, + { + "epoch": 0.5474698795180722, + "grad_norm": 0.5082595348358154, + "learning_rate": 3.919444767481763e-05, + "loss": 0.0183, + "step": 284 + }, + { + "epoch": 0.5493975903614458, + "grad_norm": 0.747949481010437, + "learning_rate": 3.918179941742816e-05, + "loss": 0.0412, + "step": 285 + }, + { + "epoch": 0.5513253012048193, + "grad_norm": 0.6553886532783508, + "learning_rate": 3.916905470772762e-05, + "loss": 0.0505, + "step": 286 + }, + { + "epoch": 0.5532530120481928, + "grad_norm": 0.3838176131248474, + "learning_rate": 3.9156213609800545e-05, + "loss": 0.0156, + "step": 287 + }, + { + "epoch": 0.5551807228915663, + "grad_norm": 0.7427731156349182, + "learning_rate": 3.914327618821614e-05, + "loss": 0.0278, + "step": 288 + }, + { + "epoch": 0.5571084337349398, + "grad_norm": 0.2612821161746979, + "learning_rate": 3.913024250802796e-05, + "loss": 0.0101, + "step": 289 + }, + { + "epoch": 0.5590361445783133, + "grad_norm": 0.3799416124820709, + "learning_rate": 3.911711263477357e-05, + "loss": 0.0168, + "step": 290 + }, + { + "epoch": 0.5609638554216867, + "grad_norm": 0.5053854584693909, + "learning_rate": 3.910388663447425e-05, + "loss": 0.0249, + "step": 291 + }, + { + "epoch": 0.5628915662650602, + "grad_norm": 0.38095012307167053, + "learning_rate": 3.909056457363461e-05, + "loss": 0.0156, + "step": 292 + }, + { + "epoch": 0.5648192771084337, + "grad_norm": 0.4477892220020294, + "learning_rate": 3.907714651924229e-05, + "loss": 0.0309, + "step": 293 + }, + { + "epoch": 0.5667469879518072, + "grad_norm": 0.5875864624977112, + "learning_rate": 3.906363253876763e-05, + "loss": 0.0287, + "step": 294 + }, + { + "epoch": 0.5686746987951807, + "grad_norm": 0.522990882396698, + "learning_rate": 3.90500227001633e-05, + "loss": 0.0318, + "step": 295 + }, + { + "epoch": 0.5706024096385542, + "grad_norm": 0.4153876304626465, + "learning_rate": 3.9036317071863994e-05, + "loss": 0.0192, + "step": 296 + }, + { + "epoch": 0.5725301204819278, + "grad_norm": 0.4675769507884979, + "learning_rate": 3.902251572278605e-05, + "loss": 0.067, + "step": 297 + }, + { + "epoch": 0.5744578313253013, + "grad_norm": 0.35778650641441345, + "learning_rate": 3.900861872232713e-05, + "loss": 0.0197, + "step": 298 + }, + { + "epoch": 0.5763855421686747, + "grad_norm": 0.7382330894470215, + "learning_rate": 3.899462614036587e-05, + "loss": 0.0283, + "step": 299 + }, + { + "epoch": 0.5783132530120482, + "grad_norm": 0.41268599033355713, + "learning_rate": 3.89805380472615e-05, + "loss": 0.0207, + "step": 300 + }, + { + "epoch": 0.5802409638554217, + "grad_norm": 1.2013020515441895, + "learning_rate": 3.8966354513853535e-05, + "loss": 0.0301, + "step": 301 + }, + { + "epoch": 0.5821686746987952, + "grad_norm": 0.424757719039917, + "learning_rate": 3.895207561146137e-05, + "loss": 0.022, + "step": 302 + }, + { + "epoch": 0.5840963855421687, + "grad_norm": 0.4196677505970001, + "learning_rate": 3.893770141188396e-05, + "loss": 0.0424, + "step": 303 + }, + { + "epoch": 0.5860240963855422, + "grad_norm": 0.8644190430641174, + "learning_rate": 3.892323198739946e-05, + "loss": 0.08, + "step": 304 + }, + { + "epoch": 0.5879518072289157, + "grad_norm": 0.5645135045051575, + "learning_rate": 3.890866741076482e-05, + "loss": 0.0152, + "step": 305 + }, + { + "epoch": 0.5898795180722891, + "grad_norm": 0.5218387246131897, + "learning_rate": 3.889400775521545e-05, + "loss": 0.0205, + "step": 306 + }, + { + "epoch": 0.5918072289156626, + "grad_norm": 0.39709413051605225, + "learning_rate": 3.8879253094464865e-05, + "loss": 0.0233, + "step": 307 + }, + { + "epoch": 0.5937349397590361, + "grad_norm": 0.3572910726070404, + "learning_rate": 3.8864403502704285e-05, + "loss": 0.0198, + "step": 308 + }, + { + "epoch": 0.5956626506024096, + "grad_norm": 0.382709264755249, + "learning_rate": 3.8849459054602274e-05, + "loss": 0.0176, + "step": 309 + }, + { + "epoch": 0.5975903614457831, + "grad_norm": 3.4527227878570557, + "learning_rate": 3.883441982530436e-05, + "loss": 0.0239, + "step": 310 + }, + { + "epoch": 0.5995180722891567, + "grad_norm": 0.4467569589614868, + "learning_rate": 3.8819285890432674e-05, + "loss": 0.0284, + "step": 311 + }, + { + "epoch": 0.6014457831325302, + "grad_norm": 0.44513460993766785, + "learning_rate": 3.880405732608555e-05, + "loss": 0.0233, + "step": 312 + }, + { + "epoch": 0.6033734939759036, + "grad_norm": 0.8029689192771912, + "learning_rate": 3.8788734208837155e-05, + "loss": 0.0433, + "step": 313 + }, + { + "epoch": 0.6053012048192771, + "grad_norm": 0.7291454076766968, + "learning_rate": 3.877331661573709e-05, + "loss": 0.043, + "step": 314 + }, + { + "epoch": 0.6072289156626506, + "grad_norm": 0.6050467491149902, + "learning_rate": 3.8757804624310006e-05, + "loss": 0.0377, + "step": 315 + }, + { + "epoch": 0.6091566265060241, + "grad_norm": 0.6714366674423218, + "learning_rate": 3.874219831255524e-05, + "loss": 0.046, + "step": 316 + }, + { + "epoch": 0.6110843373493976, + "grad_norm": 0.336037278175354, + "learning_rate": 3.8726497758946394e-05, + "loss": 0.0149, + "step": 317 + }, + { + "epoch": 0.6130120481927711, + "grad_norm": 0.3057402968406677, + "learning_rate": 3.871070304243094e-05, + "loss": 0.014, + "step": 318 + }, + { + "epoch": 0.6149397590361446, + "grad_norm": 0.4537644684314728, + "learning_rate": 3.8694814242429834e-05, + "loss": 0.0503, + "step": 319 + }, + { + "epoch": 0.6168674698795181, + "grad_norm": 0.45573824644088745, + "learning_rate": 3.8678831438837116e-05, + "loss": 0.021, + "step": 320 + }, + { + "epoch": 0.6187951807228915, + "grad_norm": 0.30729591846466064, + "learning_rate": 3.866275471201952e-05, + "loss": 0.0163, + "step": 321 + }, + { + "epoch": 0.620722891566265, + "grad_norm": 0.7614850401878357, + "learning_rate": 3.8646584142816036e-05, + "loss": 0.0347, + "step": 322 + }, + { + "epoch": 0.6226506024096385, + "grad_norm": 0.5323611497879028, + "learning_rate": 3.863031981253754e-05, + "loss": 0.0201, + "step": 323 + }, + { + "epoch": 0.624578313253012, + "grad_norm": 0.34426453709602356, + "learning_rate": 3.861396180296635e-05, + "loss": 0.0243, + "step": 324 + }, + { + "epoch": 0.6265060240963856, + "grad_norm": 0.621636152267456, + "learning_rate": 3.859751019635585e-05, + "loss": 0.0166, + "step": 325 + }, + { + "epoch": 0.6284337349397591, + "grad_norm": 0.549324095249176, + "learning_rate": 3.858096507543006e-05, + "loss": 0.0274, + "step": 326 + }, + { + "epoch": 0.6303614457831326, + "grad_norm": 0.358426570892334, + "learning_rate": 3.8564326523383214e-05, + "loss": 0.0207, + "step": 327 + }, + { + "epoch": 0.632289156626506, + "grad_norm": 0.3639723062515259, + "learning_rate": 3.8547594623879346e-05, + "loss": 0.0297, + "step": 328 + }, + { + "epoch": 0.6342168674698795, + "grad_norm": 0.3402212858200073, + "learning_rate": 3.853076946105188e-05, + "loss": 0.0258, + "step": 329 + }, + { + "epoch": 0.636144578313253, + "grad_norm": 0.4083027243614197, + "learning_rate": 3.85138511195032e-05, + "loss": 0.0351, + "step": 330 + }, + { + "epoch": 0.6380722891566265, + "grad_norm": 0.43532121181488037, + "learning_rate": 3.84968396843042e-05, + "loss": 0.0388, + "step": 331 + }, + { + "epoch": 0.64, + "grad_norm": 0.35353463888168335, + "learning_rate": 3.8479735240993904e-05, + "loss": 0.0203, + "step": 332 + }, + { + "epoch": 0.6419277108433735, + "grad_norm": 0.350149929523468, + "learning_rate": 3.846253787557901e-05, + "loss": 0.0261, + "step": 333 + }, + { + "epoch": 0.643855421686747, + "grad_norm": 0.7665389180183411, + "learning_rate": 3.844524767453344e-05, + "loss": 0.0108, + "step": 334 + }, + { + "epoch": 0.6457831325301204, + "grad_norm": 0.44621360301971436, + "learning_rate": 3.842786472479795e-05, + "loss": 0.0282, + "step": 335 + }, + { + "epoch": 0.6477108433734939, + "grad_norm": 0.7787201404571533, + "learning_rate": 3.841038911377962e-05, + "loss": 0.0216, + "step": 336 + }, + { + "epoch": 0.6496385542168674, + "grad_norm": 0.48260653018951416, + "learning_rate": 3.839282092935153e-05, + "loss": 0.0234, + "step": 337 + }, + { + "epoch": 0.651566265060241, + "grad_norm": 0.4987852871417999, + "learning_rate": 3.837516025985219e-05, + "loss": 0.0515, + "step": 338 + }, + { + "epoch": 0.6534939759036145, + "grad_norm": 0.9030266404151917, + "learning_rate": 3.835740719408517e-05, + "loss": 0.0508, + "step": 339 + }, + { + "epoch": 0.655421686746988, + "grad_norm": 0.6381701231002808, + "learning_rate": 3.833956182131867e-05, + "loss": 0.0405, + "step": 340 + }, + { + "epoch": 0.6573493975903615, + "grad_norm": 0.42828986048698425, + "learning_rate": 3.832162423128499e-05, + "loss": 0.024, + "step": 341 + }, + { + "epoch": 0.659277108433735, + "grad_norm": 0.38725873827934265, + "learning_rate": 3.8303594514180164e-05, + "loss": 0.0199, + "step": 342 + }, + { + "epoch": 0.6612048192771084, + "grad_norm": 0.23280498385429382, + "learning_rate": 3.828547276066346e-05, + "loss": 0.0101, + "step": 343 + }, + { + "epoch": 0.6631325301204819, + "grad_norm": 0.7298216819763184, + "learning_rate": 3.8267259061856925e-05, + "loss": 0.0455, + "step": 344 + }, + { + "epoch": 0.6650602409638554, + "grad_norm": 0.5975687503814697, + "learning_rate": 3.824895350934496e-05, + "loss": 0.0372, + "step": 345 + }, + { + "epoch": 0.6669879518072289, + "grad_norm": 0.6295403242111206, + "learning_rate": 3.823055619517381e-05, + "loss": 0.0362, + "step": 346 + }, + { + "epoch": 0.6689156626506024, + "grad_norm": 0.5086020827293396, + "learning_rate": 3.821206721185115e-05, + "loss": 0.0368, + "step": 347 + }, + { + "epoch": 0.6708433734939759, + "grad_norm": 0.34506168961524963, + "learning_rate": 3.819348665234557e-05, + "loss": 0.0178, + "step": 348 + }, + { + "epoch": 0.6727710843373494, + "grad_norm": 1.309940218925476, + "learning_rate": 3.817481461008617e-05, + "loss": 0.024, + "step": 349 + }, + { + "epoch": 0.6746987951807228, + "grad_norm": 0.4074770510196686, + "learning_rate": 3.815605117896204e-05, + "loss": 0.0262, + "step": 350 + }, + { + "epoch": 0.6766265060240964, + "grad_norm": 0.48525840044021606, + "learning_rate": 3.8137196453321775e-05, + "loss": 0.0209, + "step": 351 + }, + { + "epoch": 0.6785542168674699, + "grad_norm": 0.7199739217758179, + "learning_rate": 3.811825052797308e-05, + "loss": 0.0396, + "step": 352 + }, + { + "epoch": 0.6804819277108434, + "grad_norm": 0.519540011882782, + "learning_rate": 3.8099213498182196e-05, + "loss": 0.0453, + "step": 353 + }, + { + "epoch": 0.6824096385542169, + "grad_norm": 0.9738391041755676, + "learning_rate": 3.808008545967349e-05, + "loss": 0.0317, + "step": 354 + }, + { + "epoch": 0.6843373493975904, + "grad_norm": 1.888344407081604, + "learning_rate": 3.8060866508628953e-05, + "loss": 0.0452, + "step": 355 + }, + { + "epoch": 0.6862650602409639, + "grad_norm": 0.48989811539649963, + "learning_rate": 3.8041556741687695e-05, + "loss": 0.0315, + "step": 356 + }, + { + "epoch": 0.6881927710843373, + "grad_norm": 0.3764645457267761, + "learning_rate": 3.8022156255945496e-05, + "loss": 0.0269, + "step": 357 + }, + { + "epoch": 0.6901204819277108, + "grad_norm": 0.46409738063812256, + "learning_rate": 3.800266514895429e-05, + "loss": 0.0171, + "step": 358 + }, + { + "epoch": 0.6920481927710843, + "grad_norm": 0.41091030836105347, + "learning_rate": 3.7983083518721695e-05, + "loss": 0.0167, + "step": 359 + }, + { + "epoch": 0.6939759036144578, + "grad_norm": 0.8375523090362549, + "learning_rate": 3.79634114637105e-05, + "loss": 0.0342, + "step": 360 + }, + { + "epoch": 0.6959036144578313, + "grad_norm": 1.7053394317626953, + "learning_rate": 3.794364908283817e-05, + "loss": 0.02, + "step": 361 + }, + { + "epoch": 0.6978313253012048, + "grad_norm": 0.4163115918636322, + "learning_rate": 3.792379647547637e-05, + "loss": 0.0138, + "step": 362 + }, + { + "epoch": 0.6997590361445784, + "grad_norm": 0.388751745223999, + "learning_rate": 3.790385374145046e-05, + "loss": 0.0172, + "step": 363 + }, + { + "epoch": 0.7016867469879519, + "grad_norm": 0.5584064722061157, + "learning_rate": 3.7883820981038966e-05, + "loss": 0.0254, + "step": 364 + }, + { + "epoch": 0.7036144578313253, + "grad_norm": 1.394264817237854, + "learning_rate": 3.7863698294973114e-05, + "loss": 0.037, + "step": 365 + }, + { + "epoch": 0.7055421686746988, + "grad_norm": 0.46280744671821594, + "learning_rate": 3.78434857844363e-05, + "loss": 0.0234, + "step": 366 + }, + { + "epoch": 0.7074698795180723, + "grad_norm": 0.39548924565315247, + "learning_rate": 3.782318355106358e-05, + "loss": 0.0164, + "step": 367 + }, + { + "epoch": 0.7093975903614458, + "grad_norm": 0.7307773232460022, + "learning_rate": 3.780279169694118e-05, + "loss": 0.0192, + "step": 368 + }, + { + "epoch": 0.7113253012048193, + "grad_norm": 0.28035807609558105, + "learning_rate": 3.778231032460594e-05, + "loss": 0.0131, + "step": 369 + }, + { + "epoch": 0.7132530120481928, + "grad_norm": 0.8376953601837158, + "learning_rate": 3.776173953704486e-05, + "loss": 0.0291, + "step": 370 + }, + { + "epoch": 0.7151807228915663, + "grad_norm": 0.7356843948364258, + "learning_rate": 3.774107943769454e-05, + "loss": 0.0214, + "step": 371 + }, + { + "epoch": 0.7171084337349397, + "grad_norm": 0.41503390669822693, + "learning_rate": 3.772033013044064e-05, + "loss": 0.0221, + "step": 372 + }, + { + "epoch": 0.7190361445783132, + "grad_norm": 0.35732385516166687, + "learning_rate": 3.7699491719617436e-05, + "loss": 0.015, + "step": 373 + }, + { + "epoch": 0.7209638554216867, + "grad_norm": 0.283778578042984, + "learning_rate": 3.76785643100072e-05, + "loss": 0.0146, + "step": 374 + }, + { + "epoch": 0.7228915662650602, + "grad_norm": 0.3219413459300995, + "learning_rate": 3.765754800683974e-05, + "loss": 0.015, + "step": 375 + }, + { + "epoch": 0.7248192771084337, + "grad_norm": 0.610431432723999, + "learning_rate": 3.7636442915791856e-05, + "loss": 0.0326, + "step": 376 + }, + { + "epoch": 0.7267469879518073, + "grad_norm": 4.944870948791504, + "learning_rate": 3.7615249142986784e-05, + "loss": 0.0432, + "step": 377 + }, + { + "epoch": 0.7286746987951808, + "grad_norm": 0.4894593060016632, + "learning_rate": 3.7593966794993696e-05, + "loss": 0.0174, + "step": 378 + }, + { + "epoch": 0.7306024096385542, + "grad_norm": 0.4211325943470001, + "learning_rate": 3.757259597882714e-05, + "loss": 0.023, + "step": 379 + }, + { + "epoch": 0.7325301204819277, + "grad_norm": 0.33621737360954285, + "learning_rate": 3.755113680194651e-05, + "loss": 0.0201, + "step": 380 + }, + { + "epoch": 0.7344578313253012, + "grad_norm": 0.5799694657325745, + "learning_rate": 3.7529589372255514e-05, + "loss": 0.0173, + "step": 381 + }, + { + "epoch": 0.7363855421686747, + "grad_norm": 0.5172572731971741, + "learning_rate": 3.750795379810162e-05, + "loss": 0.0284, + "step": 382 + }, + { + "epoch": 0.7383132530120482, + "grad_norm": 0.5715453028678894, + "learning_rate": 3.748623018827552e-05, + "loss": 0.0194, + "step": 383 + }, + { + "epoch": 0.7402409638554217, + "grad_norm": 0.5284178256988525, + "learning_rate": 3.746441865201056e-05, + "loss": 0.0247, + "step": 384 + }, + { + "epoch": 0.7421686746987952, + "grad_norm": 0.37828654050827026, + "learning_rate": 3.744251929898223e-05, + "loss": 0.0097, + "step": 385 + }, + { + "epoch": 0.7440963855421687, + "grad_norm": 0.3252779543399811, + "learning_rate": 3.742053223930758e-05, + "loss": 0.0238, + "step": 386 + }, + { + "epoch": 0.7460240963855421, + "grad_norm": 0.6031543612480164, + "learning_rate": 3.7398457583544674e-05, + "loss": 0.0332, + "step": 387 + }, + { + "epoch": 0.7479518072289156, + "grad_norm": 0.23846614360809326, + "learning_rate": 3.737629544269206e-05, + "loss": 0.0122, + "step": 388 + }, + { + "epoch": 0.7498795180722891, + "grad_norm": 0.5274029970169067, + "learning_rate": 3.7354045928188155e-05, + "loss": 0.0324, + "step": 389 + }, + { + "epoch": 0.7518072289156627, + "grad_norm": 0.4672217071056366, + "learning_rate": 3.733170915191075e-05, + "loss": 0.0196, + "step": 390 + }, + { + "epoch": 0.7537349397590362, + "grad_norm": 0.29819396138191223, + "learning_rate": 3.730928522617639e-05, + "loss": 0.0131, + "step": 391 + }, + { + "epoch": 0.7556626506024097, + "grad_norm": 0.43824997544288635, + "learning_rate": 3.7286774263739855e-05, + "loss": 0.0238, + "step": 392 + }, + { + "epoch": 0.7575903614457832, + "grad_norm": 0.2822072505950928, + "learning_rate": 3.726417637779357e-05, + "loss": 0.0314, + "step": 393 + }, + { + "epoch": 0.7595180722891566, + "grad_norm": 0.43815648555755615, + "learning_rate": 3.7241491681967044e-05, + "loss": 0.0144, + "step": 394 + }, + { + "epoch": 0.7614457831325301, + "grad_norm": 0.37194815278053284, + "learning_rate": 3.721872029032628e-05, + "loss": 0.0286, + "step": 395 + }, + { + "epoch": 0.7633734939759036, + "grad_norm": 0.7319737672805786, + "learning_rate": 3.719586231737322e-05, + "loss": 0.0427, + "step": 396 + }, + { + "epoch": 0.7653012048192771, + "grad_norm": 0.5870066285133362, + "learning_rate": 3.717291787804517e-05, + "loss": 0.0138, + "step": 397 + }, + { + "epoch": 0.7672289156626506, + "grad_norm": 0.6574277281761169, + "learning_rate": 3.7149887087714225e-05, + "loss": 0.061, + "step": 398 + }, + { + "epoch": 0.7691566265060241, + "grad_norm": 0.5467348694801331, + "learning_rate": 3.712677006218666e-05, + "loss": 0.022, + "step": 399 + }, + { + "epoch": 0.7710843373493976, + "grad_norm": 0.3589288890361786, + "learning_rate": 3.710356691770238e-05, + "loss": 0.0161, + "step": 400 + }, + { + "epoch": 0.7730120481927711, + "grad_norm": 0.574630618095398, + "learning_rate": 3.708027777093433e-05, + "loss": 0.0285, + "step": 401 + }, + { + "epoch": 0.7749397590361445, + "grad_norm": 0.39048445224761963, + "learning_rate": 3.70569027389879e-05, + "loss": 0.012, + "step": 402 + }, + { + "epoch": 0.776867469879518, + "grad_norm": 0.34803536534309387, + "learning_rate": 3.703344193940032e-05, + "loss": 0.0155, + "step": 403 + }, + { + "epoch": 0.7787951807228916, + "grad_norm": 1.188948392868042, + "learning_rate": 3.700989549014011e-05, + "loss": 0.0617, + "step": 404 + }, + { + "epoch": 0.7807228915662651, + "grad_norm": 0.473157674074173, + "learning_rate": 3.698626350960646e-05, + "loss": 0.0298, + "step": 405 + }, + { + "epoch": 0.7826506024096386, + "grad_norm": 0.42009076476097107, + "learning_rate": 3.6962546116628634e-05, + "loss": 0.03, + "step": 406 + }, + { + "epoch": 0.7845783132530121, + "grad_norm": 0.6334308981895447, + "learning_rate": 3.693874343046537e-05, + "loss": 0.0107, + "step": 407 + }, + { + "epoch": 0.7865060240963856, + "grad_norm": 0.35594677925109863, + "learning_rate": 3.6914855570804314e-05, + "loss": 0.0174, + "step": 408 + }, + { + "epoch": 0.788433734939759, + "grad_norm": 0.28985708951950073, + "learning_rate": 3.689088265776136e-05, + "loss": 0.0149, + "step": 409 + }, + { + "epoch": 0.7903614457831325, + "grad_norm": 0.3981950581073761, + "learning_rate": 3.686682481188011e-05, + "loss": 0.019, + "step": 410 + }, + { + "epoch": 0.792289156626506, + "grad_norm": 0.48819583654403687, + "learning_rate": 3.6842682154131193e-05, + "loss": 0.0217, + "step": 411 + }, + { + "epoch": 0.7942168674698795, + "grad_norm": 0.42819952964782715, + "learning_rate": 3.681845480591174e-05, + "loss": 0.0198, + "step": 412 + }, + { + "epoch": 0.796144578313253, + "grad_norm": 0.48591694235801697, + "learning_rate": 3.6794142889044727e-05, + "loss": 0.0253, + "step": 413 + }, + { + "epoch": 0.7980722891566265, + "grad_norm": 0.4730607271194458, + "learning_rate": 3.676974652577835e-05, + "loss": 0.0329, + "step": 414 + }, + { + "epoch": 0.8, + "grad_norm": 0.5390865802764893, + "learning_rate": 3.6745265838785434e-05, + "loss": 0.0479, + "step": 415 + }, + { + "epoch": 0.8019277108433734, + "grad_norm": 0.6377891302108765, + "learning_rate": 3.672070095116283e-05, + "loss": 0.019, + "step": 416 + }, + { + "epoch": 0.803855421686747, + "grad_norm": 0.8984615206718445, + "learning_rate": 3.669605198643075e-05, + "loss": 0.0444, + "step": 417 + }, + { + "epoch": 0.8057831325301205, + "grad_norm": 0.4913877546787262, + "learning_rate": 3.667131906853219e-05, + "loss": 0.031, + "step": 418 + }, + { + "epoch": 0.807710843373494, + "grad_norm": 0.37894028425216675, + "learning_rate": 3.664650232183229e-05, + "loss": 0.0195, + "step": 419 + }, + { + "epoch": 0.8096385542168675, + "grad_norm": 0.3644949495792389, + "learning_rate": 3.66216018711177e-05, + "loss": 0.018, + "step": 420 + }, + { + "epoch": 0.811566265060241, + "grad_norm": 0.414440393447876, + "learning_rate": 3.659661784159597e-05, + "loss": 0.0188, + "step": 421 + }, + { + "epoch": 0.8134939759036145, + "grad_norm": 0.49220341444015503, + "learning_rate": 3.65715503588949e-05, + "loss": 0.016, + "step": 422 + }, + { + "epoch": 0.815421686746988, + "grad_norm": 1.0939836502075195, + "learning_rate": 3.654639954906193e-05, + "loss": 0.0758, + "step": 423 + }, + { + "epoch": 0.8173493975903614, + "grad_norm": 0.43222442269325256, + "learning_rate": 3.652116553856349e-05, + "loss": 0.0308, + "step": 424 + }, + { + "epoch": 0.8192771084337349, + "grad_norm": 0.5081896185874939, + "learning_rate": 3.649584845428438e-05, + "loss": 0.0493, + "step": 425 + }, + { + "epoch": 0.8212048192771084, + "grad_norm": 0.9811948537826538, + "learning_rate": 3.64704484235271e-05, + "loss": 0.019, + "step": 426 + }, + { + "epoch": 0.8231325301204819, + "grad_norm": 0.31656572222709656, + "learning_rate": 3.6444965574011255e-05, + "loss": 0.0135, + "step": 427 + }, + { + "epoch": 0.8250602409638554, + "grad_norm": 0.7844433188438416, + "learning_rate": 3.641940003387289e-05, + "loss": 0.0402, + "step": 428 + }, + { + "epoch": 0.826987951807229, + "grad_norm": 0.3353273570537567, + "learning_rate": 3.6393751931663814e-05, + "loss": 0.0132, + "step": 429 + }, + { + "epoch": 0.8289156626506025, + "grad_norm": 0.7253058552742004, + "learning_rate": 3.6368021396351015e-05, + "loss": 0.0296, + "step": 430 + }, + { + "epoch": 0.8308433734939759, + "grad_norm": 0.45300304889678955, + "learning_rate": 3.634220855731598e-05, + "loss": 0.0258, + "step": 431 + }, + { + "epoch": 0.8327710843373494, + "grad_norm": 0.3480473458766937, + "learning_rate": 3.631631354435403e-05, + "loss": 0.0099, + "step": 432 + }, + { + "epoch": 0.8346987951807229, + "grad_norm": 2.1114516258239746, + "learning_rate": 3.62903364876737e-05, + "loss": 0.0457, + "step": 433 + }, + { + "epoch": 0.8366265060240964, + "grad_norm": 0.5649561882019043, + "learning_rate": 3.626427751789606e-05, + "loss": 0.0444, + "step": 434 + }, + { + "epoch": 0.8385542168674699, + "grad_norm": 0.3864995539188385, + "learning_rate": 3.623813676605405e-05, + "loss": 0.0223, + "step": 435 + }, + { + "epoch": 0.8404819277108434, + "grad_norm": 1.2134298086166382, + "learning_rate": 3.621191436359186e-05, + "loss": 0.0353, + "step": 436 + }, + { + "epoch": 0.8424096385542169, + "grad_norm": 0.4403415024280548, + "learning_rate": 3.6185610442364246e-05, + "loss": 0.0216, + "step": 437 + }, + { + "epoch": 0.8443373493975903, + "grad_norm": 0.6050297021865845, + "learning_rate": 3.6159225134635846e-05, + "loss": 0.0433, + "step": 438 + }, + { + "epoch": 0.8462650602409638, + "grad_norm": 0.7951678037643433, + "learning_rate": 3.6132758573080556e-05, + "loss": 0.031, + "step": 439 + }, + { + "epoch": 0.8481927710843373, + "grad_norm": 0.4991949796676636, + "learning_rate": 3.6106210890780834e-05, + "loss": 0.0313, + "step": 440 + }, + { + "epoch": 0.8501204819277108, + "grad_norm": 0.47951385378837585, + "learning_rate": 3.607958222122704e-05, + "loss": 0.0218, + "step": 441 + }, + { + "epoch": 0.8520481927710843, + "grad_norm": 0.7345194220542908, + "learning_rate": 3.6052872698316755e-05, + "loss": 0.0239, + "step": 442 + }, + { + "epoch": 0.8539759036144579, + "grad_norm": 1.4814884662628174, + "learning_rate": 3.602608245635414e-05, + "loss": 0.0127, + "step": 443 + }, + { + "epoch": 0.8559036144578314, + "grad_norm": 2.4240877628326416, + "learning_rate": 3.599921163004922e-05, + "loss": 0.0618, + "step": 444 + }, + { + "epoch": 0.8578313253012049, + "grad_norm": 0.41523510217666626, + "learning_rate": 3.5972260354517216e-05, + "loss": 0.0283, + "step": 445 + }, + { + "epoch": 0.8597590361445783, + "grad_norm": 0.5577677488327026, + "learning_rate": 3.594522876527791e-05, + "loss": 0.0271, + "step": 446 + }, + { + "epoch": 0.8616867469879518, + "grad_norm": 0.5829064846038818, + "learning_rate": 3.591811699825487e-05, + "loss": 0.0169, + "step": 447 + }, + { + "epoch": 0.8636144578313253, + "grad_norm": 0.4478822350502014, + "learning_rate": 3.5890925189774886e-05, + "loss": 0.0239, + "step": 448 + }, + { + "epoch": 0.8655421686746988, + "grad_norm": 0.3498048782348633, + "learning_rate": 3.586365347656718e-05, + "loss": 0.0137, + "step": 449 + }, + { + "epoch": 0.8674698795180723, + "grad_norm": 0.6571130156517029, + "learning_rate": 3.583630199576278e-05, + "loss": 0.027, + "step": 450 + }, + { + "epoch": 0.8693975903614458, + "grad_norm": 0.344970166683197, + "learning_rate": 3.58088708848938e-05, + "loss": 0.0167, + "step": 451 + }, + { + "epoch": 0.8713253012048193, + "grad_norm": 0.34611570835113525, + "learning_rate": 3.5781360281892775e-05, + "loss": 0.0468, + "step": 452 + }, + { + "epoch": 0.8732530120481927, + "grad_norm": 0.66157066822052, + "learning_rate": 3.575377032509194e-05, + "loss": 0.0344, + "step": 453 + }, + { + "epoch": 0.8751807228915662, + "grad_norm": 0.3676326870918274, + "learning_rate": 3.5726101153222534e-05, + "loss": 0.0366, + "step": 454 + }, + { + "epoch": 0.8771084337349397, + "grad_norm": 0.5958423018455505, + "learning_rate": 3.569835290541414e-05, + "loss": 0.0382, + "step": 455 + }, + { + "epoch": 0.8790361445783132, + "grad_norm": 0.36787471175193787, + "learning_rate": 3.567052572119397e-05, + "loss": 0.018, + "step": 456 + }, + { + "epoch": 0.8809638554216868, + "grad_norm": 0.9478234052658081, + "learning_rate": 3.564261974048611e-05, + "loss": 0.0179, + "step": 457 + }, + { + "epoch": 0.8828915662650603, + "grad_norm": 0.3337579369544983, + "learning_rate": 3.56146351036109e-05, + "loss": 0.0147, + "step": 458 + }, + { + "epoch": 0.8848192771084338, + "grad_norm": 0.4984932243824005, + "learning_rate": 3.558657195128416e-05, + "loss": 0.0224, + "step": 459 + }, + { + "epoch": 0.8867469879518072, + "grad_norm": 0.36718735098838806, + "learning_rate": 3.555843042461653e-05, + "loss": 0.0202, + "step": 460 + }, + { + "epoch": 0.8886746987951807, + "grad_norm": 0.4081745445728302, + "learning_rate": 3.553021066511274e-05, + "loss": 0.0288, + "step": 461 + }, + { + "epoch": 0.8906024096385542, + "grad_norm": 0.3233242332935333, + "learning_rate": 3.55019128146709e-05, + "loss": 0.0362, + "step": 462 + }, + { + "epoch": 0.8925301204819277, + "grad_norm": 0.6560158729553223, + "learning_rate": 3.547353701558178e-05, + "loss": 0.038, + "step": 463 + }, + { + "epoch": 0.8944578313253012, + "grad_norm": 0.47668641805648804, + "learning_rate": 3.544508341052811e-05, + "loss": 0.0399, + "step": 464 + }, + { + "epoch": 0.8963855421686747, + "grad_norm": 0.45512664318084717, + "learning_rate": 3.541655214258383e-05, + "loss": 0.022, + "step": 465 + }, + { + "epoch": 0.8983132530120482, + "grad_norm": 0.8410730361938477, + "learning_rate": 3.538794335521343e-05, + "loss": 0.0315, + "step": 466 + }, + { + "epoch": 0.9002409638554217, + "grad_norm": 0.4872909486293793, + "learning_rate": 3.535925719227117e-05, + "loss": 0.0152, + "step": 467 + }, + { + "epoch": 0.9021686746987951, + "grad_norm": 0.45623311400413513, + "learning_rate": 3.533049379800038e-05, + "loss": 0.0305, + "step": 468 + }, + { + "epoch": 0.9040963855421686, + "grad_norm": 0.43087029457092285, + "learning_rate": 3.530165331703275e-05, + "loss": 0.0131, + "step": 469 + }, + { + "epoch": 0.9060240963855422, + "grad_norm": 0.4610525369644165, + "learning_rate": 3.527273589438756e-05, + "loss": 0.0187, + "step": 470 + }, + { + "epoch": 0.9079518072289157, + "grad_norm": 0.3356114327907562, + "learning_rate": 3.5243741675471006e-05, + "loss": 0.0185, + "step": 471 + }, + { + "epoch": 0.9098795180722892, + "grad_norm": 0.9065960049629211, + "learning_rate": 3.5214670806075426e-05, + "loss": 0.0433, + "step": 472 + }, + { + "epoch": 0.9118072289156627, + "grad_norm": 0.3652578294277191, + "learning_rate": 3.518552343237858e-05, + "loss": 0.02, + "step": 473 + }, + { + "epoch": 0.9137349397590362, + "grad_norm": 0.32377883791923523, + "learning_rate": 3.5156299700942916e-05, + "loss": 0.0165, + "step": 474 + }, + { + "epoch": 0.9156626506024096, + "grad_norm": 0.2431817352771759, + "learning_rate": 3.512699975871485e-05, + "loss": 0.0172, + "step": 475 + }, + { + "epoch": 0.9175903614457831, + "grad_norm": 0.6390707492828369, + "learning_rate": 3.509762375302399e-05, + "loss": 0.0356, + "step": 476 + }, + { + "epoch": 0.9195180722891566, + "grad_norm": 0.2283092886209488, + "learning_rate": 3.506817183158243e-05, + "loss": 0.0088, + "step": 477 + }, + { + "epoch": 0.9214457831325301, + "grad_norm": 0.5053914189338684, + "learning_rate": 3.5038644142483966e-05, + "loss": 0.0389, + "step": 478 + }, + { + "epoch": 0.9233734939759036, + "grad_norm": 0.2567576467990875, + "learning_rate": 3.500904083420342e-05, + "loss": 0.0155, + "step": 479 + }, + { + "epoch": 0.9253012048192771, + "grad_norm": 0.6852384209632874, + "learning_rate": 3.497936205559583e-05, + "loss": 0.0247, + "step": 480 + }, + { + "epoch": 0.9272289156626506, + "grad_norm": 0.36403414607048035, + "learning_rate": 3.494960795589572e-05, + "loss": 0.023, + "step": 481 + }, + { + "epoch": 0.929156626506024, + "grad_norm": 0.506554901599884, + "learning_rate": 3.491977868471635e-05, + "loss": 0.0273, + "step": 482 + }, + { + "epoch": 0.9310843373493976, + "grad_norm": 0.38329923152923584, + "learning_rate": 3.4889874392048985e-05, + "loss": 0.0169, + "step": 483 + }, + { + "epoch": 0.9330120481927711, + "grad_norm": 0.2805836498737335, + "learning_rate": 3.48598952282621e-05, + "loss": 0.0105, + "step": 484 + }, + { + "epoch": 0.9349397590361446, + "grad_norm": 0.6315302848815918, + "learning_rate": 3.482984134410067e-05, + "loss": 0.0289, + "step": 485 + }, + { + "epoch": 0.9368674698795181, + "grad_norm": 0.6431388854980469, + "learning_rate": 3.479971289068537e-05, + "loss": 0.0311, + "step": 486 + }, + { + "epoch": 0.9387951807228916, + "grad_norm": 0.9794723391532898, + "learning_rate": 3.476951001951184e-05, + "loss": 0.0452, + "step": 487 + }, + { + "epoch": 0.9407228915662651, + "grad_norm": 0.7984824180603027, + "learning_rate": 3.473923288244991e-05, + "loss": 0.0689, + "step": 488 + }, + { + "epoch": 0.9426506024096386, + "grad_norm": 0.46362006664276123, + "learning_rate": 3.470888163174286e-05, + "loss": 0.0241, + "step": 489 + }, + { + "epoch": 0.944578313253012, + "grad_norm": 0.5051195025444031, + "learning_rate": 3.467845642000661e-05, + "loss": 0.0228, + "step": 490 + }, + { + "epoch": 0.9465060240963855, + "grad_norm": 0.3082812428474426, + "learning_rate": 3.4647957400229004e-05, + "loss": 0.0144, + "step": 491 + }, + { + "epoch": 0.948433734939759, + "grad_norm": 0.2691391110420227, + "learning_rate": 3.461738472576902e-05, + "loss": 0.0167, + "step": 492 + }, + { + "epoch": 0.9503614457831325, + "grad_norm": 0.5627671480178833, + "learning_rate": 3.458673855035597e-05, + "loss": 0.031, + "step": 493 + }, + { + "epoch": 0.952289156626506, + "grad_norm": 0.4571435749530792, + "learning_rate": 3.455601902808876e-05, + "loss": 0.0191, + "step": 494 + }, + { + "epoch": 0.9542168674698795, + "grad_norm": 1.0117709636688232, + "learning_rate": 3.452522631343515e-05, + "loss": 0.0192, + "step": 495 + }, + { + "epoch": 0.9561445783132531, + "grad_norm": 0.28375712037086487, + "learning_rate": 3.449436056123086e-05, + "loss": 0.0159, + "step": 496 + }, + { + "epoch": 0.9580722891566265, + "grad_norm": 0.26381856203079224, + "learning_rate": 3.446342192667893e-05, + "loss": 0.0113, + "step": 497 + }, + { + "epoch": 0.96, + "grad_norm": 0.49317577481269836, + "learning_rate": 3.443241056534884e-05, + "loss": 0.0332, + "step": 498 + }, + { + "epoch": 0.9619277108433735, + "grad_norm": 0.28884485363960266, + "learning_rate": 3.440132663317579e-05, + "loss": 0.0117, + "step": 499 + }, + { + "epoch": 0.963855421686747, + "grad_norm": 0.36255285143852234, + "learning_rate": 3.4370170286459864e-05, + "loss": 0.0169, + "step": 500 + }, + { + "epoch": 0.9657831325301205, + "grad_norm": 0.4265049993991852, + "learning_rate": 3.433894168186529e-05, + "loss": 0.0217, + "step": 501 + }, + { + "epoch": 0.967710843373494, + "grad_norm": 0.8169426321983337, + "learning_rate": 3.430764097641962e-05, + "loss": 0.0207, + "step": 502 + }, + { + "epoch": 0.9696385542168675, + "grad_norm": 1.866077184677124, + "learning_rate": 3.427626832751296e-05, + "loss": 0.0381, + "step": 503 + }, + { + "epoch": 0.971566265060241, + "grad_norm": 0.33124980330467224, + "learning_rate": 3.424482389289716e-05, + "loss": 0.0245, + "step": 504 + }, + { + "epoch": 0.9734939759036144, + "grad_norm": 0.37479540705680847, + "learning_rate": 3.4213307830685055e-05, + "loss": 0.0164, + "step": 505 + }, + { + "epoch": 0.9754216867469879, + "grad_norm": 0.39738863706588745, + "learning_rate": 3.4181720299349615e-05, + "loss": 0.0297, + "step": 506 + }, + { + "epoch": 0.9773493975903614, + "grad_norm": 0.2567287087440491, + "learning_rate": 3.4150061457723205e-05, + "loss": 0.0102, + "step": 507 + }, + { + "epoch": 0.9792771084337349, + "grad_norm": 0.6230517029762268, + "learning_rate": 3.411833146499675e-05, + "loss": 0.0243, + "step": 508 + }, + { + "epoch": 0.9812048192771085, + "grad_norm": 0.44843971729278564, + "learning_rate": 3.408653048071894e-05, + "loss": 0.0357, + "step": 509 + }, + { + "epoch": 0.983132530120482, + "grad_norm": 1.0569655895233154, + "learning_rate": 3.405465866479546e-05, + "loss": 0.037, + "step": 510 + }, + { + "epoch": 0.9850602409638555, + "grad_norm": 0.29000964760780334, + "learning_rate": 3.402271617748812e-05, + "loss": 0.0129, + "step": 511 + }, + { + "epoch": 0.9869879518072289, + "grad_norm": 2.1627447605133057, + "learning_rate": 3.399070317941413e-05, + "loss": 0.0442, + "step": 512 + }, + { + "epoch": 0.9889156626506024, + "grad_norm": 0.27371272444725037, + "learning_rate": 3.395861983154522e-05, + "loss": 0.0119, + "step": 513 + }, + { + "epoch": 0.9908433734939759, + "grad_norm": 0.4117226302623749, + "learning_rate": 3.392646629520688e-05, + "loss": 0.0455, + "step": 514 + }, + { + "epoch": 0.9927710843373494, + "grad_norm": 0.5098996758460999, + "learning_rate": 3.389424273207752e-05, + "loss": 0.0203, + "step": 515 + }, + { + "epoch": 0.9946987951807229, + "grad_norm": 0.5192157626152039, + "learning_rate": 3.386194930418767e-05, + "loss": 0.0329, + "step": 516 + }, + { + "epoch": 0.9966265060240964, + "grad_norm": 0.18757697939872742, + "learning_rate": 3.382958617391915e-05, + "loss": 0.0065, + "step": 517 + }, + { + "epoch": 0.9985542168674699, + "grad_norm": 0.3334413170814514, + "learning_rate": 3.3797153504004296e-05, + "loss": 0.0266, + "step": 518 + }, + { + "epoch": 1.0, + "grad_norm": 0.4152225852012634, + "learning_rate": 3.3764651457525095e-05, + "loss": 0.0169, + "step": 519 + }, + { + "epoch": 1.0019277108433735, + "grad_norm": 0.43535247445106506, + "learning_rate": 3.373208019791237e-05, + "loss": 0.0221, + "step": 520 + }, + { + "epoch": 1.003855421686747, + "grad_norm": 0.39292722940444946, + "learning_rate": 3.3699439888945e-05, + "loss": 0.0211, + "step": 521 + }, + { + "epoch": 1.0057831325301205, + "grad_norm": 0.19566713273525238, + "learning_rate": 3.366673069474904e-05, + "loss": 0.0069, + "step": 522 + }, + { + "epoch": 1.007710843373494, + "grad_norm": 0.5101853609085083, + "learning_rate": 3.3633952779796914e-05, + "loss": 0.0191, + "step": 523 + }, + { + "epoch": 1.0096385542168675, + "grad_norm": 0.999434769153595, + "learning_rate": 3.360110630890664e-05, + "loss": 0.0196, + "step": 524 + }, + { + "epoch": 1.011566265060241, + "grad_norm": 0.4646223783493042, + "learning_rate": 3.356819144724092e-05, + "loss": 0.0328, + "step": 525 + }, + { + "epoch": 1.0134939759036146, + "grad_norm": 0.3132480978965759, + "learning_rate": 3.3535208360306354e-05, + "loss": 0.0203, + "step": 526 + }, + { + "epoch": 1.0154216867469879, + "grad_norm": 0.3038032352924347, + "learning_rate": 3.350215721395261e-05, + "loss": 0.0122, + "step": 527 + }, + { + "epoch": 1.0173493975903614, + "grad_norm": 0.45082882046699524, + "learning_rate": 3.346903817437157e-05, + "loss": 0.0437, + "step": 528 + }, + { + "epoch": 1.0192771084337349, + "grad_norm": 0.26917046308517456, + "learning_rate": 3.343585140809651e-05, + "loss": 0.013, + "step": 529 + }, + { + "epoch": 1.0212048192771084, + "grad_norm": 0.23869264125823975, + "learning_rate": 3.3402597082001276e-05, + "loss": 0.008, + "step": 530 + }, + { + "epoch": 1.0231325301204819, + "grad_norm": 0.31315353512763977, + "learning_rate": 3.3369275363299394e-05, + "loss": 0.0078, + "step": 531 + }, + { + "epoch": 1.0250602409638554, + "grad_norm": 0.4780346751213074, + "learning_rate": 3.333588641954327e-05, + "loss": 0.0225, + "step": 532 + }, + { + "epoch": 1.026987951807229, + "grad_norm": 0.2920368015766144, + "learning_rate": 3.330243041862336e-05, + "loss": 0.0118, + "step": 533 + }, + { + "epoch": 1.0289156626506024, + "grad_norm": 0.543669581413269, + "learning_rate": 3.326890752876728e-05, + "loss": 0.0338, + "step": 534 + }, + { + "epoch": 1.030843373493976, + "grad_norm": 0.4288000464439392, + "learning_rate": 3.323531791853901e-05, + "loss": 0.0341, + "step": 535 + }, + { + "epoch": 1.0327710843373494, + "grad_norm": 0.26600322127342224, + "learning_rate": 3.3201661756838e-05, + "loss": 0.0184, + "step": 536 + }, + { + "epoch": 1.034698795180723, + "grad_norm": 0.290937602519989, + "learning_rate": 3.316793921289835e-05, + "loss": 0.0152, + "step": 537 + }, + { + "epoch": 1.0366265060240965, + "grad_norm": 0.7621443271636963, + "learning_rate": 3.313415045628795e-05, + "loss": 0.0326, + "step": 538 + }, + { + "epoch": 1.03855421686747, + "grad_norm": 0.5581283569335938, + "learning_rate": 3.3100295656907646e-05, + "loss": 0.0164, + "step": 539 + }, + { + "epoch": 1.0404819277108435, + "grad_norm": 0.20930901169776917, + "learning_rate": 3.306637498499034e-05, + "loss": 0.0091, + "step": 540 + }, + { + "epoch": 1.0424096385542168, + "grad_norm": 0.46212059259414673, + "learning_rate": 3.303238861110018e-05, + "loss": 0.0118, + "step": 541 + }, + { + "epoch": 1.0443373493975903, + "grad_norm": 0.38259151577949524, + "learning_rate": 3.299833670613168e-05, + "loss": 0.0081, + "step": 542 + }, + { + "epoch": 1.0462650602409638, + "grad_norm": 0.4888618290424347, + "learning_rate": 3.2964219441308865e-05, + "loss": 0.0138, + "step": 543 + }, + { + "epoch": 1.0481927710843373, + "grad_norm": 0.32103127241134644, + "learning_rate": 3.2930036988184425e-05, + "loss": 0.0171, + "step": 544 + }, + { + "epoch": 1.0501204819277108, + "grad_norm": 0.27787327766418457, + "learning_rate": 3.28957895186388e-05, + "loss": 0.0106, + "step": 545 + }, + { + "epoch": 1.0520481927710843, + "grad_norm": 0.35597777366638184, + "learning_rate": 3.2861477204879395e-05, + "loss": 0.0123, + "step": 546 + }, + { + "epoch": 1.0539759036144578, + "grad_norm": 0.3619804084300995, + "learning_rate": 3.2827100219439656e-05, + "loss": 0.0088, + "step": 547 + }, + { + "epoch": 1.0559036144578313, + "grad_norm": 0.2525513470172882, + "learning_rate": 3.279265873517822e-05, + "loss": 0.0179, + "step": 548 + }, + { + "epoch": 1.0578313253012048, + "grad_norm": 0.3910020887851715, + "learning_rate": 3.275815292527804e-05, + "loss": 0.0142, + "step": 549 + }, + { + "epoch": 1.0597590361445783, + "grad_norm": 0.30515050888061523, + "learning_rate": 3.2723582963245526e-05, + "loss": 0.0123, + "step": 550 + }, + { + "epoch": 1.0616867469879518, + "grad_norm": 0.21708644926548004, + "learning_rate": 3.2688949022909665e-05, + "loss": 0.0098, + "step": 551 + }, + { + "epoch": 1.0636144578313254, + "grad_norm": 0.23307719826698303, + "learning_rate": 3.265425127842114e-05, + "loss": 0.0097, + "step": 552 + }, + { + "epoch": 1.0655421686746989, + "grad_norm": 0.676654577255249, + "learning_rate": 3.261948990425147e-05, + "loss": 0.0227, + "step": 553 + }, + { + "epoch": 1.0674698795180724, + "grad_norm": 0.4593975841999054, + "learning_rate": 3.258466507519213e-05, + "loss": 0.047, + "step": 554 + }, + { + "epoch": 1.0693975903614459, + "grad_norm": 0.19405829906463623, + "learning_rate": 3.254977696635366e-05, + "loss": 0.0314, + "step": 555 + }, + { + "epoch": 1.0713253012048192, + "grad_norm": 0.14563389122486115, + "learning_rate": 3.2514825753164774e-05, + "loss": 0.0046, + "step": 556 + }, + { + "epoch": 1.0732530120481927, + "grad_norm": 0.2642340064048767, + "learning_rate": 3.247981161137153e-05, + "loss": 0.022, + "step": 557 + }, + { + "epoch": 1.0751807228915662, + "grad_norm": 0.17274761199951172, + "learning_rate": 3.2444734717036386e-05, + "loss": 0.0134, + "step": 558 + }, + { + "epoch": 1.0771084337349397, + "grad_norm": 0.44354626536369324, + "learning_rate": 3.240959524653735e-05, + "loss": 0.0211, + "step": 559 + }, + { + "epoch": 1.0790361445783132, + "grad_norm": 0.2806888818740845, + "learning_rate": 3.237439337656708e-05, + "loss": 0.0141, + "step": 560 + }, + { + "epoch": 1.0809638554216867, + "grad_norm": 0.21679501235485077, + "learning_rate": 3.2339129284131994e-05, + "loss": 0.019, + "step": 561 + }, + { + "epoch": 1.0828915662650602, + "grad_norm": 0.3040260076522827, + "learning_rate": 3.2303803146551386e-05, + "loss": 0.0249, + "step": 562 + }, + { + "epoch": 1.0848192771084337, + "grad_norm": 0.2793775200843811, + "learning_rate": 3.226841514145656e-05, + "loss": 0.0088, + "step": 563 + }, + { + "epoch": 1.0867469879518072, + "grad_norm": 0.149955615401268, + "learning_rate": 3.223296544678987e-05, + "loss": 0.0054, + "step": 564 + }, + { + "epoch": 1.0886746987951808, + "grad_norm": 0.22166767716407776, + "learning_rate": 3.219745424080389e-05, + "loss": 0.0109, + "step": 565 + }, + { + "epoch": 1.0906024096385543, + "grad_norm": 0.22399431467056274, + "learning_rate": 3.2161881702060476e-05, + "loss": 0.0106, + "step": 566 + }, + { + "epoch": 1.0925301204819278, + "grad_norm": 0.18537986278533936, + "learning_rate": 3.2126248009429905e-05, + "loss": 0.0077, + "step": 567 + }, + { + "epoch": 1.0944578313253013, + "grad_norm": 0.24511495232582092, + "learning_rate": 3.2090553342089935e-05, + "loss": 0.0093, + "step": 568 + }, + { + "epoch": 1.0963855421686748, + "grad_norm": 0.4766045808792114, + "learning_rate": 3.205479787952494e-05, + "loss": 0.036, + "step": 569 + }, + { + "epoch": 1.0983132530120483, + "grad_norm": 0.1425715535879135, + "learning_rate": 3.201898180152499e-05, + "loss": 0.0085, + "step": 570 + }, + { + "epoch": 1.1002409638554216, + "grad_norm": 0.1909666359424591, + "learning_rate": 3.1983105288184945e-05, + "loss": 0.0081, + "step": 571 + }, + { + "epoch": 1.102168674698795, + "grad_norm": 0.44077104330062866, + "learning_rate": 3.194716851990355e-05, + "loss": 0.017, + "step": 572 + }, + { + "epoch": 1.1040963855421686, + "grad_norm": 0.5757400989532471, + "learning_rate": 3.191117167738253e-05, + "loss": 0.021, + "step": 573 + }, + { + "epoch": 1.106024096385542, + "grad_norm": 0.1977701038122177, + "learning_rate": 3.1875114941625705e-05, + "loss": 0.0096, + "step": 574 + }, + { + "epoch": 1.1079518072289156, + "grad_norm": 0.3524581491947174, + "learning_rate": 3.1838998493938026e-05, + "loss": 0.0118, + "step": 575 + }, + { + "epoch": 1.1098795180722891, + "grad_norm": 0.3301331698894501, + "learning_rate": 3.180282251592472e-05, + "loss": 0.0094, + "step": 576 + }, + { + "epoch": 1.1118072289156626, + "grad_norm": 0.2774488925933838, + "learning_rate": 3.1766587189490336e-05, + "loss": 0.0131, + "step": 577 + }, + { + "epoch": 1.1137349397590361, + "grad_norm": 1.732595443725586, + "learning_rate": 3.173029269683785e-05, + "loss": 0.0445, + "step": 578 + }, + { + "epoch": 1.1156626506024097, + "grad_norm": 0.28746843338012695, + "learning_rate": 3.169393922046776e-05, + "loss": 0.0116, + "step": 579 + }, + { + "epoch": 1.1175903614457832, + "grad_norm": 0.2952995002269745, + "learning_rate": 3.165752694317713e-05, + "loss": 0.0116, + "step": 580 + }, + { + "epoch": 1.1195180722891567, + "grad_norm": 0.2938575744628906, + "learning_rate": 3.16210560480587e-05, + "loss": 0.013, + "step": 581 + }, + { + "epoch": 1.1214457831325302, + "grad_norm": 0.22283495962619781, + "learning_rate": 3.158452671849998e-05, + "loss": 0.0052, + "step": 582 + }, + { + "epoch": 1.1233734939759037, + "grad_norm": 0.6272858381271362, + "learning_rate": 3.154793913818226e-05, + "loss": 0.0182, + "step": 583 + }, + { + "epoch": 1.1253012048192772, + "grad_norm": 0.479753702878952, + "learning_rate": 3.1511293491079804e-05, + "loss": 0.0146, + "step": 584 + }, + { + "epoch": 1.1272289156626507, + "grad_norm": 0.31104400753974915, + "learning_rate": 3.1474589961458786e-05, + "loss": 0.0139, + "step": 585 + }, + { + "epoch": 1.129156626506024, + "grad_norm": 0.4932832419872284, + "learning_rate": 3.1437828733876477e-05, + "loss": 0.0236, + "step": 586 + }, + { + "epoch": 1.1310843373493975, + "grad_norm": 0.222808837890625, + "learning_rate": 3.140100999318025e-05, + "loss": 0.0084, + "step": 587 + }, + { + "epoch": 1.133012048192771, + "grad_norm": 0.4515356719493866, + "learning_rate": 3.136413392450668e-05, + "loss": 0.0215, + "step": 588 + }, + { + "epoch": 1.1349397590361445, + "grad_norm": 0.39302268624305725, + "learning_rate": 3.132720071328061e-05, + "loss": 0.0154, + "step": 589 + }, + { + "epoch": 1.136867469879518, + "grad_norm": 0.43382835388183594, + "learning_rate": 3.1290210545214205e-05, + "loss": 0.0088, + "step": 590 + }, + { + "epoch": 1.1387951807228915, + "grad_norm": 0.18707136809825897, + "learning_rate": 3.125316360630602e-05, + "loss": 0.0126, + "step": 591 + }, + { + "epoch": 1.140722891566265, + "grad_norm": 0.5688219666481018, + "learning_rate": 3.121606008284011e-05, + "loss": 0.0147, + "step": 592 + }, + { + "epoch": 1.1426506024096386, + "grad_norm": 0.3321833312511444, + "learning_rate": 3.1178900161385005e-05, + "loss": 0.0119, + "step": 593 + }, + { + "epoch": 1.144578313253012, + "grad_norm": 0.3738424777984619, + "learning_rate": 3.114168402879286e-05, + "loss": 0.0158, + "step": 594 + }, + { + "epoch": 1.1465060240963856, + "grad_norm": 0.2386978417634964, + "learning_rate": 3.110441187219846e-05, + "loss": 0.0107, + "step": 595 + }, + { + "epoch": 1.148433734939759, + "grad_norm": 0.2165699452161789, + "learning_rate": 3.10670838790183e-05, + "loss": 0.0079, + "step": 596 + }, + { + "epoch": 1.1503614457831326, + "grad_norm": 0.25952696800231934, + "learning_rate": 3.102970023694965e-05, + "loss": 0.0147, + "step": 597 + }, + { + "epoch": 1.152289156626506, + "grad_norm": 0.21448305249214172, + "learning_rate": 3.099226113396959e-05, + "loss": 0.0099, + "step": 598 + }, + { + "epoch": 1.1542168674698796, + "grad_norm": 0.37226060032844543, + "learning_rate": 3.095476675833405e-05, + "loss": 0.0214, + "step": 599 + }, + { + "epoch": 1.1561445783132531, + "grad_norm": 0.29637983441352844, + "learning_rate": 3.0917217298576955e-05, + "loss": 0.0118, + "step": 600 + }, + { + "epoch": 1.1580722891566264, + "grad_norm": 0.18535609543323517, + "learning_rate": 3.0879612943509154e-05, + "loss": 0.0086, + "step": 601 + }, + { + "epoch": 1.16, + "grad_norm": 0.25874125957489014, + "learning_rate": 3.0841953882217536e-05, + "loss": 0.0088, + "step": 602 + }, + { + "epoch": 1.1619277108433734, + "grad_norm": 0.46092745661735535, + "learning_rate": 3.08042403040641e-05, + "loss": 0.0241, + "step": 603 + }, + { + "epoch": 1.163855421686747, + "grad_norm": 0.27023249864578247, + "learning_rate": 3.076647239868494e-05, + "loss": 0.0154, + "step": 604 + }, + { + "epoch": 1.1657831325301204, + "grad_norm": 0.445157527923584, + "learning_rate": 3.072865035598933e-05, + "loss": 0.0197, + "step": 605 + }, + { + "epoch": 1.167710843373494, + "grad_norm": 0.18097272515296936, + "learning_rate": 3.06907743661588e-05, + "loss": 0.0093, + "step": 606 + }, + { + "epoch": 1.1696385542168675, + "grad_norm": 0.22469942271709442, + "learning_rate": 3.065284461964609e-05, + "loss": 0.0171, + "step": 607 + }, + { + "epoch": 1.171566265060241, + "grad_norm": 0.20190906524658203, + "learning_rate": 3.061486130717428e-05, + "loss": 0.008, + "step": 608 + }, + { + "epoch": 1.1734939759036145, + "grad_norm": 0.18294145166873932, + "learning_rate": 3.057682461973579e-05, + "loss": 0.0155, + "step": 609 + }, + { + "epoch": 1.175421686746988, + "grad_norm": 0.34203943610191345, + "learning_rate": 3.053873474859143e-05, + "loss": 0.0212, + "step": 610 + }, + { + "epoch": 1.1773493975903615, + "grad_norm": 0.49073582887649536, + "learning_rate": 3.050059188526942e-05, + "loss": 0.019, + "step": 611 + }, + { + "epoch": 1.179277108433735, + "grad_norm": 0.3537680506706238, + "learning_rate": 3.046239622156446e-05, + "loss": 0.0147, + "step": 612 + }, + { + "epoch": 1.1812048192771085, + "grad_norm": 0.2584632635116577, + "learning_rate": 3.042414794953674e-05, + "loss": 0.0088, + "step": 613 + }, + { + "epoch": 1.1831325301204818, + "grad_norm": 0.3529360890388489, + "learning_rate": 3.0385847261510975e-05, + "loss": 0.0187, + "step": 614 + }, + { + "epoch": 1.1850602409638555, + "grad_norm": 0.3331570327281952, + "learning_rate": 3.0347494350075465e-05, + "loss": 0.0124, + "step": 615 + }, + { + "epoch": 1.1869879518072288, + "grad_norm": 0.2223527580499649, + "learning_rate": 3.0309089408081074e-05, + "loss": 0.01, + "step": 616 + }, + { + "epoch": 1.1889156626506023, + "grad_norm": 0.21985746920108795, + "learning_rate": 3.027063262864032e-05, + "loss": 0.0087, + "step": 617 + }, + { + "epoch": 1.1908433734939758, + "grad_norm": 0.2989653944969177, + "learning_rate": 3.023212420512637e-05, + "loss": 0.0137, + "step": 618 + }, + { + "epoch": 1.1927710843373494, + "grad_norm": 0.17423275113105774, + "learning_rate": 3.0193564331172074e-05, + "loss": 0.0056, + "step": 619 + }, + { + "epoch": 1.1946987951807229, + "grad_norm": 1.0992127656936646, + "learning_rate": 3.0154953200668976e-05, + "loss": 0.0274, + "step": 620 + }, + { + "epoch": 1.1966265060240964, + "grad_norm": 0.21641989052295685, + "learning_rate": 3.011629100776638e-05, + "loss": 0.0151, + "step": 621 + }, + { + "epoch": 1.1985542168674699, + "grad_norm": 0.4558199644088745, + "learning_rate": 3.007757794687033e-05, + "loss": 0.0424, + "step": 622 + }, + { + "epoch": 1.2004819277108434, + "grad_norm": 0.42380189895629883, + "learning_rate": 3.003881421264266e-05, + "loss": 0.0079, + "step": 623 + }, + { + "epoch": 1.202409638554217, + "grad_norm": 0.28791171312332153, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0142, + "step": 624 + }, + { + "epoch": 1.2043373493975904, + "grad_norm": 0.3906581997871399, + "learning_rate": 2.996113550411281e-05, + "loss": 0.0251, + "step": 625 + }, + { + "epoch": 1.206265060240964, + "grad_norm": 0.47848746180534363, + "learning_rate": 2.9922220920404375e-05, + "loss": 0.0137, + "step": 626 + }, + { + "epoch": 1.2081927710843374, + "grad_norm": 0.22666941583156586, + "learning_rate": 2.9883256444549862e-05, + "loss": 0.0105, + "step": 627 + }, + { + "epoch": 1.210120481927711, + "grad_norm": 0.18968136608600616, + "learning_rate": 2.984424227247529e-05, + "loss": 0.0089, + "step": 628 + }, + { + "epoch": 1.2120481927710842, + "grad_norm": 0.28732606768608093, + "learning_rate": 2.980517860035656e-05, + "loss": 0.0253, + "step": 629 + }, + { + "epoch": 1.213975903614458, + "grad_norm": 0.21131543815135956, + "learning_rate": 2.9766065624618518e-05, + "loss": 0.0134, + "step": 630 + }, + { + "epoch": 1.2159036144578312, + "grad_norm": 0.7594877481460571, + "learning_rate": 2.972690354193388e-05, + "loss": 0.0157, + "step": 631 + }, + { + "epoch": 1.2178313253012047, + "grad_norm": 0.730291485786438, + "learning_rate": 2.96876925492223e-05, + "loss": 0.0204, + "step": 632 + }, + { + "epoch": 1.2197590361445783, + "grad_norm": 0.20333674550056458, + "learning_rate": 2.9648432843649382e-05, + "loss": 0.0114, + "step": 633 + }, + { + "epoch": 1.2216867469879518, + "grad_norm": 0.5680793523788452, + "learning_rate": 2.960912462262566e-05, + "loss": 0.0146, + "step": 634 + }, + { + "epoch": 1.2236144578313253, + "grad_norm": 0.4591079354286194, + "learning_rate": 2.9569768083805618e-05, + "loss": 0.0112, + "step": 635 + }, + { + "epoch": 1.2255421686746988, + "grad_norm": 0.3793511390686035, + "learning_rate": 2.953036342508671e-05, + "loss": 0.0377, + "step": 636 + }, + { + "epoch": 1.2274698795180723, + "grad_norm": 1.118723750114441, + "learning_rate": 2.9490910844608346e-05, + "loss": 0.0432, + "step": 637 + }, + { + "epoch": 1.2293975903614458, + "grad_norm": 0.36990776658058167, + "learning_rate": 2.9451410540750887e-05, + "loss": 0.0203, + "step": 638 + }, + { + "epoch": 1.2313253012048193, + "grad_norm": 0.930397629737854, + "learning_rate": 2.94118627121347e-05, + "loss": 0.0311, + "step": 639 + }, + { + "epoch": 1.2332530120481928, + "grad_norm": 0.2347625195980072, + "learning_rate": 2.9372267557619075e-05, + "loss": 0.0168, + "step": 640 + }, + { + "epoch": 1.2351807228915663, + "grad_norm": 0.3720332384109497, + "learning_rate": 2.933262527630131e-05, + "loss": 0.0136, + "step": 641 + }, + { + "epoch": 1.2371084337349398, + "grad_norm": 0.4871984124183655, + "learning_rate": 2.929293606751565e-05, + "loss": 0.0339, + "step": 642 + }, + { + "epoch": 1.2390361445783133, + "grad_norm": 0.35853689908981323, + "learning_rate": 2.9253200130832322e-05, + "loss": 0.0095, + "step": 643 + }, + { + "epoch": 1.2409638554216866, + "grad_norm": 0.42003703117370605, + "learning_rate": 2.92134176660565e-05, + "loss": 0.0142, + "step": 644 + }, + { + "epoch": 1.2428915662650604, + "grad_norm": 0.3854500651359558, + "learning_rate": 2.9173588873227338e-05, + "loss": 0.0209, + "step": 645 + }, + { + "epoch": 1.2448192771084337, + "grad_norm": 0.24665917456150055, + "learning_rate": 2.913371395261691e-05, + "loss": 0.0087, + "step": 646 + }, + { + "epoch": 1.2467469879518072, + "grad_norm": 0.41571593284606934, + "learning_rate": 2.9093793104729268e-05, + "loss": 0.0164, + "step": 647 + }, + { + "epoch": 1.2486746987951807, + "grad_norm": 0.4597891569137573, + "learning_rate": 2.9053826530299377e-05, + "loss": 0.0138, + "step": 648 + }, + { + "epoch": 1.2506024096385542, + "grad_norm": 0.43345385789871216, + "learning_rate": 2.901381443029215e-05, + "loss": 0.0353, + "step": 649 + }, + { + "epoch": 1.2525301204819277, + "grad_norm": 0.3706768751144409, + "learning_rate": 2.897375700590141e-05, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 1.2544578313253012, + "grad_norm": 0.30305296182632446, + "learning_rate": 2.8933654458548873e-05, + "loss": 0.0123, + "step": 651 + }, + { + "epoch": 1.2563855421686747, + "grad_norm": 0.2042127549648285, + "learning_rate": 2.8893506989883167e-05, + "loss": 0.0099, + "step": 652 + }, + { + "epoch": 1.2583132530120482, + "grad_norm": 0.20524422824382782, + "learning_rate": 2.8853314801778784e-05, + "loss": 0.0097, + "step": 653 + }, + { + "epoch": 1.2602409638554217, + "grad_norm": 0.2351921945810318, + "learning_rate": 2.8813078096335093e-05, + "loss": 0.0091, + "step": 654 + }, + { + "epoch": 1.2621686746987952, + "grad_norm": 0.34547340869903564, + "learning_rate": 2.87727970758753e-05, + "loss": 0.0088, + "step": 655 + }, + { + "epoch": 1.2640963855421687, + "grad_norm": 0.35163217782974243, + "learning_rate": 2.8732471942945443e-05, + "loss": 0.0145, + "step": 656 + }, + { + "epoch": 1.266024096385542, + "grad_norm": 1.715137243270874, + "learning_rate": 2.8692102900313378e-05, + "loss": 0.0198, + "step": 657 + }, + { + "epoch": 1.2679518072289158, + "grad_norm": 0.2860178053379059, + "learning_rate": 2.8651690150967748e-05, + "loss": 0.0085, + "step": 658 + }, + { + "epoch": 1.269879518072289, + "grad_norm": 0.21175967156887054, + "learning_rate": 2.8611233898116967e-05, + "loss": 0.0071, + "step": 659 + }, + { + "epoch": 1.2718072289156628, + "grad_norm": 0.33726972341537476, + "learning_rate": 2.85707343451882e-05, + "loss": 0.012, + "step": 660 + }, + { + "epoch": 1.273734939759036, + "grad_norm": 0.2138456553220749, + "learning_rate": 2.853019169582635e-05, + "loss": 0.0092, + "step": 661 + }, + { + "epoch": 1.2756626506024096, + "grad_norm": 0.2304934412240982, + "learning_rate": 2.8489606153892997e-05, + "loss": 0.0144, + "step": 662 + }, + { + "epoch": 1.277590361445783, + "grad_norm": 0.2691061794757843, + "learning_rate": 2.8448977923465425e-05, + "loss": 0.0121, + "step": 663 + }, + { + "epoch": 1.2795180722891566, + "grad_norm": 0.35254305601119995, + "learning_rate": 2.840830720883555e-05, + "loss": 0.0125, + "step": 664 + }, + { + "epoch": 1.28144578313253, + "grad_norm": 0.36552608013153076, + "learning_rate": 2.836759421450893e-05, + "loss": 0.021, + "step": 665 + }, + { + "epoch": 1.2833734939759036, + "grad_norm": 0.37177154421806335, + "learning_rate": 2.83268391452037e-05, + "loss": 0.0216, + "step": 666 + }, + { + "epoch": 1.2853012048192771, + "grad_norm": 0.20932547748088837, + "learning_rate": 2.828604220584958e-05, + "loss": 0.0077, + "step": 667 + }, + { + "epoch": 1.2872289156626506, + "grad_norm": 0.5158557295799255, + "learning_rate": 2.824520360158681e-05, + "loss": 0.0394, + "step": 668 + }, + { + "epoch": 1.2891566265060241, + "grad_norm": 0.22623969614505768, + "learning_rate": 2.820432353776515e-05, + "loss": 0.0087, + "step": 669 + }, + { + "epoch": 1.2910843373493976, + "grad_norm": 0.2996046245098114, + "learning_rate": 2.8163402219942822e-05, + "loss": 0.01, + "step": 670 + }, + { + "epoch": 1.2930120481927712, + "grad_norm": 0.24957989156246185, + "learning_rate": 2.8122439853885488e-05, + "loss": 0.0127, + "step": 671 + }, + { + "epoch": 1.2949397590361444, + "grad_norm": 0.2636559307575226, + "learning_rate": 2.8081436645565216e-05, + "loss": 0.0128, + "step": 672 + }, + { + "epoch": 1.2968674698795182, + "grad_norm": 0.3531591296195984, + "learning_rate": 2.804039280115944e-05, + "loss": 0.0199, + "step": 673 + }, + { + "epoch": 1.2987951807228915, + "grad_norm": 0.3682299852371216, + "learning_rate": 2.7999308527049927e-05, + "loss": 0.0088, + "step": 674 + }, + { + "epoch": 1.3007228915662652, + "grad_norm": 0.19555217027664185, + "learning_rate": 2.795818402982174e-05, + "loss": 0.0084, + "step": 675 + }, + { + "epoch": 1.3026506024096385, + "grad_norm": 0.2864912450313568, + "learning_rate": 2.7917019516262186e-05, + "loss": 0.0154, + "step": 676 + }, + { + "epoch": 1.304578313253012, + "grad_norm": 0.2211237996816635, + "learning_rate": 2.78758151933598e-05, + "loss": 0.0078, + "step": 677 + }, + { + "epoch": 1.3065060240963855, + "grad_norm": 0.13646945357322693, + "learning_rate": 2.7834571268303294e-05, + "loss": 0.0058, + "step": 678 + }, + { + "epoch": 1.308433734939759, + "grad_norm": 0.16530285775661469, + "learning_rate": 2.779328794848049e-05, + "loss": 0.007, + "step": 679 + }, + { + "epoch": 1.3103614457831325, + "grad_norm": 0.2145693302154541, + "learning_rate": 2.7751965441477325e-05, + "loss": 0.0203, + "step": 680 + }, + { + "epoch": 1.312289156626506, + "grad_norm": 0.24273739755153656, + "learning_rate": 2.771060395507677e-05, + "loss": 0.0106, + "step": 681 + }, + { + "epoch": 1.3142168674698795, + "grad_norm": 0.20430618524551392, + "learning_rate": 2.7669203697257794e-05, + "loss": 0.0122, + "step": 682 + }, + { + "epoch": 1.316144578313253, + "grad_norm": 0.2502615749835968, + "learning_rate": 2.7627764876194335e-05, + "loss": 0.0101, + "step": 683 + }, + { + "epoch": 1.3180722891566266, + "grad_norm": 0.287239670753479, + "learning_rate": 2.7586287700254214e-05, + "loss": 0.0203, + "step": 684 + }, + { + "epoch": 1.32, + "grad_norm": 0.16239754855632782, + "learning_rate": 2.7544772377998147e-05, + "loss": 0.0084, + "step": 685 + }, + { + "epoch": 1.3219277108433736, + "grad_norm": 0.27174142003059387, + "learning_rate": 2.7503219118178636e-05, + "loss": 0.008, + "step": 686 + }, + { + "epoch": 1.3238554216867469, + "grad_norm": 0.12878240644931793, + "learning_rate": 2.7461628129738954e-05, + "loss": 0.0053, + "step": 687 + }, + { + "epoch": 1.3257831325301206, + "grad_norm": 0.16112515330314636, + "learning_rate": 2.7419999621812086e-05, + "loss": 0.0059, + "step": 688 + }, + { + "epoch": 1.3277108433734939, + "grad_norm": 0.2398834228515625, + "learning_rate": 2.7378333803719672e-05, + "loss": 0.0095, + "step": 689 + }, + { + "epoch": 1.3296385542168676, + "grad_norm": 0.18516193330287933, + "learning_rate": 2.733663088497097e-05, + "loss": 0.0071, + "step": 690 + }, + { + "epoch": 1.331566265060241, + "grad_norm": 0.2974924147129059, + "learning_rate": 2.7294891075261785e-05, + "loss": 0.0227, + "step": 691 + }, + { + "epoch": 1.3334939759036144, + "grad_norm": 0.12931054830551147, + "learning_rate": 2.7253114584473418e-05, + "loss": 0.0039, + "step": 692 + }, + { + "epoch": 1.335421686746988, + "grad_norm": 0.16319474577903748, + "learning_rate": 2.7211301622671623e-05, + "loss": 0.008, + "step": 693 + }, + { + "epoch": 1.3373493975903614, + "grad_norm": 0.27622169256210327, + "learning_rate": 2.7169452400105533e-05, + "loss": 0.0238, + "step": 694 + }, + { + "epoch": 1.339277108433735, + "grad_norm": 0.45309779047966003, + "learning_rate": 2.712756712720663e-05, + "loss": 0.0439, + "step": 695 + }, + { + "epoch": 1.3412048192771084, + "grad_norm": 0.2469855099916458, + "learning_rate": 2.708564601458765e-05, + "loss": 0.0085, + "step": 696 + }, + { + "epoch": 1.343132530120482, + "grad_norm": 0.4245856702327728, + "learning_rate": 2.7043689273041535e-05, + "loss": 0.0097, + "step": 697 + }, + { + "epoch": 1.3450602409638555, + "grad_norm": 0.26796087622642517, + "learning_rate": 2.7001697113540414e-05, + "loss": 0.0119, + "step": 698 + }, + { + "epoch": 1.346987951807229, + "grad_norm": 0.3569283187389374, + "learning_rate": 2.6959669747234482e-05, + "loss": 0.0096, + "step": 699 + }, + { + "epoch": 1.3489156626506025, + "grad_norm": 0.7038524150848389, + "learning_rate": 2.6917607385450973e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 1.350843373493976, + "grad_norm": 0.23568563163280487, + "learning_rate": 2.687551023969308e-05, + "loss": 0.0112, + "step": 701 + }, + { + "epoch": 1.3527710843373493, + "grad_norm": 0.20338499546051025, + "learning_rate": 2.6833378521638935e-05, + "loss": 0.0092, + "step": 702 + }, + { + "epoch": 1.354698795180723, + "grad_norm": 4.22187614440918, + "learning_rate": 2.679121244314046e-05, + "loss": 0.0314, + "step": 703 + }, + { + "epoch": 1.3566265060240963, + "grad_norm": 0.2542206048965454, + "learning_rate": 2.674901221622239e-05, + "loss": 0.0158, + "step": 704 + }, + { + "epoch": 1.3585542168674698, + "grad_norm": 0.49705010652542114, + "learning_rate": 2.670677805308116e-05, + "loss": 0.0162, + "step": 705 + }, + { + "epoch": 1.3604819277108433, + "grad_norm": 0.17502115666866302, + "learning_rate": 2.666451016608383e-05, + "loss": 0.0074, + "step": 706 + }, + { + "epoch": 1.3624096385542168, + "grad_norm": 0.21738742291927338, + "learning_rate": 2.6622208767767075e-05, + "loss": 0.0135, + "step": 707 + }, + { + "epoch": 1.3643373493975903, + "grad_norm": 0.3309847414493561, + "learning_rate": 2.6579874070836032e-05, + "loss": 0.0107, + "step": 708 + }, + { + "epoch": 1.3662650602409638, + "grad_norm": 0.10706827789545059, + "learning_rate": 2.6537506288163303e-05, + "loss": 0.0043, + "step": 709 + }, + { + "epoch": 1.3681927710843373, + "grad_norm": 0.173640176653862, + "learning_rate": 2.6495105632787835e-05, + "loss": 0.0092, + "step": 710 + }, + { + "epoch": 1.3701204819277109, + "grad_norm": 0.2636397182941437, + "learning_rate": 2.6452672317913893e-05, + "loss": 0.0097, + "step": 711 + }, + { + "epoch": 1.3720481927710844, + "grad_norm": 0.28485360741615295, + "learning_rate": 2.6410206556909943e-05, + "loss": 0.0193, + "step": 712 + }, + { + "epoch": 1.3739759036144579, + "grad_norm": 0.23210027813911438, + "learning_rate": 2.636770856330761e-05, + "loss": 0.0229, + "step": 713 + }, + { + "epoch": 1.3759036144578314, + "grad_norm": 0.13388316333293915, + "learning_rate": 2.6325178550800596e-05, + "loss": 0.004, + "step": 714 + }, + { + "epoch": 1.377831325301205, + "grad_norm": 0.5131422877311707, + "learning_rate": 2.6282616733243603e-05, + "loss": 0.0137, + "step": 715 + }, + { + "epoch": 1.3797590361445784, + "grad_norm": 0.3243267834186554, + "learning_rate": 2.6240023324651258e-05, + "loss": 0.0153, + "step": 716 + }, + { + "epoch": 1.3816867469879517, + "grad_norm": 0.1440611034631729, + "learning_rate": 2.619739853919704e-05, + "loss": 0.0031, + "step": 717 + }, + { + "epoch": 1.3836144578313254, + "grad_norm": 0.30346596240997314, + "learning_rate": 2.6154742591212196e-05, + "loss": 0.0109, + "step": 718 + }, + { + "epoch": 1.3855421686746987, + "grad_norm": 0.19109240174293518, + "learning_rate": 2.611205569518468e-05, + "loss": 0.0094, + "step": 719 + }, + { + "epoch": 1.3874698795180722, + "grad_norm": 0.28636518120765686, + "learning_rate": 2.6069338065758056e-05, + "loss": 0.0123, + "step": 720 + }, + { + "epoch": 1.3893975903614457, + "grad_norm": 0.28083911538124084, + "learning_rate": 2.6026589917730416e-05, + "loss": 0.0104, + "step": 721 + }, + { + "epoch": 1.3913253012048192, + "grad_norm": 0.36553966999053955, + "learning_rate": 2.5983811466053327e-05, + "loss": 0.0143, + "step": 722 + }, + { + "epoch": 1.3932530120481927, + "grad_norm": 0.23317205905914307, + "learning_rate": 2.5941002925830708e-05, + "loss": 0.011, + "step": 723 + }, + { + "epoch": 1.3951807228915662, + "grad_norm": 0.3825171887874603, + "learning_rate": 2.589816451231781e-05, + "loss": 0.0098, + "step": 724 + }, + { + "epoch": 1.3971084337349398, + "grad_norm": 0.19916608929634094, + "learning_rate": 2.585529644092006e-05, + "loss": 0.0094, + "step": 725 + }, + { + "epoch": 1.3990361445783133, + "grad_norm": 0.19990523159503937, + "learning_rate": 2.5812398927192027e-05, + "loss": 0.0128, + "step": 726 + }, + { + "epoch": 1.4009638554216868, + "grad_norm": 0.34662899374961853, + "learning_rate": 2.5769472186836347e-05, + "loss": 0.0091, + "step": 727 + }, + { + "epoch": 1.4028915662650603, + "grad_norm": 0.23481112718582153, + "learning_rate": 2.5726516435702583e-05, + "loss": 0.0154, + "step": 728 + }, + { + "epoch": 1.4048192771084338, + "grad_norm": 0.1846667379140854, + "learning_rate": 2.5683531889786194e-05, + "loss": 0.0088, + "step": 729 + }, + { + "epoch": 1.4067469879518073, + "grad_norm": 0.16717663407325745, + "learning_rate": 2.564051876522742e-05, + "loss": 0.0083, + "step": 730 + }, + { + "epoch": 1.4086746987951808, + "grad_norm": 0.4116475284099579, + "learning_rate": 2.5597477278310202e-05, + "loss": 0.0179, + "step": 731 + }, + { + "epoch": 1.410602409638554, + "grad_norm": 0.171807661652565, + "learning_rate": 2.5554407645461115e-05, + "loss": 0.0063, + "step": 732 + }, + { + "epoch": 1.4125301204819278, + "grad_norm": 0.1954439878463745, + "learning_rate": 2.5511310083248243e-05, + "loss": 0.017, + "step": 733 + }, + { + "epoch": 1.4144578313253011, + "grad_norm": 0.37158989906311035, + "learning_rate": 2.5468184808380104e-05, + "loss": 0.0173, + "step": 734 + }, + { + "epoch": 1.4163855421686746, + "grad_norm": 0.2001633644104004, + "learning_rate": 2.542503203770458e-05, + "loss": 0.0165, + "step": 735 + }, + { + "epoch": 1.4183132530120481, + "grad_norm": 0.45673373341560364, + "learning_rate": 2.53818519882078e-05, + "loss": 0.0185, + "step": 736 + }, + { + "epoch": 1.4202409638554216, + "grad_norm": 0.3838701546192169, + "learning_rate": 2.5338644877013067e-05, + "loss": 0.0134, + "step": 737 + }, + { + "epoch": 1.4221686746987952, + "grad_norm": 0.32032477855682373, + "learning_rate": 2.5295410921379745e-05, + "loss": 0.0143, + "step": 738 + }, + { + "epoch": 1.4240963855421687, + "grad_norm": 0.4594039022922516, + "learning_rate": 2.52521503387022e-05, + "loss": 0.0193, + "step": 739 + }, + { + "epoch": 1.4260240963855422, + "grad_norm": 0.3889620900154114, + "learning_rate": 2.5208863346508667e-05, + "loss": 0.0114, + "step": 740 + }, + { + "epoch": 1.4279518072289157, + "grad_norm": 0.33153319358825684, + "learning_rate": 2.5165550162460203e-05, + "loss": 0.0102, + "step": 741 + }, + { + "epoch": 1.4298795180722892, + "grad_norm": 0.7269518375396729, + "learning_rate": 2.5122211004349536e-05, + "loss": 0.0215, + "step": 742 + }, + { + "epoch": 1.4318072289156627, + "grad_norm": 0.31653261184692383, + "learning_rate": 2.5078846090100023e-05, + "loss": 0.0115, + "step": 743 + }, + { + "epoch": 1.4337349397590362, + "grad_norm": 0.20620353519916534, + "learning_rate": 2.5035455637764518e-05, + "loss": 0.0153, + "step": 744 + }, + { + "epoch": 1.4356626506024097, + "grad_norm": 0.17266008257865906, + "learning_rate": 2.4992039865524297e-05, + "loss": 0.0069, + "step": 745 + }, + { + "epoch": 1.4375903614457832, + "grad_norm": 0.24760811030864716, + "learning_rate": 2.494859899168795e-05, + "loss": 0.0108, + "step": 746 + }, + { + "epoch": 1.4395180722891565, + "grad_norm": 0.2584865391254425, + "learning_rate": 2.4905133234690282e-05, + "loss": 0.0095, + "step": 747 + }, + { + "epoch": 1.4414457831325302, + "grad_norm": 0.48847514390945435, + "learning_rate": 2.486164281309122e-05, + "loss": 0.0181, + "step": 748 + }, + { + "epoch": 1.4433734939759035, + "grad_norm": 0.42942047119140625, + "learning_rate": 2.4818127945574717e-05, + "loss": 0.025, + "step": 749 + }, + { + "epoch": 1.445301204819277, + "grad_norm": 0.23713800311088562, + "learning_rate": 2.4774588850947648e-05, + "loss": 0.0085, + "step": 750 + }, + { + "epoch": 1.4472289156626506, + "grad_norm": 0.8797569870948792, + "learning_rate": 2.473102574813871e-05, + "loss": 0.0097, + "step": 751 + }, + { + "epoch": 1.449156626506024, + "grad_norm": 0.2744862735271454, + "learning_rate": 2.4687438856197302e-05, + "loss": 0.0122, + "step": 752 + }, + { + "epoch": 1.4510843373493976, + "grad_norm": 0.12747010588645935, + "learning_rate": 2.4643828394292478e-05, + "loss": 0.0056, + "step": 753 + }, + { + "epoch": 1.453012048192771, + "grad_norm": 0.37376829981803894, + "learning_rate": 2.4600194581711775e-05, + "loss": 0.0052, + "step": 754 + }, + { + "epoch": 1.4549397590361446, + "grad_norm": 0.2536911368370056, + "learning_rate": 2.4556537637860176e-05, + "loss": 0.0113, + "step": 755 + }, + { + "epoch": 1.456867469879518, + "grad_norm": 0.25950780510902405, + "learning_rate": 2.451285778225894e-05, + "loss": 0.0099, + "step": 756 + }, + { + "epoch": 1.4587951807228916, + "grad_norm": 0.19535955786705017, + "learning_rate": 2.4469155234544565e-05, + "loss": 0.0069, + "step": 757 + }, + { + "epoch": 1.4607228915662651, + "grad_norm": 0.22816115617752075, + "learning_rate": 2.442543021446764e-05, + "loss": 0.0088, + "step": 758 + }, + { + "epoch": 1.4626506024096386, + "grad_norm": 0.3363986313343048, + "learning_rate": 2.4381682941891755e-05, + "loss": 0.0182, + "step": 759 + }, + { + "epoch": 1.464578313253012, + "grad_norm": 0.21492891013622284, + "learning_rate": 2.4337913636792382e-05, + "loss": 0.0069, + "step": 760 + }, + { + "epoch": 1.4665060240963856, + "grad_norm": 0.6070862412452698, + "learning_rate": 2.429412251925579e-05, + "loss": 0.0406, + "step": 761 + }, + { + "epoch": 1.468433734939759, + "grad_norm": 2.6469690799713135, + "learning_rate": 2.425030980947793e-05, + "loss": 0.0205, + "step": 762 + }, + { + "epoch": 1.4703614457831327, + "grad_norm": 0.30909740924835205, + "learning_rate": 2.420647572776332e-05, + "loss": 0.0136, + "step": 763 + }, + { + "epoch": 1.472289156626506, + "grad_norm": 0.6639553904533386, + "learning_rate": 2.416262049452395e-05, + "loss": 0.011, + "step": 764 + }, + { + "epoch": 1.4742168674698795, + "grad_norm": 0.2919616997241974, + "learning_rate": 2.4118744330278147e-05, + "loss": 0.0131, + "step": 765 + }, + { + "epoch": 1.476144578313253, + "grad_norm": 0.5232429504394531, + "learning_rate": 2.4074847455649523e-05, + "loss": 0.0138, + "step": 766 + }, + { + "epoch": 1.4780722891566265, + "grad_norm": 5.630630970001221, + "learning_rate": 2.403093009136579e-05, + "loss": 0.0264, + "step": 767 + }, + { + "epoch": 1.48, + "grad_norm": 0.33234721422195435, + "learning_rate": 2.3986992458257707e-05, + "loss": 0.0111, + "step": 768 + }, + { + "epoch": 1.4819277108433735, + "grad_norm": 0.28444772958755493, + "learning_rate": 2.3943034777257945e-05, + "loss": 0.0144, + "step": 769 + }, + { + "epoch": 1.483855421686747, + "grad_norm": 0.16229979693889618, + "learning_rate": 2.38990572694e-05, + "loss": 0.0062, + "step": 770 + }, + { + "epoch": 1.4857831325301205, + "grad_norm": 0.27474716305732727, + "learning_rate": 2.385506015581704e-05, + "loss": 0.0172, + "step": 771 + }, + { + "epoch": 1.487710843373494, + "grad_norm": 0.246526300907135, + "learning_rate": 2.381104365774083e-05, + "loss": 0.012, + "step": 772 + }, + { + "epoch": 1.4896385542168675, + "grad_norm": 0.282047837972641, + "learning_rate": 2.37670079965006e-05, + "loss": 0.0116, + "step": 773 + }, + { + "epoch": 1.491566265060241, + "grad_norm": 0.2878139317035675, + "learning_rate": 2.3722953393521944e-05, + "loss": 0.0147, + "step": 774 + }, + { + "epoch": 1.4934939759036143, + "grad_norm": 0.5586277842521667, + "learning_rate": 2.367888007032571e-05, + "loss": 0.0111, + "step": 775 + }, + { + "epoch": 1.495421686746988, + "grad_norm": 0.562160313129425, + "learning_rate": 2.3634788248526846e-05, + "loss": 0.0061, + "step": 776 + }, + { + "epoch": 1.4973493975903613, + "grad_norm": 0.3452005982398987, + "learning_rate": 2.3590678149833356e-05, + "loss": 0.0205, + "step": 777 + }, + { + "epoch": 1.499277108433735, + "grad_norm": 0.7757686376571655, + "learning_rate": 2.3546549996045114e-05, + "loss": 0.0273, + "step": 778 + }, + { + "epoch": 1.5012048192771084, + "grad_norm": 0.19530551135540009, + "learning_rate": 2.3502404009052812e-05, + "loss": 0.0083, + "step": 779 + }, + { + "epoch": 1.503132530120482, + "grad_norm": 0.2586531639099121, + "learning_rate": 2.3458240410836775e-05, + "loss": 0.0122, + "step": 780 + }, + { + "epoch": 1.5050602409638554, + "grad_norm": 0.30063286423683167, + "learning_rate": 2.3414059423465924e-05, + "loss": 0.0083, + "step": 781 + }, + { + "epoch": 1.5069879518072289, + "grad_norm": 0.18663185834884644, + "learning_rate": 2.3369861269096575e-05, + "loss": 0.0104, + "step": 782 + }, + { + "epoch": 1.5089156626506024, + "grad_norm": 0.4405941069126129, + "learning_rate": 2.3325646169971416e-05, + "loss": 0.0264, + "step": 783 + }, + { + "epoch": 1.510843373493976, + "grad_norm": 0.2947913110256195, + "learning_rate": 2.3281414348418294e-05, + "loss": 0.0107, + "step": 784 + }, + { + "epoch": 1.5127710843373494, + "grad_norm": 0.23813778162002563, + "learning_rate": 2.3237166026849158e-05, + "loss": 0.0084, + "step": 785 + }, + { + "epoch": 1.514698795180723, + "grad_norm": 0.33380329608917236, + "learning_rate": 2.3192901427758932e-05, + "loss": 0.0111, + "step": 786 + }, + { + "epoch": 1.5166265060240964, + "grad_norm": 0.3736988306045532, + "learning_rate": 2.314862077372438e-05, + "loss": 0.0135, + "step": 787 + }, + { + "epoch": 1.5185542168674697, + "grad_norm": 0.3785395920276642, + "learning_rate": 2.3104324287402996e-05, + "loss": 0.0265, + "step": 788 + }, + { + "epoch": 1.5204819277108435, + "grad_norm": 0.3359154462814331, + "learning_rate": 2.3060012191531885e-05, + "loss": 0.0127, + "step": 789 + }, + { + "epoch": 1.5224096385542167, + "grad_norm": 0.720753014087677, + "learning_rate": 2.301568470892664e-05, + "loss": 0.0134, + "step": 790 + }, + { + "epoch": 1.5243373493975905, + "grad_norm": 0.36473193764686584, + "learning_rate": 2.297134206248024e-05, + "loss": 0.0318, + "step": 791 + }, + { + "epoch": 1.5262650602409638, + "grad_norm": 0.29987087845802307, + "learning_rate": 2.2926984475161884e-05, + "loss": 0.008, + "step": 792 + }, + { + "epoch": 1.5281927710843375, + "grad_norm": 0.2883112132549286, + "learning_rate": 2.2882612170015914e-05, + "loss": 0.0125, + "step": 793 + }, + { + "epoch": 1.5301204819277108, + "grad_norm": 0.28983229398727417, + "learning_rate": 2.2838225370160682e-05, + "loss": 0.0155, + "step": 794 + }, + { + "epoch": 1.5320481927710843, + "grad_norm": 0.47236886620521545, + "learning_rate": 2.2793824298787414e-05, + "loss": 0.0132, + "step": 795 + }, + { + "epoch": 1.5339759036144578, + "grad_norm": 0.8328865170478821, + "learning_rate": 2.2749409179159104e-05, + "loss": 0.026, + "step": 796 + }, + { + "epoch": 1.5359036144578313, + "grad_norm": 0.3129172623157501, + "learning_rate": 2.2704980234609396e-05, + "loss": 0.0099, + "step": 797 + }, + { + "epoch": 1.5378313253012048, + "grad_norm": 0.22284500300884247, + "learning_rate": 2.2660537688541416e-05, + "loss": 0.009, + "step": 798 + }, + { + "epoch": 1.5397590361445783, + "grad_norm": 0.3346405625343323, + "learning_rate": 2.2616081764426726e-05, + "loss": 0.0077, + "step": 799 + }, + { + "epoch": 1.5416867469879518, + "grad_norm": 0.2923565208911896, + "learning_rate": 2.2571612685804124e-05, + "loss": 0.0119, + "step": 800 + }, + { + "epoch": 1.5436144578313253, + "grad_norm": 0.1921311914920807, + "learning_rate": 2.252713067627857e-05, + "loss": 0.0083, + "step": 801 + }, + { + "epoch": 1.5455421686746988, + "grad_norm": 0.23221106827259064, + "learning_rate": 2.2482635959520044e-05, + "loss": 0.0049, + "step": 802 + }, + { + "epoch": 1.5474698795180721, + "grad_norm": 0.6340724229812622, + "learning_rate": 2.243812875926241e-05, + "loss": 0.0273, + "step": 803 + }, + { + "epoch": 1.5493975903614459, + "grad_norm": 0.2699439823627472, + "learning_rate": 2.2393609299302314e-05, + "loss": 0.0108, + "step": 804 + }, + { + "epoch": 1.5513253012048192, + "grad_norm": 0.2005189210176468, + "learning_rate": 2.2349077803498052e-05, + "loss": 0.0076, + "step": 805 + }, + { + "epoch": 1.5532530120481929, + "grad_norm": 0.39668548107147217, + "learning_rate": 2.230453449576842e-05, + "loss": 0.0135, + "step": 806 + }, + { + "epoch": 1.5551807228915662, + "grad_norm": 0.2406950294971466, + "learning_rate": 2.2259979600091635e-05, + "loss": 0.0094, + "step": 807 + }, + { + "epoch": 1.55710843373494, + "grad_norm": 0.30363157391548157, + "learning_rate": 2.2215413340504158e-05, + "loss": 0.0178, + "step": 808 + }, + { + "epoch": 1.5590361445783132, + "grad_norm": 0.19508181512355804, + "learning_rate": 2.2170835941099605e-05, + "loss": 0.0069, + "step": 809 + }, + { + "epoch": 1.5609638554216867, + "grad_norm": 0.734106719493866, + "learning_rate": 2.2126247626027615e-05, + "loss": 0.0319, + "step": 810 + }, + { + "epoch": 1.5628915662650602, + "grad_norm": 0.2591583728790283, + "learning_rate": 2.208164861949268e-05, + "loss": 0.0168, + "step": 811 + }, + { + "epoch": 1.5648192771084337, + "grad_norm": 0.2386734038591385, + "learning_rate": 2.20370391457531e-05, + "loss": 0.0041, + "step": 812 + }, + { + "epoch": 1.5667469879518072, + "grad_norm": 0.1675218939781189, + "learning_rate": 2.1992419429119764e-05, + "loss": 0.0078, + "step": 813 + }, + { + "epoch": 1.5686746987951807, + "grad_norm": 0.45591506361961365, + "learning_rate": 2.1947789693955097e-05, + "loss": 0.0166, + "step": 814 + }, + { + "epoch": 1.5706024096385542, + "grad_norm": 0.46940621733665466, + "learning_rate": 2.190315016467188e-05, + "loss": 0.0176, + "step": 815 + }, + { + "epoch": 1.5725301204819278, + "grad_norm": 0.2294205278158188, + "learning_rate": 2.1858501065732146e-05, + "loss": 0.0102, + "step": 816 + }, + { + "epoch": 1.5744578313253013, + "grad_norm": 0.28922322392463684, + "learning_rate": 2.181384262164606e-05, + "loss": 0.0111, + "step": 817 + }, + { + "epoch": 1.5763855421686745, + "grad_norm": 0.19650064408779144, + "learning_rate": 2.1769175056970765e-05, + "loss": 0.0076, + "step": 818 + }, + { + "epoch": 1.5783132530120483, + "grad_norm": 0.19538825750350952, + "learning_rate": 2.172449859630927e-05, + "loss": 0.0118, + "step": 819 + }, + { + "epoch": 1.5802409638554216, + "grad_norm": 0.1900389939546585, + "learning_rate": 2.167981346430931e-05, + "loss": 0.0066, + "step": 820 + }, + { + "epoch": 1.5821686746987953, + "grad_norm": 0.21593710780143738, + "learning_rate": 2.1635119885662235e-05, + "loss": 0.0101, + "step": 821 + }, + { + "epoch": 1.5840963855421686, + "grad_norm": 0.2699289321899414, + "learning_rate": 2.159041808510185e-05, + "loss": 0.0118, + "step": 822 + }, + { + "epoch": 1.5860240963855423, + "grad_norm": 0.31867673993110657, + "learning_rate": 2.1545708287403322e-05, + "loss": 0.0122, + "step": 823 + }, + { + "epoch": 1.5879518072289156, + "grad_norm": 0.2862400412559509, + "learning_rate": 2.1500990717382004e-05, + "loss": 0.0216, + "step": 824 + }, + { + "epoch": 1.589879518072289, + "grad_norm": 0.28482481837272644, + "learning_rate": 2.145626559989237e-05, + "loss": 0.0136, + "step": 825 + }, + { + "epoch": 1.5918072289156626, + "grad_norm": 0.2866958975791931, + "learning_rate": 2.1411533159826803e-05, + "loss": 0.0298, + "step": 826 + }, + { + "epoch": 1.5937349397590361, + "grad_norm": 0.39092838764190674, + "learning_rate": 2.1366793622114533e-05, + "loss": 0.0382, + "step": 827 + }, + { + "epoch": 1.5956626506024096, + "grad_norm": 0.16381537914276123, + "learning_rate": 2.1322047211720468e-05, + "loss": 0.0074, + "step": 828 + }, + { + "epoch": 1.5975903614457831, + "grad_norm": 0.22146940231323242, + "learning_rate": 2.1277294153644083e-05, + "loss": 0.0103, + "step": 829 + }, + { + "epoch": 1.5995180722891567, + "grad_norm": 0.2155209183692932, + "learning_rate": 2.123253467291827e-05, + "loss": 0.0095, + "step": 830 + }, + { + "epoch": 1.6014457831325302, + "grad_norm": 0.41510409116744995, + "learning_rate": 2.118776899460822e-05, + "loss": 0.0457, + "step": 831 + }, + { + "epoch": 1.6033734939759037, + "grad_norm": 0.19718150794506073, + "learning_rate": 2.1142997343810293e-05, + "loss": 0.0192, + "step": 832 + }, + { + "epoch": 1.605301204819277, + "grad_norm": 0.40924403071403503, + "learning_rate": 2.1098219945650865e-05, + "loss": 0.0278, + "step": 833 + }, + { + "epoch": 1.6072289156626507, + "grad_norm": 0.18657824397087097, + "learning_rate": 2.105343702528524e-05, + "loss": 0.0076, + "step": 834 + }, + { + "epoch": 1.609156626506024, + "grad_norm": 0.1727641075849533, + "learning_rate": 2.100864880789645e-05, + "loss": 0.0076, + "step": 835 + }, + { + "epoch": 1.6110843373493977, + "grad_norm": 0.18138745427131653, + "learning_rate": 2.0963855518694203e-05, + "loss": 0.005, + "step": 836 + }, + { + "epoch": 1.613012048192771, + "grad_norm": 0.19173955917358398, + "learning_rate": 2.0919057382913675e-05, + "loss": 0.0084, + "step": 837 + }, + { + "epoch": 1.6149397590361447, + "grad_norm": 0.3812403380870819, + "learning_rate": 2.0874254625814435e-05, + "loss": 0.009, + "step": 838 + }, + { + "epoch": 1.616867469879518, + "grad_norm": 0.2009759545326233, + "learning_rate": 2.0829447472679285e-05, + "loss": 0.0098, + "step": 839 + }, + { + "epoch": 1.6187951807228915, + "grad_norm": 0.48703446984291077, + "learning_rate": 2.0784636148813124e-05, + "loss": 0.0099, + "step": 840 + }, + { + "epoch": 1.620722891566265, + "grad_norm": 0.28995075821876526, + "learning_rate": 2.0739820879541827e-05, + "loss": 0.0075, + "step": 841 + }, + { + "epoch": 1.6226506024096385, + "grad_norm": 0.2130059450864792, + "learning_rate": 2.069500189021111e-05, + "loss": 0.007, + "step": 842 + }, + { + "epoch": 1.624578313253012, + "grad_norm": 0.252524733543396, + "learning_rate": 2.0650179406185397e-05, + "loss": 0.0249, + "step": 843 + }, + { + "epoch": 1.6265060240963856, + "grad_norm": 0.23069098591804504, + "learning_rate": 2.060535365284668e-05, + "loss": 0.0084, + "step": 844 + }, + { + "epoch": 1.628433734939759, + "grad_norm": 0.25051403045654297, + "learning_rate": 2.056052485559338e-05, + "loss": 0.0071, + "step": 845 + }, + { + "epoch": 1.6303614457831326, + "grad_norm": 0.27664798498153687, + "learning_rate": 2.051569323983924e-05, + "loss": 0.0198, + "step": 846 + }, + { + "epoch": 1.632289156626506, + "grad_norm": 0.2954922318458557, + "learning_rate": 2.047085903101218e-05, + "loss": 0.006, + "step": 847 + }, + { + "epoch": 1.6342168674698794, + "grad_norm": 0.28477591276168823, + "learning_rate": 2.0426022454553137e-05, + "loss": 0.0147, + "step": 848 + }, + { + "epoch": 1.636144578313253, + "grad_norm": 0.2785305678844452, + "learning_rate": 2.0381183735914968e-05, + "loss": 0.0117, + "step": 849 + }, + { + "epoch": 1.6380722891566264, + "grad_norm": 0.2500309348106384, + "learning_rate": 2.0336343100561295e-05, + "loss": 0.008, + "step": 850 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.18932047486305237, + "learning_rate": 2.0291500773965392e-05, + "loss": 0.0256, + "step": 851 + }, + { + "epoch": 1.6419277108433734, + "grad_norm": 0.6396257877349854, + "learning_rate": 2.0246656981609013e-05, + "loss": 0.0141, + "step": 852 + }, + { + "epoch": 1.6438554216867471, + "grad_norm": 0.5072891116142273, + "learning_rate": 2.02018119489813e-05, + "loss": 0.008, + "step": 853 + }, + { + "epoch": 1.6457831325301204, + "grad_norm": 0.2920839488506317, + "learning_rate": 2.0156965901577635e-05, + "loss": 0.0085, + "step": 854 + }, + { + "epoch": 1.647710843373494, + "grad_norm": 0.1391262263059616, + "learning_rate": 2.011211906489848e-05, + "loss": 0.0078, + "step": 855 + }, + { + "epoch": 1.6496385542168674, + "grad_norm": 0.29620468616485596, + "learning_rate": 2.00672716644483e-05, + "loss": 0.0109, + "step": 856 + }, + { + "epoch": 1.651566265060241, + "grad_norm": 0.13946573436260223, + "learning_rate": 2.002242392573436e-05, + "loss": 0.0076, + "step": 857 + }, + { + "epoch": 1.6534939759036145, + "grad_norm": 0.9766128659248352, + "learning_rate": 1.997757607426565e-05, + "loss": 0.0309, + "step": 858 + }, + { + "epoch": 1.655421686746988, + "grad_norm": 0.18002203106880188, + "learning_rate": 1.9932728335551702e-05, + "loss": 0.0072, + "step": 859 + }, + { + "epoch": 1.6573493975903615, + "grad_norm": 0.28073111176490784, + "learning_rate": 1.988788093510152e-05, + "loss": 0.0246, + "step": 860 + }, + { + "epoch": 1.659277108433735, + "grad_norm": 0.1919957399368286, + "learning_rate": 1.9843034098422375e-05, + "loss": 0.0087, + "step": 861 + }, + { + "epoch": 1.6612048192771085, + "grad_norm": 0.1825258433818817, + "learning_rate": 1.9798188051018705e-05, + "loss": 0.0092, + "step": 862 + }, + { + "epoch": 1.6631325301204818, + "grad_norm": 0.32412952184677124, + "learning_rate": 1.9753343018390997e-05, + "loss": 0.0118, + "step": 863 + }, + { + "epoch": 1.6650602409638555, + "grad_norm": 0.12828563153743744, + "learning_rate": 1.9708499226034618e-05, + "loss": 0.0056, + "step": 864 + }, + { + "epoch": 1.6669879518072288, + "grad_norm": 0.18647560477256775, + "learning_rate": 1.966365689943871e-05, + "loss": 0.0094, + "step": 865 + }, + { + "epoch": 1.6689156626506025, + "grad_norm": 0.19835828244686127, + "learning_rate": 1.9618816264085042e-05, + "loss": 0.0097, + "step": 866 + }, + { + "epoch": 1.6708433734939758, + "grad_norm": 0.22364282608032227, + "learning_rate": 1.957397754544687e-05, + "loss": 0.0062, + "step": 867 + }, + { + "epoch": 1.6727710843373496, + "grad_norm": 0.29420018196105957, + "learning_rate": 1.952914096898783e-05, + "loss": 0.0182, + "step": 868 + }, + { + "epoch": 1.6746987951807228, + "grad_norm": 0.2149929702281952, + "learning_rate": 1.9484306760160766e-05, + "loss": 0.0125, + "step": 869 + }, + { + "epoch": 1.6766265060240964, + "grad_norm": 0.16844330728054047, + "learning_rate": 1.9439475144406623e-05, + "loss": 0.0074, + "step": 870 + }, + { + "epoch": 1.6785542168674699, + "grad_norm": 0.5010282397270203, + "learning_rate": 1.9394646347153334e-05, + "loss": 0.0213, + "step": 871 + }, + { + "epoch": 1.6804819277108434, + "grad_norm": 0.29847195744514465, + "learning_rate": 1.9349820593814606e-05, + "loss": 0.0173, + "step": 872 + }, + { + "epoch": 1.6824096385542169, + "grad_norm": 0.23835812509059906, + "learning_rate": 1.930499810978889e-05, + "loss": 0.011, + "step": 873 + }, + { + "epoch": 1.6843373493975904, + "grad_norm": 0.3269020617008209, + "learning_rate": 1.9260179120458177e-05, + "loss": 0.0285, + "step": 874 + }, + { + "epoch": 1.686265060240964, + "grad_norm": 0.2142144739627838, + "learning_rate": 1.9215363851186883e-05, + "loss": 0.0146, + "step": 875 + }, + { + "epoch": 1.6881927710843372, + "grad_norm": 0.3098377585411072, + "learning_rate": 1.9170552527320725e-05, + "loss": 0.0104, + "step": 876 + }, + { + "epoch": 1.690120481927711, + "grad_norm": 0.22504115104675293, + "learning_rate": 1.9125745374185568e-05, + "loss": 0.0091, + "step": 877 + }, + { + "epoch": 1.6920481927710842, + "grad_norm": 0.20633333921432495, + "learning_rate": 1.908094261708633e-05, + "loss": 0.0097, + "step": 878 + }, + { + "epoch": 1.693975903614458, + "grad_norm": 1.179566502571106, + "learning_rate": 1.9036144481305807e-05, + "loss": 0.0143, + "step": 879 + }, + { + "epoch": 1.6959036144578312, + "grad_norm": 0.15525613725185394, + "learning_rate": 1.8991351192103554e-05, + "loss": 0.0062, + "step": 880 + }, + { + "epoch": 1.697831325301205, + "grad_norm": 0.15966367721557617, + "learning_rate": 1.8946562974714763e-05, + "loss": 0.0048, + "step": 881 + }, + { + "epoch": 1.6997590361445782, + "grad_norm": 0.18902607262134552, + "learning_rate": 1.890178005434914e-05, + "loss": 0.0124, + "step": 882 + }, + { + "epoch": 1.701686746987952, + "grad_norm": 0.21692413091659546, + "learning_rate": 1.885700265618971e-05, + "loss": 0.0135, + "step": 883 + }, + { + "epoch": 1.7036144578313253, + "grad_norm": 0.38948455452919006, + "learning_rate": 1.8812231005391786e-05, + "loss": 0.0365, + "step": 884 + }, + { + "epoch": 1.7055421686746988, + "grad_norm": 0.2483491599559784, + "learning_rate": 1.8767465327081736e-05, + "loss": 0.0202, + "step": 885 + }, + { + "epoch": 1.7074698795180723, + "grad_norm": 0.15305832028388977, + "learning_rate": 1.872270584635592e-05, + "loss": 0.0035, + "step": 886 + }, + { + "epoch": 1.7093975903614458, + "grad_norm": 0.17794466018676758, + "learning_rate": 1.867795278827954e-05, + "loss": 0.0157, + "step": 887 + }, + { + "epoch": 1.7113253012048193, + "grad_norm": 0.1938813328742981, + "learning_rate": 1.863320637788547e-05, + "loss": 0.0071, + "step": 888 + }, + { + "epoch": 1.7132530120481928, + "grad_norm": 0.27061617374420166, + "learning_rate": 1.8588466840173207e-05, + "loss": 0.0347, + "step": 889 + }, + { + "epoch": 1.7151807228915663, + "grad_norm": 0.1541014313697815, + "learning_rate": 1.8543734400107637e-05, + "loss": 0.006, + "step": 890 + }, + { + "epoch": 1.7171084337349396, + "grad_norm": 0.1436876654624939, + "learning_rate": 1.8499009282617996e-05, + "loss": 0.0059, + "step": 891 + }, + { + "epoch": 1.7190361445783133, + "grad_norm": 1.0573723316192627, + "learning_rate": 1.8454291712596688e-05, + "loss": 0.008, + "step": 892 + }, + { + "epoch": 1.7209638554216866, + "grad_norm": 0.15406259894371033, + "learning_rate": 1.8409581914898157e-05, + "loss": 0.0061, + "step": 893 + }, + { + "epoch": 1.7228915662650603, + "grad_norm": 0.24822913110256195, + "learning_rate": 1.836488011433777e-05, + "loss": 0.0085, + "step": 894 + }, + { + "epoch": 1.7248192771084336, + "grad_norm": 0.21049316227436066, + "learning_rate": 1.83201865356907e-05, + "loss": 0.0075, + "step": 895 + }, + { + "epoch": 1.7267469879518074, + "grad_norm": 0.24159866571426392, + "learning_rate": 1.8275501403690733e-05, + "loss": 0.0156, + "step": 896 + }, + { + "epoch": 1.7286746987951807, + "grad_norm": 0.3191063106060028, + "learning_rate": 1.823082494302924e-05, + "loss": 0.0218, + "step": 897 + }, + { + "epoch": 1.7306024096385542, + "grad_norm": 0.20296362042427063, + "learning_rate": 1.8186157378353945e-05, + "loss": 0.0126, + "step": 898 + }, + { + "epoch": 1.7325301204819277, + "grad_norm": 0.1905524581670761, + "learning_rate": 1.8141498934267858e-05, + "loss": 0.0131, + "step": 899 + }, + { + "epoch": 1.7344578313253012, + "grad_norm": 0.5350520610809326, + "learning_rate": 1.809684983532813e-05, + "loss": 0.0115, + "step": 900 + }, + { + "epoch": 1.7363855421686747, + "grad_norm": 0.17144092917442322, + "learning_rate": 1.8052210306044907e-05, + "loss": 0.0113, + "step": 901 + }, + { + "epoch": 1.7383132530120482, + "grad_norm": 0.11777982115745544, + "learning_rate": 1.8007580570880236e-05, + "loss": 0.0058, + "step": 902 + }, + { + "epoch": 1.7402409638554217, + "grad_norm": 0.2078275978565216, + "learning_rate": 1.7962960854246908e-05, + "loss": 0.0106, + "step": 903 + }, + { + "epoch": 1.7421686746987952, + "grad_norm": 0.2550877630710602, + "learning_rate": 1.791835138050732e-05, + "loss": 0.0076, + "step": 904 + }, + { + "epoch": 1.7440963855421687, + "grad_norm": 0.11553912609815598, + "learning_rate": 1.7873752373972395e-05, + "loss": 0.0038, + "step": 905 + }, + { + "epoch": 1.746024096385542, + "grad_norm": 0.10724586248397827, + "learning_rate": 1.7829164058900398e-05, + "loss": 0.0043, + "step": 906 + }, + { + "epoch": 1.7479518072289157, + "grad_norm": 0.30152231454849243, + "learning_rate": 1.7784586659495845e-05, + "loss": 0.0099, + "step": 907 + }, + { + "epoch": 1.749879518072289, + "grad_norm": 0.18372933566570282, + "learning_rate": 1.7740020399908372e-05, + "loss": 0.0074, + "step": 908 + }, + { + "epoch": 1.7518072289156628, + "grad_norm": 0.35184428095817566, + "learning_rate": 1.7695465504231586e-05, + "loss": 0.0184, + "step": 909 + }, + { + "epoch": 1.753734939759036, + "grad_norm": 0.15083615481853485, + "learning_rate": 1.765092219650196e-05, + "loss": 0.0061, + "step": 910 + }, + { + "epoch": 1.7556626506024098, + "grad_norm": 0.2599961459636688, + "learning_rate": 1.7606390700697693e-05, + "loss": 0.0101, + "step": 911 + }, + { + "epoch": 1.757590361445783, + "grad_norm": 0.10829206556081772, + "learning_rate": 1.7561871240737595e-05, + "loss": 0.0034, + "step": 912 + }, + { + "epoch": 1.7595180722891566, + "grad_norm": 0.38098782300949097, + "learning_rate": 1.7517364040479966e-05, + "loss": 0.0384, + "step": 913 + }, + { + "epoch": 1.76144578313253, + "grad_norm": 0.14975085854530334, + "learning_rate": 1.7472869323721432e-05, + "loss": 0.0055, + "step": 914 + }, + { + "epoch": 1.7633734939759036, + "grad_norm": 0.4151444733142853, + "learning_rate": 1.742838731419588e-05, + "loss": 0.0307, + "step": 915 + }, + { + "epoch": 1.765301204819277, + "grad_norm": 0.22238481044769287, + "learning_rate": 1.738391823557328e-05, + "loss": 0.0059, + "step": 916 + }, + { + "epoch": 1.7672289156626506, + "grad_norm": 0.23386356234550476, + "learning_rate": 1.7339462311458587e-05, + "loss": 0.0113, + "step": 917 + }, + { + "epoch": 1.7691566265060241, + "grad_norm": 0.21911191940307617, + "learning_rate": 1.7295019765390618e-05, + "loss": 0.0071, + "step": 918 + }, + { + "epoch": 1.7710843373493976, + "grad_norm": 0.343159943819046, + "learning_rate": 1.7250590820840903e-05, + "loss": 0.0144, + "step": 919 + }, + { + "epoch": 1.7730120481927711, + "grad_norm": 0.32204556465148926, + "learning_rate": 1.720617570121259e-05, + "loss": 0.0131, + "step": 920 + }, + { + "epoch": 1.7749397590361444, + "grad_norm": 0.4105585515499115, + "learning_rate": 1.7161774629839328e-05, + "loss": 0.0148, + "step": 921 + }, + { + "epoch": 1.7768674698795182, + "grad_norm": 0.16380974650382996, + "learning_rate": 1.7117387829984093e-05, + "loss": 0.0066, + "step": 922 + }, + { + "epoch": 1.7787951807228914, + "grad_norm": 0.22920913994312286, + "learning_rate": 1.707301552483813e-05, + "loss": 0.0105, + "step": 923 + }, + { + "epoch": 1.7807228915662652, + "grad_norm": 0.2075149267911911, + "learning_rate": 1.7028657937519767e-05, + "loss": 0.0104, + "step": 924 + }, + { + "epoch": 1.7826506024096385, + "grad_norm": 0.44439977407455444, + "learning_rate": 1.6984315291073355e-05, + "loss": 0.0134, + "step": 925 + }, + { + "epoch": 1.7845783132530122, + "grad_norm": 0.24068203568458557, + "learning_rate": 1.6939987808468125e-05, + "loss": 0.0078, + "step": 926 + }, + { + "epoch": 1.7865060240963855, + "grad_norm": 0.34044349193573, + "learning_rate": 1.689567571259701e-05, + "loss": 0.0108, + "step": 927 + }, + { + "epoch": 1.788433734939759, + "grad_norm": 0.34082743525505066, + "learning_rate": 1.6851379226275624e-05, + "loss": 0.0266, + "step": 928 + }, + { + "epoch": 1.7903614457831325, + "grad_norm": 0.19490115344524384, + "learning_rate": 1.6807098572241075e-05, + "loss": 0.0109, + "step": 929 + }, + { + "epoch": 1.792289156626506, + "grad_norm": 0.16208237409591675, + "learning_rate": 1.6762833973150846e-05, + "loss": 0.0113, + "step": 930 + }, + { + "epoch": 1.7942168674698795, + "grad_norm": 0.35555699467658997, + "learning_rate": 1.671858565158172e-05, + "loss": 0.0196, + "step": 931 + }, + { + "epoch": 1.796144578313253, + "grad_norm": 0.1600857824087143, + "learning_rate": 1.6674353830028587e-05, + "loss": 0.0089, + "step": 932 + }, + { + "epoch": 1.7980722891566265, + "grad_norm": 0.1699574887752533, + "learning_rate": 1.663013873090342e-05, + "loss": 0.0074, + "step": 933 + }, + { + "epoch": 1.8, + "grad_norm": 0.2472933828830719, + "learning_rate": 1.6585940576534086e-05, + "loss": 0.0063, + "step": 934 + }, + { + "epoch": 1.8019277108433736, + "grad_norm": 0.23491555452346802, + "learning_rate": 1.654175958916323e-05, + "loss": 0.0101, + "step": 935 + }, + { + "epoch": 1.8038554216867468, + "grad_norm": 0.28635191917419434, + "learning_rate": 1.6497595990947195e-05, + "loss": 0.0131, + "step": 936 + }, + { + "epoch": 1.8057831325301206, + "grad_norm": 0.15400712192058563, + "learning_rate": 1.645345000395489e-05, + "loss": 0.0068, + "step": 937 + }, + { + "epoch": 1.8077108433734939, + "grad_norm": 0.18223172426223755, + "learning_rate": 1.6409321850166647e-05, + "loss": 0.0094, + "step": 938 + }, + { + "epoch": 1.8096385542168676, + "grad_norm": 0.2789457142353058, + "learning_rate": 1.636521175147316e-05, + "loss": 0.0202, + "step": 939 + }, + { + "epoch": 1.8115662650602409, + "grad_norm": 0.4267627000808716, + "learning_rate": 1.6321119929674297e-05, + "loss": 0.0176, + "step": 940 + }, + { + "epoch": 1.8134939759036146, + "grad_norm": 0.3021615445613861, + "learning_rate": 1.6277046606478056e-05, + "loss": 0.0085, + "step": 941 + }, + { + "epoch": 1.815421686746988, + "grad_norm": 0.3724934756755829, + "learning_rate": 1.6232992003499405e-05, + "loss": 0.0474, + "step": 942 + }, + { + "epoch": 1.8173493975903614, + "grad_norm": 0.20904326438903809, + "learning_rate": 1.6188956342259177e-05, + "loss": 0.0078, + "step": 943 + }, + { + "epoch": 1.819277108433735, + "grad_norm": 0.31168171763420105, + "learning_rate": 1.614493984418297e-05, + "loss": 0.0174, + "step": 944 + }, + { + "epoch": 1.8212048192771084, + "grad_norm": 0.21273556351661682, + "learning_rate": 1.6100942730600003e-05, + "loss": 0.0054, + "step": 945 + }, + { + "epoch": 1.823132530120482, + "grad_norm": 0.16991695761680603, + "learning_rate": 1.6056965222742055e-05, + "loss": 0.0063, + "step": 946 + }, + { + "epoch": 1.8250602409638554, + "grad_norm": 0.22762684524059296, + "learning_rate": 1.6013007541742303e-05, + "loss": 0.0234, + "step": 947 + }, + { + "epoch": 1.826987951807229, + "grad_norm": 0.20128795504570007, + "learning_rate": 1.596906990863422e-05, + "loss": 0.0095, + "step": 948 + }, + { + "epoch": 1.8289156626506025, + "grad_norm": 0.30772027373313904, + "learning_rate": 1.592515254435048e-05, + "loss": 0.0356, + "step": 949 + }, + { + "epoch": 1.830843373493976, + "grad_norm": 0.12954631447792053, + "learning_rate": 1.5881255669721857e-05, + "loss": 0.008, + "step": 950 + }, + { + "epoch": 1.8327710843373493, + "grad_norm": 0.7787145972251892, + "learning_rate": 1.5837379505476054e-05, + "loss": 0.0108, + "step": 951 + }, + { + "epoch": 1.834698795180723, + "grad_norm": 0.1683879941701889, + "learning_rate": 1.5793524272236683e-05, + "loss": 0.006, + "step": 952 + }, + { + "epoch": 1.8366265060240963, + "grad_norm": 0.16475361585617065, + "learning_rate": 1.5749690190522076e-05, + "loss": 0.0065, + "step": 953 + }, + { + "epoch": 1.83855421686747, + "grad_norm": 0.211905375123024, + "learning_rate": 1.5705877480744214e-05, + "loss": 0.0092, + "step": 954 + }, + { + "epoch": 1.8404819277108433, + "grad_norm": 0.23850117623806, + "learning_rate": 1.5662086363207628e-05, + "loss": 0.012, + "step": 955 + }, + { + "epoch": 1.842409638554217, + "grad_norm": 0.19100065529346466, + "learning_rate": 1.561831705810825e-05, + "loss": 0.0113, + "step": 956 + }, + { + "epoch": 1.8443373493975903, + "grad_norm": 0.3635985255241394, + "learning_rate": 1.557456978553236e-05, + "loss": 0.0168, + "step": 957 + }, + { + "epoch": 1.8462650602409638, + "grad_norm": 0.16449116170406342, + "learning_rate": 1.553084476545544e-05, + "loss": 0.0042, + "step": 958 + }, + { + "epoch": 1.8481927710843373, + "grad_norm": 0.566093385219574, + "learning_rate": 1.5487142217741062e-05, + "loss": 0.0145, + "step": 959 + }, + { + "epoch": 1.8501204819277108, + "grad_norm": 0.15960252285003662, + "learning_rate": 1.5443462362139834e-05, + "loss": 0.0059, + "step": 960 + }, + { + "epoch": 1.8520481927710843, + "grad_norm": 0.40773797035217285, + "learning_rate": 1.539980541828823e-05, + "loss": 0.0257, + "step": 961 + }, + { + "epoch": 1.8539759036144579, + "grad_norm": 0.4802496135234833, + "learning_rate": 1.5356171605707522e-05, + "loss": 0.0111, + "step": 962 + }, + { + "epoch": 1.8559036144578314, + "grad_norm": 0.15745794773101807, + "learning_rate": 1.5312561143802704e-05, + "loss": 0.0049, + "step": 963 + }, + { + "epoch": 1.8578313253012049, + "grad_norm": 0.15139251947402954, + "learning_rate": 1.5268974251861298e-05, + "loss": 0.0077, + "step": 964 + }, + { + "epoch": 1.8597590361445784, + "grad_norm": 0.2188841849565506, + "learning_rate": 1.5225411149052356e-05, + "loss": 0.017, + "step": 965 + }, + { + "epoch": 1.8616867469879517, + "grad_norm": 0.10853131115436554, + "learning_rate": 1.5181872054425287e-05, + "loss": 0.0049, + "step": 966 + }, + { + "epoch": 1.8636144578313254, + "grad_norm": 0.8254880905151367, + "learning_rate": 1.5138357186908785e-05, + "loss": 0.0317, + "step": 967 + }, + { + "epoch": 1.8655421686746987, + "grad_norm": 0.2989620566368103, + "learning_rate": 1.5094866765309728e-05, + "loss": 0.0126, + "step": 968 + }, + { + "epoch": 1.8674698795180724, + "grad_norm": 0.16411150991916656, + "learning_rate": 1.5051401008312054e-05, + "loss": 0.0101, + "step": 969 + }, + { + "epoch": 1.8693975903614457, + "grad_norm": 0.2861763834953308, + "learning_rate": 1.5007960134475706e-05, + "loss": 0.0155, + "step": 970 + }, + { + "epoch": 1.8713253012048194, + "grad_norm": 0.24879588186740875, + "learning_rate": 1.4964544362235487e-05, + "loss": 0.0187, + "step": 971 + }, + { + "epoch": 1.8732530120481927, + "grad_norm": 0.2433672398328781, + "learning_rate": 1.4921153909899983e-05, + "loss": 0.0084, + "step": 972 + }, + { + "epoch": 1.8751807228915662, + "grad_norm": 0.15097154676914215, + "learning_rate": 1.487778899565047e-05, + "loss": 0.007, + "step": 973 + }, + { + "epoch": 1.8771084337349397, + "grad_norm": 0.1629047691822052, + "learning_rate": 1.4834449837539806e-05, + "loss": 0.0058, + "step": 974 + }, + { + "epoch": 1.8790361445783132, + "grad_norm": 0.9937071204185486, + "learning_rate": 1.4791136653491333e-05, + "loss": 0.0323, + "step": 975 + }, + { + "epoch": 1.8809638554216868, + "grad_norm": 0.19555562734603882, + "learning_rate": 1.4747849661297808e-05, + "loss": 0.0126, + "step": 976 + }, + { + "epoch": 1.8828915662650603, + "grad_norm": 0.16147711873054504, + "learning_rate": 1.470458907862026e-05, + "loss": 0.0067, + "step": 977 + }, + { + "epoch": 1.8848192771084338, + "grad_norm": 0.2730027735233307, + "learning_rate": 1.4661355122986945e-05, + "loss": 0.0147, + "step": 978 + }, + { + "epoch": 1.886746987951807, + "grad_norm": 0.13759832084178925, + "learning_rate": 1.4618148011792206e-05, + "loss": 0.0038, + "step": 979 + }, + { + "epoch": 1.8886746987951808, + "grad_norm": 0.33516690135002136, + "learning_rate": 1.4574967962295419e-05, + "loss": 0.0139, + "step": 980 + }, + { + "epoch": 1.890602409638554, + "grad_norm": 0.2345741093158722, + "learning_rate": 1.4531815191619903e-05, + "loss": 0.0094, + "step": 981 + }, + { + "epoch": 1.8925301204819278, + "grad_norm": 0.14681044220924377, + "learning_rate": 1.4488689916751762e-05, + "loss": 0.0065, + "step": 982 + }, + { + "epoch": 1.894457831325301, + "grad_norm": 0.21143914759159088, + "learning_rate": 1.4445592354538885e-05, + "loss": 0.0057, + "step": 983 + }, + { + "epoch": 1.8963855421686748, + "grad_norm": 0.3109160363674164, + "learning_rate": 1.44025227216898e-05, + "loss": 0.0142, + "step": 984 + }, + { + "epoch": 1.8983132530120481, + "grad_norm": 0.24301907420158386, + "learning_rate": 1.435948123477259e-05, + "loss": 0.012, + "step": 985 + }, + { + "epoch": 1.9002409638554218, + "grad_norm": 0.19817675650119781, + "learning_rate": 1.431646811021382e-05, + "loss": 0.0097, + "step": 986 + }, + { + "epoch": 1.9021686746987951, + "grad_norm": 0.13464932143688202, + "learning_rate": 1.4273483564297425e-05, + "loss": 0.0046, + "step": 987 + }, + { + "epoch": 1.9040963855421686, + "grad_norm": 0.1698642522096634, + "learning_rate": 1.4230527813163656e-05, + "loss": 0.0038, + "step": 988 + }, + { + "epoch": 1.9060240963855422, + "grad_norm": 0.19395388662815094, + "learning_rate": 1.4187601072807975e-05, + "loss": 0.0123, + "step": 989 + }, + { + "epoch": 1.9079518072289157, + "grad_norm": 0.2093188613653183, + "learning_rate": 1.4144703559079948e-05, + "loss": 0.0093, + "step": 990 + }, + { + "epoch": 1.9098795180722892, + "grad_norm": 0.1529311090707779, + "learning_rate": 1.4101835487682198e-05, + "loss": 0.0051, + "step": 991 + }, + { + "epoch": 1.9118072289156627, + "grad_norm": 0.18725350499153137, + "learning_rate": 1.4058997074169299e-05, + "loss": 0.0083, + "step": 992 + }, + { + "epoch": 1.9137349397590362, + "grad_norm": 0.15601560473442078, + "learning_rate": 1.401618853394668e-05, + "loss": 0.0086, + "step": 993 + }, + { + "epoch": 1.9156626506024095, + "grad_norm": 0.23890644311904907, + "learning_rate": 1.3973410082269591e-05, + "loss": 0.015, + "step": 994 + }, + { + "epoch": 1.9175903614457832, + "grad_norm": 0.2442619949579239, + "learning_rate": 1.3930661934241947e-05, + "loss": 0.0089, + "step": 995 + }, + { + "epoch": 1.9195180722891565, + "grad_norm": 0.1540212482213974, + "learning_rate": 1.388794430481532e-05, + "loss": 0.0072, + "step": 996 + }, + { + "epoch": 1.9214457831325302, + "grad_norm": 0.1359291970729828, + "learning_rate": 1.3845257408787807e-05, + "loss": 0.0131, + "step": 997 + }, + { + "epoch": 1.9233734939759035, + "grad_norm": 0.25486138463020325, + "learning_rate": 1.3802601460802967e-05, + "loss": 0.0198, + "step": 998 + }, + { + "epoch": 1.9253012048192772, + "grad_norm": 0.28815609216690063, + "learning_rate": 1.3759976675348754e-05, + "loss": 0.014, + "step": 999 + }, + { + "epoch": 1.9272289156626505, + "grad_norm": 0.15648497641086578, + "learning_rate": 1.3717383266756403e-05, + "loss": 0.0065, + "step": 1000 + }, + { + "epoch": 1.929156626506024, + "grad_norm": 0.16912540793418884, + "learning_rate": 1.367482144919941e-05, + "loss": 0.0059, + "step": 1001 + }, + { + "epoch": 1.9310843373493976, + "grad_norm": 0.16896723210811615, + "learning_rate": 1.3632291436692397e-05, + "loss": 0.0054, + "step": 1002 + }, + { + "epoch": 1.933012048192771, + "grad_norm": 0.20287497341632843, + "learning_rate": 1.3589793443090064e-05, + "loss": 0.0097, + "step": 1003 + }, + { + "epoch": 1.9349397590361446, + "grad_norm": 0.14804276823997498, + "learning_rate": 1.3547327682086114e-05, + "loss": 0.0125, + "step": 1004 + }, + { + "epoch": 1.936867469879518, + "grad_norm": 0.23820064961910248, + "learning_rate": 1.3504894367212171e-05, + "loss": 0.0131, + "step": 1005 + }, + { + "epoch": 1.9387951807228916, + "grad_norm": 0.25607362389564514, + "learning_rate": 1.34624937118367e-05, + "loss": 0.0115, + "step": 1006 + }, + { + "epoch": 1.940722891566265, + "grad_norm": 0.37233737111091614, + "learning_rate": 1.3420125929163976e-05, + "loss": 0.0309, + "step": 1007 + }, + { + "epoch": 1.9426506024096386, + "grad_norm": 0.19426730275154114, + "learning_rate": 1.3377791232232929e-05, + "loss": 0.0078, + "step": 1008 + }, + { + "epoch": 1.944578313253012, + "grad_norm": 0.2784160077571869, + "learning_rate": 1.333548983391617e-05, + "loss": 0.0142, + "step": 1009 + }, + { + "epoch": 1.9465060240963856, + "grad_norm": 0.11407195776700974, + "learning_rate": 1.3293221946918853e-05, + "loss": 0.0035, + "step": 1010 + }, + { + "epoch": 1.948433734939759, + "grad_norm": 0.3965436816215515, + "learning_rate": 1.325098778377762e-05, + "loss": 0.0242, + "step": 1011 + }, + { + "epoch": 1.9503614457831326, + "grad_norm": 0.18520519137382507, + "learning_rate": 1.3208787556859543e-05, + "loss": 0.0096, + "step": 1012 + }, + { + "epoch": 1.952289156626506, + "grad_norm": 0.2783315181732178, + "learning_rate": 1.3166621478361075e-05, + "loss": 0.0103, + "step": 1013 + }, + { + "epoch": 1.9542168674698797, + "grad_norm": 0.22714459896087646, + "learning_rate": 1.3124489760306917e-05, + "loss": 0.0078, + "step": 1014 + }, + { + "epoch": 1.956144578313253, + "grad_norm": 0.1257915049791336, + "learning_rate": 1.3082392614549036e-05, + "loss": 0.0077, + "step": 1015 + }, + { + "epoch": 1.9580722891566265, + "grad_norm": 0.15592887997627258, + "learning_rate": 1.3040330252765526e-05, + "loss": 0.0106, + "step": 1016 + }, + { + "epoch": 1.96, + "grad_norm": 0.19295449554920197, + "learning_rate": 1.2998302886459586e-05, + "loss": 0.0082, + "step": 1017 + }, + { + "epoch": 1.9619277108433735, + "grad_norm": 0.15544794499874115, + "learning_rate": 1.2956310726958472e-05, + "loss": 0.0068, + "step": 1018 + }, + { + "epoch": 1.963855421686747, + "grad_norm": 0.25899502635002136, + "learning_rate": 1.291435398541236e-05, + "loss": 0.0086, + "step": 1019 + }, + { + "epoch": 1.9657831325301205, + "grad_norm": 0.34639033675193787, + "learning_rate": 1.2872432872793379e-05, + "loss": 0.0116, + "step": 1020 + }, + { + "epoch": 1.967710843373494, + "grad_norm": 0.1628410518169403, + "learning_rate": 1.283054759989447e-05, + "loss": 0.0055, + "step": 1021 + }, + { + "epoch": 1.9696385542168675, + "grad_norm": 0.9273788928985596, + "learning_rate": 1.2788698377328385e-05, + "loss": 0.0264, + "step": 1022 + }, + { + "epoch": 1.971566265060241, + "grad_norm": 0.163126140832901, + "learning_rate": 1.2746885415526594e-05, + "loss": 0.0046, + "step": 1023 + }, + { + "epoch": 1.9734939759036143, + "grad_norm": 0.1475439816713333, + "learning_rate": 1.2705108924738223e-05, + "loss": 0.0056, + "step": 1024 + }, + { + "epoch": 1.975421686746988, + "grad_norm": 0.1654318869113922, + "learning_rate": 1.2663369115029034e-05, + "loss": 0.0056, + "step": 1025 + }, + { + "epoch": 1.9773493975903613, + "grad_norm": 0.20536045730113983, + "learning_rate": 1.2621666196280333e-05, + "loss": 0.0101, + "step": 1026 + }, + { + "epoch": 1.979277108433735, + "grad_norm": 0.19256474077701569, + "learning_rate": 1.258000037818792e-05, + "loss": 0.0059, + "step": 1027 + }, + { + "epoch": 1.9812048192771083, + "grad_norm": 0.2605120539665222, + "learning_rate": 1.2538371870261053e-05, + "loss": 0.0115, + "step": 1028 + }, + { + "epoch": 1.983132530120482, + "grad_norm": 0.14840295910835266, + "learning_rate": 1.249678088182137e-05, + "loss": 0.0046, + "step": 1029 + }, + { + "epoch": 1.9850602409638554, + "grad_norm": 0.17585207521915436, + "learning_rate": 1.2455227622001851e-05, + "loss": 0.0086, + "step": 1030 + }, + { + "epoch": 1.9869879518072289, + "grad_norm": 0.11044781655073166, + "learning_rate": 1.241371229974579e-05, + "loss": 0.0034, + "step": 1031 + }, + { + "epoch": 1.9889156626506024, + "grad_norm": 0.25584840774536133, + "learning_rate": 1.2372235123805672e-05, + "loss": 0.0245, + "step": 1032 + }, + { + "epoch": 1.9908433734939759, + "grad_norm": 0.25962474942207336, + "learning_rate": 1.2330796302742211e-05, + "loss": 0.0104, + "step": 1033 + }, + { + "epoch": 1.9927710843373494, + "grad_norm": 0.33408522605895996, + "learning_rate": 1.2289396044923238e-05, + "loss": 0.0176, + "step": 1034 + }, + { + "epoch": 1.994698795180723, + "grad_norm": 0.479950487613678, + "learning_rate": 1.2248034558522682e-05, + "loss": 0.0113, + "step": 1035 + }, + { + "epoch": 1.9966265060240964, + "grad_norm": 0.16567294299602509, + "learning_rate": 1.2206712051519518e-05, + "loss": 0.0036, + "step": 1036 + }, + { + "epoch": 1.99855421686747, + "grad_norm": 0.19343771040439606, + "learning_rate": 1.2165428731696713e-05, + "loss": 0.0077, + "step": 1037 + }, + { + "epoch": 2.0, + "grad_norm": 0.22895601391792297, + "learning_rate": 1.2124184806640202e-05, + "loss": 0.0114, + "step": 1038 + }, + { + "epoch": 2.0019277108433733, + "grad_norm": 0.15838384628295898, + "learning_rate": 1.208298048373782e-05, + "loss": 0.0043, + "step": 1039 + }, + { + "epoch": 2.003855421686747, + "grad_norm": 0.681065559387207, + "learning_rate": 1.2041815970178268e-05, + "loss": 0.0214, + "step": 1040 + }, + { + "epoch": 2.0057831325301203, + "grad_norm": 0.3357350528240204, + "learning_rate": 1.2000691472950081e-05, + "loss": 0.0079, + "step": 1041 + }, + { + "epoch": 2.007710843373494, + "grad_norm": 0.15238308906555176, + "learning_rate": 1.1959607198840568e-05, + "loss": 0.0041, + "step": 1042 + }, + { + "epoch": 2.0096385542168673, + "grad_norm": 0.11763229966163635, + "learning_rate": 1.1918563354434784e-05, + "loss": 0.0033, + "step": 1043 + }, + { + "epoch": 2.011566265060241, + "grad_norm": 0.3759301006793976, + "learning_rate": 1.1877560146114515e-05, + "loss": 0.0128, + "step": 1044 + }, + { + "epoch": 2.0134939759036143, + "grad_norm": 0.1143188625574112, + "learning_rate": 1.1836597780057183e-05, + "loss": 0.0078, + "step": 1045 + }, + { + "epoch": 2.015421686746988, + "grad_norm": 0.20059260725975037, + "learning_rate": 1.179567646223485e-05, + "loss": 0.0149, + "step": 1046 + }, + { + "epoch": 2.0173493975903614, + "grad_norm": 0.15569567680358887, + "learning_rate": 1.1754796398413196e-05, + "loss": 0.0038, + "step": 1047 + }, + { + "epoch": 2.019277108433735, + "grad_norm": 0.1153278723359108, + "learning_rate": 1.1713957794150423e-05, + "loss": 0.0041, + "step": 1048 + }, + { + "epoch": 2.0212048192771084, + "grad_norm": 0.1838717758655548, + "learning_rate": 1.1673160854796307e-05, + "loss": 0.0041, + "step": 1049 + }, + { + "epoch": 2.023132530120482, + "grad_norm": 0.12264502793550491, + "learning_rate": 1.1632405785491077e-05, + "loss": 0.0043, + "step": 1050 + }, + { + "epoch": 2.0250602409638554, + "grad_norm": 0.14363229274749756, + "learning_rate": 1.159169279116445e-05, + "loss": 0.0066, + "step": 1051 + }, + { + "epoch": 2.026987951807229, + "grad_norm": 0.1316995471715927, + "learning_rate": 1.1551022076534585e-05, + "loss": 0.0024, + "step": 1052 + }, + { + "epoch": 2.0289156626506024, + "grad_norm": 0.13392619788646698, + "learning_rate": 1.1510393846107001e-05, + "loss": 0.0051, + "step": 1053 + }, + { + "epoch": 2.0308433734939757, + "grad_norm": 3.0086817741394043, + "learning_rate": 1.1469808304173658e-05, + "loss": 0.0334, + "step": 1054 + }, + { + "epoch": 2.0327710843373494, + "grad_norm": 0.17756076157093048, + "learning_rate": 1.1429265654811803e-05, + "loss": 0.0068, + "step": 1055 + }, + { + "epoch": 2.0346987951807227, + "grad_norm": 0.13250532746315002, + "learning_rate": 1.1388766101883038e-05, + "loss": 0.0087, + "step": 1056 + }, + { + "epoch": 2.0366265060240965, + "grad_norm": 0.3534089922904968, + "learning_rate": 1.1348309849032257e-05, + "loss": 0.0076, + "step": 1057 + }, + { + "epoch": 2.0385542168674697, + "grad_norm": 0.11939049512147903, + "learning_rate": 1.1307897099686627e-05, + "loss": 0.0029, + "step": 1058 + }, + { + "epoch": 2.0404819277108435, + "grad_norm": 0.11862517893314362, + "learning_rate": 1.1267528057054562e-05, + "loss": 0.0062, + "step": 1059 + }, + { + "epoch": 2.0424096385542168, + "grad_norm": 0.1539212018251419, + "learning_rate": 1.1227202924124704e-05, + "loss": 0.0067, + "step": 1060 + }, + { + "epoch": 2.0443373493975905, + "grad_norm": 0.17163440585136414, + "learning_rate": 1.118692190366491e-05, + "loss": 0.0055, + "step": 1061 + }, + { + "epoch": 2.0462650602409638, + "grad_norm": 0.12304897606372833, + "learning_rate": 1.1146685198221222e-05, + "loss": 0.0036, + "step": 1062 + }, + { + "epoch": 2.0481927710843375, + "grad_norm": 0.17319051921367645, + "learning_rate": 1.1106493010116842e-05, + "loss": 0.0058, + "step": 1063 + }, + { + "epoch": 2.050120481927711, + "grad_norm": 0.2242443859577179, + "learning_rate": 1.1066345541451127e-05, + "loss": 0.0059, + "step": 1064 + }, + { + "epoch": 2.0520481927710845, + "grad_norm": 0.09533938020467758, + "learning_rate": 1.1026242994098597e-05, + "loss": 0.0033, + "step": 1065 + }, + { + "epoch": 2.053975903614458, + "grad_norm": 0.11697929352521896, + "learning_rate": 1.0986185569707852e-05, + "loss": 0.0038, + "step": 1066 + }, + { + "epoch": 2.0559036144578315, + "grad_norm": 0.2563149333000183, + "learning_rate": 1.0946173469700625e-05, + "loss": 0.0158, + "step": 1067 + }, + { + "epoch": 2.057831325301205, + "grad_norm": 0.21836932003498077, + "learning_rate": 1.0906206895270739e-05, + "loss": 0.0085, + "step": 1068 + }, + { + "epoch": 2.059759036144578, + "grad_norm": 0.1798071414232254, + "learning_rate": 1.0866286047383094e-05, + "loss": 0.0053, + "step": 1069 + }, + { + "epoch": 2.061686746987952, + "grad_norm": 0.08937730640172958, + "learning_rate": 1.0826411126772675e-05, + "loss": 0.0025, + "step": 1070 + }, + { + "epoch": 2.063614457831325, + "grad_norm": 0.0942138060927391, + "learning_rate": 1.0786582333943499e-05, + "loss": 0.0017, + "step": 1071 + }, + { + "epoch": 2.065542168674699, + "grad_norm": 0.13076582551002502, + "learning_rate": 1.0746799869167679e-05, + "loss": 0.0033, + "step": 1072 + }, + { + "epoch": 2.067469879518072, + "grad_norm": 0.0993233174085617, + "learning_rate": 1.0707063932484357e-05, + "loss": 0.0046, + "step": 1073 + }, + { + "epoch": 2.069397590361446, + "grad_norm": 0.3046741485595703, + "learning_rate": 1.0667374723698698e-05, + "loss": 0.009, + "step": 1074 + }, + { + "epoch": 2.071325301204819, + "grad_norm": 0.12197669595479965, + "learning_rate": 1.0627732442380932e-05, + "loss": 0.0034, + "step": 1075 + }, + { + "epoch": 2.073253012048193, + "grad_norm": 0.12721140682697296, + "learning_rate": 1.058813728786531e-05, + "loss": 0.0048, + "step": 1076 + }, + { + "epoch": 2.075180722891566, + "grad_norm": 0.10011966526508331, + "learning_rate": 1.0548589459249112e-05, + "loss": 0.0026, + "step": 1077 + }, + { + "epoch": 2.07710843373494, + "grad_norm": 0.3314201831817627, + "learning_rate": 1.0509089155391661e-05, + "loss": 0.0284, + "step": 1078 + }, + { + "epoch": 2.079036144578313, + "grad_norm": 0.32739701867103577, + "learning_rate": 1.0469636574913288e-05, + "loss": 0.0088, + "step": 1079 + }, + { + "epoch": 2.080963855421687, + "grad_norm": 0.13805675506591797, + "learning_rate": 1.043023191619438e-05, + "loss": 0.0042, + "step": 1080 + }, + { + "epoch": 2.0828915662650602, + "grad_norm": 0.14789745211601257, + "learning_rate": 1.039087537737435e-05, + "loss": 0.0037, + "step": 1081 + }, + { + "epoch": 2.0848192771084335, + "grad_norm": 0.15518991649150848, + "learning_rate": 1.0351567156350617e-05, + "loss": 0.0044, + "step": 1082 + }, + { + "epoch": 2.0867469879518072, + "grad_norm": 0.08380113542079926, + "learning_rate": 1.0312307450777706e-05, + "loss": 0.0019, + "step": 1083 + }, + { + "epoch": 2.0886746987951805, + "grad_norm": 0.17892400920391083, + "learning_rate": 1.027309645806613e-05, + "loss": 0.0065, + "step": 1084 + }, + { + "epoch": 2.0906024096385543, + "grad_norm": 0.5497608780860901, + "learning_rate": 1.0233934375381489e-05, + "loss": 0.0238, + "step": 1085 + }, + { + "epoch": 2.0925301204819275, + "grad_norm": 1.0189186334609985, + "learning_rate": 1.019482139964344e-05, + "loss": 0.0092, + "step": 1086 + }, + { + "epoch": 2.0944578313253013, + "grad_norm": 0.12144117057323456, + "learning_rate": 1.015575772752472e-05, + "loss": 0.0038, + "step": 1087 + }, + { + "epoch": 2.0963855421686746, + "grad_norm": 0.1115315854549408, + "learning_rate": 1.0116743555450148e-05, + "loss": 0.0024, + "step": 1088 + }, + { + "epoch": 2.0983132530120483, + "grad_norm": 0.22671759128570557, + "learning_rate": 1.0077779079595631e-05, + "loss": 0.0136, + "step": 1089 + }, + { + "epoch": 2.1002409638554216, + "grad_norm": 2.0009827613830566, + "learning_rate": 1.003886449588719e-05, + "loss": 0.0493, + "step": 1090 + }, + { + "epoch": 2.1021686746987953, + "grad_norm": 0.11907301843166351, + "learning_rate": 1.0000000000000006e-05, + "loss": 0.0034, + "step": 1091 + }, + { + "epoch": 2.1040963855421686, + "grad_norm": 0.31257638335227966, + "learning_rate": 9.961185787357346e-06, + "loss": 0.0129, + "step": 1092 + }, + { + "epoch": 2.1060240963855423, + "grad_norm": 0.11033743619918823, + "learning_rate": 9.922422053129674e-06, + "loss": 0.0184, + "step": 1093 + }, + { + "epoch": 2.1079518072289156, + "grad_norm": 0.2575698494911194, + "learning_rate": 9.883708992233626e-06, + "loss": 0.0054, + "step": 1094 + }, + { + "epoch": 2.1098795180722894, + "grad_norm": 0.12921132147312164, + "learning_rate": 9.845046799331029e-06, + "loss": 0.0037, + "step": 1095 + }, + { + "epoch": 2.1118072289156626, + "grad_norm": 0.21405921876430511, + "learning_rate": 9.806435668827941e-06, + "loss": 0.006, + "step": 1096 + }, + { + "epoch": 2.113734939759036, + "grad_norm": 0.12929430603981018, + "learning_rate": 9.76787579487363e-06, + "loss": 0.0049, + "step": 1097 + }, + { + "epoch": 2.1156626506024097, + "grad_norm": 0.1793181151151657, + "learning_rate": 9.729367371359681e-06, + "loss": 0.0086, + "step": 1098 + }, + { + "epoch": 2.117590361445783, + "grad_norm": 0.2182074338197708, + "learning_rate": 9.690910591918936e-06, + "loss": 0.0106, + "step": 1099 + }, + { + "epoch": 2.1195180722891567, + "grad_norm": 0.0705680400133133, + "learning_rate": 9.652505649924547e-06, + "loss": 0.0012, + "step": 1100 + }, + { + "epoch": 2.12144578313253, + "grad_norm": 0.10509738326072693, + "learning_rate": 9.614152738489021e-06, + "loss": 0.0048, + "step": 1101 + }, + { + "epoch": 2.1233734939759037, + "grad_norm": 0.13775436580181122, + "learning_rate": 9.575852050463268e-06, + "loss": 0.0089, + "step": 1102 + }, + { + "epoch": 2.125301204819277, + "grad_norm": 0.15230101346969604, + "learning_rate": 9.537603778435545e-06, + "loss": 0.0065, + "step": 1103 + }, + { + "epoch": 2.1272289156626507, + "grad_norm": 0.24702346324920654, + "learning_rate": 9.499408114730583e-06, + "loss": 0.016, + "step": 1104 + } + ], + "logging_steps": 1, + "max_steps": 1557, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 92, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4489752635462124e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1104/training_args.bin b/checkpoint-1104/training_args.bin new file mode 100644 index 0000000..ecc7b6b --- /dev/null +++ b/checkpoint-1104/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342dfb3c86216e436950100c79812c54066d5572c4e9975b0f133c067f061bcf +size 7825 diff --git a/checkpoint-1196/chat_template.jinja b/checkpoint-1196/chat_template.jinja new file mode 100644 index 0000000..1bad6a0 --- /dev/null +++ b/checkpoint-1196/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-1196/config.json b/checkpoint-1196/config.json new file mode 100644 index 0000000..f8bf41e --- /dev/null +++ b/checkpoint-1196/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128009, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.56.2", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-1196/generation_config.json b/checkpoint-1196/generation_config.json new file mode 100644 index 0000000..2152026 --- /dev/null +++ b/checkpoint-1196/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128009, + 128001, + 128008, + 128009 + ], + "pad_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.56.2" +} diff --git a/checkpoint-1196/model.safetensors b/checkpoint-1196/model.safetensors new file mode 100644 index 0000000..b46cf1d --- /dev/null +++ b/checkpoint-1196/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d92a463c2ac1170faedcf4ee928e870f823c56d2ad7946999bca3dafc999a3e7 +size 2996982344 diff --git a/checkpoint-1196/special_tokens_map.json b/checkpoint-1196/special_tokens_map.json new file mode 100644 index 0000000..14daf45 --- /dev/null +++ b/checkpoint-1196/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/checkpoint-1196/tokenizer.json b/checkpoint-1196/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/checkpoint-1196/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-1196/tokenizer_config.json b/checkpoint-1196/tokenizer_config.json new file mode 100644 index 0000000..d1e1ea9 --- /dev/null +++ b/checkpoint-1196/tokenizer_config.json @@ -0,0 +1,2068 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-1196/trainer_state.json b/checkpoint-1196/trainer_state.json new file mode 100644 index 0000000..fea43b4 --- /dev/null +++ b/checkpoint-1196/trainer_state.json @@ -0,0 +1,8406 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.304578313253012, + "eval_steps": 500, + "global_step": 1196, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0019277108433734939, + "grad_norm": 2.8518834114074707, + "learning_rate": 0.0, + "loss": 0.0891, + "step": 1 + }, + { + "epoch": 0.0038554216867469878, + "grad_norm": 1.8441249132156372, + "learning_rate": 2.564102564102564e-07, + "loss": 0.0539, + "step": 2 + }, + { + "epoch": 0.005783132530120482, + "grad_norm": 2.8263237476348877, + "learning_rate": 5.128205128205128e-07, + "loss": 0.099, + "step": 3 + }, + { + "epoch": 0.0077108433734939755, + "grad_norm": 2.5051236152648926, + "learning_rate": 7.692307692307694e-07, + "loss": 0.0789, + "step": 4 + }, + { + "epoch": 0.00963855421686747, + "grad_norm": 2.6903438568115234, + "learning_rate": 1.0256410256410257e-06, + "loss": 0.0881, + "step": 5 + }, + { + "epoch": 0.011566265060240964, + "grad_norm": 2.6205761432647705, + "learning_rate": 1.282051282051282e-06, + "loss": 0.0776, + "step": 6 + }, + { + "epoch": 0.013493975903614458, + "grad_norm": 2.6309337615966797, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.0827, + "step": 7 + }, + { + "epoch": 0.015421686746987951, + "grad_norm": 1.5427855253219604, + "learning_rate": 1.794871794871795e-06, + "loss": 0.0577, + "step": 8 + }, + { + "epoch": 0.017349397590361446, + "grad_norm": 1.0973446369171143, + "learning_rate": 2.0512820512820513e-06, + "loss": 0.04, + "step": 9 + }, + { + "epoch": 0.01927710843373494, + "grad_norm": 1.3253350257873535, + "learning_rate": 2.307692307692308e-06, + "loss": 0.0506, + "step": 10 + }, + { + "epoch": 0.021204819277108433, + "grad_norm": 1.588739037513733, + "learning_rate": 2.564102564102564e-06, + "loss": 0.0874, + "step": 11 + }, + { + "epoch": 0.02313253012048193, + "grad_norm": 1.4987014532089233, + "learning_rate": 2.8205128205128207e-06, + "loss": 0.0597, + "step": 12 + }, + { + "epoch": 0.02506024096385542, + "grad_norm": 1.6571592092514038, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.0559, + "step": 13 + }, + { + "epoch": 0.026987951807228915, + "grad_norm": 1.8860628604888916, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0688, + "step": 14 + }, + { + "epoch": 0.02891566265060241, + "grad_norm": 1.3202295303344727, + "learning_rate": 3.58974358974359e-06, + "loss": 0.0433, + "step": 15 + }, + { + "epoch": 0.030843373493975902, + "grad_norm": 1.5870612859725952, + "learning_rate": 3.846153846153847e-06, + "loss": 0.0695, + "step": 16 + }, + { + "epoch": 0.0327710843373494, + "grad_norm": 0.9192284345626831, + "learning_rate": 4.102564102564103e-06, + "loss": 0.0392, + "step": 17 + }, + { + "epoch": 0.03469879518072289, + "grad_norm": 0.7950155735015869, + "learning_rate": 4.358974358974359e-06, + "loss": 0.0351, + "step": 18 + }, + { + "epoch": 0.03662650602409639, + "grad_norm": 0.8854314684867859, + "learning_rate": 4.615384615384616e-06, + "loss": 0.0356, + "step": 19 + }, + { + "epoch": 0.03855421686746988, + "grad_norm": 0.9546788930892944, + "learning_rate": 4.871794871794872e-06, + "loss": 0.0427, + "step": 20 + }, + { + "epoch": 0.04048192771084337, + "grad_norm": 0.6315903663635254, + "learning_rate": 5.128205128205128e-06, + "loss": 0.0397, + "step": 21 + }, + { + "epoch": 0.042409638554216866, + "grad_norm": 0.9230924844741821, + "learning_rate": 5.384615384615385e-06, + "loss": 0.0481, + "step": 22 + }, + { + "epoch": 0.04433734939759036, + "grad_norm": 0.711546003818512, + "learning_rate": 5.641025641025641e-06, + "loss": 0.0479, + "step": 23 + }, + { + "epoch": 0.04626506024096386, + "grad_norm": 0.5288046598434448, + "learning_rate": 5.897435897435898e-06, + "loss": 0.0182, + "step": 24 + }, + { + "epoch": 0.04819277108433735, + "grad_norm": 0.9420496225357056, + "learning_rate": 6.153846153846155e-06, + "loss": 0.0389, + "step": 25 + }, + { + "epoch": 0.05012048192771084, + "grad_norm": 0.5001983046531677, + "learning_rate": 6.410256410256412e-06, + "loss": 0.0268, + "step": 26 + }, + { + "epoch": 0.052048192771084335, + "grad_norm": 0.8084653615951538, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0367, + "step": 27 + }, + { + "epoch": 0.05397590361445783, + "grad_norm": 0.7195103764533997, + "learning_rate": 6.923076923076923e-06, + "loss": 0.0251, + "step": 28 + }, + { + "epoch": 0.055903614457831326, + "grad_norm": 0.529958963394165, + "learning_rate": 7.17948717948718e-06, + "loss": 0.0289, + "step": 29 + }, + { + "epoch": 0.05783132530120482, + "grad_norm": 0.795376181602478, + "learning_rate": 7.435897435897437e-06, + "loss": 0.043, + "step": 30 + }, + { + "epoch": 0.059759036144578316, + "grad_norm": 0.6366249918937683, + "learning_rate": 7.692307692307694e-06, + "loss": 0.029, + "step": 31 + }, + { + "epoch": 0.061686746987951804, + "grad_norm": 0.5414115190505981, + "learning_rate": 7.948717948717949e-06, + "loss": 0.0365, + "step": 32 + }, + { + "epoch": 0.0636144578313253, + "grad_norm": 0.9350972175598145, + "learning_rate": 8.205128205128205e-06, + "loss": 0.0283, + "step": 33 + }, + { + "epoch": 0.0655421686746988, + "grad_norm": 0.5660741925239563, + "learning_rate": 8.461538461538462e-06, + "loss": 0.0234, + "step": 34 + }, + { + "epoch": 0.06746987951807229, + "grad_norm": 0.5623988509178162, + "learning_rate": 8.717948717948719e-06, + "loss": 0.0307, + "step": 35 + }, + { + "epoch": 0.06939759036144579, + "grad_norm": 0.5260195732116699, + "learning_rate": 8.974358974358976e-06, + "loss": 0.0264, + "step": 36 + }, + { + "epoch": 0.07132530120481928, + "grad_norm": 0.4934785068035126, + "learning_rate": 9.230769230769232e-06, + "loss": 0.0224, + "step": 37 + }, + { + "epoch": 0.07325301204819278, + "grad_norm": 0.4797322154045105, + "learning_rate": 9.487179487179487e-06, + "loss": 0.0163, + "step": 38 + }, + { + "epoch": 0.07518072289156627, + "grad_norm": 0.4739217460155487, + "learning_rate": 9.743589743589744e-06, + "loss": 0.0165, + "step": 39 + }, + { + "epoch": 0.07710843373493977, + "grad_norm": 0.4527677595615387, + "learning_rate": 1e-05, + "loss": 0.0163, + "step": 40 + }, + { + "epoch": 0.07903614457831325, + "grad_norm": 0.6241316795349121, + "learning_rate": 1.0256410256410256e-05, + "loss": 0.0302, + "step": 41 + }, + { + "epoch": 0.08096385542168674, + "grad_norm": 0.639043927192688, + "learning_rate": 1.0512820512820514e-05, + "loss": 0.0312, + "step": 42 + }, + { + "epoch": 0.08289156626506024, + "grad_norm": 0.5121409296989441, + "learning_rate": 1.076923076923077e-05, + "loss": 0.0256, + "step": 43 + }, + { + "epoch": 0.08481927710843373, + "grad_norm": 0.6340477466583252, + "learning_rate": 1.1025641025641028e-05, + "loss": 0.04, + "step": 44 + }, + { + "epoch": 0.08674698795180723, + "grad_norm": 0.5260409712791443, + "learning_rate": 1.1282051282051283e-05, + "loss": 0.0282, + "step": 45 + }, + { + "epoch": 0.08867469879518072, + "grad_norm": 0.6390711069107056, + "learning_rate": 1.1538461538461538e-05, + "loss": 0.0243, + "step": 46 + }, + { + "epoch": 0.09060240963855422, + "grad_norm": 0.46469295024871826, + "learning_rate": 1.1794871794871796e-05, + "loss": 0.0208, + "step": 47 + }, + { + "epoch": 0.09253012048192771, + "grad_norm": 0.8711516857147217, + "learning_rate": 1.2051282051282051e-05, + "loss": 0.0291, + "step": 48 + }, + { + "epoch": 0.09445783132530121, + "grad_norm": 0.9164300560951233, + "learning_rate": 1.230769230769231e-05, + "loss": 0.0342, + "step": 49 + }, + { + "epoch": 0.0963855421686747, + "grad_norm": 0.5401139259338379, + "learning_rate": 1.2564102564102565e-05, + "loss": 0.0185, + "step": 50 + }, + { + "epoch": 0.0983132530120482, + "grad_norm": 0.44393008947372437, + "learning_rate": 1.2820512820512823e-05, + "loss": 0.0228, + "step": 51 + }, + { + "epoch": 0.10024096385542168, + "grad_norm": 0.3855767846107483, + "learning_rate": 1.3076923076923078e-05, + "loss": 0.0176, + "step": 52 + }, + { + "epoch": 0.10216867469879518, + "grad_norm": 0.8561235070228577, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.0433, + "step": 53 + }, + { + "epoch": 0.10409638554216867, + "grad_norm": 0.768002450466156, + "learning_rate": 1.3589743589743592e-05, + "loss": 0.0245, + "step": 54 + }, + { + "epoch": 0.10602409638554217, + "grad_norm": 0.4559759497642517, + "learning_rate": 1.3846153846153847e-05, + "loss": 0.0224, + "step": 55 + }, + { + "epoch": 0.10795180722891566, + "grad_norm": 0.6203847527503967, + "learning_rate": 1.4102564102564105e-05, + "loss": 0.0296, + "step": 56 + }, + { + "epoch": 0.10987951807228916, + "grad_norm": 0.6651368141174316, + "learning_rate": 1.435897435897436e-05, + "loss": 0.0336, + "step": 57 + }, + { + "epoch": 0.11180722891566265, + "grad_norm": 0.377734512090683, + "learning_rate": 1.4615384615384615e-05, + "loss": 0.0196, + "step": 58 + }, + { + "epoch": 0.11373493975903615, + "grad_norm": 0.687568724155426, + "learning_rate": 1.4871794871794874e-05, + "loss": 0.0207, + "step": 59 + }, + { + "epoch": 0.11566265060240964, + "grad_norm": 0.7905604243278503, + "learning_rate": 1.5128205128205129e-05, + "loss": 0.047, + "step": 60 + }, + { + "epoch": 0.11759036144578314, + "grad_norm": 0.7938196063041687, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.0198, + "step": 61 + }, + { + "epoch": 0.11951807228915663, + "grad_norm": 0.41340553760528564, + "learning_rate": 1.5641025641025644e-05, + "loss": 0.0161, + "step": 62 + }, + { + "epoch": 0.12144578313253013, + "grad_norm": 0.5668172240257263, + "learning_rate": 1.5897435897435897e-05, + "loss": 0.0275, + "step": 63 + }, + { + "epoch": 0.12337349397590361, + "grad_norm": 0.48333367705345154, + "learning_rate": 1.6153846153846154e-05, + "loss": 0.0137, + "step": 64 + }, + { + "epoch": 0.12530120481927712, + "grad_norm": 0.6843933463096619, + "learning_rate": 1.641025641025641e-05, + "loss": 0.0294, + "step": 65 + }, + { + "epoch": 0.1272289156626506, + "grad_norm": 0.7789272665977478, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.0401, + "step": 66 + }, + { + "epoch": 0.1291566265060241, + "grad_norm": 0.6203492879867554, + "learning_rate": 1.6923076923076924e-05, + "loss": 0.0292, + "step": 67 + }, + { + "epoch": 0.1310843373493976, + "grad_norm": 0.5940662622451782, + "learning_rate": 1.717948717948718e-05, + "loss": 0.0178, + "step": 68 + }, + { + "epoch": 0.13301204819277107, + "grad_norm": 0.35504868626594543, + "learning_rate": 1.7435897435897438e-05, + "loss": 0.0129, + "step": 69 + }, + { + "epoch": 0.13493975903614458, + "grad_norm": 0.8796699643135071, + "learning_rate": 1.7692307692307694e-05, + "loss": 0.034, + "step": 70 + }, + { + "epoch": 0.13686746987951806, + "grad_norm": 0.967444896697998, + "learning_rate": 1.794871794871795e-05, + "loss": 0.0266, + "step": 71 + }, + { + "epoch": 0.13879518072289157, + "grad_norm": 0.4428526759147644, + "learning_rate": 1.8205128205128208e-05, + "loss": 0.0223, + "step": 72 + }, + { + "epoch": 0.14072289156626505, + "grad_norm": 0.42897751927375793, + "learning_rate": 1.8461538461538465e-05, + "loss": 0.0187, + "step": 73 + }, + { + "epoch": 0.14265060240963856, + "grad_norm": 0.5100914835929871, + "learning_rate": 1.8717948717948718e-05, + "loss": 0.0164, + "step": 74 + }, + { + "epoch": 0.14457831325301204, + "grad_norm": 0.6028861999511719, + "learning_rate": 1.8974358974358975e-05, + "loss": 0.0164, + "step": 75 + }, + { + "epoch": 0.14650602409638555, + "grad_norm": 0.6187024116516113, + "learning_rate": 1.923076923076923e-05, + "loss": 0.0296, + "step": 76 + }, + { + "epoch": 0.14843373493975903, + "grad_norm": 0.4822489619255066, + "learning_rate": 1.9487179487179488e-05, + "loss": 0.0148, + "step": 77 + }, + { + "epoch": 0.15036144578313254, + "grad_norm": 0.7231149673461914, + "learning_rate": 1.9743589743589745e-05, + "loss": 0.0395, + "step": 78 + }, + { + "epoch": 0.15228915662650602, + "grad_norm": 0.8409642577171326, + "learning_rate": 2e-05, + "loss": 0.0446, + "step": 79 + }, + { + "epoch": 0.15421686746987953, + "grad_norm": 0.4883500039577484, + "learning_rate": 2.025641025641026e-05, + "loss": 0.0206, + "step": 80 + }, + { + "epoch": 0.156144578313253, + "grad_norm": 0.6287479400634766, + "learning_rate": 2.0512820512820512e-05, + "loss": 0.0333, + "step": 81 + }, + { + "epoch": 0.1580722891566265, + "grad_norm": 0.5041632652282715, + "learning_rate": 2.0769230769230772e-05, + "loss": 0.0414, + "step": 82 + }, + { + "epoch": 0.16, + "grad_norm": 0.5103405117988586, + "learning_rate": 2.102564102564103e-05, + "loss": 0.045, + "step": 83 + }, + { + "epoch": 0.16192771084337348, + "grad_norm": 0.493161678314209, + "learning_rate": 2.1282051282051285e-05, + "loss": 0.021, + "step": 84 + }, + { + "epoch": 0.163855421686747, + "grad_norm": 0.908843994140625, + "learning_rate": 2.153846153846154e-05, + "loss": 0.0389, + "step": 85 + }, + { + "epoch": 0.16578313253012048, + "grad_norm": 0.5067003965377808, + "learning_rate": 2.1794871794871795e-05, + "loss": 0.0272, + "step": 86 + }, + { + "epoch": 0.16771084337349398, + "grad_norm": 0.5791381597518921, + "learning_rate": 2.2051282051282056e-05, + "loss": 0.0368, + "step": 87 + }, + { + "epoch": 0.16963855421686747, + "grad_norm": 0.7056036591529846, + "learning_rate": 2.230769230769231e-05, + "loss": 0.0284, + "step": 88 + }, + { + "epoch": 0.17156626506024097, + "grad_norm": 0.6563822031021118, + "learning_rate": 2.2564102564102566e-05, + "loss": 0.0646, + "step": 89 + }, + { + "epoch": 0.17349397590361446, + "grad_norm": 0.9483286142349243, + "learning_rate": 2.2820512820512822e-05, + "loss": 0.0439, + "step": 90 + }, + { + "epoch": 0.17542168674698796, + "grad_norm": 0.370664119720459, + "learning_rate": 2.3076923076923076e-05, + "loss": 0.0109, + "step": 91 + }, + { + "epoch": 0.17734939759036145, + "grad_norm": 0.9776477813720703, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.0458, + "step": 92 + }, + { + "epoch": 0.17927710843373493, + "grad_norm": 0.45710092782974243, + "learning_rate": 2.3589743589743593e-05, + "loss": 0.0212, + "step": 93 + }, + { + "epoch": 0.18120481927710844, + "grad_norm": 0.8623896837234497, + "learning_rate": 2.384615384615385e-05, + "loss": 0.0215, + "step": 94 + }, + { + "epoch": 0.18313253012048192, + "grad_norm": 0.55814528465271, + "learning_rate": 2.4102564102564103e-05, + "loss": 0.0218, + "step": 95 + }, + { + "epoch": 0.18506024096385543, + "grad_norm": 0.49882641434669495, + "learning_rate": 2.435897435897436e-05, + "loss": 0.0268, + "step": 96 + }, + { + "epoch": 0.1869879518072289, + "grad_norm": 0.3508654534816742, + "learning_rate": 2.461538461538462e-05, + "loss": 0.0172, + "step": 97 + }, + { + "epoch": 0.18891566265060242, + "grad_norm": 0.601170003414154, + "learning_rate": 2.4871794871794873e-05, + "loss": 0.0208, + "step": 98 + }, + { + "epoch": 0.1908433734939759, + "grad_norm": 1.1748133897781372, + "learning_rate": 2.512820512820513e-05, + "loss": 0.0259, + "step": 99 + }, + { + "epoch": 0.1927710843373494, + "grad_norm": 0.46370384097099304, + "learning_rate": 2.5384615384615386e-05, + "loss": 0.0242, + "step": 100 + }, + { + "epoch": 0.1946987951807229, + "grad_norm": 0.525010883808136, + "learning_rate": 2.5641025641025646e-05, + "loss": 0.0188, + "step": 101 + }, + { + "epoch": 0.1966265060240964, + "grad_norm": 0.766501784324646, + "learning_rate": 2.58974358974359e-05, + "loss": 0.0584, + "step": 102 + }, + { + "epoch": 0.19855421686746988, + "grad_norm": 0.3572964370250702, + "learning_rate": 2.6153846153846157e-05, + "loss": 0.0131, + "step": 103 + }, + { + "epoch": 0.20048192771084336, + "grad_norm": 0.6467130780220032, + "learning_rate": 2.6410256410256413e-05, + "loss": 0.0231, + "step": 104 + }, + { + "epoch": 0.20240963855421687, + "grad_norm": 1.1852102279663086, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.027, + "step": 105 + }, + { + "epoch": 0.20433734939759035, + "grad_norm": 2.3659932613372803, + "learning_rate": 2.6923076923076927e-05, + "loss": 0.0224, + "step": 106 + }, + { + "epoch": 0.20626506024096386, + "grad_norm": 0.5343687534332275, + "learning_rate": 2.7179487179487183e-05, + "loss": 0.0198, + "step": 107 + }, + { + "epoch": 0.20819277108433734, + "grad_norm": 1.852160096168518, + "learning_rate": 2.7435897435897437e-05, + "loss": 0.032, + "step": 108 + }, + { + "epoch": 0.21012048192771085, + "grad_norm": 0.47291702032089233, + "learning_rate": 2.7692307692307694e-05, + "loss": 0.0117, + "step": 109 + }, + { + "epoch": 0.21204819277108433, + "grad_norm": 0.7623187899589539, + "learning_rate": 2.794871794871795e-05, + "loss": 0.0337, + "step": 110 + }, + { + "epoch": 0.21397590361445784, + "grad_norm": 0.5272570848464966, + "learning_rate": 2.820512820512821e-05, + "loss": 0.0131, + "step": 111 + }, + { + "epoch": 0.21590361445783132, + "grad_norm": 0.5568500757217407, + "learning_rate": 2.8461538461538464e-05, + "loss": 0.0233, + "step": 112 + }, + { + "epoch": 0.21783132530120483, + "grad_norm": 0.4008469879627228, + "learning_rate": 2.871794871794872e-05, + "loss": 0.0204, + "step": 113 + }, + { + "epoch": 0.2197590361445783, + "grad_norm": 0.4888612926006317, + "learning_rate": 2.8974358974358977e-05, + "loss": 0.016, + "step": 114 + }, + { + "epoch": 0.2216867469879518, + "grad_norm": 0.44903355836868286, + "learning_rate": 2.923076923076923e-05, + "loss": 0.0135, + "step": 115 + }, + { + "epoch": 0.2236144578313253, + "grad_norm": 0.9266762733459473, + "learning_rate": 2.948717948717949e-05, + "loss": 0.0233, + "step": 116 + }, + { + "epoch": 0.22554216867469878, + "grad_norm": 0.5352638959884644, + "learning_rate": 2.9743589743589747e-05, + "loss": 0.0198, + "step": 117 + }, + { + "epoch": 0.2274698795180723, + "grad_norm": 0.6051343679428101, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0246, + "step": 118 + }, + { + "epoch": 0.22939759036144577, + "grad_norm": 0.9971133470535278, + "learning_rate": 3.0256410256410257e-05, + "loss": 0.025, + "step": 119 + }, + { + "epoch": 0.23132530120481928, + "grad_norm": 0.704236626625061, + "learning_rate": 3.0512820512820514e-05, + "loss": 0.031, + "step": 120 + }, + { + "epoch": 0.23325301204819276, + "grad_norm": 0.6137097477912903, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.0519, + "step": 121 + }, + { + "epoch": 0.23518072289156627, + "grad_norm": 0.7396159768104553, + "learning_rate": 3.102564102564103e-05, + "loss": 0.0325, + "step": 122 + }, + { + "epoch": 0.23710843373493976, + "grad_norm": 1.3282053470611572, + "learning_rate": 3.128205128205129e-05, + "loss": 0.0252, + "step": 123 + }, + { + "epoch": 0.23903614457831326, + "grad_norm": 0.5220731496810913, + "learning_rate": 3.153846153846154e-05, + "loss": 0.0262, + "step": 124 + }, + { + "epoch": 0.24096385542168675, + "grad_norm": 0.5357242822647095, + "learning_rate": 3.1794871794871795e-05, + "loss": 0.0243, + "step": 125 + }, + { + "epoch": 0.24289156626506025, + "grad_norm": 0.48207753896713257, + "learning_rate": 3.205128205128206e-05, + "loss": 0.0178, + "step": 126 + }, + { + "epoch": 0.24481927710843374, + "grad_norm": 0.552988588809967, + "learning_rate": 3.230769230769231e-05, + "loss": 0.023, + "step": 127 + }, + { + "epoch": 0.24674698795180722, + "grad_norm": 1.7962840795516968, + "learning_rate": 3.2564102564102565e-05, + "loss": 0.032, + "step": 128 + }, + { + "epoch": 0.24867469879518073, + "grad_norm": 1.6404600143432617, + "learning_rate": 3.282051282051282e-05, + "loss": 0.0231, + "step": 129 + }, + { + "epoch": 0.25060240963855424, + "grad_norm": 0.39142486453056335, + "learning_rate": 3.307692307692308e-05, + "loss": 0.0147, + "step": 130 + }, + { + "epoch": 0.2525301204819277, + "grad_norm": 1.3272887468338013, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0439, + "step": 131 + }, + { + "epoch": 0.2544578313253012, + "grad_norm": 1.5122811794281006, + "learning_rate": 3.358974358974359e-05, + "loss": 0.0282, + "step": 132 + }, + { + "epoch": 0.2563855421686747, + "grad_norm": 1.8542430400848389, + "learning_rate": 3.384615384615385e-05, + "loss": 0.0515, + "step": 133 + }, + { + "epoch": 0.2583132530120482, + "grad_norm": 4.059277534484863, + "learning_rate": 3.4102564102564105e-05, + "loss": 0.0781, + "step": 134 + }, + { + "epoch": 0.26024096385542167, + "grad_norm": 0.6206214427947998, + "learning_rate": 3.435897435897436e-05, + "loss": 0.0306, + "step": 135 + }, + { + "epoch": 0.2621686746987952, + "grad_norm": 0.4575510323047638, + "learning_rate": 3.461538461538462e-05, + "loss": 0.0154, + "step": 136 + }, + { + "epoch": 0.2640963855421687, + "grad_norm": 1.1556978225708008, + "learning_rate": 3.4871794871794875e-05, + "loss": 0.0235, + "step": 137 + }, + { + "epoch": 0.26602409638554214, + "grad_norm": 0.6975051760673523, + "learning_rate": 3.512820512820513e-05, + "loss": 0.0453, + "step": 138 + }, + { + "epoch": 0.26795180722891565, + "grad_norm": 0.8686623573303223, + "learning_rate": 3.538461538461539e-05, + "loss": 0.0427, + "step": 139 + }, + { + "epoch": 0.26987951807228916, + "grad_norm": 2.0681848526000977, + "learning_rate": 3.5641025641025646e-05, + "loss": 0.04, + "step": 140 + }, + { + "epoch": 0.27180722891566267, + "grad_norm": 0.4397984445095062, + "learning_rate": 3.58974358974359e-05, + "loss": 0.0188, + "step": 141 + }, + { + "epoch": 0.2737349397590361, + "grad_norm": 0.5871334075927734, + "learning_rate": 3.615384615384616e-05, + "loss": 0.0253, + "step": 142 + }, + { + "epoch": 0.27566265060240963, + "grad_norm": 1.1078568696975708, + "learning_rate": 3.6410256410256416e-05, + "loss": 0.0316, + "step": 143 + }, + { + "epoch": 0.27759036144578314, + "grad_norm": 0.5691841840744019, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.0266, + "step": 144 + }, + { + "epoch": 0.27951807228915665, + "grad_norm": 0.7896255254745483, + "learning_rate": 3.692307692307693e-05, + "loss": 0.0281, + "step": 145 + }, + { + "epoch": 0.2814457831325301, + "grad_norm": 0.9988337159156799, + "learning_rate": 3.7179487179487186e-05, + "loss": 0.0295, + "step": 146 + }, + { + "epoch": 0.2833734939759036, + "grad_norm": 0.9811834692955017, + "learning_rate": 3.7435897435897436e-05, + "loss": 0.0322, + "step": 147 + }, + { + "epoch": 0.2853012048192771, + "grad_norm": 0.6503105759620667, + "learning_rate": 3.769230769230769e-05, + "loss": 0.0266, + "step": 148 + }, + { + "epoch": 0.28722891566265063, + "grad_norm": 1.9164355993270874, + "learning_rate": 3.794871794871795e-05, + "loss": 0.0677, + "step": 149 + }, + { + "epoch": 0.2891566265060241, + "grad_norm": 1.1724557876586914, + "learning_rate": 3.820512820512821e-05, + "loss": 0.0324, + "step": 150 + }, + { + "epoch": 0.2910843373493976, + "grad_norm": 0.8482469916343689, + "learning_rate": 3.846153846153846e-05, + "loss": 0.0259, + "step": 151 + }, + { + "epoch": 0.2930120481927711, + "grad_norm": 0.8572830557823181, + "learning_rate": 3.871794871794872e-05, + "loss": 0.0358, + "step": 152 + }, + { + "epoch": 0.29493975903614456, + "grad_norm": 0.6630825400352478, + "learning_rate": 3.8974358974358976e-05, + "loss": 0.0447, + "step": 153 + }, + { + "epoch": 0.29686746987951806, + "grad_norm": 0.9197093844413757, + "learning_rate": 3.923076923076923e-05, + "loss": 0.0409, + "step": 154 + }, + { + "epoch": 0.2987951807228916, + "grad_norm": 0.6976819634437561, + "learning_rate": 3.948717948717949e-05, + "loss": 0.0317, + "step": 155 + }, + { + "epoch": 0.3007228915662651, + "grad_norm": 0.7353514432907104, + "learning_rate": 3.9743589743589747e-05, + "loss": 0.0306, + "step": 156 + }, + { + "epoch": 0.30265060240963854, + "grad_norm": 0.5730232000350952, + "learning_rate": 4e-05, + "loss": 0.0324, + "step": 157 + }, + { + "epoch": 0.30457831325301205, + "grad_norm": 0.7852078676223755, + "learning_rate": 3.999994971675547e-05, + "loss": 0.0354, + "step": 158 + }, + { + "epoch": 0.30650602409638555, + "grad_norm": 0.5924715399742126, + "learning_rate": 3.999979886727471e-05, + "loss": 0.0366, + "step": 159 + }, + { + "epoch": 0.30843373493975906, + "grad_norm": 0.7359845638275146, + "learning_rate": 3.999954745231624e-05, + "loss": 0.0437, + "step": 160 + }, + { + "epoch": 0.3103614457831325, + "grad_norm": 0.7866976857185364, + "learning_rate": 3.999919547314426e-05, + "loss": 0.0363, + "step": 161 + }, + { + "epoch": 0.312289156626506, + "grad_norm": 0.7425745129585266, + "learning_rate": 3.999874293152863e-05, + "loss": 0.0259, + "step": 162 + }, + { + "epoch": 0.31421686746987953, + "grad_norm": 1.8922245502471924, + "learning_rate": 3.9998189829744885e-05, + "loss": 0.0341, + "step": 163 + }, + { + "epoch": 0.316144578313253, + "grad_norm": 0.7908634543418884, + "learning_rate": 3.99975361705742e-05, + "loss": 0.0424, + "step": 164 + }, + { + "epoch": 0.3180722891566265, + "grad_norm": 2.047368049621582, + "learning_rate": 3.999678195730337e-05, + "loss": 0.0535, + "step": 165 + }, + { + "epoch": 0.32, + "grad_norm": 0.5702639222145081, + "learning_rate": 3.999592719372484e-05, + "loss": 0.0284, + "step": 166 + }, + { + "epoch": 0.3219277108433735, + "grad_norm": 0.45015648007392883, + "learning_rate": 3.9994971884136636e-05, + "loss": 0.0313, + "step": 167 + }, + { + "epoch": 0.32385542168674697, + "grad_norm": 4.094679355621338, + "learning_rate": 3.9993916033342355e-05, + "loss": 0.0524, + "step": 168 + }, + { + "epoch": 0.3257831325301205, + "grad_norm": 0.800846517086029, + "learning_rate": 3.999275964665117e-05, + "loss": 0.0282, + "step": 169 + }, + { + "epoch": 0.327710843373494, + "grad_norm": 0.47881078720092773, + "learning_rate": 3.999150272987776e-05, + "loss": 0.0293, + "step": 170 + }, + { + "epoch": 0.3296385542168675, + "grad_norm": 0.5716657638549805, + "learning_rate": 3.999014528934232e-05, + "loss": 0.0221, + "step": 171 + }, + { + "epoch": 0.33156626506024095, + "grad_norm": 0.6333311200141907, + "learning_rate": 3.998868733187048e-05, + "loss": 0.0302, + "step": 172 + }, + { + "epoch": 0.33349397590361446, + "grad_norm": 6.642521858215332, + "learning_rate": 3.998712886479335e-05, + "loss": 0.0364, + "step": 173 + }, + { + "epoch": 0.33542168674698797, + "grad_norm": 0.7515506148338318, + "learning_rate": 3.998546989594739e-05, + "loss": 0.0296, + "step": 174 + }, + { + "epoch": 0.3373493975903614, + "grad_norm": 1.0728015899658203, + "learning_rate": 3.998371043367445e-05, + "loss": 0.0549, + "step": 175 + }, + { + "epoch": 0.33927710843373493, + "grad_norm": 1.3025579452514648, + "learning_rate": 3.998185048682166e-05, + "loss": 0.0577, + "step": 176 + }, + { + "epoch": 0.34120481927710844, + "grad_norm": 1.0962958335876465, + "learning_rate": 3.997989006474144e-05, + "loss": 0.0313, + "step": 177 + }, + { + "epoch": 0.34313253012048195, + "grad_norm": 0.7064313292503357, + "learning_rate": 3.997782917729143e-05, + "loss": 0.0309, + "step": 178 + }, + { + "epoch": 0.3450602409638554, + "grad_norm": 0.43374207615852356, + "learning_rate": 3.997566783483445e-05, + "loss": 0.0166, + "step": 179 + }, + { + "epoch": 0.3469879518072289, + "grad_norm": 0.7236390113830566, + "learning_rate": 3.9973406048238413e-05, + "loss": 0.0254, + "step": 180 + }, + { + "epoch": 0.3489156626506024, + "grad_norm": 0.5041500926017761, + "learning_rate": 3.9971043828876334e-05, + "loss": 0.0239, + "step": 181 + }, + { + "epoch": 0.35084337349397593, + "grad_norm": 1.2744532823562622, + "learning_rate": 3.9968581188626204e-05, + "loss": 0.0404, + "step": 182 + }, + { + "epoch": 0.3527710843373494, + "grad_norm": 0.45845362544059753, + "learning_rate": 3.996601813987098e-05, + "loss": 0.0127, + "step": 183 + }, + { + "epoch": 0.3546987951807229, + "grad_norm": 0.4426881968975067, + "learning_rate": 3.996335469549852e-05, + "loss": 0.0176, + "step": 184 + }, + { + "epoch": 0.3566265060240964, + "grad_norm": 1.0030732154846191, + "learning_rate": 3.9960590868901465e-05, + "loss": 0.0457, + "step": 185 + }, + { + "epoch": 0.35855421686746985, + "grad_norm": 0.6428582668304443, + "learning_rate": 3.995772667397725e-05, + "loss": 0.0271, + "step": 186 + }, + { + "epoch": 0.36048192771084336, + "grad_norm": 0.5335744619369507, + "learning_rate": 3.995476212512795e-05, + "loss": 0.0297, + "step": 187 + }, + { + "epoch": 0.3624096385542169, + "grad_norm": 0.6995761394500732, + "learning_rate": 3.99516972372603e-05, + "loss": 0.0322, + "step": 188 + }, + { + "epoch": 0.3643373493975904, + "grad_norm": 0.765511155128479, + "learning_rate": 3.9948532025785546e-05, + "loss": 0.0253, + "step": 189 + }, + { + "epoch": 0.36626506024096384, + "grad_norm": 0.6165828108787537, + "learning_rate": 3.9945266506619403e-05, + "loss": 0.0355, + "step": 190 + }, + { + "epoch": 0.36819277108433734, + "grad_norm": 0.851970911026001, + "learning_rate": 3.994190069618195e-05, + "loss": 0.056, + "step": 191 + }, + { + "epoch": 0.37012048192771085, + "grad_norm": 0.9850023984909058, + "learning_rate": 3.993843461139757e-05, + "loss": 0.0415, + "step": 192 + }, + { + "epoch": 0.37204819277108436, + "grad_norm": 0.7455295324325562, + "learning_rate": 3.9934868269694886e-05, + "loss": 0.0379, + "step": 193 + }, + { + "epoch": 0.3739759036144578, + "grad_norm": 1.159469723701477, + "learning_rate": 3.9931201689006595e-05, + "loss": 0.0237, + "step": 194 + }, + { + "epoch": 0.3759036144578313, + "grad_norm": 0.5490080118179321, + "learning_rate": 3.992743488776947e-05, + "loss": 0.024, + "step": 195 + }, + { + "epoch": 0.37783132530120483, + "grad_norm": 1.279831886291504, + "learning_rate": 3.992356788492421e-05, + "loss": 0.0273, + "step": 196 + }, + { + "epoch": 0.3797590361445783, + "grad_norm": 0.859104335308075, + "learning_rate": 3.9919600699915355e-05, + "loss": 0.0411, + "step": 197 + }, + { + "epoch": 0.3816867469879518, + "grad_norm": 1.2525300979614258, + "learning_rate": 3.991553335269119e-05, + "loss": 0.0857, + "step": 198 + }, + { + "epoch": 0.3836144578313253, + "grad_norm": 0.4924193024635315, + "learning_rate": 3.991136586370367e-05, + "loss": 0.0294, + "step": 199 + }, + { + "epoch": 0.3855421686746988, + "grad_norm": 1.417190670967102, + "learning_rate": 3.990709825390828e-05, + "loss": 0.0395, + "step": 200 + }, + { + "epoch": 0.38746987951807227, + "grad_norm": 0.6172056198120117, + "learning_rate": 3.9902730544763936e-05, + "loss": 0.0194, + "step": 201 + }, + { + "epoch": 0.3893975903614458, + "grad_norm": 0.7292149662971497, + "learning_rate": 3.989826275823291e-05, + "loss": 0.0381, + "step": 202 + }, + { + "epoch": 0.3913253012048193, + "grad_norm": 0.5949816107749939, + "learning_rate": 3.989369491678067e-05, + "loss": 0.0254, + "step": 203 + }, + { + "epoch": 0.3932530120481928, + "grad_norm": 0.6012582182884216, + "learning_rate": 3.988902704337582e-05, + "loss": 0.048, + "step": 204 + }, + { + "epoch": 0.39518072289156625, + "grad_norm": 0.6273590922355652, + "learning_rate": 3.9884259161489936e-05, + "loss": 0.0268, + "step": 205 + }, + { + "epoch": 0.39710843373493976, + "grad_norm": 0.9615244269371033, + "learning_rate": 3.987939129509746e-05, + "loss": 0.0192, + "step": 206 + }, + { + "epoch": 0.39903614457831327, + "grad_norm": 0.6009241342544556, + "learning_rate": 3.9874423468675624e-05, + "loss": 0.0362, + "step": 207 + }, + { + "epoch": 0.4009638554216867, + "grad_norm": 0.411335289478302, + "learning_rate": 3.9869355707204266e-05, + "loss": 0.017, + "step": 208 + }, + { + "epoch": 0.40289156626506023, + "grad_norm": 0.6151527166366577, + "learning_rate": 3.986418803616573e-05, + "loss": 0.0283, + "step": 209 + }, + { + "epoch": 0.40481927710843374, + "grad_norm": 0.33808204531669617, + "learning_rate": 3.985892048154474e-05, + "loss": 0.0158, + "step": 210 + }, + { + "epoch": 0.40674698795180725, + "grad_norm": 0.5464187860488892, + "learning_rate": 3.9853553069828284e-05, + "loss": 0.0292, + "step": 211 + }, + { + "epoch": 0.4086746987951807, + "grad_norm": 0.6658390760421753, + "learning_rate": 3.984808582800543e-05, + "loss": 0.0281, + "step": 212 + }, + { + "epoch": 0.4106024096385542, + "grad_norm": 0.4253764748573303, + "learning_rate": 3.984251878356726e-05, + "loss": 0.031, + "step": 213 + }, + { + "epoch": 0.4125301204819277, + "grad_norm": 0.32309481501579285, + "learning_rate": 3.983685196450667e-05, + "loss": 0.0166, + "step": 214 + }, + { + "epoch": 0.41445783132530123, + "grad_norm": 0.43756410479545593, + "learning_rate": 3.9831085399318265e-05, + "loss": 0.0326, + "step": 215 + }, + { + "epoch": 0.4163855421686747, + "grad_norm": 0.264046847820282, + "learning_rate": 3.982521911699822e-05, + "loss": 0.0118, + "step": 216 + }, + { + "epoch": 0.4183132530120482, + "grad_norm": 0.8630897402763367, + "learning_rate": 3.9819253147044084e-05, + "loss": 0.0246, + "step": 217 + }, + { + "epoch": 0.4202409638554217, + "grad_norm": 0.6923379898071289, + "learning_rate": 3.98131875194547e-05, + "loss": 0.036, + "step": 218 + }, + { + "epoch": 0.42216867469879515, + "grad_norm": 0.5874778628349304, + "learning_rate": 3.9807022264730024e-05, + "loss": 0.0255, + "step": 219 + }, + { + "epoch": 0.42409638554216866, + "grad_norm": 0.394336074590683, + "learning_rate": 3.980075741387094e-05, + "loss": 0.0187, + "step": 220 + }, + { + "epoch": 0.4260240963855422, + "grad_norm": 0.6300327777862549, + "learning_rate": 3.979439299837915e-05, + "loss": 0.0214, + "step": 221 + }, + { + "epoch": 0.4279518072289157, + "grad_norm": 0.5200467109680176, + "learning_rate": 3.978792905025702e-05, + "loss": 0.0628, + "step": 222 + }, + { + "epoch": 0.42987951807228914, + "grad_norm": 0.5713880062103271, + "learning_rate": 3.978136560200735e-05, + "loss": 0.0302, + "step": 223 + }, + { + "epoch": 0.43180722891566264, + "grad_norm": 0.5345383286476135, + "learning_rate": 3.977470268663331e-05, + "loss": 0.0125, + "step": 224 + }, + { + "epoch": 0.43373493975903615, + "grad_norm": 0.5378350019454956, + "learning_rate": 3.976794033763819e-05, + "loss": 0.0246, + "step": 225 + }, + { + "epoch": 0.43566265060240966, + "grad_norm": 0.5554935336112976, + "learning_rate": 3.9761078589025276e-05, + "loss": 0.0212, + "step": 226 + }, + { + "epoch": 0.4375903614457831, + "grad_norm": 0.2832634747028351, + "learning_rate": 3.9754117475297664e-05, + "loss": 0.0125, + "step": 227 + }, + { + "epoch": 0.4395180722891566, + "grad_norm": 1.2910150289535522, + "learning_rate": 3.97470570314581e-05, + "loss": 0.0364, + "step": 228 + }, + { + "epoch": 0.44144578313253013, + "grad_norm": 0.3731018602848053, + "learning_rate": 3.973989729300878e-05, + "loss": 0.0128, + "step": 229 + }, + { + "epoch": 0.4433734939759036, + "grad_norm": 0.9433871507644653, + "learning_rate": 3.9732638295951195e-05, + "loss": 0.0367, + "step": 230 + }, + { + "epoch": 0.4453012048192771, + "grad_norm": 1.0779197216033936, + "learning_rate": 3.972528007678594e-05, + "loss": 0.0667, + "step": 231 + }, + { + "epoch": 0.4472289156626506, + "grad_norm": 1.7009105682373047, + "learning_rate": 3.9717822672512516e-05, + "loss": 0.0655, + "step": 232 + }, + { + "epoch": 0.4491566265060241, + "grad_norm": 0.5646032094955444, + "learning_rate": 3.971026612062919e-05, + "loss": 0.064, + "step": 233 + }, + { + "epoch": 0.45108433734939757, + "grad_norm": 0.44474121928215027, + "learning_rate": 3.970261045913274e-05, + "loss": 0.0206, + "step": 234 + }, + { + "epoch": 0.4530120481927711, + "grad_norm": 1.3969277143478394, + "learning_rate": 3.969485572651833e-05, + "loss": 0.0486, + "step": 235 + }, + { + "epoch": 0.4549397590361446, + "grad_norm": 0.6401994228363037, + "learning_rate": 3.968700196177925e-05, + "loss": 0.0262, + "step": 236 + }, + { + "epoch": 0.4568674698795181, + "grad_norm": 0.7091913223266602, + "learning_rate": 3.96790492044068e-05, + "loss": 0.014, + "step": 237 + }, + { + "epoch": 0.45879518072289155, + "grad_norm": 0.6561547517776489, + "learning_rate": 3.967099749439002e-05, + "loss": 0.0482, + "step": 238 + }, + { + "epoch": 0.46072289156626506, + "grad_norm": 0.6924155354499817, + "learning_rate": 3.966284687221551e-05, + "loss": 0.0289, + "step": 239 + }, + { + "epoch": 0.46265060240963857, + "grad_norm": 0.5868663787841797, + "learning_rate": 3.9654597378867256e-05, + "loss": 0.0331, + "step": 240 + }, + { + "epoch": 0.464578313253012, + "grad_norm": 0.7930939793586731, + "learning_rate": 3.964624905582637e-05, + "loss": 0.0925, + "step": 241 + }, + { + "epoch": 0.46650602409638553, + "grad_norm": 0.4888836145401001, + "learning_rate": 3.9637801945070944e-05, + "loss": 0.015, + "step": 242 + }, + { + "epoch": 0.46843373493975904, + "grad_norm": 0.7820287346839905, + "learning_rate": 3.962925608907579e-05, + "loss": 0.0382, + "step": 243 + }, + { + "epoch": 0.47036144578313255, + "grad_norm": 0.4914316236972809, + "learning_rate": 3.962061153081224e-05, + "loss": 0.0257, + "step": 244 + }, + { + "epoch": 0.472289156626506, + "grad_norm": 0.5681505799293518, + "learning_rate": 3.961186831374793e-05, + "loss": 0.0551, + "step": 245 + }, + { + "epoch": 0.4742168674698795, + "grad_norm": 0.5049723386764526, + "learning_rate": 3.9603026481846616e-05, + "loss": 0.0186, + "step": 246 + }, + { + "epoch": 0.476144578313253, + "grad_norm": 0.5034119486808777, + "learning_rate": 3.959408607956787e-05, + "loss": 0.024, + "step": 247 + }, + { + "epoch": 0.47807228915662653, + "grad_norm": 0.4543336033821106, + "learning_rate": 3.958504715186695e-05, + "loss": 0.0256, + "step": 248 + }, + { + "epoch": 0.48, + "grad_norm": 0.5595743656158447, + "learning_rate": 3.957590974419452e-05, + "loss": 0.0222, + "step": 249 + }, + { + "epoch": 0.4819277108433735, + "grad_norm": 0.5701581239700317, + "learning_rate": 3.956667390249642e-05, + "loss": 0.0334, + "step": 250 + }, + { + "epoch": 0.483855421686747, + "grad_norm": 0.53755784034729, + "learning_rate": 3.9557339673213474e-05, + "loss": 0.0345, + "step": 251 + }, + { + "epoch": 0.4857831325301205, + "grad_norm": 0.4368877112865448, + "learning_rate": 3.95479071032812e-05, + "loss": 0.0183, + "step": 252 + }, + { + "epoch": 0.48771084337349396, + "grad_norm": 0.7972906827926636, + "learning_rate": 3.953837624012963e-05, + "loss": 0.0337, + "step": 253 + }, + { + "epoch": 0.48963855421686747, + "grad_norm": 0.6148451566696167, + "learning_rate": 3.9528747131683023e-05, + "loss": 0.0524, + "step": 254 + }, + { + "epoch": 0.491566265060241, + "grad_norm": 0.500840961933136, + "learning_rate": 3.9519019826359676e-05, + "loss": 0.0248, + "step": 255 + }, + { + "epoch": 0.49349397590361443, + "grad_norm": 0.5536255240440369, + "learning_rate": 3.9509194373071624e-05, + "loss": 0.0219, + "step": 256 + }, + { + "epoch": 0.49542168674698794, + "grad_norm": 0.6873176097869873, + "learning_rate": 3.9499270821224444e-05, + "loss": 0.0312, + "step": 257 + }, + { + "epoch": 0.49734939759036145, + "grad_norm": 0.37207168340682983, + "learning_rate": 3.9489249220716974e-05, + "loss": 0.0149, + "step": 258 + }, + { + "epoch": 0.49927710843373496, + "grad_norm": 0.4458799660205841, + "learning_rate": 3.947912962194107e-05, + "loss": 0.0214, + "step": 259 + }, + { + "epoch": 0.5012048192771085, + "grad_norm": 0.4272724390029907, + "learning_rate": 3.9468912075781345e-05, + "loss": 0.0263, + "step": 260 + }, + { + "epoch": 0.503132530120482, + "grad_norm": 0.5245792269706726, + "learning_rate": 3.945859663361496e-05, + "loss": 0.0103, + "step": 261 + }, + { + "epoch": 0.5050602409638554, + "grad_norm": 0.8799260854721069, + "learning_rate": 3.9448183347311284e-05, + "loss": 0.0292, + "step": 262 + }, + { + "epoch": 0.5069879518072289, + "grad_norm": 0.5996833443641663, + "learning_rate": 3.943767226923171e-05, + "loss": 0.0306, + "step": 263 + }, + { + "epoch": 0.5089156626506024, + "grad_norm": 0.6044682860374451, + "learning_rate": 3.942706345222935e-05, + "loss": 0.0218, + "step": 264 + }, + { + "epoch": 0.5108433734939759, + "grad_norm": 0.4770200848579407, + "learning_rate": 3.941635694964878e-05, + "loss": 0.0226, + "step": 265 + }, + { + "epoch": 0.5127710843373494, + "grad_norm": 0.5605704188346863, + "learning_rate": 3.940555281532576e-05, + "loss": 0.0354, + "step": 266 + }, + { + "epoch": 0.5146987951807229, + "grad_norm": 0.46532443165779114, + "learning_rate": 3.939465110358699e-05, + "loss": 0.0223, + "step": 267 + }, + { + "epoch": 0.5166265060240964, + "grad_norm": 0.5190595388412476, + "learning_rate": 3.93836518692498e-05, + "loss": 0.0219, + "step": 268 + }, + { + "epoch": 0.5185542168674698, + "grad_norm": 0.5767757892608643, + "learning_rate": 3.937255516762193e-05, + "loss": 0.0294, + "step": 269 + }, + { + "epoch": 0.5204819277108433, + "grad_norm": 0.4543164372444153, + "learning_rate": 3.936136105450119e-05, + "loss": 0.0244, + "step": 270 + }, + { + "epoch": 0.5224096385542168, + "grad_norm": 0.4155154526233673, + "learning_rate": 3.9350069586175195e-05, + "loss": 0.02, + "step": 271 + }, + { + "epoch": 0.5243373493975904, + "grad_norm": 0.5470768213272095, + "learning_rate": 3.933868081942113e-05, + "loss": 0.0187, + "step": 272 + }, + { + "epoch": 0.5262650602409639, + "grad_norm": 0.9491772651672363, + "learning_rate": 3.9327194811505406e-05, + "loss": 0.0337, + "step": 273 + }, + { + "epoch": 0.5281927710843374, + "grad_norm": 0.9313873052597046, + "learning_rate": 3.93156116201834e-05, + "loss": 0.0573, + "step": 274 + }, + { + "epoch": 0.5301204819277109, + "grad_norm": 0.7181005477905273, + "learning_rate": 3.930393130369915e-05, + "loss": 0.0405, + "step": 275 + }, + { + "epoch": 0.5320481927710843, + "grad_norm": 0.34231385588645935, + "learning_rate": 3.9292153920785076e-05, + "loss": 0.0153, + "step": 276 + }, + { + "epoch": 0.5339759036144578, + "grad_norm": 0.6899610161781311, + "learning_rate": 3.928027953066168e-05, + "loss": 0.0338, + "step": 277 + }, + { + "epoch": 0.5359036144578313, + "grad_norm": 0.7509781718254089, + "learning_rate": 3.926830819303726e-05, + "loss": 0.0416, + "step": 278 + }, + { + "epoch": 0.5378313253012048, + "grad_norm": 0.6326774954795837, + "learning_rate": 3.925623996810757e-05, + "loss": 0.0293, + "step": 279 + }, + { + "epoch": 0.5397590361445783, + "grad_norm": 0.5543203353881836, + "learning_rate": 3.924407491655557e-05, + "loss": 0.0263, + "step": 280 + }, + { + "epoch": 0.5416867469879518, + "grad_norm": 0.5367572903633118, + "learning_rate": 3.9231813099551086e-05, + "loss": 0.0276, + "step": 281 + }, + { + "epoch": 0.5436144578313253, + "grad_norm": 0.3143869638442993, + "learning_rate": 3.921945457875051e-05, + "loss": 0.0146, + "step": 282 + }, + { + "epoch": 0.5455421686746988, + "grad_norm": 0.47403043508529663, + "learning_rate": 3.920699941629649e-05, + "loss": 0.0267, + "step": 283 + }, + { + "epoch": 0.5474698795180722, + "grad_norm": 0.5082595348358154, + "learning_rate": 3.919444767481763e-05, + "loss": 0.0183, + "step": 284 + }, + { + "epoch": 0.5493975903614458, + "grad_norm": 0.747949481010437, + "learning_rate": 3.918179941742816e-05, + "loss": 0.0412, + "step": 285 + }, + { + "epoch": 0.5513253012048193, + "grad_norm": 0.6553886532783508, + "learning_rate": 3.916905470772762e-05, + "loss": 0.0505, + "step": 286 + }, + { + "epoch": 0.5532530120481928, + "grad_norm": 0.3838176131248474, + "learning_rate": 3.9156213609800545e-05, + "loss": 0.0156, + "step": 287 + }, + { + "epoch": 0.5551807228915663, + "grad_norm": 0.7427731156349182, + "learning_rate": 3.914327618821614e-05, + "loss": 0.0278, + "step": 288 + }, + { + "epoch": 0.5571084337349398, + "grad_norm": 0.2612821161746979, + "learning_rate": 3.913024250802796e-05, + "loss": 0.0101, + "step": 289 + }, + { + "epoch": 0.5590361445783133, + "grad_norm": 0.3799416124820709, + "learning_rate": 3.911711263477357e-05, + "loss": 0.0168, + "step": 290 + }, + { + "epoch": 0.5609638554216867, + "grad_norm": 0.5053854584693909, + "learning_rate": 3.910388663447425e-05, + "loss": 0.0249, + "step": 291 + }, + { + "epoch": 0.5628915662650602, + "grad_norm": 0.38095012307167053, + "learning_rate": 3.909056457363461e-05, + "loss": 0.0156, + "step": 292 + }, + { + "epoch": 0.5648192771084337, + "grad_norm": 0.4477892220020294, + "learning_rate": 3.907714651924229e-05, + "loss": 0.0309, + "step": 293 + }, + { + "epoch": 0.5667469879518072, + "grad_norm": 0.5875864624977112, + "learning_rate": 3.906363253876763e-05, + "loss": 0.0287, + "step": 294 + }, + { + "epoch": 0.5686746987951807, + "grad_norm": 0.522990882396698, + "learning_rate": 3.90500227001633e-05, + "loss": 0.0318, + "step": 295 + }, + { + "epoch": 0.5706024096385542, + "grad_norm": 0.4153876304626465, + "learning_rate": 3.9036317071863994e-05, + "loss": 0.0192, + "step": 296 + }, + { + "epoch": 0.5725301204819278, + "grad_norm": 0.4675769507884979, + "learning_rate": 3.902251572278605e-05, + "loss": 0.067, + "step": 297 + }, + { + "epoch": 0.5744578313253013, + "grad_norm": 0.35778650641441345, + "learning_rate": 3.900861872232713e-05, + "loss": 0.0197, + "step": 298 + }, + { + "epoch": 0.5763855421686747, + "grad_norm": 0.7382330894470215, + "learning_rate": 3.899462614036587e-05, + "loss": 0.0283, + "step": 299 + }, + { + "epoch": 0.5783132530120482, + "grad_norm": 0.41268599033355713, + "learning_rate": 3.89805380472615e-05, + "loss": 0.0207, + "step": 300 + }, + { + "epoch": 0.5802409638554217, + "grad_norm": 1.2013020515441895, + "learning_rate": 3.8966354513853535e-05, + "loss": 0.0301, + "step": 301 + }, + { + "epoch": 0.5821686746987952, + "grad_norm": 0.424757719039917, + "learning_rate": 3.895207561146137e-05, + "loss": 0.022, + "step": 302 + }, + { + "epoch": 0.5840963855421687, + "grad_norm": 0.4196677505970001, + "learning_rate": 3.893770141188396e-05, + "loss": 0.0424, + "step": 303 + }, + { + "epoch": 0.5860240963855422, + "grad_norm": 0.8644190430641174, + "learning_rate": 3.892323198739946e-05, + "loss": 0.08, + "step": 304 + }, + { + "epoch": 0.5879518072289157, + "grad_norm": 0.5645135045051575, + "learning_rate": 3.890866741076482e-05, + "loss": 0.0152, + "step": 305 + }, + { + "epoch": 0.5898795180722891, + "grad_norm": 0.5218387246131897, + "learning_rate": 3.889400775521545e-05, + "loss": 0.0205, + "step": 306 + }, + { + "epoch": 0.5918072289156626, + "grad_norm": 0.39709413051605225, + "learning_rate": 3.8879253094464865e-05, + "loss": 0.0233, + "step": 307 + }, + { + "epoch": 0.5937349397590361, + "grad_norm": 0.3572910726070404, + "learning_rate": 3.8864403502704285e-05, + "loss": 0.0198, + "step": 308 + }, + { + "epoch": 0.5956626506024096, + "grad_norm": 0.382709264755249, + "learning_rate": 3.8849459054602274e-05, + "loss": 0.0176, + "step": 309 + }, + { + "epoch": 0.5975903614457831, + "grad_norm": 3.4527227878570557, + "learning_rate": 3.883441982530436e-05, + "loss": 0.0239, + "step": 310 + }, + { + "epoch": 0.5995180722891567, + "grad_norm": 0.4467569589614868, + "learning_rate": 3.8819285890432674e-05, + "loss": 0.0284, + "step": 311 + }, + { + "epoch": 0.6014457831325302, + "grad_norm": 0.44513460993766785, + "learning_rate": 3.880405732608555e-05, + "loss": 0.0233, + "step": 312 + }, + { + "epoch": 0.6033734939759036, + "grad_norm": 0.8029689192771912, + "learning_rate": 3.8788734208837155e-05, + "loss": 0.0433, + "step": 313 + }, + { + "epoch": 0.6053012048192771, + "grad_norm": 0.7291454076766968, + "learning_rate": 3.877331661573709e-05, + "loss": 0.043, + "step": 314 + }, + { + "epoch": 0.6072289156626506, + "grad_norm": 0.6050467491149902, + "learning_rate": 3.8757804624310006e-05, + "loss": 0.0377, + "step": 315 + }, + { + "epoch": 0.6091566265060241, + "grad_norm": 0.6714366674423218, + "learning_rate": 3.874219831255524e-05, + "loss": 0.046, + "step": 316 + }, + { + "epoch": 0.6110843373493976, + "grad_norm": 0.336037278175354, + "learning_rate": 3.8726497758946394e-05, + "loss": 0.0149, + "step": 317 + }, + { + "epoch": 0.6130120481927711, + "grad_norm": 0.3057402968406677, + "learning_rate": 3.871070304243094e-05, + "loss": 0.014, + "step": 318 + }, + { + "epoch": 0.6149397590361446, + "grad_norm": 0.4537644684314728, + "learning_rate": 3.8694814242429834e-05, + "loss": 0.0503, + "step": 319 + }, + { + "epoch": 0.6168674698795181, + "grad_norm": 0.45573824644088745, + "learning_rate": 3.8678831438837116e-05, + "loss": 0.021, + "step": 320 + }, + { + "epoch": 0.6187951807228915, + "grad_norm": 0.30729591846466064, + "learning_rate": 3.866275471201952e-05, + "loss": 0.0163, + "step": 321 + }, + { + "epoch": 0.620722891566265, + "grad_norm": 0.7614850401878357, + "learning_rate": 3.8646584142816036e-05, + "loss": 0.0347, + "step": 322 + }, + { + "epoch": 0.6226506024096385, + "grad_norm": 0.5323611497879028, + "learning_rate": 3.863031981253754e-05, + "loss": 0.0201, + "step": 323 + }, + { + "epoch": 0.624578313253012, + "grad_norm": 0.34426453709602356, + "learning_rate": 3.861396180296635e-05, + "loss": 0.0243, + "step": 324 + }, + { + "epoch": 0.6265060240963856, + "grad_norm": 0.621636152267456, + "learning_rate": 3.859751019635585e-05, + "loss": 0.0166, + "step": 325 + }, + { + "epoch": 0.6284337349397591, + "grad_norm": 0.549324095249176, + "learning_rate": 3.858096507543006e-05, + "loss": 0.0274, + "step": 326 + }, + { + "epoch": 0.6303614457831326, + "grad_norm": 0.358426570892334, + "learning_rate": 3.8564326523383214e-05, + "loss": 0.0207, + "step": 327 + }, + { + "epoch": 0.632289156626506, + "grad_norm": 0.3639723062515259, + "learning_rate": 3.8547594623879346e-05, + "loss": 0.0297, + "step": 328 + }, + { + "epoch": 0.6342168674698795, + "grad_norm": 0.3402212858200073, + "learning_rate": 3.853076946105188e-05, + "loss": 0.0258, + "step": 329 + }, + { + "epoch": 0.636144578313253, + "grad_norm": 0.4083027243614197, + "learning_rate": 3.85138511195032e-05, + "loss": 0.0351, + "step": 330 + }, + { + "epoch": 0.6380722891566265, + "grad_norm": 0.43532121181488037, + "learning_rate": 3.84968396843042e-05, + "loss": 0.0388, + "step": 331 + }, + { + "epoch": 0.64, + "grad_norm": 0.35353463888168335, + "learning_rate": 3.8479735240993904e-05, + "loss": 0.0203, + "step": 332 + }, + { + "epoch": 0.6419277108433735, + "grad_norm": 0.350149929523468, + "learning_rate": 3.846253787557901e-05, + "loss": 0.0261, + "step": 333 + }, + { + "epoch": 0.643855421686747, + "grad_norm": 0.7665389180183411, + "learning_rate": 3.844524767453344e-05, + "loss": 0.0108, + "step": 334 + }, + { + "epoch": 0.6457831325301204, + "grad_norm": 0.44621360301971436, + "learning_rate": 3.842786472479795e-05, + "loss": 0.0282, + "step": 335 + }, + { + "epoch": 0.6477108433734939, + "grad_norm": 0.7787201404571533, + "learning_rate": 3.841038911377962e-05, + "loss": 0.0216, + "step": 336 + }, + { + "epoch": 0.6496385542168674, + "grad_norm": 0.48260653018951416, + "learning_rate": 3.839282092935153e-05, + "loss": 0.0234, + "step": 337 + }, + { + "epoch": 0.651566265060241, + "grad_norm": 0.4987852871417999, + "learning_rate": 3.837516025985219e-05, + "loss": 0.0515, + "step": 338 + }, + { + "epoch": 0.6534939759036145, + "grad_norm": 0.9030266404151917, + "learning_rate": 3.835740719408517e-05, + "loss": 0.0508, + "step": 339 + }, + { + "epoch": 0.655421686746988, + "grad_norm": 0.6381701231002808, + "learning_rate": 3.833956182131867e-05, + "loss": 0.0405, + "step": 340 + }, + { + "epoch": 0.6573493975903615, + "grad_norm": 0.42828986048698425, + "learning_rate": 3.832162423128499e-05, + "loss": 0.024, + "step": 341 + }, + { + "epoch": 0.659277108433735, + "grad_norm": 0.38725873827934265, + "learning_rate": 3.8303594514180164e-05, + "loss": 0.0199, + "step": 342 + }, + { + "epoch": 0.6612048192771084, + "grad_norm": 0.23280498385429382, + "learning_rate": 3.828547276066346e-05, + "loss": 0.0101, + "step": 343 + }, + { + "epoch": 0.6631325301204819, + "grad_norm": 0.7298216819763184, + "learning_rate": 3.8267259061856925e-05, + "loss": 0.0455, + "step": 344 + }, + { + "epoch": 0.6650602409638554, + "grad_norm": 0.5975687503814697, + "learning_rate": 3.824895350934496e-05, + "loss": 0.0372, + "step": 345 + }, + { + "epoch": 0.6669879518072289, + "grad_norm": 0.6295403242111206, + "learning_rate": 3.823055619517381e-05, + "loss": 0.0362, + "step": 346 + }, + { + "epoch": 0.6689156626506024, + "grad_norm": 0.5086020827293396, + "learning_rate": 3.821206721185115e-05, + "loss": 0.0368, + "step": 347 + }, + { + "epoch": 0.6708433734939759, + "grad_norm": 0.34506168961524963, + "learning_rate": 3.819348665234557e-05, + "loss": 0.0178, + "step": 348 + }, + { + "epoch": 0.6727710843373494, + "grad_norm": 1.309940218925476, + "learning_rate": 3.817481461008617e-05, + "loss": 0.024, + "step": 349 + }, + { + "epoch": 0.6746987951807228, + "grad_norm": 0.4074770510196686, + "learning_rate": 3.815605117896204e-05, + "loss": 0.0262, + "step": 350 + }, + { + "epoch": 0.6766265060240964, + "grad_norm": 0.48525840044021606, + "learning_rate": 3.8137196453321775e-05, + "loss": 0.0209, + "step": 351 + }, + { + "epoch": 0.6785542168674699, + "grad_norm": 0.7199739217758179, + "learning_rate": 3.811825052797308e-05, + "loss": 0.0396, + "step": 352 + }, + { + "epoch": 0.6804819277108434, + "grad_norm": 0.519540011882782, + "learning_rate": 3.8099213498182196e-05, + "loss": 0.0453, + "step": 353 + }, + { + "epoch": 0.6824096385542169, + "grad_norm": 0.9738391041755676, + "learning_rate": 3.808008545967349e-05, + "loss": 0.0317, + "step": 354 + }, + { + "epoch": 0.6843373493975904, + "grad_norm": 1.888344407081604, + "learning_rate": 3.8060866508628953e-05, + "loss": 0.0452, + "step": 355 + }, + { + "epoch": 0.6862650602409639, + "grad_norm": 0.48989811539649963, + "learning_rate": 3.8041556741687695e-05, + "loss": 0.0315, + "step": 356 + }, + { + "epoch": 0.6881927710843373, + "grad_norm": 0.3764645457267761, + "learning_rate": 3.8022156255945496e-05, + "loss": 0.0269, + "step": 357 + }, + { + "epoch": 0.6901204819277108, + "grad_norm": 0.46409738063812256, + "learning_rate": 3.800266514895429e-05, + "loss": 0.0171, + "step": 358 + }, + { + "epoch": 0.6920481927710843, + "grad_norm": 0.41091030836105347, + "learning_rate": 3.7983083518721695e-05, + "loss": 0.0167, + "step": 359 + }, + { + "epoch": 0.6939759036144578, + "grad_norm": 0.8375523090362549, + "learning_rate": 3.79634114637105e-05, + "loss": 0.0342, + "step": 360 + }, + { + "epoch": 0.6959036144578313, + "grad_norm": 1.7053394317626953, + "learning_rate": 3.794364908283817e-05, + "loss": 0.02, + "step": 361 + }, + { + "epoch": 0.6978313253012048, + "grad_norm": 0.4163115918636322, + "learning_rate": 3.792379647547637e-05, + "loss": 0.0138, + "step": 362 + }, + { + "epoch": 0.6997590361445784, + "grad_norm": 0.388751745223999, + "learning_rate": 3.790385374145046e-05, + "loss": 0.0172, + "step": 363 + }, + { + "epoch": 0.7016867469879519, + "grad_norm": 0.5584064722061157, + "learning_rate": 3.7883820981038966e-05, + "loss": 0.0254, + "step": 364 + }, + { + "epoch": 0.7036144578313253, + "grad_norm": 1.394264817237854, + "learning_rate": 3.7863698294973114e-05, + "loss": 0.037, + "step": 365 + }, + { + "epoch": 0.7055421686746988, + "grad_norm": 0.46280744671821594, + "learning_rate": 3.78434857844363e-05, + "loss": 0.0234, + "step": 366 + }, + { + "epoch": 0.7074698795180723, + "grad_norm": 0.39548924565315247, + "learning_rate": 3.782318355106358e-05, + "loss": 0.0164, + "step": 367 + }, + { + "epoch": 0.7093975903614458, + "grad_norm": 0.7307773232460022, + "learning_rate": 3.780279169694118e-05, + "loss": 0.0192, + "step": 368 + }, + { + "epoch": 0.7113253012048193, + "grad_norm": 0.28035807609558105, + "learning_rate": 3.778231032460594e-05, + "loss": 0.0131, + "step": 369 + }, + { + "epoch": 0.7132530120481928, + "grad_norm": 0.8376953601837158, + "learning_rate": 3.776173953704486e-05, + "loss": 0.0291, + "step": 370 + }, + { + "epoch": 0.7151807228915663, + "grad_norm": 0.7356843948364258, + "learning_rate": 3.774107943769454e-05, + "loss": 0.0214, + "step": 371 + }, + { + "epoch": 0.7171084337349397, + "grad_norm": 0.41503390669822693, + "learning_rate": 3.772033013044064e-05, + "loss": 0.0221, + "step": 372 + }, + { + "epoch": 0.7190361445783132, + "grad_norm": 0.35732385516166687, + "learning_rate": 3.7699491719617436e-05, + "loss": 0.015, + "step": 373 + }, + { + "epoch": 0.7209638554216867, + "grad_norm": 0.283778578042984, + "learning_rate": 3.76785643100072e-05, + "loss": 0.0146, + "step": 374 + }, + { + "epoch": 0.7228915662650602, + "grad_norm": 0.3219413459300995, + "learning_rate": 3.765754800683974e-05, + "loss": 0.015, + "step": 375 + }, + { + "epoch": 0.7248192771084337, + "grad_norm": 0.610431432723999, + "learning_rate": 3.7636442915791856e-05, + "loss": 0.0326, + "step": 376 + }, + { + "epoch": 0.7267469879518073, + "grad_norm": 4.944870948791504, + "learning_rate": 3.7615249142986784e-05, + "loss": 0.0432, + "step": 377 + }, + { + "epoch": 0.7286746987951808, + "grad_norm": 0.4894593060016632, + "learning_rate": 3.7593966794993696e-05, + "loss": 0.0174, + "step": 378 + }, + { + "epoch": 0.7306024096385542, + "grad_norm": 0.4211325943470001, + "learning_rate": 3.757259597882714e-05, + "loss": 0.023, + "step": 379 + }, + { + "epoch": 0.7325301204819277, + "grad_norm": 0.33621737360954285, + "learning_rate": 3.755113680194651e-05, + "loss": 0.0201, + "step": 380 + }, + { + "epoch": 0.7344578313253012, + "grad_norm": 0.5799694657325745, + "learning_rate": 3.7529589372255514e-05, + "loss": 0.0173, + "step": 381 + }, + { + "epoch": 0.7363855421686747, + "grad_norm": 0.5172572731971741, + "learning_rate": 3.750795379810162e-05, + "loss": 0.0284, + "step": 382 + }, + { + "epoch": 0.7383132530120482, + "grad_norm": 0.5715453028678894, + "learning_rate": 3.748623018827552e-05, + "loss": 0.0194, + "step": 383 + }, + { + "epoch": 0.7402409638554217, + "grad_norm": 0.5284178256988525, + "learning_rate": 3.746441865201056e-05, + "loss": 0.0247, + "step": 384 + }, + { + "epoch": 0.7421686746987952, + "grad_norm": 0.37828654050827026, + "learning_rate": 3.744251929898223e-05, + "loss": 0.0097, + "step": 385 + }, + { + "epoch": 0.7440963855421687, + "grad_norm": 0.3252779543399811, + "learning_rate": 3.742053223930758e-05, + "loss": 0.0238, + "step": 386 + }, + { + "epoch": 0.7460240963855421, + "grad_norm": 0.6031543612480164, + "learning_rate": 3.7398457583544674e-05, + "loss": 0.0332, + "step": 387 + }, + { + "epoch": 0.7479518072289156, + "grad_norm": 0.23846614360809326, + "learning_rate": 3.737629544269206e-05, + "loss": 0.0122, + "step": 388 + }, + { + "epoch": 0.7498795180722891, + "grad_norm": 0.5274029970169067, + "learning_rate": 3.7354045928188155e-05, + "loss": 0.0324, + "step": 389 + }, + { + "epoch": 0.7518072289156627, + "grad_norm": 0.4672217071056366, + "learning_rate": 3.733170915191075e-05, + "loss": 0.0196, + "step": 390 + }, + { + "epoch": 0.7537349397590362, + "grad_norm": 0.29819396138191223, + "learning_rate": 3.730928522617639e-05, + "loss": 0.0131, + "step": 391 + }, + { + "epoch": 0.7556626506024097, + "grad_norm": 0.43824997544288635, + "learning_rate": 3.7286774263739855e-05, + "loss": 0.0238, + "step": 392 + }, + { + "epoch": 0.7575903614457832, + "grad_norm": 0.2822072505950928, + "learning_rate": 3.726417637779357e-05, + "loss": 0.0314, + "step": 393 + }, + { + "epoch": 0.7595180722891566, + "grad_norm": 0.43815648555755615, + "learning_rate": 3.7241491681967044e-05, + "loss": 0.0144, + "step": 394 + }, + { + "epoch": 0.7614457831325301, + "grad_norm": 0.37194815278053284, + "learning_rate": 3.721872029032628e-05, + "loss": 0.0286, + "step": 395 + }, + { + "epoch": 0.7633734939759036, + "grad_norm": 0.7319737672805786, + "learning_rate": 3.719586231737322e-05, + "loss": 0.0427, + "step": 396 + }, + { + "epoch": 0.7653012048192771, + "grad_norm": 0.5870066285133362, + "learning_rate": 3.717291787804517e-05, + "loss": 0.0138, + "step": 397 + }, + { + "epoch": 0.7672289156626506, + "grad_norm": 0.6574277281761169, + "learning_rate": 3.7149887087714225e-05, + "loss": 0.061, + "step": 398 + }, + { + "epoch": 0.7691566265060241, + "grad_norm": 0.5467348694801331, + "learning_rate": 3.712677006218666e-05, + "loss": 0.022, + "step": 399 + }, + { + "epoch": 0.7710843373493976, + "grad_norm": 0.3589288890361786, + "learning_rate": 3.710356691770238e-05, + "loss": 0.0161, + "step": 400 + }, + { + "epoch": 0.7730120481927711, + "grad_norm": 0.574630618095398, + "learning_rate": 3.708027777093433e-05, + "loss": 0.0285, + "step": 401 + }, + { + "epoch": 0.7749397590361445, + "grad_norm": 0.39048445224761963, + "learning_rate": 3.70569027389879e-05, + "loss": 0.012, + "step": 402 + }, + { + "epoch": 0.776867469879518, + "grad_norm": 0.34803536534309387, + "learning_rate": 3.703344193940032e-05, + "loss": 0.0155, + "step": 403 + }, + { + "epoch": 0.7787951807228916, + "grad_norm": 1.188948392868042, + "learning_rate": 3.700989549014011e-05, + "loss": 0.0617, + "step": 404 + }, + { + "epoch": 0.7807228915662651, + "grad_norm": 0.473157674074173, + "learning_rate": 3.698626350960646e-05, + "loss": 0.0298, + "step": 405 + }, + { + "epoch": 0.7826506024096386, + "grad_norm": 0.42009076476097107, + "learning_rate": 3.6962546116628634e-05, + "loss": 0.03, + "step": 406 + }, + { + "epoch": 0.7845783132530121, + "grad_norm": 0.6334308981895447, + "learning_rate": 3.693874343046537e-05, + "loss": 0.0107, + "step": 407 + }, + { + "epoch": 0.7865060240963856, + "grad_norm": 0.35594677925109863, + "learning_rate": 3.6914855570804314e-05, + "loss": 0.0174, + "step": 408 + }, + { + "epoch": 0.788433734939759, + "grad_norm": 0.28985708951950073, + "learning_rate": 3.689088265776136e-05, + "loss": 0.0149, + "step": 409 + }, + { + "epoch": 0.7903614457831325, + "grad_norm": 0.3981950581073761, + "learning_rate": 3.686682481188011e-05, + "loss": 0.019, + "step": 410 + }, + { + "epoch": 0.792289156626506, + "grad_norm": 0.48819583654403687, + "learning_rate": 3.6842682154131193e-05, + "loss": 0.0217, + "step": 411 + }, + { + "epoch": 0.7942168674698795, + "grad_norm": 0.42819952964782715, + "learning_rate": 3.681845480591174e-05, + "loss": 0.0198, + "step": 412 + }, + { + "epoch": 0.796144578313253, + "grad_norm": 0.48591694235801697, + "learning_rate": 3.6794142889044727e-05, + "loss": 0.0253, + "step": 413 + }, + { + "epoch": 0.7980722891566265, + "grad_norm": 0.4730607271194458, + "learning_rate": 3.676974652577835e-05, + "loss": 0.0329, + "step": 414 + }, + { + "epoch": 0.8, + "grad_norm": 0.5390865802764893, + "learning_rate": 3.6745265838785434e-05, + "loss": 0.0479, + "step": 415 + }, + { + "epoch": 0.8019277108433734, + "grad_norm": 0.6377891302108765, + "learning_rate": 3.672070095116283e-05, + "loss": 0.019, + "step": 416 + }, + { + "epoch": 0.803855421686747, + "grad_norm": 0.8984615206718445, + "learning_rate": 3.669605198643075e-05, + "loss": 0.0444, + "step": 417 + }, + { + "epoch": 0.8057831325301205, + "grad_norm": 0.4913877546787262, + "learning_rate": 3.667131906853219e-05, + "loss": 0.031, + "step": 418 + }, + { + "epoch": 0.807710843373494, + "grad_norm": 0.37894028425216675, + "learning_rate": 3.664650232183229e-05, + "loss": 0.0195, + "step": 419 + }, + { + "epoch": 0.8096385542168675, + "grad_norm": 0.3644949495792389, + "learning_rate": 3.66216018711177e-05, + "loss": 0.018, + "step": 420 + }, + { + "epoch": 0.811566265060241, + "grad_norm": 0.414440393447876, + "learning_rate": 3.659661784159597e-05, + "loss": 0.0188, + "step": 421 + }, + { + "epoch": 0.8134939759036145, + "grad_norm": 0.49220341444015503, + "learning_rate": 3.65715503588949e-05, + "loss": 0.016, + "step": 422 + }, + { + "epoch": 0.815421686746988, + "grad_norm": 1.0939836502075195, + "learning_rate": 3.654639954906193e-05, + "loss": 0.0758, + "step": 423 + }, + { + "epoch": 0.8173493975903614, + "grad_norm": 0.43222442269325256, + "learning_rate": 3.652116553856349e-05, + "loss": 0.0308, + "step": 424 + }, + { + "epoch": 0.8192771084337349, + "grad_norm": 0.5081896185874939, + "learning_rate": 3.649584845428438e-05, + "loss": 0.0493, + "step": 425 + }, + { + "epoch": 0.8212048192771084, + "grad_norm": 0.9811948537826538, + "learning_rate": 3.64704484235271e-05, + "loss": 0.019, + "step": 426 + }, + { + "epoch": 0.8231325301204819, + "grad_norm": 0.31656572222709656, + "learning_rate": 3.6444965574011255e-05, + "loss": 0.0135, + "step": 427 + }, + { + "epoch": 0.8250602409638554, + "grad_norm": 0.7844433188438416, + "learning_rate": 3.641940003387289e-05, + "loss": 0.0402, + "step": 428 + }, + { + "epoch": 0.826987951807229, + "grad_norm": 0.3353273570537567, + "learning_rate": 3.6393751931663814e-05, + "loss": 0.0132, + "step": 429 + }, + { + "epoch": 0.8289156626506025, + "grad_norm": 0.7253058552742004, + "learning_rate": 3.6368021396351015e-05, + "loss": 0.0296, + "step": 430 + }, + { + "epoch": 0.8308433734939759, + "grad_norm": 0.45300304889678955, + "learning_rate": 3.634220855731598e-05, + "loss": 0.0258, + "step": 431 + }, + { + "epoch": 0.8327710843373494, + "grad_norm": 0.3480473458766937, + "learning_rate": 3.631631354435403e-05, + "loss": 0.0099, + "step": 432 + }, + { + "epoch": 0.8346987951807229, + "grad_norm": 2.1114516258239746, + "learning_rate": 3.62903364876737e-05, + "loss": 0.0457, + "step": 433 + }, + { + "epoch": 0.8366265060240964, + "grad_norm": 0.5649561882019043, + "learning_rate": 3.626427751789606e-05, + "loss": 0.0444, + "step": 434 + }, + { + "epoch": 0.8385542168674699, + "grad_norm": 0.3864995539188385, + "learning_rate": 3.623813676605405e-05, + "loss": 0.0223, + "step": 435 + }, + { + "epoch": 0.8404819277108434, + "grad_norm": 1.2134298086166382, + "learning_rate": 3.621191436359186e-05, + "loss": 0.0353, + "step": 436 + }, + { + "epoch": 0.8424096385542169, + "grad_norm": 0.4403415024280548, + "learning_rate": 3.6185610442364246e-05, + "loss": 0.0216, + "step": 437 + }, + { + "epoch": 0.8443373493975903, + "grad_norm": 0.6050297021865845, + "learning_rate": 3.6159225134635846e-05, + "loss": 0.0433, + "step": 438 + }, + { + "epoch": 0.8462650602409638, + "grad_norm": 0.7951678037643433, + "learning_rate": 3.6132758573080556e-05, + "loss": 0.031, + "step": 439 + }, + { + "epoch": 0.8481927710843373, + "grad_norm": 0.4991949796676636, + "learning_rate": 3.6106210890780834e-05, + "loss": 0.0313, + "step": 440 + }, + { + "epoch": 0.8501204819277108, + "grad_norm": 0.47951385378837585, + "learning_rate": 3.607958222122704e-05, + "loss": 0.0218, + "step": 441 + }, + { + "epoch": 0.8520481927710843, + "grad_norm": 0.7345194220542908, + "learning_rate": 3.6052872698316755e-05, + "loss": 0.0239, + "step": 442 + }, + { + "epoch": 0.8539759036144579, + "grad_norm": 1.4814884662628174, + "learning_rate": 3.602608245635414e-05, + "loss": 0.0127, + "step": 443 + }, + { + "epoch": 0.8559036144578314, + "grad_norm": 2.4240877628326416, + "learning_rate": 3.599921163004922e-05, + "loss": 0.0618, + "step": 444 + }, + { + "epoch": 0.8578313253012049, + "grad_norm": 0.41523510217666626, + "learning_rate": 3.5972260354517216e-05, + "loss": 0.0283, + "step": 445 + }, + { + "epoch": 0.8597590361445783, + "grad_norm": 0.5577677488327026, + "learning_rate": 3.594522876527791e-05, + "loss": 0.0271, + "step": 446 + }, + { + "epoch": 0.8616867469879518, + "grad_norm": 0.5829064846038818, + "learning_rate": 3.591811699825487e-05, + "loss": 0.0169, + "step": 447 + }, + { + "epoch": 0.8636144578313253, + "grad_norm": 0.4478822350502014, + "learning_rate": 3.5890925189774886e-05, + "loss": 0.0239, + "step": 448 + }, + { + "epoch": 0.8655421686746988, + "grad_norm": 0.3498048782348633, + "learning_rate": 3.586365347656718e-05, + "loss": 0.0137, + "step": 449 + }, + { + "epoch": 0.8674698795180723, + "grad_norm": 0.6571130156517029, + "learning_rate": 3.583630199576278e-05, + "loss": 0.027, + "step": 450 + }, + { + "epoch": 0.8693975903614458, + "grad_norm": 0.344970166683197, + "learning_rate": 3.58088708848938e-05, + "loss": 0.0167, + "step": 451 + }, + { + "epoch": 0.8713253012048193, + "grad_norm": 0.34611570835113525, + "learning_rate": 3.5781360281892775e-05, + "loss": 0.0468, + "step": 452 + }, + { + "epoch": 0.8732530120481927, + "grad_norm": 0.66157066822052, + "learning_rate": 3.575377032509194e-05, + "loss": 0.0344, + "step": 453 + }, + { + "epoch": 0.8751807228915662, + "grad_norm": 0.3676326870918274, + "learning_rate": 3.5726101153222534e-05, + "loss": 0.0366, + "step": 454 + }, + { + "epoch": 0.8771084337349397, + "grad_norm": 0.5958423018455505, + "learning_rate": 3.569835290541414e-05, + "loss": 0.0382, + "step": 455 + }, + { + "epoch": 0.8790361445783132, + "grad_norm": 0.36787471175193787, + "learning_rate": 3.567052572119397e-05, + "loss": 0.018, + "step": 456 + }, + { + "epoch": 0.8809638554216868, + "grad_norm": 0.9478234052658081, + "learning_rate": 3.564261974048611e-05, + "loss": 0.0179, + "step": 457 + }, + { + "epoch": 0.8828915662650603, + "grad_norm": 0.3337579369544983, + "learning_rate": 3.56146351036109e-05, + "loss": 0.0147, + "step": 458 + }, + { + "epoch": 0.8848192771084338, + "grad_norm": 0.4984932243824005, + "learning_rate": 3.558657195128416e-05, + "loss": 0.0224, + "step": 459 + }, + { + "epoch": 0.8867469879518072, + "grad_norm": 0.36718735098838806, + "learning_rate": 3.555843042461653e-05, + "loss": 0.0202, + "step": 460 + }, + { + "epoch": 0.8886746987951807, + "grad_norm": 0.4081745445728302, + "learning_rate": 3.553021066511274e-05, + "loss": 0.0288, + "step": 461 + }, + { + "epoch": 0.8906024096385542, + "grad_norm": 0.3233242332935333, + "learning_rate": 3.55019128146709e-05, + "loss": 0.0362, + "step": 462 + }, + { + "epoch": 0.8925301204819277, + "grad_norm": 0.6560158729553223, + "learning_rate": 3.547353701558178e-05, + "loss": 0.038, + "step": 463 + }, + { + "epoch": 0.8944578313253012, + "grad_norm": 0.47668641805648804, + "learning_rate": 3.544508341052811e-05, + "loss": 0.0399, + "step": 464 + }, + { + "epoch": 0.8963855421686747, + "grad_norm": 0.45512664318084717, + "learning_rate": 3.541655214258383e-05, + "loss": 0.022, + "step": 465 + }, + { + "epoch": 0.8983132530120482, + "grad_norm": 0.8410730361938477, + "learning_rate": 3.538794335521343e-05, + "loss": 0.0315, + "step": 466 + }, + { + "epoch": 0.9002409638554217, + "grad_norm": 0.4872909486293793, + "learning_rate": 3.535925719227117e-05, + "loss": 0.0152, + "step": 467 + }, + { + "epoch": 0.9021686746987951, + "grad_norm": 0.45623311400413513, + "learning_rate": 3.533049379800038e-05, + "loss": 0.0305, + "step": 468 + }, + { + "epoch": 0.9040963855421686, + "grad_norm": 0.43087029457092285, + "learning_rate": 3.530165331703275e-05, + "loss": 0.0131, + "step": 469 + }, + { + "epoch": 0.9060240963855422, + "grad_norm": 0.4610525369644165, + "learning_rate": 3.527273589438756e-05, + "loss": 0.0187, + "step": 470 + }, + { + "epoch": 0.9079518072289157, + "grad_norm": 0.3356114327907562, + "learning_rate": 3.5243741675471006e-05, + "loss": 0.0185, + "step": 471 + }, + { + "epoch": 0.9098795180722892, + "grad_norm": 0.9065960049629211, + "learning_rate": 3.5214670806075426e-05, + "loss": 0.0433, + "step": 472 + }, + { + "epoch": 0.9118072289156627, + "grad_norm": 0.3652578294277191, + "learning_rate": 3.518552343237858e-05, + "loss": 0.02, + "step": 473 + }, + { + "epoch": 0.9137349397590362, + "grad_norm": 0.32377883791923523, + "learning_rate": 3.5156299700942916e-05, + "loss": 0.0165, + "step": 474 + }, + { + "epoch": 0.9156626506024096, + "grad_norm": 0.2431817352771759, + "learning_rate": 3.512699975871485e-05, + "loss": 0.0172, + "step": 475 + }, + { + "epoch": 0.9175903614457831, + "grad_norm": 0.6390707492828369, + "learning_rate": 3.509762375302399e-05, + "loss": 0.0356, + "step": 476 + }, + { + "epoch": 0.9195180722891566, + "grad_norm": 0.2283092886209488, + "learning_rate": 3.506817183158243e-05, + "loss": 0.0088, + "step": 477 + }, + { + "epoch": 0.9214457831325301, + "grad_norm": 0.5053914189338684, + "learning_rate": 3.5038644142483966e-05, + "loss": 0.0389, + "step": 478 + }, + { + "epoch": 0.9233734939759036, + "grad_norm": 0.2567576467990875, + "learning_rate": 3.500904083420342e-05, + "loss": 0.0155, + "step": 479 + }, + { + "epoch": 0.9253012048192771, + "grad_norm": 0.6852384209632874, + "learning_rate": 3.497936205559583e-05, + "loss": 0.0247, + "step": 480 + }, + { + "epoch": 0.9272289156626506, + "grad_norm": 0.36403414607048035, + "learning_rate": 3.494960795589572e-05, + "loss": 0.023, + "step": 481 + }, + { + "epoch": 0.929156626506024, + "grad_norm": 0.506554901599884, + "learning_rate": 3.491977868471635e-05, + "loss": 0.0273, + "step": 482 + }, + { + "epoch": 0.9310843373493976, + "grad_norm": 0.38329923152923584, + "learning_rate": 3.4889874392048985e-05, + "loss": 0.0169, + "step": 483 + }, + { + "epoch": 0.9330120481927711, + "grad_norm": 0.2805836498737335, + "learning_rate": 3.48598952282621e-05, + "loss": 0.0105, + "step": 484 + }, + { + "epoch": 0.9349397590361446, + "grad_norm": 0.6315302848815918, + "learning_rate": 3.482984134410067e-05, + "loss": 0.0289, + "step": 485 + }, + { + "epoch": 0.9368674698795181, + "grad_norm": 0.6431388854980469, + "learning_rate": 3.479971289068537e-05, + "loss": 0.0311, + "step": 486 + }, + { + "epoch": 0.9387951807228916, + "grad_norm": 0.9794723391532898, + "learning_rate": 3.476951001951184e-05, + "loss": 0.0452, + "step": 487 + }, + { + "epoch": 0.9407228915662651, + "grad_norm": 0.7984824180603027, + "learning_rate": 3.473923288244991e-05, + "loss": 0.0689, + "step": 488 + }, + { + "epoch": 0.9426506024096386, + "grad_norm": 0.46362006664276123, + "learning_rate": 3.470888163174286e-05, + "loss": 0.0241, + "step": 489 + }, + { + "epoch": 0.944578313253012, + "grad_norm": 0.5051195025444031, + "learning_rate": 3.467845642000661e-05, + "loss": 0.0228, + "step": 490 + }, + { + "epoch": 0.9465060240963855, + "grad_norm": 0.3082812428474426, + "learning_rate": 3.4647957400229004e-05, + "loss": 0.0144, + "step": 491 + }, + { + "epoch": 0.948433734939759, + "grad_norm": 0.2691391110420227, + "learning_rate": 3.461738472576902e-05, + "loss": 0.0167, + "step": 492 + }, + { + "epoch": 0.9503614457831325, + "grad_norm": 0.5627671480178833, + "learning_rate": 3.458673855035597e-05, + "loss": 0.031, + "step": 493 + }, + { + "epoch": 0.952289156626506, + "grad_norm": 0.4571435749530792, + "learning_rate": 3.455601902808876e-05, + "loss": 0.0191, + "step": 494 + }, + { + "epoch": 0.9542168674698795, + "grad_norm": 1.0117709636688232, + "learning_rate": 3.452522631343515e-05, + "loss": 0.0192, + "step": 495 + }, + { + "epoch": 0.9561445783132531, + "grad_norm": 0.28375712037086487, + "learning_rate": 3.449436056123086e-05, + "loss": 0.0159, + "step": 496 + }, + { + "epoch": 0.9580722891566265, + "grad_norm": 0.26381856203079224, + "learning_rate": 3.446342192667893e-05, + "loss": 0.0113, + "step": 497 + }, + { + "epoch": 0.96, + "grad_norm": 0.49317577481269836, + "learning_rate": 3.443241056534884e-05, + "loss": 0.0332, + "step": 498 + }, + { + "epoch": 0.9619277108433735, + "grad_norm": 0.28884485363960266, + "learning_rate": 3.440132663317579e-05, + "loss": 0.0117, + "step": 499 + }, + { + "epoch": 0.963855421686747, + "grad_norm": 0.36255285143852234, + "learning_rate": 3.4370170286459864e-05, + "loss": 0.0169, + "step": 500 + }, + { + "epoch": 0.9657831325301205, + "grad_norm": 0.4265049993991852, + "learning_rate": 3.433894168186529e-05, + "loss": 0.0217, + "step": 501 + }, + { + "epoch": 0.967710843373494, + "grad_norm": 0.8169426321983337, + "learning_rate": 3.430764097641962e-05, + "loss": 0.0207, + "step": 502 + }, + { + "epoch": 0.9696385542168675, + "grad_norm": 1.866077184677124, + "learning_rate": 3.427626832751296e-05, + "loss": 0.0381, + "step": 503 + }, + { + "epoch": 0.971566265060241, + "grad_norm": 0.33124980330467224, + "learning_rate": 3.424482389289716e-05, + "loss": 0.0245, + "step": 504 + }, + { + "epoch": 0.9734939759036144, + "grad_norm": 0.37479540705680847, + "learning_rate": 3.4213307830685055e-05, + "loss": 0.0164, + "step": 505 + }, + { + "epoch": 0.9754216867469879, + "grad_norm": 0.39738863706588745, + "learning_rate": 3.4181720299349615e-05, + "loss": 0.0297, + "step": 506 + }, + { + "epoch": 0.9773493975903614, + "grad_norm": 0.2567287087440491, + "learning_rate": 3.4150061457723205e-05, + "loss": 0.0102, + "step": 507 + }, + { + "epoch": 0.9792771084337349, + "grad_norm": 0.6230517029762268, + "learning_rate": 3.411833146499675e-05, + "loss": 0.0243, + "step": 508 + }, + { + "epoch": 0.9812048192771085, + "grad_norm": 0.44843971729278564, + "learning_rate": 3.408653048071894e-05, + "loss": 0.0357, + "step": 509 + }, + { + "epoch": 0.983132530120482, + "grad_norm": 1.0569655895233154, + "learning_rate": 3.405465866479546e-05, + "loss": 0.037, + "step": 510 + }, + { + "epoch": 0.9850602409638555, + "grad_norm": 0.29000964760780334, + "learning_rate": 3.402271617748812e-05, + "loss": 0.0129, + "step": 511 + }, + { + "epoch": 0.9869879518072289, + "grad_norm": 2.1627447605133057, + "learning_rate": 3.399070317941413e-05, + "loss": 0.0442, + "step": 512 + }, + { + "epoch": 0.9889156626506024, + "grad_norm": 0.27371272444725037, + "learning_rate": 3.395861983154522e-05, + "loss": 0.0119, + "step": 513 + }, + { + "epoch": 0.9908433734939759, + "grad_norm": 0.4117226302623749, + "learning_rate": 3.392646629520688e-05, + "loss": 0.0455, + "step": 514 + }, + { + "epoch": 0.9927710843373494, + "grad_norm": 0.5098996758460999, + "learning_rate": 3.389424273207752e-05, + "loss": 0.0203, + "step": 515 + }, + { + "epoch": 0.9946987951807229, + "grad_norm": 0.5192157626152039, + "learning_rate": 3.386194930418767e-05, + "loss": 0.0329, + "step": 516 + }, + { + "epoch": 0.9966265060240964, + "grad_norm": 0.18757697939872742, + "learning_rate": 3.382958617391915e-05, + "loss": 0.0065, + "step": 517 + }, + { + "epoch": 0.9985542168674699, + "grad_norm": 0.3334413170814514, + "learning_rate": 3.3797153504004296e-05, + "loss": 0.0266, + "step": 518 + }, + { + "epoch": 1.0, + "grad_norm": 0.4152225852012634, + "learning_rate": 3.3764651457525095e-05, + "loss": 0.0169, + "step": 519 + }, + { + "epoch": 1.0019277108433735, + "grad_norm": 0.43535247445106506, + "learning_rate": 3.373208019791237e-05, + "loss": 0.0221, + "step": 520 + }, + { + "epoch": 1.003855421686747, + "grad_norm": 0.39292722940444946, + "learning_rate": 3.3699439888945e-05, + "loss": 0.0211, + "step": 521 + }, + { + "epoch": 1.0057831325301205, + "grad_norm": 0.19566713273525238, + "learning_rate": 3.366673069474904e-05, + "loss": 0.0069, + "step": 522 + }, + { + "epoch": 1.007710843373494, + "grad_norm": 0.5101853609085083, + "learning_rate": 3.3633952779796914e-05, + "loss": 0.0191, + "step": 523 + }, + { + "epoch": 1.0096385542168675, + "grad_norm": 0.999434769153595, + "learning_rate": 3.360110630890664e-05, + "loss": 0.0196, + "step": 524 + }, + { + "epoch": 1.011566265060241, + "grad_norm": 0.4646223783493042, + "learning_rate": 3.356819144724092e-05, + "loss": 0.0328, + "step": 525 + }, + { + "epoch": 1.0134939759036146, + "grad_norm": 0.3132480978965759, + "learning_rate": 3.3535208360306354e-05, + "loss": 0.0203, + "step": 526 + }, + { + "epoch": 1.0154216867469879, + "grad_norm": 0.3038032352924347, + "learning_rate": 3.350215721395261e-05, + "loss": 0.0122, + "step": 527 + }, + { + "epoch": 1.0173493975903614, + "grad_norm": 0.45082882046699524, + "learning_rate": 3.346903817437157e-05, + "loss": 0.0437, + "step": 528 + }, + { + "epoch": 1.0192771084337349, + "grad_norm": 0.26917046308517456, + "learning_rate": 3.343585140809651e-05, + "loss": 0.013, + "step": 529 + }, + { + "epoch": 1.0212048192771084, + "grad_norm": 0.23869264125823975, + "learning_rate": 3.3402597082001276e-05, + "loss": 0.008, + "step": 530 + }, + { + "epoch": 1.0231325301204819, + "grad_norm": 0.31315353512763977, + "learning_rate": 3.3369275363299394e-05, + "loss": 0.0078, + "step": 531 + }, + { + "epoch": 1.0250602409638554, + "grad_norm": 0.4780346751213074, + "learning_rate": 3.333588641954327e-05, + "loss": 0.0225, + "step": 532 + }, + { + "epoch": 1.026987951807229, + "grad_norm": 0.2920368015766144, + "learning_rate": 3.330243041862336e-05, + "loss": 0.0118, + "step": 533 + }, + { + "epoch": 1.0289156626506024, + "grad_norm": 0.543669581413269, + "learning_rate": 3.326890752876728e-05, + "loss": 0.0338, + "step": 534 + }, + { + "epoch": 1.030843373493976, + "grad_norm": 0.4288000464439392, + "learning_rate": 3.323531791853901e-05, + "loss": 0.0341, + "step": 535 + }, + { + "epoch": 1.0327710843373494, + "grad_norm": 0.26600322127342224, + "learning_rate": 3.3201661756838e-05, + "loss": 0.0184, + "step": 536 + }, + { + "epoch": 1.034698795180723, + "grad_norm": 0.290937602519989, + "learning_rate": 3.316793921289835e-05, + "loss": 0.0152, + "step": 537 + }, + { + "epoch": 1.0366265060240965, + "grad_norm": 0.7621443271636963, + "learning_rate": 3.313415045628795e-05, + "loss": 0.0326, + "step": 538 + }, + { + "epoch": 1.03855421686747, + "grad_norm": 0.5581283569335938, + "learning_rate": 3.3100295656907646e-05, + "loss": 0.0164, + "step": 539 + }, + { + "epoch": 1.0404819277108435, + "grad_norm": 0.20930901169776917, + "learning_rate": 3.306637498499034e-05, + "loss": 0.0091, + "step": 540 + }, + { + "epoch": 1.0424096385542168, + "grad_norm": 0.46212059259414673, + "learning_rate": 3.303238861110018e-05, + "loss": 0.0118, + "step": 541 + }, + { + "epoch": 1.0443373493975903, + "grad_norm": 0.38259151577949524, + "learning_rate": 3.299833670613168e-05, + "loss": 0.0081, + "step": 542 + }, + { + "epoch": 1.0462650602409638, + "grad_norm": 0.4888618290424347, + "learning_rate": 3.2964219441308865e-05, + "loss": 0.0138, + "step": 543 + }, + { + "epoch": 1.0481927710843373, + "grad_norm": 0.32103127241134644, + "learning_rate": 3.2930036988184425e-05, + "loss": 0.0171, + "step": 544 + }, + { + "epoch": 1.0501204819277108, + "grad_norm": 0.27787327766418457, + "learning_rate": 3.28957895186388e-05, + "loss": 0.0106, + "step": 545 + }, + { + "epoch": 1.0520481927710843, + "grad_norm": 0.35597777366638184, + "learning_rate": 3.2861477204879395e-05, + "loss": 0.0123, + "step": 546 + }, + { + "epoch": 1.0539759036144578, + "grad_norm": 0.3619804084300995, + "learning_rate": 3.2827100219439656e-05, + "loss": 0.0088, + "step": 547 + }, + { + "epoch": 1.0559036144578313, + "grad_norm": 0.2525513470172882, + "learning_rate": 3.279265873517822e-05, + "loss": 0.0179, + "step": 548 + }, + { + "epoch": 1.0578313253012048, + "grad_norm": 0.3910020887851715, + "learning_rate": 3.275815292527804e-05, + "loss": 0.0142, + "step": 549 + }, + { + "epoch": 1.0597590361445783, + "grad_norm": 0.30515050888061523, + "learning_rate": 3.2723582963245526e-05, + "loss": 0.0123, + "step": 550 + }, + { + "epoch": 1.0616867469879518, + "grad_norm": 0.21708644926548004, + "learning_rate": 3.2688949022909665e-05, + "loss": 0.0098, + "step": 551 + }, + { + "epoch": 1.0636144578313254, + "grad_norm": 0.23307719826698303, + "learning_rate": 3.265425127842114e-05, + "loss": 0.0097, + "step": 552 + }, + { + "epoch": 1.0655421686746989, + "grad_norm": 0.676654577255249, + "learning_rate": 3.261948990425147e-05, + "loss": 0.0227, + "step": 553 + }, + { + "epoch": 1.0674698795180724, + "grad_norm": 0.4593975841999054, + "learning_rate": 3.258466507519213e-05, + "loss": 0.047, + "step": 554 + }, + { + "epoch": 1.0693975903614459, + "grad_norm": 0.19405829906463623, + "learning_rate": 3.254977696635366e-05, + "loss": 0.0314, + "step": 555 + }, + { + "epoch": 1.0713253012048192, + "grad_norm": 0.14563389122486115, + "learning_rate": 3.2514825753164774e-05, + "loss": 0.0046, + "step": 556 + }, + { + "epoch": 1.0732530120481927, + "grad_norm": 0.2642340064048767, + "learning_rate": 3.247981161137153e-05, + "loss": 0.022, + "step": 557 + }, + { + "epoch": 1.0751807228915662, + "grad_norm": 0.17274761199951172, + "learning_rate": 3.2444734717036386e-05, + "loss": 0.0134, + "step": 558 + }, + { + "epoch": 1.0771084337349397, + "grad_norm": 0.44354626536369324, + "learning_rate": 3.240959524653735e-05, + "loss": 0.0211, + "step": 559 + }, + { + "epoch": 1.0790361445783132, + "grad_norm": 0.2806888818740845, + "learning_rate": 3.237439337656708e-05, + "loss": 0.0141, + "step": 560 + }, + { + "epoch": 1.0809638554216867, + "grad_norm": 0.21679501235485077, + "learning_rate": 3.2339129284131994e-05, + "loss": 0.019, + "step": 561 + }, + { + "epoch": 1.0828915662650602, + "grad_norm": 0.3040260076522827, + "learning_rate": 3.2303803146551386e-05, + "loss": 0.0249, + "step": 562 + }, + { + "epoch": 1.0848192771084337, + "grad_norm": 0.2793775200843811, + "learning_rate": 3.226841514145656e-05, + "loss": 0.0088, + "step": 563 + }, + { + "epoch": 1.0867469879518072, + "grad_norm": 0.149955615401268, + "learning_rate": 3.223296544678987e-05, + "loss": 0.0054, + "step": 564 + }, + { + "epoch": 1.0886746987951808, + "grad_norm": 0.22166767716407776, + "learning_rate": 3.219745424080389e-05, + "loss": 0.0109, + "step": 565 + }, + { + "epoch": 1.0906024096385543, + "grad_norm": 0.22399431467056274, + "learning_rate": 3.2161881702060476e-05, + "loss": 0.0106, + "step": 566 + }, + { + "epoch": 1.0925301204819278, + "grad_norm": 0.18537986278533936, + "learning_rate": 3.2126248009429905e-05, + "loss": 0.0077, + "step": 567 + }, + { + "epoch": 1.0944578313253013, + "grad_norm": 0.24511495232582092, + "learning_rate": 3.2090553342089935e-05, + "loss": 0.0093, + "step": 568 + }, + { + "epoch": 1.0963855421686748, + "grad_norm": 0.4766045808792114, + "learning_rate": 3.205479787952494e-05, + "loss": 0.036, + "step": 569 + }, + { + "epoch": 1.0983132530120483, + "grad_norm": 0.1425715535879135, + "learning_rate": 3.201898180152499e-05, + "loss": 0.0085, + "step": 570 + }, + { + "epoch": 1.1002409638554216, + "grad_norm": 0.1909666359424591, + "learning_rate": 3.1983105288184945e-05, + "loss": 0.0081, + "step": 571 + }, + { + "epoch": 1.102168674698795, + "grad_norm": 0.44077104330062866, + "learning_rate": 3.194716851990355e-05, + "loss": 0.017, + "step": 572 + }, + { + "epoch": 1.1040963855421686, + "grad_norm": 0.5757400989532471, + "learning_rate": 3.191117167738253e-05, + "loss": 0.021, + "step": 573 + }, + { + "epoch": 1.106024096385542, + "grad_norm": 0.1977701038122177, + "learning_rate": 3.1875114941625705e-05, + "loss": 0.0096, + "step": 574 + }, + { + "epoch": 1.1079518072289156, + "grad_norm": 0.3524581491947174, + "learning_rate": 3.1838998493938026e-05, + "loss": 0.0118, + "step": 575 + }, + { + "epoch": 1.1098795180722891, + "grad_norm": 0.3301331698894501, + "learning_rate": 3.180282251592472e-05, + "loss": 0.0094, + "step": 576 + }, + { + "epoch": 1.1118072289156626, + "grad_norm": 0.2774488925933838, + "learning_rate": 3.1766587189490336e-05, + "loss": 0.0131, + "step": 577 + }, + { + "epoch": 1.1137349397590361, + "grad_norm": 1.732595443725586, + "learning_rate": 3.173029269683785e-05, + "loss": 0.0445, + "step": 578 + }, + { + "epoch": 1.1156626506024097, + "grad_norm": 0.28746843338012695, + "learning_rate": 3.169393922046776e-05, + "loss": 0.0116, + "step": 579 + }, + { + "epoch": 1.1175903614457832, + "grad_norm": 0.2952995002269745, + "learning_rate": 3.165752694317713e-05, + "loss": 0.0116, + "step": 580 + }, + { + "epoch": 1.1195180722891567, + "grad_norm": 0.2938575744628906, + "learning_rate": 3.16210560480587e-05, + "loss": 0.013, + "step": 581 + }, + { + "epoch": 1.1214457831325302, + "grad_norm": 0.22283495962619781, + "learning_rate": 3.158452671849998e-05, + "loss": 0.0052, + "step": 582 + }, + { + "epoch": 1.1233734939759037, + "grad_norm": 0.6272858381271362, + "learning_rate": 3.154793913818226e-05, + "loss": 0.0182, + "step": 583 + }, + { + "epoch": 1.1253012048192772, + "grad_norm": 0.479753702878952, + "learning_rate": 3.1511293491079804e-05, + "loss": 0.0146, + "step": 584 + }, + { + "epoch": 1.1272289156626507, + "grad_norm": 0.31104400753974915, + "learning_rate": 3.1474589961458786e-05, + "loss": 0.0139, + "step": 585 + }, + { + "epoch": 1.129156626506024, + "grad_norm": 0.4932832419872284, + "learning_rate": 3.1437828733876477e-05, + "loss": 0.0236, + "step": 586 + }, + { + "epoch": 1.1310843373493975, + "grad_norm": 0.222808837890625, + "learning_rate": 3.140100999318025e-05, + "loss": 0.0084, + "step": 587 + }, + { + "epoch": 1.133012048192771, + "grad_norm": 0.4515356719493866, + "learning_rate": 3.136413392450668e-05, + "loss": 0.0215, + "step": 588 + }, + { + "epoch": 1.1349397590361445, + "grad_norm": 0.39302268624305725, + "learning_rate": 3.132720071328061e-05, + "loss": 0.0154, + "step": 589 + }, + { + "epoch": 1.136867469879518, + "grad_norm": 0.43382835388183594, + "learning_rate": 3.1290210545214205e-05, + "loss": 0.0088, + "step": 590 + }, + { + "epoch": 1.1387951807228915, + "grad_norm": 0.18707136809825897, + "learning_rate": 3.125316360630602e-05, + "loss": 0.0126, + "step": 591 + }, + { + "epoch": 1.140722891566265, + "grad_norm": 0.5688219666481018, + "learning_rate": 3.121606008284011e-05, + "loss": 0.0147, + "step": 592 + }, + { + "epoch": 1.1426506024096386, + "grad_norm": 0.3321833312511444, + "learning_rate": 3.1178900161385005e-05, + "loss": 0.0119, + "step": 593 + }, + { + "epoch": 1.144578313253012, + "grad_norm": 0.3738424777984619, + "learning_rate": 3.114168402879286e-05, + "loss": 0.0158, + "step": 594 + }, + { + "epoch": 1.1465060240963856, + "grad_norm": 0.2386978417634964, + "learning_rate": 3.110441187219846e-05, + "loss": 0.0107, + "step": 595 + }, + { + "epoch": 1.148433734939759, + "grad_norm": 0.2165699452161789, + "learning_rate": 3.10670838790183e-05, + "loss": 0.0079, + "step": 596 + }, + { + "epoch": 1.1503614457831326, + "grad_norm": 0.25952696800231934, + "learning_rate": 3.102970023694965e-05, + "loss": 0.0147, + "step": 597 + }, + { + "epoch": 1.152289156626506, + "grad_norm": 0.21448305249214172, + "learning_rate": 3.099226113396959e-05, + "loss": 0.0099, + "step": 598 + }, + { + "epoch": 1.1542168674698796, + "grad_norm": 0.37226060032844543, + "learning_rate": 3.095476675833405e-05, + "loss": 0.0214, + "step": 599 + }, + { + "epoch": 1.1561445783132531, + "grad_norm": 0.29637983441352844, + "learning_rate": 3.0917217298576955e-05, + "loss": 0.0118, + "step": 600 + }, + { + "epoch": 1.1580722891566264, + "grad_norm": 0.18535609543323517, + "learning_rate": 3.0879612943509154e-05, + "loss": 0.0086, + "step": 601 + }, + { + "epoch": 1.16, + "grad_norm": 0.25874125957489014, + "learning_rate": 3.0841953882217536e-05, + "loss": 0.0088, + "step": 602 + }, + { + "epoch": 1.1619277108433734, + "grad_norm": 0.46092745661735535, + "learning_rate": 3.08042403040641e-05, + "loss": 0.0241, + "step": 603 + }, + { + "epoch": 1.163855421686747, + "grad_norm": 0.27023249864578247, + "learning_rate": 3.076647239868494e-05, + "loss": 0.0154, + "step": 604 + }, + { + "epoch": 1.1657831325301204, + "grad_norm": 0.445157527923584, + "learning_rate": 3.072865035598933e-05, + "loss": 0.0197, + "step": 605 + }, + { + "epoch": 1.167710843373494, + "grad_norm": 0.18097272515296936, + "learning_rate": 3.06907743661588e-05, + "loss": 0.0093, + "step": 606 + }, + { + "epoch": 1.1696385542168675, + "grad_norm": 0.22469942271709442, + "learning_rate": 3.065284461964609e-05, + "loss": 0.0171, + "step": 607 + }, + { + "epoch": 1.171566265060241, + "grad_norm": 0.20190906524658203, + "learning_rate": 3.061486130717428e-05, + "loss": 0.008, + "step": 608 + }, + { + "epoch": 1.1734939759036145, + "grad_norm": 0.18294145166873932, + "learning_rate": 3.057682461973579e-05, + "loss": 0.0155, + "step": 609 + }, + { + "epoch": 1.175421686746988, + "grad_norm": 0.34203943610191345, + "learning_rate": 3.053873474859143e-05, + "loss": 0.0212, + "step": 610 + }, + { + "epoch": 1.1773493975903615, + "grad_norm": 0.49073582887649536, + "learning_rate": 3.050059188526942e-05, + "loss": 0.019, + "step": 611 + }, + { + "epoch": 1.179277108433735, + "grad_norm": 0.3537680506706238, + "learning_rate": 3.046239622156446e-05, + "loss": 0.0147, + "step": 612 + }, + { + "epoch": 1.1812048192771085, + "grad_norm": 0.2584632635116577, + "learning_rate": 3.042414794953674e-05, + "loss": 0.0088, + "step": 613 + }, + { + "epoch": 1.1831325301204818, + "grad_norm": 0.3529360890388489, + "learning_rate": 3.0385847261510975e-05, + "loss": 0.0187, + "step": 614 + }, + { + "epoch": 1.1850602409638555, + "grad_norm": 0.3331570327281952, + "learning_rate": 3.0347494350075465e-05, + "loss": 0.0124, + "step": 615 + }, + { + "epoch": 1.1869879518072288, + "grad_norm": 0.2223527580499649, + "learning_rate": 3.0309089408081074e-05, + "loss": 0.01, + "step": 616 + }, + { + "epoch": 1.1889156626506023, + "grad_norm": 0.21985746920108795, + "learning_rate": 3.027063262864032e-05, + "loss": 0.0087, + "step": 617 + }, + { + "epoch": 1.1908433734939758, + "grad_norm": 0.2989653944969177, + "learning_rate": 3.023212420512637e-05, + "loss": 0.0137, + "step": 618 + }, + { + "epoch": 1.1927710843373494, + "grad_norm": 0.17423275113105774, + "learning_rate": 3.0193564331172074e-05, + "loss": 0.0056, + "step": 619 + }, + { + "epoch": 1.1946987951807229, + "grad_norm": 1.0992127656936646, + "learning_rate": 3.0154953200668976e-05, + "loss": 0.0274, + "step": 620 + }, + { + "epoch": 1.1966265060240964, + "grad_norm": 0.21641989052295685, + "learning_rate": 3.011629100776638e-05, + "loss": 0.0151, + "step": 621 + }, + { + "epoch": 1.1985542168674699, + "grad_norm": 0.4558199644088745, + "learning_rate": 3.007757794687033e-05, + "loss": 0.0424, + "step": 622 + }, + { + "epoch": 1.2004819277108434, + "grad_norm": 0.42380189895629883, + "learning_rate": 3.003881421264266e-05, + "loss": 0.0079, + "step": 623 + }, + { + "epoch": 1.202409638554217, + "grad_norm": 0.28791171312332153, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0142, + "step": 624 + }, + { + "epoch": 1.2043373493975904, + "grad_norm": 0.3906581997871399, + "learning_rate": 2.996113550411281e-05, + "loss": 0.0251, + "step": 625 + }, + { + "epoch": 1.206265060240964, + "grad_norm": 0.47848746180534363, + "learning_rate": 2.9922220920404375e-05, + "loss": 0.0137, + "step": 626 + }, + { + "epoch": 1.2081927710843374, + "grad_norm": 0.22666941583156586, + "learning_rate": 2.9883256444549862e-05, + "loss": 0.0105, + "step": 627 + }, + { + "epoch": 1.210120481927711, + "grad_norm": 0.18968136608600616, + "learning_rate": 2.984424227247529e-05, + "loss": 0.0089, + "step": 628 + }, + { + "epoch": 1.2120481927710842, + "grad_norm": 0.28732606768608093, + "learning_rate": 2.980517860035656e-05, + "loss": 0.0253, + "step": 629 + }, + { + "epoch": 1.213975903614458, + "grad_norm": 0.21131543815135956, + "learning_rate": 2.9766065624618518e-05, + "loss": 0.0134, + "step": 630 + }, + { + "epoch": 1.2159036144578312, + "grad_norm": 0.7594877481460571, + "learning_rate": 2.972690354193388e-05, + "loss": 0.0157, + "step": 631 + }, + { + "epoch": 1.2178313253012047, + "grad_norm": 0.730291485786438, + "learning_rate": 2.96876925492223e-05, + "loss": 0.0204, + "step": 632 + }, + { + "epoch": 1.2197590361445783, + "grad_norm": 0.20333674550056458, + "learning_rate": 2.9648432843649382e-05, + "loss": 0.0114, + "step": 633 + }, + { + "epoch": 1.2216867469879518, + "grad_norm": 0.5680793523788452, + "learning_rate": 2.960912462262566e-05, + "loss": 0.0146, + "step": 634 + }, + { + "epoch": 1.2236144578313253, + "grad_norm": 0.4591079354286194, + "learning_rate": 2.9569768083805618e-05, + "loss": 0.0112, + "step": 635 + }, + { + "epoch": 1.2255421686746988, + "grad_norm": 0.3793511390686035, + "learning_rate": 2.953036342508671e-05, + "loss": 0.0377, + "step": 636 + }, + { + "epoch": 1.2274698795180723, + "grad_norm": 1.118723750114441, + "learning_rate": 2.9490910844608346e-05, + "loss": 0.0432, + "step": 637 + }, + { + "epoch": 1.2293975903614458, + "grad_norm": 0.36990776658058167, + "learning_rate": 2.9451410540750887e-05, + "loss": 0.0203, + "step": 638 + }, + { + "epoch": 1.2313253012048193, + "grad_norm": 0.930397629737854, + "learning_rate": 2.94118627121347e-05, + "loss": 0.0311, + "step": 639 + }, + { + "epoch": 1.2332530120481928, + "grad_norm": 0.2347625195980072, + "learning_rate": 2.9372267557619075e-05, + "loss": 0.0168, + "step": 640 + }, + { + "epoch": 1.2351807228915663, + "grad_norm": 0.3720332384109497, + "learning_rate": 2.933262527630131e-05, + "loss": 0.0136, + "step": 641 + }, + { + "epoch": 1.2371084337349398, + "grad_norm": 0.4871984124183655, + "learning_rate": 2.929293606751565e-05, + "loss": 0.0339, + "step": 642 + }, + { + "epoch": 1.2390361445783133, + "grad_norm": 0.35853689908981323, + "learning_rate": 2.9253200130832322e-05, + "loss": 0.0095, + "step": 643 + }, + { + "epoch": 1.2409638554216866, + "grad_norm": 0.42003703117370605, + "learning_rate": 2.92134176660565e-05, + "loss": 0.0142, + "step": 644 + }, + { + "epoch": 1.2428915662650604, + "grad_norm": 0.3854500651359558, + "learning_rate": 2.9173588873227338e-05, + "loss": 0.0209, + "step": 645 + }, + { + "epoch": 1.2448192771084337, + "grad_norm": 0.24665917456150055, + "learning_rate": 2.913371395261691e-05, + "loss": 0.0087, + "step": 646 + }, + { + "epoch": 1.2467469879518072, + "grad_norm": 0.41571593284606934, + "learning_rate": 2.9093793104729268e-05, + "loss": 0.0164, + "step": 647 + }, + { + "epoch": 1.2486746987951807, + "grad_norm": 0.4597891569137573, + "learning_rate": 2.9053826530299377e-05, + "loss": 0.0138, + "step": 648 + }, + { + "epoch": 1.2506024096385542, + "grad_norm": 0.43345385789871216, + "learning_rate": 2.901381443029215e-05, + "loss": 0.0353, + "step": 649 + }, + { + "epoch": 1.2525301204819277, + "grad_norm": 0.3706768751144409, + "learning_rate": 2.897375700590141e-05, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 1.2544578313253012, + "grad_norm": 0.30305296182632446, + "learning_rate": 2.8933654458548873e-05, + "loss": 0.0123, + "step": 651 + }, + { + "epoch": 1.2563855421686747, + "grad_norm": 0.2042127549648285, + "learning_rate": 2.8893506989883167e-05, + "loss": 0.0099, + "step": 652 + }, + { + "epoch": 1.2583132530120482, + "grad_norm": 0.20524422824382782, + "learning_rate": 2.8853314801778784e-05, + "loss": 0.0097, + "step": 653 + }, + { + "epoch": 1.2602409638554217, + "grad_norm": 0.2351921945810318, + "learning_rate": 2.8813078096335093e-05, + "loss": 0.0091, + "step": 654 + }, + { + "epoch": 1.2621686746987952, + "grad_norm": 0.34547340869903564, + "learning_rate": 2.87727970758753e-05, + "loss": 0.0088, + "step": 655 + }, + { + "epoch": 1.2640963855421687, + "grad_norm": 0.35163217782974243, + "learning_rate": 2.8732471942945443e-05, + "loss": 0.0145, + "step": 656 + }, + { + "epoch": 1.266024096385542, + "grad_norm": 1.715137243270874, + "learning_rate": 2.8692102900313378e-05, + "loss": 0.0198, + "step": 657 + }, + { + "epoch": 1.2679518072289158, + "grad_norm": 0.2860178053379059, + "learning_rate": 2.8651690150967748e-05, + "loss": 0.0085, + "step": 658 + }, + { + "epoch": 1.269879518072289, + "grad_norm": 0.21175967156887054, + "learning_rate": 2.8611233898116967e-05, + "loss": 0.0071, + "step": 659 + }, + { + "epoch": 1.2718072289156628, + "grad_norm": 0.33726972341537476, + "learning_rate": 2.85707343451882e-05, + "loss": 0.012, + "step": 660 + }, + { + "epoch": 1.273734939759036, + "grad_norm": 0.2138456553220749, + "learning_rate": 2.853019169582635e-05, + "loss": 0.0092, + "step": 661 + }, + { + "epoch": 1.2756626506024096, + "grad_norm": 0.2304934412240982, + "learning_rate": 2.8489606153892997e-05, + "loss": 0.0144, + "step": 662 + }, + { + "epoch": 1.277590361445783, + "grad_norm": 0.2691061794757843, + "learning_rate": 2.8448977923465425e-05, + "loss": 0.0121, + "step": 663 + }, + { + "epoch": 1.2795180722891566, + "grad_norm": 0.35254305601119995, + "learning_rate": 2.840830720883555e-05, + "loss": 0.0125, + "step": 664 + }, + { + "epoch": 1.28144578313253, + "grad_norm": 0.36552608013153076, + "learning_rate": 2.836759421450893e-05, + "loss": 0.021, + "step": 665 + }, + { + "epoch": 1.2833734939759036, + "grad_norm": 0.37177154421806335, + "learning_rate": 2.83268391452037e-05, + "loss": 0.0216, + "step": 666 + }, + { + "epoch": 1.2853012048192771, + "grad_norm": 0.20932547748088837, + "learning_rate": 2.828604220584958e-05, + "loss": 0.0077, + "step": 667 + }, + { + "epoch": 1.2872289156626506, + "grad_norm": 0.5158557295799255, + "learning_rate": 2.824520360158681e-05, + "loss": 0.0394, + "step": 668 + }, + { + "epoch": 1.2891566265060241, + "grad_norm": 0.22623969614505768, + "learning_rate": 2.820432353776515e-05, + "loss": 0.0087, + "step": 669 + }, + { + "epoch": 1.2910843373493976, + "grad_norm": 0.2996046245098114, + "learning_rate": 2.8163402219942822e-05, + "loss": 0.01, + "step": 670 + }, + { + "epoch": 1.2930120481927712, + "grad_norm": 0.24957989156246185, + "learning_rate": 2.8122439853885488e-05, + "loss": 0.0127, + "step": 671 + }, + { + "epoch": 1.2949397590361444, + "grad_norm": 0.2636559307575226, + "learning_rate": 2.8081436645565216e-05, + "loss": 0.0128, + "step": 672 + }, + { + "epoch": 1.2968674698795182, + "grad_norm": 0.3531591296195984, + "learning_rate": 2.804039280115944e-05, + "loss": 0.0199, + "step": 673 + }, + { + "epoch": 1.2987951807228915, + "grad_norm": 0.3682299852371216, + "learning_rate": 2.7999308527049927e-05, + "loss": 0.0088, + "step": 674 + }, + { + "epoch": 1.3007228915662652, + "grad_norm": 0.19555217027664185, + "learning_rate": 2.795818402982174e-05, + "loss": 0.0084, + "step": 675 + }, + { + "epoch": 1.3026506024096385, + "grad_norm": 0.2864912450313568, + "learning_rate": 2.7917019516262186e-05, + "loss": 0.0154, + "step": 676 + }, + { + "epoch": 1.304578313253012, + "grad_norm": 0.2211237996816635, + "learning_rate": 2.78758151933598e-05, + "loss": 0.0078, + "step": 677 + }, + { + "epoch": 1.3065060240963855, + "grad_norm": 0.13646945357322693, + "learning_rate": 2.7834571268303294e-05, + "loss": 0.0058, + "step": 678 + }, + { + "epoch": 1.308433734939759, + "grad_norm": 0.16530285775661469, + "learning_rate": 2.779328794848049e-05, + "loss": 0.007, + "step": 679 + }, + { + "epoch": 1.3103614457831325, + "grad_norm": 0.2145693302154541, + "learning_rate": 2.7751965441477325e-05, + "loss": 0.0203, + "step": 680 + }, + { + "epoch": 1.312289156626506, + "grad_norm": 0.24273739755153656, + "learning_rate": 2.771060395507677e-05, + "loss": 0.0106, + "step": 681 + }, + { + "epoch": 1.3142168674698795, + "grad_norm": 0.20430618524551392, + "learning_rate": 2.7669203697257794e-05, + "loss": 0.0122, + "step": 682 + }, + { + "epoch": 1.316144578313253, + "grad_norm": 0.2502615749835968, + "learning_rate": 2.7627764876194335e-05, + "loss": 0.0101, + "step": 683 + }, + { + "epoch": 1.3180722891566266, + "grad_norm": 0.287239670753479, + "learning_rate": 2.7586287700254214e-05, + "loss": 0.0203, + "step": 684 + }, + { + "epoch": 1.32, + "grad_norm": 0.16239754855632782, + "learning_rate": 2.7544772377998147e-05, + "loss": 0.0084, + "step": 685 + }, + { + "epoch": 1.3219277108433736, + "grad_norm": 0.27174142003059387, + "learning_rate": 2.7503219118178636e-05, + "loss": 0.008, + "step": 686 + }, + { + "epoch": 1.3238554216867469, + "grad_norm": 0.12878240644931793, + "learning_rate": 2.7461628129738954e-05, + "loss": 0.0053, + "step": 687 + }, + { + "epoch": 1.3257831325301206, + "grad_norm": 0.16112515330314636, + "learning_rate": 2.7419999621812086e-05, + "loss": 0.0059, + "step": 688 + }, + { + "epoch": 1.3277108433734939, + "grad_norm": 0.2398834228515625, + "learning_rate": 2.7378333803719672e-05, + "loss": 0.0095, + "step": 689 + }, + { + "epoch": 1.3296385542168676, + "grad_norm": 0.18516193330287933, + "learning_rate": 2.733663088497097e-05, + "loss": 0.0071, + "step": 690 + }, + { + "epoch": 1.331566265060241, + "grad_norm": 0.2974924147129059, + "learning_rate": 2.7294891075261785e-05, + "loss": 0.0227, + "step": 691 + }, + { + "epoch": 1.3334939759036144, + "grad_norm": 0.12931054830551147, + "learning_rate": 2.7253114584473418e-05, + "loss": 0.0039, + "step": 692 + }, + { + "epoch": 1.335421686746988, + "grad_norm": 0.16319474577903748, + "learning_rate": 2.7211301622671623e-05, + "loss": 0.008, + "step": 693 + }, + { + "epoch": 1.3373493975903614, + "grad_norm": 0.27622169256210327, + "learning_rate": 2.7169452400105533e-05, + "loss": 0.0238, + "step": 694 + }, + { + "epoch": 1.339277108433735, + "grad_norm": 0.45309779047966003, + "learning_rate": 2.712756712720663e-05, + "loss": 0.0439, + "step": 695 + }, + { + "epoch": 1.3412048192771084, + "grad_norm": 0.2469855099916458, + "learning_rate": 2.708564601458765e-05, + "loss": 0.0085, + "step": 696 + }, + { + "epoch": 1.343132530120482, + "grad_norm": 0.4245856702327728, + "learning_rate": 2.7043689273041535e-05, + "loss": 0.0097, + "step": 697 + }, + { + "epoch": 1.3450602409638555, + "grad_norm": 0.26796087622642517, + "learning_rate": 2.7001697113540414e-05, + "loss": 0.0119, + "step": 698 + }, + { + "epoch": 1.346987951807229, + "grad_norm": 0.3569283187389374, + "learning_rate": 2.6959669747234482e-05, + "loss": 0.0096, + "step": 699 + }, + { + "epoch": 1.3489156626506025, + "grad_norm": 0.7038524150848389, + "learning_rate": 2.6917607385450973e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 1.350843373493976, + "grad_norm": 0.23568563163280487, + "learning_rate": 2.687551023969308e-05, + "loss": 0.0112, + "step": 701 + }, + { + "epoch": 1.3527710843373493, + "grad_norm": 0.20338499546051025, + "learning_rate": 2.6833378521638935e-05, + "loss": 0.0092, + "step": 702 + }, + { + "epoch": 1.354698795180723, + "grad_norm": 4.22187614440918, + "learning_rate": 2.679121244314046e-05, + "loss": 0.0314, + "step": 703 + }, + { + "epoch": 1.3566265060240963, + "grad_norm": 0.2542206048965454, + "learning_rate": 2.674901221622239e-05, + "loss": 0.0158, + "step": 704 + }, + { + "epoch": 1.3585542168674698, + "grad_norm": 0.49705010652542114, + "learning_rate": 2.670677805308116e-05, + "loss": 0.0162, + "step": 705 + }, + { + "epoch": 1.3604819277108433, + "grad_norm": 0.17502115666866302, + "learning_rate": 2.666451016608383e-05, + "loss": 0.0074, + "step": 706 + }, + { + "epoch": 1.3624096385542168, + "grad_norm": 0.21738742291927338, + "learning_rate": 2.6622208767767075e-05, + "loss": 0.0135, + "step": 707 + }, + { + "epoch": 1.3643373493975903, + "grad_norm": 0.3309847414493561, + "learning_rate": 2.6579874070836032e-05, + "loss": 0.0107, + "step": 708 + }, + { + "epoch": 1.3662650602409638, + "grad_norm": 0.10706827789545059, + "learning_rate": 2.6537506288163303e-05, + "loss": 0.0043, + "step": 709 + }, + { + "epoch": 1.3681927710843373, + "grad_norm": 0.173640176653862, + "learning_rate": 2.6495105632787835e-05, + "loss": 0.0092, + "step": 710 + }, + { + "epoch": 1.3701204819277109, + "grad_norm": 0.2636397182941437, + "learning_rate": 2.6452672317913893e-05, + "loss": 0.0097, + "step": 711 + }, + { + "epoch": 1.3720481927710844, + "grad_norm": 0.28485360741615295, + "learning_rate": 2.6410206556909943e-05, + "loss": 0.0193, + "step": 712 + }, + { + "epoch": 1.3739759036144579, + "grad_norm": 0.23210027813911438, + "learning_rate": 2.636770856330761e-05, + "loss": 0.0229, + "step": 713 + }, + { + "epoch": 1.3759036144578314, + "grad_norm": 0.13388316333293915, + "learning_rate": 2.6325178550800596e-05, + "loss": 0.004, + "step": 714 + }, + { + "epoch": 1.377831325301205, + "grad_norm": 0.5131422877311707, + "learning_rate": 2.6282616733243603e-05, + "loss": 0.0137, + "step": 715 + }, + { + "epoch": 1.3797590361445784, + "grad_norm": 0.3243267834186554, + "learning_rate": 2.6240023324651258e-05, + "loss": 0.0153, + "step": 716 + }, + { + "epoch": 1.3816867469879517, + "grad_norm": 0.1440611034631729, + "learning_rate": 2.619739853919704e-05, + "loss": 0.0031, + "step": 717 + }, + { + "epoch": 1.3836144578313254, + "grad_norm": 0.30346596240997314, + "learning_rate": 2.6154742591212196e-05, + "loss": 0.0109, + "step": 718 + }, + { + "epoch": 1.3855421686746987, + "grad_norm": 0.19109240174293518, + "learning_rate": 2.611205569518468e-05, + "loss": 0.0094, + "step": 719 + }, + { + "epoch": 1.3874698795180722, + "grad_norm": 0.28636518120765686, + "learning_rate": 2.6069338065758056e-05, + "loss": 0.0123, + "step": 720 + }, + { + "epoch": 1.3893975903614457, + "grad_norm": 0.28083911538124084, + "learning_rate": 2.6026589917730416e-05, + "loss": 0.0104, + "step": 721 + }, + { + "epoch": 1.3913253012048192, + "grad_norm": 0.36553966999053955, + "learning_rate": 2.5983811466053327e-05, + "loss": 0.0143, + "step": 722 + }, + { + "epoch": 1.3932530120481927, + "grad_norm": 0.23317205905914307, + "learning_rate": 2.5941002925830708e-05, + "loss": 0.011, + "step": 723 + }, + { + "epoch": 1.3951807228915662, + "grad_norm": 0.3825171887874603, + "learning_rate": 2.589816451231781e-05, + "loss": 0.0098, + "step": 724 + }, + { + "epoch": 1.3971084337349398, + "grad_norm": 0.19916608929634094, + "learning_rate": 2.585529644092006e-05, + "loss": 0.0094, + "step": 725 + }, + { + "epoch": 1.3990361445783133, + "grad_norm": 0.19990523159503937, + "learning_rate": 2.5812398927192027e-05, + "loss": 0.0128, + "step": 726 + }, + { + "epoch": 1.4009638554216868, + "grad_norm": 0.34662899374961853, + "learning_rate": 2.5769472186836347e-05, + "loss": 0.0091, + "step": 727 + }, + { + "epoch": 1.4028915662650603, + "grad_norm": 0.23481112718582153, + "learning_rate": 2.5726516435702583e-05, + "loss": 0.0154, + "step": 728 + }, + { + "epoch": 1.4048192771084338, + "grad_norm": 0.1846667379140854, + "learning_rate": 2.5683531889786194e-05, + "loss": 0.0088, + "step": 729 + }, + { + "epoch": 1.4067469879518073, + "grad_norm": 0.16717663407325745, + "learning_rate": 2.564051876522742e-05, + "loss": 0.0083, + "step": 730 + }, + { + "epoch": 1.4086746987951808, + "grad_norm": 0.4116475284099579, + "learning_rate": 2.5597477278310202e-05, + "loss": 0.0179, + "step": 731 + }, + { + "epoch": 1.410602409638554, + "grad_norm": 0.171807661652565, + "learning_rate": 2.5554407645461115e-05, + "loss": 0.0063, + "step": 732 + }, + { + "epoch": 1.4125301204819278, + "grad_norm": 0.1954439878463745, + "learning_rate": 2.5511310083248243e-05, + "loss": 0.017, + "step": 733 + }, + { + "epoch": 1.4144578313253011, + "grad_norm": 0.37158989906311035, + "learning_rate": 2.5468184808380104e-05, + "loss": 0.0173, + "step": 734 + }, + { + "epoch": 1.4163855421686746, + "grad_norm": 0.2001633644104004, + "learning_rate": 2.542503203770458e-05, + "loss": 0.0165, + "step": 735 + }, + { + "epoch": 1.4183132530120481, + "grad_norm": 0.45673373341560364, + "learning_rate": 2.53818519882078e-05, + "loss": 0.0185, + "step": 736 + }, + { + "epoch": 1.4202409638554216, + "grad_norm": 0.3838701546192169, + "learning_rate": 2.5338644877013067e-05, + "loss": 0.0134, + "step": 737 + }, + { + "epoch": 1.4221686746987952, + "grad_norm": 0.32032477855682373, + "learning_rate": 2.5295410921379745e-05, + "loss": 0.0143, + "step": 738 + }, + { + "epoch": 1.4240963855421687, + "grad_norm": 0.4594039022922516, + "learning_rate": 2.52521503387022e-05, + "loss": 0.0193, + "step": 739 + }, + { + "epoch": 1.4260240963855422, + "grad_norm": 0.3889620900154114, + "learning_rate": 2.5208863346508667e-05, + "loss": 0.0114, + "step": 740 + }, + { + "epoch": 1.4279518072289157, + "grad_norm": 0.33153319358825684, + "learning_rate": 2.5165550162460203e-05, + "loss": 0.0102, + "step": 741 + }, + { + "epoch": 1.4298795180722892, + "grad_norm": 0.7269518375396729, + "learning_rate": 2.5122211004349536e-05, + "loss": 0.0215, + "step": 742 + }, + { + "epoch": 1.4318072289156627, + "grad_norm": 0.31653261184692383, + "learning_rate": 2.5078846090100023e-05, + "loss": 0.0115, + "step": 743 + }, + { + "epoch": 1.4337349397590362, + "grad_norm": 0.20620353519916534, + "learning_rate": 2.5035455637764518e-05, + "loss": 0.0153, + "step": 744 + }, + { + "epoch": 1.4356626506024097, + "grad_norm": 0.17266008257865906, + "learning_rate": 2.4992039865524297e-05, + "loss": 0.0069, + "step": 745 + }, + { + "epoch": 1.4375903614457832, + "grad_norm": 0.24760811030864716, + "learning_rate": 2.494859899168795e-05, + "loss": 0.0108, + "step": 746 + }, + { + "epoch": 1.4395180722891565, + "grad_norm": 0.2584865391254425, + "learning_rate": 2.4905133234690282e-05, + "loss": 0.0095, + "step": 747 + }, + { + "epoch": 1.4414457831325302, + "grad_norm": 0.48847514390945435, + "learning_rate": 2.486164281309122e-05, + "loss": 0.0181, + "step": 748 + }, + { + "epoch": 1.4433734939759035, + "grad_norm": 0.42942047119140625, + "learning_rate": 2.4818127945574717e-05, + "loss": 0.025, + "step": 749 + }, + { + "epoch": 1.445301204819277, + "grad_norm": 0.23713800311088562, + "learning_rate": 2.4774588850947648e-05, + "loss": 0.0085, + "step": 750 + }, + { + "epoch": 1.4472289156626506, + "grad_norm": 0.8797569870948792, + "learning_rate": 2.473102574813871e-05, + "loss": 0.0097, + "step": 751 + }, + { + "epoch": 1.449156626506024, + "grad_norm": 0.2744862735271454, + "learning_rate": 2.4687438856197302e-05, + "loss": 0.0122, + "step": 752 + }, + { + "epoch": 1.4510843373493976, + "grad_norm": 0.12747010588645935, + "learning_rate": 2.4643828394292478e-05, + "loss": 0.0056, + "step": 753 + }, + { + "epoch": 1.453012048192771, + "grad_norm": 0.37376829981803894, + "learning_rate": 2.4600194581711775e-05, + "loss": 0.0052, + "step": 754 + }, + { + "epoch": 1.4549397590361446, + "grad_norm": 0.2536911368370056, + "learning_rate": 2.4556537637860176e-05, + "loss": 0.0113, + "step": 755 + }, + { + "epoch": 1.456867469879518, + "grad_norm": 0.25950780510902405, + "learning_rate": 2.451285778225894e-05, + "loss": 0.0099, + "step": 756 + }, + { + "epoch": 1.4587951807228916, + "grad_norm": 0.19535955786705017, + "learning_rate": 2.4469155234544565e-05, + "loss": 0.0069, + "step": 757 + }, + { + "epoch": 1.4607228915662651, + "grad_norm": 0.22816115617752075, + "learning_rate": 2.442543021446764e-05, + "loss": 0.0088, + "step": 758 + }, + { + "epoch": 1.4626506024096386, + "grad_norm": 0.3363986313343048, + "learning_rate": 2.4381682941891755e-05, + "loss": 0.0182, + "step": 759 + }, + { + "epoch": 1.464578313253012, + "grad_norm": 0.21492891013622284, + "learning_rate": 2.4337913636792382e-05, + "loss": 0.0069, + "step": 760 + }, + { + "epoch": 1.4665060240963856, + "grad_norm": 0.6070862412452698, + "learning_rate": 2.429412251925579e-05, + "loss": 0.0406, + "step": 761 + }, + { + "epoch": 1.468433734939759, + "grad_norm": 2.6469690799713135, + "learning_rate": 2.425030980947793e-05, + "loss": 0.0205, + "step": 762 + }, + { + "epoch": 1.4703614457831327, + "grad_norm": 0.30909740924835205, + "learning_rate": 2.420647572776332e-05, + "loss": 0.0136, + "step": 763 + }, + { + "epoch": 1.472289156626506, + "grad_norm": 0.6639553904533386, + "learning_rate": 2.416262049452395e-05, + "loss": 0.011, + "step": 764 + }, + { + "epoch": 1.4742168674698795, + "grad_norm": 0.2919616997241974, + "learning_rate": 2.4118744330278147e-05, + "loss": 0.0131, + "step": 765 + }, + { + "epoch": 1.476144578313253, + "grad_norm": 0.5232429504394531, + "learning_rate": 2.4074847455649523e-05, + "loss": 0.0138, + "step": 766 + }, + { + "epoch": 1.4780722891566265, + "grad_norm": 5.630630970001221, + "learning_rate": 2.403093009136579e-05, + "loss": 0.0264, + "step": 767 + }, + { + "epoch": 1.48, + "grad_norm": 0.33234721422195435, + "learning_rate": 2.3986992458257707e-05, + "loss": 0.0111, + "step": 768 + }, + { + "epoch": 1.4819277108433735, + "grad_norm": 0.28444772958755493, + "learning_rate": 2.3943034777257945e-05, + "loss": 0.0144, + "step": 769 + }, + { + "epoch": 1.483855421686747, + "grad_norm": 0.16229979693889618, + "learning_rate": 2.38990572694e-05, + "loss": 0.0062, + "step": 770 + }, + { + "epoch": 1.4857831325301205, + "grad_norm": 0.27474716305732727, + "learning_rate": 2.385506015581704e-05, + "loss": 0.0172, + "step": 771 + }, + { + "epoch": 1.487710843373494, + "grad_norm": 0.246526300907135, + "learning_rate": 2.381104365774083e-05, + "loss": 0.012, + "step": 772 + }, + { + "epoch": 1.4896385542168675, + "grad_norm": 0.282047837972641, + "learning_rate": 2.37670079965006e-05, + "loss": 0.0116, + "step": 773 + }, + { + "epoch": 1.491566265060241, + "grad_norm": 0.2878139317035675, + "learning_rate": 2.3722953393521944e-05, + "loss": 0.0147, + "step": 774 + }, + { + "epoch": 1.4934939759036143, + "grad_norm": 0.5586277842521667, + "learning_rate": 2.367888007032571e-05, + "loss": 0.0111, + "step": 775 + }, + { + "epoch": 1.495421686746988, + "grad_norm": 0.562160313129425, + "learning_rate": 2.3634788248526846e-05, + "loss": 0.0061, + "step": 776 + }, + { + "epoch": 1.4973493975903613, + "grad_norm": 0.3452005982398987, + "learning_rate": 2.3590678149833356e-05, + "loss": 0.0205, + "step": 777 + }, + { + "epoch": 1.499277108433735, + "grad_norm": 0.7757686376571655, + "learning_rate": 2.3546549996045114e-05, + "loss": 0.0273, + "step": 778 + }, + { + "epoch": 1.5012048192771084, + "grad_norm": 0.19530551135540009, + "learning_rate": 2.3502404009052812e-05, + "loss": 0.0083, + "step": 779 + }, + { + "epoch": 1.503132530120482, + "grad_norm": 0.2586531639099121, + "learning_rate": 2.3458240410836775e-05, + "loss": 0.0122, + "step": 780 + }, + { + "epoch": 1.5050602409638554, + "grad_norm": 0.30063286423683167, + "learning_rate": 2.3414059423465924e-05, + "loss": 0.0083, + "step": 781 + }, + { + "epoch": 1.5069879518072289, + "grad_norm": 0.18663185834884644, + "learning_rate": 2.3369861269096575e-05, + "loss": 0.0104, + "step": 782 + }, + { + "epoch": 1.5089156626506024, + "grad_norm": 0.4405941069126129, + "learning_rate": 2.3325646169971416e-05, + "loss": 0.0264, + "step": 783 + }, + { + "epoch": 1.510843373493976, + "grad_norm": 0.2947913110256195, + "learning_rate": 2.3281414348418294e-05, + "loss": 0.0107, + "step": 784 + }, + { + "epoch": 1.5127710843373494, + "grad_norm": 0.23813778162002563, + "learning_rate": 2.3237166026849158e-05, + "loss": 0.0084, + "step": 785 + }, + { + "epoch": 1.514698795180723, + "grad_norm": 0.33380329608917236, + "learning_rate": 2.3192901427758932e-05, + "loss": 0.0111, + "step": 786 + }, + { + "epoch": 1.5166265060240964, + "grad_norm": 0.3736988306045532, + "learning_rate": 2.314862077372438e-05, + "loss": 0.0135, + "step": 787 + }, + { + "epoch": 1.5185542168674697, + "grad_norm": 0.3785395920276642, + "learning_rate": 2.3104324287402996e-05, + "loss": 0.0265, + "step": 788 + }, + { + "epoch": 1.5204819277108435, + "grad_norm": 0.3359154462814331, + "learning_rate": 2.3060012191531885e-05, + "loss": 0.0127, + "step": 789 + }, + { + "epoch": 1.5224096385542167, + "grad_norm": 0.720753014087677, + "learning_rate": 2.301568470892664e-05, + "loss": 0.0134, + "step": 790 + }, + { + "epoch": 1.5243373493975905, + "grad_norm": 0.36473193764686584, + "learning_rate": 2.297134206248024e-05, + "loss": 0.0318, + "step": 791 + }, + { + "epoch": 1.5262650602409638, + "grad_norm": 0.29987087845802307, + "learning_rate": 2.2926984475161884e-05, + "loss": 0.008, + "step": 792 + }, + { + "epoch": 1.5281927710843375, + "grad_norm": 0.2883112132549286, + "learning_rate": 2.2882612170015914e-05, + "loss": 0.0125, + "step": 793 + }, + { + "epoch": 1.5301204819277108, + "grad_norm": 0.28983229398727417, + "learning_rate": 2.2838225370160682e-05, + "loss": 0.0155, + "step": 794 + }, + { + "epoch": 1.5320481927710843, + "grad_norm": 0.47236886620521545, + "learning_rate": 2.2793824298787414e-05, + "loss": 0.0132, + "step": 795 + }, + { + "epoch": 1.5339759036144578, + "grad_norm": 0.8328865170478821, + "learning_rate": 2.2749409179159104e-05, + "loss": 0.026, + "step": 796 + }, + { + "epoch": 1.5359036144578313, + "grad_norm": 0.3129172623157501, + "learning_rate": 2.2704980234609396e-05, + "loss": 0.0099, + "step": 797 + }, + { + "epoch": 1.5378313253012048, + "grad_norm": 0.22284500300884247, + "learning_rate": 2.2660537688541416e-05, + "loss": 0.009, + "step": 798 + }, + { + "epoch": 1.5397590361445783, + "grad_norm": 0.3346405625343323, + "learning_rate": 2.2616081764426726e-05, + "loss": 0.0077, + "step": 799 + }, + { + "epoch": 1.5416867469879518, + "grad_norm": 0.2923565208911896, + "learning_rate": 2.2571612685804124e-05, + "loss": 0.0119, + "step": 800 + }, + { + "epoch": 1.5436144578313253, + "grad_norm": 0.1921311914920807, + "learning_rate": 2.252713067627857e-05, + "loss": 0.0083, + "step": 801 + }, + { + "epoch": 1.5455421686746988, + "grad_norm": 0.23221106827259064, + "learning_rate": 2.2482635959520044e-05, + "loss": 0.0049, + "step": 802 + }, + { + "epoch": 1.5474698795180721, + "grad_norm": 0.6340724229812622, + "learning_rate": 2.243812875926241e-05, + "loss": 0.0273, + "step": 803 + }, + { + "epoch": 1.5493975903614459, + "grad_norm": 0.2699439823627472, + "learning_rate": 2.2393609299302314e-05, + "loss": 0.0108, + "step": 804 + }, + { + "epoch": 1.5513253012048192, + "grad_norm": 0.2005189210176468, + "learning_rate": 2.2349077803498052e-05, + "loss": 0.0076, + "step": 805 + }, + { + "epoch": 1.5532530120481929, + "grad_norm": 0.39668548107147217, + "learning_rate": 2.230453449576842e-05, + "loss": 0.0135, + "step": 806 + }, + { + "epoch": 1.5551807228915662, + "grad_norm": 0.2406950294971466, + "learning_rate": 2.2259979600091635e-05, + "loss": 0.0094, + "step": 807 + }, + { + "epoch": 1.55710843373494, + "grad_norm": 0.30363157391548157, + "learning_rate": 2.2215413340504158e-05, + "loss": 0.0178, + "step": 808 + }, + { + "epoch": 1.5590361445783132, + "grad_norm": 0.19508181512355804, + "learning_rate": 2.2170835941099605e-05, + "loss": 0.0069, + "step": 809 + }, + { + "epoch": 1.5609638554216867, + "grad_norm": 0.734106719493866, + "learning_rate": 2.2126247626027615e-05, + "loss": 0.0319, + "step": 810 + }, + { + "epoch": 1.5628915662650602, + "grad_norm": 0.2591583728790283, + "learning_rate": 2.208164861949268e-05, + "loss": 0.0168, + "step": 811 + }, + { + "epoch": 1.5648192771084337, + "grad_norm": 0.2386734038591385, + "learning_rate": 2.20370391457531e-05, + "loss": 0.0041, + "step": 812 + }, + { + "epoch": 1.5667469879518072, + "grad_norm": 0.1675218939781189, + "learning_rate": 2.1992419429119764e-05, + "loss": 0.0078, + "step": 813 + }, + { + "epoch": 1.5686746987951807, + "grad_norm": 0.45591506361961365, + "learning_rate": 2.1947789693955097e-05, + "loss": 0.0166, + "step": 814 + }, + { + "epoch": 1.5706024096385542, + "grad_norm": 0.46940621733665466, + "learning_rate": 2.190315016467188e-05, + "loss": 0.0176, + "step": 815 + }, + { + "epoch": 1.5725301204819278, + "grad_norm": 0.2294205278158188, + "learning_rate": 2.1858501065732146e-05, + "loss": 0.0102, + "step": 816 + }, + { + "epoch": 1.5744578313253013, + "grad_norm": 0.28922322392463684, + "learning_rate": 2.181384262164606e-05, + "loss": 0.0111, + "step": 817 + }, + { + "epoch": 1.5763855421686745, + "grad_norm": 0.19650064408779144, + "learning_rate": 2.1769175056970765e-05, + "loss": 0.0076, + "step": 818 + }, + { + "epoch": 1.5783132530120483, + "grad_norm": 0.19538825750350952, + "learning_rate": 2.172449859630927e-05, + "loss": 0.0118, + "step": 819 + }, + { + "epoch": 1.5802409638554216, + "grad_norm": 0.1900389939546585, + "learning_rate": 2.167981346430931e-05, + "loss": 0.0066, + "step": 820 + }, + { + "epoch": 1.5821686746987953, + "grad_norm": 0.21593710780143738, + "learning_rate": 2.1635119885662235e-05, + "loss": 0.0101, + "step": 821 + }, + { + "epoch": 1.5840963855421686, + "grad_norm": 0.2699289321899414, + "learning_rate": 2.159041808510185e-05, + "loss": 0.0118, + "step": 822 + }, + { + "epoch": 1.5860240963855423, + "grad_norm": 0.31867673993110657, + "learning_rate": 2.1545708287403322e-05, + "loss": 0.0122, + "step": 823 + }, + { + "epoch": 1.5879518072289156, + "grad_norm": 0.2862400412559509, + "learning_rate": 2.1500990717382004e-05, + "loss": 0.0216, + "step": 824 + }, + { + "epoch": 1.589879518072289, + "grad_norm": 0.28482481837272644, + "learning_rate": 2.145626559989237e-05, + "loss": 0.0136, + "step": 825 + }, + { + "epoch": 1.5918072289156626, + "grad_norm": 0.2866958975791931, + "learning_rate": 2.1411533159826803e-05, + "loss": 0.0298, + "step": 826 + }, + { + "epoch": 1.5937349397590361, + "grad_norm": 0.39092838764190674, + "learning_rate": 2.1366793622114533e-05, + "loss": 0.0382, + "step": 827 + }, + { + "epoch": 1.5956626506024096, + "grad_norm": 0.16381537914276123, + "learning_rate": 2.1322047211720468e-05, + "loss": 0.0074, + "step": 828 + }, + { + "epoch": 1.5975903614457831, + "grad_norm": 0.22146940231323242, + "learning_rate": 2.1277294153644083e-05, + "loss": 0.0103, + "step": 829 + }, + { + "epoch": 1.5995180722891567, + "grad_norm": 0.2155209183692932, + "learning_rate": 2.123253467291827e-05, + "loss": 0.0095, + "step": 830 + }, + { + "epoch": 1.6014457831325302, + "grad_norm": 0.41510409116744995, + "learning_rate": 2.118776899460822e-05, + "loss": 0.0457, + "step": 831 + }, + { + "epoch": 1.6033734939759037, + "grad_norm": 0.19718150794506073, + "learning_rate": 2.1142997343810293e-05, + "loss": 0.0192, + "step": 832 + }, + { + "epoch": 1.605301204819277, + "grad_norm": 0.40924403071403503, + "learning_rate": 2.1098219945650865e-05, + "loss": 0.0278, + "step": 833 + }, + { + "epoch": 1.6072289156626507, + "grad_norm": 0.18657824397087097, + "learning_rate": 2.105343702528524e-05, + "loss": 0.0076, + "step": 834 + }, + { + "epoch": 1.609156626506024, + "grad_norm": 0.1727641075849533, + "learning_rate": 2.100864880789645e-05, + "loss": 0.0076, + "step": 835 + }, + { + "epoch": 1.6110843373493977, + "grad_norm": 0.18138745427131653, + "learning_rate": 2.0963855518694203e-05, + "loss": 0.005, + "step": 836 + }, + { + "epoch": 1.613012048192771, + "grad_norm": 0.19173955917358398, + "learning_rate": 2.0919057382913675e-05, + "loss": 0.0084, + "step": 837 + }, + { + "epoch": 1.6149397590361447, + "grad_norm": 0.3812403380870819, + "learning_rate": 2.0874254625814435e-05, + "loss": 0.009, + "step": 838 + }, + { + "epoch": 1.616867469879518, + "grad_norm": 0.2009759545326233, + "learning_rate": 2.0829447472679285e-05, + "loss": 0.0098, + "step": 839 + }, + { + "epoch": 1.6187951807228915, + "grad_norm": 0.48703446984291077, + "learning_rate": 2.0784636148813124e-05, + "loss": 0.0099, + "step": 840 + }, + { + "epoch": 1.620722891566265, + "grad_norm": 0.28995075821876526, + "learning_rate": 2.0739820879541827e-05, + "loss": 0.0075, + "step": 841 + }, + { + "epoch": 1.6226506024096385, + "grad_norm": 0.2130059450864792, + "learning_rate": 2.069500189021111e-05, + "loss": 0.007, + "step": 842 + }, + { + "epoch": 1.624578313253012, + "grad_norm": 0.252524733543396, + "learning_rate": 2.0650179406185397e-05, + "loss": 0.0249, + "step": 843 + }, + { + "epoch": 1.6265060240963856, + "grad_norm": 0.23069098591804504, + "learning_rate": 2.060535365284668e-05, + "loss": 0.0084, + "step": 844 + }, + { + "epoch": 1.628433734939759, + "grad_norm": 0.25051403045654297, + "learning_rate": 2.056052485559338e-05, + "loss": 0.0071, + "step": 845 + }, + { + "epoch": 1.6303614457831326, + "grad_norm": 0.27664798498153687, + "learning_rate": 2.051569323983924e-05, + "loss": 0.0198, + "step": 846 + }, + { + "epoch": 1.632289156626506, + "grad_norm": 0.2954922318458557, + "learning_rate": 2.047085903101218e-05, + "loss": 0.006, + "step": 847 + }, + { + "epoch": 1.6342168674698794, + "grad_norm": 0.28477591276168823, + "learning_rate": 2.0426022454553137e-05, + "loss": 0.0147, + "step": 848 + }, + { + "epoch": 1.636144578313253, + "grad_norm": 0.2785305678844452, + "learning_rate": 2.0381183735914968e-05, + "loss": 0.0117, + "step": 849 + }, + { + "epoch": 1.6380722891566264, + "grad_norm": 0.2500309348106384, + "learning_rate": 2.0336343100561295e-05, + "loss": 0.008, + "step": 850 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.18932047486305237, + "learning_rate": 2.0291500773965392e-05, + "loss": 0.0256, + "step": 851 + }, + { + "epoch": 1.6419277108433734, + "grad_norm": 0.6396257877349854, + "learning_rate": 2.0246656981609013e-05, + "loss": 0.0141, + "step": 852 + }, + { + "epoch": 1.6438554216867471, + "grad_norm": 0.5072891116142273, + "learning_rate": 2.02018119489813e-05, + "loss": 0.008, + "step": 853 + }, + { + "epoch": 1.6457831325301204, + "grad_norm": 0.2920839488506317, + "learning_rate": 2.0156965901577635e-05, + "loss": 0.0085, + "step": 854 + }, + { + "epoch": 1.647710843373494, + "grad_norm": 0.1391262263059616, + "learning_rate": 2.011211906489848e-05, + "loss": 0.0078, + "step": 855 + }, + { + "epoch": 1.6496385542168674, + "grad_norm": 0.29620468616485596, + "learning_rate": 2.00672716644483e-05, + "loss": 0.0109, + "step": 856 + }, + { + "epoch": 1.651566265060241, + "grad_norm": 0.13946573436260223, + "learning_rate": 2.002242392573436e-05, + "loss": 0.0076, + "step": 857 + }, + { + "epoch": 1.6534939759036145, + "grad_norm": 0.9766128659248352, + "learning_rate": 1.997757607426565e-05, + "loss": 0.0309, + "step": 858 + }, + { + "epoch": 1.655421686746988, + "grad_norm": 0.18002203106880188, + "learning_rate": 1.9932728335551702e-05, + "loss": 0.0072, + "step": 859 + }, + { + "epoch": 1.6573493975903615, + "grad_norm": 0.28073111176490784, + "learning_rate": 1.988788093510152e-05, + "loss": 0.0246, + "step": 860 + }, + { + "epoch": 1.659277108433735, + "grad_norm": 0.1919957399368286, + "learning_rate": 1.9843034098422375e-05, + "loss": 0.0087, + "step": 861 + }, + { + "epoch": 1.6612048192771085, + "grad_norm": 0.1825258433818817, + "learning_rate": 1.9798188051018705e-05, + "loss": 0.0092, + "step": 862 + }, + { + "epoch": 1.6631325301204818, + "grad_norm": 0.32412952184677124, + "learning_rate": 1.9753343018390997e-05, + "loss": 0.0118, + "step": 863 + }, + { + "epoch": 1.6650602409638555, + "grad_norm": 0.12828563153743744, + "learning_rate": 1.9708499226034618e-05, + "loss": 0.0056, + "step": 864 + }, + { + "epoch": 1.6669879518072288, + "grad_norm": 0.18647560477256775, + "learning_rate": 1.966365689943871e-05, + "loss": 0.0094, + "step": 865 + }, + { + "epoch": 1.6689156626506025, + "grad_norm": 0.19835828244686127, + "learning_rate": 1.9618816264085042e-05, + "loss": 0.0097, + "step": 866 + }, + { + "epoch": 1.6708433734939758, + "grad_norm": 0.22364282608032227, + "learning_rate": 1.957397754544687e-05, + "loss": 0.0062, + "step": 867 + }, + { + "epoch": 1.6727710843373496, + "grad_norm": 0.29420018196105957, + "learning_rate": 1.952914096898783e-05, + "loss": 0.0182, + "step": 868 + }, + { + "epoch": 1.6746987951807228, + "grad_norm": 0.2149929702281952, + "learning_rate": 1.9484306760160766e-05, + "loss": 0.0125, + "step": 869 + }, + { + "epoch": 1.6766265060240964, + "grad_norm": 0.16844330728054047, + "learning_rate": 1.9439475144406623e-05, + "loss": 0.0074, + "step": 870 + }, + { + "epoch": 1.6785542168674699, + "grad_norm": 0.5010282397270203, + "learning_rate": 1.9394646347153334e-05, + "loss": 0.0213, + "step": 871 + }, + { + "epoch": 1.6804819277108434, + "grad_norm": 0.29847195744514465, + "learning_rate": 1.9349820593814606e-05, + "loss": 0.0173, + "step": 872 + }, + { + "epoch": 1.6824096385542169, + "grad_norm": 0.23835812509059906, + "learning_rate": 1.930499810978889e-05, + "loss": 0.011, + "step": 873 + }, + { + "epoch": 1.6843373493975904, + "grad_norm": 0.3269020617008209, + "learning_rate": 1.9260179120458177e-05, + "loss": 0.0285, + "step": 874 + }, + { + "epoch": 1.686265060240964, + "grad_norm": 0.2142144739627838, + "learning_rate": 1.9215363851186883e-05, + "loss": 0.0146, + "step": 875 + }, + { + "epoch": 1.6881927710843372, + "grad_norm": 0.3098377585411072, + "learning_rate": 1.9170552527320725e-05, + "loss": 0.0104, + "step": 876 + }, + { + "epoch": 1.690120481927711, + "grad_norm": 0.22504115104675293, + "learning_rate": 1.9125745374185568e-05, + "loss": 0.0091, + "step": 877 + }, + { + "epoch": 1.6920481927710842, + "grad_norm": 0.20633333921432495, + "learning_rate": 1.908094261708633e-05, + "loss": 0.0097, + "step": 878 + }, + { + "epoch": 1.693975903614458, + "grad_norm": 1.179566502571106, + "learning_rate": 1.9036144481305807e-05, + "loss": 0.0143, + "step": 879 + }, + { + "epoch": 1.6959036144578312, + "grad_norm": 0.15525613725185394, + "learning_rate": 1.8991351192103554e-05, + "loss": 0.0062, + "step": 880 + }, + { + "epoch": 1.697831325301205, + "grad_norm": 0.15966367721557617, + "learning_rate": 1.8946562974714763e-05, + "loss": 0.0048, + "step": 881 + }, + { + "epoch": 1.6997590361445782, + "grad_norm": 0.18902607262134552, + "learning_rate": 1.890178005434914e-05, + "loss": 0.0124, + "step": 882 + }, + { + "epoch": 1.701686746987952, + "grad_norm": 0.21692413091659546, + "learning_rate": 1.885700265618971e-05, + "loss": 0.0135, + "step": 883 + }, + { + "epoch": 1.7036144578313253, + "grad_norm": 0.38948455452919006, + "learning_rate": 1.8812231005391786e-05, + "loss": 0.0365, + "step": 884 + }, + { + "epoch": 1.7055421686746988, + "grad_norm": 0.2483491599559784, + "learning_rate": 1.8767465327081736e-05, + "loss": 0.0202, + "step": 885 + }, + { + "epoch": 1.7074698795180723, + "grad_norm": 0.15305832028388977, + "learning_rate": 1.872270584635592e-05, + "loss": 0.0035, + "step": 886 + }, + { + "epoch": 1.7093975903614458, + "grad_norm": 0.17794466018676758, + "learning_rate": 1.867795278827954e-05, + "loss": 0.0157, + "step": 887 + }, + { + "epoch": 1.7113253012048193, + "grad_norm": 0.1938813328742981, + "learning_rate": 1.863320637788547e-05, + "loss": 0.0071, + "step": 888 + }, + { + "epoch": 1.7132530120481928, + "grad_norm": 0.27061617374420166, + "learning_rate": 1.8588466840173207e-05, + "loss": 0.0347, + "step": 889 + }, + { + "epoch": 1.7151807228915663, + "grad_norm": 0.1541014313697815, + "learning_rate": 1.8543734400107637e-05, + "loss": 0.006, + "step": 890 + }, + { + "epoch": 1.7171084337349396, + "grad_norm": 0.1436876654624939, + "learning_rate": 1.8499009282617996e-05, + "loss": 0.0059, + "step": 891 + }, + { + "epoch": 1.7190361445783133, + "grad_norm": 1.0573723316192627, + "learning_rate": 1.8454291712596688e-05, + "loss": 0.008, + "step": 892 + }, + { + "epoch": 1.7209638554216866, + "grad_norm": 0.15406259894371033, + "learning_rate": 1.8409581914898157e-05, + "loss": 0.0061, + "step": 893 + }, + { + "epoch": 1.7228915662650603, + "grad_norm": 0.24822913110256195, + "learning_rate": 1.836488011433777e-05, + "loss": 0.0085, + "step": 894 + }, + { + "epoch": 1.7248192771084336, + "grad_norm": 0.21049316227436066, + "learning_rate": 1.83201865356907e-05, + "loss": 0.0075, + "step": 895 + }, + { + "epoch": 1.7267469879518074, + "grad_norm": 0.24159866571426392, + "learning_rate": 1.8275501403690733e-05, + "loss": 0.0156, + "step": 896 + }, + { + "epoch": 1.7286746987951807, + "grad_norm": 0.3191063106060028, + "learning_rate": 1.823082494302924e-05, + "loss": 0.0218, + "step": 897 + }, + { + "epoch": 1.7306024096385542, + "grad_norm": 0.20296362042427063, + "learning_rate": 1.8186157378353945e-05, + "loss": 0.0126, + "step": 898 + }, + { + "epoch": 1.7325301204819277, + "grad_norm": 0.1905524581670761, + "learning_rate": 1.8141498934267858e-05, + "loss": 0.0131, + "step": 899 + }, + { + "epoch": 1.7344578313253012, + "grad_norm": 0.5350520610809326, + "learning_rate": 1.809684983532813e-05, + "loss": 0.0115, + "step": 900 + }, + { + "epoch": 1.7363855421686747, + "grad_norm": 0.17144092917442322, + "learning_rate": 1.8052210306044907e-05, + "loss": 0.0113, + "step": 901 + }, + { + "epoch": 1.7383132530120482, + "grad_norm": 0.11777982115745544, + "learning_rate": 1.8007580570880236e-05, + "loss": 0.0058, + "step": 902 + }, + { + "epoch": 1.7402409638554217, + "grad_norm": 0.2078275978565216, + "learning_rate": 1.7962960854246908e-05, + "loss": 0.0106, + "step": 903 + }, + { + "epoch": 1.7421686746987952, + "grad_norm": 0.2550877630710602, + "learning_rate": 1.791835138050732e-05, + "loss": 0.0076, + "step": 904 + }, + { + "epoch": 1.7440963855421687, + "grad_norm": 0.11553912609815598, + "learning_rate": 1.7873752373972395e-05, + "loss": 0.0038, + "step": 905 + }, + { + "epoch": 1.746024096385542, + "grad_norm": 0.10724586248397827, + "learning_rate": 1.7829164058900398e-05, + "loss": 0.0043, + "step": 906 + }, + { + "epoch": 1.7479518072289157, + "grad_norm": 0.30152231454849243, + "learning_rate": 1.7784586659495845e-05, + "loss": 0.0099, + "step": 907 + }, + { + "epoch": 1.749879518072289, + "grad_norm": 0.18372933566570282, + "learning_rate": 1.7740020399908372e-05, + "loss": 0.0074, + "step": 908 + }, + { + "epoch": 1.7518072289156628, + "grad_norm": 0.35184428095817566, + "learning_rate": 1.7695465504231586e-05, + "loss": 0.0184, + "step": 909 + }, + { + "epoch": 1.753734939759036, + "grad_norm": 0.15083615481853485, + "learning_rate": 1.765092219650196e-05, + "loss": 0.0061, + "step": 910 + }, + { + "epoch": 1.7556626506024098, + "grad_norm": 0.2599961459636688, + "learning_rate": 1.7606390700697693e-05, + "loss": 0.0101, + "step": 911 + }, + { + "epoch": 1.757590361445783, + "grad_norm": 0.10829206556081772, + "learning_rate": 1.7561871240737595e-05, + "loss": 0.0034, + "step": 912 + }, + { + "epoch": 1.7595180722891566, + "grad_norm": 0.38098782300949097, + "learning_rate": 1.7517364040479966e-05, + "loss": 0.0384, + "step": 913 + }, + { + "epoch": 1.76144578313253, + "grad_norm": 0.14975085854530334, + "learning_rate": 1.7472869323721432e-05, + "loss": 0.0055, + "step": 914 + }, + { + "epoch": 1.7633734939759036, + "grad_norm": 0.4151444733142853, + "learning_rate": 1.742838731419588e-05, + "loss": 0.0307, + "step": 915 + }, + { + "epoch": 1.765301204819277, + "grad_norm": 0.22238481044769287, + "learning_rate": 1.738391823557328e-05, + "loss": 0.0059, + "step": 916 + }, + { + "epoch": 1.7672289156626506, + "grad_norm": 0.23386356234550476, + "learning_rate": 1.7339462311458587e-05, + "loss": 0.0113, + "step": 917 + }, + { + "epoch": 1.7691566265060241, + "grad_norm": 0.21911191940307617, + "learning_rate": 1.7295019765390618e-05, + "loss": 0.0071, + "step": 918 + }, + { + "epoch": 1.7710843373493976, + "grad_norm": 0.343159943819046, + "learning_rate": 1.7250590820840903e-05, + "loss": 0.0144, + "step": 919 + }, + { + "epoch": 1.7730120481927711, + "grad_norm": 0.32204556465148926, + "learning_rate": 1.720617570121259e-05, + "loss": 0.0131, + "step": 920 + }, + { + "epoch": 1.7749397590361444, + "grad_norm": 0.4105585515499115, + "learning_rate": 1.7161774629839328e-05, + "loss": 0.0148, + "step": 921 + }, + { + "epoch": 1.7768674698795182, + "grad_norm": 0.16380974650382996, + "learning_rate": 1.7117387829984093e-05, + "loss": 0.0066, + "step": 922 + }, + { + "epoch": 1.7787951807228914, + "grad_norm": 0.22920913994312286, + "learning_rate": 1.707301552483813e-05, + "loss": 0.0105, + "step": 923 + }, + { + "epoch": 1.7807228915662652, + "grad_norm": 0.2075149267911911, + "learning_rate": 1.7028657937519767e-05, + "loss": 0.0104, + "step": 924 + }, + { + "epoch": 1.7826506024096385, + "grad_norm": 0.44439977407455444, + "learning_rate": 1.6984315291073355e-05, + "loss": 0.0134, + "step": 925 + }, + { + "epoch": 1.7845783132530122, + "grad_norm": 0.24068203568458557, + "learning_rate": 1.6939987808468125e-05, + "loss": 0.0078, + "step": 926 + }, + { + "epoch": 1.7865060240963855, + "grad_norm": 0.34044349193573, + "learning_rate": 1.689567571259701e-05, + "loss": 0.0108, + "step": 927 + }, + { + "epoch": 1.788433734939759, + "grad_norm": 0.34082743525505066, + "learning_rate": 1.6851379226275624e-05, + "loss": 0.0266, + "step": 928 + }, + { + "epoch": 1.7903614457831325, + "grad_norm": 0.19490115344524384, + "learning_rate": 1.6807098572241075e-05, + "loss": 0.0109, + "step": 929 + }, + { + "epoch": 1.792289156626506, + "grad_norm": 0.16208237409591675, + "learning_rate": 1.6762833973150846e-05, + "loss": 0.0113, + "step": 930 + }, + { + "epoch": 1.7942168674698795, + "grad_norm": 0.35555699467658997, + "learning_rate": 1.671858565158172e-05, + "loss": 0.0196, + "step": 931 + }, + { + "epoch": 1.796144578313253, + "grad_norm": 0.1600857824087143, + "learning_rate": 1.6674353830028587e-05, + "loss": 0.0089, + "step": 932 + }, + { + "epoch": 1.7980722891566265, + "grad_norm": 0.1699574887752533, + "learning_rate": 1.663013873090342e-05, + "loss": 0.0074, + "step": 933 + }, + { + "epoch": 1.8, + "grad_norm": 0.2472933828830719, + "learning_rate": 1.6585940576534086e-05, + "loss": 0.0063, + "step": 934 + }, + { + "epoch": 1.8019277108433736, + "grad_norm": 0.23491555452346802, + "learning_rate": 1.654175958916323e-05, + "loss": 0.0101, + "step": 935 + }, + { + "epoch": 1.8038554216867468, + "grad_norm": 0.28635191917419434, + "learning_rate": 1.6497595990947195e-05, + "loss": 0.0131, + "step": 936 + }, + { + "epoch": 1.8057831325301206, + "grad_norm": 0.15400712192058563, + "learning_rate": 1.645345000395489e-05, + "loss": 0.0068, + "step": 937 + }, + { + "epoch": 1.8077108433734939, + "grad_norm": 0.18223172426223755, + "learning_rate": 1.6409321850166647e-05, + "loss": 0.0094, + "step": 938 + }, + { + "epoch": 1.8096385542168676, + "grad_norm": 0.2789457142353058, + "learning_rate": 1.636521175147316e-05, + "loss": 0.0202, + "step": 939 + }, + { + "epoch": 1.8115662650602409, + "grad_norm": 0.4267627000808716, + "learning_rate": 1.6321119929674297e-05, + "loss": 0.0176, + "step": 940 + }, + { + "epoch": 1.8134939759036146, + "grad_norm": 0.3021615445613861, + "learning_rate": 1.6277046606478056e-05, + "loss": 0.0085, + "step": 941 + }, + { + "epoch": 1.815421686746988, + "grad_norm": 0.3724934756755829, + "learning_rate": 1.6232992003499405e-05, + "loss": 0.0474, + "step": 942 + }, + { + "epoch": 1.8173493975903614, + "grad_norm": 0.20904326438903809, + "learning_rate": 1.6188956342259177e-05, + "loss": 0.0078, + "step": 943 + }, + { + "epoch": 1.819277108433735, + "grad_norm": 0.31168171763420105, + "learning_rate": 1.614493984418297e-05, + "loss": 0.0174, + "step": 944 + }, + { + "epoch": 1.8212048192771084, + "grad_norm": 0.21273556351661682, + "learning_rate": 1.6100942730600003e-05, + "loss": 0.0054, + "step": 945 + }, + { + "epoch": 1.823132530120482, + "grad_norm": 0.16991695761680603, + "learning_rate": 1.6056965222742055e-05, + "loss": 0.0063, + "step": 946 + }, + { + "epoch": 1.8250602409638554, + "grad_norm": 0.22762684524059296, + "learning_rate": 1.6013007541742303e-05, + "loss": 0.0234, + "step": 947 + }, + { + "epoch": 1.826987951807229, + "grad_norm": 0.20128795504570007, + "learning_rate": 1.596906990863422e-05, + "loss": 0.0095, + "step": 948 + }, + { + "epoch": 1.8289156626506025, + "grad_norm": 0.30772027373313904, + "learning_rate": 1.592515254435048e-05, + "loss": 0.0356, + "step": 949 + }, + { + "epoch": 1.830843373493976, + "grad_norm": 0.12954631447792053, + "learning_rate": 1.5881255669721857e-05, + "loss": 0.008, + "step": 950 + }, + { + "epoch": 1.8327710843373493, + "grad_norm": 0.7787145972251892, + "learning_rate": 1.5837379505476054e-05, + "loss": 0.0108, + "step": 951 + }, + { + "epoch": 1.834698795180723, + "grad_norm": 0.1683879941701889, + "learning_rate": 1.5793524272236683e-05, + "loss": 0.006, + "step": 952 + }, + { + "epoch": 1.8366265060240963, + "grad_norm": 0.16475361585617065, + "learning_rate": 1.5749690190522076e-05, + "loss": 0.0065, + "step": 953 + }, + { + "epoch": 1.83855421686747, + "grad_norm": 0.211905375123024, + "learning_rate": 1.5705877480744214e-05, + "loss": 0.0092, + "step": 954 + }, + { + "epoch": 1.8404819277108433, + "grad_norm": 0.23850117623806, + "learning_rate": 1.5662086363207628e-05, + "loss": 0.012, + "step": 955 + }, + { + "epoch": 1.842409638554217, + "grad_norm": 0.19100065529346466, + "learning_rate": 1.561831705810825e-05, + "loss": 0.0113, + "step": 956 + }, + { + "epoch": 1.8443373493975903, + "grad_norm": 0.3635985255241394, + "learning_rate": 1.557456978553236e-05, + "loss": 0.0168, + "step": 957 + }, + { + "epoch": 1.8462650602409638, + "grad_norm": 0.16449116170406342, + "learning_rate": 1.553084476545544e-05, + "loss": 0.0042, + "step": 958 + }, + { + "epoch": 1.8481927710843373, + "grad_norm": 0.566093385219574, + "learning_rate": 1.5487142217741062e-05, + "loss": 0.0145, + "step": 959 + }, + { + "epoch": 1.8501204819277108, + "grad_norm": 0.15960252285003662, + "learning_rate": 1.5443462362139834e-05, + "loss": 0.0059, + "step": 960 + }, + { + "epoch": 1.8520481927710843, + "grad_norm": 0.40773797035217285, + "learning_rate": 1.539980541828823e-05, + "loss": 0.0257, + "step": 961 + }, + { + "epoch": 1.8539759036144579, + "grad_norm": 0.4802496135234833, + "learning_rate": 1.5356171605707522e-05, + "loss": 0.0111, + "step": 962 + }, + { + "epoch": 1.8559036144578314, + "grad_norm": 0.15745794773101807, + "learning_rate": 1.5312561143802704e-05, + "loss": 0.0049, + "step": 963 + }, + { + "epoch": 1.8578313253012049, + "grad_norm": 0.15139251947402954, + "learning_rate": 1.5268974251861298e-05, + "loss": 0.0077, + "step": 964 + }, + { + "epoch": 1.8597590361445784, + "grad_norm": 0.2188841849565506, + "learning_rate": 1.5225411149052356e-05, + "loss": 0.017, + "step": 965 + }, + { + "epoch": 1.8616867469879517, + "grad_norm": 0.10853131115436554, + "learning_rate": 1.5181872054425287e-05, + "loss": 0.0049, + "step": 966 + }, + { + "epoch": 1.8636144578313254, + "grad_norm": 0.8254880905151367, + "learning_rate": 1.5138357186908785e-05, + "loss": 0.0317, + "step": 967 + }, + { + "epoch": 1.8655421686746987, + "grad_norm": 0.2989620566368103, + "learning_rate": 1.5094866765309728e-05, + "loss": 0.0126, + "step": 968 + }, + { + "epoch": 1.8674698795180724, + "grad_norm": 0.16411150991916656, + "learning_rate": 1.5051401008312054e-05, + "loss": 0.0101, + "step": 969 + }, + { + "epoch": 1.8693975903614457, + "grad_norm": 0.2861763834953308, + "learning_rate": 1.5007960134475706e-05, + "loss": 0.0155, + "step": 970 + }, + { + "epoch": 1.8713253012048194, + "grad_norm": 0.24879588186740875, + "learning_rate": 1.4964544362235487e-05, + "loss": 0.0187, + "step": 971 + }, + { + "epoch": 1.8732530120481927, + "grad_norm": 0.2433672398328781, + "learning_rate": 1.4921153909899983e-05, + "loss": 0.0084, + "step": 972 + }, + { + "epoch": 1.8751807228915662, + "grad_norm": 0.15097154676914215, + "learning_rate": 1.487778899565047e-05, + "loss": 0.007, + "step": 973 + }, + { + "epoch": 1.8771084337349397, + "grad_norm": 0.1629047691822052, + "learning_rate": 1.4834449837539806e-05, + "loss": 0.0058, + "step": 974 + }, + { + "epoch": 1.8790361445783132, + "grad_norm": 0.9937071204185486, + "learning_rate": 1.4791136653491333e-05, + "loss": 0.0323, + "step": 975 + }, + { + "epoch": 1.8809638554216868, + "grad_norm": 0.19555562734603882, + "learning_rate": 1.4747849661297808e-05, + "loss": 0.0126, + "step": 976 + }, + { + "epoch": 1.8828915662650603, + "grad_norm": 0.16147711873054504, + "learning_rate": 1.470458907862026e-05, + "loss": 0.0067, + "step": 977 + }, + { + "epoch": 1.8848192771084338, + "grad_norm": 0.2730027735233307, + "learning_rate": 1.4661355122986945e-05, + "loss": 0.0147, + "step": 978 + }, + { + "epoch": 1.886746987951807, + "grad_norm": 0.13759832084178925, + "learning_rate": 1.4618148011792206e-05, + "loss": 0.0038, + "step": 979 + }, + { + "epoch": 1.8886746987951808, + "grad_norm": 0.33516690135002136, + "learning_rate": 1.4574967962295419e-05, + "loss": 0.0139, + "step": 980 + }, + { + "epoch": 1.890602409638554, + "grad_norm": 0.2345741093158722, + "learning_rate": 1.4531815191619903e-05, + "loss": 0.0094, + "step": 981 + }, + { + "epoch": 1.8925301204819278, + "grad_norm": 0.14681044220924377, + "learning_rate": 1.4488689916751762e-05, + "loss": 0.0065, + "step": 982 + }, + { + "epoch": 1.894457831325301, + "grad_norm": 0.21143914759159088, + "learning_rate": 1.4445592354538885e-05, + "loss": 0.0057, + "step": 983 + }, + { + "epoch": 1.8963855421686748, + "grad_norm": 0.3109160363674164, + "learning_rate": 1.44025227216898e-05, + "loss": 0.0142, + "step": 984 + }, + { + "epoch": 1.8983132530120481, + "grad_norm": 0.24301907420158386, + "learning_rate": 1.435948123477259e-05, + "loss": 0.012, + "step": 985 + }, + { + "epoch": 1.9002409638554218, + "grad_norm": 0.19817675650119781, + "learning_rate": 1.431646811021382e-05, + "loss": 0.0097, + "step": 986 + }, + { + "epoch": 1.9021686746987951, + "grad_norm": 0.13464932143688202, + "learning_rate": 1.4273483564297425e-05, + "loss": 0.0046, + "step": 987 + }, + { + "epoch": 1.9040963855421686, + "grad_norm": 0.1698642522096634, + "learning_rate": 1.4230527813163656e-05, + "loss": 0.0038, + "step": 988 + }, + { + "epoch": 1.9060240963855422, + "grad_norm": 0.19395388662815094, + "learning_rate": 1.4187601072807975e-05, + "loss": 0.0123, + "step": 989 + }, + { + "epoch": 1.9079518072289157, + "grad_norm": 0.2093188613653183, + "learning_rate": 1.4144703559079948e-05, + "loss": 0.0093, + "step": 990 + }, + { + "epoch": 1.9098795180722892, + "grad_norm": 0.1529311090707779, + "learning_rate": 1.4101835487682198e-05, + "loss": 0.0051, + "step": 991 + }, + { + "epoch": 1.9118072289156627, + "grad_norm": 0.18725350499153137, + "learning_rate": 1.4058997074169299e-05, + "loss": 0.0083, + "step": 992 + }, + { + "epoch": 1.9137349397590362, + "grad_norm": 0.15601560473442078, + "learning_rate": 1.401618853394668e-05, + "loss": 0.0086, + "step": 993 + }, + { + "epoch": 1.9156626506024095, + "grad_norm": 0.23890644311904907, + "learning_rate": 1.3973410082269591e-05, + "loss": 0.015, + "step": 994 + }, + { + "epoch": 1.9175903614457832, + "grad_norm": 0.2442619949579239, + "learning_rate": 1.3930661934241947e-05, + "loss": 0.0089, + "step": 995 + }, + { + "epoch": 1.9195180722891565, + "grad_norm": 0.1540212482213974, + "learning_rate": 1.388794430481532e-05, + "loss": 0.0072, + "step": 996 + }, + { + "epoch": 1.9214457831325302, + "grad_norm": 0.1359291970729828, + "learning_rate": 1.3845257408787807e-05, + "loss": 0.0131, + "step": 997 + }, + { + "epoch": 1.9233734939759035, + "grad_norm": 0.25486138463020325, + "learning_rate": 1.3802601460802967e-05, + "loss": 0.0198, + "step": 998 + }, + { + "epoch": 1.9253012048192772, + "grad_norm": 0.28815609216690063, + "learning_rate": 1.3759976675348754e-05, + "loss": 0.014, + "step": 999 + }, + { + "epoch": 1.9272289156626505, + "grad_norm": 0.15648497641086578, + "learning_rate": 1.3717383266756403e-05, + "loss": 0.0065, + "step": 1000 + }, + { + "epoch": 1.929156626506024, + "grad_norm": 0.16912540793418884, + "learning_rate": 1.367482144919941e-05, + "loss": 0.0059, + "step": 1001 + }, + { + "epoch": 1.9310843373493976, + "grad_norm": 0.16896723210811615, + "learning_rate": 1.3632291436692397e-05, + "loss": 0.0054, + "step": 1002 + }, + { + "epoch": 1.933012048192771, + "grad_norm": 0.20287497341632843, + "learning_rate": 1.3589793443090064e-05, + "loss": 0.0097, + "step": 1003 + }, + { + "epoch": 1.9349397590361446, + "grad_norm": 0.14804276823997498, + "learning_rate": 1.3547327682086114e-05, + "loss": 0.0125, + "step": 1004 + }, + { + "epoch": 1.936867469879518, + "grad_norm": 0.23820064961910248, + "learning_rate": 1.3504894367212171e-05, + "loss": 0.0131, + "step": 1005 + }, + { + "epoch": 1.9387951807228916, + "grad_norm": 0.25607362389564514, + "learning_rate": 1.34624937118367e-05, + "loss": 0.0115, + "step": 1006 + }, + { + "epoch": 1.940722891566265, + "grad_norm": 0.37233737111091614, + "learning_rate": 1.3420125929163976e-05, + "loss": 0.0309, + "step": 1007 + }, + { + "epoch": 1.9426506024096386, + "grad_norm": 0.19426730275154114, + "learning_rate": 1.3377791232232929e-05, + "loss": 0.0078, + "step": 1008 + }, + { + "epoch": 1.944578313253012, + "grad_norm": 0.2784160077571869, + "learning_rate": 1.333548983391617e-05, + "loss": 0.0142, + "step": 1009 + }, + { + "epoch": 1.9465060240963856, + "grad_norm": 0.11407195776700974, + "learning_rate": 1.3293221946918853e-05, + "loss": 0.0035, + "step": 1010 + }, + { + "epoch": 1.948433734939759, + "grad_norm": 0.3965436816215515, + "learning_rate": 1.325098778377762e-05, + "loss": 0.0242, + "step": 1011 + }, + { + "epoch": 1.9503614457831326, + "grad_norm": 0.18520519137382507, + "learning_rate": 1.3208787556859543e-05, + "loss": 0.0096, + "step": 1012 + }, + { + "epoch": 1.952289156626506, + "grad_norm": 0.2783315181732178, + "learning_rate": 1.3166621478361075e-05, + "loss": 0.0103, + "step": 1013 + }, + { + "epoch": 1.9542168674698797, + "grad_norm": 0.22714459896087646, + "learning_rate": 1.3124489760306917e-05, + "loss": 0.0078, + "step": 1014 + }, + { + "epoch": 1.956144578313253, + "grad_norm": 0.1257915049791336, + "learning_rate": 1.3082392614549036e-05, + "loss": 0.0077, + "step": 1015 + }, + { + "epoch": 1.9580722891566265, + "grad_norm": 0.15592887997627258, + "learning_rate": 1.3040330252765526e-05, + "loss": 0.0106, + "step": 1016 + }, + { + "epoch": 1.96, + "grad_norm": 0.19295449554920197, + "learning_rate": 1.2998302886459586e-05, + "loss": 0.0082, + "step": 1017 + }, + { + "epoch": 1.9619277108433735, + "grad_norm": 0.15544794499874115, + "learning_rate": 1.2956310726958472e-05, + "loss": 0.0068, + "step": 1018 + }, + { + "epoch": 1.963855421686747, + "grad_norm": 0.25899502635002136, + "learning_rate": 1.291435398541236e-05, + "loss": 0.0086, + "step": 1019 + }, + { + "epoch": 1.9657831325301205, + "grad_norm": 0.34639033675193787, + "learning_rate": 1.2872432872793379e-05, + "loss": 0.0116, + "step": 1020 + }, + { + "epoch": 1.967710843373494, + "grad_norm": 0.1628410518169403, + "learning_rate": 1.283054759989447e-05, + "loss": 0.0055, + "step": 1021 + }, + { + "epoch": 1.9696385542168675, + "grad_norm": 0.9273788928985596, + "learning_rate": 1.2788698377328385e-05, + "loss": 0.0264, + "step": 1022 + }, + { + "epoch": 1.971566265060241, + "grad_norm": 0.163126140832901, + "learning_rate": 1.2746885415526594e-05, + "loss": 0.0046, + "step": 1023 + }, + { + "epoch": 1.9734939759036143, + "grad_norm": 0.1475439816713333, + "learning_rate": 1.2705108924738223e-05, + "loss": 0.0056, + "step": 1024 + }, + { + "epoch": 1.975421686746988, + "grad_norm": 0.1654318869113922, + "learning_rate": 1.2663369115029034e-05, + "loss": 0.0056, + "step": 1025 + }, + { + "epoch": 1.9773493975903613, + "grad_norm": 0.20536045730113983, + "learning_rate": 1.2621666196280333e-05, + "loss": 0.0101, + "step": 1026 + }, + { + "epoch": 1.979277108433735, + "grad_norm": 0.19256474077701569, + "learning_rate": 1.258000037818792e-05, + "loss": 0.0059, + "step": 1027 + }, + { + "epoch": 1.9812048192771083, + "grad_norm": 0.2605120539665222, + "learning_rate": 1.2538371870261053e-05, + "loss": 0.0115, + "step": 1028 + }, + { + "epoch": 1.983132530120482, + "grad_norm": 0.14840295910835266, + "learning_rate": 1.249678088182137e-05, + "loss": 0.0046, + "step": 1029 + }, + { + "epoch": 1.9850602409638554, + "grad_norm": 0.17585207521915436, + "learning_rate": 1.2455227622001851e-05, + "loss": 0.0086, + "step": 1030 + }, + { + "epoch": 1.9869879518072289, + "grad_norm": 0.11044781655073166, + "learning_rate": 1.241371229974579e-05, + "loss": 0.0034, + "step": 1031 + }, + { + "epoch": 1.9889156626506024, + "grad_norm": 0.25584840774536133, + "learning_rate": 1.2372235123805672e-05, + "loss": 0.0245, + "step": 1032 + }, + { + "epoch": 1.9908433734939759, + "grad_norm": 0.25962474942207336, + "learning_rate": 1.2330796302742211e-05, + "loss": 0.0104, + "step": 1033 + }, + { + "epoch": 1.9927710843373494, + "grad_norm": 0.33408522605895996, + "learning_rate": 1.2289396044923238e-05, + "loss": 0.0176, + "step": 1034 + }, + { + "epoch": 1.994698795180723, + "grad_norm": 0.479950487613678, + "learning_rate": 1.2248034558522682e-05, + "loss": 0.0113, + "step": 1035 + }, + { + "epoch": 1.9966265060240964, + "grad_norm": 0.16567294299602509, + "learning_rate": 1.2206712051519518e-05, + "loss": 0.0036, + "step": 1036 + }, + { + "epoch": 1.99855421686747, + "grad_norm": 0.19343771040439606, + "learning_rate": 1.2165428731696713e-05, + "loss": 0.0077, + "step": 1037 + }, + { + "epoch": 2.0, + "grad_norm": 0.22895601391792297, + "learning_rate": 1.2124184806640202e-05, + "loss": 0.0114, + "step": 1038 + }, + { + "epoch": 2.0019277108433733, + "grad_norm": 0.15838384628295898, + "learning_rate": 1.208298048373782e-05, + "loss": 0.0043, + "step": 1039 + }, + { + "epoch": 2.003855421686747, + "grad_norm": 0.681065559387207, + "learning_rate": 1.2041815970178268e-05, + "loss": 0.0214, + "step": 1040 + }, + { + "epoch": 2.0057831325301203, + "grad_norm": 0.3357350528240204, + "learning_rate": 1.2000691472950081e-05, + "loss": 0.0079, + "step": 1041 + }, + { + "epoch": 2.007710843373494, + "grad_norm": 0.15238308906555176, + "learning_rate": 1.1959607198840568e-05, + "loss": 0.0041, + "step": 1042 + }, + { + "epoch": 2.0096385542168673, + "grad_norm": 0.11763229966163635, + "learning_rate": 1.1918563354434784e-05, + "loss": 0.0033, + "step": 1043 + }, + { + "epoch": 2.011566265060241, + "grad_norm": 0.3759301006793976, + "learning_rate": 1.1877560146114515e-05, + "loss": 0.0128, + "step": 1044 + }, + { + "epoch": 2.0134939759036143, + "grad_norm": 0.1143188625574112, + "learning_rate": 1.1836597780057183e-05, + "loss": 0.0078, + "step": 1045 + }, + { + "epoch": 2.015421686746988, + "grad_norm": 0.20059260725975037, + "learning_rate": 1.179567646223485e-05, + "loss": 0.0149, + "step": 1046 + }, + { + "epoch": 2.0173493975903614, + "grad_norm": 0.15569567680358887, + "learning_rate": 1.1754796398413196e-05, + "loss": 0.0038, + "step": 1047 + }, + { + "epoch": 2.019277108433735, + "grad_norm": 0.1153278723359108, + "learning_rate": 1.1713957794150423e-05, + "loss": 0.0041, + "step": 1048 + }, + { + "epoch": 2.0212048192771084, + "grad_norm": 0.1838717758655548, + "learning_rate": 1.1673160854796307e-05, + "loss": 0.0041, + "step": 1049 + }, + { + "epoch": 2.023132530120482, + "grad_norm": 0.12264502793550491, + "learning_rate": 1.1632405785491077e-05, + "loss": 0.0043, + "step": 1050 + }, + { + "epoch": 2.0250602409638554, + "grad_norm": 0.14363229274749756, + "learning_rate": 1.159169279116445e-05, + "loss": 0.0066, + "step": 1051 + }, + { + "epoch": 2.026987951807229, + "grad_norm": 0.1316995471715927, + "learning_rate": 1.1551022076534585e-05, + "loss": 0.0024, + "step": 1052 + }, + { + "epoch": 2.0289156626506024, + "grad_norm": 0.13392619788646698, + "learning_rate": 1.1510393846107001e-05, + "loss": 0.0051, + "step": 1053 + }, + { + "epoch": 2.0308433734939757, + "grad_norm": 3.0086817741394043, + "learning_rate": 1.1469808304173658e-05, + "loss": 0.0334, + "step": 1054 + }, + { + "epoch": 2.0327710843373494, + "grad_norm": 0.17756076157093048, + "learning_rate": 1.1429265654811803e-05, + "loss": 0.0068, + "step": 1055 + }, + { + "epoch": 2.0346987951807227, + "grad_norm": 0.13250532746315002, + "learning_rate": 1.1388766101883038e-05, + "loss": 0.0087, + "step": 1056 + }, + { + "epoch": 2.0366265060240965, + "grad_norm": 0.3534089922904968, + "learning_rate": 1.1348309849032257e-05, + "loss": 0.0076, + "step": 1057 + }, + { + "epoch": 2.0385542168674697, + "grad_norm": 0.11939049512147903, + "learning_rate": 1.1307897099686627e-05, + "loss": 0.0029, + "step": 1058 + }, + { + "epoch": 2.0404819277108435, + "grad_norm": 0.11862517893314362, + "learning_rate": 1.1267528057054562e-05, + "loss": 0.0062, + "step": 1059 + }, + { + "epoch": 2.0424096385542168, + "grad_norm": 0.1539212018251419, + "learning_rate": 1.1227202924124704e-05, + "loss": 0.0067, + "step": 1060 + }, + { + "epoch": 2.0443373493975905, + "grad_norm": 0.17163440585136414, + "learning_rate": 1.118692190366491e-05, + "loss": 0.0055, + "step": 1061 + }, + { + "epoch": 2.0462650602409638, + "grad_norm": 0.12304897606372833, + "learning_rate": 1.1146685198221222e-05, + "loss": 0.0036, + "step": 1062 + }, + { + "epoch": 2.0481927710843375, + "grad_norm": 0.17319051921367645, + "learning_rate": 1.1106493010116842e-05, + "loss": 0.0058, + "step": 1063 + }, + { + "epoch": 2.050120481927711, + "grad_norm": 0.2242443859577179, + "learning_rate": 1.1066345541451127e-05, + "loss": 0.0059, + "step": 1064 + }, + { + "epoch": 2.0520481927710845, + "grad_norm": 0.09533938020467758, + "learning_rate": 1.1026242994098597e-05, + "loss": 0.0033, + "step": 1065 + }, + { + "epoch": 2.053975903614458, + "grad_norm": 0.11697929352521896, + "learning_rate": 1.0986185569707852e-05, + "loss": 0.0038, + "step": 1066 + }, + { + "epoch": 2.0559036144578315, + "grad_norm": 0.2563149333000183, + "learning_rate": 1.0946173469700625e-05, + "loss": 0.0158, + "step": 1067 + }, + { + "epoch": 2.057831325301205, + "grad_norm": 0.21836932003498077, + "learning_rate": 1.0906206895270739e-05, + "loss": 0.0085, + "step": 1068 + }, + { + "epoch": 2.059759036144578, + "grad_norm": 0.1798071414232254, + "learning_rate": 1.0866286047383094e-05, + "loss": 0.0053, + "step": 1069 + }, + { + "epoch": 2.061686746987952, + "grad_norm": 0.08937730640172958, + "learning_rate": 1.0826411126772675e-05, + "loss": 0.0025, + "step": 1070 + }, + { + "epoch": 2.063614457831325, + "grad_norm": 0.0942138060927391, + "learning_rate": 1.0786582333943499e-05, + "loss": 0.0017, + "step": 1071 + }, + { + "epoch": 2.065542168674699, + "grad_norm": 0.13076582551002502, + "learning_rate": 1.0746799869167679e-05, + "loss": 0.0033, + "step": 1072 + }, + { + "epoch": 2.067469879518072, + "grad_norm": 0.0993233174085617, + "learning_rate": 1.0707063932484357e-05, + "loss": 0.0046, + "step": 1073 + }, + { + "epoch": 2.069397590361446, + "grad_norm": 0.3046741485595703, + "learning_rate": 1.0667374723698698e-05, + "loss": 0.009, + "step": 1074 + }, + { + "epoch": 2.071325301204819, + "grad_norm": 0.12197669595479965, + "learning_rate": 1.0627732442380932e-05, + "loss": 0.0034, + "step": 1075 + }, + { + "epoch": 2.073253012048193, + "grad_norm": 0.12721140682697296, + "learning_rate": 1.058813728786531e-05, + "loss": 0.0048, + "step": 1076 + }, + { + "epoch": 2.075180722891566, + "grad_norm": 0.10011966526508331, + "learning_rate": 1.0548589459249112e-05, + "loss": 0.0026, + "step": 1077 + }, + { + "epoch": 2.07710843373494, + "grad_norm": 0.3314201831817627, + "learning_rate": 1.0509089155391661e-05, + "loss": 0.0284, + "step": 1078 + }, + { + "epoch": 2.079036144578313, + "grad_norm": 0.32739701867103577, + "learning_rate": 1.0469636574913288e-05, + "loss": 0.0088, + "step": 1079 + }, + { + "epoch": 2.080963855421687, + "grad_norm": 0.13805675506591797, + "learning_rate": 1.043023191619438e-05, + "loss": 0.0042, + "step": 1080 + }, + { + "epoch": 2.0828915662650602, + "grad_norm": 0.14789745211601257, + "learning_rate": 1.039087537737435e-05, + "loss": 0.0037, + "step": 1081 + }, + { + "epoch": 2.0848192771084335, + "grad_norm": 0.15518991649150848, + "learning_rate": 1.0351567156350617e-05, + "loss": 0.0044, + "step": 1082 + }, + { + "epoch": 2.0867469879518072, + "grad_norm": 0.08380113542079926, + "learning_rate": 1.0312307450777706e-05, + "loss": 0.0019, + "step": 1083 + }, + { + "epoch": 2.0886746987951805, + "grad_norm": 0.17892400920391083, + "learning_rate": 1.027309645806613e-05, + "loss": 0.0065, + "step": 1084 + }, + { + "epoch": 2.0906024096385543, + "grad_norm": 0.5497608780860901, + "learning_rate": 1.0233934375381489e-05, + "loss": 0.0238, + "step": 1085 + }, + { + "epoch": 2.0925301204819275, + "grad_norm": 1.0189186334609985, + "learning_rate": 1.019482139964344e-05, + "loss": 0.0092, + "step": 1086 + }, + { + "epoch": 2.0944578313253013, + "grad_norm": 0.12144117057323456, + "learning_rate": 1.015575772752472e-05, + "loss": 0.0038, + "step": 1087 + }, + { + "epoch": 2.0963855421686746, + "grad_norm": 0.1115315854549408, + "learning_rate": 1.0116743555450148e-05, + "loss": 0.0024, + "step": 1088 + }, + { + "epoch": 2.0983132530120483, + "grad_norm": 0.22671759128570557, + "learning_rate": 1.0077779079595631e-05, + "loss": 0.0136, + "step": 1089 + }, + { + "epoch": 2.1002409638554216, + "grad_norm": 2.0009827613830566, + "learning_rate": 1.003886449588719e-05, + "loss": 0.0493, + "step": 1090 + }, + { + "epoch": 2.1021686746987953, + "grad_norm": 0.11907301843166351, + "learning_rate": 1.0000000000000006e-05, + "loss": 0.0034, + "step": 1091 + }, + { + "epoch": 2.1040963855421686, + "grad_norm": 0.31257638335227966, + "learning_rate": 9.961185787357346e-06, + "loss": 0.0129, + "step": 1092 + }, + { + "epoch": 2.1060240963855423, + "grad_norm": 0.11033743619918823, + "learning_rate": 9.922422053129674e-06, + "loss": 0.0184, + "step": 1093 + }, + { + "epoch": 2.1079518072289156, + "grad_norm": 0.2575698494911194, + "learning_rate": 9.883708992233626e-06, + "loss": 0.0054, + "step": 1094 + }, + { + "epoch": 2.1098795180722894, + "grad_norm": 0.12921132147312164, + "learning_rate": 9.845046799331029e-06, + "loss": 0.0037, + "step": 1095 + }, + { + "epoch": 2.1118072289156626, + "grad_norm": 0.21405921876430511, + "learning_rate": 9.806435668827941e-06, + "loss": 0.006, + "step": 1096 + }, + { + "epoch": 2.113734939759036, + "grad_norm": 0.12929430603981018, + "learning_rate": 9.76787579487363e-06, + "loss": 0.0049, + "step": 1097 + }, + { + "epoch": 2.1156626506024097, + "grad_norm": 0.1793181151151657, + "learning_rate": 9.729367371359681e-06, + "loss": 0.0086, + "step": 1098 + }, + { + "epoch": 2.117590361445783, + "grad_norm": 0.2182074338197708, + "learning_rate": 9.690910591918936e-06, + "loss": 0.0106, + "step": 1099 + }, + { + "epoch": 2.1195180722891567, + "grad_norm": 0.0705680400133133, + "learning_rate": 9.652505649924547e-06, + "loss": 0.0012, + "step": 1100 + }, + { + "epoch": 2.12144578313253, + "grad_norm": 0.10509738326072693, + "learning_rate": 9.614152738489021e-06, + "loss": 0.0048, + "step": 1101 + }, + { + "epoch": 2.1233734939759037, + "grad_norm": 0.13775436580181122, + "learning_rate": 9.575852050463268e-06, + "loss": 0.0089, + "step": 1102 + }, + { + "epoch": 2.125301204819277, + "grad_norm": 0.15230101346969604, + "learning_rate": 9.537603778435545e-06, + "loss": 0.0065, + "step": 1103 + }, + { + "epoch": 2.1272289156626507, + "grad_norm": 0.24702346324920654, + "learning_rate": 9.499408114730583e-06, + "loss": 0.016, + "step": 1104 + }, + { + "epoch": 2.129156626506024, + "grad_norm": 0.1082577034831047, + "learning_rate": 9.461265251408575e-06, + "loss": 0.0036, + "step": 1105 + }, + { + "epoch": 2.1310843373493977, + "grad_norm": 0.1063847690820694, + "learning_rate": 9.423175380264211e-06, + "loss": 0.0037, + "step": 1106 + }, + { + "epoch": 2.133012048192771, + "grad_norm": 0.07686953246593475, + "learning_rate": 9.385138692825729e-06, + "loss": 0.0031, + "step": 1107 + }, + { + "epoch": 2.1349397590361447, + "grad_norm": 0.2046380341053009, + "learning_rate": 9.347155380353912e-06, + "loss": 0.0087, + "step": 1108 + }, + { + "epoch": 2.136867469879518, + "grad_norm": 0.1341692954301834, + "learning_rate": 9.30922563384121e-06, + "loss": 0.0045, + "step": 1109 + }, + { + "epoch": 2.1387951807228918, + "grad_norm": 0.09870535880327225, + "learning_rate": 9.271349644010672e-06, + "loss": 0.003, + "step": 1110 + }, + { + "epoch": 2.140722891566265, + "grad_norm": 0.18708615005016327, + "learning_rate": 9.233527601315069e-06, + "loss": 0.0042, + "step": 1111 + }, + { + "epoch": 2.1426506024096383, + "grad_norm": 0.5175634026527405, + "learning_rate": 9.195759695935907e-06, + "loss": 0.0173, + "step": 1112 + }, + { + "epoch": 2.144578313253012, + "grad_norm": 0.14939036965370178, + "learning_rate": 9.158046117782464e-06, + "loss": 0.0031, + "step": 1113 + }, + { + "epoch": 2.1465060240963854, + "grad_norm": 0.2837410569190979, + "learning_rate": 9.120387056490851e-06, + "loss": 0.0097, + "step": 1114 + }, + { + "epoch": 2.148433734939759, + "grad_norm": 0.11088677495718002, + "learning_rate": 9.082782701423047e-06, + "loss": 0.0026, + "step": 1115 + }, + { + "epoch": 2.1503614457831324, + "grad_norm": 0.07785166054964066, + "learning_rate": 9.045233241665947e-06, + "loss": 0.0019, + "step": 1116 + }, + { + "epoch": 2.152289156626506, + "grad_norm": 0.17568141222000122, + "learning_rate": 9.007738866030427e-06, + "loss": 0.0039, + "step": 1117 + }, + { + "epoch": 2.1542168674698794, + "grad_norm": 0.12652266025543213, + "learning_rate": 8.970299763050356e-06, + "loss": 0.0033, + "step": 1118 + }, + { + "epoch": 2.156144578313253, + "grad_norm": 0.16801467537879944, + "learning_rate": 8.932916120981695e-06, + "loss": 0.0076, + "step": 1119 + }, + { + "epoch": 2.1580722891566264, + "grad_norm": 0.18313169479370117, + "learning_rate": 8.895588127801545e-06, + "loss": 0.0052, + "step": 1120 + }, + { + "epoch": 2.16, + "grad_norm": 0.07546049356460571, + "learning_rate": 8.858315971207146e-06, + "loss": 0.0022, + "step": 1121 + }, + { + "epoch": 2.1619277108433734, + "grad_norm": 0.4039839208126068, + "learning_rate": 8.821099838614996e-06, + "loss": 0.0203, + "step": 1122 + }, + { + "epoch": 2.163855421686747, + "grad_norm": 0.09244243055582047, + "learning_rate": 8.783939917159897e-06, + "loss": 0.002, + "step": 1123 + }, + { + "epoch": 2.1657831325301204, + "grad_norm": 0.18327835202217102, + "learning_rate": 8.746836393693978e-06, + "loss": 0.0055, + "step": 1124 + }, + { + "epoch": 2.167710843373494, + "grad_norm": 0.22010307013988495, + "learning_rate": 8.709789454785809e-06, + "loss": 0.0077, + "step": 1125 + }, + { + "epoch": 2.1696385542168675, + "grad_norm": 0.09438297897577286, + "learning_rate": 8.67279928671939e-06, + "loss": 0.0032, + "step": 1126 + }, + { + "epoch": 2.1715662650602408, + "grad_norm": 0.20782770216464996, + "learning_rate": 8.635866075493318e-06, + "loss": 0.0028, + "step": 1127 + }, + { + "epoch": 2.1734939759036145, + "grad_norm": 0.1958685964345932, + "learning_rate": 8.598990006819756e-06, + "loss": 0.0047, + "step": 1128 + }, + { + "epoch": 2.1754216867469878, + "grad_norm": 0.06459935009479523, + "learning_rate": 8.562171266123528e-06, + "loss": 0.0015, + "step": 1129 + }, + { + "epoch": 2.1773493975903615, + "grad_norm": 0.33486708998680115, + "learning_rate": 8.525410038541218e-06, + "loss": 0.0094, + "step": 1130 + }, + { + "epoch": 2.179277108433735, + "grad_norm": 0.5755940079689026, + "learning_rate": 8.488706508920202e-06, + "loss": 0.0067, + "step": 1131 + }, + { + "epoch": 2.1812048192771085, + "grad_norm": 0.10840924829244614, + "learning_rate": 8.452060861817738e-06, + "loss": 0.0082, + "step": 1132 + }, + { + "epoch": 2.183132530120482, + "grad_norm": 0.18611350655555725, + "learning_rate": 8.415473281500037e-06, + "loss": 0.0059, + "step": 1133 + }, + { + "epoch": 2.1850602409638555, + "grad_norm": 0.11245249956846237, + "learning_rate": 8.378943951941301e-06, + "loss": 0.0107, + "step": 1134 + }, + { + "epoch": 2.186987951807229, + "grad_norm": 0.12284426391124725, + "learning_rate": 8.342473056822873e-06, + "loss": 0.0025, + "step": 1135 + }, + { + "epoch": 2.1889156626506026, + "grad_norm": 0.12542888522148132, + "learning_rate": 8.306060779532245e-06, + "loss": 0.0059, + "step": 1136 + }, + { + "epoch": 2.190843373493976, + "grad_norm": 0.1287655532360077, + "learning_rate": 8.26970730316215e-06, + "loss": 0.0022, + "step": 1137 + }, + { + "epoch": 2.1927710843373496, + "grad_norm": 0.1818632185459137, + "learning_rate": 8.233412810509669e-06, + "loss": 0.0131, + "step": 1138 + }, + { + "epoch": 2.194698795180723, + "grad_norm": 0.09687745571136475, + "learning_rate": 8.197177484075284e-06, + "loss": 0.0025, + "step": 1139 + }, + { + "epoch": 2.1966265060240966, + "grad_norm": 0.16103452444076538, + "learning_rate": 8.161001506061979e-06, + "loss": 0.0031, + "step": 1140 + }, + { + "epoch": 2.19855421686747, + "grad_norm": 0.2711680233478546, + "learning_rate": 8.124885058374302e-06, + "loss": 0.0034, + "step": 1141 + }, + { + "epoch": 2.200481927710843, + "grad_norm": 0.17613105475902557, + "learning_rate": 8.088828322617473e-06, + "loss": 0.0044, + "step": 1142 + }, + { + "epoch": 2.202409638554217, + "grad_norm": 0.2298487424850464, + "learning_rate": 8.052831480096464e-06, + "loss": 0.0168, + "step": 1143 + }, + { + "epoch": 2.20433734939759, + "grad_norm": 0.17042206227779388, + "learning_rate": 8.016894711815067e-06, + "loss": 0.007, + "step": 1144 + }, + { + "epoch": 2.206265060240964, + "grad_norm": 0.2830466628074646, + "learning_rate": 7.98101819847501e-06, + "loss": 0.0091, + "step": 1145 + }, + { + "epoch": 2.208192771084337, + "grad_norm": 0.22089065611362457, + "learning_rate": 7.945202120475063e-06, + "loss": 0.0046, + "step": 1146 + }, + { + "epoch": 2.210120481927711, + "grad_norm": 0.1716073900461197, + "learning_rate": 7.909446657910072e-06, + "loss": 0.0032, + "step": 1147 + }, + { + "epoch": 2.212048192771084, + "grad_norm": 0.16140373051166534, + "learning_rate": 7.873751990570104e-06, + "loss": 0.0057, + "step": 1148 + }, + { + "epoch": 2.213975903614458, + "grad_norm": 0.1671605408191681, + "learning_rate": 7.838118297939529e-06, + "loss": 0.0039, + "step": 1149 + }, + { + "epoch": 2.2159036144578312, + "grad_norm": 0.10933005809783936, + "learning_rate": 7.802545759196117e-06, + "loss": 0.005, + "step": 1150 + }, + { + "epoch": 2.217831325301205, + "grad_norm": 0.07819998264312744, + "learning_rate": 7.76703455321014e-06, + "loss": 0.0025, + "step": 1151 + }, + { + "epoch": 2.2197590361445783, + "grad_norm": 0.36211854219436646, + "learning_rate": 7.73158485854344e-06, + "loss": 0.0151, + "step": 1152 + }, + { + "epoch": 2.221686746987952, + "grad_norm": 0.09098304808139801, + "learning_rate": 7.696196853448612e-06, + "loss": 0.0027, + "step": 1153 + }, + { + "epoch": 2.2236144578313253, + "grad_norm": 0.17442144453525543, + "learning_rate": 7.660870715868018e-06, + "loss": 0.006, + "step": 1154 + }, + { + "epoch": 2.225542168674699, + "grad_norm": 0.09785338491201401, + "learning_rate": 7.625606623432933e-06, + "loss": 0.0041, + "step": 1155 + }, + { + "epoch": 2.2274698795180723, + "grad_norm": 0.19399888813495636, + "learning_rate": 7.590404753462653e-06, + "loss": 0.0125, + "step": 1156 + }, + { + "epoch": 2.2293975903614456, + "grad_norm": 0.11080623418092728, + "learning_rate": 7.55526528296362e-06, + "loss": 0.0022, + "step": 1157 + }, + { + "epoch": 2.2313253012048193, + "grad_norm": 0.14067359268665314, + "learning_rate": 7.520188388628473e-06, + "loss": 0.0123, + "step": 1158 + }, + { + "epoch": 2.2332530120481926, + "grad_norm": 0.14533625543117523, + "learning_rate": 7.485174246835227e-06, + "loss": 0.0039, + "step": 1159 + }, + { + "epoch": 2.2351807228915663, + "grad_norm": 0.1253812462091446, + "learning_rate": 7.4502230336463466e-06, + "loss": 0.003, + "step": 1160 + }, + { + "epoch": 2.2371084337349396, + "grad_norm": 0.12766572833061218, + "learning_rate": 7.415334924807869e-06, + "loss": 0.0044, + "step": 1161 + }, + { + "epoch": 2.2390361445783133, + "grad_norm": 0.11985791474580765, + "learning_rate": 7.380510095748535e-06, + "loss": 0.0071, + "step": 1162 + }, + { + "epoch": 2.2409638554216866, + "grad_norm": 0.15505346655845642, + "learning_rate": 7.3457487215788605e-06, + "loss": 0.0046, + "step": 1163 + }, + { + "epoch": 2.2428915662650604, + "grad_norm": 0.18983210623264313, + "learning_rate": 7.311050977090343e-06, + "loss": 0.0079, + "step": 1164 + }, + { + "epoch": 2.2448192771084337, + "grad_norm": 0.19279207289218903, + "learning_rate": 7.276417036754479e-06, + "loss": 0.0042, + "step": 1165 + }, + { + "epoch": 2.2467469879518074, + "grad_norm": 0.21539707481861115, + "learning_rate": 7.241847074721964e-06, + "loss": 0.0087, + "step": 1166 + }, + { + "epoch": 2.2486746987951807, + "grad_norm": 0.07004354894161224, + "learning_rate": 7.207341264821783e-06, + "loss": 0.002, + "step": 1167 + }, + { + "epoch": 2.2506024096385544, + "grad_norm": 0.2203039526939392, + "learning_rate": 7.172899780560345e-06, + "loss": 0.0069, + "step": 1168 + }, + { + "epoch": 2.2525301204819277, + "grad_norm": 0.12474718689918518, + "learning_rate": 7.138522795120606e-06, + "loss": 0.0122, + "step": 1169 + }, + { + "epoch": 2.2544578313253014, + "grad_norm": 0.09078995883464813, + "learning_rate": 7.104210481361204e-06, + "loss": 0.0025, + "step": 1170 + }, + { + "epoch": 2.2563855421686747, + "grad_norm": 0.141757071018219, + "learning_rate": 7.069963011815584e-06, + "loss": 0.0039, + "step": 1171 + }, + { + "epoch": 2.258313253012048, + "grad_norm": 0.14944659173488617, + "learning_rate": 7.035780558691141e-06, + "loss": 0.0025, + "step": 1172 + }, + { + "epoch": 2.2602409638554217, + "grad_norm": 0.06723666191101074, + "learning_rate": 7.001663293868328e-06, + "loss": 0.0014, + "step": 1173 + }, + { + "epoch": 2.262168674698795, + "grad_norm": 0.11966485530138016, + "learning_rate": 6.967611388899826e-06, + "loss": 0.0067, + "step": 1174 + }, + { + "epoch": 2.2640963855421687, + "grad_norm": 0.08943185210227966, + "learning_rate": 6.933625015009666e-06, + "loss": 0.0036, + "step": 1175 + }, + { + "epoch": 2.266024096385542, + "grad_norm": 0.04511453956365585, + "learning_rate": 6.899704343092359e-06, + "loss": 0.0014, + "step": 1176 + }, + { + "epoch": 2.2679518072289158, + "grad_norm": 0.1867951601743698, + "learning_rate": 6.865849543712058e-06, + "loss": 0.009, + "step": 1177 + }, + { + "epoch": 2.269879518072289, + "grad_norm": 0.23791250586509705, + "learning_rate": 6.832060787101658e-06, + "loss": 0.0117, + "step": 1178 + }, + { + "epoch": 2.271807228915663, + "grad_norm": 0.13210316002368927, + "learning_rate": 6.798338243162008e-06, + "loss": 0.0024, + "step": 1179 + }, + { + "epoch": 2.273734939759036, + "grad_norm": 0.1601375937461853, + "learning_rate": 6.764682081461002e-06, + "loss": 0.013, + "step": 1180 + }, + { + "epoch": 2.27566265060241, + "grad_norm": 0.21996766328811646, + "learning_rate": 6.73109247123273e-06, + "loss": 0.0074, + "step": 1181 + }, + { + "epoch": 2.277590361445783, + "grad_norm": 0.15780030190944672, + "learning_rate": 6.6975695813766465e-06, + "loss": 0.0052, + "step": 1182 + }, + { + "epoch": 2.279518072289157, + "grad_norm": 0.18146437406539917, + "learning_rate": 6.664113580456739e-06, + "loss": 0.0265, + "step": 1183 + }, + { + "epoch": 2.28144578313253, + "grad_norm": 0.12033495306968689, + "learning_rate": 6.630724636700618e-06, + "loss": 0.0026, + "step": 1184 + }, + { + "epoch": 2.283373493975904, + "grad_norm": 0.25268155336380005, + "learning_rate": 6.59740291799873e-06, + "loss": 0.0046, + "step": 1185 + }, + { + "epoch": 2.285301204819277, + "grad_norm": 0.19043004512786865, + "learning_rate": 6.564148591903488e-06, + "loss": 0.0063, + "step": 1186 + }, + { + "epoch": 2.2872289156626504, + "grad_norm": 0.06894923001527786, + "learning_rate": 6.530961825628432e-06, + "loss": 0.0012, + "step": 1187 + }, + { + "epoch": 2.289156626506024, + "grad_norm": 0.16378818452358246, + "learning_rate": 6.4978427860474015e-06, + "loss": 0.0048, + "step": 1188 + }, + { + "epoch": 2.2910843373493974, + "grad_norm": 0.11130444705486298, + "learning_rate": 6.464791639693648e-06, + "loss": 0.0049, + "step": 1189 + }, + { + "epoch": 2.293012048192771, + "grad_norm": 0.10573417693376541, + "learning_rate": 6.431808552759083e-06, + "loss": 0.0019, + "step": 1190 + }, + { + "epoch": 2.2949397590361444, + "grad_norm": 0.13344882428646088, + "learning_rate": 6.398893691093367e-06, + "loss": 0.0033, + "step": 1191 + }, + { + "epoch": 2.296867469879518, + "grad_norm": 0.12659135460853577, + "learning_rate": 6.366047220203088e-06, + "loss": 0.0032, + "step": 1192 + }, + { + "epoch": 2.2987951807228915, + "grad_norm": 0.10152821987867355, + "learning_rate": 6.333269305250971e-06, + "loss": 0.0027, + "step": 1193 + }, + { + "epoch": 2.300722891566265, + "grad_norm": 0.1889944225549698, + "learning_rate": 6.300560111055006e-06, + "loss": 0.0062, + "step": 1194 + }, + { + "epoch": 2.3026506024096385, + "grad_norm": 2.3101227283477783, + "learning_rate": 6.2679198020876275e-06, + "loss": 0.0113, + "step": 1195 + }, + { + "epoch": 2.304578313253012, + "grad_norm": 0.6224933862686157, + "learning_rate": 6.235348542474908e-06, + "loss": 0.0273, + "step": 1196 + } + ], + "logging_steps": 1, + "max_steps": 1557, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 92, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.5710541228602819e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1196/training_args.bin b/checkpoint-1196/training_args.bin new file mode 100644 index 0000000..ecc7b6b --- /dev/null +++ b/checkpoint-1196/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342dfb3c86216e436950100c79812c54066d5572c4e9975b0f133c067f061bcf +size 7825 diff --git a/checkpoint-1288/chat_template.jinja b/checkpoint-1288/chat_template.jinja new file mode 100644 index 0000000..1bad6a0 --- /dev/null +++ b/checkpoint-1288/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-1288/config.json b/checkpoint-1288/config.json new file mode 100644 index 0000000..f8bf41e --- /dev/null +++ b/checkpoint-1288/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128009, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.56.2", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-1288/generation_config.json b/checkpoint-1288/generation_config.json new file mode 100644 index 0000000..2152026 --- /dev/null +++ b/checkpoint-1288/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128009, + 128001, + 128008, + 128009 + ], + "pad_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.56.2" +} diff --git a/checkpoint-1288/model.safetensors b/checkpoint-1288/model.safetensors new file mode 100644 index 0000000..cf40da1 --- /dev/null +++ b/checkpoint-1288/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3b69901864cb0e0d60f3fbd7a0ec4180596e6ce58d048ca589a830797ae0a64 +size 2996982344 diff --git a/checkpoint-1288/special_tokens_map.json b/checkpoint-1288/special_tokens_map.json new file mode 100644 index 0000000..14daf45 --- /dev/null +++ b/checkpoint-1288/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/checkpoint-1288/tokenizer.json b/checkpoint-1288/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/checkpoint-1288/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-1288/tokenizer_config.json b/checkpoint-1288/tokenizer_config.json new file mode 100644 index 0000000..d1e1ea9 --- /dev/null +++ b/checkpoint-1288/tokenizer_config.json @@ -0,0 +1,2068 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-1288/trainer_state.json b/checkpoint-1288/trainer_state.json new file mode 100644 index 0000000..6c9824e --- /dev/null +++ b/checkpoint-1288/trainer_state.json @@ -0,0 +1,9050 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.4819277108433733, + "eval_steps": 500, + "global_step": 1288, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0019277108433734939, + "grad_norm": 2.8518834114074707, + "learning_rate": 0.0, + "loss": 0.0891, + "step": 1 + }, + { + "epoch": 0.0038554216867469878, + "grad_norm": 1.8441249132156372, + "learning_rate": 2.564102564102564e-07, + "loss": 0.0539, + "step": 2 + }, + { + "epoch": 0.005783132530120482, + "grad_norm": 2.8263237476348877, + "learning_rate": 5.128205128205128e-07, + "loss": 0.099, + "step": 3 + }, + { + "epoch": 0.0077108433734939755, + "grad_norm": 2.5051236152648926, + "learning_rate": 7.692307692307694e-07, + "loss": 0.0789, + "step": 4 + }, + { + "epoch": 0.00963855421686747, + "grad_norm": 2.6903438568115234, + "learning_rate": 1.0256410256410257e-06, + "loss": 0.0881, + "step": 5 + }, + { + "epoch": 0.011566265060240964, + "grad_norm": 2.6205761432647705, + "learning_rate": 1.282051282051282e-06, + "loss": 0.0776, + "step": 6 + }, + { + "epoch": 0.013493975903614458, + "grad_norm": 2.6309337615966797, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.0827, + "step": 7 + }, + { + "epoch": 0.015421686746987951, + "grad_norm": 1.5427855253219604, + "learning_rate": 1.794871794871795e-06, + "loss": 0.0577, + "step": 8 + }, + { + "epoch": 0.017349397590361446, + "grad_norm": 1.0973446369171143, + "learning_rate": 2.0512820512820513e-06, + "loss": 0.04, + "step": 9 + }, + { + "epoch": 0.01927710843373494, + "grad_norm": 1.3253350257873535, + "learning_rate": 2.307692307692308e-06, + "loss": 0.0506, + "step": 10 + }, + { + "epoch": 0.021204819277108433, + "grad_norm": 1.588739037513733, + "learning_rate": 2.564102564102564e-06, + "loss": 0.0874, + "step": 11 + }, + { + "epoch": 0.02313253012048193, + "grad_norm": 1.4987014532089233, + "learning_rate": 2.8205128205128207e-06, + "loss": 0.0597, + "step": 12 + }, + { + "epoch": 0.02506024096385542, + "grad_norm": 1.6571592092514038, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.0559, + "step": 13 + }, + { + "epoch": 0.026987951807228915, + "grad_norm": 1.8860628604888916, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0688, + "step": 14 + }, + { + "epoch": 0.02891566265060241, + "grad_norm": 1.3202295303344727, + "learning_rate": 3.58974358974359e-06, + "loss": 0.0433, + "step": 15 + }, + { + "epoch": 0.030843373493975902, + "grad_norm": 1.5870612859725952, + "learning_rate": 3.846153846153847e-06, + "loss": 0.0695, + "step": 16 + }, + { + "epoch": 0.0327710843373494, + "grad_norm": 0.9192284345626831, + "learning_rate": 4.102564102564103e-06, + "loss": 0.0392, + "step": 17 + }, + { + "epoch": 0.03469879518072289, + "grad_norm": 0.7950155735015869, + "learning_rate": 4.358974358974359e-06, + "loss": 0.0351, + "step": 18 + }, + { + "epoch": 0.03662650602409639, + "grad_norm": 0.8854314684867859, + "learning_rate": 4.615384615384616e-06, + "loss": 0.0356, + "step": 19 + }, + { + "epoch": 0.03855421686746988, + "grad_norm": 0.9546788930892944, + "learning_rate": 4.871794871794872e-06, + "loss": 0.0427, + "step": 20 + }, + { + "epoch": 0.04048192771084337, + "grad_norm": 0.6315903663635254, + "learning_rate": 5.128205128205128e-06, + "loss": 0.0397, + "step": 21 + }, + { + "epoch": 0.042409638554216866, + "grad_norm": 0.9230924844741821, + "learning_rate": 5.384615384615385e-06, + "loss": 0.0481, + "step": 22 + }, + { + "epoch": 0.04433734939759036, + "grad_norm": 0.711546003818512, + "learning_rate": 5.641025641025641e-06, + "loss": 0.0479, + "step": 23 + }, + { + "epoch": 0.04626506024096386, + "grad_norm": 0.5288046598434448, + "learning_rate": 5.897435897435898e-06, + "loss": 0.0182, + "step": 24 + }, + { + "epoch": 0.04819277108433735, + "grad_norm": 0.9420496225357056, + "learning_rate": 6.153846153846155e-06, + "loss": 0.0389, + "step": 25 + }, + { + "epoch": 0.05012048192771084, + "grad_norm": 0.5001983046531677, + "learning_rate": 6.410256410256412e-06, + "loss": 0.0268, + "step": 26 + }, + { + "epoch": 0.052048192771084335, + "grad_norm": 0.8084653615951538, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0367, + "step": 27 + }, + { + "epoch": 0.05397590361445783, + "grad_norm": 0.7195103764533997, + "learning_rate": 6.923076923076923e-06, + "loss": 0.0251, + "step": 28 + }, + { + "epoch": 0.055903614457831326, + "grad_norm": 0.529958963394165, + "learning_rate": 7.17948717948718e-06, + "loss": 0.0289, + "step": 29 + }, + { + "epoch": 0.05783132530120482, + "grad_norm": 0.795376181602478, + "learning_rate": 7.435897435897437e-06, + "loss": 0.043, + "step": 30 + }, + { + "epoch": 0.059759036144578316, + "grad_norm": 0.6366249918937683, + "learning_rate": 7.692307692307694e-06, + "loss": 0.029, + "step": 31 + }, + { + "epoch": 0.061686746987951804, + "grad_norm": 0.5414115190505981, + "learning_rate": 7.948717948717949e-06, + "loss": 0.0365, + "step": 32 + }, + { + "epoch": 0.0636144578313253, + "grad_norm": 0.9350972175598145, + "learning_rate": 8.205128205128205e-06, + "loss": 0.0283, + "step": 33 + }, + { + "epoch": 0.0655421686746988, + "grad_norm": 0.5660741925239563, + "learning_rate": 8.461538461538462e-06, + "loss": 0.0234, + "step": 34 + }, + { + "epoch": 0.06746987951807229, + "grad_norm": 0.5623988509178162, + "learning_rate": 8.717948717948719e-06, + "loss": 0.0307, + "step": 35 + }, + { + "epoch": 0.06939759036144579, + "grad_norm": 0.5260195732116699, + "learning_rate": 8.974358974358976e-06, + "loss": 0.0264, + "step": 36 + }, + { + "epoch": 0.07132530120481928, + "grad_norm": 0.4934785068035126, + "learning_rate": 9.230769230769232e-06, + "loss": 0.0224, + "step": 37 + }, + { + "epoch": 0.07325301204819278, + "grad_norm": 0.4797322154045105, + "learning_rate": 9.487179487179487e-06, + "loss": 0.0163, + "step": 38 + }, + { + "epoch": 0.07518072289156627, + "grad_norm": 0.4739217460155487, + "learning_rate": 9.743589743589744e-06, + "loss": 0.0165, + "step": 39 + }, + { + "epoch": 0.07710843373493977, + "grad_norm": 0.4527677595615387, + "learning_rate": 1e-05, + "loss": 0.0163, + "step": 40 + }, + { + "epoch": 0.07903614457831325, + "grad_norm": 0.6241316795349121, + "learning_rate": 1.0256410256410256e-05, + "loss": 0.0302, + "step": 41 + }, + { + "epoch": 0.08096385542168674, + "grad_norm": 0.639043927192688, + "learning_rate": 1.0512820512820514e-05, + "loss": 0.0312, + "step": 42 + }, + { + "epoch": 0.08289156626506024, + "grad_norm": 0.5121409296989441, + "learning_rate": 1.076923076923077e-05, + "loss": 0.0256, + "step": 43 + }, + { + "epoch": 0.08481927710843373, + "grad_norm": 0.6340477466583252, + "learning_rate": 1.1025641025641028e-05, + "loss": 0.04, + "step": 44 + }, + { + "epoch": 0.08674698795180723, + "grad_norm": 0.5260409712791443, + "learning_rate": 1.1282051282051283e-05, + "loss": 0.0282, + "step": 45 + }, + { + "epoch": 0.08867469879518072, + "grad_norm": 0.6390711069107056, + "learning_rate": 1.1538461538461538e-05, + "loss": 0.0243, + "step": 46 + }, + { + "epoch": 0.09060240963855422, + "grad_norm": 0.46469295024871826, + "learning_rate": 1.1794871794871796e-05, + "loss": 0.0208, + "step": 47 + }, + { + "epoch": 0.09253012048192771, + "grad_norm": 0.8711516857147217, + "learning_rate": 1.2051282051282051e-05, + "loss": 0.0291, + "step": 48 + }, + { + "epoch": 0.09445783132530121, + "grad_norm": 0.9164300560951233, + "learning_rate": 1.230769230769231e-05, + "loss": 0.0342, + "step": 49 + }, + { + "epoch": 0.0963855421686747, + "grad_norm": 0.5401139259338379, + "learning_rate": 1.2564102564102565e-05, + "loss": 0.0185, + "step": 50 + }, + { + "epoch": 0.0983132530120482, + "grad_norm": 0.44393008947372437, + "learning_rate": 1.2820512820512823e-05, + "loss": 0.0228, + "step": 51 + }, + { + "epoch": 0.10024096385542168, + "grad_norm": 0.3855767846107483, + "learning_rate": 1.3076923076923078e-05, + "loss": 0.0176, + "step": 52 + }, + { + "epoch": 0.10216867469879518, + "grad_norm": 0.8561235070228577, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.0433, + "step": 53 + }, + { + "epoch": 0.10409638554216867, + "grad_norm": 0.768002450466156, + "learning_rate": 1.3589743589743592e-05, + "loss": 0.0245, + "step": 54 + }, + { + "epoch": 0.10602409638554217, + "grad_norm": 0.4559759497642517, + "learning_rate": 1.3846153846153847e-05, + "loss": 0.0224, + "step": 55 + }, + { + "epoch": 0.10795180722891566, + "grad_norm": 0.6203847527503967, + "learning_rate": 1.4102564102564105e-05, + "loss": 0.0296, + "step": 56 + }, + { + "epoch": 0.10987951807228916, + "grad_norm": 0.6651368141174316, + "learning_rate": 1.435897435897436e-05, + "loss": 0.0336, + "step": 57 + }, + { + "epoch": 0.11180722891566265, + "grad_norm": 0.377734512090683, + "learning_rate": 1.4615384615384615e-05, + "loss": 0.0196, + "step": 58 + }, + { + "epoch": 0.11373493975903615, + "grad_norm": 0.687568724155426, + "learning_rate": 1.4871794871794874e-05, + "loss": 0.0207, + "step": 59 + }, + { + "epoch": 0.11566265060240964, + "grad_norm": 0.7905604243278503, + "learning_rate": 1.5128205128205129e-05, + "loss": 0.047, + "step": 60 + }, + { + "epoch": 0.11759036144578314, + "grad_norm": 0.7938196063041687, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.0198, + "step": 61 + }, + { + "epoch": 0.11951807228915663, + "grad_norm": 0.41340553760528564, + "learning_rate": 1.5641025641025644e-05, + "loss": 0.0161, + "step": 62 + }, + { + "epoch": 0.12144578313253013, + "grad_norm": 0.5668172240257263, + "learning_rate": 1.5897435897435897e-05, + "loss": 0.0275, + "step": 63 + }, + { + "epoch": 0.12337349397590361, + "grad_norm": 0.48333367705345154, + "learning_rate": 1.6153846153846154e-05, + "loss": 0.0137, + "step": 64 + }, + { + "epoch": 0.12530120481927712, + "grad_norm": 0.6843933463096619, + "learning_rate": 1.641025641025641e-05, + "loss": 0.0294, + "step": 65 + }, + { + "epoch": 0.1272289156626506, + "grad_norm": 0.7789272665977478, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.0401, + "step": 66 + }, + { + "epoch": 0.1291566265060241, + "grad_norm": 0.6203492879867554, + "learning_rate": 1.6923076923076924e-05, + "loss": 0.0292, + "step": 67 + }, + { + "epoch": 0.1310843373493976, + "grad_norm": 0.5940662622451782, + "learning_rate": 1.717948717948718e-05, + "loss": 0.0178, + "step": 68 + }, + { + "epoch": 0.13301204819277107, + "grad_norm": 0.35504868626594543, + "learning_rate": 1.7435897435897438e-05, + "loss": 0.0129, + "step": 69 + }, + { + "epoch": 0.13493975903614458, + "grad_norm": 0.8796699643135071, + "learning_rate": 1.7692307692307694e-05, + "loss": 0.034, + "step": 70 + }, + { + "epoch": 0.13686746987951806, + "grad_norm": 0.967444896697998, + "learning_rate": 1.794871794871795e-05, + "loss": 0.0266, + "step": 71 + }, + { + "epoch": 0.13879518072289157, + "grad_norm": 0.4428526759147644, + "learning_rate": 1.8205128205128208e-05, + "loss": 0.0223, + "step": 72 + }, + { + "epoch": 0.14072289156626505, + "grad_norm": 0.42897751927375793, + "learning_rate": 1.8461538461538465e-05, + "loss": 0.0187, + "step": 73 + }, + { + "epoch": 0.14265060240963856, + "grad_norm": 0.5100914835929871, + "learning_rate": 1.8717948717948718e-05, + "loss": 0.0164, + "step": 74 + }, + { + "epoch": 0.14457831325301204, + "grad_norm": 0.6028861999511719, + "learning_rate": 1.8974358974358975e-05, + "loss": 0.0164, + "step": 75 + }, + { + "epoch": 0.14650602409638555, + "grad_norm": 0.6187024116516113, + "learning_rate": 1.923076923076923e-05, + "loss": 0.0296, + "step": 76 + }, + { + "epoch": 0.14843373493975903, + "grad_norm": 0.4822489619255066, + "learning_rate": 1.9487179487179488e-05, + "loss": 0.0148, + "step": 77 + }, + { + "epoch": 0.15036144578313254, + "grad_norm": 0.7231149673461914, + "learning_rate": 1.9743589743589745e-05, + "loss": 0.0395, + "step": 78 + }, + { + "epoch": 0.15228915662650602, + "grad_norm": 0.8409642577171326, + "learning_rate": 2e-05, + "loss": 0.0446, + "step": 79 + }, + { + "epoch": 0.15421686746987953, + "grad_norm": 0.4883500039577484, + "learning_rate": 2.025641025641026e-05, + "loss": 0.0206, + "step": 80 + }, + { + "epoch": 0.156144578313253, + "grad_norm": 0.6287479400634766, + "learning_rate": 2.0512820512820512e-05, + "loss": 0.0333, + "step": 81 + }, + { + "epoch": 0.1580722891566265, + "grad_norm": 0.5041632652282715, + "learning_rate": 2.0769230769230772e-05, + "loss": 0.0414, + "step": 82 + }, + { + "epoch": 0.16, + "grad_norm": 0.5103405117988586, + "learning_rate": 2.102564102564103e-05, + "loss": 0.045, + "step": 83 + }, + { + "epoch": 0.16192771084337348, + "grad_norm": 0.493161678314209, + "learning_rate": 2.1282051282051285e-05, + "loss": 0.021, + "step": 84 + }, + { + "epoch": 0.163855421686747, + "grad_norm": 0.908843994140625, + "learning_rate": 2.153846153846154e-05, + "loss": 0.0389, + "step": 85 + }, + { + "epoch": 0.16578313253012048, + "grad_norm": 0.5067003965377808, + "learning_rate": 2.1794871794871795e-05, + "loss": 0.0272, + "step": 86 + }, + { + "epoch": 0.16771084337349398, + "grad_norm": 0.5791381597518921, + "learning_rate": 2.2051282051282056e-05, + "loss": 0.0368, + "step": 87 + }, + { + "epoch": 0.16963855421686747, + "grad_norm": 0.7056036591529846, + "learning_rate": 2.230769230769231e-05, + "loss": 0.0284, + "step": 88 + }, + { + "epoch": 0.17156626506024097, + "grad_norm": 0.6563822031021118, + "learning_rate": 2.2564102564102566e-05, + "loss": 0.0646, + "step": 89 + }, + { + "epoch": 0.17349397590361446, + "grad_norm": 0.9483286142349243, + "learning_rate": 2.2820512820512822e-05, + "loss": 0.0439, + "step": 90 + }, + { + "epoch": 0.17542168674698796, + "grad_norm": 0.370664119720459, + "learning_rate": 2.3076923076923076e-05, + "loss": 0.0109, + "step": 91 + }, + { + "epoch": 0.17734939759036145, + "grad_norm": 0.9776477813720703, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.0458, + "step": 92 + }, + { + "epoch": 0.17927710843373493, + "grad_norm": 0.45710092782974243, + "learning_rate": 2.3589743589743593e-05, + "loss": 0.0212, + "step": 93 + }, + { + "epoch": 0.18120481927710844, + "grad_norm": 0.8623896837234497, + "learning_rate": 2.384615384615385e-05, + "loss": 0.0215, + "step": 94 + }, + { + "epoch": 0.18313253012048192, + "grad_norm": 0.55814528465271, + "learning_rate": 2.4102564102564103e-05, + "loss": 0.0218, + "step": 95 + }, + { + "epoch": 0.18506024096385543, + "grad_norm": 0.49882641434669495, + "learning_rate": 2.435897435897436e-05, + "loss": 0.0268, + "step": 96 + }, + { + "epoch": 0.1869879518072289, + "grad_norm": 0.3508654534816742, + "learning_rate": 2.461538461538462e-05, + "loss": 0.0172, + "step": 97 + }, + { + "epoch": 0.18891566265060242, + "grad_norm": 0.601170003414154, + "learning_rate": 2.4871794871794873e-05, + "loss": 0.0208, + "step": 98 + }, + { + "epoch": 0.1908433734939759, + "grad_norm": 1.1748133897781372, + "learning_rate": 2.512820512820513e-05, + "loss": 0.0259, + "step": 99 + }, + { + "epoch": 0.1927710843373494, + "grad_norm": 0.46370384097099304, + "learning_rate": 2.5384615384615386e-05, + "loss": 0.0242, + "step": 100 + }, + { + "epoch": 0.1946987951807229, + "grad_norm": 0.525010883808136, + "learning_rate": 2.5641025641025646e-05, + "loss": 0.0188, + "step": 101 + }, + { + "epoch": 0.1966265060240964, + "grad_norm": 0.766501784324646, + "learning_rate": 2.58974358974359e-05, + "loss": 0.0584, + "step": 102 + }, + { + "epoch": 0.19855421686746988, + "grad_norm": 0.3572964370250702, + "learning_rate": 2.6153846153846157e-05, + "loss": 0.0131, + "step": 103 + }, + { + "epoch": 0.20048192771084336, + "grad_norm": 0.6467130780220032, + "learning_rate": 2.6410256410256413e-05, + "loss": 0.0231, + "step": 104 + }, + { + "epoch": 0.20240963855421687, + "grad_norm": 1.1852102279663086, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.027, + "step": 105 + }, + { + "epoch": 0.20433734939759035, + "grad_norm": 2.3659932613372803, + "learning_rate": 2.6923076923076927e-05, + "loss": 0.0224, + "step": 106 + }, + { + "epoch": 0.20626506024096386, + "grad_norm": 0.5343687534332275, + "learning_rate": 2.7179487179487183e-05, + "loss": 0.0198, + "step": 107 + }, + { + "epoch": 0.20819277108433734, + "grad_norm": 1.852160096168518, + "learning_rate": 2.7435897435897437e-05, + "loss": 0.032, + "step": 108 + }, + { + "epoch": 0.21012048192771085, + "grad_norm": 0.47291702032089233, + "learning_rate": 2.7692307692307694e-05, + "loss": 0.0117, + "step": 109 + }, + { + "epoch": 0.21204819277108433, + "grad_norm": 0.7623187899589539, + "learning_rate": 2.794871794871795e-05, + "loss": 0.0337, + "step": 110 + }, + { + "epoch": 0.21397590361445784, + "grad_norm": 0.5272570848464966, + "learning_rate": 2.820512820512821e-05, + "loss": 0.0131, + "step": 111 + }, + { + "epoch": 0.21590361445783132, + "grad_norm": 0.5568500757217407, + "learning_rate": 2.8461538461538464e-05, + "loss": 0.0233, + "step": 112 + }, + { + "epoch": 0.21783132530120483, + "grad_norm": 0.4008469879627228, + "learning_rate": 2.871794871794872e-05, + "loss": 0.0204, + "step": 113 + }, + { + "epoch": 0.2197590361445783, + "grad_norm": 0.4888612926006317, + "learning_rate": 2.8974358974358977e-05, + "loss": 0.016, + "step": 114 + }, + { + "epoch": 0.2216867469879518, + "grad_norm": 0.44903355836868286, + "learning_rate": 2.923076923076923e-05, + "loss": 0.0135, + "step": 115 + }, + { + "epoch": 0.2236144578313253, + "grad_norm": 0.9266762733459473, + "learning_rate": 2.948717948717949e-05, + "loss": 0.0233, + "step": 116 + }, + { + "epoch": 0.22554216867469878, + "grad_norm": 0.5352638959884644, + "learning_rate": 2.9743589743589747e-05, + "loss": 0.0198, + "step": 117 + }, + { + "epoch": 0.2274698795180723, + "grad_norm": 0.6051343679428101, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0246, + "step": 118 + }, + { + "epoch": 0.22939759036144577, + "grad_norm": 0.9971133470535278, + "learning_rate": 3.0256410256410257e-05, + "loss": 0.025, + "step": 119 + }, + { + "epoch": 0.23132530120481928, + "grad_norm": 0.704236626625061, + "learning_rate": 3.0512820512820514e-05, + "loss": 0.031, + "step": 120 + }, + { + "epoch": 0.23325301204819276, + "grad_norm": 0.6137097477912903, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.0519, + "step": 121 + }, + { + "epoch": 0.23518072289156627, + "grad_norm": 0.7396159768104553, + "learning_rate": 3.102564102564103e-05, + "loss": 0.0325, + "step": 122 + }, + { + "epoch": 0.23710843373493976, + "grad_norm": 1.3282053470611572, + "learning_rate": 3.128205128205129e-05, + "loss": 0.0252, + "step": 123 + }, + { + "epoch": 0.23903614457831326, + "grad_norm": 0.5220731496810913, + "learning_rate": 3.153846153846154e-05, + "loss": 0.0262, + "step": 124 + }, + { + "epoch": 0.24096385542168675, + "grad_norm": 0.5357242822647095, + "learning_rate": 3.1794871794871795e-05, + "loss": 0.0243, + "step": 125 + }, + { + "epoch": 0.24289156626506025, + "grad_norm": 0.48207753896713257, + "learning_rate": 3.205128205128206e-05, + "loss": 0.0178, + "step": 126 + }, + { + "epoch": 0.24481927710843374, + "grad_norm": 0.552988588809967, + "learning_rate": 3.230769230769231e-05, + "loss": 0.023, + "step": 127 + }, + { + "epoch": 0.24674698795180722, + "grad_norm": 1.7962840795516968, + "learning_rate": 3.2564102564102565e-05, + "loss": 0.032, + "step": 128 + }, + { + "epoch": 0.24867469879518073, + "grad_norm": 1.6404600143432617, + "learning_rate": 3.282051282051282e-05, + "loss": 0.0231, + "step": 129 + }, + { + "epoch": 0.25060240963855424, + "grad_norm": 0.39142486453056335, + "learning_rate": 3.307692307692308e-05, + "loss": 0.0147, + "step": 130 + }, + { + "epoch": 0.2525301204819277, + "grad_norm": 1.3272887468338013, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0439, + "step": 131 + }, + { + "epoch": 0.2544578313253012, + "grad_norm": 1.5122811794281006, + "learning_rate": 3.358974358974359e-05, + "loss": 0.0282, + "step": 132 + }, + { + "epoch": 0.2563855421686747, + "grad_norm": 1.8542430400848389, + "learning_rate": 3.384615384615385e-05, + "loss": 0.0515, + "step": 133 + }, + { + "epoch": 0.2583132530120482, + "grad_norm": 4.059277534484863, + "learning_rate": 3.4102564102564105e-05, + "loss": 0.0781, + "step": 134 + }, + { + "epoch": 0.26024096385542167, + "grad_norm": 0.6206214427947998, + "learning_rate": 3.435897435897436e-05, + "loss": 0.0306, + "step": 135 + }, + { + "epoch": 0.2621686746987952, + "grad_norm": 0.4575510323047638, + "learning_rate": 3.461538461538462e-05, + "loss": 0.0154, + "step": 136 + }, + { + "epoch": 0.2640963855421687, + "grad_norm": 1.1556978225708008, + "learning_rate": 3.4871794871794875e-05, + "loss": 0.0235, + "step": 137 + }, + { + "epoch": 0.26602409638554214, + "grad_norm": 0.6975051760673523, + "learning_rate": 3.512820512820513e-05, + "loss": 0.0453, + "step": 138 + }, + { + "epoch": 0.26795180722891565, + "grad_norm": 0.8686623573303223, + "learning_rate": 3.538461538461539e-05, + "loss": 0.0427, + "step": 139 + }, + { + "epoch": 0.26987951807228916, + "grad_norm": 2.0681848526000977, + "learning_rate": 3.5641025641025646e-05, + "loss": 0.04, + "step": 140 + }, + { + "epoch": 0.27180722891566267, + "grad_norm": 0.4397984445095062, + "learning_rate": 3.58974358974359e-05, + "loss": 0.0188, + "step": 141 + }, + { + "epoch": 0.2737349397590361, + "grad_norm": 0.5871334075927734, + "learning_rate": 3.615384615384616e-05, + "loss": 0.0253, + "step": 142 + }, + { + "epoch": 0.27566265060240963, + "grad_norm": 1.1078568696975708, + "learning_rate": 3.6410256410256416e-05, + "loss": 0.0316, + "step": 143 + }, + { + "epoch": 0.27759036144578314, + "grad_norm": 0.5691841840744019, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.0266, + "step": 144 + }, + { + "epoch": 0.27951807228915665, + "grad_norm": 0.7896255254745483, + "learning_rate": 3.692307692307693e-05, + "loss": 0.0281, + "step": 145 + }, + { + "epoch": 0.2814457831325301, + "grad_norm": 0.9988337159156799, + "learning_rate": 3.7179487179487186e-05, + "loss": 0.0295, + "step": 146 + }, + { + "epoch": 0.2833734939759036, + "grad_norm": 0.9811834692955017, + "learning_rate": 3.7435897435897436e-05, + "loss": 0.0322, + "step": 147 + }, + { + "epoch": 0.2853012048192771, + "grad_norm": 0.6503105759620667, + "learning_rate": 3.769230769230769e-05, + "loss": 0.0266, + "step": 148 + }, + { + "epoch": 0.28722891566265063, + "grad_norm": 1.9164355993270874, + "learning_rate": 3.794871794871795e-05, + "loss": 0.0677, + "step": 149 + }, + { + "epoch": 0.2891566265060241, + "grad_norm": 1.1724557876586914, + "learning_rate": 3.820512820512821e-05, + "loss": 0.0324, + "step": 150 + }, + { + "epoch": 0.2910843373493976, + "grad_norm": 0.8482469916343689, + "learning_rate": 3.846153846153846e-05, + "loss": 0.0259, + "step": 151 + }, + { + "epoch": 0.2930120481927711, + "grad_norm": 0.8572830557823181, + "learning_rate": 3.871794871794872e-05, + "loss": 0.0358, + "step": 152 + }, + { + "epoch": 0.29493975903614456, + "grad_norm": 0.6630825400352478, + "learning_rate": 3.8974358974358976e-05, + "loss": 0.0447, + "step": 153 + }, + { + "epoch": 0.29686746987951806, + "grad_norm": 0.9197093844413757, + "learning_rate": 3.923076923076923e-05, + "loss": 0.0409, + "step": 154 + }, + { + "epoch": 0.2987951807228916, + "grad_norm": 0.6976819634437561, + "learning_rate": 3.948717948717949e-05, + "loss": 0.0317, + "step": 155 + }, + { + "epoch": 0.3007228915662651, + "grad_norm": 0.7353514432907104, + "learning_rate": 3.9743589743589747e-05, + "loss": 0.0306, + "step": 156 + }, + { + "epoch": 0.30265060240963854, + "grad_norm": 0.5730232000350952, + "learning_rate": 4e-05, + "loss": 0.0324, + "step": 157 + }, + { + "epoch": 0.30457831325301205, + "grad_norm": 0.7852078676223755, + "learning_rate": 3.999994971675547e-05, + "loss": 0.0354, + "step": 158 + }, + { + "epoch": 0.30650602409638555, + "grad_norm": 0.5924715399742126, + "learning_rate": 3.999979886727471e-05, + "loss": 0.0366, + "step": 159 + }, + { + "epoch": 0.30843373493975906, + "grad_norm": 0.7359845638275146, + "learning_rate": 3.999954745231624e-05, + "loss": 0.0437, + "step": 160 + }, + { + "epoch": 0.3103614457831325, + "grad_norm": 0.7866976857185364, + "learning_rate": 3.999919547314426e-05, + "loss": 0.0363, + "step": 161 + }, + { + "epoch": 0.312289156626506, + "grad_norm": 0.7425745129585266, + "learning_rate": 3.999874293152863e-05, + "loss": 0.0259, + "step": 162 + }, + { + "epoch": 0.31421686746987953, + "grad_norm": 1.8922245502471924, + "learning_rate": 3.9998189829744885e-05, + "loss": 0.0341, + "step": 163 + }, + { + "epoch": 0.316144578313253, + "grad_norm": 0.7908634543418884, + "learning_rate": 3.99975361705742e-05, + "loss": 0.0424, + "step": 164 + }, + { + "epoch": 0.3180722891566265, + "grad_norm": 2.047368049621582, + "learning_rate": 3.999678195730337e-05, + "loss": 0.0535, + "step": 165 + }, + { + "epoch": 0.32, + "grad_norm": 0.5702639222145081, + "learning_rate": 3.999592719372484e-05, + "loss": 0.0284, + "step": 166 + }, + { + "epoch": 0.3219277108433735, + "grad_norm": 0.45015648007392883, + "learning_rate": 3.9994971884136636e-05, + "loss": 0.0313, + "step": 167 + }, + { + "epoch": 0.32385542168674697, + "grad_norm": 4.094679355621338, + "learning_rate": 3.9993916033342355e-05, + "loss": 0.0524, + "step": 168 + }, + { + "epoch": 0.3257831325301205, + "grad_norm": 0.800846517086029, + "learning_rate": 3.999275964665117e-05, + "loss": 0.0282, + "step": 169 + }, + { + "epoch": 0.327710843373494, + "grad_norm": 0.47881078720092773, + "learning_rate": 3.999150272987776e-05, + "loss": 0.0293, + "step": 170 + }, + { + "epoch": 0.3296385542168675, + "grad_norm": 0.5716657638549805, + "learning_rate": 3.999014528934232e-05, + "loss": 0.0221, + "step": 171 + }, + { + "epoch": 0.33156626506024095, + "grad_norm": 0.6333311200141907, + "learning_rate": 3.998868733187048e-05, + "loss": 0.0302, + "step": 172 + }, + { + "epoch": 0.33349397590361446, + "grad_norm": 6.642521858215332, + "learning_rate": 3.998712886479335e-05, + "loss": 0.0364, + "step": 173 + }, + { + "epoch": 0.33542168674698797, + "grad_norm": 0.7515506148338318, + "learning_rate": 3.998546989594739e-05, + "loss": 0.0296, + "step": 174 + }, + { + "epoch": 0.3373493975903614, + "grad_norm": 1.0728015899658203, + "learning_rate": 3.998371043367445e-05, + "loss": 0.0549, + "step": 175 + }, + { + "epoch": 0.33927710843373493, + "grad_norm": 1.3025579452514648, + "learning_rate": 3.998185048682166e-05, + "loss": 0.0577, + "step": 176 + }, + { + "epoch": 0.34120481927710844, + "grad_norm": 1.0962958335876465, + "learning_rate": 3.997989006474144e-05, + "loss": 0.0313, + "step": 177 + }, + { + "epoch": 0.34313253012048195, + "grad_norm": 0.7064313292503357, + "learning_rate": 3.997782917729143e-05, + "loss": 0.0309, + "step": 178 + }, + { + "epoch": 0.3450602409638554, + "grad_norm": 0.43374207615852356, + "learning_rate": 3.997566783483445e-05, + "loss": 0.0166, + "step": 179 + }, + { + "epoch": 0.3469879518072289, + "grad_norm": 0.7236390113830566, + "learning_rate": 3.9973406048238413e-05, + "loss": 0.0254, + "step": 180 + }, + { + "epoch": 0.3489156626506024, + "grad_norm": 0.5041500926017761, + "learning_rate": 3.9971043828876334e-05, + "loss": 0.0239, + "step": 181 + }, + { + "epoch": 0.35084337349397593, + "grad_norm": 1.2744532823562622, + "learning_rate": 3.9968581188626204e-05, + "loss": 0.0404, + "step": 182 + }, + { + "epoch": 0.3527710843373494, + "grad_norm": 0.45845362544059753, + "learning_rate": 3.996601813987098e-05, + "loss": 0.0127, + "step": 183 + }, + { + "epoch": 0.3546987951807229, + "grad_norm": 0.4426881968975067, + "learning_rate": 3.996335469549852e-05, + "loss": 0.0176, + "step": 184 + }, + { + "epoch": 0.3566265060240964, + "grad_norm": 1.0030732154846191, + "learning_rate": 3.9960590868901465e-05, + "loss": 0.0457, + "step": 185 + }, + { + "epoch": 0.35855421686746985, + "grad_norm": 0.6428582668304443, + "learning_rate": 3.995772667397725e-05, + "loss": 0.0271, + "step": 186 + }, + { + "epoch": 0.36048192771084336, + "grad_norm": 0.5335744619369507, + "learning_rate": 3.995476212512795e-05, + "loss": 0.0297, + "step": 187 + }, + { + "epoch": 0.3624096385542169, + "grad_norm": 0.6995761394500732, + "learning_rate": 3.99516972372603e-05, + "loss": 0.0322, + "step": 188 + }, + { + "epoch": 0.3643373493975904, + "grad_norm": 0.765511155128479, + "learning_rate": 3.9948532025785546e-05, + "loss": 0.0253, + "step": 189 + }, + { + "epoch": 0.36626506024096384, + "grad_norm": 0.6165828108787537, + "learning_rate": 3.9945266506619403e-05, + "loss": 0.0355, + "step": 190 + }, + { + "epoch": 0.36819277108433734, + "grad_norm": 0.851970911026001, + "learning_rate": 3.994190069618195e-05, + "loss": 0.056, + "step": 191 + }, + { + "epoch": 0.37012048192771085, + "grad_norm": 0.9850023984909058, + "learning_rate": 3.993843461139757e-05, + "loss": 0.0415, + "step": 192 + }, + { + "epoch": 0.37204819277108436, + "grad_norm": 0.7455295324325562, + "learning_rate": 3.9934868269694886e-05, + "loss": 0.0379, + "step": 193 + }, + { + "epoch": 0.3739759036144578, + "grad_norm": 1.159469723701477, + "learning_rate": 3.9931201689006595e-05, + "loss": 0.0237, + "step": 194 + }, + { + "epoch": 0.3759036144578313, + "grad_norm": 0.5490080118179321, + "learning_rate": 3.992743488776947e-05, + "loss": 0.024, + "step": 195 + }, + { + "epoch": 0.37783132530120483, + "grad_norm": 1.279831886291504, + "learning_rate": 3.992356788492421e-05, + "loss": 0.0273, + "step": 196 + }, + { + "epoch": 0.3797590361445783, + "grad_norm": 0.859104335308075, + "learning_rate": 3.9919600699915355e-05, + "loss": 0.0411, + "step": 197 + }, + { + "epoch": 0.3816867469879518, + "grad_norm": 1.2525300979614258, + "learning_rate": 3.991553335269119e-05, + "loss": 0.0857, + "step": 198 + }, + { + "epoch": 0.3836144578313253, + "grad_norm": 0.4924193024635315, + "learning_rate": 3.991136586370367e-05, + "loss": 0.0294, + "step": 199 + }, + { + "epoch": 0.3855421686746988, + "grad_norm": 1.417190670967102, + "learning_rate": 3.990709825390828e-05, + "loss": 0.0395, + "step": 200 + }, + { + "epoch": 0.38746987951807227, + "grad_norm": 0.6172056198120117, + "learning_rate": 3.9902730544763936e-05, + "loss": 0.0194, + "step": 201 + }, + { + "epoch": 0.3893975903614458, + "grad_norm": 0.7292149662971497, + "learning_rate": 3.989826275823291e-05, + "loss": 0.0381, + "step": 202 + }, + { + "epoch": 0.3913253012048193, + "grad_norm": 0.5949816107749939, + "learning_rate": 3.989369491678067e-05, + "loss": 0.0254, + "step": 203 + }, + { + "epoch": 0.3932530120481928, + "grad_norm": 0.6012582182884216, + "learning_rate": 3.988902704337582e-05, + "loss": 0.048, + "step": 204 + }, + { + "epoch": 0.39518072289156625, + "grad_norm": 0.6273590922355652, + "learning_rate": 3.9884259161489936e-05, + "loss": 0.0268, + "step": 205 + }, + { + "epoch": 0.39710843373493976, + "grad_norm": 0.9615244269371033, + "learning_rate": 3.987939129509746e-05, + "loss": 0.0192, + "step": 206 + }, + { + "epoch": 0.39903614457831327, + "grad_norm": 0.6009241342544556, + "learning_rate": 3.9874423468675624e-05, + "loss": 0.0362, + "step": 207 + }, + { + "epoch": 0.4009638554216867, + "grad_norm": 0.411335289478302, + "learning_rate": 3.9869355707204266e-05, + "loss": 0.017, + "step": 208 + }, + { + "epoch": 0.40289156626506023, + "grad_norm": 0.6151527166366577, + "learning_rate": 3.986418803616573e-05, + "loss": 0.0283, + "step": 209 + }, + { + "epoch": 0.40481927710843374, + "grad_norm": 0.33808204531669617, + "learning_rate": 3.985892048154474e-05, + "loss": 0.0158, + "step": 210 + }, + { + "epoch": 0.40674698795180725, + "grad_norm": 0.5464187860488892, + "learning_rate": 3.9853553069828284e-05, + "loss": 0.0292, + "step": 211 + }, + { + "epoch": 0.4086746987951807, + "grad_norm": 0.6658390760421753, + "learning_rate": 3.984808582800543e-05, + "loss": 0.0281, + "step": 212 + }, + { + "epoch": 0.4106024096385542, + "grad_norm": 0.4253764748573303, + "learning_rate": 3.984251878356726e-05, + "loss": 0.031, + "step": 213 + }, + { + "epoch": 0.4125301204819277, + "grad_norm": 0.32309481501579285, + "learning_rate": 3.983685196450667e-05, + "loss": 0.0166, + "step": 214 + }, + { + "epoch": 0.41445783132530123, + "grad_norm": 0.43756410479545593, + "learning_rate": 3.9831085399318265e-05, + "loss": 0.0326, + "step": 215 + }, + { + "epoch": 0.4163855421686747, + "grad_norm": 0.264046847820282, + "learning_rate": 3.982521911699822e-05, + "loss": 0.0118, + "step": 216 + }, + { + "epoch": 0.4183132530120482, + "grad_norm": 0.8630897402763367, + "learning_rate": 3.9819253147044084e-05, + "loss": 0.0246, + "step": 217 + }, + { + "epoch": 0.4202409638554217, + "grad_norm": 0.6923379898071289, + "learning_rate": 3.98131875194547e-05, + "loss": 0.036, + "step": 218 + }, + { + "epoch": 0.42216867469879515, + "grad_norm": 0.5874778628349304, + "learning_rate": 3.9807022264730024e-05, + "loss": 0.0255, + "step": 219 + }, + { + "epoch": 0.42409638554216866, + "grad_norm": 0.394336074590683, + "learning_rate": 3.980075741387094e-05, + "loss": 0.0187, + "step": 220 + }, + { + "epoch": 0.4260240963855422, + "grad_norm": 0.6300327777862549, + "learning_rate": 3.979439299837915e-05, + "loss": 0.0214, + "step": 221 + }, + { + "epoch": 0.4279518072289157, + "grad_norm": 0.5200467109680176, + "learning_rate": 3.978792905025702e-05, + "loss": 0.0628, + "step": 222 + }, + { + "epoch": 0.42987951807228914, + "grad_norm": 0.5713880062103271, + "learning_rate": 3.978136560200735e-05, + "loss": 0.0302, + "step": 223 + }, + { + "epoch": 0.43180722891566264, + "grad_norm": 0.5345383286476135, + "learning_rate": 3.977470268663331e-05, + "loss": 0.0125, + "step": 224 + }, + { + "epoch": 0.43373493975903615, + "grad_norm": 0.5378350019454956, + "learning_rate": 3.976794033763819e-05, + "loss": 0.0246, + "step": 225 + }, + { + "epoch": 0.43566265060240966, + "grad_norm": 0.5554935336112976, + "learning_rate": 3.9761078589025276e-05, + "loss": 0.0212, + "step": 226 + }, + { + "epoch": 0.4375903614457831, + "grad_norm": 0.2832634747028351, + "learning_rate": 3.9754117475297664e-05, + "loss": 0.0125, + "step": 227 + }, + { + "epoch": 0.4395180722891566, + "grad_norm": 1.2910150289535522, + "learning_rate": 3.97470570314581e-05, + "loss": 0.0364, + "step": 228 + }, + { + "epoch": 0.44144578313253013, + "grad_norm": 0.3731018602848053, + "learning_rate": 3.973989729300878e-05, + "loss": 0.0128, + "step": 229 + }, + { + "epoch": 0.4433734939759036, + "grad_norm": 0.9433871507644653, + "learning_rate": 3.9732638295951195e-05, + "loss": 0.0367, + "step": 230 + }, + { + "epoch": 0.4453012048192771, + "grad_norm": 1.0779197216033936, + "learning_rate": 3.972528007678594e-05, + "loss": 0.0667, + "step": 231 + }, + { + "epoch": 0.4472289156626506, + "grad_norm": 1.7009105682373047, + "learning_rate": 3.9717822672512516e-05, + "loss": 0.0655, + "step": 232 + }, + { + "epoch": 0.4491566265060241, + "grad_norm": 0.5646032094955444, + "learning_rate": 3.971026612062919e-05, + "loss": 0.064, + "step": 233 + }, + { + "epoch": 0.45108433734939757, + "grad_norm": 0.44474121928215027, + "learning_rate": 3.970261045913274e-05, + "loss": 0.0206, + "step": 234 + }, + { + "epoch": 0.4530120481927711, + "grad_norm": 1.3969277143478394, + "learning_rate": 3.969485572651833e-05, + "loss": 0.0486, + "step": 235 + }, + { + "epoch": 0.4549397590361446, + "grad_norm": 0.6401994228363037, + "learning_rate": 3.968700196177925e-05, + "loss": 0.0262, + "step": 236 + }, + { + "epoch": 0.4568674698795181, + "grad_norm": 0.7091913223266602, + "learning_rate": 3.96790492044068e-05, + "loss": 0.014, + "step": 237 + }, + { + "epoch": 0.45879518072289155, + "grad_norm": 0.6561547517776489, + "learning_rate": 3.967099749439002e-05, + "loss": 0.0482, + "step": 238 + }, + { + "epoch": 0.46072289156626506, + "grad_norm": 0.6924155354499817, + "learning_rate": 3.966284687221551e-05, + "loss": 0.0289, + "step": 239 + }, + { + "epoch": 0.46265060240963857, + "grad_norm": 0.5868663787841797, + "learning_rate": 3.9654597378867256e-05, + "loss": 0.0331, + "step": 240 + }, + { + "epoch": 0.464578313253012, + "grad_norm": 0.7930939793586731, + "learning_rate": 3.964624905582637e-05, + "loss": 0.0925, + "step": 241 + }, + { + "epoch": 0.46650602409638553, + "grad_norm": 0.4888836145401001, + "learning_rate": 3.9637801945070944e-05, + "loss": 0.015, + "step": 242 + }, + { + "epoch": 0.46843373493975904, + "grad_norm": 0.7820287346839905, + "learning_rate": 3.962925608907579e-05, + "loss": 0.0382, + "step": 243 + }, + { + "epoch": 0.47036144578313255, + "grad_norm": 0.4914316236972809, + "learning_rate": 3.962061153081224e-05, + "loss": 0.0257, + "step": 244 + }, + { + "epoch": 0.472289156626506, + "grad_norm": 0.5681505799293518, + "learning_rate": 3.961186831374793e-05, + "loss": 0.0551, + "step": 245 + }, + { + "epoch": 0.4742168674698795, + "grad_norm": 0.5049723386764526, + "learning_rate": 3.9603026481846616e-05, + "loss": 0.0186, + "step": 246 + }, + { + "epoch": 0.476144578313253, + "grad_norm": 0.5034119486808777, + "learning_rate": 3.959408607956787e-05, + "loss": 0.024, + "step": 247 + }, + { + "epoch": 0.47807228915662653, + "grad_norm": 0.4543336033821106, + "learning_rate": 3.958504715186695e-05, + "loss": 0.0256, + "step": 248 + }, + { + "epoch": 0.48, + "grad_norm": 0.5595743656158447, + "learning_rate": 3.957590974419452e-05, + "loss": 0.0222, + "step": 249 + }, + { + "epoch": 0.4819277108433735, + "grad_norm": 0.5701581239700317, + "learning_rate": 3.956667390249642e-05, + "loss": 0.0334, + "step": 250 + }, + { + "epoch": 0.483855421686747, + "grad_norm": 0.53755784034729, + "learning_rate": 3.9557339673213474e-05, + "loss": 0.0345, + "step": 251 + }, + { + "epoch": 0.4857831325301205, + "grad_norm": 0.4368877112865448, + "learning_rate": 3.95479071032812e-05, + "loss": 0.0183, + "step": 252 + }, + { + "epoch": 0.48771084337349396, + "grad_norm": 0.7972906827926636, + "learning_rate": 3.953837624012963e-05, + "loss": 0.0337, + "step": 253 + }, + { + "epoch": 0.48963855421686747, + "grad_norm": 0.6148451566696167, + "learning_rate": 3.9528747131683023e-05, + "loss": 0.0524, + "step": 254 + }, + { + "epoch": 0.491566265060241, + "grad_norm": 0.500840961933136, + "learning_rate": 3.9519019826359676e-05, + "loss": 0.0248, + "step": 255 + }, + { + "epoch": 0.49349397590361443, + "grad_norm": 0.5536255240440369, + "learning_rate": 3.9509194373071624e-05, + "loss": 0.0219, + "step": 256 + }, + { + "epoch": 0.49542168674698794, + "grad_norm": 0.6873176097869873, + "learning_rate": 3.9499270821224444e-05, + "loss": 0.0312, + "step": 257 + }, + { + "epoch": 0.49734939759036145, + "grad_norm": 0.37207168340682983, + "learning_rate": 3.9489249220716974e-05, + "loss": 0.0149, + "step": 258 + }, + { + "epoch": 0.49927710843373496, + "grad_norm": 0.4458799660205841, + "learning_rate": 3.947912962194107e-05, + "loss": 0.0214, + "step": 259 + }, + { + "epoch": 0.5012048192771085, + "grad_norm": 0.4272724390029907, + "learning_rate": 3.9468912075781345e-05, + "loss": 0.0263, + "step": 260 + }, + { + "epoch": 0.503132530120482, + "grad_norm": 0.5245792269706726, + "learning_rate": 3.945859663361496e-05, + "loss": 0.0103, + "step": 261 + }, + { + "epoch": 0.5050602409638554, + "grad_norm": 0.8799260854721069, + "learning_rate": 3.9448183347311284e-05, + "loss": 0.0292, + "step": 262 + }, + { + "epoch": 0.5069879518072289, + "grad_norm": 0.5996833443641663, + "learning_rate": 3.943767226923171e-05, + "loss": 0.0306, + "step": 263 + }, + { + "epoch": 0.5089156626506024, + "grad_norm": 0.6044682860374451, + "learning_rate": 3.942706345222935e-05, + "loss": 0.0218, + "step": 264 + }, + { + "epoch": 0.5108433734939759, + "grad_norm": 0.4770200848579407, + "learning_rate": 3.941635694964878e-05, + "loss": 0.0226, + "step": 265 + }, + { + "epoch": 0.5127710843373494, + "grad_norm": 0.5605704188346863, + "learning_rate": 3.940555281532576e-05, + "loss": 0.0354, + "step": 266 + }, + { + "epoch": 0.5146987951807229, + "grad_norm": 0.46532443165779114, + "learning_rate": 3.939465110358699e-05, + "loss": 0.0223, + "step": 267 + }, + { + "epoch": 0.5166265060240964, + "grad_norm": 0.5190595388412476, + "learning_rate": 3.93836518692498e-05, + "loss": 0.0219, + "step": 268 + }, + { + "epoch": 0.5185542168674698, + "grad_norm": 0.5767757892608643, + "learning_rate": 3.937255516762193e-05, + "loss": 0.0294, + "step": 269 + }, + { + "epoch": 0.5204819277108433, + "grad_norm": 0.4543164372444153, + "learning_rate": 3.936136105450119e-05, + "loss": 0.0244, + "step": 270 + }, + { + "epoch": 0.5224096385542168, + "grad_norm": 0.4155154526233673, + "learning_rate": 3.9350069586175195e-05, + "loss": 0.02, + "step": 271 + }, + { + "epoch": 0.5243373493975904, + "grad_norm": 0.5470768213272095, + "learning_rate": 3.933868081942113e-05, + "loss": 0.0187, + "step": 272 + }, + { + "epoch": 0.5262650602409639, + "grad_norm": 0.9491772651672363, + "learning_rate": 3.9327194811505406e-05, + "loss": 0.0337, + "step": 273 + }, + { + "epoch": 0.5281927710843374, + "grad_norm": 0.9313873052597046, + "learning_rate": 3.93156116201834e-05, + "loss": 0.0573, + "step": 274 + }, + { + "epoch": 0.5301204819277109, + "grad_norm": 0.7181005477905273, + "learning_rate": 3.930393130369915e-05, + "loss": 0.0405, + "step": 275 + }, + { + "epoch": 0.5320481927710843, + "grad_norm": 0.34231385588645935, + "learning_rate": 3.9292153920785076e-05, + "loss": 0.0153, + "step": 276 + }, + { + "epoch": 0.5339759036144578, + "grad_norm": 0.6899610161781311, + "learning_rate": 3.928027953066168e-05, + "loss": 0.0338, + "step": 277 + }, + { + "epoch": 0.5359036144578313, + "grad_norm": 0.7509781718254089, + "learning_rate": 3.926830819303726e-05, + "loss": 0.0416, + "step": 278 + }, + { + "epoch": 0.5378313253012048, + "grad_norm": 0.6326774954795837, + "learning_rate": 3.925623996810757e-05, + "loss": 0.0293, + "step": 279 + }, + { + "epoch": 0.5397590361445783, + "grad_norm": 0.5543203353881836, + "learning_rate": 3.924407491655557e-05, + "loss": 0.0263, + "step": 280 + }, + { + "epoch": 0.5416867469879518, + "grad_norm": 0.5367572903633118, + "learning_rate": 3.9231813099551086e-05, + "loss": 0.0276, + "step": 281 + }, + { + "epoch": 0.5436144578313253, + "grad_norm": 0.3143869638442993, + "learning_rate": 3.921945457875051e-05, + "loss": 0.0146, + "step": 282 + }, + { + "epoch": 0.5455421686746988, + "grad_norm": 0.47403043508529663, + "learning_rate": 3.920699941629649e-05, + "loss": 0.0267, + "step": 283 + }, + { + "epoch": 0.5474698795180722, + "grad_norm": 0.5082595348358154, + "learning_rate": 3.919444767481763e-05, + "loss": 0.0183, + "step": 284 + }, + { + "epoch": 0.5493975903614458, + "grad_norm": 0.747949481010437, + "learning_rate": 3.918179941742816e-05, + "loss": 0.0412, + "step": 285 + }, + { + "epoch": 0.5513253012048193, + "grad_norm": 0.6553886532783508, + "learning_rate": 3.916905470772762e-05, + "loss": 0.0505, + "step": 286 + }, + { + "epoch": 0.5532530120481928, + "grad_norm": 0.3838176131248474, + "learning_rate": 3.9156213609800545e-05, + "loss": 0.0156, + "step": 287 + }, + { + "epoch": 0.5551807228915663, + "grad_norm": 0.7427731156349182, + "learning_rate": 3.914327618821614e-05, + "loss": 0.0278, + "step": 288 + }, + { + "epoch": 0.5571084337349398, + "grad_norm": 0.2612821161746979, + "learning_rate": 3.913024250802796e-05, + "loss": 0.0101, + "step": 289 + }, + { + "epoch": 0.5590361445783133, + "grad_norm": 0.3799416124820709, + "learning_rate": 3.911711263477357e-05, + "loss": 0.0168, + "step": 290 + }, + { + "epoch": 0.5609638554216867, + "grad_norm": 0.5053854584693909, + "learning_rate": 3.910388663447425e-05, + "loss": 0.0249, + "step": 291 + }, + { + "epoch": 0.5628915662650602, + "grad_norm": 0.38095012307167053, + "learning_rate": 3.909056457363461e-05, + "loss": 0.0156, + "step": 292 + }, + { + "epoch": 0.5648192771084337, + "grad_norm": 0.4477892220020294, + "learning_rate": 3.907714651924229e-05, + "loss": 0.0309, + "step": 293 + }, + { + "epoch": 0.5667469879518072, + "grad_norm": 0.5875864624977112, + "learning_rate": 3.906363253876763e-05, + "loss": 0.0287, + "step": 294 + }, + { + "epoch": 0.5686746987951807, + "grad_norm": 0.522990882396698, + "learning_rate": 3.90500227001633e-05, + "loss": 0.0318, + "step": 295 + }, + { + "epoch": 0.5706024096385542, + "grad_norm": 0.4153876304626465, + "learning_rate": 3.9036317071863994e-05, + "loss": 0.0192, + "step": 296 + }, + { + "epoch": 0.5725301204819278, + "grad_norm": 0.4675769507884979, + "learning_rate": 3.902251572278605e-05, + "loss": 0.067, + "step": 297 + }, + { + "epoch": 0.5744578313253013, + "grad_norm": 0.35778650641441345, + "learning_rate": 3.900861872232713e-05, + "loss": 0.0197, + "step": 298 + }, + { + "epoch": 0.5763855421686747, + "grad_norm": 0.7382330894470215, + "learning_rate": 3.899462614036587e-05, + "loss": 0.0283, + "step": 299 + }, + { + "epoch": 0.5783132530120482, + "grad_norm": 0.41268599033355713, + "learning_rate": 3.89805380472615e-05, + "loss": 0.0207, + "step": 300 + }, + { + "epoch": 0.5802409638554217, + "grad_norm": 1.2013020515441895, + "learning_rate": 3.8966354513853535e-05, + "loss": 0.0301, + "step": 301 + }, + { + "epoch": 0.5821686746987952, + "grad_norm": 0.424757719039917, + "learning_rate": 3.895207561146137e-05, + "loss": 0.022, + "step": 302 + }, + { + "epoch": 0.5840963855421687, + "grad_norm": 0.4196677505970001, + "learning_rate": 3.893770141188396e-05, + "loss": 0.0424, + "step": 303 + }, + { + "epoch": 0.5860240963855422, + "grad_norm": 0.8644190430641174, + "learning_rate": 3.892323198739946e-05, + "loss": 0.08, + "step": 304 + }, + { + "epoch": 0.5879518072289157, + "grad_norm": 0.5645135045051575, + "learning_rate": 3.890866741076482e-05, + "loss": 0.0152, + "step": 305 + }, + { + "epoch": 0.5898795180722891, + "grad_norm": 0.5218387246131897, + "learning_rate": 3.889400775521545e-05, + "loss": 0.0205, + "step": 306 + }, + { + "epoch": 0.5918072289156626, + "grad_norm": 0.39709413051605225, + "learning_rate": 3.8879253094464865e-05, + "loss": 0.0233, + "step": 307 + }, + { + "epoch": 0.5937349397590361, + "grad_norm": 0.3572910726070404, + "learning_rate": 3.8864403502704285e-05, + "loss": 0.0198, + "step": 308 + }, + { + "epoch": 0.5956626506024096, + "grad_norm": 0.382709264755249, + "learning_rate": 3.8849459054602274e-05, + "loss": 0.0176, + "step": 309 + }, + { + "epoch": 0.5975903614457831, + "grad_norm": 3.4527227878570557, + "learning_rate": 3.883441982530436e-05, + "loss": 0.0239, + "step": 310 + }, + { + "epoch": 0.5995180722891567, + "grad_norm": 0.4467569589614868, + "learning_rate": 3.8819285890432674e-05, + "loss": 0.0284, + "step": 311 + }, + { + "epoch": 0.6014457831325302, + "grad_norm": 0.44513460993766785, + "learning_rate": 3.880405732608555e-05, + "loss": 0.0233, + "step": 312 + }, + { + "epoch": 0.6033734939759036, + "grad_norm": 0.8029689192771912, + "learning_rate": 3.8788734208837155e-05, + "loss": 0.0433, + "step": 313 + }, + { + "epoch": 0.6053012048192771, + "grad_norm": 0.7291454076766968, + "learning_rate": 3.877331661573709e-05, + "loss": 0.043, + "step": 314 + }, + { + "epoch": 0.6072289156626506, + "grad_norm": 0.6050467491149902, + "learning_rate": 3.8757804624310006e-05, + "loss": 0.0377, + "step": 315 + }, + { + "epoch": 0.6091566265060241, + "grad_norm": 0.6714366674423218, + "learning_rate": 3.874219831255524e-05, + "loss": 0.046, + "step": 316 + }, + { + "epoch": 0.6110843373493976, + "grad_norm": 0.336037278175354, + "learning_rate": 3.8726497758946394e-05, + "loss": 0.0149, + "step": 317 + }, + { + "epoch": 0.6130120481927711, + "grad_norm": 0.3057402968406677, + "learning_rate": 3.871070304243094e-05, + "loss": 0.014, + "step": 318 + }, + { + "epoch": 0.6149397590361446, + "grad_norm": 0.4537644684314728, + "learning_rate": 3.8694814242429834e-05, + "loss": 0.0503, + "step": 319 + }, + { + "epoch": 0.6168674698795181, + "grad_norm": 0.45573824644088745, + "learning_rate": 3.8678831438837116e-05, + "loss": 0.021, + "step": 320 + }, + { + "epoch": 0.6187951807228915, + "grad_norm": 0.30729591846466064, + "learning_rate": 3.866275471201952e-05, + "loss": 0.0163, + "step": 321 + }, + { + "epoch": 0.620722891566265, + "grad_norm": 0.7614850401878357, + "learning_rate": 3.8646584142816036e-05, + "loss": 0.0347, + "step": 322 + }, + { + "epoch": 0.6226506024096385, + "grad_norm": 0.5323611497879028, + "learning_rate": 3.863031981253754e-05, + "loss": 0.0201, + "step": 323 + }, + { + "epoch": 0.624578313253012, + "grad_norm": 0.34426453709602356, + "learning_rate": 3.861396180296635e-05, + "loss": 0.0243, + "step": 324 + }, + { + "epoch": 0.6265060240963856, + "grad_norm": 0.621636152267456, + "learning_rate": 3.859751019635585e-05, + "loss": 0.0166, + "step": 325 + }, + { + "epoch": 0.6284337349397591, + "grad_norm": 0.549324095249176, + "learning_rate": 3.858096507543006e-05, + "loss": 0.0274, + "step": 326 + }, + { + "epoch": 0.6303614457831326, + "grad_norm": 0.358426570892334, + "learning_rate": 3.8564326523383214e-05, + "loss": 0.0207, + "step": 327 + }, + { + "epoch": 0.632289156626506, + "grad_norm": 0.3639723062515259, + "learning_rate": 3.8547594623879346e-05, + "loss": 0.0297, + "step": 328 + }, + { + "epoch": 0.6342168674698795, + "grad_norm": 0.3402212858200073, + "learning_rate": 3.853076946105188e-05, + "loss": 0.0258, + "step": 329 + }, + { + "epoch": 0.636144578313253, + "grad_norm": 0.4083027243614197, + "learning_rate": 3.85138511195032e-05, + "loss": 0.0351, + "step": 330 + }, + { + "epoch": 0.6380722891566265, + "grad_norm": 0.43532121181488037, + "learning_rate": 3.84968396843042e-05, + "loss": 0.0388, + "step": 331 + }, + { + "epoch": 0.64, + "grad_norm": 0.35353463888168335, + "learning_rate": 3.8479735240993904e-05, + "loss": 0.0203, + "step": 332 + }, + { + "epoch": 0.6419277108433735, + "grad_norm": 0.350149929523468, + "learning_rate": 3.846253787557901e-05, + "loss": 0.0261, + "step": 333 + }, + { + "epoch": 0.643855421686747, + "grad_norm": 0.7665389180183411, + "learning_rate": 3.844524767453344e-05, + "loss": 0.0108, + "step": 334 + }, + { + "epoch": 0.6457831325301204, + "grad_norm": 0.44621360301971436, + "learning_rate": 3.842786472479795e-05, + "loss": 0.0282, + "step": 335 + }, + { + "epoch": 0.6477108433734939, + "grad_norm": 0.7787201404571533, + "learning_rate": 3.841038911377962e-05, + "loss": 0.0216, + "step": 336 + }, + { + "epoch": 0.6496385542168674, + "grad_norm": 0.48260653018951416, + "learning_rate": 3.839282092935153e-05, + "loss": 0.0234, + "step": 337 + }, + { + "epoch": 0.651566265060241, + "grad_norm": 0.4987852871417999, + "learning_rate": 3.837516025985219e-05, + "loss": 0.0515, + "step": 338 + }, + { + "epoch": 0.6534939759036145, + "grad_norm": 0.9030266404151917, + "learning_rate": 3.835740719408517e-05, + "loss": 0.0508, + "step": 339 + }, + { + "epoch": 0.655421686746988, + "grad_norm": 0.6381701231002808, + "learning_rate": 3.833956182131867e-05, + "loss": 0.0405, + "step": 340 + }, + { + "epoch": 0.6573493975903615, + "grad_norm": 0.42828986048698425, + "learning_rate": 3.832162423128499e-05, + "loss": 0.024, + "step": 341 + }, + { + "epoch": 0.659277108433735, + "grad_norm": 0.38725873827934265, + "learning_rate": 3.8303594514180164e-05, + "loss": 0.0199, + "step": 342 + }, + { + "epoch": 0.6612048192771084, + "grad_norm": 0.23280498385429382, + "learning_rate": 3.828547276066346e-05, + "loss": 0.0101, + "step": 343 + }, + { + "epoch": 0.6631325301204819, + "grad_norm": 0.7298216819763184, + "learning_rate": 3.8267259061856925e-05, + "loss": 0.0455, + "step": 344 + }, + { + "epoch": 0.6650602409638554, + "grad_norm": 0.5975687503814697, + "learning_rate": 3.824895350934496e-05, + "loss": 0.0372, + "step": 345 + }, + { + "epoch": 0.6669879518072289, + "grad_norm": 0.6295403242111206, + "learning_rate": 3.823055619517381e-05, + "loss": 0.0362, + "step": 346 + }, + { + "epoch": 0.6689156626506024, + "grad_norm": 0.5086020827293396, + "learning_rate": 3.821206721185115e-05, + "loss": 0.0368, + "step": 347 + }, + { + "epoch": 0.6708433734939759, + "grad_norm": 0.34506168961524963, + "learning_rate": 3.819348665234557e-05, + "loss": 0.0178, + "step": 348 + }, + { + "epoch": 0.6727710843373494, + "grad_norm": 1.309940218925476, + "learning_rate": 3.817481461008617e-05, + "loss": 0.024, + "step": 349 + }, + { + "epoch": 0.6746987951807228, + "grad_norm": 0.4074770510196686, + "learning_rate": 3.815605117896204e-05, + "loss": 0.0262, + "step": 350 + }, + { + "epoch": 0.6766265060240964, + "grad_norm": 0.48525840044021606, + "learning_rate": 3.8137196453321775e-05, + "loss": 0.0209, + "step": 351 + }, + { + "epoch": 0.6785542168674699, + "grad_norm": 0.7199739217758179, + "learning_rate": 3.811825052797308e-05, + "loss": 0.0396, + "step": 352 + }, + { + "epoch": 0.6804819277108434, + "grad_norm": 0.519540011882782, + "learning_rate": 3.8099213498182196e-05, + "loss": 0.0453, + "step": 353 + }, + { + "epoch": 0.6824096385542169, + "grad_norm": 0.9738391041755676, + "learning_rate": 3.808008545967349e-05, + "loss": 0.0317, + "step": 354 + }, + { + "epoch": 0.6843373493975904, + "grad_norm": 1.888344407081604, + "learning_rate": 3.8060866508628953e-05, + "loss": 0.0452, + "step": 355 + }, + { + "epoch": 0.6862650602409639, + "grad_norm": 0.48989811539649963, + "learning_rate": 3.8041556741687695e-05, + "loss": 0.0315, + "step": 356 + }, + { + "epoch": 0.6881927710843373, + "grad_norm": 0.3764645457267761, + "learning_rate": 3.8022156255945496e-05, + "loss": 0.0269, + "step": 357 + }, + { + "epoch": 0.6901204819277108, + "grad_norm": 0.46409738063812256, + "learning_rate": 3.800266514895429e-05, + "loss": 0.0171, + "step": 358 + }, + { + "epoch": 0.6920481927710843, + "grad_norm": 0.41091030836105347, + "learning_rate": 3.7983083518721695e-05, + "loss": 0.0167, + "step": 359 + }, + { + "epoch": 0.6939759036144578, + "grad_norm": 0.8375523090362549, + "learning_rate": 3.79634114637105e-05, + "loss": 0.0342, + "step": 360 + }, + { + "epoch": 0.6959036144578313, + "grad_norm": 1.7053394317626953, + "learning_rate": 3.794364908283817e-05, + "loss": 0.02, + "step": 361 + }, + { + "epoch": 0.6978313253012048, + "grad_norm": 0.4163115918636322, + "learning_rate": 3.792379647547637e-05, + "loss": 0.0138, + "step": 362 + }, + { + "epoch": 0.6997590361445784, + "grad_norm": 0.388751745223999, + "learning_rate": 3.790385374145046e-05, + "loss": 0.0172, + "step": 363 + }, + { + "epoch": 0.7016867469879519, + "grad_norm": 0.5584064722061157, + "learning_rate": 3.7883820981038966e-05, + "loss": 0.0254, + "step": 364 + }, + { + "epoch": 0.7036144578313253, + "grad_norm": 1.394264817237854, + "learning_rate": 3.7863698294973114e-05, + "loss": 0.037, + "step": 365 + }, + { + "epoch": 0.7055421686746988, + "grad_norm": 0.46280744671821594, + "learning_rate": 3.78434857844363e-05, + "loss": 0.0234, + "step": 366 + }, + { + "epoch": 0.7074698795180723, + "grad_norm": 0.39548924565315247, + "learning_rate": 3.782318355106358e-05, + "loss": 0.0164, + "step": 367 + }, + { + "epoch": 0.7093975903614458, + "grad_norm": 0.7307773232460022, + "learning_rate": 3.780279169694118e-05, + "loss": 0.0192, + "step": 368 + }, + { + "epoch": 0.7113253012048193, + "grad_norm": 0.28035807609558105, + "learning_rate": 3.778231032460594e-05, + "loss": 0.0131, + "step": 369 + }, + { + "epoch": 0.7132530120481928, + "grad_norm": 0.8376953601837158, + "learning_rate": 3.776173953704486e-05, + "loss": 0.0291, + "step": 370 + }, + { + "epoch": 0.7151807228915663, + "grad_norm": 0.7356843948364258, + "learning_rate": 3.774107943769454e-05, + "loss": 0.0214, + "step": 371 + }, + { + "epoch": 0.7171084337349397, + "grad_norm": 0.41503390669822693, + "learning_rate": 3.772033013044064e-05, + "loss": 0.0221, + "step": 372 + }, + { + "epoch": 0.7190361445783132, + "grad_norm": 0.35732385516166687, + "learning_rate": 3.7699491719617436e-05, + "loss": 0.015, + "step": 373 + }, + { + "epoch": 0.7209638554216867, + "grad_norm": 0.283778578042984, + "learning_rate": 3.76785643100072e-05, + "loss": 0.0146, + "step": 374 + }, + { + "epoch": 0.7228915662650602, + "grad_norm": 0.3219413459300995, + "learning_rate": 3.765754800683974e-05, + "loss": 0.015, + "step": 375 + }, + { + "epoch": 0.7248192771084337, + "grad_norm": 0.610431432723999, + "learning_rate": 3.7636442915791856e-05, + "loss": 0.0326, + "step": 376 + }, + { + "epoch": 0.7267469879518073, + "grad_norm": 4.944870948791504, + "learning_rate": 3.7615249142986784e-05, + "loss": 0.0432, + "step": 377 + }, + { + "epoch": 0.7286746987951808, + "grad_norm": 0.4894593060016632, + "learning_rate": 3.7593966794993696e-05, + "loss": 0.0174, + "step": 378 + }, + { + "epoch": 0.7306024096385542, + "grad_norm": 0.4211325943470001, + "learning_rate": 3.757259597882714e-05, + "loss": 0.023, + "step": 379 + }, + { + "epoch": 0.7325301204819277, + "grad_norm": 0.33621737360954285, + "learning_rate": 3.755113680194651e-05, + "loss": 0.0201, + "step": 380 + }, + { + "epoch": 0.7344578313253012, + "grad_norm": 0.5799694657325745, + "learning_rate": 3.7529589372255514e-05, + "loss": 0.0173, + "step": 381 + }, + { + "epoch": 0.7363855421686747, + "grad_norm": 0.5172572731971741, + "learning_rate": 3.750795379810162e-05, + "loss": 0.0284, + "step": 382 + }, + { + "epoch": 0.7383132530120482, + "grad_norm": 0.5715453028678894, + "learning_rate": 3.748623018827552e-05, + "loss": 0.0194, + "step": 383 + }, + { + "epoch": 0.7402409638554217, + "grad_norm": 0.5284178256988525, + "learning_rate": 3.746441865201056e-05, + "loss": 0.0247, + "step": 384 + }, + { + "epoch": 0.7421686746987952, + "grad_norm": 0.37828654050827026, + "learning_rate": 3.744251929898223e-05, + "loss": 0.0097, + "step": 385 + }, + { + "epoch": 0.7440963855421687, + "grad_norm": 0.3252779543399811, + "learning_rate": 3.742053223930758e-05, + "loss": 0.0238, + "step": 386 + }, + { + "epoch": 0.7460240963855421, + "grad_norm": 0.6031543612480164, + "learning_rate": 3.7398457583544674e-05, + "loss": 0.0332, + "step": 387 + }, + { + "epoch": 0.7479518072289156, + "grad_norm": 0.23846614360809326, + "learning_rate": 3.737629544269206e-05, + "loss": 0.0122, + "step": 388 + }, + { + "epoch": 0.7498795180722891, + "grad_norm": 0.5274029970169067, + "learning_rate": 3.7354045928188155e-05, + "loss": 0.0324, + "step": 389 + }, + { + "epoch": 0.7518072289156627, + "grad_norm": 0.4672217071056366, + "learning_rate": 3.733170915191075e-05, + "loss": 0.0196, + "step": 390 + }, + { + "epoch": 0.7537349397590362, + "grad_norm": 0.29819396138191223, + "learning_rate": 3.730928522617639e-05, + "loss": 0.0131, + "step": 391 + }, + { + "epoch": 0.7556626506024097, + "grad_norm": 0.43824997544288635, + "learning_rate": 3.7286774263739855e-05, + "loss": 0.0238, + "step": 392 + }, + { + "epoch": 0.7575903614457832, + "grad_norm": 0.2822072505950928, + "learning_rate": 3.726417637779357e-05, + "loss": 0.0314, + "step": 393 + }, + { + "epoch": 0.7595180722891566, + "grad_norm": 0.43815648555755615, + "learning_rate": 3.7241491681967044e-05, + "loss": 0.0144, + "step": 394 + }, + { + "epoch": 0.7614457831325301, + "grad_norm": 0.37194815278053284, + "learning_rate": 3.721872029032628e-05, + "loss": 0.0286, + "step": 395 + }, + { + "epoch": 0.7633734939759036, + "grad_norm": 0.7319737672805786, + "learning_rate": 3.719586231737322e-05, + "loss": 0.0427, + "step": 396 + }, + { + "epoch": 0.7653012048192771, + "grad_norm": 0.5870066285133362, + "learning_rate": 3.717291787804517e-05, + "loss": 0.0138, + "step": 397 + }, + { + "epoch": 0.7672289156626506, + "grad_norm": 0.6574277281761169, + "learning_rate": 3.7149887087714225e-05, + "loss": 0.061, + "step": 398 + }, + { + "epoch": 0.7691566265060241, + "grad_norm": 0.5467348694801331, + "learning_rate": 3.712677006218666e-05, + "loss": 0.022, + "step": 399 + }, + { + "epoch": 0.7710843373493976, + "grad_norm": 0.3589288890361786, + "learning_rate": 3.710356691770238e-05, + "loss": 0.0161, + "step": 400 + }, + { + "epoch": 0.7730120481927711, + "grad_norm": 0.574630618095398, + "learning_rate": 3.708027777093433e-05, + "loss": 0.0285, + "step": 401 + }, + { + "epoch": 0.7749397590361445, + "grad_norm": 0.39048445224761963, + "learning_rate": 3.70569027389879e-05, + "loss": 0.012, + "step": 402 + }, + { + "epoch": 0.776867469879518, + "grad_norm": 0.34803536534309387, + "learning_rate": 3.703344193940032e-05, + "loss": 0.0155, + "step": 403 + }, + { + "epoch": 0.7787951807228916, + "grad_norm": 1.188948392868042, + "learning_rate": 3.700989549014011e-05, + "loss": 0.0617, + "step": 404 + }, + { + "epoch": 0.7807228915662651, + "grad_norm": 0.473157674074173, + "learning_rate": 3.698626350960646e-05, + "loss": 0.0298, + "step": 405 + }, + { + "epoch": 0.7826506024096386, + "grad_norm": 0.42009076476097107, + "learning_rate": 3.6962546116628634e-05, + "loss": 0.03, + "step": 406 + }, + { + "epoch": 0.7845783132530121, + "grad_norm": 0.6334308981895447, + "learning_rate": 3.693874343046537e-05, + "loss": 0.0107, + "step": 407 + }, + { + "epoch": 0.7865060240963856, + "grad_norm": 0.35594677925109863, + "learning_rate": 3.6914855570804314e-05, + "loss": 0.0174, + "step": 408 + }, + { + "epoch": 0.788433734939759, + "grad_norm": 0.28985708951950073, + "learning_rate": 3.689088265776136e-05, + "loss": 0.0149, + "step": 409 + }, + { + "epoch": 0.7903614457831325, + "grad_norm": 0.3981950581073761, + "learning_rate": 3.686682481188011e-05, + "loss": 0.019, + "step": 410 + }, + { + "epoch": 0.792289156626506, + "grad_norm": 0.48819583654403687, + "learning_rate": 3.6842682154131193e-05, + "loss": 0.0217, + "step": 411 + }, + { + "epoch": 0.7942168674698795, + "grad_norm": 0.42819952964782715, + "learning_rate": 3.681845480591174e-05, + "loss": 0.0198, + "step": 412 + }, + { + "epoch": 0.796144578313253, + "grad_norm": 0.48591694235801697, + "learning_rate": 3.6794142889044727e-05, + "loss": 0.0253, + "step": 413 + }, + { + "epoch": 0.7980722891566265, + "grad_norm": 0.4730607271194458, + "learning_rate": 3.676974652577835e-05, + "loss": 0.0329, + "step": 414 + }, + { + "epoch": 0.8, + "grad_norm": 0.5390865802764893, + "learning_rate": 3.6745265838785434e-05, + "loss": 0.0479, + "step": 415 + }, + { + "epoch": 0.8019277108433734, + "grad_norm": 0.6377891302108765, + "learning_rate": 3.672070095116283e-05, + "loss": 0.019, + "step": 416 + }, + { + "epoch": 0.803855421686747, + "grad_norm": 0.8984615206718445, + "learning_rate": 3.669605198643075e-05, + "loss": 0.0444, + "step": 417 + }, + { + "epoch": 0.8057831325301205, + "grad_norm": 0.4913877546787262, + "learning_rate": 3.667131906853219e-05, + "loss": 0.031, + "step": 418 + }, + { + "epoch": 0.807710843373494, + "grad_norm": 0.37894028425216675, + "learning_rate": 3.664650232183229e-05, + "loss": 0.0195, + "step": 419 + }, + { + "epoch": 0.8096385542168675, + "grad_norm": 0.3644949495792389, + "learning_rate": 3.66216018711177e-05, + "loss": 0.018, + "step": 420 + }, + { + "epoch": 0.811566265060241, + "grad_norm": 0.414440393447876, + "learning_rate": 3.659661784159597e-05, + "loss": 0.0188, + "step": 421 + }, + { + "epoch": 0.8134939759036145, + "grad_norm": 0.49220341444015503, + "learning_rate": 3.65715503588949e-05, + "loss": 0.016, + "step": 422 + }, + { + "epoch": 0.815421686746988, + "grad_norm": 1.0939836502075195, + "learning_rate": 3.654639954906193e-05, + "loss": 0.0758, + "step": 423 + }, + { + "epoch": 0.8173493975903614, + "grad_norm": 0.43222442269325256, + "learning_rate": 3.652116553856349e-05, + "loss": 0.0308, + "step": 424 + }, + { + "epoch": 0.8192771084337349, + "grad_norm": 0.5081896185874939, + "learning_rate": 3.649584845428438e-05, + "loss": 0.0493, + "step": 425 + }, + { + "epoch": 0.8212048192771084, + "grad_norm": 0.9811948537826538, + "learning_rate": 3.64704484235271e-05, + "loss": 0.019, + "step": 426 + }, + { + "epoch": 0.8231325301204819, + "grad_norm": 0.31656572222709656, + "learning_rate": 3.6444965574011255e-05, + "loss": 0.0135, + "step": 427 + }, + { + "epoch": 0.8250602409638554, + "grad_norm": 0.7844433188438416, + "learning_rate": 3.641940003387289e-05, + "loss": 0.0402, + "step": 428 + }, + { + "epoch": 0.826987951807229, + "grad_norm": 0.3353273570537567, + "learning_rate": 3.6393751931663814e-05, + "loss": 0.0132, + "step": 429 + }, + { + "epoch": 0.8289156626506025, + "grad_norm": 0.7253058552742004, + "learning_rate": 3.6368021396351015e-05, + "loss": 0.0296, + "step": 430 + }, + { + "epoch": 0.8308433734939759, + "grad_norm": 0.45300304889678955, + "learning_rate": 3.634220855731598e-05, + "loss": 0.0258, + "step": 431 + }, + { + "epoch": 0.8327710843373494, + "grad_norm": 0.3480473458766937, + "learning_rate": 3.631631354435403e-05, + "loss": 0.0099, + "step": 432 + }, + { + "epoch": 0.8346987951807229, + "grad_norm": 2.1114516258239746, + "learning_rate": 3.62903364876737e-05, + "loss": 0.0457, + "step": 433 + }, + { + "epoch": 0.8366265060240964, + "grad_norm": 0.5649561882019043, + "learning_rate": 3.626427751789606e-05, + "loss": 0.0444, + "step": 434 + }, + { + "epoch": 0.8385542168674699, + "grad_norm": 0.3864995539188385, + "learning_rate": 3.623813676605405e-05, + "loss": 0.0223, + "step": 435 + }, + { + "epoch": 0.8404819277108434, + "grad_norm": 1.2134298086166382, + "learning_rate": 3.621191436359186e-05, + "loss": 0.0353, + "step": 436 + }, + { + "epoch": 0.8424096385542169, + "grad_norm": 0.4403415024280548, + "learning_rate": 3.6185610442364246e-05, + "loss": 0.0216, + "step": 437 + }, + { + "epoch": 0.8443373493975903, + "grad_norm": 0.6050297021865845, + "learning_rate": 3.6159225134635846e-05, + "loss": 0.0433, + "step": 438 + }, + { + "epoch": 0.8462650602409638, + "grad_norm": 0.7951678037643433, + "learning_rate": 3.6132758573080556e-05, + "loss": 0.031, + "step": 439 + }, + { + "epoch": 0.8481927710843373, + "grad_norm": 0.4991949796676636, + "learning_rate": 3.6106210890780834e-05, + "loss": 0.0313, + "step": 440 + }, + { + "epoch": 0.8501204819277108, + "grad_norm": 0.47951385378837585, + "learning_rate": 3.607958222122704e-05, + "loss": 0.0218, + "step": 441 + }, + { + "epoch": 0.8520481927710843, + "grad_norm": 0.7345194220542908, + "learning_rate": 3.6052872698316755e-05, + "loss": 0.0239, + "step": 442 + }, + { + "epoch": 0.8539759036144579, + "grad_norm": 1.4814884662628174, + "learning_rate": 3.602608245635414e-05, + "loss": 0.0127, + "step": 443 + }, + { + "epoch": 0.8559036144578314, + "grad_norm": 2.4240877628326416, + "learning_rate": 3.599921163004922e-05, + "loss": 0.0618, + "step": 444 + }, + { + "epoch": 0.8578313253012049, + "grad_norm": 0.41523510217666626, + "learning_rate": 3.5972260354517216e-05, + "loss": 0.0283, + "step": 445 + }, + { + "epoch": 0.8597590361445783, + "grad_norm": 0.5577677488327026, + "learning_rate": 3.594522876527791e-05, + "loss": 0.0271, + "step": 446 + }, + { + "epoch": 0.8616867469879518, + "grad_norm": 0.5829064846038818, + "learning_rate": 3.591811699825487e-05, + "loss": 0.0169, + "step": 447 + }, + { + "epoch": 0.8636144578313253, + "grad_norm": 0.4478822350502014, + "learning_rate": 3.5890925189774886e-05, + "loss": 0.0239, + "step": 448 + }, + { + "epoch": 0.8655421686746988, + "grad_norm": 0.3498048782348633, + "learning_rate": 3.586365347656718e-05, + "loss": 0.0137, + "step": 449 + }, + { + "epoch": 0.8674698795180723, + "grad_norm": 0.6571130156517029, + "learning_rate": 3.583630199576278e-05, + "loss": 0.027, + "step": 450 + }, + { + "epoch": 0.8693975903614458, + "grad_norm": 0.344970166683197, + "learning_rate": 3.58088708848938e-05, + "loss": 0.0167, + "step": 451 + }, + { + "epoch": 0.8713253012048193, + "grad_norm": 0.34611570835113525, + "learning_rate": 3.5781360281892775e-05, + "loss": 0.0468, + "step": 452 + }, + { + "epoch": 0.8732530120481927, + "grad_norm": 0.66157066822052, + "learning_rate": 3.575377032509194e-05, + "loss": 0.0344, + "step": 453 + }, + { + "epoch": 0.8751807228915662, + "grad_norm": 0.3676326870918274, + "learning_rate": 3.5726101153222534e-05, + "loss": 0.0366, + "step": 454 + }, + { + "epoch": 0.8771084337349397, + "grad_norm": 0.5958423018455505, + "learning_rate": 3.569835290541414e-05, + "loss": 0.0382, + "step": 455 + }, + { + "epoch": 0.8790361445783132, + "grad_norm": 0.36787471175193787, + "learning_rate": 3.567052572119397e-05, + "loss": 0.018, + "step": 456 + }, + { + "epoch": 0.8809638554216868, + "grad_norm": 0.9478234052658081, + "learning_rate": 3.564261974048611e-05, + "loss": 0.0179, + "step": 457 + }, + { + "epoch": 0.8828915662650603, + "grad_norm": 0.3337579369544983, + "learning_rate": 3.56146351036109e-05, + "loss": 0.0147, + "step": 458 + }, + { + "epoch": 0.8848192771084338, + "grad_norm": 0.4984932243824005, + "learning_rate": 3.558657195128416e-05, + "loss": 0.0224, + "step": 459 + }, + { + "epoch": 0.8867469879518072, + "grad_norm": 0.36718735098838806, + "learning_rate": 3.555843042461653e-05, + "loss": 0.0202, + "step": 460 + }, + { + "epoch": 0.8886746987951807, + "grad_norm": 0.4081745445728302, + "learning_rate": 3.553021066511274e-05, + "loss": 0.0288, + "step": 461 + }, + { + "epoch": 0.8906024096385542, + "grad_norm": 0.3233242332935333, + "learning_rate": 3.55019128146709e-05, + "loss": 0.0362, + "step": 462 + }, + { + "epoch": 0.8925301204819277, + "grad_norm": 0.6560158729553223, + "learning_rate": 3.547353701558178e-05, + "loss": 0.038, + "step": 463 + }, + { + "epoch": 0.8944578313253012, + "grad_norm": 0.47668641805648804, + "learning_rate": 3.544508341052811e-05, + "loss": 0.0399, + "step": 464 + }, + { + "epoch": 0.8963855421686747, + "grad_norm": 0.45512664318084717, + "learning_rate": 3.541655214258383e-05, + "loss": 0.022, + "step": 465 + }, + { + "epoch": 0.8983132530120482, + "grad_norm": 0.8410730361938477, + "learning_rate": 3.538794335521343e-05, + "loss": 0.0315, + "step": 466 + }, + { + "epoch": 0.9002409638554217, + "grad_norm": 0.4872909486293793, + "learning_rate": 3.535925719227117e-05, + "loss": 0.0152, + "step": 467 + }, + { + "epoch": 0.9021686746987951, + "grad_norm": 0.45623311400413513, + "learning_rate": 3.533049379800038e-05, + "loss": 0.0305, + "step": 468 + }, + { + "epoch": 0.9040963855421686, + "grad_norm": 0.43087029457092285, + "learning_rate": 3.530165331703275e-05, + "loss": 0.0131, + "step": 469 + }, + { + "epoch": 0.9060240963855422, + "grad_norm": 0.4610525369644165, + "learning_rate": 3.527273589438756e-05, + "loss": 0.0187, + "step": 470 + }, + { + "epoch": 0.9079518072289157, + "grad_norm": 0.3356114327907562, + "learning_rate": 3.5243741675471006e-05, + "loss": 0.0185, + "step": 471 + }, + { + "epoch": 0.9098795180722892, + "grad_norm": 0.9065960049629211, + "learning_rate": 3.5214670806075426e-05, + "loss": 0.0433, + "step": 472 + }, + { + "epoch": 0.9118072289156627, + "grad_norm": 0.3652578294277191, + "learning_rate": 3.518552343237858e-05, + "loss": 0.02, + "step": 473 + }, + { + "epoch": 0.9137349397590362, + "grad_norm": 0.32377883791923523, + "learning_rate": 3.5156299700942916e-05, + "loss": 0.0165, + "step": 474 + }, + { + "epoch": 0.9156626506024096, + "grad_norm": 0.2431817352771759, + "learning_rate": 3.512699975871485e-05, + "loss": 0.0172, + "step": 475 + }, + { + "epoch": 0.9175903614457831, + "grad_norm": 0.6390707492828369, + "learning_rate": 3.509762375302399e-05, + "loss": 0.0356, + "step": 476 + }, + { + "epoch": 0.9195180722891566, + "grad_norm": 0.2283092886209488, + "learning_rate": 3.506817183158243e-05, + "loss": 0.0088, + "step": 477 + }, + { + "epoch": 0.9214457831325301, + "grad_norm": 0.5053914189338684, + "learning_rate": 3.5038644142483966e-05, + "loss": 0.0389, + "step": 478 + }, + { + "epoch": 0.9233734939759036, + "grad_norm": 0.2567576467990875, + "learning_rate": 3.500904083420342e-05, + "loss": 0.0155, + "step": 479 + }, + { + "epoch": 0.9253012048192771, + "grad_norm": 0.6852384209632874, + "learning_rate": 3.497936205559583e-05, + "loss": 0.0247, + "step": 480 + }, + { + "epoch": 0.9272289156626506, + "grad_norm": 0.36403414607048035, + "learning_rate": 3.494960795589572e-05, + "loss": 0.023, + "step": 481 + }, + { + "epoch": 0.929156626506024, + "grad_norm": 0.506554901599884, + "learning_rate": 3.491977868471635e-05, + "loss": 0.0273, + "step": 482 + }, + { + "epoch": 0.9310843373493976, + "grad_norm": 0.38329923152923584, + "learning_rate": 3.4889874392048985e-05, + "loss": 0.0169, + "step": 483 + }, + { + "epoch": 0.9330120481927711, + "grad_norm": 0.2805836498737335, + "learning_rate": 3.48598952282621e-05, + "loss": 0.0105, + "step": 484 + }, + { + "epoch": 0.9349397590361446, + "grad_norm": 0.6315302848815918, + "learning_rate": 3.482984134410067e-05, + "loss": 0.0289, + "step": 485 + }, + { + "epoch": 0.9368674698795181, + "grad_norm": 0.6431388854980469, + "learning_rate": 3.479971289068537e-05, + "loss": 0.0311, + "step": 486 + }, + { + "epoch": 0.9387951807228916, + "grad_norm": 0.9794723391532898, + "learning_rate": 3.476951001951184e-05, + "loss": 0.0452, + "step": 487 + }, + { + "epoch": 0.9407228915662651, + "grad_norm": 0.7984824180603027, + "learning_rate": 3.473923288244991e-05, + "loss": 0.0689, + "step": 488 + }, + { + "epoch": 0.9426506024096386, + "grad_norm": 0.46362006664276123, + "learning_rate": 3.470888163174286e-05, + "loss": 0.0241, + "step": 489 + }, + { + "epoch": 0.944578313253012, + "grad_norm": 0.5051195025444031, + "learning_rate": 3.467845642000661e-05, + "loss": 0.0228, + "step": 490 + }, + { + "epoch": 0.9465060240963855, + "grad_norm": 0.3082812428474426, + "learning_rate": 3.4647957400229004e-05, + "loss": 0.0144, + "step": 491 + }, + { + "epoch": 0.948433734939759, + "grad_norm": 0.2691391110420227, + "learning_rate": 3.461738472576902e-05, + "loss": 0.0167, + "step": 492 + }, + { + "epoch": 0.9503614457831325, + "grad_norm": 0.5627671480178833, + "learning_rate": 3.458673855035597e-05, + "loss": 0.031, + "step": 493 + }, + { + "epoch": 0.952289156626506, + "grad_norm": 0.4571435749530792, + "learning_rate": 3.455601902808876e-05, + "loss": 0.0191, + "step": 494 + }, + { + "epoch": 0.9542168674698795, + "grad_norm": 1.0117709636688232, + "learning_rate": 3.452522631343515e-05, + "loss": 0.0192, + "step": 495 + }, + { + "epoch": 0.9561445783132531, + "grad_norm": 0.28375712037086487, + "learning_rate": 3.449436056123086e-05, + "loss": 0.0159, + "step": 496 + }, + { + "epoch": 0.9580722891566265, + "grad_norm": 0.26381856203079224, + "learning_rate": 3.446342192667893e-05, + "loss": 0.0113, + "step": 497 + }, + { + "epoch": 0.96, + "grad_norm": 0.49317577481269836, + "learning_rate": 3.443241056534884e-05, + "loss": 0.0332, + "step": 498 + }, + { + "epoch": 0.9619277108433735, + "grad_norm": 0.28884485363960266, + "learning_rate": 3.440132663317579e-05, + "loss": 0.0117, + "step": 499 + }, + { + "epoch": 0.963855421686747, + "grad_norm": 0.36255285143852234, + "learning_rate": 3.4370170286459864e-05, + "loss": 0.0169, + "step": 500 + }, + { + "epoch": 0.9657831325301205, + "grad_norm": 0.4265049993991852, + "learning_rate": 3.433894168186529e-05, + "loss": 0.0217, + "step": 501 + }, + { + "epoch": 0.967710843373494, + "grad_norm": 0.8169426321983337, + "learning_rate": 3.430764097641962e-05, + "loss": 0.0207, + "step": 502 + }, + { + "epoch": 0.9696385542168675, + "grad_norm": 1.866077184677124, + "learning_rate": 3.427626832751296e-05, + "loss": 0.0381, + "step": 503 + }, + { + "epoch": 0.971566265060241, + "grad_norm": 0.33124980330467224, + "learning_rate": 3.424482389289716e-05, + "loss": 0.0245, + "step": 504 + }, + { + "epoch": 0.9734939759036144, + "grad_norm": 0.37479540705680847, + "learning_rate": 3.4213307830685055e-05, + "loss": 0.0164, + "step": 505 + }, + { + "epoch": 0.9754216867469879, + "grad_norm": 0.39738863706588745, + "learning_rate": 3.4181720299349615e-05, + "loss": 0.0297, + "step": 506 + }, + { + "epoch": 0.9773493975903614, + "grad_norm": 0.2567287087440491, + "learning_rate": 3.4150061457723205e-05, + "loss": 0.0102, + "step": 507 + }, + { + "epoch": 0.9792771084337349, + "grad_norm": 0.6230517029762268, + "learning_rate": 3.411833146499675e-05, + "loss": 0.0243, + "step": 508 + }, + { + "epoch": 0.9812048192771085, + "grad_norm": 0.44843971729278564, + "learning_rate": 3.408653048071894e-05, + "loss": 0.0357, + "step": 509 + }, + { + "epoch": 0.983132530120482, + "grad_norm": 1.0569655895233154, + "learning_rate": 3.405465866479546e-05, + "loss": 0.037, + "step": 510 + }, + { + "epoch": 0.9850602409638555, + "grad_norm": 0.29000964760780334, + "learning_rate": 3.402271617748812e-05, + "loss": 0.0129, + "step": 511 + }, + { + "epoch": 0.9869879518072289, + "grad_norm": 2.1627447605133057, + "learning_rate": 3.399070317941413e-05, + "loss": 0.0442, + "step": 512 + }, + { + "epoch": 0.9889156626506024, + "grad_norm": 0.27371272444725037, + "learning_rate": 3.395861983154522e-05, + "loss": 0.0119, + "step": 513 + }, + { + "epoch": 0.9908433734939759, + "grad_norm": 0.4117226302623749, + "learning_rate": 3.392646629520688e-05, + "loss": 0.0455, + "step": 514 + }, + { + "epoch": 0.9927710843373494, + "grad_norm": 0.5098996758460999, + "learning_rate": 3.389424273207752e-05, + "loss": 0.0203, + "step": 515 + }, + { + "epoch": 0.9946987951807229, + "grad_norm": 0.5192157626152039, + "learning_rate": 3.386194930418767e-05, + "loss": 0.0329, + "step": 516 + }, + { + "epoch": 0.9966265060240964, + "grad_norm": 0.18757697939872742, + "learning_rate": 3.382958617391915e-05, + "loss": 0.0065, + "step": 517 + }, + { + "epoch": 0.9985542168674699, + "grad_norm": 0.3334413170814514, + "learning_rate": 3.3797153504004296e-05, + "loss": 0.0266, + "step": 518 + }, + { + "epoch": 1.0, + "grad_norm": 0.4152225852012634, + "learning_rate": 3.3764651457525095e-05, + "loss": 0.0169, + "step": 519 + }, + { + "epoch": 1.0019277108433735, + "grad_norm": 0.43535247445106506, + "learning_rate": 3.373208019791237e-05, + "loss": 0.0221, + "step": 520 + }, + { + "epoch": 1.003855421686747, + "grad_norm": 0.39292722940444946, + "learning_rate": 3.3699439888945e-05, + "loss": 0.0211, + "step": 521 + }, + { + "epoch": 1.0057831325301205, + "grad_norm": 0.19566713273525238, + "learning_rate": 3.366673069474904e-05, + "loss": 0.0069, + "step": 522 + }, + { + "epoch": 1.007710843373494, + "grad_norm": 0.5101853609085083, + "learning_rate": 3.3633952779796914e-05, + "loss": 0.0191, + "step": 523 + }, + { + "epoch": 1.0096385542168675, + "grad_norm": 0.999434769153595, + "learning_rate": 3.360110630890664e-05, + "loss": 0.0196, + "step": 524 + }, + { + "epoch": 1.011566265060241, + "grad_norm": 0.4646223783493042, + "learning_rate": 3.356819144724092e-05, + "loss": 0.0328, + "step": 525 + }, + { + "epoch": 1.0134939759036146, + "grad_norm": 0.3132480978965759, + "learning_rate": 3.3535208360306354e-05, + "loss": 0.0203, + "step": 526 + }, + { + "epoch": 1.0154216867469879, + "grad_norm": 0.3038032352924347, + "learning_rate": 3.350215721395261e-05, + "loss": 0.0122, + "step": 527 + }, + { + "epoch": 1.0173493975903614, + "grad_norm": 0.45082882046699524, + "learning_rate": 3.346903817437157e-05, + "loss": 0.0437, + "step": 528 + }, + { + "epoch": 1.0192771084337349, + "grad_norm": 0.26917046308517456, + "learning_rate": 3.343585140809651e-05, + "loss": 0.013, + "step": 529 + }, + { + "epoch": 1.0212048192771084, + "grad_norm": 0.23869264125823975, + "learning_rate": 3.3402597082001276e-05, + "loss": 0.008, + "step": 530 + }, + { + "epoch": 1.0231325301204819, + "grad_norm": 0.31315353512763977, + "learning_rate": 3.3369275363299394e-05, + "loss": 0.0078, + "step": 531 + }, + { + "epoch": 1.0250602409638554, + "grad_norm": 0.4780346751213074, + "learning_rate": 3.333588641954327e-05, + "loss": 0.0225, + "step": 532 + }, + { + "epoch": 1.026987951807229, + "grad_norm": 0.2920368015766144, + "learning_rate": 3.330243041862336e-05, + "loss": 0.0118, + "step": 533 + }, + { + "epoch": 1.0289156626506024, + "grad_norm": 0.543669581413269, + "learning_rate": 3.326890752876728e-05, + "loss": 0.0338, + "step": 534 + }, + { + "epoch": 1.030843373493976, + "grad_norm": 0.4288000464439392, + "learning_rate": 3.323531791853901e-05, + "loss": 0.0341, + "step": 535 + }, + { + "epoch": 1.0327710843373494, + "grad_norm": 0.26600322127342224, + "learning_rate": 3.3201661756838e-05, + "loss": 0.0184, + "step": 536 + }, + { + "epoch": 1.034698795180723, + "grad_norm": 0.290937602519989, + "learning_rate": 3.316793921289835e-05, + "loss": 0.0152, + "step": 537 + }, + { + "epoch": 1.0366265060240965, + "grad_norm": 0.7621443271636963, + "learning_rate": 3.313415045628795e-05, + "loss": 0.0326, + "step": 538 + }, + { + "epoch": 1.03855421686747, + "grad_norm": 0.5581283569335938, + "learning_rate": 3.3100295656907646e-05, + "loss": 0.0164, + "step": 539 + }, + { + "epoch": 1.0404819277108435, + "grad_norm": 0.20930901169776917, + "learning_rate": 3.306637498499034e-05, + "loss": 0.0091, + "step": 540 + }, + { + "epoch": 1.0424096385542168, + "grad_norm": 0.46212059259414673, + "learning_rate": 3.303238861110018e-05, + "loss": 0.0118, + "step": 541 + }, + { + "epoch": 1.0443373493975903, + "grad_norm": 0.38259151577949524, + "learning_rate": 3.299833670613168e-05, + "loss": 0.0081, + "step": 542 + }, + { + "epoch": 1.0462650602409638, + "grad_norm": 0.4888618290424347, + "learning_rate": 3.2964219441308865e-05, + "loss": 0.0138, + "step": 543 + }, + { + "epoch": 1.0481927710843373, + "grad_norm": 0.32103127241134644, + "learning_rate": 3.2930036988184425e-05, + "loss": 0.0171, + "step": 544 + }, + { + "epoch": 1.0501204819277108, + "grad_norm": 0.27787327766418457, + "learning_rate": 3.28957895186388e-05, + "loss": 0.0106, + "step": 545 + }, + { + "epoch": 1.0520481927710843, + "grad_norm": 0.35597777366638184, + "learning_rate": 3.2861477204879395e-05, + "loss": 0.0123, + "step": 546 + }, + { + "epoch": 1.0539759036144578, + "grad_norm": 0.3619804084300995, + "learning_rate": 3.2827100219439656e-05, + "loss": 0.0088, + "step": 547 + }, + { + "epoch": 1.0559036144578313, + "grad_norm": 0.2525513470172882, + "learning_rate": 3.279265873517822e-05, + "loss": 0.0179, + "step": 548 + }, + { + "epoch": 1.0578313253012048, + "grad_norm": 0.3910020887851715, + "learning_rate": 3.275815292527804e-05, + "loss": 0.0142, + "step": 549 + }, + { + "epoch": 1.0597590361445783, + "grad_norm": 0.30515050888061523, + "learning_rate": 3.2723582963245526e-05, + "loss": 0.0123, + "step": 550 + }, + { + "epoch": 1.0616867469879518, + "grad_norm": 0.21708644926548004, + "learning_rate": 3.2688949022909665e-05, + "loss": 0.0098, + "step": 551 + }, + { + "epoch": 1.0636144578313254, + "grad_norm": 0.23307719826698303, + "learning_rate": 3.265425127842114e-05, + "loss": 0.0097, + "step": 552 + }, + { + "epoch": 1.0655421686746989, + "grad_norm": 0.676654577255249, + "learning_rate": 3.261948990425147e-05, + "loss": 0.0227, + "step": 553 + }, + { + "epoch": 1.0674698795180724, + "grad_norm": 0.4593975841999054, + "learning_rate": 3.258466507519213e-05, + "loss": 0.047, + "step": 554 + }, + { + "epoch": 1.0693975903614459, + "grad_norm": 0.19405829906463623, + "learning_rate": 3.254977696635366e-05, + "loss": 0.0314, + "step": 555 + }, + { + "epoch": 1.0713253012048192, + "grad_norm": 0.14563389122486115, + "learning_rate": 3.2514825753164774e-05, + "loss": 0.0046, + "step": 556 + }, + { + "epoch": 1.0732530120481927, + "grad_norm": 0.2642340064048767, + "learning_rate": 3.247981161137153e-05, + "loss": 0.022, + "step": 557 + }, + { + "epoch": 1.0751807228915662, + "grad_norm": 0.17274761199951172, + "learning_rate": 3.2444734717036386e-05, + "loss": 0.0134, + "step": 558 + }, + { + "epoch": 1.0771084337349397, + "grad_norm": 0.44354626536369324, + "learning_rate": 3.240959524653735e-05, + "loss": 0.0211, + "step": 559 + }, + { + "epoch": 1.0790361445783132, + "grad_norm": 0.2806888818740845, + "learning_rate": 3.237439337656708e-05, + "loss": 0.0141, + "step": 560 + }, + { + "epoch": 1.0809638554216867, + "grad_norm": 0.21679501235485077, + "learning_rate": 3.2339129284131994e-05, + "loss": 0.019, + "step": 561 + }, + { + "epoch": 1.0828915662650602, + "grad_norm": 0.3040260076522827, + "learning_rate": 3.2303803146551386e-05, + "loss": 0.0249, + "step": 562 + }, + { + "epoch": 1.0848192771084337, + "grad_norm": 0.2793775200843811, + "learning_rate": 3.226841514145656e-05, + "loss": 0.0088, + "step": 563 + }, + { + "epoch": 1.0867469879518072, + "grad_norm": 0.149955615401268, + "learning_rate": 3.223296544678987e-05, + "loss": 0.0054, + "step": 564 + }, + { + "epoch": 1.0886746987951808, + "grad_norm": 0.22166767716407776, + "learning_rate": 3.219745424080389e-05, + "loss": 0.0109, + "step": 565 + }, + { + "epoch": 1.0906024096385543, + "grad_norm": 0.22399431467056274, + "learning_rate": 3.2161881702060476e-05, + "loss": 0.0106, + "step": 566 + }, + { + "epoch": 1.0925301204819278, + "grad_norm": 0.18537986278533936, + "learning_rate": 3.2126248009429905e-05, + "loss": 0.0077, + "step": 567 + }, + { + "epoch": 1.0944578313253013, + "grad_norm": 0.24511495232582092, + "learning_rate": 3.2090553342089935e-05, + "loss": 0.0093, + "step": 568 + }, + { + "epoch": 1.0963855421686748, + "grad_norm": 0.4766045808792114, + "learning_rate": 3.205479787952494e-05, + "loss": 0.036, + "step": 569 + }, + { + "epoch": 1.0983132530120483, + "grad_norm": 0.1425715535879135, + "learning_rate": 3.201898180152499e-05, + "loss": 0.0085, + "step": 570 + }, + { + "epoch": 1.1002409638554216, + "grad_norm": 0.1909666359424591, + "learning_rate": 3.1983105288184945e-05, + "loss": 0.0081, + "step": 571 + }, + { + "epoch": 1.102168674698795, + "grad_norm": 0.44077104330062866, + "learning_rate": 3.194716851990355e-05, + "loss": 0.017, + "step": 572 + }, + { + "epoch": 1.1040963855421686, + "grad_norm": 0.5757400989532471, + "learning_rate": 3.191117167738253e-05, + "loss": 0.021, + "step": 573 + }, + { + "epoch": 1.106024096385542, + "grad_norm": 0.1977701038122177, + "learning_rate": 3.1875114941625705e-05, + "loss": 0.0096, + "step": 574 + }, + { + "epoch": 1.1079518072289156, + "grad_norm": 0.3524581491947174, + "learning_rate": 3.1838998493938026e-05, + "loss": 0.0118, + "step": 575 + }, + { + "epoch": 1.1098795180722891, + "grad_norm": 0.3301331698894501, + "learning_rate": 3.180282251592472e-05, + "loss": 0.0094, + "step": 576 + }, + { + "epoch": 1.1118072289156626, + "grad_norm": 0.2774488925933838, + "learning_rate": 3.1766587189490336e-05, + "loss": 0.0131, + "step": 577 + }, + { + "epoch": 1.1137349397590361, + "grad_norm": 1.732595443725586, + "learning_rate": 3.173029269683785e-05, + "loss": 0.0445, + "step": 578 + }, + { + "epoch": 1.1156626506024097, + "grad_norm": 0.28746843338012695, + "learning_rate": 3.169393922046776e-05, + "loss": 0.0116, + "step": 579 + }, + { + "epoch": 1.1175903614457832, + "grad_norm": 0.2952995002269745, + "learning_rate": 3.165752694317713e-05, + "loss": 0.0116, + "step": 580 + }, + { + "epoch": 1.1195180722891567, + "grad_norm": 0.2938575744628906, + "learning_rate": 3.16210560480587e-05, + "loss": 0.013, + "step": 581 + }, + { + "epoch": 1.1214457831325302, + "grad_norm": 0.22283495962619781, + "learning_rate": 3.158452671849998e-05, + "loss": 0.0052, + "step": 582 + }, + { + "epoch": 1.1233734939759037, + "grad_norm": 0.6272858381271362, + "learning_rate": 3.154793913818226e-05, + "loss": 0.0182, + "step": 583 + }, + { + "epoch": 1.1253012048192772, + "grad_norm": 0.479753702878952, + "learning_rate": 3.1511293491079804e-05, + "loss": 0.0146, + "step": 584 + }, + { + "epoch": 1.1272289156626507, + "grad_norm": 0.31104400753974915, + "learning_rate": 3.1474589961458786e-05, + "loss": 0.0139, + "step": 585 + }, + { + "epoch": 1.129156626506024, + "grad_norm": 0.4932832419872284, + "learning_rate": 3.1437828733876477e-05, + "loss": 0.0236, + "step": 586 + }, + { + "epoch": 1.1310843373493975, + "grad_norm": 0.222808837890625, + "learning_rate": 3.140100999318025e-05, + "loss": 0.0084, + "step": 587 + }, + { + "epoch": 1.133012048192771, + "grad_norm": 0.4515356719493866, + "learning_rate": 3.136413392450668e-05, + "loss": 0.0215, + "step": 588 + }, + { + "epoch": 1.1349397590361445, + "grad_norm": 0.39302268624305725, + "learning_rate": 3.132720071328061e-05, + "loss": 0.0154, + "step": 589 + }, + { + "epoch": 1.136867469879518, + "grad_norm": 0.43382835388183594, + "learning_rate": 3.1290210545214205e-05, + "loss": 0.0088, + "step": 590 + }, + { + "epoch": 1.1387951807228915, + "grad_norm": 0.18707136809825897, + "learning_rate": 3.125316360630602e-05, + "loss": 0.0126, + "step": 591 + }, + { + "epoch": 1.140722891566265, + "grad_norm": 0.5688219666481018, + "learning_rate": 3.121606008284011e-05, + "loss": 0.0147, + "step": 592 + }, + { + "epoch": 1.1426506024096386, + "grad_norm": 0.3321833312511444, + "learning_rate": 3.1178900161385005e-05, + "loss": 0.0119, + "step": 593 + }, + { + "epoch": 1.144578313253012, + "grad_norm": 0.3738424777984619, + "learning_rate": 3.114168402879286e-05, + "loss": 0.0158, + "step": 594 + }, + { + "epoch": 1.1465060240963856, + "grad_norm": 0.2386978417634964, + "learning_rate": 3.110441187219846e-05, + "loss": 0.0107, + "step": 595 + }, + { + "epoch": 1.148433734939759, + "grad_norm": 0.2165699452161789, + "learning_rate": 3.10670838790183e-05, + "loss": 0.0079, + "step": 596 + }, + { + "epoch": 1.1503614457831326, + "grad_norm": 0.25952696800231934, + "learning_rate": 3.102970023694965e-05, + "loss": 0.0147, + "step": 597 + }, + { + "epoch": 1.152289156626506, + "grad_norm": 0.21448305249214172, + "learning_rate": 3.099226113396959e-05, + "loss": 0.0099, + "step": 598 + }, + { + "epoch": 1.1542168674698796, + "grad_norm": 0.37226060032844543, + "learning_rate": 3.095476675833405e-05, + "loss": 0.0214, + "step": 599 + }, + { + "epoch": 1.1561445783132531, + "grad_norm": 0.29637983441352844, + "learning_rate": 3.0917217298576955e-05, + "loss": 0.0118, + "step": 600 + }, + { + "epoch": 1.1580722891566264, + "grad_norm": 0.18535609543323517, + "learning_rate": 3.0879612943509154e-05, + "loss": 0.0086, + "step": 601 + }, + { + "epoch": 1.16, + "grad_norm": 0.25874125957489014, + "learning_rate": 3.0841953882217536e-05, + "loss": 0.0088, + "step": 602 + }, + { + "epoch": 1.1619277108433734, + "grad_norm": 0.46092745661735535, + "learning_rate": 3.08042403040641e-05, + "loss": 0.0241, + "step": 603 + }, + { + "epoch": 1.163855421686747, + "grad_norm": 0.27023249864578247, + "learning_rate": 3.076647239868494e-05, + "loss": 0.0154, + "step": 604 + }, + { + "epoch": 1.1657831325301204, + "grad_norm": 0.445157527923584, + "learning_rate": 3.072865035598933e-05, + "loss": 0.0197, + "step": 605 + }, + { + "epoch": 1.167710843373494, + "grad_norm": 0.18097272515296936, + "learning_rate": 3.06907743661588e-05, + "loss": 0.0093, + "step": 606 + }, + { + "epoch": 1.1696385542168675, + "grad_norm": 0.22469942271709442, + "learning_rate": 3.065284461964609e-05, + "loss": 0.0171, + "step": 607 + }, + { + "epoch": 1.171566265060241, + "grad_norm": 0.20190906524658203, + "learning_rate": 3.061486130717428e-05, + "loss": 0.008, + "step": 608 + }, + { + "epoch": 1.1734939759036145, + "grad_norm": 0.18294145166873932, + "learning_rate": 3.057682461973579e-05, + "loss": 0.0155, + "step": 609 + }, + { + "epoch": 1.175421686746988, + "grad_norm": 0.34203943610191345, + "learning_rate": 3.053873474859143e-05, + "loss": 0.0212, + "step": 610 + }, + { + "epoch": 1.1773493975903615, + "grad_norm": 0.49073582887649536, + "learning_rate": 3.050059188526942e-05, + "loss": 0.019, + "step": 611 + }, + { + "epoch": 1.179277108433735, + "grad_norm": 0.3537680506706238, + "learning_rate": 3.046239622156446e-05, + "loss": 0.0147, + "step": 612 + }, + { + "epoch": 1.1812048192771085, + "grad_norm": 0.2584632635116577, + "learning_rate": 3.042414794953674e-05, + "loss": 0.0088, + "step": 613 + }, + { + "epoch": 1.1831325301204818, + "grad_norm": 0.3529360890388489, + "learning_rate": 3.0385847261510975e-05, + "loss": 0.0187, + "step": 614 + }, + { + "epoch": 1.1850602409638555, + "grad_norm": 0.3331570327281952, + "learning_rate": 3.0347494350075465e-05, + "loss": 0.0124, + "step": 615 + }, + { + "epoch": 1.1869879518072288, + "grad_norm": 0.2223527580499649, + "learning_rate": 3.0309089408081074e-05, + "loss": 0.01, + "step": 616 + }, + { + "epoch": 1.1889156626506023, + "grad_norm": 0.21985746920108795, + "learning_rate": 3.027063262864032e-05, + "loss": 0.0087, + "step": 617 + }, + { + "epoch": 1.1908433734939758, + "grad_norm": 0.2989653944969177, + "learning_rate": 3.023212420512637e-05, + "loss": 0.0137, + "step": 618 + }, + { + "epoch": 1.1927710843373494, + "grad_norm": 0.17423275113105774, + "learning_rate": 3.0193564331172074e-05, + "loss": 0.0056, + "step": 619 + }, + { + "epoch": 1.1946987951807229, + "grad_norm": 1.0992127656936646, + "learning_rate": 3.0154953200668976e-05, + "loss": 0.0274, + "step": 620 + }, + { + "epoch": 1.1966265060240964, + "grad_norm": 0.21641989052295685, + "learning_rate": 3.011629100776638e-05, + "loss": 0.0151, + "step": 621 + }, + { + "epoch": 1.1985542168674699, + "grad_norm": 0.4558199644088745, + "learning_rate": 3.007757794687033e-05, + "loss": 0.0424, + "step": 622 + }, + { + "epoch": 1.2004819277108434, + "grad_norm": 0.42380189895629883, + "learning_rate": 3.003881421264266e-05, + "loss": 0.0079, + "step": 623 + }, + { + "epoch": 1.202409638554217, + "grad_norm": 0.28791171312332153, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0142, + "step": 624 + }, + { + "epoch": 1.2043373493975904, + "grad_norm": 0.3906581997871399, + "learning_rate": 2.996113550411281e-05, + "loss": 0.0251, + "step": 625 + }, + { + "epoch": 1.206265060240964, + "grad_norm": 0.47848746180534363, + "learning_rate": 2.9922220920404375e-05, + "loss": 0.0137, + "step": 626 + }, + { + "epoch": 1.2081927710843374, + "grad_norm": 0.22666941583156586, + "learning_rate": 2.9883256444549862e-05, + "loss": 0.0105, + "step": 627 + }, + { + "epoch": 1.210120481927711, + "grad_norm": 0.18968136608600616, + "learning_rate": 2.984424227247529e-05, + "loss": 0.0089, + "step": 628 + }, + { + "epoch": 1.2120481927710842, + "grad_norm": 0.28732606768608093, + "learning_rate": 2.980517860035656e-05, + "loss": 0.0253, + "step": 629 + }, + { + "epoch": 1.213975903614458, + "grad_norm": 0.21131543815135956, + "learning_rate": 2.9766065624618518e-05, + "loss": 0.0134, + "step": 630 + }, + { + "epoch": 1.2159036144578312, + "grad_norm": 0.7594877481460571, + "learning_rate": 2.972690354193388e-05, + "loss": 0.0157, + "step": 631 + }, + { + "epoch": 1.2178313253012047, + "grad_norm": 0.730291485786438, + "learning_rate": 2.96876925492223e-05, + "loss": 0.0204, + "step": 632 + }, + { + "epoch": 1.2197590361445783, + "grad_norm": 0.20333674550056458, + "learning_rate": 2.9648432843649382e-05, + "loss": 0.0114, + "step": 633 + }, + { + "epoch": 1.2216867469879518, + "grad_norm": 0.5680793523788452, + "learning_rate": 2.960912462262566e-05, + "loss": 0.0146, + "step": 634 + }, + { + "epoch": 1.2236144578313253, + "grad_norm": 0.4591079354286194, + "learning_rate": 2.9569768083805618e-05, + "loss": 0.0112, + "step": 635 + }, + { + "epoch": 1.2255421686746988, + "grad_norm": 0.3793511390686035, + "learning_rate": 2.953036342508671e-05, + "loss": 0.0377, + "step": 636 + }, + { + "epoch": 1.2274698795180723, + "grad_norm": 1.118723750114441, + "learning_rate": 2.9490910844608346e-05, + "loss": 0.0432, + "step": 637 + }, + { + "epoch": 1.2293975903614458, + "grad_norm": 0.36990776658058167, + "learning_rate": 2.9451410540750887e-05, + "loss": 0.0203, + "step": 638 + }, + { + "epoch": 1.2313253012048193, + "grad_norm": 0.930397629737854, + "learning_rate": 2.94118627121347e-05, + "loss": 0.0311, + "step": 639 + }, + { + "epoch": 1.2332530120481928, + "grad_norm": 0.2347625195980072, + "learning_rate": 2.9372267557619075e-05, + "loss": 0.0168, + "step": 640 + }, + { + "epoch": 1.2351807228915663, + "grad_norm": 0.3720332384109497, + "learning_rate": 2.933262527630131e-05, + "loss": 0.0136, + "step": 641 + }, + { + "epoch": 1.2371084337349398, + "grad_norm": 0.4871984124183655, + "learning_rate": 2.929293606751565e-05, + "loss": 0.0339, + "step": 642 + }, + { + "epoch": 1.2390361445783133, + "grad_norm": 0.35853689908981323, + "learning_rate": 2.9253200130832322e-05, + "loss": 0.0095, + "step": 643 + }, + { + "epoch": 1.2409638554216866, + "grad_norm": 0.42003703117370605, + "learning_rate": 2.92134176660565e-05, + "loss": 0.0142, + "step": 644 + }, + { + "epoch": 1.2428915662650604, + "grad_norm": 0.3854500651359558, + "learning_rate": 2.9173588873227338e-05, + "loss": 0.0209, + "step": 645 + }, + { + "epoch": 1.2448192771084337, + "grad_norm": 0.24665917456150055, + "learning_rate": 2.913371395261691e-05, + "loss": 0.0087, + "step": 646 + }, + { + "epoch": 1.2467469879518072, + "grad_norm": 0.41571593284606934, + "learning_rate": 2.9093793104729268e-05, + "loss": 0.0164, + "step": 647 + }, + { + "epoch": 1.2486746987951807, + "grad_norm": 0.4597891569137573, + "learning_rate": 2.9053826530299377e-05, + "loss": 0.0138, + "step": 648 + }, + { + "epoch": 1.2506024096385542, + "grad_norm": 0.43345385789871216, + "learning_rate": 2.901381443029215e-05, + "loss": 0.0353, + "step": 649 + }, + { + "epoch": 1.2525301204819277, + "grad_norm": 0.3706768751144409, + "learning_rate": 2.897375700590141e-05, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 1.2544578313253012, + "grad_norm": 0.30305296182632446, + "learning_rate": 2.8933654458548873e-05, + "loss": 0.0123, + "step": 651 + }, + { + "epoch": 1.2563855421686747, + "grad_norm": 0.2042127549648285, + "learning_rate": 2.8893506989883167e-05, + "loss": 0.0099, + "step": 652 + }, + { + "epoch": 1.2583132530120482, + "grad_norm": 0.20524422824382782, + "learning_rate": 2.8853314801778784e-05, + "loss": 0.0097, + "step": 653 + }, + { + "epoch": 1.2602409638554217, + "grad_norm": 0.2351921945810318, + "learning_rate": 2.8813078096335093e-05, + "loss": 0.0091, + "step": 654 + }, + { + "epoch": 1.2621686746987952, + "grad_norm": 0.34547340869903564, + "learning_rate": 2.87727970758753e-05, + "loss": 0.0088, + "step": 655 + }, + { + "epoch": 1.2640963855421687, + "grad_norm": 0.35163217782974243, + "learning_rate": 2.8732471942945443e-05, + "loss": 0.0145, + "step": 656 + }, + { + "epoch": 1.266024096385542, + "grad_norm": 1.715137243270874, + "learning_rate": 2.8692102900313378e-05, + "loss": 0.0198, + "step": 657 + }, + { + "epoch": 1.2679518072289158, + "grad_norm": 0.2860178053379059, + "learning_rate": 2.8651690150967748e-05, + "loss": 0.0085, + "step": 658 + }, + { + "epoch": 1.269879518072289, + "grad_norm": 0.21175967156887054, + "learning_rate": 2.8611233898116967e-05, + "loss": 0.0071, + "step": 659 + }, + { + "epoch": 1.2718072289156628, + "grad_norm": 0.33726972341537476, + "learning_rate": 2.85707343451882e-05, + "loss": 0.012, + "step": 660 + }, + { + "epoch": 1.273734939759036, + "grad_norm": 0.2138456553220749, + "learning_rate": 2.853019169582635e-05, + "loss": 0.0092, + "step": 661 + }, + { + "epoch": 1.2756626506024096, + "grad_norm": 0.2304934412240982, + "learning_rate": 2.8489606153892997e-05, + "loss": 0.0144, + "step": 662 + }, + { + "epoch": 1.277590361445783, + "grad_norm": 0.2691061794757843, + "learning_rate": 2.8448977923465425e-05, + "loss": 0.0121, + "step": 663 + }, + { + "epoch": 1.2795180722891566, + "grad_norm": 0.35254305601119995, + "learning_rate": 2.840830720883555e-05, + "loss": 0.0125, + "step": 664 + }, + { + "epoch": 1.28144578313253, + "grad_norm": 0.36552608013153076, + "learning_rate": 2.836759421450893e-05, + "loss": 0.021, + "step": 665 + }, + { + "epoch": 1.2833734939759036, + "grad_norm": 0.37177154421806335, + "learning_rate": 2.83268391452037e-05, + "loss": 0.0216, + "step": 666 + }, + { + "epoch": 1.2853012048192771, + "grad_norm": 0.20932547748088837, + "learning_rate": 2.828604220584958e-05, + "loss": 0.0077, + "step": 667 + }, + { + "epoch": 1.2872289156626506, + "grad_norm": 0.5158557295799255, + "learning_rate": 2.824520360158681e-05, + "loss": 0.0394, + "step": 668 + }, + { + "epoch": 1.2891566265060241, + "grad_norm": 0.22623969614505768, + "learning_rate": 2.820432353776515e-05, + "loss": 0.0087, + "step": 669 + }, + { + "epoch": 1.2910843373493976, + "grad_norm": 0.2996046245098114, + "learning_rate": 2.8163402219942822e-05, + "loss": 0.01, + "step": 670 + }, + { + "epoch": 1.2930120481927712, + "grad_norm": 0.24957989156246185, + "learning_rate": 2.8122439853885488e-05, + "loss": 0.0127, + "step": 671 + }, + { + "epoch": 1.2949397590361444, + "grad_norm": 0.2636559307575226, + "learning_rate": 2.8081436645565216e-05, + "loss": 0.0128, + "step": 672 + }, + { + "epoch": 1.2968674698795182, + "grad_norm": 0.3531591296195984, + "learning_rate": 2.804039280115944e-05, + "loss": 0.0199, + "step": 673 + }, + { + "epoch": 1.2987951807228915, + "grad_norm": 0.3682299852371216, + "learning_rate": 2.7999308527049927e-05, + "loss": 0.0088, + "step": 674 + }, + { + "epoch": 1.3007228915662652, + "grad_norm": 0.19555217027664185, + "learning_rate": 2.795818402982174e-05, + "loss": 0.0084, + "step": 675 + }, + { + "epoch": 1.3026506024096385, + "grad_norm": 0.2864912450313568, + "learning_rate": 2.7917019516262186e-05, + "loss": 0.0154, + "step": 676 + }, + { + "epoch": 1.304578313253012, + "grad_norm": 0.2211237996816635, + "learning_rate": 2.78758151933598e-05, + "loss": 0.0078, + "step": 677 + }, + { + "epoch": 1.3065060240963855, + "grad_norm": 0.13646945357322693, + "learning_rate": 2.7834571268303294e-05, + "loss": 0.0058, + "step": 678 + }, + { + "epoch": 1.308433734939759, + "grad_norm": 0.16530285775661469, + "learning_rate": 2.779328794848049e-05, + "loss": 0.007, + "step": 679 + }, + { + "epoch": 1.3103614457831325, + "grad_norm": 0.2145693302154541, + "learning_rate": 2.7751965441477325e-05, + "loss": 0.0203, + "step": 680 + }, + { + "epoch": 1.312289156626506, + "grad_norm": 0.24273739755153656, + "learning_rate": 2.771060395507677e-05, + "loss": 0.0106, + "step": 681 + }, + { + "epoch": 1.3142168674698795, + "grad_norm": 0.20430618524551392, + "learning_rate": 2.7669203697257794e-05, + "loss": 0.0122, + "step": 682 + }, + { + "epoch": 1.316144578313253, + "grad_norm": 0.2502615749835968, + "learning_rate": 2.7627764876194335e-05, + "loss": 0.0101, + "step": 683 + }, + { + "epoch": 1.3180722891566266, + "grad_norm": 0.287239670753479, + "learning_rate": 2.7586287700254214e-05, + "loss": 0.0203, + "step": 684 + }, + { + "epoch": 1.32, + "grad_norm": 0.16239754855632782, + "learning_rate": 2.7544772377998147e-05, + "loss": 0.0084, + "step": 685 + }, + { + "epoch": 1.3219277108433736, + "grad_norm": 0.27174142003059387, + "learning_rate": 2.7503219118178636e-05, + "loss": 0.008, + "step": 686 + }, + { + "epoch": 1.3238554216867469, + "grad_norm": 0.12878240644931793, + "learning_rate": 2.7461628129738954e-05, + "loss": 0.0053, + "step": 687 + }, + { + "epoch": 1.3257831325301206, + "grad_norm": 0.16112515330314636, + "learning_rate": 2.7419999621812086e-05, + "loss": 0.0059, + "step": 688 + }, + { + "epoch": 1.3277108433734939, + "grad_norm": 0.2398834228515625, + "learning_rate": 2.7378333803719672e-05, + "loss": 0.0095, + "step": 689 + }, + { + "epoch": 1.3296385542168676, + "grad_norm": 0.18516193330287933, + "learning_rate": 2.733663088497097e-05, + "loss": 0.0071, + "step": 690 + }, + { + "epoch": 1.331566265060241, + "grad_norm": 0.2974924147129059, + "learning_rate": 2.7294891075261785e-05, + "loss": 0.0227, + "step": 691 + }, + { + "epoch": 1.3334939759036144, + "grad_norm": 0.12931054830551147, + "learning_rate": 2.7253114584473418e-05, + "loss": 0.0039, + "step": 692 + }, + { + "epoch": 1.335421686746988, + "grad_norm": 0.16319474577903748, + "learning_rate": 2.7211301622671623e-05, + "loss": 0.008, + "step": 693 + }, + { + "epoch": 1.3373493975903614, + "grad_norm": 0.27622169256210327, + "learning_rate": 2.7169452400105533e-05, + "loss": 0.0238, + "step": 694 + }, + { + "epoch": 1.339277108433735, + "grad_norm": 0.45309779047966003, + "learning_rate": 2.712756712720663e-05, + "loss": 0.0439, + "step": 695 + }, + { + "epoch": 1.3412048192771084, + "grad_norm": 0.2469855099916458, + "learning_rate": 2.708564601458765e-05, + "loss": 0.0085, + "step": 696 + }, + { + "epoch": 1.343132530120482, + "grad_norm": 0.4245856702327728, + "learning_rate": 2.7043689273041535e-05, + "loss": 0.0097, + "step": 697 + }, + { + "epoch": 1.3450602409638555, + "grad_norm": 0.26796087622642517, + "learning_rate": 2.7001697113540414e-05, + "loss": 0.0119, + "step": 698 + }, + { + "epoch": 1.346987951807229, + "grad_norm": 0.3569283187389374, + "learning_rate": 2.6959669747234482e-05, + "loss": 0.0096, + "step": 699 + }, + { + "epoch": 1.3489156626506025, + "grad_norm": 0.7038524150848389, + "learning_rate": 2.6917607385450973e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 1.350843373493976, + "grad_norm": 0.23568563163280487, + "learning_rate": 2.687551023969308e-05, + "loss": 0.0112, + "step": 701 + }, + { + "epoch": 1.3527710843373493, + "grad_norm": 0.20338499546051025, + "learning_rate": 2.6833378521638935e-05, + "loss": 0.0092, + "step": 702 + }, + { + "epoch": 1.354698795180723, + "grad_norm": 4.22187614440918, + "learning_rate": 2.679121244314046e-05, + "loss": 0.0314, + "step": 703 + }, + { + "epoch": 1.3566265060240963, + "grad_norm": 0.2542206048965454, + "learning_rate": 2.674901221622239e-05, + "loss": 0.0158, + "step": 704 + }, + { + "epoch": 1.3585542168674698, + "grad_norm": 0.49705010652542114, + "learning_rate": 2.670677805308116e-05, + "loss": 0.0162, + "step": 705 + }, + { + "epoch": 1.3604819277108433, + "grad_norm": 0.17502115666866302, + "learning_rate": 2.666451016608383e-05, + "loss": 0.0074, + "step": 706 + }, + { + "epoch": 1.3624096385542168, + "grad_norm": 0.21738742291927338, + "learning_rate": 2.6622208767767075e-05, + "loss": 0.0135, + "step": 707 + }, + { + "epoch": 1.3643373493975903, + "grad_norm": 0.3309847414493561, + "learning_rate": 2.6579874070836032e-05, + "loss": 0.0107, + "step": 708 + }, + { + "epoch": 1.3662650602409638, + "grad_norm": 0.10706827789545059, + "learning_rate": 2.6537506288163303e-05, + "loss": 0.0043, + "step": 709 + }, + { + "epoch": 1.3681927710843373, + "grad_norm": 0.173640176653862, + "learning_rate": 2.6495105632787835e-05, + "loss": 0.0092, + "step": 710 + }, + { + "epoch": 1.3701204819277109, + "grad_norm": 0.2636397182941437, + "learning_rate": 2.6452672317913893e-05, + "loss": 0.0097, + "step": 711 + }, + { + "epoch": 1.3720481927710844, + "grad_norm": 0.28485360741615295, + "learning_rate": 2.6410206556909943e-05, + "loss": 0.0193, + "step": 712 + }, + { + "epoch": 1.3739759036144579, + "grad_norm": 0.23210027813911438, + "learning_rate": 2.636770856330761e-05, + "loss": 0.0229, + "step": 713 + }, + { + "epoch": 1.3759036144578314, + "grad_norm": 0.13388316333293915, + "learning_rate": 2.6325178550800596e-05, + "loss": 0.004, + "step": 714 + }, + { + "epoch": 1.377831325301205, + "grad_norm": 0.5131422877311707, + "learning_rate": 2.6282616733243603e-05, + "loss": 0.0137, + "step": 715 + }, + { + "epoch": 1.3797590361445784, + "grad_norm": 0.3243267834186554, + "learning_rate": 2.6240023324651258e-05, + "loss": 0.0153, + "step": 716 + }, + { + "epoch": 1.3816867469879517, + "grad_norm": 0.1440611034631729, + "learning_rate": 2.619739853919704e-05, + "loss": 0.0031, + "step": 717 + }, + { + "epoch": 1.3836144578313254, + "grad_norm": 0.30346596240997314, + "learning_rate": 2.6154742591212196e-05, + "loss": 0.0109, + "step": 718 + }, + { + "epoch": 1.3855421686746987, + "grad_norm": 0.19109240174293518, + "learning_rate": 2.611205569518468e-05, + "loss": 0.0094, + "step": 719 + }, + { + "epoch": 1.3874698795180722, + "grad_norm": 0.28636518120765686, + "learning_rate": 2.6069338065758056e-05, + "loss": 0.0123, + "step": 720 + }, + { + "epoch": 1.3893975903614457, + "grad_norm": 0.28083911538124084, + "learning_rate": 2.6026589917730416e-05, + "loss": 0.0104, + "step": 721 + }, + { + "epoch": 1.3913253012048192, + "grad_norm": 0.36553966999053955, + "learning_rate": 2.5983811466053327e-05, + "loss": 0.0143, + "step": 722 + }, + { + "epoch": 1.3932530120481927, + "grad_norm": 0.23317205905914307, + "learning_rate": 2.5941002925830708e-05, + "loss": 0.011, + "step": 723 + }, + { + "epoch": 1.3951807228915662, + "grad_norm": 0.3825171887874603, + "learning_rate": 2.589816451231781e-05, + "loss": 0.0098, + "step": 724 + }, + { + "epoch": 1.3971084337349398, + "grad_norm": 0.19916608929634094, + "learning_rate": 2.585529644092006e-05, + "loss": 0.0094, + "step": 725 + }, + { + "epoch": 1.3990361445783133, + "grad_norm": 0.19990523159503937, + "learning_rate": 2.5812398927192027e-05, + "loss": 0.0128, + "step": 726 + }, + { + "epoch": 1.4009638554216868, + "grad_norm": 0.34662899374961853, + "learning_rate": 2.5769472186836347e-05, + "loss": 0.0091, + "step": 727 + }, + { + "epoch": 1.4028915662650603, + "grad_norm": 0.23481112718582153, + "learning_rate": 2.5726516435702583e-05, + "loss": 0.0154, + "step": 728 + }, + { + "epoch": 1.4048192771084338, + "grad_norm": 0.1846667379140854, + "learning_rate": 2.5683531889786194e-05, + "loss": 0.0088, + "step": 729 + }, + { + "epoch": 1.4067469879518073, + "grad_norm": 0.16717663407325745, + "learning_rate": 2.564051876522742e-05, + "loss": 0.0083, + "step": 730 + }, + { + "epoch": 1.4086746987951808, + "grad_norm": 0.4116475284099579, + "learning_rate": 2.5597477278310202e-05, + "loss": 0.0179, + "step": 731 + }, + { + "epoch": 1.410602409638554, + "grad_norm": 0.171807661652565, + "learning_rate": 2.5554407645461115e-05, + "loss": 0.0063, + "step": 732 + }, + { + "epoch": 1.4125301204819278, + "grad_norm": 0.1954439878463745, + "learning_rate": 2.5511310083248243e-05, + "loss": 0.017, + "step": 733 + }, + { + "epoch": 1.4144578313253011, + "grad_norm": 0.37158989906311035, + "learning_rate": 2.5468184808380104e-05, + "loss": 0.0173, + "step": 734 + }, + { + "epoch": 1.4163855421686746, + "grad_norm": 0.2001633644104004, + "learning_rate": 2.542503203770458e-05, + "loss": 0.0165, + "step": 735 + }, + { + "epoch": 1.4183132530120481, + "grad_norm": 0.45673373341560364, + "learning_rate": 2.53818519882078e-05, + "loss": 0.0185, + "step": 736 + }, + { + "epoch": 1.4202409638554216, + "grad_norm": 0.3838701546192169, + "learning_rate": 2.5338644877013067e-05, + "loss": 0.0134, + "step": 737 + }, + { + "epoch": 1.4221686746987952, + "grad_norm": 0.32032477855682373, + "learning_rate": 2.5295410921379745e-05, + "loss": 0.0143, + "step": 738 + }, + { + "epoch": 1.4240963855421687, + "grad_norm": 0.4594039022922516, + "learning_rate": 2.52521503387022e-05, + "loss": 0.0193, + "step": 739 + }, + { + "epoch": 1.4260240963855422, + "grad_norm": 0.3889620900154114, + "learning_rate": 2.5208863346508667e-05, + "loss": 0.0114, + "step": 740 + }, + { + "epoch": 1.4279518072289157, + "grad_norm": 0.33153319358825684, + "learning_rate": 2.5165550162460203e-05, + "loss": 0.0102, + "step": 741 + }, + { + "epoch": 1.4298795180722892, + "grad_norm": 0.7269518375396729, + "learning_rate": 2.5122211004349536e-05, + "loss": 0.0215, + "step": 742 + }, + { + "epoch": 1.4318072289156627, + "grad_norm": 0.31653261184692383, + "learning_rate": 2.5078846090100023e-05, + "loss": 0.0115, + "step": 743 + }, + { + "epoch": 1.4337349397590362, + "grad_norm": 0.20620353519916534, + "learning_rate": 2.5035455637764518e-05, + "loss": 0.0153, + "step": 744 + }, + { + "epoch": 1.4356626506024097, + "grad_norm": 0.17266008257865906, + "learning_rate": 2.4992039865524297e-05, + "loss": 0.0069, + "step": 745 + }, + { + "epoch": 1.4375903614457832, + "grad_norm": 0.24760811030864716, + "learning_rate": 2.494859899168795e-05, + "loss": 0.0108, + "step": 746 + }, + { + "epoch": 1.4395180722891565, + "grad_norm": 0.2584865391254425, + "learning_rate": 2.4905133234690282e-05, + "loss": 0.0095, + "step": 747 + }, + { + "epoch": 1.4414457831325302, + "grad_norm": 0.48847514390945435, + "learning_rate": 2.486164281309122e-05, + "loss": 0.0181, + "step": 748 + }, + { + "epoch": 1.4433734939759035, + "grad_norm": 0.42942047119140625, + "learning_rate": 2.4818127945574717e-05, + "loss": 0.025, + "step": 749 + }, + { + "epoch": 1.445301204819277, + "grad_norm": 0.23713800311088562, + "learning_rate": 2.4774588850947648e-05, + "loss": 0.0085, + "step": 750 + }, + { + "epoch": 1.4472289156626506, + "grad_norm": 0.8797569870948792, + "learning_rate": 2.473102574813871e-05, + "loss": 0.0097, + "step": 751 + }, + { + "epoch": 1.449156626506024, + "grad_norm": 0.2744862735271454, + "learning_rate": 2.4687438856197302e-05, + "loss": 0.0122, + "step": 752 + }, + { + "epoch": 1.4510843373493976, + "grad_norm": 0.12747010588645935, + "learning_rate": 2.4643828394292478e-05, + "loss": 0.0056, + "step": 753 + }, + { + "epoch": 1.453012048192771, + "grad_norm": 0.37376829981803894, + "learning_rate": 2.4600194581711775e-05, + "loss": 0.0052, + "step": 754 + }, + { + "epoch": 1.4549397590361446, + "grad_norm": 0.2536911368370056, + "learning_rate": 2.4556537637860176e-05, + "loss": 0.0113, + "step": 755 + }, + { + "epoch": 1.456867469879518, + "grad_norm": 0.25950780510902405, + "learning_rate": 2.451285778225894e-05, + "loss": 0.0099, + "step": 756 + }, + { + "epoch": 1.4587951807228916, + "grad_norm": 0.19535955786705017, + "learning_rate": 2.4469155234544565e-05, + "loss": 0.0069, + "step": 757 + }, + { + "epoch": 1.4607228915662651, + "grad_norm": 0.22816115617752075, + "learning_rate": 2.442543021446764e-05, + "loss": 0.0088, + "step": 758 + }, + { + "epoch": 1.4626506024096386, + "grad_norm": 0.3363986313343048, + "learning_rate": 2.4381682941891755e-05, + "loss": 0.0182, + "step": 759 + }, + { + "epoch": 1.464578313253012, + "grad_norm": 0.21492891013622284, + "learning_rate": 2.4337913636792382e-05, + "loss": 0.0069, + "step": 760 + }, + { + "epoch": 1.4665060240963856, + "grad_norm": 0.6070862412452698, + "learning_rate": 2.429412251925579e-05, + "loss": 0.0406, + "step": 761 + }, + { + "epoch": 1.468433734939759, + "grad_norm": 2.6469690799713135, + "learning_rate": 2.425030980947793e-05, + "loss": 0.0205, + "step": 762 + }, + { + "epoch": 1.4703614457831327, + "grad_norm": 0.30909740924835205, + "learning_rate": 2.420647572776332e-05, + "loss": 0.0136, + "step": 763 + }, + { + "epoch": 1.472289156626506, + "grad_norm": 0.6639553904533386, + "learning_rate": 2.416262049452395e-05, + "loss": 0.011, + "step": 764 + }, + { + "epoch": 1.4742168674698795, + "grad_norm": 0.2919616997241974, + "learning_rate": 2.4118744330278147e-05, + "loss": 0.0131, + "step": 765 + }, + { + "epoch": 1.476144578313253, + "grad_norm": 0.5232429504394531, + "learning_rate": 2.4074847455649523e-05, + "loss": 0.0138, + "step": 766 + }, + { + "epoch": 1.4780722891566265, + "grad_norm": 5.630630970001221, + "learning_rate": 2.403093009136579e-05, + "loss": 0.0264, + "step": 767 + }, + { + "epoch": 1.48, + "grad_norm": 0.33234721422195435, + "learning_rate": 2.3986992458257707e-05, + "loss": 0.0111, + "step": 768 + }, + { + "epoch": 1.4819277108433735, + "grad_norm": 0.28444772958755493, + "learning_rate": 2.3943034777257945e-05, + "loss": 0.0144, + "step": 769 + }, + { + "epoch": 1.483855421686747, + "grad_norm": 0.16229979693889618, + "learning_rate": 2.38990572694e-05, + "loss": 0.0062, + "step": 770 + }, + { + "epoch": 1.4857831325301205, + "grad_norm": 0.27474716305732727, + "learning_rate": 2.385506015581704e-05, + "loss": 0.0172, + "step": 771 + }, + { + "epoch": 1.487710843373494, + "grad_norm": 0.246526300907135, + "learning_rate": 2.381104365774083e-05, + "loss": 0.012, + "step": 772 + }, + { + "epoch": 1.4896385542168675, + "grad_norm": 0.282047837972641, + "learning_rate": 2.37670079965006e-05, + "loss": 0.0116, + "step": 773 + }, + { + "epoch": 1.491566265060241, + "grad_norm": 0.2878139317035675, + "learning_rate": 2.3722953393521944e-05, + "loss": 0.0147, + "step": 774 + }, + { + "epoch": 1.4934939759036143, + "grad_norm": 0.5586277842521667, + "learning_rate": 2.367888007032571e-05, + "loss": 0.0111, + "step": 775 + }, + { + "epoch": 1.495421686746988, + "grad_norm": 0.562160313129425, + "learning_rate": 2.3634788248526846e-05, + "loss": 0.0061, + "step": 776 + }, + { + "epoch": 1.4973493975903613, + "grad_norm": 0.3452005982398987, + "learning_rate": 2.3590678149833356e-05, + "loss": 0.0205, + "step": 777 + }, + { + "epoch": 1.499277108433735, + "grad_norm": 0.7757686376571655, + "learning_rate": 2.3546549996045114e-05, + "loss": 0.0273, + "step": 778 + }, + { + "epoch": 1.5012048192771084, + "grad_norm": 0.19530551135540009, + "learning_rate": 2.3502404009052812e-05, + "loss": 0.0083, + "step": 779 + }, + { + "epoch": 1.503132530120482, + "grad_norm": 0.2586531639099121, + "learning_rate": 2.3458240410836775e-05, + "loss": 0.0122, + "step": 780 + }, + { + "epoch": 1.5050602409638554, + "grad_norm": 0.30063286423683167, + "learning_rate": 2.3414059423465924e-05, + "loss": 0.0083, + "step": 781 + }, + { + "epoch": 1.5069879518072289, + "grad_norm": 0.18663185834884644, + "learning_rate": 2.3369861269096575e-05, + "loss": 0.0104, + "step": 782 + }, + { + "epoch": 1.5089156626506024, + "grad_norm": 0.4405941069126129, + "learning_rate": 2.3325646169971416e-05, + "loss": 0.0264, + "step": 783 + }, + { + "epoch": 1.510843373493976, + "grad_norm": 0.2947913110256195, + "learning_rate": 2.3281414348418294e-05, + "loss": 0.0107, + "step": 784 + }, + { + "epoch": 1.5127710843373494, + "grad_norm": 0.23813778162002563, + "learning_rate": 2.3237166026849158e-05, + "loss": 0.0084, + "step": 785 + }, + { + "epoch": 1.514698795180723, + "grad_norm": 0.33380329608917236, + "learning_rate": 2.3192901427758932e-05, + "loss": 0.0111, + "step": 786 + }, + { + "epoch": 1.5166265060240964, + "grad_norm": 0.3736988306045532, + "learning_rate": 2.314862077372438e-05, + "loss": 0.0135, + "step": 787 + }, + { + "epoch": 1.5185542168674697, + "grad_norm": 0.3785395920276642, + "learning_rate": 2.3104324287402996e-05, + "loss": 0.0265, + "step": 788 + }, + { + "epoch": 1.5204819277108435, + "grad_norm": 0.3359154462814331, + "learning_rate": 2.3060012191531885e-05, + "loss": 0.0127, + "step": 789 + }, + { + "epoch": 1.5224096385542167, + "grad_norm": 0.720753014087677, + "learning_rate": 2.301568470892664e-05, + "loss": 0.0134, + "step": 790 + }, + { + "epoch": 1.5243373493975905, + "grad_norm": 0.36473193764686584, + "learning_rate": 2.297134206248024e-05, + "loss": 0.0318, + "step": 791 + }, + { + "epoch": 1.5262650602409638, + "grad_norm": 0.29987087845802307, + "learning_rate": 2.2926984475161884e-05, + "loss": 0.008, + "step": 792 + }, + { + "epoch": 1.5281927710843375, + "grad_norm": 0.2883112132549286, + "learning_rate": 2.2882612170015914e-05, + "loss": 0.0125, + "step": 793 + }, + { + "epoch": 1.5301204819277108, + "grad_norm": 0.28983229398727417, + "learning_rate": 2.2838225370160682e-05, + "loss": 0.0155, + "step": 794 + }, + { + "epoch": 1.5320481927710843, + "grad_norm": 0.47236886620521545, + "learning_rate": 2.2793824298787414e-05, + "loss": 0.0132, + "step": 795 + }, + { + "epoch": 1.5339759036144578, + "grad_norm": 0.8328865170478821, + "learning_rate": 2.2749409179159104e-05, + "loss": 0.026, + "step": 796 + }, + { + "epoch": 1.5359036144578313, + "grad_norm": 0.3129172623157501, + "learning_rate": 2.2704980234609396e-05, + "loss": 0.0099, + "step": 797 + }, + { + "epoch": 1.5378313253012048, + "grad_norm": 0.22284500300884247, + "learning_rate": 2.2660537688541416e-05, + "loss": 0.009, + "step": 798 + }, + { + "epoch": 1.5397590361445783, + "grad_norm": 0.3346405625343323, + "learning_rate": 2.2616081764426726e-05, + "loss": 0.0077, + "step": 799 + }, + { + "epoch": 1.5416867469879518, + "grad_norm": 0.2923565208911896, + "learning_rate": 2.2571612685804124e-05, + "loss": 0.0119, + "step": 800 + }, + { + "epoch": 1.5436144578313253, + "grad_norm": 0.1921311914920807, + "learning_rate": 2.252713067627857e-05, + "loss": 0.0083, + "step": 801 + }, + { + "epoch": 1.5455421686746988, + "grad_norm": 0.23221106827259064, + "learning_rate": 2.2482635959520044e-05, + "loss": 0.0049, + "step": 802 + }, + { + "epoch": 1.5474698795180721, + "grad_norm": 0.6340724229812622, + "learning_rate": 2.243812875926241e-05, + "loss": 0.0273, + "step": 803 + }, + { + "epoch": 1.5493975903614459, + "grad_norm": 0.2699439823627472, + "learning_rate": 2.2393609299302314e-05, + "loss": 0.0108, + "step": 804 + }, + { + "epoch": 1.5513253012048192, + "grad_norm": 0.2005189210176468, + "learning_rate": 2.2349077803498052e-05, + "loss": 0.0076, + "step": 805 + }, + { + "epoch": 1.5532530120481929, + "grad_norm": 0.39668548107147217, + "learning_rate": 2.230453449576842e-05, + "loss": 0.0135, + "step": 806 + }, + { + "epoch": 1.5551807228915662, + "grad_norm": 0.2406950294971466, + "learning_rate": 2.2259979600091635e-05, + "loss": 0.0094, + "step": 807 + }, + { + "epoch": 1.55710843373494, + "grad_norm": 0.30363157391548157, + "learning_rate": 2.2215413340504158e-05, + "loss": 0.0178, + "step": 808 + }, + { + "epoch": 1.5590361445783132, + "grad_norm": 0.19508181512355804, + "learning_rate": 2.2170835941099605e-05, + "loss": 0.0069, + "step": 809 + }, + { + "epoch": 1.5609638554216867, + "grad_norm": 0.734106719493866, + "learning_rate": 2.2126247626027615e-05, + "loss": 0.0319, + "step": 810 + }, + { + "epoch": 1.5628915662650602, + "grad_norm": 0.2591583728790283, + "learning_rate": 2.208164861949268e-05, + "loss": 0.0168, + "step": 811 + }, + { + "epoch": 1.5648192771084337, + "grad_norm": 0.2386734038591385, + "learning_rate": 2.20370391457531e-05, + "loss": 0.0041, + "step": 812 + }, + { + "epoch": 1.5667469879518072, + "grad_norm": 0.1675218939781189, + "learning_rate": 2.1992419429119764e-05, + "loss": 0.0078, + "step": 813 + }, + { + "epoch": 1.5686746987951807, + "grad_norm": 0.45591506361961365, + "learning_rate": 2.1947789693955097e-05, + "loss": 0.0166, + "step": 814 + }, + { + "epoch": 1.5706024096385542, + "grad_norm": 0.46940621733665466, + "learning_rate": 2.190315016467188e-05, + "loss": 0.0176, + "step": 815 + }, + { + "epoch": 1.5725301204819278, + "grad_norm": 0.2294205278158188, + "learning_rate": 2.1858501065732146e-05, + "loss": 0.0102, + "step": 816 + }, + { + "epoch": 1.5744578313253013, + "grad_norm": 0.28922322392463684, + "learning_rate": 2.181384262164606e-05, + "loss": 0.0111, + "step": 817 + }, + { + "epoch": 1.5763855421686745, + "grad_norm": 0.19650064408779144, + "learning_rate": 2.1769175056970765e-05, + "loss": 0.0076, + "step": 818 + }, + { + "epoch": 1.5783132530120483, + "grad_norm": 0.19538825750350952, + "learning_rate": 2.172449859630927e-05, + "loss": 0.0118, + "step": 819 + }, + { + "epoch": 1.5802409638554216, + "grad_norm": 0.1900389939546585, + "learning_rate": 2.167981346430931e-05, + "loss": 0.0066, + "step": 820 + }, + { + "epoch": 1.5821686746987953, + "grad_norm": 0.21593710780143738, + "learning_rate": 2.1635119885662235e-05, + "loss": 0.0101, + "step": 821 + }, + { + "epoch": 1.5840963855421686, + "grad_norm": 0.2699289321899414, + "learning_rate": 2.159041808510185e-05, + "loss": 0.0118, + "step": 822 + }, + { + "epoch": 1.5860240963855423, + "grad_norm": 0.31867673993110657, + "learning_rate": 2.1545708287403322e-05, + "loss": 0.0122, + "step": 823 + }, + { + "epoch": 1.5879518072289156, + "grad_norm": 0.2862400412559509, + "learning_rate": 2.1500990717382004e-05, + "loss": 0.0216, + "step": 824 + }, + { + "epoch": 1.589879518072289, + "grad_norm": 0.28482481837272644, + "learning_rate": 2.145626559989237e-05, + "loss": 0.0136, + "step": 825 + }, + { + "epoch": 1.5918072289156626, + "grad_norm": 0.2866958975791931, + "learning_rate": 2.1411533159826803e-05, + "loss": 0.0298, + "step": 826 + }, + { + "epoch": 1.5937349397590361, + "grad_norm": 0.39092838764190674, + "learning_rate": 2.1366793622114533e-05, + "loss": 0.0382, + "step": 827 + }, + { + "epoch": 1.5956626506024096, + "grad_norm": 0.16381537914276123, + "learning_rate": 2.1322047211720468e-05, + "loss": 0.0074, + "step": 828 + }, + { + "epoch": 1.5975903614457831, + "grad_norm": 0.22146940231323242, + "learning_rate": 2.1277294153644083e-05, + "loss": 0.0103, + "step": 829 + }, + { + "epoch": 1.5995180722891567, + "grad_norm": 0.2155209183692932, + "learning_rate": 2.123253467291827e-05, + "loss": 0.0095, + "step": 830 + }, + { + "epoch": 1.6014457831325302, + "grad_norm": 0.41510409116744995, + "learning_rate": 2.118776899460822e-05, + "loss": 0.0457, + "step": 831 + }, + { + "epoch": 1.6033734939759037, + "grad_norm": 0.19718150794506073, + "learning_rate": 2.1142997343810293e-05, + "loss": 0.0192, + "step": 832 + }, + { + "epoch": 1.605301204819277, + "grad_norm": 0.40924403071403503, + "learning_rate": 2.1098219945650865e-05, + "loss": 0.0278, + "step": 833 + }, + { + "epoch": 1.6072289156626507, + "grad_norm": 0.18657824397087097, + "learning_rate": 2.105343702528524e-05, + "loss": 0.0076, + "step": 834 + }, + { + "epoch": 1.609156626506024, + "grad_norm": 0.1727641075849533, + "learning_rate": 2.100864880789645e-05, + "loss": 0.0076, + "step": 835 + }, + { + "epoch": 1.6110843373493977, + "grad_norm": 0.18138745427131653, + "learning_rate": 2.0963855518694203e-05, + "loss": 0.005, + "step": 836 + }, + { + "epoch": 1.613012048192771, + "grad_norm": 0.19173955917358398, + "learning_rate": 2.0919057382913675e-05, + "loss": 0.0084, + "step": 837 + }, + { + "epoch": 1.6149397590361447, + "grad_norm": 0.3812403380870819, + "learning_rate": 2.0874254625814435e-05, + "loss": 0.009, + "step": 838 + }, + { + "epoch": 1.616867469879518, + "grad_norm": 0.2009759545326233, + "learning_rate": 2.0829447472679285e-05, + "loss": 0.0098, + "step": 839 + }, + { + "epoch": 1.6187951807228915, + "grad_norm": 0.48703446984291077, + "learning_rate": 2.0784636148813124e-05, + "loss": 0.0099, + "step": 840 + }, + { + "epoch": 1.620722891566265, + "grad_norm": 0.28995075821876526, + "learning_rate": 2.0739820879541827e-05, + "loss": 0.0075, + "step": 841 + }, + { + "epoch": 1.6226506024096385, + "grad_norm": 0.2130059450864792, + "learning_rate": 2.069500189021111e-05, + "loss": 0.007, + "step": 842 + }, + { + "epoch": 1.624578313253012, + "grad_norm": 0.252524733543396, + "learning_rate": 2.0650179406185397e-05, + "loss": 0.0249, + "step": 843 + }, + { + "epoch": 1.6265060240963856, + "grad_norm": 0.23069098591804504, + "learning_rate": 2.060535365284668e-05, + "loss": 0.0084, + "step": 844 + }, + { + "epoch": 1.628433734939759, + "grad_norm": 0.25051403045654297, + "learning_rate": 2.056052485559338e-05, + "loss": 0.0071, + "step": 845 + }, + { + "epoch": 1.6303614457831326, + "grad_norm": 0.27664798498153687, + "learning_rate": 2.051569323983924e-05, + "loss": 0.0198, + "step": 846 + }, + { + "epoch": 1.632289156626506, + "grad_norm": 0.2954922318458557, + "learning_rate": 2.047085903101218e-05, + "loss": 0.006, + "step": 847 + }, + { + "epoch": 1.6342168674698794, + "grad_norm": 0.28477591276168823, + "learning_rate": 2.0426022454553137e-05, + "loss": 0.0147, + "step": 848 + }, + { + "epoch": 1.636144578313253, + "grad_norm": 0.2785305678844452, + "learning_rate": 2.0381183735914968e-05, + "loss": 0.0117, + "step": 849 + }, + { + "epoch": 1.6380722891566264, + "grad_norm": 0.2500309348106384, + "learning_rate": 2.0336343100561295e-05, + "loss": 0.008, + "step": 850 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.18932047486305237, + "learning_rate": 2.0291500773965392e-05, + "loss": 0.0256, + "step": 851 + }, + { + "epoch": 1.6419277108433734, + "grad_norm": 0.6396257877349854, + "learning_rate": 2.0246656981609013e-05, + "loss": 0.0141, + "step": 852 + }, + { + "epoch": 1.6438554216867471, + "grad_norm": 0.5072891116142273, + "learning_rate": 2.02018119489813e-05, + "loss": 0.008, + "step": 853 + }, + { + "epoch": 1.6457831325301204, + "grad_norm": 0.2920839488506317, + "learning_rate": 2.0156965901577635e-05, + "loss": 0.0085, + "step": 854 + }, + { + "epoch": 1.647710843373494, + "grad_norm": 0.1391262263059616, + "learning_rate": 2.011211906489848e-05, + "loss": 0.0078, + "step": 855 + }, + { + "epoch": 1.6496385542168674, + "grad_norm": 0.29620468616485596, + "learning_rate": 2.00672716644483e-05, + "loss": 0.0109, + "step": 856 + }, + { + "epoch": 1.651566265060241, + "grad_norm": 0.13946573436260223, + "learning_rate": 2.002242392573436e-05, + "loss": 0.0076, + "step": 857 + }, + { + "epoch": 1.6534939759036145, + "grad_norm": 0.9766128659248352, + "learning_rate": 1.997757607426565e-05, + "loss": 0.0309, + "step": 858 + }, + { + "epoch": 1.655421686746988, + "grad_norm": 0.18002203106880188, + "learning_rate": 1.9932728335551702e-05, + "loss": 0.0072, + "step": 859 + }, + { + "epoch": 1.6573493975903615, + "grad_norm": 0.28073111176490784, + "learning_rate": 1.988788093510152e-05, + "loss": 0.0246, + "step": 860 + }, + { + "epoch": 1.659277108433735, + "grad_norm": 0.1919957399368286, + "learning_rate": 1.9843034098422375e-05, + "loss": 0.0087, + "step": 861 + }, + { + "epoch": 1.6612048192771085, + "grad_norm": 0.1825258433818817, + "learning_rate": 1.9798188051018705e-05, + "loss": 0.0092, + "step": 862 + }, + { + "epoch": 1.6631325301204818, + "grad_norm": 0.32412952184677124, + "learning_rate": 1.9753343018390997e-05, + "loss": 0.0118, + "step": 863 + }, + { + "epoch": 1.6650602409638555, + "grad_norm": 0.12828563153743744, + "learning_rate": 1.9708499226034618e-05, + "loss": 0.0056, + "step": 864 + }, + { + "epoch": 1.6669879518072288, + "grad_norm": 0.18647560477256775, + "learning_rate": 1.966365689943871e-05, + "loss": 0.0094, + "step": 865 + }, + { + "epoch": 1.6689156626506025, + "grad_norm": 0.19835828244686127, + "learning_rate": 1.9618816264085042e-05, + "loss": 0.0097, + "step": 866 + }, + { + "epoch": 1.6708433734939758, + "grad_norm": 0.22364282608032227, + "learning_rate": 1.957397754544687e-05, + "loss": 0.0062, + "step": 867 + }, + { + "epoch": 1.6727710843373496, + "grad_norm": 0.29420018196105957, + "learning_rate": 1.952914096898783e-05, + "loss": 0.0182, + "step": 868 + }, + { + "epoch": 1.6746987951807228, + "grad_norm": 0.2149929702281952, + "learning_rate": 1.9484306760160766e-05, + "loss": 0.0125, + "step": 869 + }, + { + "epoch": 1.6766265060240964, + "grad_norm": 0.16844330728054047, + "learning_rate": 1.9439475144406623e-05, + "loss": 0.0074, + "step": 870 + }, + { + "epoch": 1.6785542168674699, + "grad_norm": 0.5010282397270203, + "learning_rate": 1.9394646347153334e-05, + "loss": 0.0213, + "step": 871 + }, + { + "epoch": 1.6804819277108434, + "grad_norm": 0.29847195744514465, + "learning_rate": 1.9349820593814606e-05, + "loss": 0.0173, + "step": 872 + }, + { + "epoch": 1.6824096385542169, + "grad_norm": 0.23835812509059906, + "learning_rate": 1.930499810978889e-05, + "loss": 0.011, + "step": 873 + }, + { + "epoch": 1.6843373493975904, + "grad_norm": 0.3269020617008209, + "learning_rate": 1.9260179120458177e-05, + "loss": 0.0285, + "step": 874 + }, + { + "epoch": 1.686265060240964, + "grad_norm": 0.2142144739627838, + "learning_rate": 1.9215363851186883e-05, + "loss": 0.0146, + "step": 875 + }, + { + "epoch": 1.6881927710843372, + "grad_norm": 0.3098377585411072, + "learning_rate": 1.9170552527320725e-05, + "loss": 0.0104, + "step": 876 + }, + { + "epoch": 1.690120481927711, + "grad_norm": 0.22504115104675293, + "learning_rate": 1.9125745374185568e-05, + "loss": 0.0091, + "step": 877 + }, + { + "epoch": 1.6920481927710842, + "grad_norm": 0.20633333921432495, + "learning_rate": 1.908094261708633e-05, + "loss": 0.0097, + "step": 878 + }, + { + "epoch": 1.693975903614458, + "grad_norm": 1.179566502571106, + "learning_rate": 1.9036144481305807e-05, + "loss": 0.0143, + "step": 879 + }, + { + "epoch": 1.6959036144578312, + "grad_norm": 0.15525613725185394, + "learning_rate": 1.8991351192103554e-05, + "loss": 0.0062, + "step": 880 + }, + { + "epoch": 1.697831325301205, + "grad_norm": 0.15966367721557617, + "learning_rate": 1.8946562974714763e-05, + "loss": 0.0048, + "step": 881 + }, + { + "epoch": 1.6997590361445782, + "grad_norm": 0.18902607262134552, + "learning_rate": 1.890178005434914e-05, + "loss": 0.0124, + "step": 882 + }, + { + "epoch": 1.701686746987952, + "grad_norm": 0.21692413091659546, + "learning_rate": 1.885700265618971e-05, + "loss": 0.0135, + "step": 883 + }, + { + "epoch": 1.7036144578313253, + "grad_norm": 0.38948455452919006, + "learning_rate": 1.8812231005391786e-05, + "loss": 0.0365, + "step": 884 + }, + { + "epoch": 1.7055421686746988, + "grad_norm": 0.2483491599559784, + "learning_rate": 1.8767465327081736e-05, + "loss": 0.0202, + "step": 885 + }, + { + "epoch": 1.7074698795180723, + "grad_norm": 0.15305832028388977, + "learning_rate": 1.872270584635592e-05, + "loss": 0.0035, + "step": 886 + }, + { + "epoch": 1.7093975903614458, + "grad_norm": 0.17794466018676758, + "learning_rate": 1.867795278827954e-05, + "loss": 0.0157, + "step": 887 + }, + { + "epoch": 1.7113253012048193, + "grad_norm": 0.1938813328742981, + "learning_rate": 1.863320637788547e-05, + "loss": 0.0071, + "step": 888 + }, + { + "epoch": 1.7132530120481928, + "grad_norm": 0.27061617374420166, + "learning_rate": 1.8588466840173207e-05, + "loss": 0.0347, + "step": 889 + }, + { + "epoch": 1.7151807228915663, + "grad_norm": 0.1541014313697815, + "learning_rate": 1.8543734400107637e-05, + "loss": 0.006, + "step": 890 + }, + { + "epoch": 1.7171084337349396, + "grad_norm": 0.1436876654624939, + "learning_rate": 1.8499009282617996e-05, + "loss": 0.0059, + "step": 891 + }, + { + "epoch": 1.7190361445783133, + "grad_norm": 1.0573723316192627, + "learning_rate": 1.8454291712596688e-05, + "loss": 0.008, + "step": 892 + }, + { + "epoch": 1.7209638554216866, + "grad_norm": 0.15406259894371033, + "learning_rate": 1.8409581914898157e-05, + "loss": 0.0061, + "step": 893 + }, + { + "epoch": 1.7228915662650603, + "grad_norm": 0.24822913110256195, + "learning_rate": 1.836488011433777e-05, + "loss": 0.0085, + "step": 894 + }, + { + "epoch": 1.7248192771084336, + "grad_norm": 0.21049316227436066, + "learning_rate": 1.83201865356907e-05, + "loss": 0.0075, + "step": 895 + }, + { + "epoch": 1.7267469879518074, + "grad_norm": 0.24159866571426392, + "learning_rate": 1.8275501403690733e-05, + "loss": 0.0156, + "step": 896 + }, + { + "epoch": 1.7286746987951807, + "grad_norm": 0.3191063106060028, + "learning_rate": 1.823082494302924e-05, + "loss": 0.0218, + "step": 897 + }, + { + "epoch": 1.7306024096385542, + "grad_norm": 0.20296362042427063, + "learning_rate": 1.8186157378353945e-05, + "loss": 0.0126, + "step": 898 + }, + { + "epoch": 1.7325301204819277, + "grad_norm": 0.1905524581670761, + "learning_rate": 1.8141498934267858e-05, + "loss": 0.0131, + "step": 899 + }, + { + "epoch": 1.7344578313253012, + "grad_norm": 0.5350520610809326, + "learning_rate": 1.809684983532813e-05, + "loss": 0.0115, + "step": 900 + }, + { + "epoch": 1.7363855421686747, + "grad_norm": 0.17144092917442322, + "learning_rate": 1.8052210306044907e-05, + "loss": 0.0113, + "step": 901 + }, + { + "epoch": 1.7383132530120482, + "grad_norm": 0.11777982115745544, + "learning_rate": 1.8007580570880236e-05, + "loss": 0.0058, + "step": 902 + }, + { + "epoch": 1.7402409638554217, + "grad_norm": 0.2078275978565216, + "learning_rate": 1.7962960854246908e-05, + "loss": 0.0106, + "step": 903 + }, + { + "epoch": 1.7421686746987952, + "grad_norm": 0.2550877630710602, + "learning_rate": 1.791835138050732e-05, + "loss": 0.0076, + "step": 904 + }, + { + "epoch": 1.7440963855421687, + "grad_norm": 0.11553912609815598, + "learning_rate": 1.7873752373972395e-05, + "loss": 0.0038, + "step": 905 + }, + { + "epoch": 1.746024096385542, + "grad_norm": 0.10724586248397827, + "learning_rate": 1.7829164058900398e-05, + "loss": 0.0043, + "step": 906 + }, + { + "epoch": 1.7479518072289157, + "grad_norm": 0.30152231454849243, + "learning_rate": 1.7784586659495845e-05, + "loss": 0.0099, + "step": 907 + }, + { + "epoch": 1.749879518072289, + "grad_norm": 0.18372933566570282, + "learning_rate": 1.7740020399908372e-05, + "loss": 0.0074, + "step": 908 + }, + { + "epoch": 1.7518072289156628, + "grad_norm": 0.35184428095817566, + "learning_rate": 1.7695465504231586e-05, + "loss": 0.0184, + "step": 909 + }, + { + "epoch": 1.753734939759036, + "grad_norm": 0.15083615481853485, + "learning_rate": 1.765092219650196e-05, + "loss": 0.0061, + "step": 910 + }, + { + "epoch": 1.7556626506024098, + "grad_norm": 0.2599961459636688, + "learning_rate": 1.7606390700697693e-05, + "loss": 0.0101, + "step": 911 + }, + { + "epoch": 1.757590361445783, + "grad_norm": 0.10829206556081772, + "learning_rate": 1.7561871240737595e-05, + "loss": 0.0034, + "step": 912 + }, + { + "epoch": 1.7595180722891566, + "grad_norm": 0.38098782300949097, + "learning_rate": 1.7517364040479966e-05, + "loss": 0.0384, + "step": 913 + }, + { + "epoch": 1.76144578313253, + "grad_norm": 0.14975085854530334, + "learning_rate": 1.7472869323721432e-05, + "loss": 0.0055, + "step": 914 + }, + { + "epoch": 1.7633734939759036, + "grad_norm": 0.4151444733142853, + "learning_rate": 1.742838731419588e-05, + "loss": 0.0307, + "step": 915 + }, + { + "epoch": 1.765301204819277, + "grad_norm": 0.22238481044769287, + "learning_rate": 1.738391823557328e-05, + "loss": 0.0059, + "step": 916 + }, + { + "epoch": 1.7672289156626506, + "grad_norm": 0.23386356234550476, + "learning_rate": 1.7339462311458587e-05, + "loss": 0.0113, + "step": 917 + }, + { + "epoch": 1.7691566265060241, + "grad_norm": 0.21911191940307617, + "learning_rate": 1.7295019765390618e-05, + "loss": 0.0071, + "step": 918 + }, + { + "epoch": 1.7710843373493976, + "grad_norm": 0.343159943819046, + "learning_rate": 1.7250590820840903e-05, + "loss": 0.0144, + "step": 919 + }, + { + "epoch": 1.7730120481927711, + "grad_norm": 0.32204556465148926, + "learning_rate": 1.720617570121259e-05, + "loss": 0.0131, + "step": 920 + }, + { + "epoch": 1.7749397590361444, + "grad_norm": 0.4105585515499115, + "learning_rate": 1.7161774629839328e-05, + "loss": 0.0148, + "step": 921 + }, + { + "epoch": 1.7768674698795182, + "grad_norm": 0.16380974650382996, + "learning_rate": 1.7117387829984093e-05, + "loss": 0.0066, + "step": 922 + }, + { + "epoch": 1.7787951807228914, + "grad_norm": 0.22920913994312286, + "learning_rate": 1.707301552483813e-05, + "loss": 0.0105, + "step": 923 + }, + { + "epoch": 1.7807228915662652, + "grad_norm": 0.2075149267911911, + "learning_rate": 1.7028657937519767e-05, + "loss": 0.0104, + "step": 924 + }, + { + "epoch": 1.7826506024096385, + "grad_norm": 0.44439977407455444, + "learning_rate": 1.6984315291073355e-05, + "loss": 0.0134, + "step": 925 + }, + { + "epoch": 1.7845783132530122, + "grad_norm": 0.24068203568458557, + "learning_rate": 1.6939987808468125e-05, + "loss": 0.0078, + "step": 926 + }, + { + "epoch": 1.7865060240963855, + "grad_norm": 0.34044349193573, + "learning_rate": 1.689567571259701e-05, + "loss": 0.0108, + "step": 927 + }, + { + "epoch": 1.788433734939759, + "grad_norm": 0.34082743525505066, + "learning_rate": 1.6851379226275624e-05, + "loss": 0.0266, + "step": 928 + }, + { + "epoch": 1.7903614457831325, + "grad_norm": 0.19490115344524384, + "learning_rate": 1.6807098572241075e-05, + "loss": 0.0109, + "step": 929 + }, + { + "epoch": 1.792289156626506, + "grad_norm": 0.16208237409591675, + "learning_rate": 1.6762833973150846e-05, + "loss": 0.0113, + "step": 930 + }, + { + "epoch": 1.7942168674698795, + "grad_norm": 0.35555699467658997, + "learning_rate": 1.671858565158172e-05, + "loss": 0.0196, + "step": 931 + }, + { + "epoch": 1.796144578313253, + "grad_norm": 0.1600857824087143, + "learning_rate": 1.6674353830028587e-05, + "loss": 0.0089, + "step": 932 + }, + { + "epoch": 1.7980722891566265, + "grad_norm": 0.1699574887752533, + "learning_rate": 1.663013873090342e-05, + "loss": 0.0074, + "step": 933 + }, + { + "epoch": 1.8, + "grad_norm": 0.2472933828830719, + "learning_rate": 1.6585940576534086e-05, + "loss": 0.0063, + "step": 934 + }, + { + "epoch": 1.8019277108433736, + "grad_norm": 0.23491555452346802, + "learning_rate": 1.654175958916323e-05, + "loss": 0.0101, + "step": 935 + }, + { + "epoch": 1.8038554216867468, + "grad_norm": 0.28635191917419434, + "learning_rate": 1.6497595990947195e-05, + "loss": 0.0131, + "step": 936 + }, + { + "epoch": 1.8057831325301206, + "grad_norm": 0.15400712192058563, + "learning_rate": 1.645345000395489e-05, + "loss": 0.0068, + "step": 937 + }, + { + "epoch": 1.8077108433734939, + "grad_norm": 0.18223172426223755, + "learning_rate": 1.6409321850166647e-05, + "loss": 0.0094, + "step": 938 + }, + { + "epoch": 1.8096385542168676, + "grad_norm": 0.2789457142353058, + "learning_rate": 1.636521175147316e-05, + "loss": 0.0202, + "step": 939 + }, + { + "epoch": 1.8115662650602409, + "grad_norm": 0.4267627000808716, + "learning_rate": 1.6321119929674297e-05, + "loss": 0.0176, + "step": 940 + }, + { + "epoch": 1.8134939759036146, + "grad_norm": 0.3021615445613861, + "learning_rate": 1.6277046606478056e-05, + "loss": 0.0085, + "step": 941 + }, + { + "epoch": 1.815421686746988, + "grad_norm": 0.3724934756755829, + "learning_rate": 1.6232992003499405e-05, + "loss": 0.0474, + "step": 942 + }, + { + "epoch": 1.8173493975903614, + "grad_norm": 0.20904326438903809, + "learning_rate": 1.6188956342259177e-05, + "loss": 0.0078, + "step": 943 + }, + { + "epoch": 1.819277108433735, + "grad_norm": 0.31168171763420105, + "learning_rate": 1.614493984418297e-05, + "loss": 0.0174, + "step": 944 + }, + { + "epoch": 1.8212048192771084, + "grad_norm": 0.21273556351661682, + "learning_rate": 1.6100942730600003e-05, + "loss": 0.0054, + "step": 945 + }, + { + "epoch": 1.823132530120482, + "grad_norm": 0.16991695761680603, + "learning_rate": 1.6056965222742055e-05, + "loss": 0.0063, + "step": 946 + }, + { + "epoch": 1.8250602409638554, + "grad_norm": 0.22762684524059296, + "learning_rate": 1.6013007541742303e-05, + "loss": 0.0234, + "step": 947 + }, + { + "epoch": 1.826987951807229, + "grad_norm": 0.20128795504570007, + "learning_rate": 1.596906990863422e-05, + "loss": 0.0095, + "step": 948 + }, + { + "epoch": 1.8289156626506025, + "grad_norm": 0.30772027373313904, + "learning_rate": 1.592515254435048e-05, + "loss": 0.0356, + "step": 949 + }, + { + "epoch": 1.830843373493976, + "grad_norm": 0.12954631447792053, + "learning_rate": 1.5881255669721857e-05, + "loss": 0.008, + "step": 950 + }, + { + "epoch": 1.8327710843373493, + "grad_norm": 0.7787145972251892, + "learning_rate": 1.5837379505476054e-05, + "loss": 0.0108, + "step": 951 + }, + { + "epoch": 1.834698795180723, + "grad_norm": 0.1683879941701889, + "learning_rate": 1.5793524272236683e-05, + "loss": 0.006, + "step": 952 + }, + { + "epoch": 1.8366265060240963, + "grad_norm": 0.16475361585617065, + "learning_rate": 1.5749690190522076e-05, + "loss": 0.0065, + "step": 953 + }, + { + "epoch": 1.83855421686747, + "grad_norm": 0.211905375123024, + "learning_rate": 1.5705877480744214e-05, + "loss": 0.0092, + "step": 954 + }, + { + "epoch": 1.8404819277108433, + "grad_norm": 0.23850117623806, + "learning_rate": 1.5662086363207628e-05, + "loss": 0.012, + "step": 955 + }, + { + "epoch": 1.842409638554217, + "grad_norm": 0.19100065529346466, + "learning_rate": 1.561831705810825e-05, + "loss": 0.0113, + "step": 956 + }, + { + "epoch": 1.8443373493975903, + "grad_norm": 0.3635985255241394, + "learning_rate": 1.557456978553236e-05, + "loss": 0.0168, + "step": 957 + }, + { + "epoch": 1.8462650602409638, + "grad_norm": 0.16449116170406342, + "learning_rate": 1.553084476545544e-05, + "loss": 0.0042, + "step": 958 + }, + { + "epoch": 1.8481927710843373, + "grad_norm": 0.566093385219574, + "learning_rate": 1.5487142217741062e-05, + "loss": 0.0145, + "step": 959 + }, + { + "epoch": 1.8501204819277108, + "grad_norm": 0.15960252285003662, + "learning_rate": 1.5443462362139834e-05, + "loss": 0.0059, + "step": 960 + }, + { + "epoch": 1.8520481927710843, + "grad_norm": 0.40773797035217285, + "learning_rate": 1.539980541828823e-05, + "loss": 0.0257, + "step": 961 + }, + { + "epoch": 1.8539759036144579, + "grad_norm": 0.4802496135234833, + "learning_rate": 1.5356171605707522e-05, + "loss": 0.0111, + "step": 962 + }, + { + "epoch": 1.8559036144578314, + "grad_norm": 0.15745794773101807, + "learning_rate": 1.5312561143802704e-05, + "loss": 0.0049, + "step": 963 + }, + { + "epoch": 1.8578313253012049, + "grad_norm": 0.15139251947402954, + "learning_rate": 1.5268974251861298e-05, + "loss": 0.0077, + "step": 964 + }, + { + "epoch": 1.8597590361445784, + "grad_norm": 0.2188841849565506, + "learning_rate": 1.5225411149052356e-05, + "loss": 0.017, + "step": 965 + }, + { + "epoch": 1.8616867469879517, + "grad_norm": 0.10853131115436554, + "learning_rate": 1.5181872054425287e-05, + "loss": 0.0049, + "step": 966 + }, + { + "epoch": 1.8636144578313254, + "grad_norm": 0.8254880905151367, + "learning_rate": 1.5138357186908785e-05, + "loss": 0.0317, + "step": 967 + }, + { + "epoch": 1.8655421686746987, + "grad_norm": 0.2989620566368103, + "learning_rate": 1.5094866765309728e-05, + "loss": 0.0126, + "step": 968 + }, + { + "epoch": 1.8674698795180724, + "grad_norm": 0.16411150991916656, + "learning_rate": 1.5051401008312054e-05, + "loss": 0.0101, + "step": 969 + }, + { + "epoch": 1.8693975903614457, + "grad_norm": 0.2861763834953308, + "learning_rate": 1.5007960134475706e-05, + "loss": 0.0155, + "step": 970 + }, + { + "epoch": 1.8713253012048194, + "grad_norm": 0.24879588186740875, + "learning_rate": 1.4964544362235487e-05, + "loss": 0.0187, + "step": 971 + }, + { + "epoch": 1.8732530120481927, + "grad_norm": 0.2433672398328781, + "learning_rate": 1.4921153909899983e-05, + "loss": 0.0084, + "step": 972 + }, + { + "epoch": 1.8751807228915662, + "grad_norm": 0.15097154676914215, + "learning_rate": 1.487778899565047e-05, + "loss": 0.007, + "step": 973 + }, + { + "epoch": 1.8771084337349397, + "grad_norm": 0.1629047691822052, + "learning_rate": 1.4834449837539806e-05, + "loss": 0.0058, + "step": 974 + }, + { + "epoch": 1.8790361445783132, + "grad_norm": 0.9937071204185486, + "learning_rate": 1.4791136653491333e-05, + "loss": 0.0323, + "step": 975 + }, + { + "epoch": 1.8809638554216868, + "grad_norm": 0.19555562734603882, + "learning_rate": 1.4747849661297808e-05, + "loss": 0.0126, + "step": 976 + }, + { + "epoch": 1.8828915662650603, + "grad_norm": 0.16147711873054504, + "learning_rate": 1.470458907862026e-05, + "loss": 0.0067, + "step": 977 + }, + { + "epoch": 1.8848192771084338, + "grad_norm": 0.2730027735233307, + "learning_rate": 1.4661355122986945e-05, + "loss": 0.0147, + "step": 978 + }, + { + "epoch": 1.886746987951807, + "grad_norm": 0.13759832084178925, + "learning_rate": 1.4618148011792206e-05, + "loss": 0.0038, + "step": 979 + }, + { + "epoch": 1.8886746987951808, + "grad_norm": 0.33516690135002136, + "learning_rate": 1.4574967962295419e-05, + "loss": 0.0139, + "step": 980 + }, + { + "epoch": 1.890602409638554, + "grad_norm": 0.2345741093158722, + "learning_rate": 1.4531815191619903e-05, + "loss": 0.0094, + "step": 981 + }, + { + "epoch": 1.8925301204819278, + "grad_norm": 0.14681044220924377, + "learning_rate": 1.4488689916751762e-05, + "loss": 0.0065, + "step": 982 + }, + { + "epoch": 1.894457831325301, + "grad_norm": 0.21143914759159088, + "learning_rate": 1.4445592354538885e-05, + "loss": 0.0057, + "step": 983 + }, + { + "epoch": 1.8963855421686748, + "grad_norm": 0.3109160363674164, + "learning_rate": 1.44025227216898e-05, + "loss": 0.0142, + "step": 984 + }, + { + "epoch": 1.8983132530120481, + "grad_norm": 0.24301907420158386, + "learning_rate": 1.435948123477259e-05, + "loss": 0.012, + "step": 985 + }, + { + "epoch": 1.9002409638554218, + "grad_norm": 0.19817675650119781, + "learning_rate": 1.431646811021382e-05, + "loss": 0.0097, + "step": 986 + }, + { + "epoch": 1.9021686746987951, + "grad_norm": 0.13464932143688202, + "learning_rate": 1.4273483564297425e-05, + "loss": 0.0046, + "step": 987 + }, + { + "epoch": 1.9040963855421686, + "grad_norm": 0.1698642522096634, + "learning_rate": 1.4230527813163656e-05, + "loss": 0.0038, + "step": 988 + }, + { + "epoch": 1.9060240963855422, + "grad_norm": 0.19395388662815094, + "learning_rate": 1.4187601072807975e-05, + "loss": 0.0123, + "step": 989 + }, + { + "epoch": 1.9079518072289157, + "grad_norm": 0.2093188613653183, + "learning_rate": 1.4144703559079948e-05, + "loss": 0.0093, + "step": 990 + }, + { + "epoch": 1.9098795180722892, + "grad_norm": 0.1529311090707779, + "learning_rate": 1.4101835487682198e-05, + "loss": 0.0051, + "step": 991 + }, + { + "epoch": 1.9118072289156627, + "grad_norm": 0.18725350499153137, + "learning_rate": 1.4058997074169299e-05, + "loss": 0.0083, + "step": 992 + }, + { + "epoch": 1.9137349397590362, + "grad_norm": 0.15601560473442078, + "learning_rate": 1.401618853394668e-05, + "loss": 0.0086, + "step": 993 + }, + { + "epoch": 1.9156626506024095, + "grad_norm": 0.23890644311904907, + "learning_rate": 1.3973410082269591e-05, + "loss": 0.015, + "step": 994 + }, + { + "epoch": 1.9175903614457832, + "grad_norm": 0.2442619949579239, + "learning_rate": 1.3930661934241947e-05, + "loss": 0.0089, + "step": 995 + }, + { + "epoch": 1.9195180722891565, + "grad_norm": 0.1540212482213974, + "learning_rate": 1.388794430481532e-05, + "loss": 0.0072, + "step": 996 + }, + { + "epoch": 1.9214457831325302, + "grad_norm": 0.1359291970729828, + "learning_rate": 1.3845257408787807e-05, + "loss": 0.0131, + "step": 997 + }, + { + "epoch": 1.9233734939759035, + "grad_norm": 0.25486138463020325, + "learning_rate": 1.3802601460802967e-05, + "loss": 0.0198, + "step": 998 + }, + { + "epoch": 1.9253012048192772, + "grad_norm": 0.28815609216690063, + "learning_rate": 1.3759976675348754e-05, + "loss": 0.014, + "step": 999 + }, + { + "epoch": 1.9272289156626505, + "grad_norm": 0.15648497641086578, + "learning_rate": 1.3717383266756403e-05, + "loss": 0.0065, + "step": 1000 + }, + { + "epoch": 1.929156626506024, + "grad_norm": 0.16912540793418884, + "learning_rate": 1.367482144919941e-05, + "loss": 0.0059, + "step": 1001 + }, + { + "epoch": 1.9310843373493976, + "grad_norm": 0.16896723210811615, + "learning_rate": 1.3632291436692397e-05, + "loss": 0.0054, + "step": 1002 + }, + { + "epoch": 1.933012048192771, + "grad_norm": 0.20287497341632843, + "learning_rate": 1.3589793443090064e-05, + "loss": 0.0097, + "step": 1003 + }, + { + "epoch": 1.9349397590361446, + "grad_norm": 0.14804276823997498, + "learning_rate": 1.3547327682086114e-05, + "loss": 0.0125, + "step": 1004 + }, + { + "epoch": 1.936867469879518, + "grad_norm": 0.23820064961910248, + "learning_rate": 1.3504894367212171e-05, + "loss": 0.0131, + "step": 1005 + }, + { + "epoch": 1.9387951807228916, + "grad_norm": 0.25607362389564514, + "learning_rate": 1.34624937118367e-05, + "loss": 0.0115, + "step": 1006 + }, + { + "epoch": 1.940722891566265, + "grad_norm": 0.37233737111091614, + "learning_rate": 1.3420125929163976e-05, + "loss": 0.0309, + "step": 1007 + }, + { + "epoch": 1.9426506024096386, + "grad_norm": 0.19426730275154114, + "learning_rate": 1.3377791232232929e-05, + "loss": 0.0078, + "step": 1008 + }, + { + "epoch": 1.944578313253012, + "grad_norm": 0.2784160077571869, + "learning_rate": 1.333548983391617e-05, + "loss": 0.0142, + "step": 1009 + }, + { + "epoch": 1.9465060240963856, + "grad_norm": 0.11407195776700974, + "learning_rate": 1.3293221946918853e-05, + "loss": 0.0035, + "step": 1010 + }, + { + "epoch": 1.948433734939759, + "grad_norm": 0.3965436816215515, + "learning_rate": 1.325098778377762e-05, + "loss": 0.0242, + "step": 1011 + }, + { + "epoch": 1.9503614457831326, + "grad_norm": 0.18520519137382507, + "learning_rate": 1.3208787556859543e-05, + "loss": 0.0096, + "step": 1012 + }, + { + "epoch": 1.952289156626506, + "grad_norm": 0.2783315181732178, + "learning_rate": 1.3166621478361075e-05, + "loss": 0.0103, + "step": 1013 + }, + { + "epoch": 1.9542168674698797, + "grad_norm": 0.22714459896087646, + "learning_rate": 1.3124489760306917e-05, + "loss": 0.0078, + "step": 1014 + }, + { + "epoch": 1.956144578313253, + "grad_norm": 0.1257915049791336, + "learning_rate": 1.3082392614549036e-05, + "loss": 0.0077, + "step": 1015 + }, + { + "epoch": 1.9580722891566265, + "grad_norm": 0.15592887997627258, + "learning_rate": 1.3040330252765526e-05, + "loss": 0.0106, + "step": 1016 + }, + { + "epoch": 1.96, + "grad_norm": 0.19295449554920197, + "learning_rate": 1.2998302886459586e-05, + "loss": 0.0082, + "step": 1017 + }, + { + "epoch": 1.9619277108433735, + "grad_norm": 0.15544794499874115, + "learning_rate": 1.2956310726958472e-05, + "loss": 0.0068, + "step": 1018 + }, + { + "epoch": 1.963855421686747, + "grad_norm": 0.25899502635002136, + "learning_rate": 1.291435398541236e-05, + "loss": 0.0086, + "step": 1019 + }, + { + "epoch": 1.9657831325301205, + "grad_norm": 0.34639033675193787, + "learning_rate": 1.2872432872793379e-05, + "loss": 0.0116, + "step": 1020 + }, + { + "epoch": 1.967710843373494, + "grad_norm": 0.1628410518169403, + "learning_rate": 1.283054759989447e-05, + "loss": 0.0055, + "step": 1021 + }, + { + "epoch": 1.9696385542168675, + "grad_norm": 0.9273788928985596, + "learning_rate": 1.2788698377328385e-05, + "loss": 0.0264, + "step": 1022 + }, + { + "epoch": 1.971566265060241, + "grad_norm": 0.163126140832901, + "learning_rate": 1.2746885415526594e-05, + "loss": 0.0046, + "step": 1023 + }, + { + "epoch": 1.9734939759036143, + "grad_norm": 0.1475439816713333, + "learning_rate": 1.2705108924738223e-05, + "loss": 0.0056, + "step": 1024 + }, + { + "epoch": 1.975421686746988, + "grad_norm": 0.1654318869113922, + "learning_rate": 1.2663369115029034e-05, + "loss": 0.0056, + "step": 1025 + }, + { + "epoch": 1.9773493975903613, + "grad_norm": 0.20536045730113983, + "learning_rate": 1.2621666196280333e-05, + "loss": 0.0101, + "step": 1026 + }, + { + "epoch": 1.979277108433735, + "grad_norm": 0.19256474077701569, + "learning_rate": 1.258000037818792e-05, + "loss": 0.0059, + "step": 1027 + }, + { + "epoch": 1.9812048192771083, + "grad_norm": 0.2605120539665222, + "learning_rate": 1.2538371870261053e-05, + "loss": 0.0115, + "step": 1028 + }, + { + "epoch": 1.983132530120482, + "grad_norm": 0.14840295910835266, + "learning_rate": 1.249678088182137e-05, + "loss": 0.0046, + "step": 1029 + }, + { + "epoch": 1.9850602409638554, + "grad_norm": 0.17585207521915436, + "learning_rate": 1.2455227622001851e-05, + "loss": 0.0086, + "step": 1030 + }, + { + "epoch": 1.9869879518072289, + "grad_norm": 0.11044781655073166, + "learning_rate": 1.241371229974579e-05, + "loss": 0.0034, + "step": 1031 + }, + { + "epoch": 1.9889156626506024, + "grad_norm": 0.25584840774536133, + "learning_rate": 1.2372235123805672e-05, + "loss": 0.0245, + "step": 1032 + }, + { + "epoch": 1.9908433734939759, + "grad_norm": 0.25962474942207336, + "learning_rate": 1.2330796302742211e-05, + "loss": 0.0104, + "step": 1033 + }, + { + "epoch": 1.9927710843373494, + "grad_norm": 0.33408522605895996, + "learning_rate": 1.2289396044923238e-05, + "loss": 0.0176, + "step": 1034 + }, + { + "epoch": 1.994698795180723, + "grad_norm": 0.479950487613678, + "learning_rate": 1.2248034558522682e-05, + "loss": 0.0113, + "step": 1035 + }, + { + "epoch": 1.9966265060240964, + "grad_norm": 0.16567294299602509, + "learning_rate": 1.2206712051519518e-05, + "loss": 0.0036, + "step": 1036 + }, + { + "epoch": 1.99855421686747, + "grad_norm": 0.19343771040439606, + "learning_rate": 1.2165428731696713e-05, + "loss": 0.0077, + "step": 1037 + }, + { + "epoch": 2.0, + "grad_norm": 0.22895601391792297, + "learning_rate": 1.2124184806640202e-05, + "loss": 0.0114, + "step": 1038 + }, + { + "epoch": 2.0019277108433733, + "grad_norm": 0.15838384628295898, + "learning_rate": 1.208298048373782e-05, + "loss": 0.0043, + "step": 1039 + }, + { + "epoch": 2.003855421686747, + "grad_norm": 0.681065559387207, + "learning_rate": 1.2041815970178268e-05, + "loss": 0.0214, + "step": 1040 + }, + { + "epoch": 2.0057831325301203, + "grad_norm": 0.3357350528240204, + "learning_rate": 1.2000691472950081e-05, + "loss": 0.0079, + "step": 1041 + }, + { + "epoch": 2.007710843373494, + "grad_norm": 0.15238308906555176, + "learning_rate": 1.1959607198840568e-05, + "loss": 0.0041, + "step": 1042 + }, + { + "epoch": 2.0096385542168673, + "grad_norm": 0.11763229966163635, + "learning_rate": 1.1918563354434784e-05, + "loss": 0.0033, + "step": 1043 + }, + { + "epoch": 2.011566265060241, + "grad_norm": 0.3759301006793976, + "learning_rate": 1.1877560146114515e-05, + "loss": 0.0128, + "step": 1044 + }, + { + "epoch": 2.0134939759036143, + "grad_norm": 0.1143188625574112, + "learning_rate": 1.1836597780057183e-05, + "loss": 0.0078, + "step": 1045 + }, + { + "epoch": 2.015421686746988, + "grad_norm": 0.20059260725975037, + "learning_rate": 1.179567646223485e-05, + "loss": 0.0149, + "step": 1046 + }, + { + "epoch": 2.0173493975903614, + "grad_norm": 0.15569567680358887, + "learning_rate": 1.1754796398413196e-05, + "loss": 0.0038, + "step": 1047 + }, + { + "epoch": 2.019277108433735, + "grad_norm": 0.1153278723359108, + "learning_rate": 1.1713957794150423e-05, + "loss": 0.0041, + "step": 1048 + }, + { + "epoch": 2.0212048192771084, + "grad_norm": 0.1838717758655548, + "learning_rate": 1.1673160854796307e-05, + "loss": 0.0041, + "step": 1049 + }, + { + "epoch": 2.023132530120482, + "grad_norm": 0.12264502793550491, + "learning_rate": 1.1632405785491077e-05, + "loss": 0.0043, + "step": 1050 + }, + { + "epoch": 2.0250602409638554, + "grad_norm": 0.14363229274749756, + "learning_rate": 1.159169279116445e-05, + "loss": 0.0066, + "step": 1051 + }, + { + "epoch": 2.026987951807229, + "grad_norm": 0.1316995471715927, + "learning_rate": 1.1551022076534585e-05, + "loss": 0.0024, + "step": 1052 + }, + { + "epoch": 2.0289156626506024, + "grad_norm": 0.13392619788646698, + "learning_rate": 1.1510393846107001e-05, + "loss": 0.0051, + "step": 1053 + }, + { + "epoch": 2.0308433734939757, + "grad_norm": 3.0086817741394043, + "learning_rate": 1.1469808304173658e-05, + "loss": 0.0334, + "step": 1054 + }, + { + "epoch": 2.0327710843373494, + "grad_norm": 0.17756076157093048, + "learning_rate": 1.1429265654811803e-05, + "loss": 0.0068, + "step": 1055 + }, + { + "epoch": 2.0346987951807227, + "grad_norm": 0.13250532746315002, + "learning_rate": 1.1388766101883038e-05, + "loss": 0.0087, + "step": 1056 + }, + { + "epoch": 2.0366265060240965, + "grad_norm": 0.3534089922904968, + "learning_rate": 1.1348309849032257e-05, + "loss": 0.0076, + "step": 1057 + }, + { + "epoch": 2.0385542168674697, + "grad_norm": 0.11939049512147903, + "learning_rate": 1.1307897099686627e-05, + "loss": 0.0029, + "step": 1058 + }, + { + "epoch": 2.0404819277108435, + "grad_norm": 0.11862517893314362, + "learning_rate": 1.1267528057054562e-05, + "loss": 0.0062, + "step": 1059 + }, + { + "epoch": 2.0424096385542168, + "grad_norm": 0.1539212018251419, + "learning_rate": 1.1227202924124704e-05, + "loss": 0.0067, + "step": 1060 + }, + { + "epoch": 2.0443373493975905, + "grad_norm": 0.17163440585136414, + "learning_rate": 1.118692190366491e-05, + "loss": 0.0055, + "step": 1061 + }, + { + "epoch": 2.0462650602409638, + "grad_norm": 0.12304897606372833, + "learning_rate": 1.1146685198221222e-05, + "loss": 0.0036, + "step": 1062 + }, + { + "epoch": 2.0481927710843375, + "grad_norm": 0.17319051921367645, + "learning_rate": 1.1106493010116842e-05, + "loss": 0.0058, + "step": 1063 + }, + { + "epoch": 2.050120481927711, + "grad_norm": 0.2242443859577179, + "learning_rate": 1.1066345541451127e-05, + "loss": 0.0059, + "step": 1064 + }, + { + "epoch": 2.0520481927710845, + "grad_norm": 0.09533938020467758, + "learning_rate": 1.1026242994098597e-05, + "loss": 0.0033, + "step": 1065 + }, + { + "epoch": 2.053975903614458, + "grad_norm": 0.11697929352521896, + "learning_rate": 1.0986185569707852e-05, + "loss": 0.0038, + "step": 1066 + }, + { + "epoch": 2.0559036144578315, + "grad_norm": 0.2563149333000183, + "learning_rate": 1.0946173469700625e-05, + "loss": 0.0158, + "step": 1067 + }, + { + "epoch": 2.057831325301205, + "grad_norm": 0.21836932003498077, + "learning_rate": 1.0906206895270739e-05, + "loss": 0.0085, + "step": 1068 + }, + { + "epoch": 2.059759036144578, + "grad_norm": 0.1798071414232254, + "learning_rate": 1.0866286047383094e-05, + "loss": 0.0053, + "step": 1069 + }, + { + "epoch": 2.061686746987952, + "grad_norm": 0.08937730640172958, + "learning_rate": 1.0826411126772675e-05, + "loss": 0.0025, + "step": 1070 + }, + { + "epoch": 2.063614457831325, + "grad_norm": 0.0942138060927391, + "learning_rate": 1.0786582333943499e-05, + "loss": 0.0017, + "step": 1071 + }, + { + "epoch": 2.065542168674699, + "grad_norm": 0.13076582551002502, + "learning_rate": 1.0746799869167679e-05, + "loss": 0.0033, + "step": 1072 + }, + { + "epoch": 2.067469879518072, + "grad_norm": 0.0993233174085617, + "learning_rate": 1.0707063932484357e-05, + "loss": 0.0046, + "step": 1073 + }, + { + "epoch": 2.069397590361446, + "grad_norm": 0.3046741485595703, + "learning_rate": 1.0667374723698698e-05, + "loss": 0.009, + "step": 1074 + }, + { + "epoch": 2.071325301204819, + "grad_norm": 0.12197669595479965, + "learning_rate": 1.0627732442380932e-05, + "loss": 0.0034, + "step": 1075 + }, + { + "epoch": 2.073253012048193, + "grad_norm": 0.12721140682697296, + "learning_rate": 1.058813728786531e-05, + "loss": 0.0048, + "step": 1076 + }, + { + "epoch": 2.075180722891566, + "grad_norm": 0.10011966526508331, + "learning_rate": 1.0548589459249112e-05, + "loss": 0.0026, + "step": 1077 + }, + { + "epoch": 2.07710843373494, + "grad_norm": 0.3314201831817627, + "learning_rate": 1.0509089155391661e-05, + "loss": 0.0284, + "step": 1078 + }, + { + "epoch": 2.079036144578313, + "grad_norm": 0.32739701867103577, + "learning_rate": 1.0469636574913288e-05, + "loss": 0.0088, + "step": 1079 + }, + { + "epoch": 2.080963855421687, + "grad_norm": 0.13805675506591797, + "learning_rate": 1.043023191619438e-05, + "loss": 0.0042, + "step": 1080 + }, + { + "epoch": 2.0828915662650602, + "grad_norm": 0.14789745211601257, + "learning_rate": 1.039087537737435e-05, + "loss": 0.0037, + "step": 1081 + }, + { + "epoch": 2.0848192771084335, + "grad_norm": 0.15518991649150848, + "learning_rate": 1.0351567156350617e-05, + "loss": 0.0044, + "step": 1082 + }, + { + "epoch": 2.0867469879518072, + "grad_norm": 0.08380113542079926, + "learning_rate": 1.0312307450777706e-05, + "loss": 0.0019, + "step": 1083 + }, + { + "epoch": 2.0886746987951805, + "grad_norm": 0.17892400920391083, + "learning_rate": 1.027309645806613e-05, + "loss": 0.0065, + "step": 1084 + }, + { + "epoch": 2.0906024096385543, + "grad_norm": 0.5497608780860901, + "learning_rate": 1.0233934375381489e-05, + "loss": 0.0238, + "step": 1085 + }, + { + "epoch": 2.0925301204819275, + "grad_norm": 1.0189186334609985, + "learning_rate": 1.019482139964344e-05, + "loss": 0.0092, + "step": 1086 + }, + { + "epoch": 2.0944578313253013, + "grad_norm": 0.12144117057323456, + "learning_rate": 1.015575772752472e-05, + "loss": 0.0038, + "step": 1087 + }, + { + "epoch": 2.0963855421686746, + "grad_norm": 0.1115315854549408, + "learning_rate": 1.0116743555450148e-05, + "loss": 0.0024, + "step": 1088 + }, + { + "epoch": 2.0983132530120483, + "grad_norm": 0.22671759128570557, + "learning_rate": 1.0077779079595631e-05, + "loss": 0.0136, + "step": 1089 + }, + { + "epoch": 2.1002409638554216, + "grad_norm": 2.0009827613830566, + "learning_rate": 1.003886449588719e-05, + "loss": 0.0493, + "step": 1090 + }, + { + "epoch": 2.1021686746987953, + "grad_norm": 0.11907301843166351, + "learning_rate": 1.0000000000000006e-05, + "loss": 0.0034, + "step": 1091 + }, + { + "epoch": 2.1040963855421686, + "grad_norm": 0.31257638335227966, + "learning_rate": 9.961185787357346e-06, + "loss": 0.0129, + "step": 1092 + }, + { + "epoch": 2.1060240963855423, + "grad_norm": 0.11033743619918823, + "learning_rate": 9.922422053129674e-06, + "loss": 0.0184, + "step": 1093 + }, + { + "epoch": 2.1079518072289156, + "grad_norm": 0.2575698494911194, + "learning_rate": 9.883708992233626e-06, + "loss": 0.0054, + "step": 1094 + }, + { + "epoch": 2.1098795180722894, + "grad_norm": 0.12921132147312164, + "learning_rate": 9.845046799331029e-06, + "loss": 0.0037, + "step": 1095 + }, + { + "epoch": 2.1118072289156626, + "grad_norm": 0.21405921876430511, + "learning_rate": 9.806435668827941e-06, + "loss": 0.006, + "step": 1096 + }, + { + "epoch": 2.113734939759036, + "grad_norm": 0.12929430603981018, + "learning_rate": 9.76787579487363e-06, + "loss": 0.0049, + "step": 1097 + }, + { + "epoch": 2.1156626506024097, + "grad_norm": 0.1793181151151657, + "learning_rate": 9.729367371359681e-06, + "loss": 0.0086, + "step": 1098 + }, + { + "epoch": 2.117590361445783, + "grad_norm": 0.2182074338197708, + "learning_rate": 9.690910591918936e-06, + "loss": 0.0106, + "step": 1099 + }, + { + "epoch": 2.1195180722891567, + "grad_norm": 0.0705680400133133, + "learning_rate": 9.652505649924547e-06, + "loss": 0.0012, + "step": 1100 + }, + { + "epoch": 2.12144578313253, + "grad_norm": 0.10509738326072693, + "learning_rate": 9.614152738489021e-06, + "loss": 0.0048, + "step": 1101 + }, + { + "epoch": 2.1233734939759037, + "grad_norm": 0.13775436580181122, + "learning_rate": 9.575852050463268e-06, + "loss": 0.0089, + "step": 1102 + }, + { + "epoch": 2.125301204819277, + "grad_norm": 0.15230101346969604, + "learning_rate": 9.537603778435545e-06, + "loss": 0.0065, + "step": 1103 + }, + { + "epoch": 2.1272289156626507, + "grad_norm": 0.24702346324920654, + "learning_rate": 9.499408114730583e-06, + "loss": 0.016, + "step": 1104 + }, + { + "epoch": 2.129156626506024, + "grad_norm": 0.1082577034831047, + "learning_rate": 9.461265251408575e-06, + "loss": 0.0036, + "step": 1105 + }, + { + "epoch": 2.1310843373493977, + "grad_norm": 0.1063847690820694, + "learning_rate": 9.423175380264211e-06, + "loss": 0.0037, + "step": 1106 + }, + { + "epoch": 2.133012048192771, + "grad_norm": 0.07686953246593475, + "learning_rate": 9.385138692825729e-06, + "loss": 0.0031, + "step": 1107 + }, + { + "epoch": 2.1349397590361447, + "grad_norm": 0.2046380341053009, + "learning_rate": 9.347155380353912e-06, + "loss": 0.0087, + "step": 1108 + }, + { + "epoch": 2.136867469879518, + "grad_norm": 0.1341692954301834, + "learning_rate": 9.30922563384121e-06, + "loss": 0.0045, + "step": 1109 + }, + { + "epoch": 2.1387951807228918, + "grad_norm": 0.09870535880327225, + "learning_rate": 9.271349644010672e-06, + "loss": 0.003, + "step": 1110 + }, + { + "epoch": 2.140722891566265, + "grad_norm": 0.18708615005016327, + "learning_rate": 9.233527601315069e-06, + "loss": 0.0042, + "step": 1111 + }, + { + "epoch": 2.1426506024096383, + "grad_norm": 0.5175634026527405, + "learning_rate": 9.195759695935907e-06, + "loss": 0.0173, + "step": 1112 + }, + { + "epoch": 2.144578313253012, + "grad_norm": 0.14939036965370178, + "learning_rate": 9.158046117782464e-06, + "loss": 0.0031, + "step": 1113 + }, + { + "epoch": 2.1465060240963854, + "grad_norm": 0.2837410569190979, + "learning_rate": 9.120387056490851e-06, + "loss": 0.0097, + "step": 1114 + }, + { + "epoch": 2.148433734939759, + "grad_norm": 0.11088677495718002, + "learning_rate": 9.082782701423047e-06, + "loss": 0.0026, + "step": 1115 + }, + { + "epoch": 2.1503614457831324, + "grad_norm": 0.07785166054964066, + "learning_rate": 9.045233241665947e-06, + "loss": 0.0019, + "step": 1116 + }, + { + "epoch": 2.152289156626506, + "grad_norm": 0.17568141222000122, + "learning_rate": 9.007738866030427e-06, + "loss": 0.0039, + "step": 1117 + }, + { + "epoch": 2.1542168674698794, + "grad_norm": 0.12652266025543213, + "learning_rate": 8.970299763050356e-06, + "loss": 0.0033, + "step": 1118 + }, + { + "epoch": 2.156144578313253, + "grad_norm": 0.16801467537879944, + "learning_rate": 8.932916120981695e-06, + "loss": 0.0076, + "step": 1119 + }, + { + "epoch": 2.1580722891566264, + "grad_norm": 0.18313169479370117, + "learning_rate": 8.895588127801545e-06, + "loss": 0.0052, + "step": 1120 + }, + { + "epoch": 2.16, + "grad_norm": 0.07546049356460571, + "learning_rate": 8.858315971207146e-06, + "loss": 0.0022, + "step": 1121 + }, + { + "epoch": 2.1619277108433734, + "grad_norm": 0.4039839208126068, + "learning_rate": 8.821099838614996e-06, + "loss": 0.0203, + "step": 1122 + }, + { + "epoch": 2.163855421686747, + "grad_norm": 0.09244243055582047, + "learning_rate": 8.783939917159897e-06, + "loss": 0.002, + "step": 1123 + }, + { + "epoch": 2.1657831325301204, + "grad_norm": 0.18327835202217102, + "learning_rate": 8.746836393693978e-06, + "loss": 0.0055, + "step": 1124 + }, + { + "epoch": 2.167710843373494, + "grad_norm": 0.22010307013988495, + "learning_rate": 8.709789454785809e-06, + "loss": 0.0077, + "step": 1125 + }, + { + "epoch": 2.1696385542168675, + "grad_norm": 0.09438297897577286, + "learning_rate": 8.67279928671939e-06, + "loss": 0.0032, + "step": 1126 + }, + { + "epoch": 2.1715662650602408, + "grad_norm": 0.20782770216464996, + "learning_rate": 8.635866075493318e-06, + "loss": 0.0028, + "step": 1127 + }, + { + "epoch": 2.1734939759036145, + "grad_norm": 0.1958685964345932, + "learning_rate": 8.598990006819756e-06, + "loss": 0.0047, + "step": 1128 + }, + { + "epoch": 2.1754216867469878, + "grad_norm": 0.06459935009479523, + "learning_rate": 8.562171266123528e-06, + "loss": 0.0015, + "step": 1129 + }, + { + "epoch": 2.1773493975903615, + "grad_norm": 0.33486708998680115, + "learning_rate": 8.525410038541218e-06, + "loss": 0.0094, + "step": 1130 + }, + { + "epoch": 2.179277108433735, + "grad_norm": 0.5755940079689026, + "learning_rate": 8.488706508920202e-06, + "loss": 0.0067, + "step": 1131 + }, + { + "epoch": 2.1812048192771085, + "grad_norm": 0.10840924829244614, + "learning_rate": 8.452060861817738e-06, + "loss": 0.0082, + "step": 1132 + }, + { + "epoch": 2.183132530120482, + "grad_norm": 0.18611350655555725, + "learning_rate": 8.415473281500037e-06, + "loss": 0.0059, + "step": 1133 + }, + { + "epoch": 2.1850602409638555, + "grad_norm": 0.11245249956846237, + "learning_rate": 8.378943951941301e-06, + "loss": 0.0107, + "step": 1134 + }, + { + "epoch": 2.186987951807229, + "grad_norm": 0.12284426391124725, + "learning_rate": 8.342473056822873e-06, + "loss": 0.0025, + "step": 1135 + }, + { + "epoch": 2.1889156626506026, + "grad_norm": 0.12542888522148132, + "learning_rate": 8.306060779532245e-06, + "loss": 0.0059, + "step": 1136 + }, + { + "epoch": 2.190843373493976, + "grad_norm": 0.1287655532360077, + "learning_rate": 8.26970730316215e-06, + "loss": 0.0022, + "step": 1137 + }, + { + "epoch": 2.1927710843373496, + "grad_norm": 0.1818632185459137, + "learning_rate": 8.233412810509669e-06, + "loss": 0.0131, + "step": 1138 + }, + { + "epoch": 2.194698795180723, + "grad_norm": 0.09687745571136475, + "learning_rate": 8.197177484075284e-06, + "loss": 0.0025, + "step": 1139 + }, + { + "epoch": 2.1966265060240966, + "grad_norm": 0.16103452444076538, + "learning_rate": 8.161001506061979e-06, + "loss": 0.0031, + "step": 1140 + }, + { + "epoch": 2.19855421686747, + "grad_norm": 0.2711680233478546, + "learning_rate": 8.124885058374302e-06, + "loss": 0.0034, + "step": 1141 + }, + { + "epoch": 2.200481927710843, + "grad_norm": 0.17613105475902557, + "learning_rate": 8.088828322617473e-06, + "loss": 0.0044, + "step": 1142 + }, + { + "epoch": 2.202409638554217, + "grad_norm": 0.2298487424850464, + "learning_rate": 8.052831480096464e-06, + "loss": 0.0168, + "step": 1143 + }, + { + "epoch": 2.20433734939759, + "grad_norm": 0.17042206227779388, + "learning_rate": 8.016894711815067e-06, + "loss": 0.007, + "step": 1144 + }, + { + "epoch": 2.206265060240964, + "grad_norm": 0.2830466628074646, + "learning_rate": 7.98101819847501e-06, + "loss": 0.0091, + "step": 1145 + }, + { + "epoch": 2.208192771084337, + "grad_norm": 0.22089065611362457, + "learning_rate": 7.945202120475063e-06, + "loss": 0.0046, + "step": 1146 + }, + { + "epoch": 2.210120481927711, + "grad_norm": 0.1716073900461197, + "learning_rate": 7.909446657910072e-06, + "loss": 0.0032, + "step": 1147 + }, + { + "epoch": 2.212048192771084, + "grad_norm": 0.16140373051166534, + "learning_rate": 7.873751990570104e-06, + "loss": 0.0057, + "step": 1148 + }, + { + "epoch": 2.213975903614458, + "grad_norm": 0.1671605408191681, + "learning_rate": 7.838118297939529e-06, + "loss": 0.0039, + "step": 1149 + }, + { + "epoch": 2.2159036144578312, + "grad_norm": 0.10933005809783936, + "learning_rate": 7.802545759196117e-06, + "loss": 0.005, + "step": 1150 + }, + { + "epoch": 2.217831325301205, + "grad_norm": 0.07819998264312744, + "learning_rate": 7.76703455321014e-06, + "loss": 0.0025, + "step": 1151 + }, + { + "epoch": 2.2197590361445783, + "grad_norm": 0.36211854219436646, + "learning_rate": 7.73158485854344e-06, + "loss": 0.0151, + "step": 1152 + }, + { + "epoch": 2.221686746987952, + "grad_norm": 0.09098304808139801, + "learning_rate": 7.696196853448612e-06, + "loss": 0.0027, + "step": 1153 + }, + { + "epoch": 2.2236144578313253, + "grad_norm": 0.17442144453525543, + "learning_rate": 7.660870715868018e-06, + "loss": 0.006, + "step": 1154 + }, + { + "epoch": 2.225542168674699, + "grad_norm": 0.09785338491201401, + "learning_rate": 7.625606623432933e-06, + "loss": 0.0041, + "step": 1155 + }, + { + "epoch": 2.2274698795180723, + "grad_norm": 0.19399888813495636, + "learning_rate": 7.590404753462653e-06, + "loss": 0.0125, + "step": 1156 + }, + { + "epoch": 2.2293975903614456, + "grad_norm": 0.11080623418092728, + "learning_rate": 7.55526528296362e-06, + "loss": 0.0022, + "step": 1157 + }, + { + "epoch": 2.2313253012048193, + "grad_norm": 0.14067359268665314, + "learning_rate": 7.520188388628473e-06, + "loss": 0.0123, + "step": 1158 + }, + { + "epoch": 2.2332530120481926, + "grad_norm": 0.14533625543117523, + "learning_rate": 7.485174246835227e-06, + "loss": 0.0039, + "step": 1159 + }, + { + "epoch": 2.2351807228915663, + "grad_norm": 0.1253812462091446, + "learning_rate": 7.4502230336463466e-06, + "loss": 0.003, + "step": 1160 + }, + { + "epoch": 2.2371084337349396, + "grad_norm": 0.12766572833061218, + "learning_rate": 7.415334924807869e-06, + "loss": 0.0044, + "step": 1161 + }, + { + "epoch": 2.2390361445783133, + "grad_norm": 0.11985791474580765, + "learning_rate": 7.380510095748535e-06, + "loss": 0.0071, + "step": 1162 + }, + { + "epoch": 2.2409638554216866, + "grad_norm": 0.15505346655845642, + "learning_rate": 7.3457487215788605e-06, + "loss": 0.0046, + "step": 1163 + }, + { + "epoch": 2.2428915662650604, + "grad_norm": 0.18983210623264313, + "learning_rate": 7.311050977090343e-06, + "loss": 0.0079, + "step": 1164 + }, + { + "epoch": 2.2448192771084337, + "grad_norm": 0.19279207289218903, + "learning_rate": 7.276417036754479e-06, + "loss": 0.0042, + "step": 1165 + }, + { + "epoch": 2.2467469879518074, + "grad_norm": 0.21539707481861115, + "learning_rate": 7.241847074721964e-06, + "loss": 0.0087, + "step": 1166 + }, + { + "epoch": 2.2486746987951807, + "grad_norm": 0.07004354894161224, + "learning_rate": 7.207341264821783e-06, + "loss": 0.002, + "step": 1167 + }, + { + "epoch": 2.2506024096385544, + "grad_norm": 0.2203039526939392, + "learning_rate": 7.172899780560345e-06, + "loss": 0.0069, + "step": 1168 + }, + { + "epoch": 2.2525301204819277, + "grad_norm": 0.12474718689918518, + "learning_rate": 7.138522795120606e-06, + "loss": 0.0122, + "step": 1169 + }, + { + "epoch": 2.2544578313253014, + "grad_norm": 0.09078995883464813, + "learning_rate": 7.104210481361204e-06, + "loss": 0.0025, + "step": 1170 + }, + { + "epoch": 2.2563855421686747, + "grad_norm": 0.141757071018219, + "learning_rate": 7.069963011815584e-06, + "loss": 0.0039, + "step": 1171 + }, + { + "epoch": 2.258313253012048, + "grad_norm": 0.14944659173488617, + "learning_rate": 7.035780558691141e-06, + "loss": 0.0025, + "step": 1172 + }, + { + "epoch": 2.2602409638554217, + "grad_norm": 0.06723666191101074, + "learning_rate": 7.001663293868328e-06, + "loss": 0.0014, + "step": 1173 + }, + { + "epoch": 2.262168674698795, + "grad_norm": 0.11966485530138016, + "learning_rate": 6.967611388899826e-06, + "loss": 0.0067, + "step": 1174 + }, + { + "epoch": 2.2640963855421687, + "grad_norm": 0.08943185210227966, + "learning_rate": 6.933625015009666e-06, + "loss": 0.0036, + "step": 1175 + }, + { + "epoch": 2.266024096385542, + "grad_norm": 0.04511453956365585, + "learning_rate": 6.899704343092359e-06, + "loss": 0.0014, + "step": 1176 + }, + { + "epoch": 2.2679518072289158, + "grad_norm": 0.1867951601743698, + "learning_rate": 6.865849543712058e-06, + "loss": 0.009, + "step": 1177 + }, + { + "epoch": 2.269879518072289, + "grad_norm": 0.23791250586509705, + "learning_rate": 6.832060787101658e-06, + "loss": 0.0117, + "step": 1178 + }, + { + "epoch": 2.271807228915663, + "grad_norm": 0.13210316002368927, + "learning_rate": 6.798338243162008e-06, + "loss": 0.0024, + "step": 1179 + }, + { + "epoch": 2.273734939759036, + "grad_norm": 0.1601375937461853, + "learning_rate": 6.764682081461002e-06, + "loss": 0.013, + "step": 1180 + }, + { + "epoch": 2.27566265060241, + "grad_norm": 0.21996766328811646, + "learning_rate": 6.73109247123273e-06, + "loss": 0.0074, + "step": 1181 + }, + { + "epoch": 2.277590361445783, + "grad_norm": 0.15780030190944672, + "learning_rate": 6.6975695813766465e-06, + "loss": 0.0052, + "step": 1182 + }, + { + "epoch": 2.279518072289157, + "grad_norm": 0.18146437406539917, + "learning_rate": 6.664113580456739e-06, + "loss": 0.0265, + "step": 1183 + }, + { + "epoch": 2.28144578313253, + "grad_norm": 0.12033495306968689, + "learning_rate": 6.630724636700618e-06, + "loss": 0.0026, + "step": 1184 + }, + { + "epoch": 2.283373493975904, + "grad_norm": 0.25268155336380005, + "learning_rate": 6.59740291799873e-06, + "loss": 0.0046, + "step": 1185 + }, + { + "epoch": 2.285301204819277, + "grad_norm": 0.19043004512786865, + "learning_rate": 6.564148591903488e-06, + "loss": 0.0063, + "step": 1186 + }, + { + "epoch": 2.2872289156626504, + "grad_norm": 0.06894923001527786, + "learning_rate": 6.530961825628432e-06, + "loss": 0.0012, + "step": 1187 + }, + { + "epoch": 2.289156626506024, + "grad_norm": 0.16378818452358246, + "learning_rate": 6.4978427860474015e-06, + "loss": 0.0048, + "step": 1188 + }, + { + "epoch": 2.2910843373493974, + "grad_norm": 0.11130444705486298, + "learning_rate": 6.464791639693648e-06, + "loss": 0.0049, + "step": 1189 + }, + { + "epoch": 2.293012048192771, + "grad_norm": 0.10573417693376541, + "learning_rate": 6.431808552759083e-06, + "loss": 0.0019, + "step": 1190 + }, + { + "epoch": 2.2949397590361444, + "grad_norm": 0.13344882428646088, + "learning_rate": 6.398893691093367e-06, + "loss": 0.0033, + "step": 1191 + }, + { + "epoch": 2.296867469879518, + "grad_norm": 0.12659135460853577, + "learning_rate": 6.366047220203088e-06, + "loss": 0.0032, + "step": 1192 + }, + { + "epoch": 2.2987951807228915, + "grad_norm": 0.10152821987867355, + "learning_rate": 6.333269305250971e-06, + "loss": 0.0027, + "step": 1193 + }, + { + "epoch": 2.300722891566265, + "grad_norm": 0.1889944225549698, + "learning_rate": 6.300560111055006e-06, + "loss": 0.0062, + "step": 1194 + }, + { + "epoch": 2.3026506024096385, + "grad_norm": 2.3101227283477783, + "learning_rate": 6.2679198020876275e-06, + "loss": 0.0113, + "step": 1195 + }, + { + "epoch": 2.304578313253012, + "grad_norm": 0.6224933862686157, + "learning_rate": 6.235348542474908e-06, + "loss": 0.0273, + "step": 1196 + }, + { + "epoch": 2.3065060240963855, + "grad_norm": 0.1908419281244278, + "learning_rate": 6.202846495995705e-06, + "loss": 0.0056, + "step": 1197 + }, + { + "epoch": 2.3084337349397592, + "grad_norm": 0.10968491435050964, + "learning_rate": 6.170413826080856e-06, + "loss": 0.0034, + "step": 1198 + }, + { + "epoch": 2.3103614457831325, + "grad_norm": 0.23200668394565582, + "learning_rate": 6.138050695812343e-06, + "loss": 0.0042, + "step": 1199 + }, + { + "epoch": 2.3122891566265062, + "grad_norm": 0.12442032992839813, + "learning_rate": 6.105757267922481e-06, + "loss": 0.0045, + "step": 1200 + }, + { + "epoch": 2.3142168674698795, + "grad_norm": 0.14563624560832977, + "learning_rate": 6.073533704793122e-06, + "loss": 0.0035, + "step": 1201 + }, + { + "epoch": 2.316144578313253, + "grad_norm": 0.11523722857236862, + "learning_rate": 6.04138016845478e-06, + "loss": 0.0088, + "step": 1202 + }, + { + "epoch": 2.3180722891566266, + "grad_norm": 0.2000943422317505, + "learning_rate": 6.009296820585871e-06, + "loss": 0.0059, + "step": 1203 + }, + { + "epoch": 2.32, + "grad_norm": 0.10698592662811279, + "learning_rate": 5.977283822511879e-06, + "loss": 0.0028, + "step": 1204 + }, + { + "epoch": 2.3219277108433736, + "grad_norm": 0.1533137410879135, + "learning_rate": 5.945341335204547e-06, + "loss": 0.0044, + "step": 1205 + }, + { + "epoch": 2.323855421686747, + "grad_norm": 0.1235835999250412, + "learning_rate": 5.9134695192810695e-06, + "loss": 0.0043, + "step": 1206 + }, + { + "epoch": 2.3257831325301206, + "grad_norm": 0.1916925013065338, + "learning_rate": 5.8816685350032575e-06, + "loss": 0.0066, + "step": 1207 + }, + { + "epoch": 2.327710843373494, + "grad_norm": 0.08812380582094193, + "learning_rate": 5.849938542276801e-06, + "loss": 0.0022, + "step": 1208 + }, + { + "epoch": 2.3296385542168676, + "grad_norm": 0.13387660682201385, + "learning_rate": 5.818279700650393e-06, + "loss": 0.0037, + "step": 1209 + }, + { + "epoch": 2.331566265060241, + "grad_norm": 0.2309022694826126, + "learning_rate": 5.786692169314954e-06, + "loss": 0.0049, + "step": 1210 + }, + { + "epoch": 2.3334939759036146, + "grad_norm": 0.09956549853086472, + "learning_rate": 5.755176107102833e-06, + "loss": 0.002, + "step": 1211 + }, + { + "epoch": 2.335421686746988, + "grad_norm": 0.06035687029361725, + "learning_rate": 5.723731672487043e-06, + "loss": 0.002, + "step": 1212 + }, + { + "epoch": 2.337349397590361, + "grad_norm": 0.06850237399339676, + "learning_rate": 5.69235902358038e-06, + "loss": 0.0013, + "step": 1213 + }, + { + "epoch": 2.339277108433735, + "grad_norm": 0.12068171054124832, + "learning_rate": 5.661058318134711e-06, + "loss": 0.0041, + "step": 1214 + }, + { + "epoch": 2.3412048192771087, + "grad_norm": 0.13146616518497467, + "learning_rate": 5.6298297135401355e-06, + "loss": 0.0022, + "step": 1215 + }, + { + "epoch": 2.343132530120482, + "grad_norm": 0.15160737931728363, + "learning_rate": 5.598673366824212e-06, + "loss": 0.0036, + "step": 1216 + }, + { + "epoch": 2.3450602409638552, + "grad_norm": 0.26196014881134033, + "learning_rate": 5.567589434651164e-06, + "loss": 0.0151, + "step": 1217 + }, + { + "epoch": 2.346987951807229, + "grad_norm": 0.12898831069469452, + "learning_rate": 5.536578073321073e-06, + "loss": 0.006, + "step": 1218 + }, + { + "epoch": 2.3489156626506023, + "grad_norm": 0.11385104805231094, + "learning_rate": 5.505639438769146e-06, + "loss": 0.0052, + "step": 1219 + }, + { + "epoch": 2.350843373493976, + "grad_norm": 0.14569509029388428, + "learning_rate": 5.47477368656486e-06, + "loss": 0.0048, + "step": 1220 + }, + { + "epoch": 2.3527710843373493, + "grad_norm": 0.12406075745820999, + "learning_rate": 5.443980971911238e-06, + "loss": 0.0028, + "step": 1221 + }, + { + "epoch": 2.354698795180723, + "grad_norm": 0.3730498254299164, + "learning_rate": 5.413261449644039e-06, + "loss": 0.0043, + "step": 1222 + }, + { + "epoch": 2.3566265060240963, + "grad_norm": 0.1449914574623108, + "learning_rate": 5.382615274230987e-06, + "loss": 0.0075, + "step": 1223 + }, + { + "epoch": 2.35855421686747, + "grad_norm": 0.20739100873470306, + "learning_rate": 5.352042599770995e-06, + "loss": 0.0061, + "step": 1224 + }, + { + "epoch": 2.3604819277108433, + "grad_norm": 0.05786775052547455, + "learning_rate": 5.321543579993398e-06, + "loss": 0.0015, + "step": 1225 + }, + { + "epoch": 2.362409638554217, + "grad_norm": 0.09043122828006744, + "learning_rate": 5.2911183682571446e-06, + "loss": 0.0034, + "step": 1226 + }, + { + "epoch": 2.3643373493975903, + "grad_norm": 0.2685496211051941, + "learning_rate": 5.260767117550094e-06, + "loss": 0.0076, + "step": 1227 + }, + { + "epoch": 2.3662650602409636, + "grad_norm": 0.17694126069545746, + "learning_rate": 5.230489980488165e-06, + "loss": 0.0148, + "step": 1228 + }, + { + "epoch": 2.3681927710843373, + "grad_norm": 0.11609307676553726, + "learning_rate": 5.200287109314633e-06, + "loss": 0.0049, + "step": 1229 + }, + { + "epoch": 2.370120481927711, + "grad_norm": 0.1257704645395279, + "learning_rate": 5.1701586558993285e-06, + "loss": 0.0031, + "step": 1230 + }, + { + "epoch": 2.3720481927710844, + "grad_norm": 0.27177703380584717, + "learning_rate": 5.140104771737899e-06, + "loss": 0.0058, + "step": 1231 + }, + { + "epoch": 2.3739759036144576, + "grad_norm": 0.13928169012069702, + "learning_rate": 5.110125607951024e-06, + "loss": 0.0051, + "step": 1232 + }, + { + "epoch": 2.3759036144578314, + "grad_norm": 0.679577648639679, + "learning_rate": 5.0802213152836514e-06, + "loss": 0.0173, + "step": 1233 + }, + { + "epoch": 2.3778313253012047, + "grad_norm": 0.16769403219223022, + "learning_rate": 5.0503920441042845e-06, + "loss": 0.0045, + "step": 1234 + }, + { + "epoch": 2.3797590361445784, + "grad_norm": 0.09427493065595627, + "learning_rate": 5.0206379444041764e-06, + "loss": 0.0024, + "step": 1235 + }, + { + "epoch": 2.3816867469879517, + "grad_norm": 0.33908671140670776, + "learning_rate": 4.990959165796585e-06, + "loss": 0.0088, + "step": 1236 + }, + { + "epoch": 2.3836144578313254, + "grad_norm": 0.18106943368911743, + "learning_rate": 4.961355857516034e-06, + "loss": 0.0094, + "step": 1237 + }, + { + "epoch": 2.3855421686746987, + "grad_norm": 0.5833203196525574, + "learning_rate": 4.931828168417583e-06, + "loss": 0.0086, + "step": 1238 + }, + { + "epoch": 2.3874698795180724, + "grad_norm": 0.09108569473028183, + "learning_rate": 4.902376246976015e-06, + "loss": 0.0014, + "step": 1239 + }, + { + "epoch": 2.3893975903614457, + "grad_norm": 0.10596407204866409, + "learning_rate": 4.873000241285153e-06, + "loss": 0.0043, + "step": 1240 + }, + { + "epoch": 2.3913253012048195, + "grad_norm": 0.10775511711835861, + "learning_rate": 4.8437002990570835e-06, + "loss": 0.0014, + "step": 1241 + }, + { + "epoch": 2.3932530120481927, + "grad_norm": 0.9646345973014832, + "learning_rate": 4.8144765676214245e-06, + "loss": 0.0525, + "step": 1242 + }, + { + "epoch": 2.395180722891566, + "grad_norm": 0.20530278980731964, + "learning_rate": 4.7853291939245814e-06, + "loss": 0.008, + "step": 1243 + }, + { + "epoch": 2.3971084337349398, + "grad_norm": 0.1682119369506836, + "learning_rate": 4.756258324528995e-06, + "loss": 0.0044, + "step": 1244 + }, + { + "epoch": 2.3990361445783135, + "grad_norm": 0.45536917448043823, + "learning_rate": 4.727264105612439e-06, + "loss": 0.0186, + "step": 1245 + }, + { + "epoch": 2.4009638554216868, + "grad_norm": 0.3017471730709076, + "learning_rate": 4.698346682967258e-06, + "loss": 0.0106, + "step": 1246 + }, + { + "epoch": 2.40289156626506, + "grad_norm": 0.1226554661989212, + "learning_rate": 4.669506201999625e-06, + "loss": 0.0035, + "step": 1247 + }, + { + "epoch": 2.404819277108434, + "grad_norm": 0.13750068843364716, + "learning_rate": 4.640742807728837e-06, + "loss": 0.0038, + "step": 1248 + }, + { + "epoch": 2.406746987951807, + "grad_norm": 0.11531024426221848, + "learning_rate": 4.612056644786575e-06, + "loss": 0.0021, + "step": 1249 + }, + { + "epoch": 2.408674698795181, + "grad_norm": 0.1143675372004509, + "learning_rate": 4.583447857416175e-06, + "loss": 0.0028, + "step": 1250 + }, + { + "epoch": 2.410602409638554, + "grad_norm": 0.0914216861128807, + "learning_rate": 4.554916589471898e-06, + "loss": 0.0027, + "step": 1251 + }, + { + "epoch": 2.412530120481928, + "grad_norm": 0.18339012563228607, + "learning_rate": 4.526462984418221e-06, + "loss": 0.0037, + "step": 1252 + }, + { + "epoch": 2.414457831325301, + "grad_norm": 0.11073138564825058, + "learning_rate": 4.498087185329105e-06, + "loss": 0.003, + "step": 1253 + }, + { + "epoch": 2.416385542168675, + "grad_norm": 0.20792435109615326, + "learning_rate": 4.469789334887265e-06, + "loss": 0.009, + "step": 1254 + }, + { + "epoch": 2.418313253012048, + "grad_norm": 0.09485629945993423, + "learning_rate": 4.441569575383471e-06, + "loss": 0.0033, + "step": 1255 + }, + { + "epoch": 2.420240963855422, + "grad_norm": 0.11831793934106827, + "learning_rate": 4.413428048715851e-06, + "loss": 0.0021, + "step": 1256 + }, + { + "epoch": 2.422168674698795, + "grad_norm": 0.11818034201860428, + "learning_rate": 4.38536489638911e-06, + "loss": 0.0041, + "step": 1257 + }, + { + "epoch": 2.4240963855421684, + "grad_norm": 0.2583082616329193, + "learning_rate": 4.3573802595138945e-06, + "loss": 0.0039, + "step": 1258 + }, + { + "epoch": 2.426024096385542, + "grad_norm": 0.3120201826095581, + "learning_rate": 4.329474278806034e-06, + "loss": 0.0087, + "step": 1259 + }, + { + "epoch": 2.427951807228916, + "grad_norm": 0.1258879452943802, + "learning_rate": 4.301647094585855e-06, + "loss": 0.0046, + "step": 1260 + }, + { + "epoch": 2.429879518072289, + "grad_norm": 0.15144586563110352, + "learning_rate": 4.273898846777473e-06, + "loss": 0.0054, + "step": 1261 + }, + { + "epoch": 2.4318072289156625, + "grad_norm": 0.15615184605121613, + "learning_rate": 4.246229674908067e-06, + "loss": 0.0072, + "step": 1262 + }, + { + "epoch": 2.433734939759036, + "grad_norm": 0.09690173715353012, + "learning_rate": 4.218639718107225e-06, + "loss": 0.003, + "step": 1263 + }, + { + "epoch": 2.4356626506024095, + "grad_norm": 0.23884955048561096, + "learning_rate": 4.1911291151062e-06, + "loss": 0.0109, + "step": 1264 + }, + { + "epoch": 2.4375903614457832, + "grad_norm": 0.0905768945813179, + "learning_rate": 4.163698004237222e-06, + "loss": 0.0027, + "step": 1265 + }, + { + "epoch": 2.4395180722891565, + "grad_norm": 0.09168912470340729, + "learning_rate": 4.136346523432821e-06, + "loss": 0.0018, + "step": 1266 + }, + { + "epoch": 2.4414457831325302, + "grad_norm": 0.17878012359142303, + "learning_rate": 4.109074810225118e-06, + "loss": 0.0048, + "step": 1267 + }, + { + "epoch": 2.4433734939759035, + "grad_norm": 0.09913790971040726, + "learning_rate": 4.08188300174513e-06, + "loss": 0.0021, + "step": 1268 + }, + { + "epoch": 2.4453012048192773, + "grad_norm": 0.16615812480449677, + "learning_rate": 4.054771234722106e-06, + "loss": 0.0066, + "step": 1269 + }, + { + "epoch": 2.4472289156626506, + "grad_norm": 0.09618276357650757, + "learning_rate": 4.027739645482784e-06, + "loss": 0.0043, + "step": 1270 + }, + { + "epoch": 2.4491566265060243, + "grad_norm": 0.33473479747772217, + "learning_rate": 4.0007883699507855e-06, + "loss": 0.0236, + "step": 1271 + }, + { + "epoch": 2.4510843373493976, + "grad_norm": 0.15051880478858948, + "learning_rate": 3.973917543645867e-06, + "loss": 0.0068, + "step": 1272 + }, + { + "epoch": 2.453012048192771, + "grad_norm": 0.24134816229343414, + "learning_rate": 3.947127301683249e-06, + "loss": 0.0194, + "step": 1273 + }, + { + "epoch": 2.4549397590361446, + "grad_norm": 0.10495353490114212, + "learning_rate": 3.920417778772967e-06, + "loss": 0.0042, + "step": 1274 + }, + { + "epoch": 2.4568674698795183, + "grad_norm": 0.2294938713312149, + "learning_rate": 3.893789109219171e-06, + "loss": 0.0224, + "step": 1275 + }, + { + "epoch": 2.4587951807228916, + "grad_norm": 0.13710513710975647, + "learning_rate": 3.867241426919446e-06, + "loss": 0.0046, + "step": 1276 + }, + { + "epoch": 2.460722891566265, + "grad_norm": 0.06754808127880096, + "learning_rate": 3.840774865364157e-06, + "loss": 0.0019, + "step": 1277 + }, + { + "epoch": 2.4626506024096386, + "grad_norm": 0.24797780811786652, + "learning_rate": 3.8143895576357605e-06, + "loss": 0.0063, + "step": 1278 + }, + { + "epoch": 2.464578313253012, + "grad_norm": 0.1476449817419052, + "learning_rate": 3.788085636408143e-06, + "loss": 0.0055, + "step": 1279 + }, + { + "epoch": 2.4665060240963856, + "grad_norm": 0.22397096455097198, + "learning_rate": 3.7618632339459616e-06, + "loss": 0.0164, + "step": 1280 + }, + { + "epoch": 2.468433734939759, + "grad_norm": 0.21596969664096832, + "learning_rate": 3.7357224821039497e-06, + "loss": 0.0112, + "step": 1281 + }, + { + "epoch": 2.4703614457831327, + "grad_norm": 0.2775099575519562, + "learning_rate": 3.7096635123263068e-06, + "loss": 0.0112, + "step": 1282 + }, + { + "epoch": 2.472289156626506, + "grad_norm": 0.07963326573371887, + "learning_rate": 3.683686455645974e-06, + "loss": 0.0013, + "step": 1283 + }, + { + "epoch": 2.4742168674698797, + "grad_norm": 0.1253802627325058, + "learning_rate": 3.6577914426840266e-06, + "loss": 0.0038, + "step": 1284 + }, + { + "epoch": 2.476144578313253, + "grad_norm": 0.10258597880601883, + "learning_rate": 3.631978603648989e-06, + "loss": 0.0023, + "step": 1285 + }, + { + "epoch": 2.4780722891566267, + "grad_norm": 0.17102380096912384, + "learning_rate": 3.6062480683361935e-06, + "loss": 0.0025, + "step": 1286 + }, + { + "epoch": 2.48, + "grad_norm": 0.09547360241413116, + "learning_rate": 3.580599966127123e-06, + "loss": 0.003, + "step": 1287 + }, + { + "epoch": 2.4819277108433733, + "grad_norm": 0.08008653670549393, + "learning_rate": 3.5550344259887438e-06, + "loss": 0.0023, + "step": 1288 + } + ], + "logging_steps": 1, + "max_steps": 1557, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 92, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.690243626006741e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1288/training_args.bin b/checkpoint-1288/training_args.bin new file mode 100644 index 0000000..ecc7b6b --- /dev/null +++ b/checkpoint-1288/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342dfb3c86216e436950100c79812c54066d5572c4e9975b0f133c067f061bcf +size 7825 diff --git a/checkpoint-1380/chat_template.jinja b/checkpoint-1380/chat_template.jinja new file mode 100644 index 0000000..1bad6a0 --- /dev/null +++ b/checkpoint-1380/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-1380/config.json b/checkpoint-1380/config.json new file mode 100644 index 0000000..f8bf41e --- /dev/null +++ b/checkpoint-1380/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128009, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.56.2", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-1380/generation_config.json b/checkpoint-1380/generation_config.json new file mode 100644 index 0000000..2152026 --- /dev/null +++ b/checkpoint-1380/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128009, + 128001, + 128008, + 128009 + ], + "pad_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.56.2" +} diff --git a/checkpoint-1380/model.safetensors b/checkpoint-1380/model.safetensors new file mode 100644 index 0000000..228adc7 --- /dev/null +++ b/checkpoint-1380/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c05253224e9fb5884d7e946e571f525224eabd60605ccf17ef47398b9b1340da +size 2996982344 diff --git a/checkpoint-1380/special_tokens_map.json b/checkpoint-1380/special_tokens_map.json new file mode 100644 index 0000000..14daf45 --- /dev/null +++ b/checkpoint-1380/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/checkpoint-1380/tokenizer.json b/checkpoint-1380/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/checkpoint-1380/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-1380/tokenizer_config.json b/checkpoint-1380/tokenizer_config.json new file mode 100644 index 0000000..d1e1ea9 --- /dev/null +++ b/checkpoint-1380/tokenizer_config.json @@ -0,0 +1,2068 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-1380/trainer_state.json b/checkpoint-1380/trainer_state.json new file mode 100644 index 0000000..3fa1399 --- /dev/null +++ b/checkpoint-1380/trainer_state.json @@ -0,0 +1,9694 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.659277108433735, + "eval_steps": 500, + "global_step": 1380, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0019277108433734939, + "grad_norm": 2.8518834114074707, + "learning_rate": 0.0, + "loss": 0.0891, + "step": 1 + }, + { + "epoch": 0.0038554216867469878, + "grad_norm": 1.8441249132156372, + "learning_rate": 2.564102564102564e-07, + "loss": 0.0539, + "step": 2 + }, + { + "epoch": 0.005783132530120482, + "grad_norm": 2.8263237476348877, + "learning_rate": 5.128205128205128e-07, + "loss": 0.099, + "step": 3 + }, + { + "epoch": 0.0077108433734939755, + "grad_norm": 2.5051236152648926, + "learning_rate": 7.692307692307694e-07, + "loss": 0.0789, + "step": 4 + }, + { + "epoch": 0.00963855421686747, + "grad_norm": 2.6903438568115234, + "learning_rate": 1.0256410256410257e-06, + "loss": 0.0881, + "step": 5 + }, + { + "epoch": 0.011566265060240964, + "grad_norm": 2.6205761432647705, + "learning_rate": 1.282051282051282e-06, + "loss": 0.0776, + "step": 6 + }, + { + "epoch": 0.013493975903614458, + "grad_norm": 2.6309337615966797, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.0827, + "step": 7 + }, + { + "epoch": 0.015421686746987951, + "grad_norm": 1.5427855253219604, + "learning_rate": 1.794871794871795e-06, + "loss": 0.0577, + "step": 8 + }, + { + "epoch": 0.017349397590361446, + "grad_norm": 1.0973446369171143, + "learning_rate": 2.0512820512820513e-06, + "loss": 0.04, + "step": 9 + }, + { + "epoch": 0.01927710843373494, + "grad_norm": 1.3253350257873535, + "learning_rate": 2.307692307692308e-06, + "loss": 0.0506, + "step": 10 + }, + { + "epoch": 0.021204819277108433, + "grad_norm": 1.588739037513733, + "learning_rate": 2.564102564102564e-06, + "loss": 0.0874, + "step": 11 + }, + { + "epoch": 0.02313253012048193, + "grad_norm": 1.4987014532089233, + "learning_rate": 2.8205128205128207e-06, + "loss": 0.0597, + "step": 12 + }, + { + "epoch": 0.02506024096385542, + "grad_norm": 1.6571592092514038, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.0559, + "step": 13 + }, + { + "epoch": 0.026987951807228915, + "grad_norm": 1.8860628604888916, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0688, + "step": 14 + }, + { + "epoch": 0.02891566265060241, + "grad_norm": 1.3202295303344727, + "learning_rate": 3.58974358974359e-06, + "loss": 0.0433, + "step": 15 + }, + { + "epoch": 0.030843373493975902, + "grad_norm": 1.5870612859725952, + "learning_rate": 3.846153846153847e-06, + "loss": 0.0695, + "step": 16 + }, + { + "epoch": 0.0327710843373494, + "grad_norm": 0.9192284345626831, + "learning_rate": 4.102564102564103e-06, + "loss": 0.0392, + "step": 17 + }, + { + "epoch": 0.03469879518072289, + "grad_norm": 0.7950155735015869, + "learning_rate": 4.358974358974359e-06, + "loss": 0.0351, + "step": 18 + }, + { + "epoch": 0.03662650602409639, + "grad_norm": 0.8854314684867859, + "learning_rate": 4.615384615384616e-06, + "loss": 0.0356, + "step": 19 + }, + { + "epoch": 0.03855421686746988, + "grad_norm": 0.9546788930892944, + "learning_rate": 4.871794871794872e-06, + "loss": 0.0427, + "step": 20 + }, + { + "epoch": 0.04048192771084337, + "grad_norm": 0.6315903663635254, + "learning_rate": 5.128205128205128e-06, + "loss": 0.0397, + "step": 21 + }, + { + "epoch": 0.042409638554216866, + "grad_norm": 0.9230924844741821, + "learning_rate": 5.384615384615385e-06, + "loss": 0.0481, + "step": 22 + }, + { + "epoch": 0.04433734939759036, + "grad_norm": 0.711546003818512, + "learning_rate": 5.641025641025641e-06, + "loss": 0.0479, + "step": 23 + }, + { + "epoch": 0.04626506024096386, + "grad_norm": 0.5288046598434448, + "learning_rate": 5.897435897435898e-06, + "loss": 0.0182, + "step": 24 + }, + { + "epoch": 0.04819277108433735, + "grad_norm": 0.9420496225357056, + "learning_rate": 6.153846153846155e-06, + "loss": 0.0389, + "step": 25 + }, + { + "epoch": 0.05012048192771084, + "grad_norm": 0.5001983046531677, + "learning_rate": 6.410256410256412e-06, + "loss": 0.0268, + "step": 26 + }, + { + "epoch": 0.052048192771084335, + "grad_norm": 0.8084653615951538, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0367, + "step": 27 + }, + { + "epoch": 0.05397590361445783, + "grad_norm": 0.7195103764533997, + "learning_rate": 6.923076923076923e-06, + "loss": 0.0251, + "step": 28 + }, + { + "epoch": 0.055903614457831326, + "grad_norm": 0.529958963394165, + "learning_rate": 7.17948717948718e-06, + "loss": 0.0289, + "step": 29 + }, + { + "epoch": 0.05783132530120482, + "grad_norm": 0.795376181602478, + "learning_rate": 7.435897435897437e-06, + "loss": 0.043, + "step": 30 + }, + { + "epoch": 0.059759036144578316, + "grad_norm": 0.6366249918937683, + "learning_rate": 7.692307692307694e-06, + "loss": 0.029, + "step": 31 + }, + { + "epoch": 0.061686746987951804, + "grad_norm": 0.5414115190505981, + "learning_rate": 7.948717948717949e-06, + "loss": 0.0365, + "step": 32 + }, + { + "epoch": 0.0636144578313253, + "grad_norm": 0.9350972175598145, + "learning_rate": 8.205128205128205e-06, + "loss": 0.0283, + "step": 33 + }, + { + "epoch": 0.0655421686746988, + "grad_norm": 0.5660741925239563, + "learning_rate": 8.461538461538462e-06, + "loss": 0.0234, + "step": 34 + }, + { + "epoch": 0.06746987951807229, + "grad_norm": 0.5623988509178162, + "learning_rate": 8.717948717948719e-06, + "loss": 0.0307, + "step": 35 + }, + { + "epoch": 0.06939759036144579, + "grad_norm": 0.5260195732116699, + "learning_rate": 8.974358974358976e-06, + "loss": 0.0264, + "step": 36 + }, + { + "epoch": 0.07132530120481928, + "grad_norm": 0.4934785068035126, + "learning_rate": 9.230769230769232e-06, + "loss": 0.0224, + "step": 37 + }, + { + "epoch": 0.07325301204819278, + "grad_norm": 0.4797322154045105, + "learning_rate": 9.487179487179487e-06, + "loss": 0.0163, + "step": 38 + }, + { + "epoch": 0.07518072289156627, + "grad_norm": 0.4739217460155487, + "learning_rate": 9.743589743589744e-06, + "loss": 0.0165, + "step": 39 + }, + { + "epoch": 0.07710843373493977, + "grad_norm": 0.4527677595615387, + "learning_rate": 1e-05, + "loss": 0.0163, + "step": 40 + }, + { + "epoch": 0.07903614457831325, + "grad_norm": 0.6241316795349121, + "learning_rate": 1.0256410256410256e-05, + "loss": 0.0302, + "step": 41 + }, + { + "epoch": 0.08096385542168674, + "grad_norm": 0.639043927192688, + "learning_rate": 1.0512820512820514e-05, + "loss": 0.0312, + "step": 42 + }, + { + "epoch": 0.08289156626506024, + "grad_norm": 0.5121409296989441, + "learning_rate": 1.076923076923077e-05, + "loss": 0.0256, + "step": 43 + }, + { + "epoch": 0.08481927710843373, + "grad_norm": 0.6340477466583252, + "learning_rate": 1.1025641025641028e-05, + "loss": 0.04, + "step": 44 + }, + { + "epoch": 0.08674698795180723, + "grad_norm": 0.5260409712791443, + "learning_rate": 1.1282051282051283e-05, + "loss": 0.0282, + "step": 45 + }, + { + "epoch": 0.08867469879518072, + "grad_norm": 0.6390711069107056, + "learning_rate": 1.1538461538461538e-05, + "loss": 0.0243, + "step": 46 + }, + { + "epoch": 0.09060240963855422, + "grad_norm": 0.46469295024871826, + "learning_rate": 1.1794871794871796e-05, + "loss": 0.0208, + "step": 47 + }, + { + "epoch": 0.09253012048192771, + "grad_norm": 0.8711516857147217, + "learning_rate": 1.2051282051282051e-05, + "loss": 0.0291, + "step": 48 + }, + { + "epoch": 0.09445783132530121, + "grad_norm": 0.9164300560951233, + "learning_rate": 1.230769230769231e-05, + "loss": 0.0342, + "step": 49 + }, + { + "epoch": 0.0963855421686747, + "grad_norm": 0.5401139259338379, + "learning_rate": 1.2564102564102565e-05, + "loss": 0.0185, + "step": 50 + }, + { + "epoch": 0.0983132530120482, + "grad_norm": 0.44393008947372437, + "learning_rate": 1.2820512820512823e-05, + "loss": 0.0228, + "step": 51 + }, + { + "epoch": 0.10024096385542168, + "grad_norm": 0.3855767846107483, + "learning_rate": 1.3076923076923078e-05, + "loss": 0.0176, + "step": 52 + }, + { + "epoch": 0.10216867469879518, + "grad_norm": 0.8561235070228577, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.0433, + "step": 53 + }, + { + "epoch": 0.10409638554216867, + "grad_norm": 0.768002450466156, + "learning_rate": 1.3589743589743592e-05, + "loss": 0.0245, + "step": 54 + }, + { + "epoch": 0.10602409638554217, + "grad_norm": 0.4559759497642517, + "learning_rate": 1.3846153846153847e-05, + "loss": 0.0224, + "step": 55 + }, + { + "epoch": 0.10795180722891566, + "grad_norm": 0.6203847527503967, + "learning_rate": 1.4102564102564105e-05, + "loss": 0.0296, + "step": 56 + }, + { + "epoch": 0.10987951807228916, + "grad_norm": 0.6651368141174316, + "learning_rate": 1.435897435897436e-05, + "loss": 0.0336, + "step": 57 + }, + { + "epoch": 0.11180722891566265, + "grad_norm": 0.377734512090683, + "learning_rate": 1.4615384615384615e-05, + "loss": 0.0196, + "step": 58 + }, + { + "epoch": 0.11373493975903615, + "grad_norm": 0.687568724155426, + "learning_rate": 1.4871794871794874e-05, + "loss": 0.0207, + "step": 59 + }, + { + "epoch": 0.11566265060240964, + "grad_norm": 0.7905604243278503, + "learning_rate": 1.5128205128205129e-05, + "loss": 0.047, + "step": 60 + }, + { + "epoch": 0.11759036144578314, + "grad_norm": 0.7938196063041687, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.0198, + "step": 61 + }, + { + "epoch": 0.11951807228915663, + "grad_norm": 0.41340553760528564, + "learning_rate": 1.5641025641025644e-05, + "loss": 0.0161, + "step": 62 + }, + { + "epoch": 0.12144578313253013, + "grad_norm": 0.5668172240257263, + "learning_rate": 1.5897435897435897e-05, + "loss": 0.0275, + "step": 63 + }, + { + "epoch": 0.12337349397590361, + "grad_norm": 0.48333367705345154, + "learning_rate": 1.6153846153846154e-05, + "loss": 0.0137, + "step": 64 + }, + { + "epoch": 0.12530120481927712, + "grad_norm": 0.6843933463096619, + "learning_rate": 1.641025641025641e-05, + "loss": 0.0294, + "step": 65 + }, + { + "epoch": 0.1272289156626506, + "grad_norm": 0.7789272665977478, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.0401, + "step": 66 + }, + { + "epoch": 0.1291566265060241, + "grad_norm": 0.6203492879867554, + "learning_rate": 1.6923076923076924e-05, + "loss": 0.0292, + "step": 67 + }, + { + "epoch": 0.1310843373493976, + "grad_norm": 0.5940662622451782, + "learning_rate": 1.717948717948718e-05, + "loss": 0.0178, + "step": 68 + }, + { + "epoch": 0.13301204819277107, + "grad_norm": 0.35504868626594543, + "learning_rate": 1.7435897435897438e-05, + "loss": 0.0129, + "step": 69 + }, + { + "epoch": 0.13493975903614458, + "grad_norm": 0.8796699643135071, + "learning_rate": 1.7692307692307694e-05, + "loss": 0.034, + "step": 70 + }, + { + "epoch": 0.13686746987951806, + "grad_norm": 0.967444896697998, + "learning_rate": 1.794871794871795e-05, + "loss": 0.0266, + "step": 71 + }, + { + "epoch": 0.13879518072289157, + "grad_norm": 0.4428526759147644, + "learning_rate": 1.8205128205128208e-05, + "loss": 0.0223, + "step": 72 + }, + { + "epoch": 0.14072289156626505, + "grad_norm": 0.42897751927375793, + "learning_rate": 1.8461538461538465e-05, + "loss": 0.0187, + "step": 73 + }, + { + "epoch": 0.14265060240963856, + "grad_norm": 0.5100914835929871, + "learning_rate": 1.8717948717948718e-05, + "loss": 0.0164, + "step": 74 + }, + { + "epoch": 0.14457831325301204, + "grad_norm": 0.6028861999511719, + "learning_rate": 1.8974358974358975e-05, + "loss": 0.0164, + "step": 75 + }, + { + "epoch": 0.14650602409638555, + "grad_norm": 0.6187024116516113, + "learning_rate": 1.923076923076923e-05, + "loss": 0.0296, + "step": 76 + }, + { + "epoch": 0.14843373493975903, + "grad_norm": 0.4822489619255066, + "learning_rate": 1.9487179487179488e-05, + "loss": 0.0148, + "step": 77 + }, + { + "epoch": 0.15036144578313254, + "grad_norm": 0.7231149673461914, + "learning_rate": 1.9743589743589745e-05, + "loss": 0.0395, + "step": 78 + }, + { + "epoch": 0.15228915662650602, + "grad_norm": 0.8409642577171326, + "learning_rate": 2e-05, + "loss": 0.0446, + "step": 79 + }, + { + "epoch": 0.15421686746987953, + "grad_norm": 0.4883500039577484, + "learning_rate": 2.025641025641026e-05, + "loss": 0.0206, + "step": 80 + }, + { + "epoch": 0.156144578313253, + "grad_norm": 0.6287479400634766, + "learning_rate": 2.0512820512820512e-05, + "loss": 0.0333, + "step": 81 + }, + { + "epoch": 0.1580722891566265, + "grad_norm": 0.5041632652282715, + "learning_rate": 2.0769230769230772e-05, + "loss": 0.0414, + "step": 82 + }, + { + "epoch": 0.16, + "grad_norm": 0.5103405117988586, + "learning_rate": 2.102564102564103e-05, + "loss": 0.045, + "step": 83 + }, + { + "epoch": 0.16192771084337348, + "grad_norm": 0.493161678314209, + "learning_rate": 2.1282051282051285e-05, + "loss": 0.021, + "step": 84 + }, + { + "epoch": 0.163855421686747, + "grad_norm": 0.908843994140625, + "learning_rate": 2.153846153846154e-05, + "loss": 0.0389, + "step": 85 + }, + { + "epoch": 0.16578313253012048, + "grad_norm": 0.5067003965377808, + "learning_rate": 2.1794871794871795e-05, + "loss": 0.0272, + "step": 86 + }, + { + "epoch": 0.16771084337349398, + "grad_norm": 0.5791381597518921, + "learning_rate": 2.2051282051282056e-05, + "loss": 0.0368, + "step": 87 + }, + { + "epoch": 0.16963855421686747, + "grad_norm": 0.7056036591529846, + "learning_rate": 2.230769230769231e-05, + "loss": 0.0284, + "step": 88 + }, + { + "epoch": 0.17156626506024097, + "grad_norm": 0.6563822031021118, + "learning_rate": 2.2564102564102566e-05, + "loss": 0.0646, + "step": 89 + }, + { + "epoch": 0.17349397590361446, + "grad_norm": 0.9483286142349243, + "learning_rate": 2.2820512820512822e-05, + "loss": 0.0439, + "step": 90 + }, + { + "epoch": 0.17542168674698796, + "grad_norm": 0.370664119720459, + "learning_rate": 2.3076923076923076e-05, + "loss": 0.0109, + "step": 91 + }, + { + "epoch": 0.17734939759036145, + "grad_norm": 0.9776477813720703, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.0458, + "step": 92 + }, + { + "epoch": 0.17927710843373493, + "grad_norm": 0.45710092782974243, + "learning_rate": 2.3589743589743593e-05, + "loss": 0.0212, + "step": 93 + }, + { + "epoch": 0.18120481927710844, + "grad_norm": 0.8623896837234497, + "learning_rate": 2.384615384615385e-05, + "loss": 0.0215, + "step": 94 + }, + { + "epoch": 0.18313253012048192, + "grad_norm": 0.55814528465271, + "learning_rate": 2.4102564102564103e-05, + "loss": 0.0218, + "step": 95 + }, + { + "epoch": 0.18506024096385543, + "grad_norm": 0.49882641434669495, + "learning_rate": 2.435897435897436e-05, + "loss": 0.0268, + "step": 96 + }, + { + "epoch": 0.1869879518072289, + "grad_norm": 0.3508654534816742, + "learning_rate": 2.461538461538462e-05, + "loss": 0.0172, + "step": 97 + }, + { + "epoch": 0.18891566265060242, + "grad_norm": 0.601170003414154, + "learning_rate": 2.4871794871794873e-05, + "loss": 0.0208, + "step": 98 + }, + { + "epoch": 0.1908433734939759, + "grad_norm": 1.1748133897781372, + "learning_rate": 2.512820512820513e-05, + "loss": 0.0259, + "step": 99 + }, + { + "epoch": 0.1927710843373494, + "grad_norm": 0.46370384097099304, + "learning_rate": 2.5384615384615386e-05, + "loss": 0.0242, + "step": 100 + }, + { + "epoch": 0.1946987951807229, + "grad_norm": 0.525010883808136, + "learning_rate": 2.5641025641025646e-05, + "loss": 0.0188, + "step": 101 + }, + { + "epoch": 0.1966265060240964, + "grad_norm": 0.766501784324646, + "learning_rate": 2.58974358974359e-05, + "loss": 0.0584, + "step": 102 + }, + { + "epoch": 0.19855421686746988, + "grad_norm": 0.3572964370250702, + "learning_rate": 2.6153846153846157e-05, + "loss": 0.0131, + "step": 103 + }, + { + "epoch": 0.20048192771084336, + "grad_norm": 0.6467130780220032, + "learning_rate": 2.6410256410256413e-05, + "loss": 0.0231, + "step": 104 + }, + { + "epoch": 0.20240963855421687, + "grad_norm": 1.1852102279663086, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.027, + "step": 105 + }, + { + "epoch": 0.20433734939759035, + "grad_norm": 2.3659932613372803, + "learning_rate": 2.6923076923076927e-05, + "loss": 0.0224, + "step": 106 + }, + { + "epoch": 0.20626506024096386, + "grad_norm": 0.5343687534332275, + "learning_rate": 2.7179487179487183e-05, + "loss": 0.0198, + "step": 107 + }, + { + "epoch": 0.20819277108433734, + "grad_norm": 1.852160096168518, + "learning_rate": 2.7435897435897437e-05, + "loss": 0.032, + "step": 108 + }, + { + "epoch": 0.21012048192771085, + "grad_norm": 0.47291702032089233, + "learning_rate": 2.7692307692307694e-05, + "loss": 0.0117, + "step": 109 + }, + { + "epoch": 0.21204819277108433, + "grad_norm": 0.7623187899589539, + "learning_rate": 2.794871794871795e-05, + "loss": 0.0337, + "step": 110 + }, + { + "epoch": 0.21397590361445784, + "grad_norm": 0.5272570848464966, + "learning_rate": 2.820512820512821e-05, + "loss": 0.0131, + "step": 111 + }, + { + "epoch": 0.21590361445783132, + "grad_norm": 0.5568500757217407, + "learning_rate": 2.8461538461538464e-05, + "loss": 0.0233, + "step": 112 + }, + { + "epoch": 0.21783132530120483, + "grad_norm": 0.4008469879627228, + "learning_rate": 2.871794871794872e-05, + "loss": 0.0204, + "step": 113 + }, + { + "epoch": 0.2197590361445783, + "grad_norm": 0.4888612926006317, + "learning_rate": 2.8974358974358977e-05, + "loss": 0.016, + "step": 114 + }, + { + "epoch": 0.2216867469879518, + "grad_norm": 0.44903355836868286, + "learning_rate": 2.923076923076923e-05, + "loss": 0.0135, + "step": 115 + }, + { + "epoch": 0.2236144578313253, + "grad_norm": 0.9266762733459473, + "learning_rate": 2.948717948717949e-05, + "loss": 0.0233, + "step": 116 + }, + { + "epoch": 0.22554216867469878, + "grad_norm": 0.5352638959884644, + "learning_rate": 2.9743589743589747e-05, + "loss": 0.0198, + "step": 117 + }, + { + "epoch": 0.2274698795180723, + "grad_norm": 0.6051343679428101, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0246, + "step": 118 + }, + { + "epoch": 0.22939759036144577, + "grad_norm": 0.9971133470535278, + "learning_rate": 3.0256410256410257e-05, + "loss": 0.025, + "step": 119 + }, + { + "epoch": 0.23132530120481928, + "grad_norm": 0.704236626625061, + "learning_rate": 3.0512820512820514e-05, + "loss": 0.031, + "step": 120 + }, + { + "epoch": 0.23325301204819276, + "grad_norm": 0.6137097477912903, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.0519, + "step": 121 + }, + { + "epoch": 0.23518072289156627, + "grad_norm": 0.7396159768104553, + "learning_rate": 3.102564102564103e-05, + "loss": 0.0325, + "step": 122 + }, + { + "epoch": 0.23710843373493976, + "grad_norm": 1.3282053470611572, + "learning_rate": 3.128205128205129e-05, + "loss": 0.0252, + "step": 123 + }, + { + "epoch": 0.23903614457831326, + "grad_norm": 0.5220731496810913, + "learning_rate": 3.153846153846154e-05, + "loss": 0.0262, + "step": 124 + }, + { + "epoch": 0.24096385542168675, + "grad_norm": 0.5357242822647095, + "learning_rate": 3.1794871794871795e-05, + "loss": 0.0243, + "step": 125 + }, + { + "epoch": 0.24289156626506025, + "grad_norm": 0.48207753896713257, + "learning_rate": 3.205128205128206e-05, + "loss": 0.0178, + "step": 126 + }, + { + "epoch": 0.24481927710843374, + "grad_norm": 0.552988588809967, + "learning_rate": 3.230769230769231e-05, + "loss": 0.023, + "step": 127 + }, + { + "epoch": 0.24674698795180722, + "grad_norm": 1.7962840795516968, + "learning_rate": 3.2564102564102565e-05, + "loss": 0.032, + "step": 128 + }, + { + "epoch": 0.24867469879518073, + "grad_norm": 1.6404600143432617, + "learning_rate": 3.282051282051282e-05, + "loss": 0.0231, + "step": 129 + }, + { + "epoch": 0.25060240963855424, + "grad_norm": 0.39142486453056335, + "learning_rate": 3.307692307692308e-05, + "loss": 0.0147, + "step": 130 + }, + { + "epoch": 0.2525301204819277, + "grad_norm": 1.3272887468338013, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0439, + "step": 131 + }, + { + "epoch": 0.2544578313253012, + "grad_norm": 1.5122811794281006, + "learning_rate": 3.358974358974359e-05, + "loss": 0.0282, + "step": 132 + }, + { + "epoch": 0.2563855421686747, + "grad_norm": 1.8542430400848389, + "learning_rate": 3.384615384615385e-05, + "loss": 0.0515, + "step": 133 + }, + { + "epoch": 0.2583132530120482, + "grad_norm": 4.059277534484863, + "learning_rate": 3.4102564102564105e-05, + "loss": 0.0781, + "step": 134 + }, + { + "epoch": 0.26024096385542167, + "grad_norm": 0.6206214427947998, + "learning_rate": 3.435897435897436e-05, + "loss": 0.0306, + "step": 135 + }, + { + "epoch": 0.2621686746987952, + "grad_norm": 0.4575510323047638, + "learning_rate": 3.461538461538462e-05, + "loss": 0.0154, + "step": 136 + }, + { + "epoch": 0.2640963855421687, + "grad_norm": 1.1556978225708008, + "learning_rate": 3.4871794871794875e-05, + "loss": 0.0235, + "step": 137 + }, + { + "epoch": 0.26602409638554214, + "grad_norm": 0.6975051760673523, + "learning_rate": 3.512820512820513e-05, + "loss": 0.0453, + "step": 138 + }, + { + "epoch": 0.26795180722891565, + "grad_norm": 0.8686623573303223, + "learning_rate": 3.538461538461539e-05, + "loss": 0.0427, + "step": 139 + }, + { + "epoch": 0.26987951807228916, + "grad_norm": 2.0681848526000977, + "learning_rate": 3.5641025641025646e-05, + "loss": 0.04, + "step": 140 + }, + { + "epoch": 0.27180722891566267, + "grad_norm": 0.4397984445095062, + "learning_rate": 3.58974358974359e-05, + "loss": 0.0188, + "step": 141 + }, + { + "epoch": 0.2737349397590361, + "grad_norm": 0.5871334075927734, + "learning_rate": 3.615384615384616e-05, + "loss": 0.0253, + "step": 142 + }, + { + "epoch": 0.27566265060240963, + "grad_norm": 1.1078568696975708, + "learning_rate": 3.6410256410256416e-05, + "loss": 0.0316, + "step": 143 + }, + { + "epoch": 0.27759036144578314, + "grad_norm": 0.5691841840744019, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.0266, + "step": 144 + }, + { + "epoch": 0.27951807228915665, + "grad_norm": 0.7896255254745483, + "learning_rate": 3.692307692307693e-05, + "loss": 0.0281, + "step": 145 + }, + { + "epoch": 0.2814457831325301, + "grad_norm": 0.9988337159156799, + "learning_rate": 3.7179487179487186e-05, + "loss": 0.0295, + "step": 146 + }, + { + "epoch": 0.2833734939759036, + "grad_norm": 0.9811834692955017, + "learning_rate": 3.7435897435897436e-05, + "loss": 0.0322, + "step": 147 + }, + { + "epoch": 0.2853012048192771, + "grad_norm": 0.6503105759620667, + "learning_rate": 3.769230769230769e-05, + "loss": 0.0266, + "step": 148 + }, + { + "epoch": 0.28722891566265063, + "grad_norm": 1.9164355993270874, + "learning_rate": 3.794871794871795e-05, + "loss": 0.0677, + "step": 149 + }, + { + "epoch": 0.2891566265060241, + "grad_norm": 1.1724557876586914, + "learning_rate": 3.820512820512821e-05, + "loss": 0.0324, + "step": 150 + }, + { + "epoch": 0.2910843373493976, + "grad_norm": 0.8482469916343689, + "learning_rate": 3.846153846153846e-05, + "loss": 0.0259, + "step": 151 + }, + { + "epoch": 0.2930120481927711, + "grad_norm": 0.8572830557823181, + "learning_rate": 3.871794871794872e-05, + "loss": 0.0358, + "step": 152 + }, + { + "epoch": 0.29493975903614456, + "grad_norm": 0.6630825400352478, + "learning_rate": 3.8974358974358976e-05, + "loss": 0.0447, + "step": 153 + }, + { + "epoch": 0.29686746987951806, + "grad_norm": 0.9197093844413757, + "learning_rate": 3.923076923076923e-05, + "loss": 0.0409, + "step": 154 + }, + { + "epoch": 0.2987951807228916, + "grad_norm": 0.6976819634437561, + "learning_rate": 3.948717948717949e-05, + "loss": 0.0317, + "step": 155 + }, + { + "epoch": 0.3007228915662651, + "grad_norm": 0.7353514432907104, + "learning_rate": 3.9743589743589747e-05, + "loss": 0.0306, + "step": 156 + }, + { + "epoch": 0.30265060240963854, + "grad_norm": 0.5730232000350952, + "learning_rate": 4e-05, + "loss": 0.0324, + "step": 157 + }, + { + "epoch": 0.30457831325301205, + "grad_norm": 0.7852078676223755, + "learning_rate": 3.999994971675547e-05, + "loss": 0.0354, + "step": 158 + }, + { + "epoch": 0.30650602409638555, + "grad_norm": 0.5924715399742126, + "learning_rate": 3.999979886727471e-05, + "loss": 0.0366, + "step": 159 + }, + { + "epoch": 0.30843373493975906, + "grad_norm": 0.7359845638275146, + "learning_rate": 3.999954745231624e-05, + "loss": 0.0437, + "step": 160 + }, + { + "epoch": 0.3103614457831325, + "grad_norm": 0.7866976857185364, + "learning_rate": 3.999919547314426e-05, + "loss": 0.0363, + "step": 161 + }, + { + "epoch": 0.312289156626506, + "grad_norm": 0.7425745129585266, + "learning_rate": 3.999874293152863e-05, + "loss": 0.0259, + "step": 162 + }, + { + "epoch": 0.31421686746987953, + "grad_norm": 1.8922245502471924, + "learning_rate": 3.9998189829744885e-05, + "loss": 0.0341, + "step": 163 + }, + { + "epoch": 0.316144578313253, + "grad_norm": 0.7908634543418884, + "learning_rate": 3.99975361705742e-05, + "loss": 0.0424, + "step": 164 + }, + { + "epoch": 0.3180722891566265, + "grad_norm": 2.047368049621582, + "learning_rate": 3.999678195730337e-05, + "loss": 0.0535, + "step": 165 + }, + { + "epoch": 0.32, + "grad_norm": 0.5702639222145081, + "learning_rate": 3.999592719372484e-05, + "loss": 0.0284, + "step": 166 + }, + { + "epoch": 0.3219277108433735, + "grad_norm": 0.45015648007392883, + "learning_rate": 3.9994971884136636e-05, + "loss": 0.0313, + "step": 167 + }, + { + "epoch": 0.32385542168674697, + "grad_norm": 4.094679355621338, + "learning_rate": 3.9993916033342355e-05, + "loss": 0.0524, + "step": 168 + }, + { + "epoch": 0.3257831325301205, + "grad_norm": 0.800846517086029, + "learning_rate": 3.999275964665117e-05, + "loss": 0.0282, + "step": 169 + }, + { + "epoch": 0.327710843373494, + "grad_norm": 0.47881078720092773, + "learning_rate": 3.999150272987776e-05, + "loss": 0.0293, + "step": 170 + }, + { + "epoch": 0.3296385542168675, + "grad_norm": 0.5716657638549805, + "learning_rate": 3.999014528934232e-05, + "loss": 0.0221, + "step": 171 + }, + { + "epoch": 0.33156626506024095, + "grad_norm": 0.6333311200141907, + "learning_rate": 3.998868733187048e-05, + "loss": 0.0302, + "step": 172 + }, + { + "epoch": 0.33349397590361446, + "grad_norm": 6.642521858215332, + "learning_rate": 3.998712886479335e-05, + "loss": 0.0364, + "step": 173 + }, + { + "epoch": 0.33542168674698797, + "grad_norm": 0.7515506148338318, + "learning_rate": 3.998546989594739e-05, + "loss": 0.0296, + "step": 174 + }, + { + "epoch": 0.3373493975903614, + "grad_norm": 1.0728015899658203, + "learning_rate": 3.998371043367445e-05, + "loss": 0.0549, + "step": 175 + }, + { + "epoch": 0.33927710843373493, + "grad_norm": 1.3025579452514648, + "learning_rate": 3.998185048682166e-05, + "loss": 0.0577, + "step": 176 + }, + { + "epoch": 0.34120481927710844, + "grad_norm": 1.0962958335876465, + "learning_rate": 3.997989006474144e-05, + "loss": 0.0313, + "step": 177 + }, + { + "epoch": 0.34313253012048195, + "grad_norm": 0.7064313292503357, + "learning_rate": 3.997782917729143e-05, + "loss": 0.0309, + "step": 178 + }, + { + "epoch": 0.3450602409638554, + "grad_norm": 0.43374207615852356, + "learning_rate": 3.997566783483445e-05, + "loss": 0.0166, + "step": 179 + }, + { + "epoch": 0.3469879518072289, + "grad_norm": 0.7236390113830566, + "learning_rate": 3.9973406048238413e-05, + "loss": 0.0254, + "step": 180 + }, + { + "epoch": 0.3489156626506024, + "grad_norm": 0.5041500926017761, + "learning_rate": 3.9971043828876334e-05, + "loss": 0.0239, + "step": 181 + }, + { + "epoch": 0.35084337349397593, + "grad_norm": 1.2744532823562622, + "learning_rate": 3.9968581188626204e-05, + "loss": 0.0404, + "step": 182 + }, + { + "epoch": 0.3527710843373494, + "grad_norm": 0.45845362544059753, + "learning_rate": 3.996601813987098e-05, + "loss": 0.0127, + "step": 183 + }, + { + "epoch": 0.3546987951807229, + "grad_norm": 0.4426881968975067, + "learning_rate": 3.996335469549852e-05, + "loss": 0.0176, + "step": 184 + }, + { + "epoch": 0.3566265060240964, + "grad_norm": 1.0030732154846191, + "learning_rate": 3.9960590868901465e-05, + "loss": 0.0457, + "step": 185 + }, + { + "epoch": 0.35855421686746985, + "grad_norm": 0.6428582668304443, + "learning_rate": 3.995772667397725e-05, + "loss": 0.0271, + "step": 186 + }, + { + "epoch": 0.36048192771084336, + "grad_norm": 0.5335744619369507, + "learning_rate": 3.995476212512795e-05, + "loss": 0.0297, + "step": 187 + }, + { + "epoch": 0.3624096385542169, + "grad_norm": 0.6995761394500732, + "learning_rate": 3.99516972372603e-05, + "loss": 0.0322, + "step": 188 + }, + { + "epoch": 0.3643373493975904, + "grad_norm": 0.765511155128479, + "learning_rate": 3.9948532025785546e-05, + "loss": 0.0253, + "step": 189 + }, + { + "epoch": 0.36626506024096384, + "grad_norm": 0.6165828108787537, + "learning_rate": 3.9945266506619403e-05, + "loss": 0.0355, + "step": 190 + }, + { + "epoch": 0.36819277108433734, + "grad_norm": 0.851970911026001, + "learning_rate": 3.994190069618195e-05, + "loss": 0.056, + "step": 191 + }, + { + "epoch": 0.37012048192771085, + "grad_norm": 0.9850023984909058, + "learning_rate": 3.993843461139757e-05, + "loss": 0.0415, + "step": 192 + }, + { + "epoch": 0.37204819277108436, + "grad_norm": 0.7455295324325562, + "learning_rate": 3.9934868269694886e-05, + "loss": 0.0379, + "step": 193 + }, + { + "epoch": 0.3739759036144578, + "grad_norm": 1.159469723701477, + "learning_rate": 3.9931201689006595e-05, + "loss": 0.0237, + "step": 194 + }, + { + "epoch": 0.3759036144578313, + "grad_norm": 0.5490080118179321, + "learning_rate": 3.992743488776947e-05, + "loss": 0.024, + "step": 195 + }, + { + "epoch": 0.37783132530120483, + "grad_norm": 1.279831886291504, + "learning_rate": 3.992356788492421e-05, + "loss": 0.0273, + "step": 196 + }, + { + "epoch": 0.3797590361445783, + "grad_norm": 0.859104335308075, + "learning_rate": 3.9919600699915355e-05, + "loss": 0.0411, + "step": 197 + }, + { + "epoch": 0.3816867469879518, + "grad_norm": 1.2525300979614258, + "learning_rate": 3.991553335269119e-05, + "loss": 0.0857, + "step": 198 + }, + { + "epoch": 0.3836144578313253, + "grad_norm": 0.4924193024635315, + "learning_rate": 3.991136586370367e-05, + "loss": 0.0294, + "step": 199 + }, + { + "epoch": 0.3855421686746988, + "grad_norm": 1.417190670967102, + "learning_rate": 3.990709825390828e-05, + "loss": 0.0395, + "step": 200 + }, + { + "epoch": 0.38746987951807227, + "grad_norm": 0.6172056198120117, + "learning_rate": 3.9902730544763936e-05, + "loss": 0.0194, + "step": 201 + }, + { + "epoch": 0.3893975903614458, + "grad_norm": 0.7292149662971497, + "learning_rate": 3.989826275823291e-05, + "loss": 0.0381, + "step": 202 + }, + { + "epoch": 0.3913253012048193, + "grad_norm": 0.5949816107749939, + "learning_rate": 3.989369491678067e-05, + "loss": 0.0254, + "step": 203 + }, + { + "epoch": 0.3932530120481928, + "grad_norm": 0.6012582182884216, + "learning_rate": 3.988902704337582e-05, + "loss": 0.048, + "step": 204 + }, + { + "epoch": 0.39518072289156625, + "grad_norm": 0.6273590922355652, + "learning_rate": 3.9884259161489936e-05, + "loss": 0.0268, + "step": 205 + }, + { + "epoch": 0.39710843373493976, + "grad_norm": 0.9615244269371033, + "learning_rate": 3.987939129509746e-05, + "loss": 0.0192, + "step": 206 + }, + { + "epoch": 0.39903614457831327, + "grad_norm": 0.6009241342544556, + "learning_rate": 3.9874423468675624e-05, + "loss": 0.0362, + "step": 207 + }, + { + "epoch": 0.4009638554216867, + "grad_norm": 0.411335289478302, + "learning_rate": 3.9869355707204266e-05, + "loss": 0.017, + "step": 208 + }, + { + "epoch": 0.40289156626506023, + "grad_norm": 0.6151527166366577, + "learning_rate": 3.986418803616573e-05, + "loss": 0.0283, + "step": 209 + }, + { + "epoch": 0.40481927710843374, + "grad_norm": 0.33808204531669617, + "learning_rate": 3.985892048154474e-05, + "loss": 0.0158, + "step": 210 + }, + { + "epoch": 0.40674698795180725, + "grad_norm": 0.5464187860488892, + "learning_rate": 3.9853553069828284e-05, + "loss": 0.0292, + "step": 211 + }, + { + "epoch": 0.4086746987951807, + "grad_norm": 0.6658390760421753, + "learning_rate": 3.984808582800543e-05, + "loss": 0.0281, + "step": 212 + }, + { + "epoch": 0.4106024096385542, + "grad_norm": 0.4253764748573303, + "learning_rate": 3.984251878356726e-05, + "loss": 0.031, + "step": 213 + }, + { + "epoch": 0.4125301204819277, + "grad_norm": 0.32309481501579285, + "learning_rate": 3.983685196450667e-05, + "loss": 0.0166, + "step": 214 + }, + { + "epoch": 0.41445783132530123, + "grad_norm": 0.43756410479545593, + "learning_rate": 3.9831085399318265e-05, + "loss": 0.0326, + "step": 215 + }, + { + "epoch": 0.4163855421686747, + "grad_norm": 0.264046847820282, + "learning_rate": 3.982521911699822e-05, + "loss": 0.0118, + "step": 216 + }, + { + "epoch": 0.4183132530120482, + "grad_norm": 0.8630897402763367, + "learning_rate": 3.9819253147044084e-05, + "loss": 0.0246, + "step": 217 + }, + { + "epoch": 0.4202409638554217, + "grad_norm": 0.6923379898071289, + "learning_rate": 3.98131875194547e-05, + "loss": 0.036, + "step": 218 + }, + { + "epoch": 0.42216867469879515, + "grad_norm": 0.5874778628349304, + "learning_rate": 3.9807022264730024e-05, + "loss": 0.0255, + "step": 219 + }, + { + "epoch": 0.42409638554216866, + "grad_norm": 0.394336074590683, + "learning_rate": 3.980075741387094e-05, + "loss": 0.0187, + "step": 220 + }, + { + "epoch": 0.4260240963855422, + "grad_norm": 0.6300327777862549, + "learning_rate": 3.979439299837915e-05, + "loss": 0.0214, + "step": 221 + }, + { + "epoch": 0.4279518072289157, + "grad_norm": 0.5200467109680176, + "learning_rate": 3.978792905025702e-05, + "loss": 0.0628, + "step": 222 + }, + { + "epoch": 0.42987951807228914, + "grad_norm": 0.5713880062103271, + "learning_rate": 3.978136560200735e-05, + "loss": 0.0302, + "step": 223 + }, + { + "epoch": 0.43180722891566264, + "grad_norm": 0.5345383286476135, + "learning_rate": 3.977470268663331e-05, + "loss": 0.0125, + "step": 224 + }, + { + "epoch": 0.43373493975903615, + "grad_norm": 0.5378350019454956, + "learning_rate": 3.976794033763819e-05, + "loss": 0.0246, + "step": 225 + }, + { + "epoch": 0.43566265060240966, + "grad_norm": 0.5554935336112976, + "learning_rate": 3.9761078589025276e-05, + "loss": 0.0212, + "step": 226 + }, + { + "epoch": 0.4375903614457831, + "grad_norm": 0.2832634747028351, + "learning_rate": 3.9754117475297664e-05, + "loss": 0.0125, + "step": 227 + }, + { + "epoch": 0.4395180722891566, + "grad_norm": 1.2910150289535522, + "learning_rate": 3.97470570314581e-05, + "loss": 0.0364, + "step": 228 + }, + { + "epoch": 0.44144578313253013, + "grad_norm": 0.3731018602848053, + "learning_rate": 3.973989729300878e-05, + "loss": 0.0128, + "step": 229 + }, + { + "epoch": 0.4433734939759036, + "grad_norm": 0.9433871507644653, + "learning_rate": 3.9732638295951195e-05, + "loss": 0.0367, + "step": 230 + }, + { + "epoch": 0.4453012048192771, + "grad_norm": 1.0779197216033936, + "learning_rate": 3.972528007678594e-05, + "loss": 0.0667, + "step": 231 + }, + { + "epoch": 0.4472289156626506, + "grad_norm": 1.7009105682373047, + "learning_rate": 3.9717822672512516e-05, + "loss": 0.0655, + "step": 232 + }, + { + "epoch": 0.4491566265060241, + "grad_norm": 0.5646032094955444, + "learning_rate": 3.971026612062919e-05, + "loss": 0.064, + "step": 233 + }, + { + "epoch": 0.45108433734939757, + "grad_norm": 0.44474121928215027, + "learning_rate": 3.970261045913274e-05, + "loss": 0.0206, + "step": 234 + }, + { + "epoch": 0.4530120481927711, + "grad_norm": 1.3969277143478394, + "learning_rate": 3.969485572651833e-05, + "loss": 0.0486, + "step": 235 + }, + { + "epoch": 0.4549397590361446, + "grad_norm": 0.6401994228363037, + "learning_rate": 3.968700196177925e-05, + "loss": 0.0262, + "step": 236 + }, + { + "epoch": 0.4568674698795181, + "grad_norm": 0.7091913223266602, + "learning_rate": 3.96790492044068e-05, + "loss": 0.014, + "step": 237 + }, + { + "epoch": 0.45879518072289155, + "grad_norm": 0.6561547517776489, + "learning_rate": 3.967099749439002e-05, + "loss": 0.0482, + "step": 238 + }, + { + "epoch": 0.46072289156626506, + "grad_norm": 0.6924155354499817, + "learning_rate": 3.966284687221551e-05, + "loss": 0.0289, + "step": 239 + }, + { + "epoch": 0.46265060240963857, + "grad_norm": 0.5868663787841797, + "learning_rate": 3.9654597378867256e-05, + "loss": 0.0331, + "step": 240 + }, + { + "epoch": 0.464578313253012, + "grad_norm": 0.7930939793586731, + "learning_rate": 3.964624905582637e-05, + "loss": 0.0925, + "step": 241 + }, + { + "epoch": 0.46650602409638553, + "grad_norm": 0.4888836145401001, + "learning_rate": 3.9637801945070944e-05, + "loss": 0.015, + "step": 242 + }, + { + "epoch": 0.46843373493975904, + "grad_norm": 0.7820287346839905, + "learning_rate": 3.962925608907579e-05, + "loss": 0.0382, + "step": 243 + }, + { + "epoch": 0.47036144578313255, + "grad_norm": 0.4914316236972809, + "learning_rate": 3.962061153081224e-05, + "loss": 0.0257, + "step": 244 + }, + { + "epoch": 0.472289156626506, + "grad_norm": 0.5681505799293518, + "learning_rate": 3.961186831374793e-05, + "loss": 0.0551, + "step": 245 + }, + { + "epoch": 0.4742168674698795, + "grad_norm": 0.5049723386764526, + "learning_rate": 3.9603026481846616e-05, + "loss": 0.0186, + "step": 246 + }, + { + "epoch": 0.476144578313253, + "grad_norm": 0.5034119486808777, + "learning_rate": 3.959408607956787e-05, + "loss": 0.024, + "step": 247 + }, + { + "epoch": 0.47807228915662653, + "grad_norm": 0.4543336033821106, + "learning_rate": 3.958504715186695e-05, + "loss": 0.0256, + "step": 248 + }, + { + "epoch": 0.48, + "grad_norm": 0.5595743656158447, + "learning_rate": 3.957590974419452e-05, + "loss": 0.0222, + "step": 249 + }, + { + "epoch": 0.4819277108433735, + "grad_norm": 0.5701581239700317, + "learning_rate": 3.956667390249642e-05, + "loss": 0.0334, + "step": 250 + }, + { + "epoch": 0.483855421686747, + "grad_norm": 0.53755784034729, + "learning_rate": 3.9557339673213474e-05, + "loss": 0.0345, + "step": 251 + }, + { + "epoch": 0.4857831325301205, + "grad_norm": 0.4368877112865448, + "learning_rate": 3.95479071032812e-05, + "loss": 0.0183, + "step": 252 + }, + { + "epoch": 0.48771084337349396, + "grad_norm": 0.7972906827926636, + "learning_rate": 3.953837624012963e-05, + "loss": 0.0337, + "step": 253 + }, + { + "epoch": 0.48963855421686747, + "grad_norm": 0.6148451566696167, + "learning_rate": 3.9528747131683023e-05, + "loss": 0.0524, + "step": 254 + }, + { + "epoch": 0.491566265060241, + "grad_norm": 0.500840961933136, + "learning_rate": 3.9519019826359676e-05, + "loss": 0.0248, + "step": 255 + }, + { + "epoch": 0.49349397590361443, + "grad_norm": 0.5536255240440369, + "learning_rate": 3.9509194373071624e-05, + "loss": 0.0219, + "step": 256 + }, + { + "epoch": 0.49542168674698794, + "grad_norm": 0.6873176097869873, + "learning_rate": 3.9499270821224444e-05, + "loss": 0.0312, + "step": 257 + }, + { + "epoch": 0.49734939759036145, + "grad_norm": 0.37207168340682983, + "learning_rate": 3.9489249220716974e-05, + "loss": 0.0149, + "step": 258 + }, + { + "epoch": 0.49927710843373496, + "grad_norm": 0.4458799660205841, + "learning_rate": 3.947912962194107e-05, + "loss": 0.0214, + "step": 259 + }, + { + "epoch": 0.5012048192771085, + "grad_norm": 0.4272724390029907, + "learning_rate": 3.9468912075781345e-05, + "loss": 0.0263, + "step": 260 + }, + { + "epoch": 0.503132530120482, + "grad_norm": 0.5245792269706726, + "learning_rate": 3.945859663361496e-05, + "loss": 0.0103, + "step": 261 + }, + { + "epoch": 0.5050602409638554, + "grad_norm": 0.8799260854721069, + "learning_rate": 3.9448183347311284e-05, + "loss": 0.0292, + "step": 262 + }, + { + "epoch": 0.5069879518072289, + "grad_norm": 0.5996833443641663, + "learning_rate": 3.943767226923171e-05, + "loss": 0.0306, + "step": 263 + }, + { + "epoch": 0.5089156626506024, + "grad_norm": 0.6044682860374451, + "learning_rate": 3.942706345222935e-05, + "loss": 0.0218, + "step": 264 + }, + { + "epoch": 0.5108433734939759, + "grad_norm": 0.4770200848579407, + "learning_rate": 3.941635694964878e-05, + "loss": 0.0226, + "step": 265 + }, + { + "epoch": 0.5127710843373494, + "grad_norm": 0.5605704188346863, + "learning_rate": 3.940555281532576e-05, + "loss": 0.0354, + "step": 266 + }, + { + "epoch": 0.5146987951807229, + "grad_norm": 0.46532443165779114, + "learning_rate": 3.939465110358699e-05, + "loss": 0.0223, + "step": 267 + }, + { + "epoch": 0.5166265060240964, + "grad_norm": 0.5190595388412476, + "learning_rate": 3.93836518692498e-05, + "loss": 0.0219, + "step": 268 + }, + { + "epoch": 0.5185542168674698, + "grad_norm": 0.5767757892608643, + "learning_rate": 3.937255516762193e-05, + "loss": 0.0294, + "step": 269 + }, + { + "epoch": 0.5204819277108433, + "grad_norm": 0.4543164372444153, + "learning_rate": 3.936136105450119e-05, + "loss": 0.0244, + "step": 270 + }, + { + "epoch": 0.5224096385542168, + "grad_norm": 0.4155154526233673, + "learning_rate": 3.9350069586175195e-05, + "loss": 0.02, + "step": 271 + }, + { + "epoch": 0.5243373493975904, + "grad_norm": 0.5470768213272095, + "learning_rate": 3.933868081942113e-05, + "loss": 0.0187, + "step": 272 + }, + { + "epoch": 0.5262650602409639, + "grad_norm": 0.9491772651672363, + "learning_rate": 3.9327194811505406e-05, + "loss": 0.0337, + "step": 273 + }, + { + "epoch": 0.5281927710843374, + "grad_norm": 0.9313873052597046, + "learning_rate": 3.93156116201834e-05, + "loss": 0.0573, + "step": 274 + }, + { + "epoch": 0.5301204819277109, + "grad_norm": 0.7181005477905273, + "learning_rate": 3.930393130369915e-05, + "loss": 0.0405, + "step": 275 + }, + { + "epoch": 0.5320481927710843, + "grad_norm": 0.34231385588645935, + "learning_rate": 3.9292153920785076e-05, + "loss": 0.0153, + "step": 276 + }, + { + "epoch": 0.5339759036144578, + "grad_norm": 0.6899610161781311, + "learning_rate": 3.928027953066168e-05, + "loss": 0.0338, + "step": 277 + }, + { + "epoch": 0.5359036144578313, + "grad_norm": 0.7509781718254089, + "learning_rate": 3.926830819303726e-05, + "loss": 0.0416, + "step": 278 + }, + { + "epoch": 0.5378313253012048, + "grad_norm": 0.6326774954795837, + "learning_rate": 3.925623996810757e-05, + "loss": 0.0293, + "step": 279 + }, + { + "epoch": 0.5397590361445783, + "grad_norm": 0.5543203353881836, + "learning_rate": 3.924407491655557e-05, + "loss": 0.0263, + "step": 280 + }, + { + "epoch": 0.5416867469879518, + "grad_norm": 0.5367572903633118, + "learning_rate": 3.9231813099551086e-05, + "loss": 0.0276, + "step": 281 + }, + { + "epoch": 0.5436144578313253, + "grad_norm": 0.3143869638442993, + "learning_rate": 3.921945457875051e-05, + "loss": 0.0146, + "step": 282 + }, + { + "epoch": 0.5455421686746988, + "grad_norm": 0.47403043508529663, + "learning_rate": 3.920699941629649e-05, + "loss": 0.0267, + "step": 283 + }, + { + "epoch": 0.5474698795180722, + "grad_norm": 0.5082595348358154, + "learning_rate": 3.919444767481763e-05, + "loss": 0.0183, + "step": 284 + }, + { + "epoch": 0.5493975903614458, + "grad_norm": 0.747949481010437, + "learning_rate": 3.918179941742816e-05, + "loss": 0.0412, + "step": 285 + }, + { + "epoch": 0.5513253012048193, + "grad_norm": 0.6553886532783508, + "learning_rate": 3.916905470772762e-05, + "loss": 0.0505, + "step": 286 + }, + { + "epoch": 0.5532530120481928, + "grad_norm": 0.3838176131248474, + "learning_rate": 3.9156213609800545e-05, + "loss": 0.0156, + "step": 287 + }, + { + "epoch": 0.5551807228915663, + "grad_norm": 0.7427731156349182, + "learning_rate": 3.914327618821614e-05, + "loss": 0.0278, + "step": 288 + }, + { + "epoch": 0.5571084337349398, + "grad_norm": 0.2612821161746979, + "learning_rate": 3.913024250802796e-05, + "loss": 0.0101, + "step": 289 + }, + { + "epoch": 0.5590361445783133, + "grad_norm": 0.3799416124820709, + "learning_rate": 3.911711263477357e-05, + "loss": 0.0168, + "step": 290 + }, + { + "epoch": 0.5609638554216867, + "grad_norm": 0.5053854584693909, + "learning_rate": 3.910388663447425e-05, + "loss": 0.0249, + "step": 291 + }, + { + "epoch": 0.5628915662650602, + "grad_norm": 0.38095012307167053, + "learning_rate": 3.909056457363461e-05, + "loss": 0.0156, + "step": 292 + }, + { + "epoch": 0.5648192771084337, + "grad_norm": 0.4477892220020294, + "learning_rate": 3.907714651924229e-05, + "loss": 0.0309, + "step": 293 + }, + { + "epoch": 0.5667469879518072, + "grad_norm": 0.5875864624977112, + "learning_rate": 3.906363253876763e-05, + "loss": 0.0287, + "step": 294 + }, + { + "epoch": 0.5686746987951807, + "grad_norm": 0.522990882396698, + "learning_rate": 3.90500227001633e-05, + "loss": 0.0318, + "step": 295 + }, + { + "epoch": 0.5706024096385542, + "grad_norm": 0.4153876304626465, + "learning_rate": 3.9036317071863994e-05, + "loss": 0.0192, + "step": 296 + }, + { + "epoch": 0.5725301204819278, + "grad_norm": 0.4675769507884979, + "learning_rate": 3.902251572278605e-05, + "loss": 0.067, + "step": 297 + }, + { + "epoch": 0.5744578313253013, + "grad_norm": 0.35778650641441345, + "learning_rate": 3.900861872232713e-05, + "loss": 0.0197, + "step": 298 + }, + { + "epoch": 0.5763855421686747, + "grad_norm": 0.7382330894470215, + "learning_rate": 3.899462614036587e-05, + "loss": 0.0283, + "step": 299 + }, + { + "epoch": 0.5783132530120482, + "grad_norm": 0.41268599033355713, + "learning_rate": 3.89805380472615e-05, + "loss": 0.0207, + "step": 300 + }, + { + "epoch": 0.5802409638554217, + "grad_norm": 1.2013020515441895, + "learning_rate": 3.8966354513853535e-05, + "loss": 0.0301, + "step": 301 + }, + { + "epoch": 0.5821686746987952, + "grad_norm": 0.424757719039917, + "learning_rate": 3.895207561146137e-05, + "loss": 0.022, + "step": 302 + }, + { + "epoch": 0.5840963855421687, + "grad_norm": 0.4196677505970001, + "learning_rate": 3.893770141188396e-05, + "loss": 0.0424, + "step": 303 + }, + { + "epoch": 0.5860240963855422, + "grad_norm": 0.8644190430641174, + "learning_rate": 3.892323198739946e-05, + "loss": 0.08, + "step": 304 + }, + { + "epoch": 0.5879518072289157, + "grad_norm": 0.5645135045051575, + "learning_rate": 3.890866741076482e-05, + "loss": 0.0152, + "step": 305 + }, + { + "epoch": 0.5898795180722891, + "grad_norm": 0.5218387246131897, + "learning_rate": 3.889400775521545e-05, + "loss": 0.0205, + "step": 306 + }, + { + "epoch": 0.5918072289156626, + "grad_norm": 0.39709413051605225, + "learning_rate": 3.8879253094464865e-05, + "loss": 0.0233, + "step": 307 + }, + { + "epoch": 0.5937349397590361, + "grad_norm": 0.3572910726070404, + "learning_rate": 3.8864403502704285e-05, + "loss": 0.0198, + "step": 308 + }, + { + "epoch": 0.5956626506024096, + "grad_norm": 0.382709264755249, + "learning_rate": 3.8849459054602274e-05, + "loss": 0.0176, + "step": 309 + }, + { + "epoch": 0.5975903614457831, + "grad_norm": 3.4527227878570557, + "learning_rate": 3.883441982530436e-05, + "loss": 0.0239, + "step": 310 + }, + { + "epoch": 0.5995180722891567, + "grad_norm": 0.4467569589614868, + "learning_rate": 3.8819285890432674e-05, + "loss": 0.0284, + "step": 311 + }, + { + "epoch": 0.6014457831325302, + "grad_norm": 0.44513460993766785, + "learning_rate": 3.880405732608555e-05, + "loss": 0.0233, + "step": 312 + }, + { + "epoch": 0.6033734939759036, + "grad_norm": 0.8029689192771912, + "learning_rate": 3.8788734208837155e-05, + "loss": 0.0433, + "step": 313 + }, + { + "epoch": 0.6053012048192771, + "grad_norm": 0.7291454076766968, + "learning_rate": 3.877331661573709e-05, + "loss": 0.043, + "step": 314 + }, + { + "epoch": 0.6072289156626506, + "grad_norm": 0.6050467491149902, + "learning_rate": 3.8757804624310006e-05, + "loss": 0.0377, + "step": 315 + }, + { + "epoch": 0.6091566265060241, + "grad_norm": 0.6714366674423218, + "learning_rate": 3.874219831255524e-05, + "loss": 0.046, + "step": 316 + }, + { + "epoch": 0.6110843373493976, + "grad_norm": 0.336037278175354, + "learning_rate": 3.8726497758946394e-05, + "loss": 0.0149, + "step": 317 + }, + { + "epoch": 0.6130120481927711, + "grad_norm": 0.3057402968406677, + "learning_rate": 3.871070304243094e-05, + "loss": 0.014, + "step": 318 + }, + { + "epoch": 0.6149397590361446, + "grad_norm": 0.4537644684314728, + "learning_rate": 3.8694814242429834e-05, + "loss": 0.0503, + "step": 319 + }, + { + "epoch": 0.6168674698795181, + "grad_norm": 0.45573824644088745, + "learning_rate": 3.8678831438837116e-05, + "loss": 0.021, + "step": 320 + }, + { + "epoch": 0.6187951807228915, + "grad_norm": 0.30729591846466064, + "learning_rate": 3.866275471201952e-05, + "loss": 0.0163, + "step": 321 + }, + { + "epoch": 0.620722891566265, + "grad_norm": 0.7614850401878357, + "learning_rate": 3.8646584142816036e-05, + "loss": 0.0347, + "step": 322 + }, + { + "epoch": 0.6226506024096385, + "grad_norm": 0.5323611497879028, + "learning_rate": 3.863031981253754e-05, + "loss": 0.0201, + "step": 323 + }, + { + "epoch": 0.624578313253012, + "grad_norm": 0.34426453709602356, + "learning_rate": 3.861396180296635e-05, + "loss": 0.0243, + "step": 324 + }, + { + "epoch": 0.6265060240963856, + "grad_norm": 0.621636152267456, + "learning_rate": 3.859751019635585e-05, + "loss": 0.0166, + "step": 325 + }, + { + "epoch": 0.6284337349397591, + "grad_norm": 0.549324095249176, + "learning_rate": 3.858096507543006e-05, + "loss": 0.0274, + "step": 326 + }, + { + "epoch": 0.6303614457831326, + "grad_norm": 0.358426570892334, + "learning_rate": 3.8564326523383214e-05, + "loss": 0.0207, + "step": 327 + }, + { + "epoch": 0.632289156626506, + "grad_norm": 0.3639723062515259, + "learning_rate": 3.8547594623879346e-05, + "loss": 0.0297, + "step": 328 + }, + { + "epoch": 0.6342168674698795, + "grad_norm": 0.3402212858200073, + "learning_rate": 3.853076946105188e-05, + "loss": 0.0258, + "step": 329 + }, + { + "epoch": 0.636144578313253, + "grad_norm": 0.4083027243614197, + "learning_rate": 3.85138511195032e-05, + "loss": 0.0351, + "step": 330 + }, + { + "epoch": 0.6380722891566265, + "grad_norm": 0.43532121181488037, + "learning_rate": 3.84968396843042e-05, + "loss": 0.0388, + "step": 331 + }, + { + "epoch": 0.64, + "grad_norm": 0.35353463888168335, + "learning_rate": 3.8479735240993904e-05, + "loss": 0.0203, + "step": 332 + }, + { + "epoch": 0.6419277108433735, + "grad_norm": 0.350149929523468, + "learning_rate": 3.846253787557901e-05, + "loss": 0.0261, + "step": 333 + }, + { + "epoch": 0.643855421686747, + "grad_norm": 0.7665389180183411, + "learning_rate": 3.844524767453344e-05, + "loss": 0.0108, + "step": 334 + }, + { + "epoch": 0.6457831325301204, + "grad_norm": 0.44621360301971436, + "learning_rate": 3.842786472479795e-05, + "loss": 0.0282, + "step": 335 + }, + { + "epoch": 0.6477108433734939, + "grad_norm": 0.7787201404571533, + "learning_rate": 3.841038911377962e-05, + "loss": 0.0216, + "step": 336 + }, + { + "epoch": 0.6496385542168674, + "grad_norm": 0.48260653018951416, + "learning_rate": 3.839282092935153e-05, + "loss": 0.0234, + "step": 337 + }, + { + "epoch": 0.651566265060241, + "grad_norm": 0.4987852871417999, + "learning_rate": 3.837516025985219e-05, + "loss": 0.0515, + "step": 338 + }, + { + "epoch": 0.6534939759036145, + "grad_norm": 0.9030266404151917, + "learning_rate": 3.835740719408517e-05, + "loss": 0.0508, + "step": 339 + }, + { + "epoch": 0.655421686746988, + "grad_norm": 0.6381701231002808, + "learning_rate": 3.833956182131867e-05, + "loss": 0.0405, + "step": 340 + }, + { + "epoch": 0.6573493975903615, + "grad_norm": 0.42828986048698425, + "learning_rate": 3.832162423128499e-05, + "loss": 0.024, + "step": 341 + }, + { + "epoch": 0.659277108433735, + "grad_norm": 0.38725873827934265, + "learning_rate": 3.8303594514180164e-05, + "loss": 0.0199, + "step": 342 + }, + { + "epoch": 0.6612048192771084, + "grad_norm": 0.23280498385429382, + "learning_rate": 3.828547276066346e-05, + "loss": 0.0101, + "step": 343 + }, + { + "epoch": 0.6631325301204819, + "grad_norm": 0.7298216819763184, + "learning_rate": 3.8267259061856925e-05, + "loss": 0.0455, + "step": 344 + }, + { + "epoch": 0.6650602409638554, + "grad_norm": 0.5975687503814697, + "learning_rate": 3.824895350934496e-05, + "loss": 0.0372, + "step": 345 + }, + { + "epoch": 0.6669879518072289, + "grad_norm": 0.6295403242111206, + "learning_rate": 3.823055619517381e-05, + "loss": 0.0362, + "step": 346 + }, + { + "epoch": 0.6689156626506024, + "grad_norm": 0.5086020827293396, + "learning_rate": 3.821206721185115e-05, + "loss": 0.0368, + "step": 347 + }, + { + "epoch": 0.6708433734939759, + "grad_norm": 0.34506168961524963, + "learning_rate": 3.819348665234557e-05, + "loss": 0.0178, + "step": 348 + }, + { + "epoch": 0.6727710843373494, + "grad_norm": 1.309940218925476, + "learning_rate": 3.817481461008617e-05, + "loss": 0.024, + "step": 349 + }, + { + "epoch": 0.6746987951807228, + "grad_norm": 0.4074770510196686, + "learning_rate": 3.815605117896204e-05, + "loss": 0.0262, + "step": 350 + }, + { + "epoch": 0.6766265060240964, + "grad_norm": 0.48525840044021606, + "learning_rate": 3.8137196453321775e-05, + "loss": 0.0209, + "step": 351 + }, + { + "epoch": 0.6785542168674699, + "grad_norm": 0.7199739217758179, + "learning_rate": 3.811825052797308e-05, + "loss": 0.0396, + "step": 352 + }, + { + "epoch": 0.6804819277108434, + "grad_norm": 0.519540011882782, + "learning_rate": 3.8099213498182196e-05, + "loss": 0.0453, + "step": 353 + }, + { + "epoch": 0.6824096385542169, + "grad_norm": 0.9738391041755676, + "learning_rate": 3.808008545967349e-05, + "loss": 0.0317, + "step": 354 + }, + { + "epoch": 0.6843373493975904, + "grad_norm": 1.888344407081604, + "learning_rate": 3.8060866508628953e-05, + "loss": 0.0452, + "step": 355 + }, + { + "epoch": 0.6862650602409639, + "grad_norm": 0.48989811539649963, + "learning_rate": 3.8041556741687695e-05, + "loss": 0.0315, + "step": 356 + }, + { + "epoch": 0.6881927710843373, + "grad_norm": 0.3764645457267761, + "learning_rate": 3.8022156255945496e-05, + "loss": 0.0269, + "step": 357 + }, + { + "epoch": 0.6901204819277108, + "grad_norm": 0.46409738063812256, + "learning_rate": 3.800266514895429e-05, + "loss": 0.0171, + "step": 358 + }, + { + "epoch": 0.6920481927710843, + "grad_norm": 0.41091030836105347, + "learning_rate": 3.7983083518721695e-05, + "loss": 0.0167, + "step": 359 + }, + { + "epoch": 0.6939759036144578, + "grad_norm": 0.8375523090362549, + "learning_rate": 3.79634114637105e-05, + "loss": 0.0342, + "step": 360 + }, + { + "epoch": 0.6959036144578313, + "grad_norm": 1.7053394317626953, + "learning_rate": 3.794364908283817e-05, + "loss": 0.02, + "step": 361 + }, + { + "epoch": 0.6978313253012048, + "grad_norm": 0.4163115918636322, + "learning_rate": 3.792379647547637e-05, + "loss": 0.0138, + "step": 362 + }, + { + "epoch": 0.6997590361445784, + "grad_norm": 0.388751745223999, + "learning_rate": 3.790385374145046e-05, + "loss": 0.0172, + "step": 363 + }, + { + "epoch": 0.7016867469879519, + "grad_norm": 0.5584064722061157, + "learning_rate": 3.7883820981038966e-05, + "loss": 0.0254, + "step": 364 + }, + { + "epoch": 0.7036144578313253, + "grad_norm": 1.394264817237854, + "learning_rate": 3.7863698294973114e-05, + "loss": 0.037, + "step": 365 + }, + { + "epoch": 0.7055421686746988, + "grad_norm": 0.46280744671821594, + "learning_rate": 3.78434857844363e-05, + "loss": 0.0234, + "step": 366 + }, + { + "epoch": 0.7074698795180723, + "grad_norm": 0.39548924565315247, + "learning_rate": 3.782318355106358e-05, + "loss": 0.0164, + "step": 367 + }, + { + "epoch": 0.7093975903614458, + "grad_norm": 0.7307773232460022, + "learning_rate": 3.780279169694118e-05, + "loss": 0.0192, + "step": 368 + }, + { + "epoch": 0.7113253012048193, + "grad_norm": 0.28035807609558105, + "learning_rate": 3.778231032460594e-05, + "loss": 0.0131, + "step": 369 + }, + { + "epoch": 0.7132530120481928, + "grad_norm": 0.8376953601837158, + "learning_rate": 3.776173953704486e-05, + "loss": 0.0291, + "step": 370 + }, + { + "epoch": 0.7151807228915663, + "grad_norm": 0.7356843948364258, + "learning_rate": 3.774107943769454e-05, + "loss": 0.0214, + "step": 371 + }, + { + "epoch": 0.7171084337349397, + "grad_norm": 0.41503390669822693, + "learning_rate": 3.772033013044064e-05, + "loss": 0.0221, + "step": 372 + }, + { + "epoch": 0.7190361445783132, + "grad_norm": 0.35732385516166687, + "learning_rate": 3.7699491719617436e-05, + "loss": 0.015, + "step": 373 + }, + { + "epoch": 0.7209638554216867, + "grad_norm": 0.283778578042984, + "learning_rate": 3.76785643100072e-05, + "loss": 0.0146, + "step": 374 + }, + { + "epoch": 0.7228915662650602, + "grad_norm": 0.3219413459300995, + "learning_rate": 3.765754800683974e-05, + "loss": 0.015, + "step": 375 + }, + { + "epoch": 0.7248192771084337, + "grad_norm": 0.610431432723999, + "learning_rate": 3.7636442915791856e-05, + "loss": 0.0326, + "step": 376 + }, + { + "epoch": 0.7267469879518073, + "grad_norm": 4.944870948791504, + "learning_rate": 3.7615249142986784e-05, + "loss": 0.0432, + "step": 377 + }, + { + "epoch": 0.7286746987951808, + "grad_norm": 0.4894593060016632, + "learning_rate": 3.7593966794993696e-05, + "loss": 0.0174, + "step": 378 + }, + { + "epoch": 0.7306024096385542, + "grad_norm": 0.4211325943470001, + "learning_rate": 3.757259597882714e-05, + "loss": 0.023, + "step": 379 + }, + { + "epoch": 0.7325301204819277, + "grad_norm": 0.33621737360954285, + "learning_rate": 3.755113680194651e-05, + "loss": 0.0201, + "step": 380 + }, + { + "epoch": 0.7344578313253012, + "grad_norm": 0.5799694657325745, + "learning_rate": 3.7529589372255514e-05, + "loss": 0.0173, + "step": 381 + }, + { + "epoch": 0.7363855421686747, + "grad_norm": 0.5172572731971741, + "learning_rate": 3.750795379810162e-05, + "loss": 0.0284, + "step": 382 + }, + { + "epoch": 0.7383132530120482, + "grad_norm": 0.5715453028678894, + "learning_rate": 3.748623018827552e-05, + "loss": 0.0194, + "step": 383 + }, + { + "epoch": 0.7402409638554217, + "grad_norm": 0.5284178256988525, + "learning_rate": 3.746441865201056e-05, + "loss": 0.0247, + "step": 384 + }, + { + "epoch": 0.7421686746987952, + "grad_norm": 0.37828654050827026, + "learning_rate": 3.744251929898223e-05, + "loss": 0.0097, + "step": 385 + }, + { + "epoch": 0.7440963855421687, + "grad_norm": 0.3252779543399811, + "learning_rate": 3.742053223930758e-05, + "loss": 0.0238, + "step": 386 + }, + { + "epoch": 0.7460240963855421, + "grad_norm": 0.6031543612480164, + "learning_rate": 3.7398457583544674e-05, + "loss": 0.0332, + "step": 387 + }, + { + "epoch": 0.7479518072289156, + "grad_norm": 0.23846614360809326, + "learning_rate": 3.737629544269206e-05, + "loss": 0.0122, + "step": 388 + }, + { + "epoch": 0.7498795180722891, + "grad_norm": 0.5274029970169067, + "learning_rate": 3.7354045928188155e-05, + "loss": 0.0324, + "step": 389 + }, + { + "epoch": 0.7518072289156627, + "grad_norm": 0.4672217071056366, + "learning_rate": 3.733170915191075e-05, + "loss": 0.0196, + "step": 390 + }, + { + "epoch": 0.7537349397590362, + "grad_norm": 0.29819396138191223, + "learning_rate": 3.730928522617639e-05, + "loss": 0.0131, + "step": 391 + }, + { + "epoch": 0.7556626506024097, + "grad_norm": 0.43824997544288635, + "learning_rate": 3.7286774263739855e-05, + "loss": 0.0238, + "step": 392 + }, + { + "epoch": 0.7575903614457832, + "grad_norm": 0.2822072505950928, + "learning_rate": 3.726417637779357e-05, + "loss": 0.0314, + "step": 393 + }, + { + "epoch": 0.7595180722891566, + "grad_norm": 0.43815648555755615, + "learning_rate": 3.7241491681967044e-05, + "loss": 0.0144, + "step": 394 + }, + { + "epoch": 0.7614457831325301, + "grad_norm": 0.37194815278053284, + "learning_rate": 3.721872029032628e-05, + "loss": 0.0286, + "step": 395 + }, + { + "epoch": 0.7633734939759036, + "grad_norm": 0.7319737672805786, + "learning_rate": 3.719586231737322e-05, + "loss": 0.0427, + "step": 396 + }, + { + "epoch": 0.7653012048192771, + "grad_norm": 0.5870066285133362, + "learning_rate": 3.717291787804517e-05, + "loss": 0.0138, + "step": 397 + }, + { + "epoch": 0.7672289156626506, + "grad_norm": 0.6574277281761169, + "learning_rate": 3.7149887087714225e-05, + "loss": 0.061, + "step": 398 + }, + { + "epoch": 0.7691566265060241, + "grad_norm": 0.5467348694801331, + "learning_rate": 3.712677006218666e-05, + "loss": 0.022, + "step": 399 + }, + { + "epoch": 0.7710843373493976, + "grad_norm": 0.3589288890361786, + "learning_rate": 3.710356691770238e-05, + "loss": 0.0161, + "step": 400 + }, + { + "epoch": 0.7730120481927711, + "grad_norm": 0.574630618095398, + "learning_rate": 3.708027777093433e-05, + "loss": 0.0285, + "step": 401 + }, + { + "epoch": 0.7749397590361445, + "grad_norm": 0.39048445224761963, + "learning_rate": 3.70569027389879e-05, + "loss": 0.012, + "step": 402 + }, + { + "epoch": 0.776867469879518, + "grad_norm": 0.34803536534309387, + "learning_rate": 3.703344193940032e-05, + "loss": 0.0155, + "step": 403 + }, + { + "epoch": 0.7787951807228916, + "grad_norm": 1.188948392868042, + "learning_rate": 3.700989549014011e-05, + "loss": 0.0617, + "step": 404 + }, + { + "epoch": 0.7807228915662651, + "grad_norm": 0.473157674074173, + "learning_rate": 3.698626350960646e-05, + "loss": 0.0298, + "step": 405 + }, + { + "epoch": 0.7826506024096386, + "grad_norm": 0.42009076476097107, + "learning_rate": 3.6962546116628634e-05, + "loss": 0.03, + "step": 406 + }, + { + "epoch": 0.7845783132530121, + "grad_norm": 0.6334308981895447, + "learning_rate": 3.693874343046537e-05, + "loss": 0.0107, + "step": 407 + }, + { + "epoch": 0.7865060240963856, + "grad_norm": 0.35594677925109863, + "learning_rate": 3.6914855570804314e-05, + "loss": 0.0174, + "step": 408 + }, + { + "epoch": 0.788433734939759, + "grad_norm": 0.28985708951950073, + "learning_rate": 3.689088265776136e-05, + "loss": 0.0149, + "step": 409 + }, + { + "epoch": 0.7903614457831325, + "grad_norm": 0.3981950581073761, + "learning_rate": 3.686682481188011e-05, + "loss": 0.019, + "step": 410 + }, + { + "epoch": 0.792289156626506, + "grad_norm": 0.48819583654403687, + "learning_rate": 3.6842682154131193e-05, + "loss": 0.0217, + "step": 411 + }, + { + "epoch": 0.7942168674698795, + "grad_norm": 0.42819952964782715, + "learning_rate": 3.681845480591174e-05, + "loss": 0.0198, + "step": 412 + }, + { + "epoch": 0.796144578313253, + "grad_norm": 0.48591694235801697, + "learning_rate": 3.6794142889044727e-05, + "loss": 0.0253, + "step": 413 + }, + { + "epoch": 0.7980722891566265, + "grad_norm": 0.4730607271194458, + "learning_rate": 3.676974652577835e-05, + "loss": 0.0329, + "step": 414 + }, + { + "epoch": 0.8, + "grad_norm": 0.5390865802764893, + "learning_rate": 3.6745265838785434e-05, + "loss": 0.0479, + "step": 415 + }, + { + "epoch": 0.8019277108433734, + "grad_norm": 0.6377891302108765, + "learning_rate": 3.672070095116283e-05, + "loss": 0.019, + "step": 416 + }, + { + "epoch": 0.803855421686747, + "grad_norm": 0.8984615206718445, + "learning_rate": 3.669605198643075e-05, + "loss": 0.0444, + "step": 417 + }, + { + "epoch": 0.8057831325301205, + "grad_norm": 0.4913877546787262, + "learning_rate": 3.667131906853219e-05, + "loss": 0.031, + "step": 418 + }, + { + "epoch": 0.807710843373494, + "grad_norm": 0.37894028425216675, + "learning_rate": 3.664650232183229e-05, + "loss": 0.0195, + "step": 419 + }, + { + "epoch": 0.8096385542168675, + "grad_norm": 0.3644949495792389, + "learning_rate": 3.66216018711177e-05, + "loss": 0.018, + "step": 420 + }, + { + "epoch": 0.811566265060241, + "grad_norm": 0.414440393447876, + "learning_rate": 3.659661784159597e-05, + "loss": 0.0188, + "step": 421 + }, + { + "epoch": 0.8134939759036145, + "grad_norm": 0.49220341444015503, + "learning_rate": 3.65715503588949e-05, + "loss": 0.016, + "step": 422 + }, + { + "epoch": 0.815421686746988, + "grad_norm": 1.0939836502075195, + "learning_rate": 3.654639954906193e-05, + "loss": 0.0758, + "step": 423 + }, + { + "epoch": 0.8173493975903614, + "grad_norm": 0.43222442269325256, + "learning_rate": 3.652116553856349e-05, + "loss": 0.0308, + "step": 424 + }, + { + "epoch": 0.8192771084337349, + "grad_norm": 0.5081896185874939, + "learning_rate": 3.649584845428438e-05, + "loss": 0.0493, + "step": 425 + }, + { + "epoch": 0.8212048192771084, + "grad_norm": 0.9811948537826538, + "learning_rate": 3.64704484235271e-05, + "loss": 0.019, + "step": 426 + }, + { + "epoch": 0.8231325301204819, + "grad_norm": 0.31656572222709656, + "learning_rate": 3.6444965574011255e-05, + "loss": 0.0135, + "step": 427 + }, + { + "epoch": 0.8250602409638554, + "grad_norm": 0.7844433188438416, + "learning_rate": 3.641940003387289e-05, + "loss": 0.0402, + "step": 428 + }, + { + "epoch": 0.826987951807229, + "grad_norm": 0.3353273570537567, + "learning_rate": 3.6393751931663814e-05, + "loss": 0.0132, + "step": 429 + }, + { + "epoch": 0.8289156626506025, + "grad_norm": 0.7253058552742004, + "learning_rate": 3.6368021396351015e-05, + "loss": 0.0296, + "step": 430 + }, + { + "epoch": 0.8308433734939759, + "grad_norm": 0.45300304889678955, + "learning_rate": 3.634220855731598e-05, + "loss": 0.0258, + "step": 431 + }, + { + "epoch": 0.8327710843373494, + "grad_norm": 0.3480473458766937, + "learning_rate": 3.631631354435403e-05, + "loss": 0.0099, + "step": 432 + }, + { + "epoch": 0.8346987951807229, + "grad_norm": 2.1114516258239746, + "learning_rate": 3.62903364876737e-05, + "loss": 0.0457, + "step": 433 + }, + { + "epoch": 0.8366265060240964, + "grad_norm": 0.5649561882019043, + "learning_rate": 3.626427751789606e-05, + "loss": 0.0444, + "step": 434 + }, + { + "epoch": 0.8385542168674699, + "grad_norm": 0.3864995539188385, + "learning_rate": 3.623813676605405e-05, + "loss": 0.0223, + "step": 435 + }, + { + "epoch": 0.8404819277108434, + "grad_norm": 1.2134298086166382, + "learning_rate": 3.621191436359186e-05, + "loss": 0.0353, + "step": 436 + }, + { + "epoch": 0.8424096385542169, + "grad_norm": 0.4403415024280548, + "learning_rate": 3.6185610442364246e-05, + "loss": 0.0216, + "step": 437 + }, + { + "epoch": 0.8443373493975903, + "grad_norm": 0.6050297021865845, + "learning_rate": 3.6159225134635846e-05, + "loss": 0.0433, + "step": 438 + }, + { + "epoch": 0.8462650602409638, + "grad_norm": 0.7951678037643433, + "learning_rate": 3.6132758573080556e-05, + "loss": 0.031, + "step": 439 + }, + { + "epoch": 0.8481927710843373, + "grad_norm": 0.4991949796676636, + "learning_rate": 3.6106210890780834e-05, + "loss": 0.0313, + "step": 440 + }, + { + "epoch": 0.8501204819277108, + "grad_norm": 0.47951385378837585, + "learning_rate": 3.607958222122704e-05, + "loss": 0.0218, + "step": 441 + }, + { + "epoch": 0.8520481927710843, + "grad_norm": 0.7345194220542908, + "learning_rate": 3.6052872698316755e-05, + "loss": 0.0239, + "step": 442 + }, + { + "epoch": 0.8539759036144579, + "grad_norm": 1.4814884662628174, + "learning_rate": 3.602608245635414e-05, + "loss": 0.0127, + "step": 443 + }, + { + "epoch": 0.8559036144578314, + "grad_norm": 2.4240877628326416, + "learning_rate": 3.599921163004922e-05, + "loss": 0.0618, + "step": 444 + }, + { + "epoch": 0.8578313253012049, + "grad_norm": 0.41523510217666626, + "learning_rate": 3.5972260354517216e-05, + "loss": 0.0283, + "step": 445 + }, + { + "epoch": 0.8597590361445783, + "grad_norm": 0.5577677488327026, + "learning_rate": 3.594522876527791e-05, + "loss": 0.0271, + "step": 446 + }, + { + "epoch": 0.8616867469879518, + "grad_norm": 0.5829064846038818, + "learning_rate": 3.591811699825487e-05, + "loss": 0.0169, + "step": 447 + }, + { + "epoch": 0.8636144578313253, + "grad_norm": 0.4478822350502014, + "learning_rate": 3.5890925189774886e-05, + "loss": 0.0239, + "step": 448 + }, + { + "epoch": 0.8655421686746988, + "grad_norm": 0.3498048782348633, + "learning_rate": 3.586365347656718e-05, + "loss": 0.0137, + "step": 449 + }, + { + "epoch": 0.8674698795180723, + "grad_norm": 0.6571130156517029, + "learning_rate": 3.583630199576278e-05, + "loss": 0.027, + "step": 450 + }, + { + "epoch": 0.8693975903614458, + "grad_norm": 0.344970166683197, + "learning_rate": 3.58088708848938e-05, + "loss": 0.0167, + "step": 451 + }, + { + "epoch": 0.8713253012048193, + "grad_norm": 0.34611570835113525, + "learning_rate": 3.5781360281892775e-05, + "loss": 0.0468, + "step": 452 + }, + { + "epoch": 0.8732530120481927, + "grad_norm": 0.66157066822052, + "learning_rate": 3.575377032509194e-05, + "loss": 0.0344, + "step": 453 + }, + { + "epoch": 0.8751807228915662, + "grad_norm": 0.3676326870918274, + "learning_rate": 3.5726101153222534e-05, + "loss": 0.0366, + "step": 454 + }, + { + "epoch": 0.8771084337349397, + "grad_norm": 0.5958423018455505, + "learning_rate": 3.569835290541414e-05, + "loss": 0.0382, + "step": 455 + }, + { + "epoch": 0.8790361445783132, + "grad_norm": 0.36787471175193787, + "learning_rate": 3.567052572119397e-05, + "loss": 0.018, + "step": 456 + }, + { + "epoch": 0.8809638554216868, + "grad_norm": 0.9478234052658081, + "learning_rate": 3.564261974048611e-05, + "loss": 0.0179, + "step": 457 + }, + { + "epoch": 0.8828915662650603, + "grad_norm": 0.3337579369544983, + "learning_rate": 3.56146351036109e-05, + "loss": 0.0147, + "step": 458 + }, + { + "epoch": 0.8848192771084338, + "grad_norm": 0.4984932243824005, + "learning_rate": 3.558657195128416e-05, + "loss": 0.0224, + "step": 459 + }, + { + "epoch": 0.8867469879518072, + "grad_norm": 0.36718735098838806, + "learning_rate": 3.555843042461653e-05, + "loss": 0.0202, + "step": 460 + }, + { + "epoch": 0.8886746987951807, + "grad_norm": 0.4081745445728302, + "learning_rate": 3.553021066511274e-05, + "loss": 0.0288, + "step": 461 + }, + { + "epoch": 0.8906024096385542, + "grad_norm": 0.3233242332935333, + "learning_rate": 3.55019128146709e-05, + "loss": 0.0362, + "step": 462 + }, + { + "epoch": 0.8925301204819277, + "grad_norm": 0.6560158729553223, + "learning_rate": 3.547353701558178e-05, + "loss": 0.038, + "step": 463 + }, + { + "epoch": 0.8944578313253012, + "grad_norm": 0.47668641805648804, + "learning_rate": 3.544508341052811e-05, + "loss": 0.0399, + "step": 464 + }, + { + "epoch": 0.8963855421686747, + "grad_norm": 0.45512664318084717, + "learning_rate": 3.541655214258383e-05, + "loss": 0.022, + "step": 465 + }, + { + "epoch": 0.8983132530120482, + "grad_norm": 0.8410730361938477, + "learning_rate": 3.538794335521343e-05, + "loss": 0.0315, + "step": 466 + }, + { + "epoch": 0.9002409638554217, + "grad_norm": 0.4872909486293793, + "learning_rate": 3.535925719227117e-05, + "loss": 0.0152, + "step": 467 + }, + { + "epoch": 0.9021686746987951, + "grad_norm": 0.45623311400413513, + "learning_rate": 3.533049379800038e-05, + "loss": 0.0305, + "step": 468 + }, + { + "epoch": 0.9040963855421686, + "grad_norm": 0.43087029457092285, + "learning_rate": 3.530165331703275e-05, + "loss": 0.0131, + "step": 469 + }, + { + "epoch": 0.9060240963855422, + "grad_norm": 0.4610525369644165, + "learning_rate": 3.527273589438756e-05, + "loss": 0.0187, + "step": 470 + }, + { + "epoch": 0.9079518072289157, + "grad_norm": 0.3356114327907562, + "learning_rate": 3.5243741675471006e-05, + "loss": 0.0185, + "step": 471 + }, + { + "epoch": 0.9098795180722892, + "grad_norm": 0.9065960049629211, + "learning_rate": 3.5214670806075426e-05, + "loss": 0.0433, + "step": 472 + }, + { + "epoch": 0.9118072289156627, + "grad_norm": 0.3652578294277191, + "learning_rate": 3.518552343237858e-05, + "loss": 0.02, + "step": 473 + }, + { + "epoch": 0.9137349397590362, + "grad_norm": 0.32377883791923523, + "learning_rate": 3.5156299700942916e-05, + "loss": 0.0165, + "step": 474 + }, + { + "epoch": 0.9156626506024096, + "grad_norm": 0.2431817352771759, + "learning_rate": 3.512699975871485e-05, + "loss": 0.0172, + "step": 475 + }, + { + "epoch": 0.9175903614457831, + "grad_norm": 0.6390707492828369, + "learning_rate": 3.509762375302399e-05, + "loss": 0.0356, + "step": 476 + }, + { + "epoch": 0.9195180722891566, + "grad_norm": 0.2283092886209488, + "learning_rate": 3.506817183158243e-05, + "loss": 0.0088, + "step": 477 + }, + { + "epoch": 0.9214457831325301, + "grad_norm": 0.5053914189338684, + "learning_rate": 3.5038644142483966e-05, + "loss": 0.0389, + "step": 478 + }, + { + "epoch": 0.9233734939759036, + "grad_norm": 0.2567576467990875, + "learning_rate": 3.500904083420342e-05, + "loss": 0.0155, + "step": 479 + }, + { + "epoch": 0.9253012048192771, + "grad_norm": 0.6852384209632874, + "learning_rate": 3.497936205559583e-05, + "loss": 0.0247, + "step": 480 + }, + { + "epoch": 0.9272289156626506, + "grad_norm": 0.36403414607048035, + "learning_rate": 3.494960795589572e-05, + "loss": 0.023, + "step": 481 + }, + { + "epoch": 0.929156626506024, + "grad_norm": 0.506554901599884, + "learning_rate": 3.491977868471635e-05, + "loss": 0.0273, + "step": 482 + }, + { + "epoch": 0.9310843373493976, + "grad_norm": 0.38329923152923584, + "learning_rate": 3.4889874392048985e-05, + "loss": 0.0169, + "step": 483 + }, + { + "epoch": 0.9330120481927711, + "grad_norm": 0.2805836498737335, + "learning_rate": 3.48598952282621e-05, + "loss": 0.0105, + "step": 484 + }, + { + "epoch": 0.9349397590361446, + "grad_norm": 0.6315302848815918, + "learning_rate": 3.482984134410067e-05, + "loss": 0.0289, + "step": 485 + }, + { + "epoch": 0.9368674698795181, + "grad_norm": 0.6431388854980469, + "learning_rate": 3.479971289068537e-05, + "loss": 0.0311, + "step": 486 + }, + { + "epoch": 0.9387951807228916, + "grad_norm": 0.9794723391532898, + "learning_rate": 3.476951001951184e-05, + "loss": 0.0452, + "step": 487 + }, + { + "epoch": 0.9407228915662651, + "grad_norm": 0.7984824180603027, + "learning_rate": 3.473923288244991e-05, + "loss": 0.0689, + "step": 488 + }, + { + "epoch": 0.9426506024096386, + "grad_norm": 0.46362006664276123, + "learning_rate": 3.470888163174286e-05, + "loss": 0.0241, + "step": 489 + }, + { + "epoch": 0.944578313253012, + "grad_norm": 0.5051195025444031, + "learning_rate": 3.467845642000661e-05, + "loss": 0.0228, + "step": 490 + }, + { + "epoch": 0.9465060240963855, + "grad_norm": 0.3082812428474426, + "learning_rate": 3.4647957400229004e-05, + "loss": 0.0144, + "step": 491 + }, + { + "epoch": 0.948433734939759, + "grad_norm": 0.2691391110420227, + "learning_rate": 3.461738472576902e-05, + "loss": 0.0167, + "step": 492 + }, + { + "epoch": 0.9503614457831325, + "grad_norm": 0.5627671480178833, + "learning_rate": 3.458673855035597e-05, + "loss": 0.031, + "step": 493 + }, + { + "epoch": 0.952289156626506, + "grad_norm": 0.4571435749530792, + "learning_rate": 3.455601902808876e-05, + "loss": 0.0191, + "step": 494 + }, + { + "epoch": 0.9542168674698795, + "grad_norm": 1.0117709636688232, + "learning_rate": 3.452522631343515e-05, + "loss": 0.0192, + "step": 495 + }, + { + "epoch": 0.9561445783132531, + "grad_norm": 0.28375712037086487, + "learning_rate": 3.449436056123086e-05, + "loss": 0.0159, + "step": 496 + }, + { + "epoch": 0.9580722891566265, + "grad_norm": 0.26381856203079224, + "learning_rate": 3.446342192667893e-05, + "loss": 0.0113, + "step": 497 + }, + { + "epoch": 0.96, + "grad_norm": 0.49317577481269836, + "learning_rate": 3.443241056534884e-05, + "loss": 0.0332, + "step": 498 + }, + { + "epoch": 0.9619277108433735, + "grad_norm": 0.28884485363960266, + "learning_rate": 3.440132663317579e-05, + "loss": 0.0117, + "step": 499 + }, + { + "epoch": 0.963855421686747, + "grad_norm": 0.36255285143852234, + "learning_rate": 3.4370170286459864e-05, + "loss": 0.0169, + "step": 500 + }, + { + "epoch": 0.9657831325301205, + "grad_norm": 0.4265049993991852, + "learning_rate": 3.433894168186529e-05, + "loss": 0.0217, + "step": 501 + }, + { + "epoch": 0.967710843373494, + "grad_norm": 0.8169426321983337, + "learning_rate": 3.430764097641962e-05, + "loss": 0.0207, + "step": 502 + }, + { + "epoch": 0.9696385542168675, + "grad_norm": 1.866077184677124, + "learning_rate": 3.427626832751296e-05, + "loss": 0.0381, + "step": 503 + }, + { + "epoch": 0.971566265060241, + "grad_norm": 0.33124980330467224, + "learning_rate": 3.424482389289716e-05, + "loss": 0.0245, + "step": 504 + }, + { + "epoch": 0.9734939759036144, + "grad_norm": 0.37479540705680847, + "learning_rate": 3.4213307830685055e-05, + "loss": 0.0164, + "step": 505 + }, + { + "epoch": 0.9754216867469879, + "grad_norm": 0.39738863706588745, + "learning_rate": 3.4181720299349615e-05, + "loss": 0.0297, + "step": 506 + }, + { + "epoch": 0.9773493975903614, + "grad_norm": 0.2567287087440491, + "learning_rate": 3.4150061457723205e-05, + "loss": 0.0102, + "step": 507 + }, + { + "epoch": 0.9792771084337349, + "grad_norm": 0.6230517029762268, + "learning_rate": 3.411833146499675e-05, + "loss": 0.0243, + "step": 508 + }, + { + "epoch": 0.9812048192771085, + "grad_norm": 0.44843971729278564, + "learning_rate": 3.408653048071894e-05, + "loss": 0.0357, + "step": 509 + }, + { + "epoch": 0.983132530120482, + "grad_norm": 1.0569655895233154, + "learning_rate": 3.405465866479546e-05, + "loss": 0.037, + "step": 510 + }, + { + "epoch": 0.9850602409638555, + "grad_norm": 0.29000964760780334, + "learning_rate": 3.402271617748812e-05, + "loss": 0.0129, + "step": 511 + }, + { + "epoch": 0.9869879518072289, + "grad_norm": 2.1627447605133057, + "learning_rate": 3.399070317941413e-05, + "loss": 0.0442, + "step": 512 + }, + { + "epoch": 0.9889156626506024, + "grad_norm": 0.27371272444725037, + "learning_rate": 3.395861983154522e-05, + "loss": 0.0119, + "step": 513 + }, + { + "epoch": 0.9908433734939759, + "grad_norm": 0.4117226302623749, + "learning_rate": 3.392646629520688e-05, + "loss": 0.0455, + "step": 514 + }, + { + "epoch": 0.9927710843373494, + "grad_norm": 0.5098996758460999, + "learning_rate": 3.389424273207752e-05, + "loss": 0.0203, + "step": 515 + }, + { + "epoch": 0.9946987951807229, + "grad_norm": 0.5192157626152039, + "learning_rate": 3.386194930418767e-05, + "loss": 0.0329, + "step": 516 + }, + { + "epoch": 0.9966265060240964, + "grad_norm": 0.18757697939872742, + "learning_rate": 3.382958617391915e-05, + "loss": 0.0065, + "step": 517 + }, + { + "epoch": 0.9985542168674699, + "grad_norm": 0.3334413170814514, + "learning_rate": 3.3797153504004296e-05, + "loss": 0.0266, + "step": 518 + }, + { + "epoch": 1.0, + "grad_norm": 0.4152225852012634, + "learning_rate": 3.3764651457525095e-05, + "loss": 0.0169, + "step": 519 + }, + { + "epoch": 1.0019277108433735, + "grad_norm": 0.43535247445106506, + "learning_rate": 3.373208019791237e-05, + "loss": 0.0221, + "step": 520 + }, + { + "epoch": 1.003855421686747, + "grad_norm": 0.39292722940444946, + "learning_rate": 3.3699439888945e-05, + "loss": 0.0211, + "step": 521 + }, + { + "epoch": 1.0057831325301205, + "grad_norm": 0.19566713273525238, + "learning_rate": 3.366673069474904e-05, + "loss": 0.0069, + "step": 522 + }, + { + "epoch": 1.007710843373494, + "grad_norm": 0.5101853609085083, + "learning_rate": 3.3633952779796914e-05, + "loss": 0.0191, + "step": 523 + }, + { + "epoch": 1.0096385542168675, + "grad_norm": 0.999434769153595, + "learning_rate": 3.360110630890664e-05, + "loss": 0.0196, + "step": 524 + }, + { + "epoch": 1.011566265060241, + "grad_norm": 0.4646223783493042, + "learning_rate": 3.356819144724092e-05, + "loss": 0.0328, + "step": 525 + }, + { + "epoch": 1.0134939759036146, + "grad_norm": 0.3132480978965759, + "learning_rate": 3.3535208360306354e-05, + "loss": 0.0203, + "step": 526 + }, + { + "epoch": 1.0154216867469879, + "grad_norm": 0.3038032352924347, + "learning_rate": 3.350215721395261e-05, + "loss": 0.0122, + "step": 527 + }, + { + "epoch": 1.0173493975903614, + "grad_norm": 0.45082882046699524, + "learning_rate": 3.346903817437157e-05, + "loss": 0.0437, + "step": 528 + }, + { + "epoch": 1.0192771084337349, + "grad_norm": 0.26917046308517456, + "learning_rate": 3.343585140809651e-05, + "loss": 0.013, + "step": 529 + }, + { + "epoch": 1.0212048192771084, + "grad_norm": 0.23869264125823975, + "learning_rate": 3.3402597082001276e-05, + "loss": 0.008, + "step": 530 + }, + { + "epoch": 1.0231325301204819, + "grad_norm": 0.31315353512763977, + "learning_rate": 3.3369275363299394e-05, + "loss": 0.0078, + "step": 531 + }, + { + "epoch": 1.0250602409638554, + "grad_norm": 0.4780346751213074, + "learning_rate": 3.333588641954327e-05, + "loss": 0.0225, + "step": 532 + }, + { + "epoch": 1.026987951807229, + "grad_norm": 0.2920368015766144, + "learning_rate": 3.330243041862336e-05, + "loss": 0.0118, + "step": 533 + }, + { + "epoch": 1.0289156626506024, + "grad_norm": 0.543669581413269, + "learning_rate": 3.326890752876728e-05, + "loss": 0.0338, + "step": 534 + }, + { + "epoch": 1.030843373493976, + "grad_norm": 0.4288000464439392, + "learning_rate": 3.323531791853901e-05, + "loss": 0.0341, + "step": 535 + }, + { + "epoch": 1.0327710843373494, + "grad_norm": 0.26600322127342224, + "learning_rate": 3.3201661756838e-05, + "loss": 0.0184, + "step": 536 + }, + { + "epoch": 1.034698795180723, + "grad_norm": 0.290937602519989, + "learning_rate": 3.316793921289835e-05, + "loss": 0.0152, + "step": 537 + }, + { + "epoch": 1.0366265060240965, + "grad_norm": 0.7621443271636963, + "learning_rate": 3.313415045628795e-05, + "loss": 0.0326, + "step": 538 + }, + { + "epoch": 1.03855421686747, + "grad_norm": 0.5581283569335938, + "learning_rate": 3.3100295656907646e-05, + "loss": 0.0164, + "step": 539 + }, + { + "epoch": 1.0404819277108435, + "grad_norm": 0.20930901169776917, + "learning_rate": 3.306637498499034e-05, + "loss": 0.0091, + "step": 540 + }, + { + "epoch": 1.0424096385542168, + "grad_norm": 0.46212059259414673, + "learning_rate": 3.303238861110018e-05, + "loss": 0.0118, + "step": 541 + }, + { + "epoch": 1.0443373493975903, + "grad_norm": 0.38259151577949524, + "learning_rate": 3.299833670613168e-05, + "loss": 0.0081, + "step": 542 + }, + { + "epoch": 1.0462650602409638, + "grad_norm": 0.4888618290424347, + "learning_rate": 3.2964219441308865e-05, + "loss": 0.0138, + "step": 543 + }, + { + "epoch": 1.0481927710843373, + "grad_norm": 0.32103127241134644, + "learning_rate": 3.2930036988184425e-05, + "loss": 0.0171, + "step": 544 + }, + { + "epoch": 1.0501204819277108, + "grad_norm": 0.27787327766418457, + "learning_rate": 3.28957895186388e-05, + "loss": 0.0106, + "step": 545 + }, + { + "epoch": 1.0520481927710843, + "grad_norm": 0.35597777366638184, + "learning_rate": 3.2861477204879395e-05, + "loss": 0.0123, + "step": 546 + }, + { + "epoch": 1.0539759036144578, + "grad_norm": 0.3619804084300995, + "learning_rate": 3.2827100219439656e-05, + "loss": 0.0088, + "step": 547 + }, + { + "epoch": 1.0559036144578313, + "grad_norm": 0.2525513470172882, + "learning_rate": 3.279265873517822e-05, + "loss": 0.0179, + "step": 548 + }, + { + "epoch": 1.0578313253012048, + "grad_norm": 0.3910020887851715, + "learning_rate": 3.275815292527804e-05, + "loss": 0.0142, + "step": 549 + }, + { + "epoch": 1.0597590361445783, + "grad_norm": 0.30515050888061523, + "learning_rate": 3.2723582963245526e-05, + "loss": 0.0123, + "step": 550 + }, + { + "epoch": 1.0616867469879518, + "grad_norm": 0.21708644926548004, + "learning_rate": 3.2688949022909665e-05, + "loss": 0.0098, + "step": 551 + }, + { + "epoch": 1.0636144578313254, + "grad_norm": 0.23307719826698303, + "learning_rate": 3.265425127842114e-05, + "loss": 0.0097, + "step": 552 + }, + { + "epoch": 1.0655421686746989, + "grad_norm": 0.676654577255249, + "learning_rate": 3.261948990425147e-05, + "loss": 0.0227, + "step": 553 + }, + { + "epoch": 1.0674698795180724, + "grad_norm": 0.4593975841999054, + "learning_rate": 3.258466507519213e-05, + "loss": 0.047, + "step": 554 + }, + { + "epoch": 1.0693975903614459, + "grad_norm": 0.19405829906463623, + "learning_rate": 3.254977696635366e-05, + "loss": 0.0314, + "step": 555 + }, + { + "epoch": 1.0713253012048192, + "grad_norm": 0.14563389122486115, + "learning_rate": 3.2514825753164774e-05, + "loss": 0.0046, + "step": 556 + }, + { + "epoch": 1.0732530120481927, + "grad_norm": 0.2642340064048767, + "learning_rate": 3.247981161137153e-05, + "loss": 0.022, + "step": 557 + }, + { + "epoch": 1.0751807228915662, + "grad_norm": 0.17274761199951172, + "learning_rate": 3.2444734717036386e-05, + "loss": 0.0134, + "step": 558 + }, + { + "epoch": 1.0771084337349397, + "grad_norm": 0.44354626536369324, + "learning_rate": 3.240959524653735e-05, + "loss": 0.0211, + "step": 559 + }, + { + "epoch": 1.0790361445783132, + "grad_norm": 0.2806888818740845, + "learning_rate": 3.237439337656708e-05, + "loss": 0.0141, + "step": 560 + }, + { + "epoch": 1.0809638554216867, + "grad_norm": 0.21679501235485077, + "learning_rate": 3.2339129284131994e-05, + "loss": 0.019, + "step": 561 + }, + { + "epoch": 1.0828915662650602, + "grad_norm": 0.3040260076522827, + "learning_rate": 3.2303803146551386e-05, + "loss": 0.0249, + "step": 562 + }, + { + "epoch": 1.0848192771084337, + "grad_norm": 0.2793775200843811, + "learning_rate": 3.226841514145656e-05, + "loss": 0.0088, + "step": 563 + }, + { + "epoch": 1.0867469879518072, + "grad_norm": 0.149955615401268, + "learning_rate": 3.223296544678987e-05, + "loss": 0.0054, + "step": 564 + }, + { + "epoch": 1.0886746987951808, + "grad_norm": 0.22166767716407776, + "learning_rate": 3.219745424080389e-05, + "loss": 0.0109, + "step": 565 + }, + { + "epoch": 1.0906024096385543, + "grad_norm": 0.22399431467056274, + "learning_rate": 3.2161881702060476e-05, + "loss": 0.0106, + "step": 566 + }, + { + "epoch": 1.0925301204819278, + "grad_norm": 0.18537986278533936, + "learning_rate": 3.2126248009429905e-05, + "loss": 0.0077, + "step": 567 + }, + { + "epoch": 1.0944578313253013, + "grad_norm": 0.24511495232582092, + "learning_rate": 3.2090553342089935e-05, + "loss": 0.0093, + "step": 568 + }, + { + "epoch": 1.0963855421686748, + "grad_norm": 0.4766045808792114, + "learning_rate": 3.205479787952494e-05, + "loss": 0.036, + "step": 569 + }, + { + "epoch": 1.0983132530120483, + "grad_norm": 0.1425715535879135, + "learning_rate": 3.201898180152499e-05, + "loss": 0.0085, + "step": 570 + }, + { + "epoch": 1.1002409638554216, + "grad_norm": 0.1909666359424591, + "learning_rate": 3.1983105288184945e-05, + "loss": 0.0081, + "step": 571 + }, + { + "epoch": 1.102168674698795, + "grad_norm": 0.44077104330062866, + "learning_rate": 3.194716851990355e-05, + "loss": 0.017, + "step": 572 + }, + { + "epoch": 1.1040963855421686, + "grad_norm": 0.5757400989532471, + "learning_rate": 3.191117167738253e-05, + "loss": 0.021, + "step": 573 + }, + { + "epoch": 1.106024096385542, + "grad_norm": 0.1977701038122177, + "learning_rate": 3.1875114941625705e-05, + "loss": 0.0096, + "step": 574 + }, + { + "epoch": 1.1079518072289156, + "grad_norm": 0.3524581491947174, + "learning_rate": 3.1838998493938026e-05, + "loss": 0.0118, + "step": 575 + }, + { + "epoch": 1.1098795180722891, + "grad_norm": 0.3301331698894501, + "learning_rate": 3.180282251592472e-05, + "loss": 0.0094, + "step": 576 + }, + { + "epoch": 1.1118072289156626, + "grad_norm": 0.2774488925933838, + "learning_rate": 3.1766587189490336e-05, + "loss": 0.0131, + "step": 577 + }, + { + "epoch": 1.1137349397590361, + "grad_norm": 1.732595443725586, + "learning_rate": 3.173029269683785e-05, + "loss": 0.0445, + "step": 578 + }, + { + "epoch": 1.1156626506024097, + "grad_norm": 0.28746843338012695, + "learning_rate": 3.169393922046776e-05, + "loss": 0.0116, + "step": 579 + }, + { + "epoch": 1.1175903614457832, + "grad_norm": 0.2952995002269745, + "learning_rate": 3.165752694317713e-05, + "loss": 0.0116, + "step": 580 + }, + { + "epoch": 1.1195180722891567, + "grad_norm": 0.2938575744628906, + "learning_rate": 3.16210560480587e-05, + "loss": 0.013, + "step": 581 + }, + { + "epoch": 1.1214457831325302, + "grad_norm": 0.22283495962619781, + "learning_rate": 3.158452671849998e-05, + "loss": 0.0052, + "step": 582 + }, + { + "epoch": 1.1233734939759037, + "grad_norm": 0.6272858381271362, + "learning_rate": 3.154793913818226e-05, + "loss": 0.0182, + "step": 583 + }, + { + "epoch": 1.1253012048192772, + "grad_norm": 0.479753702878952, + "learning_rate": 3.1511293491079804e-05, + "loss": 0.0146, + "step": 584 + }, + { + "epoch": 1.1272289156626507, + "grad_norm": 0.31104400753974915, + "learning_rate": 3.1474589961458786e-05, + "loss": 0.0139, + "step": 585 + }, + { + "epoch": 1.129156626506024, + "grad_norm": 0.4932832419872284, + "learning_rate": 3.1437828733876477e-05, + "loss": 0.0236, + "step": 586 + }, + { + "epoch": 1.1310843373493975, + "grad_norm": 0.222808837890625, + "learning_rate": 3.140100999318025e-05, + "loss": 0.0084, + "step": 587 + }, + { + "epoch": 1.133012048192771, + "grad_norm": 0.4515356719493866, + "learning_rate": 3.136413392450668e-05, + "loss": 0.0215, + "step": 588 + }, + { + "epoch": 1.1349397590361445, + "grad_norm": 0.39302268624305725, + "learning_rate": 3.132720071328061e-05, + "loss": 0.0154, + "step": 589 + }, + { + "epoch": 1.136867469879518, + "grad_norm": 0.43382835388183594, + "learning_rate": 3.1290210545214205e-05, + "loss": 0.0088, + "step": 590 + }, + { + "epoch": 1.1387951807228915, + "grad_norm": 0.18707136809825897, + "learning_rate": 3.125316360630602e-05, + "loss": 0.0126, + "step": 591 + }, + { + "epoch": 1.140722891566265, + "grad_norm": 0.5688219666481018, + "learning_rate": 3.121606008284011e-05, + "loss": 0.0147, + "step": 592 + }, + { + "epoch": 1.1426506024096386, + "grad_norm": 0.3321833312511444, + "learning_rate": 3.1178900161385005e-05, + "loss": 0.0119, + "step": 593 + }, + { + "epoch": 1.144578313253012, + "grad_norm": 0.3738424777984619, + "learning_rate": 3.114168402879286e-05, + "loss": 0.0158, + "step": 594 + }, + { + "epoch": 1.1465060240963856, + "grad_norm": 0.2386978417634964, + "learning_rate": 3.110441187219846e-05, + "loss": 0.0107, + "step": 595 + }, + { + "epoch": 1.148433734939759, + "grad_norm": 0.2165699452161789, + "learning_rate": 3.10670838790183e-05, + "loss": 0.0079, + "step": 596 + }, + { + "epoch": 1.1503614457831326, + "grad_norm": 0.25952696800231934, + "learning_rate": 3.102970023694965e-05, + "loss": 0.0147, + "step": 597 + }, + { + "epoch": 1.152289156626506, + "grad_norm": 0.21448305249214172, + "learning_rate": 3.099226113396959e-05, + "loss": 0.0099, + "step": 598 + }, + { + "epoch": 1.1542168674698796, + "grad_norm": 0.37226060032844543, + "learning_rate": 3.095476675833405e-05, + "loss": 0.0214, + "step": 599 + }, + { + "epoch": 1.1561445783132531, + "grad_norm": 0.29637983441352844, + "learning_rate": 3.0917217298576955e-05, + "loss": 0.0118, + "step": 600 + }, + { + "epoch": 1.1580722891566264, + "grad_norm": 0.18535609543323517, + "learning_rate": 3.0879612943509154e-05, + "loss": 0.0086, + "step": 601 + }, + { + "epoch": 1.16, + "grad_norm": 0.25874125957489014, + "learning_rate": 3.0841953882217536e-05, + "loss": 0.0088, + "step": 602 + }, + { + "epoch": 1.1619277108433734, + "grad_norm": 0.46092745661735535, + "learning_rate": 3.08042403040641e-05, + "loss": 0.0241, + "step": 603 + }, + { + "epoch": 1.163855421686747, + "grad_norm": 0.27023249864578247, + "learning_rate": 3.076647239868494e-05, + "loss": 0.0154, + "step": 604 + }, + { + "epoch": 1.1657831325301204, + "grad_norm": 0.445157527923584, + "learning_rate": 3.072865035598933e-05, + "loss": 0.0197, + "step": 605 + }, + { + "epoch": 1.167710843373494, + "grad_norm": 0.18097272515296936, + "learning_rate": 3.06907743661588e-05, + "loss": 0.0093, + "step": 606 + }, + { + "epoch": 1.1696385542168675, + "grad_norm": 0.22469942271709442, + "learning_rate": 3.065284461964609e-05, + "loss": 0.0171, + "step": 607 + }, + { + "epoch": 1.171566265060241, + "grad_norm": 0.20190906524658203, + "learning_rate": 3.061486130717428e-05, + "loss": 0.008, + "step": 608 + }, + { + "epoch": 1.1734939759036145, + "grad_norm": 0.18294145166873932, + "learning_rate": 3.057682461973579e-05, + "loss": 0.0155, + "step": 609 + }, + { + "epoch": 1.175421686746988, + "grad_norm": 0.34203943610191345, + "learning_rate": 3.053873474859143e-05, + "loss": 0.0212, + "step": 610 + }, + { + "epoch": 1.1773493975903615, + "grad_norm": 0.49073582887649536, + "learning_rate": 3.050059188526942e-05, + "loss": 0.019, + "step": 611 + }, + { + "epoch": 1.179277108433735, + "grad_norm": 0.3537680506706238, + "learning_rate": 3.046239622156446e-05, + "loss": 0.0147, + "step": 612 + }, + { + "epoch": 1.1812048192771085, + "grad_norm": 0.2584632635116577, + "learning_rate": 3.042414794953674e-05, + "loss": 0.0088, + "step": 613 + }, + { + "epoch": 1.1831325301204818, + "grad_norm": 0.3529360890388489, + "learning_rate": 3.0385847261510975e-05, + "loss": 0.0187, + "step": 614 + }, + { + "epoch": 1.1850602409638555, + "grad_norm": 0.3331570327281952, + "learning_rate": 3.0347494350075465e-05, + "loss": 0.0124, + "step": 615 + }, + { + "epoch": 1.1869879518072288, + "grad_norm": 0.2223527580499649, + "learning_rate": 3.0309089408081074e-05, + "loss": 0.01, + "step": 616 + }, + { + "epoch": 1.1889156626506023, + "grad_norm": 0.21985746920108795, + "learning_rate": 3.027063262864032e-05, + "loss": 0.0087, + "step": 617 + }, + { + "epoch": 1.1908433734939758, + "grad_norm": 0.2989653944969177, + "learning_rate": 3.023212420512637e-05, + "loss": 0.0137, + "step": 618 + }, + { + "epoch": 1.1927710843373494, + "grad_norm": 0.17423275113105774, + "learning_rate": 3.0193564331172074e-05, + "loss": 0.0056, + "step": 619 + }, + { + "epoch": 1.1946987951807229, + "grad_norm": 1.0992127656936646, + "learning_rate": 3.0154953200668976e-05, + "loss": 0.0274, + "step": 620 + }, + { + "epoch": 1.1966265060240964, + "grad_norm": 0.21641989052295685, + "learning_rate": 3.011629100776638e-05, + "loss": 0.0151, + "step": 621 + }, + { + "epoch": 1.1985542168674699, + "grad_norm": 0.4558199644088745, + "learning_rate": 3.007757794687033e-05, + "loss": 0.0424, + "step": 622 + }, + { + "epoch": 1.2004819277108434, + "grad_norm": 0.42380189895629883, + "learning_rate": 3.003881421264266e-05, + "loss": 0.0079, + "step": 623 + }, + { + "epoch": 1.202409638554217, + "grad_norm": 0.28791171312332153, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0142, + "step": 624 + }, + { + "epoch": 1.2043373493975904, + "grad_norm": 0.3906581997871399, + "learning_rate": 2.996113550411281e-05, + "loss": 0.0251, + "step": 625 + }, + { + "epoch": 1.206265060240964, + "grad_norm": 0.47848746180534363, + "learning_rate": 2.9922220920404375e-05, + "loss": 0.0137, + "step": 626 + }, + { + "epoch": 1.2081927710843374, + "grad_norm": 0.22666941583156586, + "learning_rate": 2.9883256444549862e-05, + "loss": 0.0105, + "step": 627 + }, + { + "epoch": 1.210120481927711, + "grad_norm": 0.18968136608600616, + "learning_rate": 2.984424227247529e-05, + "loss": 0.0089, + "step": 628 + }, + { + "epoch": 1.2120481927710842, + "grad_norm": 0.28732606768608093, + "learning_rate": 2.980517860035656e-05, + "loss": 0.0253, + "step": 629 + }, + { + "epoch": 1.213975903614458, + "grad_norm": 0.21131543815135956, + "learning_rate": 2.9766065624618518e-05, + "loss": 0.0134, + "step": 630 + }, + { + "epoch": 1.2159036144578312, + "grad_norm": 0.7594877481460571, + "learning_rate": 2.972690354193388e-05, + "loss": 0.0157, + "step": 631 + }, + { + "epoch": 1.2178313253012047, + "grad_norm": 0.730291485786438, + "learning_rate": 2.96876925492223e-05, + "loss": 0.0204, + "step": 632 + }, + { + "epoch": 1.2197590361445783, + "grad_norm": 0.20333674550056458, + "learning_rate": 2.9648432843649382e-05, + "loss": 0.0114, + "step": 633 + }, + { + "epoch": 1.2216867469879518, + "grad_norm": 0.5680793523788452, + "learning_rate": 2.960912462262566e-05, + "loss": 0.0146, + "step": 634 + }, + { + "epoch": 1.2236144578313253, + "grad_norm": 0.4591079354286194, + "learning_rate": 2.9569768083805618e-05, + "loss": 0.0112, + "step": 635 + }, + { + "epoch": 1.2255421686746988, + "grad_norm": 0.3793511390686035, + "learning_rate": 2.953036342508671e-05, + "loss": 0.0377, + "step": 636 + }, + { + "epoch": 1.2274698795180723, + "grad_norm": 1.118723750114441, + "learning_rate": 2.9490910844608346e-05, + "loss": 0.0432, + "step": 637 + }, + { + "epoch": 1.2293975903614458, + "grad_norm": 0.36990776658058167, + "learning_rate": 2.9451410540750887e-05, + "loss": 0.0203, + "step": 638 + }, + { + "epoch": 1.2313253012048193, + "grad_norm": 0.930397629737854, + "learning_rate": 2.94118627121347e-05, + "loss": 0.0311, + "step": 639 + }, + { + "epoch": 1.2332530120481928, + "grad_norm": 0.2347625195980072, + "learning_rate": 2.9372267557619075e-05, + "loss": 0.0168, + "step": 640 + }, + { + "epoch": 1.2351807228915663, + "grad_norm": 0.3720332384109497, + "learning_rate": 2.933262527630131e-05, + "loss": 0.0136, + "step": 641 + }, + { + "epoch": 1.2371084337349398, + "grad_norm": 0.4871984124183655, + "learning_rate": 2.929293606751565e-05, + "loss": 0.0339, + "step": 642 + }, + { + "epoch": 1.2390361445783133, + "grad_norm": 0.35853689908981323, + "learning_rate": 2.9253200130832322e-05, + "loss": 0.0095, + "step": 643 + }, + { + "epoch": 1.2409638554216866, + "grad_norm": 0.42003703117370605, + "learning_rate": 2.92134176660565e-05, + "loss": 0.0142, + "step": 644 + }, + { + "epoch": 1.2428915662650604, + "grad_norm": 0.3854500651359558, + "learning_rate": 2.9173588873227338e-05, + "loss": 0.0209, + "step": 645 + }, + { + "epoch": 1.2448192771084337, + "grad_norm": 0.24665917456150055, + "learning_rate": 2.913371395261691e-05, + "loss": 0.0087, + "step": 646 + }, + { + "epoch": 1.2467469879518072, + "grad_norm": 0.41571593284606934, + "learning_rate": 2.9093793104729268e-05, + "loss": 0.0164, + "step": 647 + }, + { + "epoch": 1.2486746987951807, + "grad_norm": 0.4597891569137573, + "learning_rate": 2.9053826530299377e-05, + "loss": 0.0138, + "step": 648 + }, + { + "epoch": 1.2506024096385542, + "grad_norm": 0.43345385789871216, + "learning_rate": 2.901381443029215e-05, + "loss": 0.0353, + "step": 649 + }, + { + "epoch": 1.2525301204819277, + "grad_norm": 0.3706768751144409, + "learning_rate": 2.897375700590141e-05, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 1.2544578313253012, + "grad_norm": 0.30305296182632446, + "learning_rate": 2.8933654458548873e-05, + "loss": 0.0123, + "step": 651 + }, + { + "epoch": 1.2563855421686747, + "grad_norm": 0.2042127549648285, + "learning_rate": 2.8893506989883167e-05, + "loss": 0.0099, + "step": 652 + }, + { + "epoch": 1.2583132530120482, + "grad_norm": 0.20524422824382782, + "learning_rate": 2.8853314801778784e-05, + "loss": 0.0097, + "step": 653 + }, + { + "epoch": 1.2602409638554217, + "grad_norm": 0.2351921945810318, + "learning_rate": 2.8813078096335093e-05, + "loss": 0.0091, + "step": 654 + }, + { + "epoch": 1.2621686746987952, + "grad_norm": 0.34547340869903564, + "learning_rate": 2.87727970758753e-05, + "loss": 0.0088, + "step": 655 + }, + { + "epoch": 1.2640963855421687, + "grad_norm": 0.35163217782974243, + "learning_rate": 2.8732471942945443e-05, + "loss": 0.0145, + "step": 656 + }, + { + "epoch": 1.266024096385542, + "grad_norm": 1.715137243270874, + "learning_rate": 2.8692102900313378e-05, + "loss": 0.0198, + "step": 657 + }, + { + "epoch": 1.2679518072289158, + "grad_norm": 0.2860178053379059, + "learning_rate": 2.8651690150967748e-05, + "loss": 0.0085, + "step": 658 + }, + { + "epoch": 1.269879518072289, + "grad_norm": 0.21175967156887054, + "learning_rate": 2.8611233898116967e-05, + "loss": 0.0071, + "step": 659 + }, + { + "epoch": 1.2718072289156628, + "grad_norm": 0.33726972341537476, + "learning_rate": 2.85707343451882e-05, + "loss": 0.012, + "step": 660 + }, + { + "epoch": 1.273734939759036, + "grad_norm": 0.2138456553220749, + "learning_rate": 2.853019169582635e-05, + "loss": 0.0092, + "step": 661 + }, + { + "epoch": 1.2756626506024096, + "grad_norm": 0.2304934412240982, + "learning_rate": 2.8489606153892997e-05, + "loss": 0.0144, + "step": 662 + }, + { + "epoch": 1.277590361445783, + "grad_norm": 0.2691061794757843, + "learning_rate": 2.8448977923465425e-05, + "loss": 0.0121, + "step": 663 + }, + { + "epoch": 1.2795180722891566, + "grad_norm": 0.35254305601119995, + "learning_rate": 2.840830720883555e-05, + "loss": 0.0125, + "step": 664 + }, + { + "epoch": 1.28144578313253, + "grad_norm": 0.36552608013153076, + "learning_rate": 2.836759421450893e-05, + "loss": 0.021, + "step": 665 + }, + { + "epoch": 1.2833734939759036, + "grad_norm": 0.37177154421806335, + "learning_rate": 2.83268391452037e-05, + "loss": 0.0216, + "step": 666 + }, + { + "epoch": 1.2853012048192771, + "grad_norm": 0.20932547748088837, + "learning_rate": 2.828604220584958e-05, + "loss": 0.0077, + "step": 667 + }, + { + "epoch": 1.2872289156626506, + "grad_norm": 0.5158557295799255, + "learning_rate": 2.824520360158681e-05, + "loss": 0.0394, + "step": 668 + }, + { + "epoch": 1.2891566265060241, + "grad_norm": 0.22623969614505768, + "learning_rate": 2.820432353776515e-05, + "loss": 0.0087, + "step": 669 + }, + { + "epoch": 1.2910843373493976, + "grad_norm": 0.2996046245098114, + "learning_rate": 2.8163402219942822e-05, + "loss": 0.01, + "step": 670 + }, + { + "epoch": 1.2930120481927712, + "grad_norm": 0.24957989156246185, + "learning_rate": 2.8122439853885488e-05, + "loss": 0.0127, + "step": 671 + }, + { + "epoch": 1.2949397590361444, + "grad_norm": 0.2636559307575226, + "learning_rate": 2.8081436645565216e-05, + "loss": 0.0128, + "step": 672 + }, + { + "epoch": 1.2968674698795182, + "grad_norm": 0.3531591296195984, + "learning_rate": 2.804039280115944e-05, + "loss": 0.0199, + "step": 673 + }, + { + "epoch": 1.2987951807228915, + "grad_norm": 0.3682299852371216, + "learning_rate": 2.7999308527049927e-05, + "loss": 0.0088, + "step": 674 + }, + { + "epoch": 1.3007228915662652, + "grad_norm": 0.19555217027664185, + "learning_rate": 2.795818402982174e-05, + "loss": 0.0084, + "step": 675 + }, + { + "epoch": 1.3026506024096385, + "grad_norm": 0.2864912450313568, + "learning_rate": 2.7917019516262186e-05, + "loss": 0.0154, + "step": 676 + }, + { + "epoch": 1.304578313253012, + "grad_norm": 0.2211237996816635, + "learning_rate": 2.78758151933598e-05, + "loss": 0.0078, + "step": 677 + }, + { + "epoch": 1.3065060240963855, + "grad_norm": 0.13646945357322693, + "learning_rate": 2.7834571268303294e-05, + "loss": 0.0058, + "step": 678 + }, + { + "epoch": 1.308433734939759, + "grad_norm": 0.16530285775661469, + "learning_rate": 2.779328794848049e-05, + "loss": 0.007, + "step": 679 + }, + { + "epoch": 1.3103614457831325, + "grad_norm": 0.2145693302154541, + "learning_rate": 2.7751965441477325e-05, + "loss": 0.0203, + "step": 680 + }, + { + "epoch": 1.312289156626506, + "grad_norm": 0.24273739755153656, + "learning_rate": 2.771060395507677e-05, + "loss": 0.0106, + "step": 681 + }, + { + "epoch": 1.3142168674698795, + "grad_norm": 0.20430618524551392, + "learning_rate": 2.7669203697257794e-05, + "loss": 0.0122, + "step": 682 + }, + { + "epoch": 1.316144578313253, + "grad_norm": 0.2502615749835968, + "learning_rate": 2.7627764876194335e-05, + "loss": 0.0101, + "step": 683 + }, + { + "epoch": 1.3180722891566266, + "grad_norm": 0.287239670753479, + "learning_rate": 2.7586287700254214e-05, + "loss": 0.0203, + "step": 684 + }, + { + "epoch": 1.32, + "grad_norm": 0.16239754855632782, + "learning_rate": 2.7544772377998147e-05, + "loss": 0.0084, + "step": 685 + }, + { + "epoch": 1.3219277108433736, + "grad_norm": 0.27174142003059387, + "learning_rate": 2.7503219118178636e-05, + "loss": 0.008, + "step": 686 + }, + { + "epoch": 1.3238554216867469, + "grad_norm": 0.12878240644931793, + "learning_rate": 2.7461628129738954e-05, + "loss": 0.0053, + "step": 687 + }, + { + "epoch": 1.3257831325301206, + "grad_norm": 0.16112515330314636, + "learning_rate": 2.7419999621812086e-05, + "loss": 0.0059, + "step": 688 + }, + { + "epoch": 1.3277108433734939, + "grad_norm": 0.2398834228515625, + "learning_rate": 2.7378333803719672e-05, + "loss": 0.0095, + "step": 689 + }, + { + "epoch": 1.3296385542168676, + "grad_norm": 0.18516193330287933, + "learning_rate": 2.733663088497097e-05, + "loss": 0.0071, + "step": 690 + }, + { + "epoch": 1.331566265060241, + "grad_norm": 0.2974924147129059, + "learning_rate": 2.7294891075261785e-05, + "loss": 0.0227, + "step": 691 + }, + { + "epoch": 1.3334939759036144, + "grad_norm": 0.12931054830551147, + "learning_rate": 2.7253114584473418e-05, + "loss": 0.0039, + "step": 692 + }, + { + "epoch": 1.335421686746988, + "grad_norm": 0.16319474577903748, + "learning_rate": 2.7211301622671623e-05, + "loss": 0.008, + "step": 693 + }, + { + "epoch": 1.3373493975903614, + "grad_norm": 0.27622169256210327, + "learning_rate": 2.7169452400105533e-05, + "loss": 0.0238, + "step": 694 + }, + { + "epoch": 1.339277108433735, + "grad_norm": 0.45309779047966003, + "learning_rate": 2.712756712720663e-05, + "loss": 0.0439, + "step": 695 + }, + { + "epoch": 1.3412048192771084, + "grad_norm": 0.2469855099916458, + "learning_rate": 2.708564601458765e-05, + "loss": 0.0085, + "step": 696 + }, + { + "epoch": 1.343132530120482, + "grad_norm": 0.4245856702327728, + "learning_rate": 2.7043689273041535e-05, + "loss": 0.0097, + "step": 697 + }, + { + "epoch": 1.3450602409638555, + "grad_norm": 0.26796087622642517, + "learning_rate": 2.7001697113540414e-05, + "loss": 0.0119, + "step": 698 + }, + { + "epoch": 1.346987951807229, + "grad_norm": 0.3569283187389374, + "learning_rate": 2.6959669747234482e-05, + "loss": 0.0096, + "step": 699 + }, + { + "epoch": 1.3489156626506025, + "grad_norm": 0.7038524150848389, + "learning_rate": 2.6917607385450973e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 1.350843373493976, + "grad_norm": 0.23568563163280487, + "learning_rate": 2.687551023969308e-05, + "loss": 0.0112, + "step": 701 + }, + { + "epoch": 1.3527710843373493, + "grad_norm": 0.20338499546051025, + "learning_rate": 2.6833378521638935e-05, + "loss": 0.0092, + "step": 702 + }, + { + "epoch": 1.354698795180723, + "grad_norm": 4.22187614440918, + "learning_rate": 2.679121244314046e-05, + "loss": 0.0314, + "step": 703 + }, + { + "epoch": 1.3566265060240963, + "grad_norm": 0.2542206048965454, + "learning_rate": 2.674901221622239e-05, + "loss": 0.0158, + "step": 704 + }, + { + "epoch": 1.3585542168674698, + "grad_norm": 0.49705010652542114, + "learning_rate": 2.670677805308116e-05, + "loss": 0.0162, + "step": 705 + }, + { + "epoch": 1.3604819277108433, + "grad_norm": 0.17502115666866302, + "learning_rate": 2.666451016608383e-05, + "loss": 0.0074, + "step": 706 + }, + { + "epoch": 1.3624096385542168, + "grad_norm": 0.21738742291927338, + "learning_rate": 2.6622208767767075e-05, + "loss": 0.0135, + "step": 707 + }, + { + "epoch": 1.3643373493975903, + "grad_norm": 0.3309847414493561, + "learning_rate": 2.6579874070836032e-05, + "loss": 0.0107, + "step": 708 + }, + { + "epoch": 1.3662650602409638, + "grad_norm": 0.10706827789545059, + "learning_rate": 2.6537506288163303e-05, + "loss": 0.0043, + "step": 709 + }, + { + "epoch": 1.3681927710843373, + "grad_norm": 0.173640176653862, + "learning_rate": 2.6495105632787835e-05, + "loss": 0.0092, + "step": 710 + }, + { + "epoch": 1.3701204819277109, + "grad_norm": 0.2636397182941437, + "learning_rate": 2.6452672317913893e-05, + "loss": 0.0097, + "step": 711 + }, + { + "epoch": 1.3720481927710844, + "grad_norm": 0.28485360741615295, + "learning_rate": 2.6410206556909943e-05, + "loss": 0.0193, + "step": 712 + }, + { + "epoch": 1.3739759036144579, + "grad_norm": 0.23210027813911438, + "learning_rate": 2.636770856330761e-05, + "loss": 0.0229, + "step": 713 + }, + { + "epoch": 1.3759036144578314, + "grad_norm": 0.13388316333293915, + "learning_rate": 2.6325178550800596e-05, + "loss": 0.004, + "step": 714 + }, + { + "epoch": 1.377831325301205, + "grad_norm": 0.5131422877311707, + "learning_rate": 2.6282616733243603e-05, + "loss": 0.0137, + "step": 715 + }, + { + "epoch": 1.3797590361445784, + "grad_norm": 0.3243267834186554, + "learning_rate": 2.6240023324651258e-05, + "loss": 0.0153, + "step": 716 + }, + { + "epoch": 1.3816867469879517, + "grad_norm": 0.1440611034631729, + "learning_rate": 2.619739853919704e-05, + "loss": 0.0031, + "step": 717 + }, + { + "epoch": 1.3836144578313254, + "grad_norm": 0.30346596240997314, + "learning_rate": 2.6154742591212196e-05, + "loss": 0.0109, + "step": 718 + }, + { + "epoch": 1.3855421686746987, + "grad_norm": 0.19109240174293518, + "learning_rate": 2.611205569518468e-05, + "loss": 0.0094, + "step": 719 + }, + { + "epoch": 1.3874698795180722, + "grad_norm": 0.28636518120765686, + "learning_rate": 2.6069338065758056e-05, + "loss": 0.0123, + "step": 720 + }, + { + "epoch": 1.3893975903614457, + "grad_norm": 0.28083911538124084, + "learning_rate": 2.6026589917730416e-05, + "loss": 0.0104, + "step": 721 + }, + { + "epoch": 1.3913253012048192, + "grad_norm": 0.36553966999053955, + "learning_rate": 2.5983811466053327e-05, + "loss": 0.0143, + "step": 722 + }, + { + "epoch": 1.3932530120481927, + "grad_norm": 0.23317205905914307, + "learning_rate": 2.5941002925830708e-05, + "loss": 0.011, + "step": 723 + }, + { + "epoch": 1.3951807228915662, + "grad_norm": 0.3825171887874603, + "learning_rate": 2.589816451231781e-05, + "loss": 0.0098, + "step": 724 + }, + { + "epoch": 1.3971084337349398, + "grad_norm": 0.19916608929634094, + "learning_rate": 2.585529644092006e-05, + "loss": 0.0094, + "step": 725 + }, + { + "epoch": 1.3990361445783133, + "grad_norm": 0.19990523159503937, + "learning_rate": 2.5812398927192027e-05, + "loss": 0.0128, + "step": 726 + }, + { + "epoch": 1.4009638554216868, + "grad_norm": 0.34662899374961853, + "learning_rate": 2.5769472186836347e-05, + "loss": 0.0091, + "step": 727 + }, + { + "epoch": 1.4028915662650603, + "grad_norm": 0.23481112718582153, + "learning_rate": 2.5726516435702583e-05, + "loss": 0.0154, + "step": 728 + }, + { + "epoch": 1.4048192771084338, + "grad_norm": 0.1846667379140854, + "learning_rate": 2.5683531889786194e-05, + "loss": 0.0088, + "step": 729 + }, + { + "epoch": 1.4067469879518073, + "grad_norm": 0.16717663407325745, + "learning_rate": 2.564051876522742e-05, + "loss": 0.0083, + "step": 730 + }, + { + "epoch": 1.4086746987951808, + "grad_norm": 0.4116475284099579, + "learning_rate": 2.5597477278310202e-05, + "loss": 0.0179, + "step": 731 + }, + { + "epoch": 1.410602409638554, + "grad_norm": 0.171807661652565, + "learning_rate": 2.5554407645461115e-05, + "loss": 0.0063, + "step": 732 + }, + { + "epoch": 1.4125301204819278, + "grad_norm": 0.1954439878463745, + "learning_rate": 2.5511310083248243e-05, + "loss": 0.017, + "step": 733 + }, + { + "epoch": 1.4144578313253011, + "grad_norm": 0.37158989906311035, + "learning_rate": 2.5468184808380104e-05, + "loss": 0.0173, + "step": 734 + }, + { + "epoch": 1.4163855421686746, + "grad_norm": 0.2001633644104004, + "learning_rate": 2.542503203770458e-05, + "loss": 0.0165, + "step": 735 + }, + { + "epoch": 1.4183132530120481, + "grad_norm": 0.45673373341560364, + "learning_rate": 2.53818519882078e-05, + "loss": 0.0185, + "step": 736 + }, + { + "epoch": 1.4202409638554216, + "grad_norm": 0.3838701546192169, + "learning_rate": 2.5338644877013067e-05, + "loss": 0.0134, + "step": 737 + }, + { + "epoch": 1.4221686746987952, + "grad_norm": 0.32032477855682373, + "learning_rate": 2.5295410921379745e-05, + "loss": 0.0143, + "step": 738 + }, + { + "epoch": 1.4240963855421687, + "grad_norm": 0.4594039022922516, + "learning_rate": 2.52521503387022e-05, + "loss": 0.0193, + "step": 739 + }, + { + "epoch": 1.4260240963855422, + "grad_norm": 0.3889620900154114, + "learning_rate": 2.5208863346508667e-05, + "loss": 0.0114, + "step": 740 + }, + { + "epoch": 1.4279518072289157, + "grad_norm": 0.33153319358825684, + "learning_rate": 2.5165550162460203e-05, + "loss": 0.0102, + "step": 741 + }, + { + "epoch": 1.4298795180722892, + "grad_norm": 0.7269518375396729, + "learning_rate": 2.5122211004349536e-05, + "loss": 0.0215, + "step": 742 + }, + { + "epoch": 1.4318072289156627, + "grad_norm": 0.31653261184692383, + "learning_rate": 2.5078846090100023e-05, + "loss": 0.0115, + "step": 743 + }, + { + "epoch": 1.4337349397590362, + "grad_norm": 0.20620353519916534, + "learning_rate": 2.5035455637764518e-05, + "loss": 0.0153, + "step": 744 + }, + { + "epoch": 1.4356626506024097, + "grad_norm": 0.17266008257865906, + "learning_rate": 2.4992039865524297e-05, + "loss": 0.0069, + "step": 745 + }, + { + "epoch": 1.4375903614457832, + "grad_norm": 0.24760811030864716, + "learning_rate": 2.494859899168795e-05, + "loss": 0.0108, + "step": 746 + }, + { + "epoch": 1.4395180722891565, + "grad_norm": 0.2584865391254425, + "learning_rate": 2.4905133234690282e-05, + "loss": 0.0095, + "step": 747 + }, + { + "epoch": 1.4414457831325302, + "grad_norm": 0.48847514390945435, + "learning_rate": 2.486164281309122e-05, + "loss": 0.0181, + "step": 748 + }, + { + "epoch": 1.4433734939759035, + "grad_norm": 0.42942047119140625, + "learning_rate": 2.4818127945574717e-05, + "loss": 0.025, + "step": 749 + }, + { + "epoch": 1.445301204819277, + "grad_norm": 0.23713800311088562, + "learning_rate": 2.4774588850947648e-05, + "loss": 0.0085, + "step": 750 + }, + { + "epoch": 1.4472289156626506, + "grad_norm": 0.8797569870948792, + "learning_rate": 2.473102574813871e-05, + "loss": 0.0097, + "step": 751 + }, + { + "epoch": 1.449156626506024, + "grad_norm": 0.2744862735271454, + "learning_rate": 2.4687438856197302e-05, + "loss": 0.0122, + "step": 752 + }, + { + "epoch": 1.4510843373493976, + "grad_norm": 0.12747010588645935, + "learning_rate": 2.4643828394292478e-05, + "loss": 0.0056, + "step": 753 + }, + { + "epoch": 1.453012048192771, + "grad_norm": 0.37376829981803894, + "learning_rate": 2.4600194581711775e-05, + "loss": 0.0052, + "step": 754 + }, + { + "epoch": 1.4549397590361446, + "grad_norm": 0.2536911368370056, + "learning_rate": 2.4556537637860176e-05, + "loss": 0.0113, + "step": 755 + }, + { + "epoch": 1.456867469879518, + "grad_norm": 0.25950780510902405, + "learning_rate": 2.451285778225894e-05, + "loss": 0.0099, + "step": 756 + }, + { + "epoch": 1.4587951807228916, + "grad_norm": 0.19535955786705017, + "learning_rate": 2.4469155234544565e-05, + "loss": 0.0069, + "step": 757 + }, + { + "epoch": 1.4607228915662651, + "grad_norm": 0.22816115617752075, + "learning_rate": 2.442543021446764e-05, + "loss": 0.0088, + "step": 758 + }, + { + "epoch": 1.4626506024096386, + "grad_norm": 0.3363986313343048, + "learning_rate": 2.4381682941891755e-05, + "loss": 0.0182, + "step": 759 + }, + { + "epoch": 1.464578313253012, + "grad_norm": 0.21492891013622284, + "learning_rate": 2.4337913636792382e-05, + "loss": 0.0069, + "step": 760 + }, + { + "epoch": 1.4665060240963856, + "grad_norm": 0.6070862412452698, + "learning_rate": 2.429412251925579e-05, + "loss": 0.0406, + "step": 761 + }, + { + "epoch": 1.468433734939759, + "grad_norm": 2.6469690799713135, + "learning_rate": 2.425030980947793e-05, + "loss": 0.0205, + "step": 762 + }, + { + "epoch": 1.4703614457831327, + "grad_norm": 0.30909740924835205, + "learning_rate": 2.420647572776332e-05, + "loss": 0.0136, + "step": 763 + }, + { + "epoch": 1.472289156626506, + "grad_norm": 0.6639553904533386, + "learning_rate": 2.416262049452395e-05, + "loss": 0.011, + "step": 764 + }, + { + "epoch": 1.4742168674698795, + "grad_norm": 0.2919616997241974, + "learning_rate": 2.4118744330278147e-05, + "loss": 0.0131, + "step": 765 + }, + { + "epoch": 1.476144578313253, + "grad_norm": 0.5232429504394531, + "learning_rate": 2.4074847455649523e-05, + "loss": 0.0138, + "step": 766 + }, + { + "epoch": 1.4780722891566265, + "grad_norm": 5.630630970001221, + "learning_rate": 2.403093009136579e-05, + "loss": 0.0264, + "step": 767 + }, + { + "epoch": 1.48, + "grad_norm": 0.33234721422195435, + "learning_rate": 2.3986992458257707e-05, + "loss": 0.0111, + "step": 768 + }, + { + "epoch": 1.4819277108433735, + "grad_norm": 0.28444772958755493, + "learning_rate": 2.3943034777257945e-05, + "loss": 0.0144, + "step": 769 + }, + { + "epoch": 1.483855421686747, + "grad_norm": 0.16229979693889618, + "learning_rate": 2.38990572694e-05, + "loss": 0.0062, + "step": 770 + }, + { + "epoch": 1.4857831325301205, + "grad_norm": 0.27474716305732727, + "learning_rate": 2.385506015581704e-05, + "loss": 0.0172, + "step": 771 + }, + { + "epoch": 1.487710843373494, + "grad_norm": 0.246526300907135, + "learning_rate": 2.381104365774083e-05, + "loss": 0.012, + "step": 772 + }, + { + "epoch": 1.4896385542168675, + "grad_norm": 0.282047837972641, + "learning_rate": 2.37670079965006e-05, + "loss": 0.0116, + "step": 773 + }, + { + "epoch": 1.491566265060241, + "grad_norm": 0.2878139317035675, + "learning_rate": 2.3722953393521944e-05, + "loss": 0.0147, + "step": 774 + }, + { + "epoch": 1.4934939759036143, + "grad_norm": 0.5586277842521667, + "learning_rate": 2.367888007032571e-05, + "loss": 0.0111, + "step": 775 + }, + { + "epoch": 1.495421686746988, + "grad_norm": 0.562160313129425, + "learning_rate": 2.3634788248526846e-05, + "loss": 0.0061, + "step": 776 + }, + { + "epoch": 1.4973493975903613, + "grad_norm": 0.3452005982398987, + "learning_rate": 2.3590678149833356e-05, + "loss": 0.0205, + "step": 777 + }, + { + "epoch": 1.499277108433735, + "grad_norm": 0.7757686376571655, + "learning_rate": 2.3546549996045114e-05, + "loss": 0.0273, + "step": 778 + }, + { + "epoch": 1.5012048192771084, + "grad_norm": 0.19530551135540009, + "learning_rate": 2.3502404009052812e-05, + "loss": 0.0083, + "step": 779 + }, + { + "epoch": 1.503132530120482, + "grad_norm": 0.2586531639099121, + "learning_rate": 2.3458240410836775e-05, + "loss": 0.0122, + "step": 780 + }, + { + "epoch": 1.5050602409638554, + "grad_norm": 0.30063286423683167, + "learning_rate": 2.3414059423465924e-05, + "loss": 0.0083, + "step": 781 + }, + { + "epoch": 1.5069879518072289, + "grad_norm": 0.18663185834884644, + "learning_rate": 2.3369861269096575e-05, + "loss": 0.0104, + "step": 782 + }, + { + "epoch": 1.5089156626506024, + "grad_norm": 0.4405941069126129, + "learning_rate": 2.3325646169971416e-05, + "loss": 0.0264, + "step": 783 + }, + { + "epoch": 1.510843373493976, + "grad_norm": 0.2947913110256195, + "learning_rate": 2.3281414348418294e-05, + "loss": 0.0107, + "step": 784 + }, + { + "epoch": 1.5127710843373494, + "grad_norm": 0.23813778162002563, + "learning_rate": 2.3237166026849158e-05, + "loss": 0.0084, + "step": 785 + }, + { + "epoch": 1.514698795180723, + "grad_norm": 0.33380329608917236, + "learning_rate": 2.3192901427758932e-05, + "loss": 0.0111, + "step": 786 + }, + { + "epoch": 1.5166265060240964, + "grad_norm": 0.3736988306045532, + "learning_rate": 2.314862077372438e-05, + "loss": 0.0135, + "step": 787 + }, + { + "epoch": 1.5185542168674697, + "grad_norm": 0.3785395920276642, + "learning_rate": 2.3104324287402996e-05, + "loss": 0.0265, + "step": 788 + }, + { + "epoch": 1.5204819277108435, + "grad_norm": 0.3359154462814331, + "learning_rate": 2.3060012191531885e-05, + "loss": 0.0127, + "step": 789 + }, + { + "epoch": 1.5224096385542167, + "grad_norm": 0.720753014087677, + "learning_rate": 2.301568470892664e-05, + "loss": 0.0134, + "step": 790 + }, + { + "epoch": 1.5243373493975905, + "grad_norm": 0.36473193764686584, + "learning_rate": 2.297134206248024e-05, + "loss": 0.0318, + "step": 791 + }, + { + "epoch": 1.5262650602409638, + "grad_norm": 0.29987087845802307, + "learning_rate": 2.2926984475161884e-05, + "loss": 0.008, + "step": 792 + }, + { + "epoch": 1.5281927710843375, + "grad_norm": 0.2883112132549286, + "learning_rate": 2.2882612170015914e-05, + "loss": 0.0125, + "step": 793 + }, + { + "epoch": 1.5301204819277108, + "grad_norm": 0.28983229398727417, + "learning_rate": 2.2838225370160682e-05, + "loss": 0.0155, + "step": 794 + }, + { + "epoch": 1.5320481927710843, + "grad_norm": 0.47236886620521545, + "learning_rate": 2.2793824298787414e-05, + "loss": 0.0132, + "step": 795 + }, + { + "epoch": 1.5339759036144578, + "grad_norm": 0.8328865170478821, + "learning_rate": 2.2749409179159104e-05, + "loss": 0.026, + "step": 796 + }, + { + "epoch": 1.5359036144578313, + "grad_norm": 0.3129172623157501, + "learning_rate": 2.2704980234609396e-05, + "loss": 0.0099, + "step": 797 + }, + { + "epoch": 1.5378313253012048, + "grad_norm": 0.22284500300884247, + "learning_rate": 2.2660537688541416e-05, + "loss": 0.009, + "step": 798 + }, + { + "epoch": 1.5397590361445783, + "grad_norm": 0.3346405625343323, + "learning_rate": 2.2616081764426726e-05, + "loss": 0.0077, + "step": 799 + }, + { + "epoch": 1.5416867469879518, + "grad_norm": 0.2923565208911896, + "learning_rate": 2.2571612685804124e-05, + "loss": 0.0119, + "step": 800 + }, + { + "epoch": 1.5436144578313253, + "grad_norm": 0.1921311914920807, + "learning_rate": 2.252713067627857e-05, + "loss": 0.0083, + "step": 801 + }, + { + "epoch": 1.5455421686746988, + "grad_norm": 0.23221106827259064, + "learning_rate": 2.2482635959520044e-05, + "loss": 0.0049, + "step": 802 + }, + { + "epoch": 1.5474698795180721, + "grad_norm": 0.6340724229812622, + "learning_rate": 2.243812875926241e-05, + "loss": 0.0273, + "step": 803 + }, + { + "epoch": 1.5493975903614459, + "grad_norm": 0.2699439823627472, + "learning_rate": 2.2393609299302314e-05, + "loss": 0.0108, + "step": 804 + }, + { + "epoch": 1.5513253012048192, + "grad_norm": 0.2005189210176468, + "learning_rate": 2.2349077803498052e-05, + "loss": 0.0076, + "step": 805 + }, + { + "epoch": 1.5532530120481929, + "grad_norm": 0.39668548107147217, + "learning_rate": 2.230453449576842e-05, + "loss": 0.0135, + "step": 806 + }, + { + "epoch": 1.5551807228915662, + "grad_norm": 0.2406950294971466, + "learning_rate": 2.2259979600091635e-05, + "loss": 0.0094, + "step": 807 + }, + { + "epoch": 1.55710843373494, + "grad_norm": 0.30363157391548157, + "learning_rate": 2.2215413340504158e-05, + "loss": 0.0178, + "step": 808 + }, + { + "epoch": 1.5590361445783132, + "grad_norm": 0.19508181512355804, + "learning_rate": 2.2170835941099605e-05, + "loss": 0.0069, + "step": 809 + }, + { + "epoch": 1.5609638554216867, + "grad_norm": 0.734106719493866, + "learning_rate": 2.2126247626027615e-05, + "loss": 0.0319, + "step": 810 + }, + { + "epoch": 1.5628915662650602, + "grad_norm": 0.2591583728790283, + "learning_rate": 2.208164861949268e-05, + "loss": 0.0168, + "step": 811 + }, + { + "epoch": 1.5648192771084337, + "grad_norm": 0.2386734038591385, + "learning_rate": 2.20370391457531e-05, + "loss": 0.0041, + "step": 812 + }, + { + "epoch": 1.5667469879518072, + "grad_norm": 0.1675218939781189, + "learning_rate": 2.1992419429119764e-05, + "loss": 0.0078, + "step": 813 + }, + { + "epoch": 1.5686746987951807, + "grad_norm": 0.45591506361961365, + "learning_rate": 2.1947789693955097e-05, + "loss": 0.0166, + "step": 814 + }, + { + "epoch": 1.5706024096385542, + "grad_norm": 0.46940621733665466, + "learning_rate": 2.190315016467188e-05, + "loss": 0.0176, + "step": 815 + }, + { + "epoch": 1.5725301204819278, + "grad_norm": 0.2294205278158188, + "learning_rate": 2.1858501065732146e-05, + "loss": 0.0102, + "step": 816 + }, + { + "epoch": 1.5744578313253013, + "grad_norm": 0.28922322392463684, + "learning_rate": 2.181384262164606e-05, + "loss": 0.0111, + "step": 817 + }, + { + "epoch": 1.5763855421686745, + "grad_norm": 0.19650064408779144, + "learning_rate": 2.1769175056970765e-05, + "loss": 0.0076, + "step": 818 + }, + { + "epoch": 1.5783132530120483, + "grad_norm": 0.19538825750350952, + "learning_rate": 2.172449859630927e-05, + "loss": 0.0118, + "step": 819 + }, + { + "epoch": 1.5802409638554216, + "grad_norm": 0.1900389939546585, + "learning_rate": 2.167981346430931e-05, + "loss": 0.0066, + "step": 820 + }, + { + "epoch": 1.5821686746987953, + "grad_norm": 0.21593710780143738, + "learning_rate": 2.1635119885662235e-05, + "loss": 0.0101, + "step": 821 + }, + { + "epoch": 1.5840963855421686, + "grad_norm": 0.2699289321899414, + "learning_rate": 2.159041808510185e-05, + "loss": 0.0118, + "step": 822 + }, + { + "epoch": 1.5860240963855423, + "grad_norm": 0.31867673993110657, + "learning_rate": 2.1545708287403322e-05, + "loss": 0.0122, + "step": 823 + }, + { + "epoch": 1.5879518072289156, + "grad_norm": 0.2862400412559509, + "learning_rate": 2.1500990717382004e-05, + "loss": 0.0216, + "step": 824 + }, + { + "epoch": 1.589879518072289, + "grad_norm": 0.28482481837272644, + "learning_rate": 2.145626559989237e-05, + "loss": 0.0136, + "step": 825 + }, + { + "epoch": 1.5918072289156626, + "grad_norm": 0.2866958975791931, + "learning_rate": 2.1411533159826803e-05, + "loss": 0.0298, + "step": 826 + }, + { + "epoch": 1.5937349397590361, + "grad_norm": 0.39092838764190674, + "learning_rate": 2.1366793622114533e-05, + "loss": 0.0382, + "step": 827 + }, + { + "epoch": 1.5956626506024096, + "grad_norm": 0.16381537914276123, + "learning_rate": 2.1322047211720468e-05, + "loss": 0.0074, + "step": 828 + }, + { + "epoch": 1.5975903614457831, + "grad_norm": 0.22146940231323242, + "learning_rate": 2.1277294153644083e-05, + "loss": 0.0103, + "step": 829 + }, + { + "epoch": 1.5995180722891567, + "grad_norm": 0.2155209183692932, + "learning_rate": 2.123253467291827e-05, + "loss": 0.0095, + "step": 830 + }, + { + "epoch": 1.6014457831325302, + "grad_norm": 0.41510409116744995, + "learning_rate": 2.118776899460822e-05, + "loss": 0.0457, + "step": 831 + }, + { + "epoch": 1.6033734939759037, + "grad_norm": 0.19718150794506073, + "learning_rate": 2.1142997343810293e-05, + "loss": 0.0192, + "step": 832 + }, + { + "epoch": 1.605301204819277, + "grad_norm": 0.40924403071403503, + "learning_rate": 2.1098219945650865e-05, + "loss": 0.0278, + "step": 833 + }, + { + "epoch": 1.6072289156626507, + "grad_norm": 0.18657824397087097, + "learning_rate": 2.105343702528524e-05, + "loss": 0.0076, + "step": 834 + }, + { + "epoch": 1.609156626506024, + "grad_norm": 0.1727641075849533, + "learning_rate": 2.100864880789645e-05, + "loss": 0.0076, + "step": 835 + }, + { + "epoch": 1.6110843373493977, + "grad_norm": 0.18138745427131653, + "learning_rate": 2.0963855518694203e-05, + "loss": 0.005, + "step": 836 + }, + { + "epoch": 1.613012048192771, + "grad_norm": 0.19173955917358398, + "learning_rate": 2.0919057382913675e-05, + "loss": 0.0084, + "step": 837 + }, + { + "epoch": 1.6149397590361447, + "grad_norm": 0.3812403380870819, + "learning_rate": 2.0874254625814435e-05, + "loss": 0.009, + "step": 838 + }, + { + "epoch": 1.616867469879518, + "grad_norm": 0.2009759545326233, + "learning_rate": 2.0829447472679285e-05, + "loss": 0.0098, + "step": 839 + }, + { + "epoch": 1.6187951807228915, + "grad_norm": 0.48703446984291077, + "learning_rate": 2.0784636148813124e-05, + "loss": 0.0099, + "step": 840 + }, + { + "epoch": 1.620722891566265, + "grad_norm": 0.28995075821876526, + "learning_rate": 2.0739820879541827e-05, + "loss": 0.0075, + "step": 841 + }, + { + "epoch": 1.6226506024096385, + "grad_norm": 0.2130059450864792, + "learning_rate": 2.069500189021111e-05, + "loss": 0.007, + "step": 842 + }, + { + "epoch": 1.624578313253012, + "grad_norm": 0.252524733543396, + "learning_rate": 2.0650179406185397e-05, + "loss": 0.0249, + "step": 843 + }, + { + "epoch": 1.6265060240963856, + "grad_norm": 0.23069098591804504, + "learning_rate": 2.060535365284668e-05, + "loss": 0.0084, + "step": 844 + }, + { + "epoch": 1.628433734939759, + "grad_norm": 0.25051403045654297, + "learning_rate": 2.056052485559338e-05, + "loss": 0.0071, + "step": 845 + }, + { + "epoch": 1.6303614457831326, + "grad_norm": 0.27664798498153687, + "learning_rate": 2.051569323983924e-05, + "loss": 0.0198, + "step": 846 + }, + { + "epoch": 1.632289156626506, + "grad_norm": 0.2954922318458557, + "learning_rate": 2.047085903101218e-05, + "loss": 0.006, + "step": 847 + }, + { + "epoch": 1.6342168674698794, + "grad_norm": 0.28477591276168823, + "learning_rate": 2.0426022454553137e-05, + "loss": 0.0147, + "step": 848 + }, + { + "epoch": 1.636144578313253, + "grad_norm": 0.2785305678844452, + "learning_rate": 2.0381183735914968e-05, + "loss": 0.0117, + "step": 849 + }, + { + "epoch": 1.6380722891566264, + "grad_norm": 0.2500309348106384, + "learning_rate": 2.0336343100561295e-05, + "loss": 0.008, + "step": 850 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.18932047486305237, + "learning_rate": 2.0291500773965392e-05, + "loss": 0.0256, + "step": 851 + }, + { + "epoch": 1.6419277108433734, + "grad_norm": 0.6396257877349854, + "learning_rate": 2.0246656981609013e-05, + "loss": 0.0141, + "step": 852 + }, + { + "epoch": 1.6438554216867471, + "grad_norm": 0.5072891116142273, + "learning_rate": 2.02018119489813e-05, + "loss": 0.008, + "step": 853 + }, + { + "epoch": 1.6457831325301204, + "grad_norm": 0.2920839488506317, + "learning_rate": 2.0156965901577635e-05, + "loss": 0.0085, + "step": 854 + }, + { + "epoch": 1.647710843373494, + "grad_norm": 0.1391262263059616, + "learning_rate": 2.011211906489848e-05, + "loss": 0.0078, + "step": 855 + }, + { + "epoch": 1.6496385542168674, + "grad_norm": 0.29620468616485596, + "learning_rate": 2.00672716644483e-05, + "loss": 0.0109, + "step": 856 + }, + { + "epoch": 1.651566265060241, + "grad_norm": 0.13946573436260223, + "learning_rate": 2.002242392573436e-05, + "loss": 0.0076, + "step": 857 + }, + { + "epoch": 1.6534939759036145, + "grad_norm": 0.9766128659248352, + "learning_rate": 1.997757607426565e-05, + "loss": 0.0309, + "step": 858 + }, + { + "epoch": 1.655421686746988, + "grad_norm": 0.18002203106880188, + "learning_rate": 1.9932728335551702e-05, + "loss": 0.0072, + "step": 859 + }, + { + "epoch": 1.6573493975903615, + "grad_norm": 0.28073111176490784, + "learning_rate": 1.988788093510152e-05, + "loss": 0.0246, + "step": 860 + }, + { + "epoch": 1.659277108433735, + "grad_norm": 0.1919957399368286, + "learning_rate": 1.9843034098422375e-05, + "loss": 0.0087, + "step": 861 + }, + { + "epoch": 1.6612048192771085, + "grad_norm": 0.1825258433818817, + "learning_rate": 1.9798188051018705e-05, + "loss": 0.0092, + "step": 862 + }, + { + "epoch": 1.6631325301204818, + "grad_norm": 0.32412952184677124, + "learning_rate": 1.9753343018390997e-05, + "loss": 0.0118, + "step": 863 + }, + { + "epoch": 1.6650602409638555, + "grad_norm": 0.12828563153743744, + "learning_rate": 1.9708499226034618e-05, + "loss": 0.0056, + "step": 864 + }, + { + "epoch": 1.6669879518072288, + "grad_norm": 0.18647560477256775, + "learning_rate": 1.966365689943871e-05, + "loss": 0.0094, + "step": 865 + }, + { + "epoch": 1.6689156626506025, + "grad_norm": 0.19835828244686127, + "learning_rate": 1.9618816264085042e-05, + "loss": 0.0097, + "step": 866 + }, + { + "epoch": 1.6708433734939758, + "grad_norm": 0.22364282608032227, + "learning_rate": 1.957397754544687e-05, + "loss": 0.0062, + "step": 867 + }, + { + "epoch": 1.6727710843373496, + "grad_norm": 0.29420018196105957, + "learning_rate": 1.952914096898783e-05, + "loss": 0.0182, + "step": 868 + }, + { + "epoch": 1.6746987951807228, + "grad_norm": 0.2149929702281952, + "learning_rate": 1.9484306760160766e-05, + "loss": 0.0125, + "step": 869 + }, + { + "epoch": 1.6766265060240964, + "grad_norm": 0.16844330728054047, + "learning_rate": 1.9439475144406623e-05, + "loss": 0.0074, + "step": 870 + }, + { + "epoch": 1.6785542168674699, + "grad_norm": 0.5010282397270203, + "learning_rate": 1.9394646347153334e-05, + "loss": 0.0213, + "step": 871 + }, + { + "epoch": 1.6804819277108434, + "grad_norm": 0.29847195744514465, + "learning_rate": 1.9349820593814606e-05, + "loss": 0.0173, + "step": 872 + }, + { + "epoch": 1.6824096385542169, + "grad_norm": 0.23835812509059906, + "learning_rate": 1.930499810978889e-05, + "loss": 0.011, + "step": 873 + }, + { + "epoch": 1.6843373493975904, + "grad_norm": 0.3269020617008209, + "learning_rate": 1.9260179120458177e-05, + "loss": 0.0285, + "step": 874 + }, + { + "epoch": 1.686265060240964, + "grad_norm": 0.2142144739627838, + "learning_rate": 1.9215363851186883e-05, + "loss": 0.0146, + "step": 875 + }, + { + "epoch": 1.6881927710843372, + "grad_norm": 0.3098377585411072, + "learning_rate": 1.9170552527320725e-05, + "loss": 0.0104, + "step": 876 + }, + { + "epoch": 1.690120481927711, + "grad_norm": 0.22504115104675293, + "learning_rate": 1.9125745374185568e-05, + "loss": 0.0091, + "step": 877 + }, + { + "epoch": 1.6920481927710842, + "grad_norm": 0.20633333921432495, + "learning_rate": 1.908094261708633e-05, + "loss": 0.0097, + "step": 878 + }, + { + "epoch": 1.693975903614458, + "grad_norm": 1.179566502571106, + "learning_rate": 1.9036144481305807e-05, + "loss": 0.0143, + "step": 879 + }, + { + "epoch": 1.6959036144578312, + "grad_norm": 0.15525613725185394, + "learning_rate": 1.8991351192103554e-05, + "loss": 0.0062, + "step": 880 + }, + { + "epoch": 1.697831325301205, + "grad_norm": 0.15966367721557617, + "learning_rate": 1.8946562974714763e-05, + "loss": 0.0048, + "step": 881 + }, + { + "epoch": 1.6997590361445782, + "grad_norm": 0.18902607262134552, + "learning_rate": 1.890178005434914e-05, + "loss": 0.0124, + "step": 882 + }, + { + "epoch": 1.701686746987952, + "grad_norm": 0.21692413091659546, + "learning_rate": 1.885700265618971e-05, + "loss": 0.0135, + "step": 883 + }, + { + "epoch": 1.7036144578313253, + "grad_norm": 0.38948455452919006, + "learning_rate": 1.8812231005391786e-05, + "loss": 0.0365, + "step": 884 + }, + { + "epoch": 1.7055421686746988, + "grad_norm": 0.2483491599559784, + "learning_rate": 1.8767465327081736e-05, + "loss": 0.0202, + "step": 885 + }, + { + "epoch": 1.7074698795180723, + "grad_norm": 0.15305832028388977, + "learning_rate": 1.872270584635592e-05, + "loss": 0.0035, + "step": 886 + }, + { + "epoch": 1.7093975903614458, + "grad_norm": 0.17794466018676758, + "learning_rate": 1.867795278827954e-05, + "loss": 0.0157, + "step": 887 + }, + { + "epoch": 1.7113253012048193, + "grad_norm": 0.1938813328742981, + "learning_rate": 1.863320637788547e-05, + "loss": 0.0071, + "step": 888 + }, + { + "epoch": 1.7132530120481928, + "grad_norm": 0.27061617374420166, + "learning_rate": 1.8588466840173207e-05, + "loss": 0.0347, + "step": 889 + }, + { + "epoch": 1.7151807228915663, + "grad_norm": 0.1541014313697815, + "learning_rate": 1.8543734400107637e-05, + "loss": 0.006, + "step": 890 + }, + { + "epoch": 1.7171084337349396, + "grad_norm": 0.1436876654624939, + "learning_rate": 1.8499009282617996e-05, + "loss": 0.0059, + "step": 891 + }, + { + "epoch": 1.7190361445783133, + "grad_norm": 1.0573723316192627, + "learning_rate": 1.8454291712596688e-05, + "loss": 0.008, + "step": 892 + }, + { + "epoch": 1.7209638554216866, + "grad_norm": 0.15406259894371033, + "learning_rate": 1.8409581914898157e-05, + "loss": 0.0061, + "step": 893 + }, + { + "epoch": 1.7228915662650603, + "grad_norm": 0.24822913110256195, + "learning_rate": 1.836488011433777e-05, + "loss": 0.0085, + "step": 894 + }, + { + "epoch": 1.7248192771084336, + "grad_norm": 0.21049316227436066, + "learning_rate": 1.83201865356907e-05, + "loss": 0.0075, + "step": 895 + }, + { + "epoch": 1.7267469879518074, + "grad_norm": 0.24159866571426392, + "learning_rate": 1.8275501403690733e-05, + "loss": 0.0156, + "step": 896 + }, + { + "epoch": 1.7286746987951807, + "grad_norm": 0.3191063106060028, + "learning_rate": 1.823082494302924e-05, + "loss": 0.0218, + "step": 897 + }, + { + "epoch": 1.7306024096385542, + "grad_norm": 0.20296362042427063, + "learning_rate": 1.8186157378353945e-05, + "loss": 0.0126, + "step": 898 + }, + { + "epoch": 1.7325301204819277, + "grad_norm": 0.1905524581670761, + "learning_rate": 1.8141498934267858e-05, + "loss": 0.0131, + "step": 899 + }, + { + "epoch": 1.7344578313253012, + "grad_norm": 0.5350520610809326, + "learning_rate": 1.809684983532813e-05, + "loss": 0.0115, + "step": 900 + }, + { + "epoch": 1.7363855421686747, + "grad_norm": 0.17144092917442322, + "learning_rate": 1.8052210306044907e-05, + "loss": 0.0113, + "step": 901 + }, + { + "epoch": 1.7383132530120482, + "grad_norm": 0.11777982115745544, + "learning_rate": 1.8007580570880236e-05, + "loss": 0.0058, + "step": 902 + }, + { + "epoch": 1.7402409638554217, + "grad_norm": 0.2078275978565216, + "learning_rate": 1.7962960854246908e-05, + "loss": 0.0106, + "step": 903 + }, + { + "epoch": 1.7421686746987952, + "grad_norm": 0.2550877630710602, + "learning_rate": 1.791835138050732e-05, + "loss": 0.0076, + "step": 904 + }, + { + "epoch": 1.7440963855421687, + "grad_norm": 0.11553912609815598, + "learning_rate": 1.7873752373972395e-05, + "loss": 0.0038, + "step": 905 + }, + { + "epoch": 1.746024096385542, + "grad_norm": 0.10724586248397827, + "learning_rate": 1.7829164058900398e-05, + "loss": 0.0043, + "step": 906 + }, + { + "epoch": 1.7479518072289157, + "grad_norm": 0.30152231454849243, + "learning_rate": 1.7784586659495845e-05, + "loss": 0.0099, + "step": 907 + }, + { + "epoch": 1.749879518072289, + "grad_norm": 0.18372933566570282, + "learning_rate": 1.7740020399908372e-05, + "loss": 0.0074, + "step": 908 + }, + { + "epoch": 1.7518072289156628, + "grad_norm": 0.35184428095817566, + "learning_rate": 1.7695465504231586e-05, + "loss": 0.0184, + "step": 909 + }, + { + "epoch": 1.753734939759036, + "grad_norm": 0.15083615481853485, + "learning_rate": 1.765092219650196e-05, + "loss": 0.0061, + "step": 910 + }, + { + "epoch": 1.7556626506024098, + "grad_norm": 0.2599961459636688, + "learning_rate": 1.7606390700697693e-05, + "loss": 0.0101, + "step": 911 + }, + { + "epoch": 1.757590361445783, + "grad_norm": 0.10829206556081772, + "learning_rate": 1.7561871240737595e-05, + "loss": 0.0034, + "step": 912 + }, + { + "epoch": 1.7595180722891566, + "grad_norm": 0.38098782300949097, + "learning_rate": 1.7517364040479966e-05, + "loss": 0.0384, + "step": 913 + }, + { + "epoch": 1.76144578313253, + "grad_norm": 0.14975085854530334, + "learning_rate": 1.7472869323721432e-05, + "loss": 0.0055, + "step": 914 + }, + { + "epoch": 1.7633734939759036, + "grad_norm": 0.4151444733142853, + "learning_rate": 1.742838731419588e-05, + "loss": 0.0307, + "step": 915 + }, + { + "epoch": 1.765301204819277, + "grad_norm": 0.22238481044769287, + "learning_rate": 1.738391823557328e-05, + "loss": 0.0059, + "step": 916 + }, + { + "epoch": 1.7672289156626506, + "grad_norm": 0.23386356234550476, + "learning_rate": 1.7339462311458587e-05, + "loss": 0.0113, + "step": 917 + }, + { + "epoch": 1.7691566265060241, + "grad_norm": 0.21911191940307617, + "learning_rate": 1.7295019765390618e-05, + "loss": 0.0071, + "step": 918 + }, + { + "epoch": 1.7710843373493976, + "grad_norm": 0.343159943819046, + "learning_rate": 1.7250590820840903e-05, + "loss": 0.0144, + "step": 919 + }, + { + "epoch": 1.7730120481927711, + "grad_norm": 0.32204556465148926, + "learning_rate": 1.720617570121259e-05, + "loss": 0.0131, + "step": 920 + }, + { + "epoch": 1.7749397590361444, + "grad_norm": 0.4105585515499115, + "learning_rate": 1.7161774629839328e-05, + "loss": 0.0148, + "step": 921 + }, + { + "epoch": 1.7768674698795182, + "grad_norm": 0.16380974650382996, + "learning_rate": 1.7117387829984093e-05, + "loss": 0.0066, + "step": 922 + }, + { + "epoch": 1.7787951807228914, + "grad_norm": 0.22920913994312286, + "learning_rate": 1.707301552483813e-05, + "loss": 0.0105, + "step": 923 + }, + { + "epoch": 1.7807228915662652, + "grad_norm": 0.2075149267911911, + "learning_rate": 1.7028657937519767e-05, + "loss": 0.0104, + "step": 924 + }, + { + "epoch": 1.7826506024096385, + "grad_norm": 0.44439977407455444, + "learning_rate": 1.6984315291073355e-05, + "loss": 0.0134, + "step": 925 + }, + { + "epoch": 1.7845783132530122, + "grad_norm": 0.24068203568458557, + "learning_rate": 1.6939987808468125e-05, + "loss": 0.0078, + "step": 926 + }, + { + "epoch": 1.7865060240963855, + "grad_norm": 0.34044349193573, + "learning_rate": 1.689567571259701e-05, + "loss": 0.0108, + "step": 927 + }, + { + "epoch": 1.788433734939759, + "grad_norm": 0.34082743525505066, + "learning_rate": 1.6851379226275624e-05, + "loss": 0.0266, + "step": 928 + }, + { + "epoch": 1.7903614457831325, + "grad_norm": 0.19490115344524384, + "learning_rate": 1.6807098572241075e-05, + "loss": 0.0109, + "step": 929 + }, + { + "epoch": 1.792289156626506, + "grad_norm": 0.16208237409591675, + "learning_rate": 1.6762833973150846e-05, + "loss": 0.0113, + "step": 930 + }, + { + "epoch": 1.7942168674698795, + "grad_norm": 0.35555699467658997, + "learning_rate": 1.671858565158172e-05, + "loss": 0.0196, + "step": 931 + }, + { + "epoch": 1.796144578313253, + "grad_norm": 0.1600857824087143, + "learning_rate": 1.6674353830028587e-05, + "loss": 0.0089, + "step": 932 + }, + { + "epoch": 1.7980722891566265, + "grad_norm": 0.1699574887752533, + "learning_rate": 1.663013873090342e-05, + "loss": 0.0074, + "step": 933 + }, + { + "epoch": 1.8, + "grad_norm": 0.2472933828830719, + "learning_rate": 1.6585940576534086e-05, + "loss": 0.0063, + "step": 934 + }, + { + "epoch": 1.8019277108433736, + "grad_norm": 0.23491555452346802, + "learning_rate": 1.654175958916323e-05, + "loss": 0.0101, + "step": 935 + }, + { + "epoch": 1.8038554216867468, + "grad_norm": 0.28635191917419434, + "learning_rate": 1.6497595990947195e-05, + "loss": 0.0131, + "step": 936 + }, + { + "epoch": 1.8057831325301206, + "grad_norm": 0.15400712192058563, + "learning_rate": 1.645345000395489e-05, + "loss": 0.0068, + "step": 937 + }, + { + "epoch": 1.8077108433734939, + "grad_norm": 0.18223172426223755, + "learning_rate": 1.6409321850166647e-05, + "loss": 0.0094, + "step": 938 + }, + { + "epoch": 1.8096385542168676, + "grad_norm": 0.2789457142353058, + "learning_rate": 1.636521175147316e-05, + "loss": 0.0202, + "step": 939 + }, + { + "epoch": 1.8115662650602409, + "grad_norm": 0.4267627000808716, + "learning_rate": 1.6321119929674297e-05, + "loss": 0.0176, + "step": 940 + }, + { + "epoch": 1.8134939759036146, + "grad_norm": 0.3021615445613861, + "learning_rate": 1.6277046606478056e-05, + "loss": 0.0085, + "step": 941 + }, + { + "epoch": 1.815421686746988, + "grad_norm": 0.3724934756755829, + "learning_rate": 1.6232992003499405e-05, + "loss": 0.0474, + "step": 942 + }, + { + "epoch": 1.8173493975903614, + "grad_norm": 0.20904326438903809, + "learning_rate": 1.6188956342259177e-05, + "loss": 0.0078, + "step": 943 + }, + { + "epoch": 1.819277108433735, + "grad_norm": 0.31168171763420105, + "learning_rate": 1.614493984418297e-05, + "loss": 0.0174, + "step": 944 + }, + { + "epoch": 1.8212048192771084, + "grad_norm": 0.21273556351661682, + "learning_rate": 1.6100942730600003e-05, + "loss": 0.0054, + "step": 945 + }, + { + "epoch": 1.823132530120482, + "grad_norm": 0.16991695761680603, + "learning_rate": 1.6056965222742055e-05, + "loss": 0.0063, + "step": 946 + }, + { + "epoch": 1.8250602409638554, + "grad_norm": 0.22762684524059296, + "learning_rate": 1.6013007541742303e-05, + "loss": 0.0234, + "step": 947 + }, + { + "epoch": 1.826987951807229, + "grad_norm": 0.20128795504570007, + "learning_rate": 1.596906990863422e-05, + "loss": 0.0095, + "step": 948 + }, + { + "epoch": 1.8289156626506025, + "grad_norm": 0.30772027373313904, + "learning_rate": 1.592515254435048e-05, + "loss": 0.0356, + "step": 949 + }, + { + "epoch": 1.830843373493976, + "grad_norm": 0.12954631447792053, + "learning_rate": 1.5881255669721857e-05, + "loss": 0.008, + "step": 950 + }, + { + "epoch": 1.8327710843373493, + "grad_norm": 0.7787145972251892, + "learning_rate": 1.5837379505476054e-05, + "loss": 0.0108, + "step": 951 + }, + { + "epoch": 1.834698795180723, + "grad_norm": 0.1683879941701889, + "learning_rate": 1.5793524272236683e-05, + "loss": 0.006, + "step": 952 + }, + { + "epoch": 1.8366265060240963, + "grad_norm": 0.16475361585617065, + "learning_rate": 1.5749690190522076e-05, + "loss": 0.0065, + "step": 953 + }, + { + "epoch": 1.83855421686747, + "grad_norm": 0.211905375123024, + "learning_rate": 1.5705877480744214e-05, + "loss": 0.0092, + "step": 954 + }, + { + "epoch": 1.8404819277108433, + "grad_norm": 0.23850117623806, + "learning_rate": 1.5662086363207628e-05, + "loss": 0.012, + "step": 955 + }, + { + "epoch": 1.842409638554217, + "grad_norm": 0.19100065529346466, + "learning_rate": 1.561831705810825e-05, + "loss": 0.0113, + "step": 956 + }, + { + "epoch": 1.8443373493975903, + "grad_norm": 0.3635985255241394, + "learning_rate": 1.557456978553236e-05, + "loss": 0.0168, + "step": 957 + }, + { + "epoch": 1.8462650602409638, + "grad_norm": 0.16449116170406342, + "learning_rate": 1.553084476545544e-05, + "loss": 0.0042, + "step": 958 + }, + { + "epoch": 1.8481927710843373, + "grad_norm": 0.566093385219574, + "learning_rate": 1.5487142217741062e-05, + "loss": 0.0145, + "step": 959 + }, + { + "epoch": 1.8501204819277108, + "grad_norm": 0.15960252285003662, + "learning_rate": 1.5443462362139834e-05, + "loss": 0.0059, + "step": 960 + }, + { + "epoch": 1.8520481927710843, + "grad_norm": 0.40773797035217285, + "learning_rate": 1.539980541828823e-05, + "loss": 0.0257, + "step": 961 + }, + { + "epoch": 1.8539759036144579, + "grad_norm": 0.4802496135234833, + "learning_rate": 1.5356171605707522e-05, + "loss": 0.0111, + "step": 962 + }, + { + "epoch": 1.8559036144578314, + "grad_norm": 0.15745794773101807, + "learning_rate": 1.5312561143802704e-05, + "loss": 0.0049, + "step": 963 + }, + { + "epoch": 1.8578313253012049, + "grad_norm": 0.15139251947402954, + "learning_rate": 1.5268974251861298e-05, + "loss": 0.0077, + "step": 964 + }, + { + "epoch": 1.8597590361445784, + "grad_norm": 0.2188841849565506, + "learning_rate": 1.5225411149052356e-05, + "loss": 0.017, + "step": 965 + }, + { + "epoch": 1.8616867469879517, + "grad_norm": 0.10853131115436554, + "learning_rate": 1.5181872054425287e-05, + "loss": 0.0049, + "step": 966 + }, + { + "epoch": 1.8636144578313254, + "grad_norm": 0.8254880905151367, + "learning_rate": 1.5138357186908785e-05, + "loss": 0.0317, + "step": 967 + }, + { + "epoch": 1.8655421686746987, + "grad_norm": 0.2989620566368103, + "learning_rate": 1.5094866765309728e-05, + "loss": 0.0126, + "step": 968 + }, + { + "epoch": 1.8674698795180724, + "grad_norm": 0.16411150991916656, + "learning_rate": 1.5051401008312054e-05, + "loss": 0.0101, + "step": 969 + }, + { + "epoch": 1.8693975903614457, + "grad_norm": 0.2861763834953308, + "learning_rate": 1.5007960134475706e-05, + "loss": 0.0155, + "step": 970 + }, + { + "epoch": 1.8713253012048194, + "grad_norm": 0.24879588186740875, + "learning_rate": 1.4964544362235487e-05, + "loss": 0.0187, + "step": 971 + }, + { + "epoch": 1.8732530120481927, + "grad_norm": 0.2433672398328781, + "learning_rate": 1.4921153909899983e-05, + "loss": 0.0084, + "step": 972 + }, + { + "epoch": 1.8751807228915662, + "grad_norm": 0.15097154676914215, + "learning_rate": 1.487778899565047e-05, + "loss": 0.007, + "step": 973 + }, + { + "epoch": 1.8771084337349397, + "grad_norm": 0.1629047691822052, + "learning_rate": 1.4834449837539806e-05, + "loss": 0.0058, + "step": 974 + }, + { + "epoch": 1.8790361445783132, + "grad_norm": 0.9937071204185486, + "learning_rate": 1.4791136653491333e-05, + "loss": 0.0323, + "step": 975 + }, + { + "epoch": 1.8809638554216868, + "grad_norm": 0.19555562734603882, + "learning_rate": 1.4747849661297808e-05, + "loss": 0.0126, + "step": 976 + }, + { + "epoch": 1.8828915662650603, + "grad_norm": 0.16147711873054504, + "learning_rate": 1.470458907862026e-05, + "loss": 0.0067, + "step": 977 + }, + { + "epoch": 1.8848192771084338, + "grad_norm": 0.2730027735233307, + "learning_rate": 1.4661355122986945e-05, + "loss": 0.0147, + "step": 978 + }, + { + "epoch": 1.886746987951807, + "grad_norm": 0.13759832084178925, + "learning_rate": 1.4618148011792206e-05, + "loss": 0.0038, + "step": 979 + }, + { + "epoch": 1.8886746987951808, + "grad_norm": 0.33516690135002136, + "learning_rate": 1.4574967962295419e-05, + "loss": 0.0139, + "step": 980 + }, + { + "epoch": 1.890602409638554, + "grad_norm": 0.2345741093158722, + "learning_rate": 1.4531815191619903e-05, + "loss": 0.0094, + "step": 981 + }, + { + "epoch": 1.8925301204819278, + "grad_norm": 0.14681044220924377, + "learning_rate": 1.4488689916751762e-05, + "loss": 0.0065, + "step": 982 + }, + { + "epoch": 1.894457831325301, + "grad_norm": 0.21143914759159088, + "learning_rate": 1.4445592354538885e-05, + "loss": 0.0057, + "step": 983 + }, + { + "epoch": 1.8963855421686748, + "grad_norm": 0.3109160363674164, + "learning_rate": 1.44025227216898e-05, + "loss": 0.0142, + "step": 984 + }, + { + "epoch": 1.8983132530120481, + "grad_norm": 0.24301907420158386, + "learning_rate": 1.435948123477259e-05, + "loss": 0.012, + "step": 985 + }, + { + "epoch": 1.9002409638554218, + "grad_norm": 0.19817675650119781, + "learning_rate": 1.431646811021382e-05, + "loss": 0.0097, + "step": 986 + }, + { + "epoch": 1.9021686746987951, + "grad_norm": 0.13464932143688202, + "learning_rate": 1.4273483564297425e-05, + "loss": 0.0046, + "step": 987 + }, + { + "epoch": 1.9040963855421686, + "grad_norm": 0.1698642522096634, + "learning_rate": 1.4230527813163656e-05, + "loss": 0.0038, + "step": 988 + }, + { + "epoch": 1.9060240963855422, + "grad_norm": 0.19395388662815094, + "learning_rate": 1.4187601072807975e-05, + "loss": 0.0123, + "step": 989 + }, + { + "epoch": 1.9079518072289157, + "grad_norm": 0.2093188613653183, + "learning_rate": 1.4144703559079948e-05, + "loss": 0.0093, + "step": 990 + }, + { + "epoch": 1.9098795180722892, + "grad_norm": 0.1529311090707779, + "learning_rate": 1.4101835487682198e-05, + "loss": 0.0051, + "step": 991 + }, + { + "epoch": 1.9118072289156627, + "grad_norm": 0.18725350499153137, + "learning_rate": 1.4058997074169299e-05, + "loss": 0.0083, + "step": 992 + }, + { + "epoch": 1.9137349397590362, + "grad_norm": 0.15601560473442078, + "learning_rate": 1.401618853394668e-05, + "loss": 0.0086, + "step": 993 + }, + { + "epoch": 1.9156626506024095, + "grad_norm": 0.23890644311904907, + "learning_rate": 1.3973410082269591e-05, + "loss": 0.015, + "step": 994 + }, + { + "epoch": 1.9175903614457832, + "grad_norm": 0.2442619949579239, + "learning_rate": 1.3930661934241947e-05, + "loss": 0.0089, + "step": 995 + }, + { + "epoch": 1.9195180722891565, + "grad_norm": 0.1540212482213974, + "learning_rate": 1.388794430481532e-05, + "loss": 0.0072, + "step": 996 + }, + { + "epoch": 1.9214457831325302, + "grad_norm": 0.1359291970729828, + "learning_rate": 1.3845257408787807e-05, + "loss": 0.0131, + "step": 997 + }, + { + "epoch": 1.9233734939759035, + "grad_norm": 0.25486138463020325, + "learning_rate": 1.3802601460802967e-05, + "loss": 0.0198, + "step": 998 + }, + { + "epoch": 1.9253012048192772, + "grad_norm": 0.28815609216690063, + "learning_rate": 1.3759976675348754e-05, + "loss": 0.014, + "step": 999 + }, + { + "epoch": 1.9272289156626505, + "grad_norm": 0.15648497641086578, + "learning_rate": 1.3717383266756403e-05, + "loss": 0.0065, + "step": 1000 + }, + { + "epoch": 1.929156626506024, + "grad_norm": 0.16912540793418884, + "learning_rate": 1.367482144919941e-05, + "loss": 0.0059, + "step": 1001 + }, + { + "epoch": 1.9310843373493976, + "grad_norm": 0.16896723210811615, + "learning_rate": 1.3632291436692397e-05, + "loss": 0.0054, + "step": 1002 + }, + { + "epoch": 1.933012048192771, + "grad_norm": 0.20287497341632843, + "learning_rate": 1.3589793443090064e-05, + "loss": 0.0097, + "step": 1003 + }, + { + "epoch": 1.9349397590361446, + "grad_norm": 0.14804276823997498, + "learning_rate": 1.3547327682086114e-05, + "loss": 0.0125, + "step": 1004 + }, + { + "epoch": 1.936867469879518, + "grad_norm": 0.23820064961910248, + "learning_rate": 1.3504894367212171e-05, + "loss": 0.0131, + "step": 1005 + }, + { + "epoch": 1.9387951807228916, + "grad_norm": 0.25607362389564514, + "learning_rate": 1.34624937118367e-05, + "loss": 0.0115, + "step": 1006 + }, + { + "epoch": 1.940722891566265, + "grad_norm": 0.37233737111091614, + "learning_rate": 1.3420125929163976e-05, + "loss": 0.0309, + "step": 1007 + }, + { + "epoch": 1.9426506024096386, + "grad_norm": 0.19426730275154114, + "learning_rate": 1.3377791232232929e-05, + "loss": 0.0078, + "step": 1008 + }, + { + "epoch": 1.944578313253012, + "grad_norm": 0.2784160077571869, + "learning_rate": 1.333548983391617e-05, + "loss": 0.0142, + "step": 1009 + }, + { + "epoch": 1.9465060240963856, + "grad_norm": 0.11407195776700974, + "learning_rate": 1.3293221946918853e-05, + "loss": 0.0035, + "step": 1010 + }, + { + "epoch": 1.948433734939759, + "grad_norm": 0.3965436816215515, + "learning_rate": 1.325098778377762e-05, + "loss": 0.0242, + "step": 1011 + }, + { + "epoch": 1.9503614457831326, + "grad_norm": 0.18520519137382507, + "learning_rate": 1.3208787556859543e-05, + "loss": 0.0096, + "step": 1012 + }, + { + "epoch": 1.952289156626506, + "grad_norm": 0.2783315181732178, + "learning_rate": 1.3166621478361075e-05, + "loss": 0.0103, + "step": 1013 + }, + { + "epoch": 1.9542168674698797, + "grad_norm": 0.22714459896087646, + "learning_rate": 1.3124489760306917e-05, + "loss": 0.0078, + "step": 1014 + }, + { + "epoch": 1.956144578313253, + "grad_norm": 0.1257915049791336, + "learning_rate": 1.3082392614549036e-05, + "loss": 0.0077, + "step": 1015 + }, + { + "epoch": 1.9580722891566265, + "grad_norm": 0.15592887997627258, + "learning_rate": 1.3040330252765526e-05, + "loss": 0.0106, + "step": 1016 + }, + { + "epoch": 1.96, + "grad_norm": 0.19295449554920197, + "learning_rate": 1.2998302886459586e-05, + "loss": 0.0082, + "step": 1017 + }, + { + "epoch": 1.9619277108433735, + "grad_norm": 0.15544794499874115, + "learning_rate": 1.2956310726958472e-05, + "loss": 0.0068, + "step": 1018 + }, + { + "epoch": 1.963855421686747, + "grad_norm": 0.25899502635002136, + "learning_rate": 1.291435398541236e-05, + "loss": 0.0086, + "step": 1019 + }, + { + "epoch": 1.9657831325301205, + "grad_norm": 0.34639033675193787, + "learning_rate": 1.2872432872793379e-05, + "loss": 0.0116, + "step": 1020 + }, + { + "epoch": 1.967710843373494, + "grad_norm": 0.1628410518169403, + "learning_rate": 1.283054759989447e-05, + "loss": 0.0055, + "step": 1021 + }, + { + "epoch": 1.9696385542168675, + "grad_norm": 0.9273788928985596, + "learning_rate": 1.2788698377328385e-05, + "loss": 0.0264, + "step": 1022 + }, + { + "epoch": 1.971566265060241, + "grad_norm": 0.163126140832901, + "learning_rate": 1.2746885415526594e-05, + "loss": 0.0046, + "step": 1023 + }, + { + "epoch": 1.9734939759036143, + "grad_norm": 0.1475439816713333, + "learning_rate": 1.2705108924738223e-05, + "loss": 0.0056, + "step": 1024 + }, + { + "epoch": 1.975421686746988, + "grad_norm": 0.1654318869113922, + "learning_rate": 1.2663369115029034e-05, + "loss": 0.0056, + "step": 1025 + }, + { + "epoch": 1.9773493975903613, + "grad_norm": 0.20536045730113983, + "learning_rate": 1.2621666196280333e-05, + "loss": 0.0101, + "step": 1026 + }, + { + "epoch": 1.979277108433735, + "grad_norm": 0.19256474077701569, + "learning_rate": 1.258000037818792e-05, + "loss": 0.0059, + "step": 1027 + }, + { + "epoch": 1.9812048192771083, + "grad_norm": 0.2605120539665222, + "learning_rate": 1.2538371870261053e-05, + "loss": 0.0115, + "step": 1028 + }, + { + "epoch": 1.983132530120482, + "grad_norm": 0.14840295910835266, + "learning_rate": 1.249678088182137e-05, + "loss": 0.0046, + "step": 1029 + }, + { + "epoch": 1.9850602409638554, + "grad_norm": 0.17585207521915436, + "learning_rate": 1.2455227622001851e-05, + "loss": 0.0086, + "step": 1030 + }, + { + "epoch": 1.9869879518072289, + "grad_norm": 0.11044781655073166, + "learning_rate": 1.241371229974579e-05, + "loss": 0.0034, + "step": 1031 + }, + { + "epoch": 1.9889156626506024, + "grad_norm": 0.25584840774536133, + "learning_rate": 1.2372235123805672e-05, + "loss": 0.0245, + "step": 1032 + }, + { + "epoch": 1.9908433734939759, + "grad_norm": 0.25962474942207336, + "learning_rate": 1.2330796302742211e-05, + "loss": 0.0104, + "step": 1033 + }, + { + "epoch": 1.9927710843373494, + "grad_norm": 0.33408522605895996, + "learning_rate": 1.2289396044923238e-05, + "loss": 0.0176, + "step": 1034 + }, + { + "epoch": 1.994698795180723, + "grad_norm": 0.479950487613678, + "learning_rate": 1.2248034558522682e-05, + "loss": 0.0113, + "step": 1035 + }, + { + "epoch": 1.9966265060240964, + "grad_norm": 0.16567294299602509, + "learning_rate": 1.2206712051519518e-05, + "loss": 0.0036, + "step": 1036 + }, + { + "epoch": 1.99855421686747, + "grad_norm": 0.19343771040439606, + "learning_rate": 1.2165428731696713e-05, + "loss": 0.0077, + "step": 1037 + }, + { + "epoch": 2.0, + "grad_norm": 0.22895601391792297, + "learning_rate": 1.2124184806640202e-05, + "loss": 0.0114, + "step": 1038 + }, + { + "epoch": 2.0019277108433733, + "grad_norm": 0.15838384628295898, + "learning_rate": 1.208298048373782e-05, + "loss": 0.0043, + "step": 1039 + }, + { + "epoch": 2.003855421686747, + "grad_norm": 0.681065559387207, + "learning_rate": 1.2041815970178268e-05, + "loss": 0.0214, + "step": 1040 + }, + { + "epoch": 2.0057831325301203, + "grad_norm": 0.3357350528240204, + "learning_rate": 1.2000691472950081e-05, + "loss": 0.0079, + "step": 1041 + }, + { + "epoch": 2.007710843373494, + "grad_norm": 0.15238308906555176, + "learning_rate": 1.1959607198840568e-05, + "loss": 0.0041, + "step": 1042 + }, + { + "epoch": 2.0096385542168673, + "grad_norm": 0.11763229966163635, + "learning_rate": 1.1918563354434784e-05, + "loss": 0.0033, + "step": 1043 + }, + { + "epoch": 2.011566265060241, + "grad_norm": 0.3759301006793976, + "learning_rate": 1.1877560146114515e-05, + "loss": 0.0128, + "step": 1044 + }, + { + "epoch": 2.0134939759036143, + "grad_norm": 0.1143188625574112, + "learning_rate": 1.1836597780057183e-05, + "loss": 0.0078, + "step": 1045 + }, + { + "epoch": 2.015421686746988, + "grad_norm": 0.20059260725975037, + "learning_rate": 1.179567646223485e-05, + "loss": 0.0149, + "step": 1046 + }, + { + "epoch": 2.0173493975903614, + "grad_norm": 0.15569567680358887, + "learning_rate": 1.1754796398413196e-05, + "loss": 0.0038, + "step": 1047 + }, + { + "epoch": 2.019277108433735, + "grad_norm": 0.1153278723359108, + "learning_rate": 1.1713957794150423e-05, + "loss": 0.0041, + "step": 1048 + }, + { + "epoch": 2.0212048192771084, + "grad_norm": 0.1838717758655548, + "learning_rate": 1.1673160854796307e-05, + "loss": 0.0041, + "step": 1049 + }, + { + "epoch": 2.023132530120482, + "grad_norm": 0.12264502793550491, + "learning_rate": 1.1632405785491077e-05, + "loss": 0.0043, + "step": 1050 + }, + { + "epoch": 2.0250602409638554, + "grad_norm": 0.14363229274749756, + "learning_rate": 1.159169279116445e-05, + "loss": 0.0066, + "step": 1051 + }, + { + "epoch": 2.026987951807229, + "grad_norm": 0.1316995471715927, + "learning_rate": 1.1551022076534585e-05, + "loss": 0.0024, + "step": 1052 + }, + { + "epoch": 2.0289156626506024, + "grad_norm": 0.13392619788646698, + "learning_rate": 1.1510393846107001e-05, + "loss": 0.0051, + "step": 1053 + }, + { + "epoch": 2.0308433734939757, + "grad_norm": 3.0086817741394043, + "learning_rate": 1.1469808304173658e-05, + "loss": 0.0334, + "step": 1054 + }, + { + "epoch": 2.0327710843373494, + "grad_norm": 0.17756076157093048, + "learning_rate": 1.1429265654811803e-05, + "loss": 0.0068, + "step": 1055 + }, + { + "epoch": 2.0346987951807227, + "grad_norm": 0.13250532746315002, + "learning_rate": 1.1388766101883038e-05, + "loss": 0.0087, + "step": 1056 + }, + { + "epoch": 2.0366265060240965, + "grad_norm": 0.3534089922904968, + "learning_rate": 1.1348309849032257e-05, + "loss": 0.0076, + "step": 1057 + }, + { + "epoch": 2.0385542168674697, + "grad_norm": 0.11939049512147903, + "learning_rate": 1.1307897099686627e-05, + "loss": 0.0029, + "step": 1058 + }, + { + "epoch": 2.0404819277108435, + "grad_norm": 0.11862517893314362, + "learning_rate": 1.1267528057054562e-05, + "loss": 0.0062, + "step": 1059 + }, + { + "epoch": 2.0424096385542168, + "grad_norm": 0.1539212018251419, + "learning_rate": 1.1227202924124704e-05, + "loss": 0.0067, + "step": 1060 + }, + { + "epoch": 2.0443373493975905, + "grad_norm": 0.17163440585136414, + "learning_rate": 1.118692190366491e-05, + "loss": 0.0055, + "step": 1061 + }, + { + "epoch": 2.0462650602409638, + "grad_norm": 0.12304897606372833, + "learning_rate": 1.1146685198221222e-05, + "loss": 0.0036, + "step": 1062 + }, + { + "epoch": 2.0481927710843375, + "grad_norm": 0.17319051921367645, + "learning_rate": 1.1106493010116842e-05, + "loss": 0.0058, + "step": 1063 + }, + { + "epoch": 2.050120481927711, + "grad_norm": 0.2242443859577179, + "learning_rate": 1.1066345541451127e-05, + "loss": 0.0059, + "step": 1064 + }, + { + "epoch": 2.0520481927710845, + "grad_norm": 0.09533938020467758, + "learning_rate": 1.1026242994098597e-05, + "loss": 0.0033, + "step": 1065 + }, + { + "epoch": 2.053975903614458, + "grad_norm": 0.11697929352521896, + "learning_rate": 1.0986185569707852e-05, + "loss": 0.0038, + "step": 1066 + }, + { + "epoch": 2.0559036144578315, + "grad_norm": 0.2563149333000183, + "learning_rate": 1.0946173469700625e-05, + "loss": 0.0158, + "step": 1067 + }, + { + "epoch": 2.057831325301205, + "grad_norm": 0.21836932003498077, + "learning_rate": 1.0906206895270739e-05, + "loss": 0.0085, + "step": 1068 + }, + { + "epoch": 2.059759036144578, + "grad_norm": 0.1798071414232254, + "learning_rate": 1.0866286047383094e-05, + "loss": 0.0053, + "step": 1069 + }, + { + "epoch": 2.061686746987952, + "grad_norm": 0.08937730640172958, + "learning_rate": 1.0826411126772675e-05, + "loss": 0.0025, + "step": 1070 + }, + { + "epoch": 2.063614457831325, + "grad_norm": 0.0942138060927391, + "learning_rate": 1.0786582333943499e-05, + "loss": 0.0017, + "step": 1071 + }, + { + "epoch": 2.065542168674699, + "grad_norm": 0.13076582551002502, + "learning_rate": 1.0746799869167679e-05, + "loss": 0.0033, + "step": 1072 + }, + { + "epoch": 2.067469879518072, + "grad_norm": 0.0993233174085617, + "learning_rate": 1.0707063932484357e-05, + "loss": 0.0046, + "step": 1073 + }, + { + "epoch": 2.069397590361446, + "grad_norm": 0.3046741485595703, + "learning_rate": 1.0667374723698698e-05, + "loss": 0.009, + "step": 1074 + }, + { + "epoch": 2.071325301204819, + "grad_norm": 0.12197669595479965, + "learning_rate": 1.0627732442380932e-05, + "loss": 0.0034, + "step": 1075 + }, + { + "epoch": 2.073253012048193, + "grad_norm": 0.12721140682697296, + "learning_rate": 1.058813728786531e-05, + "loss": 0.0048, + "step": 1076 + }, + { + "epoch": 2.075180722891566, + "grad_norm": 0.10011966526508331, + "learning_rate": 1.0548589459249112e-05, + "loss": 0.0026, + "step": 1077 + }, + { + "epoch": 2.07710843373494, + "grad_norm": 0.3314201831817627, + "learning_rate": 1.0509089155391661e-05, + "loss": 0.0284, + "step": 1078 + }, + { + "epoch": 2.079036144578313, + "grad_norm": 0.32739701867103577, + "learning_rate": 1.0469636574913288e-05, + "loss": 0.0088, + "step": 1079 + }, + { + "epoch": 2.080963855421687, + "grad_norm": 0.13805675506591797, + "learning_rate": 1.043023191619438e-05, + "loss": 0.0042, + "step": 1080 + }, + { + "epoch": 2.0828915662650602, + "grad_norm": 0.14789745211601257, + "learning_rate": 1.039087537737435e-05, + "loss": 0.0037, + "step": 1081 + }, + { + "epoch": 2.0848192771084335, + "grad_norm": 0.15518991649150848, + "learning_rate": 1.0351567156350617e-05, + "loss": 0.0044, + "step": 1082 + }, + { + "epoch": 2.0867469879518072, + "grad_norm": 0.08380113542079926, + "learning_rate": 1.0312307450777706e-05, + "loss": 0.0019, + "step": 1083 + }, + { + "epoch": 2.0886746987951805, + "grad_norm": 0.17892400920391083, + "learning_rate": 1.027309645806613e-05, + "loss": 0.0065, + "step": 1084 + }, + { + "epoch": 2.0906024096385543, + "grad_norm": 0.5497608780860901, + "learning_rate": 1.0233934375381489e-05, + "loss": 0.0238, + "step": 1085 + }, + { + "epoch": 2.0925301204819275, + "grad_norm": 1.0189186334609985, + "learning_rate": 1.019482139964344e-05, + "loss": 0.0092, + "step": 1086 + }, + { + "epoch": 2.0944578313253013, + "grad_norm": 0.12144117057323456, + "learning_rate": 1.015575772752472e-05, + "loss": 0.0038, + "step": 1087 + }, + { + "epoch": 2.0963855421686746, + "grad_norm": 0.1115315854549408, + "learning_rate": 1.0116743555450148e-05, + "loss": 0.0024, + "step": 1088 + }, + { + "epoch": 2.0983132530120483, + "grad_norm": 0.22671759128570557, + "learning_rate": 1.0077779079595631e-05, + "loss": 0.0136, + "step": 1089 + }, + { + "epoch": 2.1002409638554216, + "grad_norm": 2.0009827613830566, + "learning_rate": 1.003886449588719e-05, + "loss": 0.0493, + "step": 1090 + }, + { + "epoch": 2.1021686746987953, + "grad_norm": 0.11907301843166351, + "learning_rate": 1.0000000000000006e-05, + "loss": 0.0034, + "step": 1091 + }, + { + "epoch": 2.1040963855421686, + "grad_norm": 0.31257638335227966, + "learning_rate": 9.961185787357346e-06, + "loss": 0.0129, + "step": 1092 + }, + { + "epoch": 2.1060240963855423, + "grad_norm": 0.11033743619918823, + "learning_rate": 9.922422053129674e-06, + "loss": 0.0184, + "step": 1093 + }, + { + "epoch": 2.1079518072289156, + "grad_norm": 0.2575698494911194, + "learning_rate": 9.883708992233626e-06, + "loss": 0.0054, + "step": 1094 + }, + { + "epoch": 2.1098795180722894, + "grad_norm": 0.12921132147312164, + "learning_rate": 9.845046799331029e-06, + "loss": 0.0037, + "step": 1095 + }, + { + "epoch": 2.1118072289156626, + "grad_norm": 0.21405921876430511, + "learning_rate": 9.806435668827941e-06, + "loss": 0.006, + "step": 1096 + }, + { + "epoch": 2.113734939759036, + "grad_norm": 0.12929430603981018, + "learning_rate": 9.76787579487363e-06, + "loss": 0.0049, + "step": 1097 + }, + { + "epoch": 2.1156626506024097, + "grad_norm": 0.1793181151151657, + "learning_rate": 9.729367371359681e-06, + "loss": 0.0086, + "step": 1098 + }, + { + "epoch": 2.117590361445783, + "grad_norm": 0.2182074338197708, + "learning_rate": 9.690910591918936e-06, + "loss": 0.0106, + "step": 1099 + }, + { + "epoch": 2.1195180722891567, + "grad_norm": 0.0705680400133133, + "learning_rate": 9.652505649924547e-06, + "loss": 0.0012, + "step": 1100 + }, + { + "epoch": 2.12144578313253, + "grad_norm": 0.10509738326072693, + "learning_rate": 9.614152738489021e-06, + "loss": 0.0048, + "step": 1101 + }, + { + "epoch": 2.1233734939759037, + "grad_norm": 0.13775436580181122, + "learning_rate": 9.575852050463268e-06, + "loss": 0.0089, + "step": 1102 + }, + { + "epoch": 2.125301204819277, + "grad_norm": 0.15230101346969604, + "learning_rate": 9.537603778435545e-06, + "loss": 0.0065, + "step": 1103 + }, + { + "epoch": 2.1272289156626507, + "grad_norm": 0.24702346324920654, + "learning_rate": 9.499408114730583e-06, + "loss": 0.016, + "step": 1104 + }, + { + "epoch": 2.129156626506024, + "grad_norm": 0.1082577034831047, + "learning_rate": 9.461265251408575e-06, + "loss": 0.0036, + "step": 1105 + }, + { + "epoch": 2.1310843373493977, + "grad_norm": 0.1063847690820694, + "learning_rate": 9.423175380264211e-06, + "loss": 0.0037, + "step": 1106 + }, + { + "epoch": 2.133012048192771, + "grad_norm": 0.07686953246593475, + "learning_rate": 9.385138692825729e-06, + "loss": 0.0031, + "step": 1107 + }, + { + "epoch": 2.1349397590361447, + "grad_norm": 0.2046380341053009, + "learning_rate": 9.347155380353912e-06, + "loss": 0.0087, + "step": 1108 + }, + { + "epoch": 2.136867469879518, + "grad_norm": 0.1341692954301834, + "learning_rate": 9.30922563384121e-06, + "loss": 0.0045, + "step": 1109 + }, + { + "epoch": 2.1387951807228918, + "grad_norm": 0.09870535880327225, + "learning_rate": 9.271349644010672e-06, + "loss": 0.003, + "step": 1110 + }, + { + "epoch": 2.140722891566265, + "grad_norm": 0.18708615005016327, + "learning_rate": 9.233527601315069e-06, + "loss": 0.0042, + "step": 1111 + }, + { + "epoch": 2.1426506024096383, + "grad_norm": 0.5175634026527405, + "learning_rate": 9.195759695935907e-06, + "loss": 0.0173, + "step": 1112 + }, + { + "epoch": 2.144578313253012, + "grad_norm": 0.14939036965370178, + "learning_rate": 9.158046117782464e-06, + "loss": 0.0031, + "step": 1113 + }, + { + "epoch": 2.1465060240963854, + "grad_norm": 0.2837410569190979, + "learning_rate": 9.120387056490851e-06, + "loss": 0.0097, + "step": 1114 + }, + { + "epoch": 2.148433734939759, + "grad_norm": 0.11088677495718002, + "learning_rate": 9.082782701423047e-06, + "loss": 0.0026, + "step": 1115 + }, + { + "epoch": 2.1503614457831324, + "grad_norm": 0.07785166054964066, + "learning_rate": 9.045233241665947e-06, + "loss": 0.0019, + "step": 1116 + }, + { + "epoch": 2.152289156626506, + "grad_norm": 0.17568141222000122, + "learning_rate": 9.007738866030427e-06, + "loss": 0.0039, + "step": 1117 + }, + { + "epoch": 2.1542168674698794, + "grad_norm": 0.12652266025543213, + "learning_rate": 8.970299763050356e-06, + "loss": 0.0033, + "step": 1118 + }, + { + "epoch": 2.156144578313253, + "grad_norm": 0.16801467537879944, + "learning_rate": 8.932916120981695e-06, + "loss": 0.0076, + "step": 1119 + }, + { + "epoch": 2.1580722891566264, + "grad_norm": 0.18313169479370117, + "learning_rate": 8.895588127801545e-06, + "loss": 0.0052, + "step": 1120 + }, + { + "epoch": 2.16, + "grad_norm": 0.07546049356460571, + "learning_rate": 8.858315971207146e-06, + "loss": 0.0022, + "step": 1121 + }, + { + "epoch": 2.1619277108433734, + "grad_norm": 0.4039839208126068, + "learning_rate": 8.821099838614996e-06, + "loss": 0.0203, + "step": 1122 + }, + { + "epoch": 2.163855421686747, + "grad_norm": 0.09244243055582047, + "learning_rate": 8.783939917159897e-06, + "loss": 0.002, + "step": 1123 + }, + { + "epoch": 2.1657831325301204, + "grad_norm": 0.18327835202217102, + "learning_rate": 8.746836393693978e-06, + "loss": 0.0055, + "step": 1124 + }, + { + "epoch": 2.167710843373494, + "grad_norm": 0.22010307013988495, + "learning_rate": 8.709789454785809e-06, + "loss": 0.0077, + "step": 1125 + }, + { + "epoch": 2.1696385542168675, + "grad_norm": 0.09438297897577286, + "learning_rate": 8.67279928671939e-06, + "loss": 0.0032, + "step": 1126 + }, + { + "epoch": 2.1715662650602408, + "grad_norm": 0.20782770216464996, + "learning_rate": 8.635866075493318e-06, + "loss": 0.0028, + "step": 1127 + }, + { + "epoch": 2.1734939759036145, + "grad_norm": 0.1958685964345932, + "learning_rate": 8.598990006819756e-06, + "loss": 0.0047, + "step": 1128 + }, + { + "epoch": 2.1754216867469878, + "grad_norm": 0.06459935009479523, + "learning_rate": 8.562171266123528e-06, + "loss": 0.0015, + "step": 1129 + }, + { + "epoch": 2.1773493975903615, + "grad_norm": 0.33486708998680115, + "learning_rate": 8.525410038541218e-06, + "loss": 0.0094, + "step": 1130 + }, + { + "epoch": 2.179277108433735, + "grad_norm": 0.5755940079689026, + "learning_rate": 8.488706508920202e-06, + "loss": 0.0067, + "step": 1131 + }, + { + "epoch": 2.1812048192771085, + "grad_norm": 0.10840924829244614, + "learning_rate": 8.452060861817738e-06, + "loss": 0.0082, + "step": 1132 + }, + { + "epoch": 2.183132530120482, + "grad_norm": 0.18611350655555725, + "learning_rate": 8.415473281500037e-06, + "loss": 0.0059, + "step": 1133 + }, + { + "epoch": 2.1850602409638555, + "grad_norm": 0.11245249956846237, + "learning_rate": 8.378943951941301e-06, + "loss": 0.0107, + "step": 1134 + }, + { + "epoch": 2.186987951807229, + "grad_norm": 0.12284426391124725, + "learning_rate": 8.342473056822873e-06, + "loss": 0.0025, + "step": 1135 + }, + { + "epoch": 2.1889156626506026, + "grad_norm": 0.12542888522148132, + "learning_rate": 8.306060779532245e-06, + "loss": 0.0059, + "step": 1136 + }, + { + "epoch": 2.190843373493976, + "grad_norm": 0.1287655532360077, + "learning_rate": 8.26970730316215e-06, + "loss": 0.0022, + "step": 1137 + }, + { + "epoch": 2.1927710843373496, + "grad_norm": 0.1818632185459137, + "learning_rate": 8.233412810509669e-06, + "loss": 0.0131, + "step": 1138 + }, + { + "epoch": 2.194698795180723, + "grad_norm": 0.09687745571136475, + "learning_rate": 8.197177484075284e-06, + "loss": 0.0025, + "step": 1139 + }, + { + "epoch": 2.1966265060240966, + "grad_norm": 0.16103452444076538, + "learning_rate": 8.161001506061979e-06, + "loss": 0.0031, + "step": 1140 + }, + { + "epoch": 2.19855421686747, + "grad_norm": 0.2711680233478546, + "learning_rate": 8.124885058374302e-06, + "loss": 0.0034, + "step": 1141 + }, + { + "epoch": 2.200481927710843, + "grad_norm": 0.17613105475902557, + "learning_rate": 8.088828322617473e-06, + "loss": 0.0044, + "step": 1142 + }, + { + "epoch": 2.202409638554217, + "grad_norm": 0.2298487424850464, + "learning_rate": 8.052831480096464e-06, + "loss": 0.0168, + "step": 1143 + }, + { + "epoch": 2.20433734939759, + "grad_norm": 0.17042206227779388, + "learning_rate": 8.016894711815067e-06, + "loss": 0.007, + "step": 1144 + }, + { + "epoch": 2.206265060240964, + "grad_norm": 0.2830466628074646, + "learning_rate": 7.98101819847501e-06, + "loss": 0.0091, + "step": 1145 + }, + { + "epoch": 2.208192771084337, + "grad_norm": 0.22089065611362457, + "learning_rate": 7.945202120475063e-06, + "loss": 0.0046, + "step": 1146 + }, + { + "epoch": 2.210120481927711, + "grad_norm": 0.1716073900461197, + "learning_rate": 7.909446657910072e-06, + "loss": 0.0032, + "step": 1147 + }, + { + "epoch": 2.212048192771084, + "grad_norm": 0.16140373051166534, + "learning_rate": 7.873751990570104e-06, + "loss": 0.0057, + "step": 1148 + }, + { + "epoch": 2.213975903614458, + "grad_norm": 0.1671605408191681, + "learning_rate": 7.838118297939529e-06, + "loss": 0.0039, + "step": 1149 + }, + { + "epoch": 2.2159036144578312, + "grad_norm": 0.10933005809783936, + "learning_rate": 7.802545759196117e-06, + "loss": 0.005, + "step": 1150 + }, + { + "epoch": 2.217831325301205, + "grad_norm": 0.07819998264312744, + "learning_rate": 7.76703455321014e-06, + "loss": 0.0025, + "step": 1151 + }, + { + "epoch": 2.2197590361445783, + "grad_norm": 0.36211854219436646, + "learning_rate": 7.73158485854344e-06, + "loss": 0.0151, + "step": 1152 + }, + { + "epoch": 2.221686746987952, + "grad_norm": 0.09098304808139801, + "learning_rate": 7.696196853448612e-06, + "loss": 0.0027, + "step": 1153 + }, + { + "epoch": 2.2236144578313253, + "grad_norm": 0.17442144453525543, + "learning_rate": 7.660870715868018e-06, + "loss": 0.006, + "step": 1154 + }, + { + "epoch": 2.225542168674699, + "grad_norm": 0.09785338491201401, + "learning_rate": 7.625606623432933e-06, + "loss": 0.0041, + "step": 1155 + }, + { + "epoch": 2.2274698795180723, + "grad_norm": 0.19399888813495636, + "learning_rate": 7.590404753462653e-06, + "loss": 0.0125, + "step": 1156 + }, + { + "epoch": 2.2293975903614456, + "grad_norm": 0.11080623418092728, + "learning_rate": 7.55526528296362e-06, + "loss": 0.0022, + "step": 1157 + }, + { + "epoch": 2.2313253012048193, + "grad_norm": 0.14067359268665314, + "learning_rate": 7.520188388628473e-06, + "loss": 0.0123, + "step": 1158 + }, + { + "epoch": 2.2332530120481926, + "grad_norm": 0.14533625543117523, + "learning_rate": 7.485174246835227e-06, + "loss": 0.0039, + "step": 1159 + }, + { + "epoch": 2.2351807228915663, + "grad_norm": 0.1253812462091446, + "learning_rate": 7.4502230336463466e-06, + "loss": 0.003, + "step": 1160 + }, + { + "epoch": 2.2371084337349396, + "grad_norm": 0.12766572833061218, + "learning_rate": 7.415334924807869e-06, + "loss": 0.0044, + "step": 1161 + }, + { + "epoch": 2.2390361445783133, + "grad_norm": 0.11985791474580765, + "learning_rate": 7.380510095748535e-06, + "loss": 0.0071, + "step": 1162 + }, + { + "epoch": 2.2409638554216866, + "grad_norm": 0.15505346655845642, + "learning_rate": 7.3457487215788605e-06, + "loss": 0.0046, + "step": 1163 + }, + { + "epoch": 2.2428915662650604, + "grad_norm": 0.18983210623264313, + "learning_rate": 7.311050977090343e-06, + "loss": 0.0079, + "step": 1164 + }, + { + "epoch": 2.2448192771084337, + "grad_norm": 0.19279207289218903, + "learning_rate": 7.276417036754479e-06, + "loss": 0.0042, + "step": 1165 + }, + { + "epoch": 2.2467469879518074, + "grad_norm": 0.21539707481861115, + "learning_rate": 7.241847074721964e-06, + "loss": 0.0087, + "step": 1166 + }, + { + "epoch": 2.2486746987951807, + "grad_norm": 0.07004354894161224, + "learning_rate": 7.207341264821783e-06, + "loss": 0.002, + "step": 1167 + }, + { + "epoch": 2.2506024096385544, + "grad_norm": 0.2203039526939392, + "learning_rate": 7.172899780560345e-06, + "loss": 0.0069, + "step": 1168 + }, + { + "epoch": 2.2525301204819277, + "grad_norm": 0.12474718689918518, + "learning_rate": 7.138522795120606e-06, + "loss": 0.0122, + "step": 1169 + }, + { + "epoch": 2.2544578313253014, + "grad_norm": 0.09078995883464813, + "learning_rate": 7.104210481361204e-06, + "loss": 0.0025, + "step": 1170 + }, + { + "epoch": 2.2563855421686747, + "grad_norm": 0.141757071018219, + "learning_rate": 7.069963011815584e-06, + "loss": 0.0039, + "step": 1171 + }, + { + "epoch": 2.258313253012048, + "grad_norm": 0.14944659173488617, + "learning_rate": 7.035780558691141e-06, + "loss": 0.0025, + "step": 1172 + }, + { + "epoch": 2.2602409638554217, + "grad_norm": 0.06723666191101074, + "learning_rate": 7.001663293868328e-06, + "loss": 0.0014, + "step": 1173 + }, + { + "epoch": 2.262168674698795, + "grad_norm": 0.11966485530138016, + "learning_rate": 6.967611388899826e-06, + "loss": 0.0067, + "step": 1174 + }, + { + "epoch": 2.2640963855421687, + "grad_norm": 0.08943185210227966, + "learning_rate": 6.933625015009666e-06, + "loss": 0.0036, + "step": 1175 + }, + { + "epoch": 2.266024096385542, + "grad_norm": 0.04511453956365585, + "learning_rate": 6.899704343092359e-06, + "loss": 0.0014, + "step": 1176 + }, + { + "epoch": 2.2679518072289158, + "grad_norm": 0.1867951601743698, + "learning_rate": 6.865849543712058e-06, + "loss": 0.009, + "step": 1177 + }, + { + "epoch": 2.269879518072289, + "grad_norm": 0.23791250586509705, + "learning_rate": 6.832060787101658e-06, + "loss": 0.0117, + "step": 1178 + }, + { + "epoch": 2.271807228915663, + "grad_norm": 0.13210316002368927, + "learning_rate": 6.798338243162008e-06, + "loss": 0.0024, + "step": 1179 + }, + { + "epoch": 2.273734939759036, + "grad_norm": 0.1601375937461853, + "learning_rate": 6.764682081461002e-06, + "loss": 0.013, + "step": 1180 + }, + { + "epoch": 2.27566265060241, + "grad_norm": 0.21996766328811646, + "learning_rate": 6.73109247123273e-06, + "loss": 0.0074, + "step": 1181 + }, + { + "epoch": 2.277590361445783, + "grad_norm": 0.15780030190944672, + "learning_rate": 6.6975695813766465e-06, + "loss": 0.0052, + "step": 1182 + }, + { + "epoch": 2.279518072289157, + "grad_norm": 0.18146437406539917, + "learning_rate": 6.664113580456739e-06, + "loss": 0.0265, + "step": 1183 + }, + { + "epoch": 2.28144578313253, + "grad_norm": 0.12033495306968689, + "learning_rate": 6.630724636700618e-06, + "loss": 0.0026, + "step": 1184 + }, + { + "epoch": 2.283373493975904, + "grad_norm": 0.25268155336380005, + "learning_rate": 6.59740291799873e-06, + "loss": 0.0046, + "step": 1185 + }, + { + "epoch": 2.285301204819277, + "grad_norm": 0.19043004512786865, + "learning_rate": 6.564148591903488e-06, + "loss": 0.0063, + "step": 1186 + }, + { + "epoch": 2.2872289156626504, + "grad_norm": 0.06894923001527786, + "learning_rate": 6.530961825628432e-06, + "loss": 0.0012, + "step": 1187 + }, + { + "epoch": 2.289156626506024, + "grad_norm": 0.16378818452358246, + "learning_rate": 6.4978427860474015e-06, + "loss": 0.0048, + "step": 1188 + }, + { + "epoch": 2.2910843373493974, + "grad_norm": 0.11130444705486298, + "learning_rate": 6.464791639693648e-06, + "loss": 0.0049, + "step": 1189 + }, + { + "epoch": 2.293012048192771, + "grad_norm": 0.10573417693376541, + "learning_rate": 6.431808552759083e-06, + "loss": 0.0019, + "step": 1190 + }, + { + "epoch": 2.2949397590361444, + "grad_norm": 0.13344882428646088, + "learning_rate": 6.398893691093367e-06, + "loss": 0.0033, + "step": 1191 + }, + { + "epoch": 2.296867469879518, + "grad_norm": 0.12659135460853577, + "learning_rate": 6.366047220203088e-06, + "loss": 0.0032, + "step": 1192 + }, + { + "epoch": 2.2987951807228915, + "grad_norm": 0.10152821987867355, + "learning_rate": 6.333269305250971e-06, + "loss": 0.0027, + "step": 1193 + }, + { + "epoch": 2.300722891566265, + "grad_norm": 0.1889944225549698, + "learning_rate": 6.300560111055006e-06, + "loss": 0.0062, + "step": 1194 + }, + { + "epoch": 2.3026506024096385, + "grad_norm": 2.3101227283477783, + "learning_rate": 6.2679198020876275e-06, + "loss": 0.0113, + "step": 1195 + }, + { + "epoch": 2.304578313253012, + "grad_norm": 0.6224933862686157, + "learning_rate": 6.235348542474908e-06, + "loss": 0.0273, + "step": 1196 + }, + { + "epoch": 2.3065060240963855, + "grad_norm": 0.1908419281244278, + "learning_rate": 6.202846495995705e-06, + "loss": 0.0056, + "step": 1197 + }, + { + "epoch": 2.3084337349397592, + "grad_norm": 0.10968491435050964, + "learning_rate": 6.170413826080856e-06, + "loss": 0.0034, + "step": 1198 + }, + { + "epoch": 2.3103614457831325, + "grad_norm": 0.23200668394565582, + "learning_rate": 6.138050695812343e-06, + "loss": 0.0042, + "step": 1199 + }, + { + "epoch": 2.3122891566265062, + "grad_norm": 0.12442032992839813, + "learning_rate": 6.105757267922481e-06, + "loss": 0.0045, + "step": 1200 + }, + { + "epoch": 2.3142168674698795, + "grad_norm": 0.14563624560832977, + "learning_rate": 6.073533704793122e-06, + "loss": 0.0035, + "step": 1201 + }, + { + "epoch": 2.316144578313253, + "grad_norm": 0.11523722857236862, + "learning_rate": 6.04138016845478e-06, + "loss": 0.0088, + "step": 1202 + }, + { + "epoch": 2.3180722891566266, + "grad_norm": 0.2000943422317505, + "learning_rate": 6.009296820585871e-06, + "loss": 0.0059, + "step": 1203 + }, + { + "epoch": 2.32, + "grad_norm": 0.10698592662811279, + "learning_rate": 5.977283822511879e-06, + "loss": 0.0028, + "step": 1204 + }, + { + "epoch": 2.3219277108433736, + "grad_norm": 0.1533137410879135, + "learning_rate": 5.945341335204547e-06, + "loss": 0.0044, + "step": 1205 + }, + { + "epoch": 2.323855421686747, + "grad_norm": 0.1235835999250412, + "learning_rate": 5.9134695192810695e-06, + "loss": 0.0043, + "step": 1206 + }, + { + "epoch": 2.3257831325301206, + "grad_norm": 0.1916925013065338, + "learning_rate": 5.8816685350032575e-06, + "loss": 0.0066, + "step": 1207 + }, + { + "epoch": 2.327710843373494, + "grad_norm": 0.08812380582094193, + "learning_rate": 5.849938542276801e-06, + "loss": 0.0022, + "step": 1208 + }, + { + "epoch": 2.3296385542168676, + "grad_norm": 0.13387660682201385, + "learning_rate": 5.818279700650393e-06, + "loss": 0.0037, + "step": 1209 + }, + { + "epoch": 2.331566265060241, + "grad_norm": 0.2309022694826126, + "learning_rate": 5.786692169314954e-06, + "loss": 0.0049, + "step": 1210 + }, + { + "epoch": 2.3334939759036146, + "grad_norm": 0.09956549853086472, + "learning_rate": 5.755176107102833e-06, + "loss": 0.002, + "step": 1211 + }, + { + "epoch": 2.335421686746988, + "grad_norm": 0.06035687029361725, + "learning_rate": 5.723731672487043e-06, + "loss": 0.002, + "step": 1212 + }, + { + "epoch": 2.337349397590361, + "grad_norm": 0.06850237399339676, + "learning_rate": 5.69235902358038e-06, + "loss": 0.0013, + "step": 1213 + }, + { + "epoch": 2.339277108433735, + "grad_norm": 0.12068171054124832, + "learning_rate": 5.661058318134711e-06, + "loss": 0.0041, + "step": 1214 + }, + { + "epoch": 2.3412048192771087, + "grad_norm": 0.13146616518497467, + "learning_rate": 5.6298297135401355e-06, + "loss": 0.0022, + "step": 1215 + }, + { + "epoch": 2.343132530120482, + "grad_norm": 0.15160737931728363, + "learning_rate": 5.598673366824212e-06, + "loss": 0.0036, + "step": 1216 + }, + { + "epoch": 2.3450602409638552, + "grad_norm": 0.26196014881134033, + "learning_rate": 5.567589434651164e-06, + "loss": 0.0151, + "step": 1217 + }, + { + "epoch": 2.346987951807229, + "grad_norm": 0.12898831069469452, + "learning_rate": 5.536578073321073e-06, + "loss": 0.006, + "step": 1218 + }, + { + "epoch": 2.3489156626506023, + "grad_norm": 0.11385104805231094, + "learning_rate": 5.505639438769146e-06, + "loss": 0.0052, + "step": 1219 + }, + { + "epoch": 2.350843373493976, + "grad_norm": 0.14569509029388428, + "learning_rate": 5.47477368656486e-06, + "loss": 0.0048, + "step": 1220 + }, + { + "epoch": 2.3527710843373493, + "grad_norm": 0.12406075745820999, + "learning_rate": 5.443980971911238e-06, + "loss": 0.0028, + "step": 1221 + }, + { + "epoch": 2.354698795180723, + "grad_norm": 0.3730498254299164, + "learning_rate": 5.413261449644039e-06, + "loss": 0.0043, + "step": 1222 + }, + { + "epoch": 2.3566265060240963, + "grad_norm": 0.1449914574623108, + "learning_rate": 5.382615274230987e-06, + "loss": 0.0075, + "step": 1223 + }, + { + "epoch": 2.35855421686747, + "grad_norm": 0.20739100873470306, + "learning_rate": 5.352042599770995e-06, + "loss": 0.0061, + "step": 1224 + }, + { + "epoch": 2.3604819277108433, + "grad_norm": 0.05786775052547455, + "learning_rate": 5.321543579993398e-06, + "loss": 0.0015, + "step": 1225 + }, + { + "epoch": 2.362409638554217, + "grad_norm": 0.09043122828006744, + "learning_rate": 5.2911183682571446e-06, + "loss": 0.0034, + "step": 1226 + }, + { + "epoch": 2.3643373493975903, + "grad_norm": 0.2685496211051941, + "learning_rate": 5.260767117550094e-06, + "loss": 0.0076, + "step": 1227 + }, + { + "epoch": 2.3662650602409636, + "grad_norm": 0.17694126069545746, + "learning_rate": 5.230489980488165e-06, + "loss": 0.0148, + "step": 1228 + }, + { + "epoch": 2.3681927710843373, + "grad_norm": 0.11609307676553726, + "learning_rate": 5.200287109314633e-06, + "loss": 0.0049, + "step": 1229 + }, + { + "epoch": 2.370120481927711, + "grad_norm": 0.1257704645395279, + "learning_rate": 5.1701586558993285e-06, + "loss": 0.0031, + "step": 1230 + }, + { + "epoch": 2.3720481927710844, + "grad_norm": 0.27177703380584717, + "learning_rate": 5.140104771737899e-06, + "loss": 0.0058, + "step": 1231 + }, + { + "epoch": 2.3739759036144576, + "grad_norm": 0.13928169012069702, + "learning_rate": 5.110125607951024e-06, + "loss": 0.0051, + "step": 1232 + }, + { + "epoch": 2.3759036144578314, + "grad_norm": 0.679577648639679, + "learning_rate": 5.0802213152836514e-06, + "loss": 0.0173, + "step": 1233 + }, + { + "epoch": 2.3778313253012047, + "grad_norm": 0.16769403219223022, + "learning_rate": 5.0503920441042845e-06, + "loss": 0.0045, + "step": 1234 + }, + { + "epoch": 2.3797590361445784, + "grad_norm": 0.09427493065595627, + "learning_rate": 5.0206379444041764e-06, + "loss": 0.0024, + "step": 1235 + }, + { + "epoch": 2.3816867469879517, + "grad_norm": 0.33908671140670776, + "learning_rate": 4.990959165796585e-06, + "loss": 0.0088, + "step": 1236 + }, + { + "epoch": 2.3836144578313254, + "grad_norm": 0.18106943368911743, + "learning_rate": 4.961355857516034e-06, + "loss": 0.0094, + "step": 1237 + }, + { + "epoch": 2.3855421686746987, + "grad_norm": 0.5833203196525574, + "learning_rate": 4.931828168417583e-06, + "loss": 0.0086, + "step": 1238 + }, + { + "epoch": 2.3874698795180724, + "grad_norm": 0.09108569473028183, + "learning_rate": 4.902376246976015e-06, + "loss": 0.0014, + "step": 1239 + }, + { + "epoch": 2.3893975903614457, + "grad_norm": 0.10596407204866409, + "learning_rate": 4.873000241285153e-06, + "loss": 0.0043, + "step": 1240 + }, + { + "epoch": 2.3913253012048195, + "grad_norm": 0.10775511711835861, + "learning_rate": 4.8437002990570835e-06, + "loss": 0.0014, + "step": 1241 + }, + { + "epoch": 2.3932530120481927, + "grad_norm": 0.9646345973014832, + "learning_rate": 4.8144765676214245e-06, + "loss": 0.0525, + "step": 1242 + }, + { + "epoch": 2.395180722891566, + "grad_norm": 0.20530278980731964, + "learning_rate": 4.7853291939245814e-06, + "loss": 0.008, + "step": 1243 + }, + { + "epoch": 2.3971084337349398, + "grad_norm": 0.1682119369506836, + "learning_rate": 4.756258324528995e-06, + "loss": 0.0044, + "step": 1244 + }, + { + "epoch": 2.3990361445783135, + "grad_norm": 0.45536917448043823, + "learning_rate": 4.727264105612439e-06, + "loss": 0.0186, + "step": 1245 + }, + { + "epoch": 2.4009638554216868, + "grad_norm": 0.3017471730709076, + "learning_rate": 4.698346682967258e-06, + "loss": 0.0106, + "step": 1246 + }, + { + "epoch": 2.40289156626506, + "grad_norm": 0.1226554661989212, + "learning_rate": 4.669506201999625e-06, + "loss": 0.0035, + "step": 1247 + }, + { + "epoch": 2.404819277108434, + "grad_norm": 0.13750068843364716, + "learning_rate": 4.640742807728837e-06, + "loss": 0.0038, + "step": 1248 + }, + { + "epoch": 2.406746987951807, + "grad_norm": 0.11531024426221848, + "learning_rate": 4.612056644786575e-06, + "loss": 0.0021, + "step": 1249 + }, + { + "epoch": 2.408674698795181, + "grad_norm": 0.1143675372004509, + "learning_rate": 4.583447857416175e-06, + "loss": 0.0028, + "step": 1250 + }, + { + "epoch": 2.410602409638554, + "grad_norm": 0.0914216861128807, + "learning_rate": 4.554916589471898e-06, + "loss": 0.0027, + "step": 1251 + }, + { + "epoch": 2.412530120481928, + "grad_norm": 0.18339012563228607, + "learning_rate": 4.526462984418221e-06, + "loss": 0.0037, + "step": 1252 + }, + { + "epoch": 2.414457831325301, + "grad_norm": 0.11073138564825058, + "learning_rate": 4.498087185329105e-06, + "loss": 0.003, + "step": 1253 + }, + { + "epoch": 2.416385542168675, + "grad_norm": 0.20792435109615326, + "learning_rate": 4.469789334887265e-06, + "loss": 0.009, + "step": 1254 + }, + { + "epoch": 2.418313253012048, + "grad_norm": 0.09485629945993423, + "learning_rate": 4.441569575383471e-06, + "loss": 0.0033, + "step": 1255 + }, + { + "epoch": 2.420240963855422, + "grad_norm": 0.11831793934106827, + "learning_rate": 4.413428048715851e-06, + "loss": 0.0021, + "step": 1256 + }, + { + "epoch": 2.422168674698795, + "grad_norm": 0.11818034201860428, + "learning_rate": 4.38536489638911e-06, + "loss": 0.0041, + "step": 1257 + }, + { + "epoch": 2.4240963855421684, + "grad_norm": 0.2583082616329193, + "learning_rate": 4.3573802595138945e-06, + "loss": 0.0039, + "step": 1258 + }, + { + "epoch": 2.426024096385542, + "grad_norm": 0.3120201826095581, + "learning_rate": 4.329474278806034e-06, + "loss": 0.0087, + "step": 1259 + }, + { + "epoch": 2.427951807228916, + "grad_norm": 0.1258879452943802, + "learning_rate": 4.301647094585855e-06, + "loss": 0.0046, + "step": 1260 + }, + { + "epoch": 2.429879518072289, + "grad_norm": 0.15144586563110352, + "learning_rate": 4.273898846777473e-06, + "loss": 0.0054, + "step": 1261 + }, + { + "epoch": 2.4318072289156625, + "grad_norm": 0.15615184605121613, + "learning_rate": 4.246229674908067e-06, + "loss": 0.0072, + "step": 1262 + }, + { + "epoch": 2.433734939759036, + "grad_norm": 0.09690173715353012, + "learning_rate": 4.218639718107225e-06, + "loss": 0.003, + "step": 1263 + }, + { + "epoch": 2.4356626506024095, + "grad_norm": 0.23884955048561096, + "learning_rate": 4.1911291151062e-06, + "loss": 0.0109, + "step": 1264 + }, + { + "epoch": 2.4375903614457832, + "grad_norm": 0.0905768945813179, + "learning_rate": 4.163698004237222e-06, + "loss": 0.0027, + "step": 1265 + }, + { + "epoch": 2.4395180722891565, + "grad_norm": 0.09168912470340729, + "learning_rate": 4.136346523432821e-06, + "loss": 0.0018, + "step": 1266 + }, + { + "epoch": 2.4414457831325302, + "grad_norm": 0.17878012359142303, + "learning_rate": 4.109074810225118e-06, + "loss": 0.0048, + "step": 1267 + }, + { + "epoch": 2.4433734939759035, + "grad_norm": 0.09913790971040726, + "learning_rate": 4.08188300174513e-06, + "loss": 0.0021, + "step": 1268 + }, + { + "epoch": 2.4453012048192773, + "grad_norm": 0.16615812480449677, + "learning_rate": 4.054771234722106e-06, + "loss": 0.0066, + "step": 1269 + }, + { + "epoch": 2.4472289156626506, + "grad_norm": 0.09618276357650757, + "learning_rate": 4.027739645482784e-06, + "loss": 0.0043, + "step": 1270 + }, + { + "epoch": 2.4491566265060243, + "grad_norm": 0.33473479747772217, + "learning_rate": 4.0007883699507855e-06, + "loss": 0.0236, + "step": 1271 + }, + { + "epoch": 2.4510843373493976, + "grad_norm": 0.15051880478858948, + "learning_rate": 3.973917543645867e-06, + "loss": 0.0068, + "step": 1272 + }, + { + "epoch": 2.453012048192771, + "grad_norm": 0.24134816229343414, + "learning_rate": 3.947127301683249e-06, + "loss": 0.0194, + "step": 1273 + }, + { + "epoch": 2.4549397590361446, + "grad_norm": 0.10495353490114212, + "learning_rate": 3.920417778772967e-06, + "loss": 0.0042, + "step": 1274 + }, + { + "epoch": 2.4568674698795183, + "grad_norm": 0.2294938713312149, + "learning_rate": 3.893789109219171e-06, + "loss": 0.0224, + "step": 1275 + }, + { + "epoch": 2.4587951807228916, + "grad_norm": 0.13710513710975647, + "learning_rate": 3.867241426919446e-06, + "loss": 0.0046, + "step": 1276 + }, + { + "epoch": 2.460722891566265, + "grad_norm": 0.06754808127880096, + "learning_rate": 3.840774865364157e-06, + "loss": 0.0019, + "step": 1277 + }, + { + "epoch": 2.4626506024096386, + "grad_norm": 0.24797780811786652, + "learning_rate": 3.8143895576357605e-06, + "loss": 0.0063, + "step": 1278 + }, + { + "epoch": 2.464578313253012, + "grad_norm": 0.1476449817419052, + "learning_rate": 3.788085636408143e-06, + "loss": 0.0055, + "step": 1279 + }, + { + "epoch": 2.4665060240963856, + "grad_norm": 0.22397096455097198, + "learning_rate": 3.7618632339459616e-06, + "loss": 0.0164, + "step": 1280 + }, + { + "epoch": 2.468433734939759, + "grad_norm": 0.21596969664096832, + "learning_rate": 3.7357224821039497e-06, + "loss": 0.0112, + "step": 1281 + }, + { + "epoch": 2.4703614457831327, + "grad_norm": 0.2775099575519562, + "learning_rate": 3.7096635123263068e-06, + "loss": 0.0112, + "step": 1282 + }, + { + "epoch": 2.472289156626506, + "grad_norm": 0.07963326573371887, + "learning_rate": 3.683686455645974e-06, + "loss": 0.0013, + "step": 1283 + }, + { + "epoch": 2.4742168674698797, + "grad_norm": 0.1253802627325058, + "learning_rate": 3.6577914426840266e-06, + "loss": 0.0038, + "step": 1284 + }, + { + "epoch": 2.476144578313253, + "grad_norm": 0.10258597880601883, + "learning_rate": 3.631978603648989e-06, + "loss": 0.0023, + "step": 1285 + }, + { + "epoch": 2.4780722891566267, + "grad_norm": 0.17102380096912384, + "learning_rate": 3.6062480683361935e-06, + "loss": 0.0025, + "step": 1286 + }, + { + "epoch": 2.48, + "grad_norm": 0.09547360241413116, + "learning_rate": 3.580599966127123e-06, + "loss": 0.003, + "step": 1287 + }, + { + "epoch": 2.4819277108433733, + "grad_norm": 0.08008653670549393, + "learning_rate": 3.5550344259887438e-06, + "loss": 0.0023, + "step": 1288 + }, + { + "epoch": 2.483855421686747, + "grad_norm": 0.07712296396493912, + "learning_rate": 3.5295515764729003e-06, + "loss": 0.0015, + "step": 1289 + }, + { + "epoch": 2.4857831325301207, + "grad_norm": 0.21118703484535217, + "learning_rate": 3.5041515457156303e-06, + "loss": 0.0041, + "step": 1290 + }, + { + "epoch": 2.487710843373494, + "grad_norm": 0.10772393643856049, + "learning_rate": 3.4788344614365155e-06, + "loss": 0.0029, + "step": 1291 + }, + { + "epoch": 2.4896385542168673, + "grad_norm": 0.2353268563747406, + "learning_rate": 3.453600450938073e-06, + "loss": 0.0072, + "step": 1292 + }, + { + "epoch": 2.491566265060241, + "grad_norm": 0.2897944152355194, + "learning_rate": 3.428449641105107e-06, + "loss": 0.0205, + "step": 1293 + }, + { + "epoch": 2.4934939759036143, + "grad_norm": 0.19756680727005005, + "learning_rate": 3.4033821584040383e-06, + "loss": 0.0065, + "step": 1294 + }, + { + "epoch": 2.495421686746988, + "grad_norm": 0.13538534939289093, + "learning_rate": 3.378398128882305e-06, + "loss": 0.0025, + "step": 1295 + }, + { + "epoch": 2.4973493975903613, + "grad_norm": 0.2301637977361679, + "learning_rate": 3.3534976781677142e-06, + "loss": 0.0071, + "step": 1296 + }, + { + "epoch": 2.499277108433735, + "grad_norm": 0.0965796634554863, + "learning_rate": 3.3286809314678137e-06, + "loss": 0.0024, + "step": 1297 + }, + { + "epoch": 2.5012048192771084, + "grad_norm": 0.0777980163693428, + "learning_rate": 3.30394801356926e-06, + "loss": 0.0013, + "step": 1298 + }, + { + "epoch": 2.503132530120482, + "grad_norm": 0.3157603442668915, + "learning_rate": 3.279299048837177e-06, + "loss": 0.0228, + "step": 1299 + }, + { + "epoch": 2.5050602409638554, + "grad_norm": 0.15660233795642853, + "learning_rate": 3.2547341612145654e-06, + "loss": 0.0056, + "step": 1300 + }, + { + "epoch": 2.506987951807229, + "grad_norm": 0.21655581891536713, + "learning_rate": 3.2302534742216586e-06, + "loss": 0.0081, + "step": 1301 + }, + { + "epoch": 2.5089156626506024, + "grad_norm": 0.09475889801979065, + "learning_rate": 3.205857110955277e-06, + "loss": 0.0029, + "step": 1302 + }, + { + "epoch": 2.5108433734939757, + "grad_norm": 0.13174696266651154, + "learning_rate": 3.18154519408826e-06, + "loss": 0.0059, + "step": 1303 + }, + { + "epoch": 2.5127710843373494, + "grad_norm": 0.10386355221271515, + "learning_rate": 3.1573178458688102e-06, + "loss": 0.0042, + "step": 1304 + }, + { + "epoch": 2.514698795180723, + "grad_norm": 0.12700854241847992, + "learning_rate": 3.133175188119899e-06, + "loss": 0.0041, + "step": 1305 + }, + { + "epoch": 2.5166265060240964, + "grad_norm": 0.1617022454738617, + "learning_rate": 3.109117342238639e-06, + "loss": 0.0053, + "step": 1306 + }, + { + "epoch": 2.5185542168674697, + "grad_norm": 0.8668884038925171, + "learning_rate": 3.085144429195688e-06, + "loss": 0.0084, + "step": 1307 + }, + { + "epoch": 2.5204819277108435, + "grad_norm": 0.22429344058036804, + "learning_rate": 3.061256569534634e-06, + "loss": 0.0053, + "step": 1308 + }, + { + "epoch": 2.5224096385542167, + "grad_norm": 0.08967582136392593, + "learning_rate": 3.037453883371375e-06, + "loss": 0.0018, + "step": 1309 + }, + { + "epoch": 2.5243373493975905, + "grad_norm": 0.1251695454120636, + "learning_rate": 3.0137364903935464e-06, + "loss": 0.0037, + "step": 1310 + }, + { + "epoch": 2.5262650602409638, + "grad_norm": 0.09026174992322922, + "learning_rate": 2.990104509859897e-06, + "loss": 0.0024, + "step": 1311 + }, + { + "epoch": 2.5281927710843375, + "grad_norm": 0.34319114685058594, + "learning_rate": 2.966558060599689e-06, + "loss": 0.0063, + "step": 1312 + }, + { + "epoch": 2.5301204819277108, + "grad_norm": 0.20300136506557465, + "learning_rate": 2.9430972610121087e-06, + "loss": 0.0054, + "step": 1313 + }, + { + "epoch": 2.532048192771084, + "grad_norm": 0.19160760939121246, + "learning_rate": 2.9197222290656737e-06, + "loss": 0.0095, + "step": 1314 + }, + { + "epoch": 2.533975903614458, + "grad_norm": 0.18991442024707794, + "learning_rate": 2.8964330822976227e-06, + "loss": 0.006, + "step": 1315 + }, + { + "epoch": 2.5359036144578315, + "grad_norm": 0.1801903396844864, + "learning_rate": 2.873229937813349e-06, + "loss": 0.0067, + "step": 1316 + }, + { + "epoch": 2.537831325301205, + "grad_norm": 0.07068303227424622, + "learning_rate": 2.850112912285783e-06, + "loss": 0.0015, + "step": 1317 + }, + { + "epoch": 2.539759036144578, + "grad_norm": 0.1404612809419632, + "learning_rate": 2.8270821219548296e-06, + "loss": 0.0036, + "step": 1318 + }, + { + "epoch": 2.541686746987952, + "grad_norm": 0.12199504673480988, + "learning_rate": 2.8041376826267862e-06, + "loss": 0.0068, + "step": 1319 + }, + { + "epoch": 2.5436144578313256, + "grad_norm": 0.2167249619960785, + "learning_rate": 2.7812797096737253e-06, + "loss": 0.0048, + "step": 1320 + }, + { + "epoch": 2.545542168674699, + "grad_norm": 0.07466506212949753, + "learning_rate": 2.7585083180329575e-06, + "loss": 0.0017, + "step": 1321 + }, + { + "epoch": 2.547469879518072, + "grad_norm": 0.11736353486776352, + "learning_rate": 2.7358236222064283e-06, + "loss": 0.003, + "step": 1322 + }, + { + "epoch": 2.549397590361446, + "grad_norm": 0.16602204740047455, + "learning_rate": 2.7132257362601453e-06, + "loss": 0.005, + "step": 1323 + }, + { + "epoch": 2.551325301204819, + "grad_norm": 0.15473629534244537, + "learning_rate": 2.6907147738236193e-06, + "loss": 0.0077, + "step": 1324 + }, + { + "epoch": 2.553253012048193, + "grad_norm": 0.07868973910808563, + "learning_rate": 2.6682908480892567e-06, + "loss": 0.0013, + "step": 1325 + }, + { + "epoch": 2.555180722891566, + "grad_norm": 0.2137845754623413, + "learning_rate": 2.645954071811847e-06, + "loss": 0.0092, + "step": 1326 + }, + { + "epoch": 2.55710843373494, + "grad_norm": 0.11191053688526154, + "learning_rate": 2.623704557307949e-06, + "loss": 0.0031, + "step": 1327 + }, + { + "epoch": 2.559036144578313, + "grad_norm": 0.3080642521381378, + "learning_rate": 2.6015424164553295e-06, + "loss": 0.0104, + "step": 1328 + }, + { + "epoch": 2.5609638554216865, + "grad_norm": 0.08816439658403397, + "learning_rate": 2.579467760692427e-06, + "loss": 0.004, + "step": 1329 + }, + { + "epoch": 2.56289156626506, + "grad_norm": 0.17154981195926666, + "learning_rate": 2.557480701017776e-06, + "loss": 0.0035, + "step": 1330 + }, + { + "epoch": 2.564819277108434, + "grad_norm": 0.09479143470525742, + "learning_rate": 2.5355813479894464e-06, + "loss": 0.0034, + "step": 1331 + }, + { + "epoch": 2.5667469879518072, + "grad_norm": 0.26139333844184875, + "learning_rate": 2.513769811724487e-06, + "loss": 0.0076, + "step": 1332 + }, + { + "epoch": 2.5686746987951805, + "grad_norm": 0.16864238679409027, + "learning_rate": 2.4920462018983816e-06, + "loss": 0.0046, + "step": 1333 + }, + { + "epoch": 2.5706024096385542, + "grad_norm": 0.1133158802986145, + "learning_rate": 2.4704106277444884e-06, + "loss": 0.0034, + "step": 1334 + }, + { + "epoch": 2.572530120481928, + "grad_norm": 0.27522334456443787, + "learning_rate": 2.4488631980534995e-06, + "loss": 0.0127, + "step": 1335 + }, + { + "epoch": 2.5744578313253013, + "grad_norm": 0.13547387719154358, + "learning_rate": 2.427404021172868e-06, + "loss": 0.0031, + "step": 1336 + }, + { + "epoch": 2.5763855421686745, + "grad_norm": 0.13478629291057587, + "learning_rate": 2.406033205006313e-06, + "loss": 0.0039, + "step": 1337 + }, + { + "epoch": 2.5783132530120483, + "grad_norm": 0.11515481770038605, + "learning_rate": 2.3847508570132226e-06, + "loss": 0.0029, + "step": 1338 + }, + { + "epoch": 2.5802409638554216, + "grad_norm": 0.21657171845436096, + "learning_rate": 2.36355708420815e-06, + "loss": 0.011, + "step": 1339 + }, + { + "epoch": 2.5821686746987953, + "grad_norm": 0.11441601067781448, + "learning_rate": 2.342451993160262e-06, + "loss": 0.006, + "step": 1340 + }, + { + "epoch": 2.5840963855421686, + "grad_norm": 0.13475841283798218, + "learning_rate": 2.3214356899928036e-06, + "loss": 0.0051, + "step": 1341 + }, + { + "epoch": 2.5860240963855423, + "grad_norm": 0.053035832941532135, + "learning_rate": 2.300508280382572e-06, + "loss": 0.0012, + "step": 1342 + }, + { + "epoch": 2.5879518072289156, + "grad_norm": 0.12467508763074875, + "learning_rate": 2.279669869559358e-06, + "loss": 0.0024, + "step": 1343 + }, + { + "epoch": 2.589879518072289, + "grad_norm": 0.10572273284196854, + "learning_rate": 2.2589205623054646e-06, + "loss": 0.0024, + "step": 1344 + }, + { + "epoch": 2.5918072289156626, + "grad_norm": 0.17056365311145782, + "learning_rate": 2.238260462955142e-06, + "loss": 0.0064, + "step": 1345 + }, + { + "epoch": 2.5937349397590364, + "grad_norm": 0.07940494269132614, + "learning_rate": 2.2176896753940637e-06, + "loss": 0.0012, + "step": 1346 + }, + { + "epoch": 2.5956626506024096, + "grad_norm": 0.10416694730520248, + "learning_rate": 2.1972083030588244e-06, + "loss": 0.0092, + "step": 1347 + }, + { + "epoch": 2.597590361445783, + "grad_norm": 0.2384328842163086, + "learning_rate": 2.176816448936423e-06, + "loss": 0.0067, + "step": 1348 + }, + { + "epoch": 2.5995180722891567, + "grad_norm": 0.14279082417488098, + "learning_rate": 2.156514215563703e-06, + "loss": 0.0059, + "step": 1349 + }, + { + "epoch": 2.6014457831325304, + "grad_norm": 0.08462683111429214, + "learning_rate": 2.1363017050268886e-06, + "loss": 0.0021, + "step": 1350 + }, + { + "epoch": 2.6033734939759037, + "grad_norm": 0.09768491238355637, + "learning_rate": 2.1161790189610377e-06, + "loss": 0.0038, + "step": 1351 + }, + { + "epoch": 2.605301204819277, + "grad_norm": 0.25498896837234497, + "learning_rate": 2.0961462585495474e-06, + "loss": 0.0114, + "step": 1352 + }, + { + "epoch": 2.6072289156626507, + "grad_norm": 0.15635675191879272, + "learning_rate": 2.076203524523637e-06, + "loss": 0.0054, + "step": 1353 + }, + { + "epoch": 2.609156626506024, + "grad_norm": 0.11619213968515396, + "learning_rate": 2.056350917161836e-06, + "loss": 0.007, + "step": 1354 + }, + { + "epoch": 2.6110843373493977, + "grad_norm": 0.18085338175296783, + "learning_rate": 2.0365885362895053e-06, + "loss": 0.0061, + "step": 1355 + }, + { + "epoch": 2.613012048192771, + "grad_norm": 0.14492927491664886, + "learning_rate": 2.016916481278306e-06, + "loss": 0.0114, + "step": 1356 + }, + { + "epoch": 2.6149397590361447, + "grad_norm": 0.21257621049880981, + "learning_rate": 1.997334851045709e-06, + "loss": 0.0057, + "step": 1357 + }, + { + "epoch": 2.616867469879518, + "grad_norm": 0.11539656668901443, + "learning_rate": 1.9778437440545085e-06, + "loss": 0.0071, + "step": 1358 + }, + { + "epoch": 2.6187951807228913, + "grad_norm": 0.1642933189868927, + "learning_rate": 1.95844325831231e-06, + "loss": 0.0054, + "step": 1359 + }, + { + "epoch": 2.620722891566265, + "grad_norm": 0.10779479146003723, + "learning_rate": 1.9391334913710545e-06, + "loss": 0.0028, + "step": 1360 + }, + { + "epoch": 2.6226506024096388, + "grad_norm": 0.14295366406440735, + "learning_rate": 1.9199145403265175e-06, + "loss": 0.0048, + "step": 1361 + }, + { + "epoch": 2.624578313253012, + "grad_norm": 0.13454844057559967, + "learning_rate": 1.9007865018178107e-06, + "loss": 0.0072, + "step": 1362 + }, + { + "epoch": 2.6265060240963853, + "grad_norm": 0.778252363204956, + "learning_rate": 1.8817494720269302e-06, + "loss": 0.0071, + "step": 1363 + }, + { + "epoch": 2.628433734939759, + "grad_norm": 0.11488679051399231, + "learning_rate": 1.8628035466782268e-06, + "loss": 0.0038, + "step": 1364 + }, + { + "epoch": 2.630361445783133, + "grad_norm": 0.15560875833034515, + "learning_rate": 1.8439488210379687e-06, + "loss": 0.0043, + "step": 1365 + }, + { + "epoch": 2.632289156626506, + "grad_norm": 0.10538071393966675, + "learning_rate": 1.8251853899138306e-06, + "loss": 0.0041, + "step": 1366 + }, + { + "epoch": 2.6342168674698794, + "grad_norm": 0.12866193056106567, + "learning_rate": 1.8065133476544306e-06, + "loss": 0.0034, + "step": 1367 + }, + { + "epoch": 2.636144578313253, + "grad_norm": 0.2045469433069229, + "learning_rate": 1.7879327881488584e-06, + "loss": 0.0141, + "step": 1368 + }, + { + "epoch": 2.6380722891566264, + "grad_norm": 0.12423976510763168, + "learning_rate": 1.769443804826194e-06, + "loss": 0.0047, + "step": 1369 + }, + { + "epoch": 2.64, + "grad_norm": 0.1007109209895134, + "learning_rate": 1.751046490655046e-06, + "loss": 0.0031, + "step": 1370 + }, + { + "epoch": 2.6419277108433734, + "grad_norm": 0.0681275874376297, + "learning_rate": 1.7327409381430804e-06, + "loss": 0.0019, + "step": 1371 + }, + { + "epoch": 2.643855421686747, + "grad_norm": 0.1645517498254776, + "learning_rate": 1.7145272393365498e-06, + "loss": 0.0035, + "step": 1372 + }, + { + "epoch": 2.6457831325301204, + "grad_norm": 0.13689427077770233, + "learning_rate": 1.6964054858198386e-06, + "loss": 0.0086, + "step": 1373 + }, + { + "epoch": 2.6477108433734937, + "grad_norm": 0.10440093278884888, + "learning_rate": 1.6783757687150149e-06, + "loss": 0.0019, + "step": 1374 + }, + { + "epoch": 2.6496385542168674, + "grad_norm": 0.1142532229423523, + "learning_rate": 1.6604381786813383e-06, + "loss": 0.0047, + "step": 1375 + }, + { + "epoch": 2.651566265060241, + "grad_norm": 0.10430166125297546, + "learning_rate": 1.6425928059148312e-06, + "loss": 0.0027, + "step": 1376 + }, + { + "epoch": 2.6534939759036145, + "grad_norm": 0.2315254956483841, + "learning_rate": 1.624839740147819e-06, + "loss": 0.0071, + "step": 1377 + }, + { + "epoch": 2.6554216867469878, + "grad_norm": 0.15356265008449554, + "learning_rate": 1.6071790706484746e-06, + "loss": 0.0109, + "step": 1378 + }, + { + "epoch": 2.6573493975903615, + "grad_norm": 0.1332363784313202, + "learning_rate": 1.589610886220383e-06, + "loss": 0.0046, + "step": 1379 + }, + { + "epoch": 2.659277108433735, + "grad_norm": 0.18892519176006317, + "learning_rate": 1.5721352752020602e-06, + "loss": 0.0138, + "step": 1380 + } + ], + "logging_steps": 1, + "max_steps": 1557, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 92, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.811308669430661e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1380/training_args.bin b/checkpoint-1380/training_args.bin new file mode 100644 index 0000000..ecc7b6b --- /dev/null +++ b/checkpoint-1380/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342dfb3c86216e436950100c79812c54066d5572c4e9975b0f133c067f061bcf +size 7825 diff --git a/checkpoint-1472/chat_template.jinja b/checkpoint-1472/chat_template.jinja new file mode 100644 index 0000000..1bad6a0 --- /dev/null +++ b/checkpoint-1472/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-1472/config.json b/checkpoint-1472/config.json new file mode 100644 index 0000000..f8bf41e --- /dev/null +++ b/checkpoint-1472/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128009, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.56.2", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-1472/generation_config.json b/checkpoint-1472/generation_config.json new file mode 100644 index 0000000..2152026 --- /dev/null +++ b/checkpoint-1472/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128009, + 128001, + 128008, + 128009 + ], + "pad_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.56.2" +} diff --git a/checkpoint-1472/model.safetensors b/checkpoint-1472/model.safetensors new file mode 100644 index 0000000..e6af94c --- /dev/null +++ b/checkpoint-1472/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b694ee6ce0e00ee378c814d32cdf4b3195bac58a7690bca5dfa3c70a10fcabe +size 2996982344 diff --git a/checkpoint-1472/special_tokens_map.json b/checkpoint-1472/special_tokens_map.json new file mode 100644 index 0000000..14daf45 --- /dev/null +++ b/checkpoint-1472/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/checkpoint-1472/tokenizer.json b/checkpoint-1472/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/checkpoint-1472/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-1472/tokenizer_config.json b/checkpoint-1472/tokenizer_config.json new file mode 100644 index 0000000..d1e1ea9 --- /dev/null +++ b/checkpoint-1472/tokenizer_config.json @@ -0,0 +1,2068 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-1472/trainer_state.json b/checkpoint-1472/trainer_state.json new file mode 100644 index 0000000..88f6af0 --- /dev/null +++ b/checkpoint-1472/trainer_state.json @@ -0,0 +1,10338 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.8366265060240963, + "eval_steps": 500, + "global_step": 1472, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0019277108433734939, + "grad_norm": 2.8518834114074707, + "learning_rate": 0.0, + "loss": 0.0891, + "step": 1 + }, + { + "epoch": 0.0038554216867469878, + "grad_norm": 1.8441249132156372, + "learning_rate": 2.564102564102564e-07, + "loss": 0.0539, + "step": 2 + }, + { + "epoch": 0.005783132530120482, + "grad_norm": 2.8263237476348877, + "learning_rate": 5.128205128205128e-07, + "loss": 0.099, + "step": 3 + }, + { + "epoch": 0.0077108433734939755, + "grad_norm": 2.5051236152648926, + "learning_rate": 7.692307692307694e-07, + "loss": 0.0789, + "step": 4 + }, + { + "epoch": 0.00963855421686747, + "grad_norm": 2.6903438568115234, + "learning_rate": 1.0256410256410257e-06, + "loss": 0.0881, + "step": 5 + }, + { + "epoch": 0.011566265060240964, + "grad_norm": 2.6205761432647705, + "learning_rate": 1.282051282051282e-06, + "loss": 0.0776, + "step": 6 + }, + { + "epoch": 0.013493975903614458, + "grad_norm": 2.6309337615966797, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.0827, + "step": 7 + }, + { + "epoch": 0.015421686746987951, + "grad_norm": 1.5427855253219604, + "learning_rate": 1.794871794871795e-06, + "loss": 0.0577, + "step": 8 + }, + { + "epoch": 0.017349397590361446, + "grad_norm": 1.0973446369171143, + "learning_rate": 2.0512820512820513e-06, + "loss": 0.04, + "step": 9 + }, + { + "epoch": 0.01927710843373494, + "grad_norm": 1.3253350257873535, + "learning_rate": 2.307692307692308e-06, + "loss": 0.0506, + "step": 10 + }, + { + "epoch": 0.021204819277108433, + "grad_norm": 1.588739037513733, + "learning_rate": 2.564102564102564e-06, + "loss": 0.0874, + "step": 11 + }, + { + "epoch": 0.02313253012048193, + "grad_norm": 1.4987014532089233, + "learning_rate": 2.8205128205128207e-06, + "loss": 0.0597, + "step": 12 + }, + { + "epoch": 0.02506024096385542, + "grad_norm": 1.6571592092514038, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.0559, + "step": 13 + }, + { + "epoch": 0.026987951807228915, + "grad_norm": 1.8860628604888916, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0688, + "step": 14 + }, + { + "epoch": 0.02891566265060241, + "grad_norm": 1.3202295303344727, + "learning_rate": 3.58974358974359e-06, + "loss": 0.0433, + "step": 15 + }, + { + "epoch": 0.030843373493975902, + "grad_norm": 1.5870612859725952, + "learning_rate": 3.846153846153847e-06, + "loss": 0.0695, + "step": 16 + }, + { + "epoch": 0.0327710843373494, + "grad_norm": 0.9192284345626831, + "learning_rate": 4.102564102564103e-06, + "loss": 0.0392, + "step": 17 + }, + { + "epoch": 0.03469879518072289, + "grad_norm": 0.7950155735015869, + "learning_rate": 4.358974358974359e-06, + "loss": 0.0351, + "step": 18 + }, + { + "epoch": 0.03662650602409639, + "grad_norm": 0.8854314684867859, + "learning_rate": 4.615384615384616e-06, + "loss": 0.0356, + "step": 19 + }, + { + "epoch": 0.03855421686746988, + "grad_norm": 0.9546788930892944, + "learning_rate": 4.871794871794872e-06, + "loss": 0.0427, + "step": 20 + }, + { + "epoch": 0.04048192771084337, + "grad_norm": 0.6315903663635254, + "learning_rate": 5.128205128205128e-06, + "loss": 0.0397, + "step": 21 + }, + { + "epoch": 0.042409638554216866, + "grad_norm": 0.9230924844741821, + "learning_rate": 5.384615384615385e-06, + "loss": 0.0481, + "step": 22 + }, + { + "epoch": 0.04433734939759036, + "grad_norm": 0.711546003818512, + "learning_rate": 5.641025641025641e-06, + "loss": 0.0479, + "step": 23 + }, + { + "epoch": 0.04626506024096386, + "grad_norm": 0.5288046598434448, + "learning_rate": 5.897435897435898e-06, + "loss": 0.0182, + "step": 24 + }, + { + "epoch": 0.04819277108433735, + "grad_norm": 0.9420496225357056, + "learning_rate": 6.153846153846155e-06, + "loss": 0.0389, + "step": 25 + }, + { + "epoch": 0.05012048192771084, + "grad_norm": 0.5001983046531677, + "learning_rate": 6.410256410256412e-06, + "loss": 0.0268, + "step": 26 + }, + { + "epoch": 0.052048192771084335, + "grad_norm": 0.8084653615951538, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0367, + "step": 27 + }, + { + "epoch": 0.05397590361445783, + "grad_norm": 0.7195103764533997, + "learning_rate": 6.923076923076923e-06, + "loss": 0.0251, + "step": 28 + }, + { + "epoch": 0.055903614457831326, + "grad_norm": 0.529958963394165, + "learning_rate": 7.17948717948718e-06, + "loss": 0.0289, + "step": 29 + }, + { + "epoch": 0.05783132530120482, + "grad_norm": 0.795376181602478, + "learning_rate": 7.435897435897437e-06, + "loss": 0.043, + "step": 30 + }, + { + "epoch": 0.059759036144578316, + "grad_norm": 0.6366249918937683, + "learning_rate": 7.692307692307694e-06, + "loss": 0.029, + "step": 31 + }, + { + "epoch": 0.061686746987951804, + "grad_norm": 0.5414115190505981, + "learning_rate": 7.948717948717949e-06, + "loss": 0.0365, + "step": 32 + }, + { + "epoch": 0.0636144578313253, + "grad_norm": 0.9350972175598145, + "learning_rate": 8.205128205128205e-06, + "loss": 0.0283, + "step": 33 + }, + { + "epoch": 0.0655421686746988, + "grad_norm": 0.5660741925239563, + "learning_rate": 8.461538461538462e-06, + "loss": 0.0234, + "step": 34 + }, + { + "epoch": 0.06746987951807229, + "grad_norm": 0.5623988509178162, + "learning_rate": 8.717948717948719e-06, + "loss": 0.0307, + "step": 35 + }, + { + "epoch": 0.06939759036144579, + "grad_norm": 0.5260195732116699, + "learning_rate": 8.974358974358976e-06, + "loss": 0.0264, + "step": 36 + }, + { + "epoch": 0.07132530120481928, + "grad_norm": 0.4934785068035126, + "learning_rate": 9.230769230769232e-06, + "loss": 0.0224, + "step": 37 + }, + { + "epoch": 0.07325301204819278, + "grad_norm": 0.4797322154045105, + "learning_rate": 9.487179487179487e-06, + "loss": 0.0163, + "step": 38 + }, + { + "epoch": 0.07518072289156627, + "grad_norm": 0.4739217460155487, + "learning_rate": 9.743589743589744e-06, + "loss": 0.0165, + "step": 39 + }, + { + "epoch": 0.07710843373493977, + "grad_norm": 0.4527677595615387, + "learning_rate": 1e-05, + "loss": 0.0163, + "step": 40 + }, + { + "epoch": 0.07903614457831325, + "grad_norm": 0.6241316795349121, + "learning_rate": 1.0256410256410256e-05, + "loss": 0.0302, + "step": 41 + }, + { + "epoch": 0.08096385542168674, + "grad_norm": 0.639043927192688, + "learning_rate": 1.0512820512820514e-05, + "loss": 0.0312, + "step": 42 + }, + { + "epoch": 0.08289156626506024, + "grad_norm": 0.5121409296989441, + "learning_rate": 1.076923076923077e-05, + "loss": 0.0256, + "step": 43 + }, + { + "epoch": 0.08481927710843373, + "grad_norm": 0.6340477466583252, + "learning_rate": 1.1025641025641028e-05, + "loss": 0.04, + "step": 44 + }, + { + "epoch": 0.08674698795180723, + "grad_norm": 0.5260409712791443, + "learning_rate": 1.1282051282051283e-05, + "loss": 0.0282, + "step": 45 + }, + { + "epoch": 0.08867469879518072, + "grad_norm": 0.6390711069107056, + "learning_rate": 1.1538461538461538e-05, + "loss": 0.0243, + "step": 46 + }, + { + "epoch": 0.09060240963855422, + "grad_norm": 0.46469295024871826, + "learning_rate": 1.1794871794871796e-05, + "loss": 0.0208, + "step": 47 + }, + { + "epoch": 0.09253012048192771, + "grad_norm": 0.8711516857147217, + "learning_rate": 1.2051282051282051e-05, + "loss": 0.0291, + "step": 48 + }, + { + "epoch": 0.09445783132530121, + "grad_norm": 0.9164300560951233, + "learning_rate": 1.230769230769231e-05, + "loss": 0.0342, + "step": 49 + }, + { + "epoch": 0.0963855421686747, + "grad_norm": 0.5401139259338379, + "learning_rate": 1.2564102564102565e-05, + "loss": 0.0185, + "step": 50 + }, + { + "epoch": 0.0983132530120482, + "grad_norm": 0.44393008947372437, + "learning_rate": 1.2820512820512823e-05, + "loss": 0.0228, + "step": 51 + }, + { + "epoch": 0.10024096385542168, + "grad_norm": 0.3855767846107483, + "learning_rate": 1.3076923076923078e-05, + "loss": 0.0176, + "step": 52 + }, + { + "epoch": 0.10216867469879518, + "grad_norm": 0.8561235070228577, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.0433, + "step": 53 + }, + { + "epoch": 0.10409638554216867, + "grad_norm": 0.768002450466156, + "learning_rate": 1.3589743589743592e-05, + "loss": 0.0245, + "step": 54 + }, + { + "epoch": 0.10602409638554217, + "grad_norm": 0.4559759497642517, + "learning_rate": 1.3846153846153847e-05, + "loss": 0.0224, + "step": 55 + }, + { + "epoch": 0.10795180722891566, + "grad_norm": 0.6203847527503967, + "learning_rate": 1.4102564102564105e-05, + "loss": 0.0296, + "step": 56 + }, + { + "epoch": 0.10987951807228916, + "grad_norm": 0.6651368141174316, + "learning_rate": 1.435897435897436e-05, + "loss": 0.0336, + "step": 57 + }, + { + "epoch": 0.11180722891566265, + "grad_norm": 0.377734512090683, + "learning_rate": 1.4615384615384615e-05, + "loss": 0.0196, + "step": 58 + }, + { + "epoch": 0.11373493975903615, + "grad_norm": 0.687568724155426, + "learning_rate": 1.4871794871794874e-05, + "loss": 0.0207, + "step": 59 + }, + { + "epoch": 0.11566265060240964, + "grad_norm": 0.7905604243278503, + "learning_rate": 1.5128205128205129e-05, + "loss": 0.047, + "step": 60 + }, + { + "epoch": 0.11759036144578314, + "grad_norm": 0.7938196063041687, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.0198, + "step": 61 + }, + { + "epoch": 0.11951807228915663, + "grad_norm": 0.41340553760528564, + "learning_rate": 1.5641025641025644e-05, + "loss": 0.0161, + "step": 62 + }, + { + "epoch": 0.12144578313253013, + "grad_norm": 0.5668172240257263, + "learning_rate": 1.5897435897435897e-05, + "loss": 0.0275, + "step": 63 + }, + { + "epoch": 0.12337349397590361, + "grad_norm": 0.48333367705345154, + "learning_rate": 1.6153846153846154e-05, + "loss": 0.0137, + "step": 64 + }, + { + "epoch": 0.12530120481927712, + "grad_norm": 0.6843933463096619, + "learning_rate": 1.641025641025641e-05, + "loss": 0.0294, + "step": 65 + }, + { + "epoch": 0.1272289156626506, + "grad_norm": 0.7789272665977478, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.0401, + "step": 66 + }, + { + "epoch": 0.1291566265060241, + "grad_norm": 0.6203492879867554, + "learning_rate": 1.6923076923076924e-05, + "loss": 0.0292, + "step": 67 + }, + { + "epoch": 0.1310843373493976, + "grad_norm": 0.5940662622451782, + "learning_rate": 1.717948717948718e-05, + "loss": 0.0178, + "step": 68 + }, + { + "epoch": 0.13301204819277107, + "grad_norm": 0.35504868626594543, + "learning_rate": 1.7435897435897438e-05, + "loss": 0.0129, + "step": 69 + }, + { + "epoch": 0.13493975903614458, + "grad_norm": 0.8796699643135071, + "learning_rate": 1.7692307692307694e-05, + "loss": 0.034, + "step": 70 + }, + { + "epoch": 0.13686746987951806, + "grad_norm": 0.967444896697998, + "learning_rate": 1.794871794871795e-05, + "loss": 0.0266, + "step": 71 + }, + { + "epoch": 0.13879518072289157, + "grad_norm": 0.4428526759147644, + "learning_rate": 1.8205128205128208e-05, + "loss": 0.0223, + "step": 72 + }, + { + "epoch": 0.14072289156626505, + "grad_norm": 0.42897751927375793, + "learning_rate": 1.8461538461538465e-05, + "loss": 0.0187, + "step": 73 + }, + { + "epoch": 0.14265060240963856, + "grad_norm": 0.5100914835929871, + "learning_rate": 1.8717948717948718e-05, + "loss": 0.0164, + "step": 74 + }, + { + "epoch": 0.14457831325301204, + "grad_norm": 0.6028861999511719, + "learning_rate": 1.8974358974358975e-05, + "loss": 0.0164, + "step": 75 + }, + { + "epoch": 0.14650602409638555, + "grad_norm": 0.6187024116516113, + "learning_rate": 1.923076923076923e-05, + "loss": 0.0296, + "step": 76 + }, + { + "epoch": 0.14843373493975903, + "grad_norm": 0.4822489619255066, + "learning_rate": 1.9487179487179488e-05, + "loss": 0.0148, + "step": 77 + }, + { + "epoch": 0.15036144578313254, + "grad_norm": 0.7231149673461914, + "learning_rate": 1.9743589743589745e-05, + "loss": 0.0395, + "step": 78 + }, + { + "epoch": 0.15228915662650602, + "grad_norm": 0.8409642577171326, + "learning_rate": 2e-05, + "loss": 0.0446, + "step": 79 + }, + { + "epoch": 0.15421686746987953, + "grad_norm": 0.4883500039577484, + "learning_rate": 2.025641025641026e-05, + "loss": 0.0206, + "step": 80 + }, + { + "epoch": 0.156144578313253, + "grad_norm": 0.6287479400634766, + "learning_rate": 2.0512820512820512e-05, + "loss": 0.0333, + "step": 81 + }, + { + "epoch": 0.1580722891566265, + "grad_norm": 0.5041632652282715, + "learning_rate": 2.0769230769230772e-05, + "loss": 0.0414, + "step": 82 + }, + { + "epoch": 0.16, + "grad_norm": 0.5103405117988586, + "learning_rate": 2.102564102564103e-05, + "loss": 0.045, + "step": 83 + }, + { + "epoch": 0.16192771084337348, + "grad_norm": 0.493161678314209, + "learning_rate": 2.1282051282051285e-05, + "loss": 0.021, + "step": 84 + }, + { + "epoch": 0.163855421686747, + "grad_norm": 0.908843994140625, + "learning_rate": 2.153846153846154e-05, + "loss": 0.0389, + "step": 85 + }, + { + "epoch": 0.16578313253012048, + "grad_norm": 0.5067003965377808, + "learning_rate": 2.1794871794871795e-05, + "loss": 0.0272, + "step": 86 + }, + { + "epoch": 0.16771084337349398, + "grad_norm": 0.5791381597518921, + "learning_rate": 2.2051282051282056e-05, + "loss": 0.0368, + "step": 87 + }, + { + "epoch": 0.16963855421686747, + "grad_norm": 0.7056036591529846, + "learning_rate": 2.230769230769231e-05, + "loss": 0.0284, + "step": 88 + }, + { + "epoch": 0.17156626506024097, + "grad_norm": 0.6563822031021118, + "learning_rate": 2.2564102564102566e-05, + "loss": 0.0646, + "step": 89 + }, + { + "epoch": 0.17349397590361446, + "grad_norm": 0.9483286142349243, + "learning_rate": 2.2820512820512822e-05, + "loss": 0.0439, + "step": 90 + }, + { + "epoch": 0.17542168674698796, + "grad_norm": 0.370664119720459, + "learning_rate": 2.3076923076923076e-05, + "loss": 0.0109, + "step": 91 + }, + { + "epoch": 0.17734939759036145, + "grad_norm": 0.9776477813720703, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.0458, + "step": 92 + }, + { + "epoch": 0.17927710843373493, + "grad_norm": 0.45710092782974243, + "learning_rate": 2.3589743589743593e-05, + "loss": 0.0212, + "step": 93 + }, + { + "epoch": 0.18120481927710844, + "grad_norm": 0.8623896837234497, + "learning_rate": 2.384615384615385e-05, + "loss": 0.0215, + "step": 94 + }, + { + "epoch": 0.18313253012048192, + "grad_norm": 0.55814528465271, + "learning_rate": 2.4102564102564103e-05, + "loss": 0.0218, + "step": 95 + }, + { + "epoch": 0.18506024096385543, + "grad_norm": 0.49882641434669495, + "learning_rate": 2.435897435897436e-05, + "loss": 0.0268, + "step": 96 + }, + { + "epoch": 0.1869879518072289, + "grad_norm": 0.3508654534816742, + "learning_rate": 2.461538461538462e-05, + "loss": 0.0172, + "step": 97 + }, + { + "epoch": 0.18891566265060242, + "grad_norm": 0.601170003414154, + "learning_rate": 2.4871794871794873e-05, + "loss": 0.0208, + "step": 98 + }, + { + "epoch": 0.1908433734939759, + "grad_norm": 1.1748133897781372, + "learning_rate": 2.512820512820513e-05, + "loss": 0.0259, + "step": 99 + }, + { + "epoch": 0.1927710843373494, + "grad_norm": 0.46370384097099304, + "learning_rate": 2.5384615384615386e-05, + "loss": 0.0242, + "step": 100 + }, + { + "epoch": 0.1946987951807229, + "grad_norm": 0.525010883808136, + "learning_rate": 2.5641025641025646e-05, + "loss": 0.0188, + "step": 101 + }, + { + "epoch": 0.1966265060240964, + "grad_norm": 0.766501784324646, + "learning_rate": 2.58974358974359e-05, + "loss": 0.0584, + "step": 102 + }, + { + "epoch": 0.19855421686746988, + "grad_norm": 0.3572964370250702, + "learning_rate": 2.6153846153846157e-05, + "loss": 0.0131, + "step": 103 + }, + { + "epoch": 0.20048192771084336, + "grad_norm": 0.6467130780220032, + "learning_rate": 2.6410256410256413e-05, + "loss": 0.0231, + "step": 104 + }, + { + "epoch": 0.20240963855421687, + "grad_norm": 1.1852102279663086, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.027, + "step": 105 + }, + { + "epoch": 0.20433734939759035, + "grad_norm": 2.3659932613372803, + "learning_rate": 2.6923076923076927e-05, + "loss": 0.0224, + "step": 106 + }, + { + "epoch": 0.20626506024096386, + "grad_norm": 0.5343687534332275, + "learning_rate": 2.7179487179487183e-05, + "loss": 0.0198, + "step": 107 + }, + { + "epoch": 0.20819277108433734, + "grad_norm": 1.852160096168518, + "learning_rate": 2.7435897435897437e-05, + "loss": 0.032, + "step": 108 + }, + { + "epoch": 0.21012048192771085, + "grad_norm": 0.47291702032089233, + "learning_rate": 2.7692307692307694e-05, + "loss": 0.0117, + "step": 109 + }, + { + "epoch": 0.21204819277108433, + "grad_norm": 0.7623187899589539, + "learning_rate": 2.794871794871795e-05, + "loss": 0.0337, + "step": 110 + }, + { + "epoch": 0.21397590361445784, + "grad_norm": 0.5272570848464966, + "learning_rate": 2.820512820512821e-05, + "loss": 0.0131, + "step": 111 + }, + { + "epoch": 0.21590361445783132, + "grad_norm": 0.5568500757217407, + "learning_rate": 2.8461538461538464e-05, + "loss": 0.0233, + "step": 112 + }, + { + "epoch": 0.21783132530120483, + "grad_norm": 0.4008469879627228, + "learning_rate": 2.871794871794872e-05, + "loss": 0.0204, + "step": 113 + }, + { + "epoch": 0.2197590361445783, + "grad_norm": 0.4888612926006317, + "learning_rate": 2.8974358974358977e-05, + "loss": 0.016, + "step": 114 + }, + { + "epoch": 0.2216867469879518, + "grad_norm": 0.44903355836868286, + "learning_rate": 2.923076923076923e-05, + "loss": 0.0135, + "step": 115 + }, + { + "epoch": 0.2236144578313253, + "grad_norm": 0.9266762733459473, + "learning_rate": 2.948717948717949e-05, + "loss": 0.0233, + "step": 116 + }, + { + "epoch": 0.22554216867469878, + "grad_norm": 0.5352638959884644, + "learning_rate": 2.9743589743589747e-05, + "loss": 0.0198, + "step": 117 + }, + { + "epoch": 0.2274698795180723, + "grad_norm": 0.6051343679428101, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0246, + "step": 118 + }, + { + "epoch": 0.22939759036144577, + "grad_norm": 0.9971133470535278, + "learning_rate": 3.0256410256410257e-05, + "loss": 0.025, + "step": 119 + }, + { + "epoch": 0.23132530120481928, + "grad_norm": 0.704236626625061, + "learning_rate": 3.0512820512820514e-05, + "loss": 0.031, + "step": 120 + }, + { + "epoch": 0.23325301204819276, + "grad_norm": 0.6137097477912903, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.0519, + "step": 121 + }, + { + "epoch": 0.23518072289156627, + "grad_norm": 0.7396159768104553, + "learning_rate": 3.102564102564103e-05, + "loss": 0.0325, + "step": 122 + }, + { + "epoch": 0.23710843373493976, + "grad_norm": 1.3282053470611572, + "learning_rate": 3.128205128205129e-05, + "loss": 0.0252, + "step": 123 + }, + { + "epoch": 0.23903614457831326, + "grad_norm": 0.5220731496810913, + "learning_rate": 3.153846153846154e-05, + "loss": 0.0262, + "step": 124 + }, + { + "epoch": 0.24096385542168675, + "grad_norm": 0.5357242822647095, + "learning_rate": 3.1794871794871795e-05, + "loss": 0.0243, + "step": 125 + }, + { + "epoch": 0.24289156626506025, + "grad_norm": 0.48207753896713257, + "learning_rate": 3.205128205128206e-05, + "loss": 0.0178, + "step": 126 + }, + { + "epoch": 0.24481927710843374, + "grad_norm": 0.552988588809967, + "learning_rate": 3.230769230769231e-05, + "loss": 0.023, + "step": 127 + }, + { + "epoch": 0.24674698795180722, + "grad_norm": 1.7962840795516968, + "learning_rate": 3.2564102564102565e-05, + "loss": 0.032, + "step": 128 + }, + { + "epoch": 0.24867469879518073, + "grad_norm": 1.6404600143432617, + "learning_rate": 3.282051282051282e-05, + "loss": 0.0231, + "step": 129 + }, + { + "epoch": 0.25060240963855424, + "grad_norm": 0.39142486453056335, + "learning_rate": 3.307692307692308e-05, + "loss": 0.0147, + "step": 130 + }, + { + "epoch": 0.2525301204819277, + "grad_norm": 1.3272887468338013, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0439, + "step": 131 + }, + { + "epoch": 0.2544578313253012, + "grad_norm": 1.5122811794281006, + "learning_rate": 3.358974358974359e-05, + "loss": 0.0282, + "step": 132 + }, + { + "epoch": 0.2563855421686747, + "grad_norm": 1.8542430400848389, + "learning_rate": 3.384615384615385e-05, + "loss": 0.0515, + "step": 133 + }, + { + "epoch": 0.2583132530120482, + "grad_norm": 4.059277534484863, + "learning_rate": 3.4102564102564105e-05, + "loss": 0.0781, + "step": 134 + }, + { + "epoch": 0.26024096385542167, + "grad_norm": 0.6206214427947998, + "learning_rate": 3.435897435897436e-05, + "loss": 0.0306, + "step": 135 + }, + { + "epoch": 0.2621686746987952, + "grad_norm": 0.4575510323047638, + "learning_rate": 3.461538461538462e-05, + "loss": 0.0154, + "step": 136 + }, + { + "epoch": 0.2640963855421687, + "grad_norm": 1.1556978225708008, + "learning_rate": 3.4871794871794875e-05, + "loss": 0.0235, + "step": 137 + }, + { + "epoch": 0.26602409638554214, + "grad_norm": 0.6975051760673523, + "learning_rate": 3.512820512820513e-05, + "loss": 0.0453, + "step": 138 + }, + { + "epoch": 0.26795180722891565, + "grad_norm": 0.8686623573303223, + "learning_rate": 3.538461538461539e-05, + "loss": 0.0427, + "step": 139 + }, + { + "epoch": 0.26987951807228916, + "grad_norm": 2.0681848526000977, + "learning_rate": 3.5641025641025646e-05, + "loss": 0.04, + "step": 140 + }, + { + "epoch": 0.27180722891566267, + "grad_norm": 0.4397984445095062, + "learning_rate": 3.58974358974359e-05, + "loss": 0.0188, + "step": 141 + }, + { + "epoch": 0.2737349397590361, + "grad_norm": 0.5871334075927734, + "learning_rate": 3.615384615384616e-05, + "loss": 0.0253, + "step": 142 + }, + { + "epoch": 0.27566265060240963, + "grad_norm": 1.1078568696975708, + "learning_rate": 3.6410256410256416e-05, + "loss": 0.0316, + "step": 143 + }, + { + "epoch": 0.27759036144578314, + "grad_norm": 0.5691841840744019, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.0266, + "step": 144 + }, + { + "epoch": 0.27951807228915665, + "grad_norm": 0.7896255254745483, + "learning_rate": 3.692307692307693e-05, + "loss": 0.0281, + "step": 145 + }, + { + "epoch": 0.2814457831325301, + "grad_norm": 0.9988337159156799, + "learning_rate": 3.7179487179487186e-05, + "loss": 0.0295, + "step": 146 + }, + { + "epoch": 0.2833734939759036, + "grad_norm": 0.9811834692955017, + "learning_rate": 3.7435897435897436e-05, + "loss": 0.0322, + "step": 147 + }, + { + "epoch": 0.2853012048192771, + "grad_norm": 0.6503105759620667, + "learning_rate": 3.769230769230769e-05, + "loss": 0.0266, + "step": 148 + }, + { + "epoch": 0.28722891566265063, + "grad_norm": 1.9164355993270874, + "learning_rate": 3.794871794871795e-05, + "loss": 0.0677, + "step": 149 + }, + { + "epoch": 0.2891566265060241, + "grad_norm": 1.1724557876586914, + "learning_rate": 3.820512820512821e-05, + "loss": 0.0324, + "step": 150 + }, + { + "epoch": 0.2910843373493976, + "grad_norm": 0.8482469916343689, + "learning_rate": 3.846153846153846e-05, + "loss": 0.0259, + "step": 151 + }, + { + "epoch": 0.2930120481927711, + "grad_norm": 0.8572830557823181, + "learning_rate": 3.871794871794872e-05, + "loss": 0.0358, + "step": 152 + }, + { + "epoch": 0.29493975903614456, + "grad_norm": 0.6630825400352478, + "learning_rate": 3.8974358974358976e-05, + "loss": 0.0447, + "step": 153 + }, + { + "epoch": 0.29686746987951806, + "grad_norm": 0.9197093844413757, + "learning_rate": 3.923076923076923e-05, + "loss": 0.0409, + "step": 154 + }, + { + "epoch": 0.2987951807228916, + "grad_norm": 0.6976819634437561, + "learning_rate": 3.948717948717949e-05, + "loss": 0.0317, + "step": 155 + }, + { + "epoch": 0.3007228915662651, + "grad_norm": 0.7353514432907104, + "learning_rate": 3.9743589743589747e-05, + "loss": 0.0306, + "step": 156 + }, + { + "epoch": 0.30265060240963854, + "grad_norm": 0.5730232000350952, + "learning_rate": 4e-05, + "loss": 0.0324, + "step": 157 + }, + { + "epoch": 0.30457831325301205, + "grad_norm": 0.7852078676223755, + "learning_rate": 3.999994971675547e-05, + "loss": 0.0354, + "step": 158 + }, + { + "epoch": 0.30650602409638555, + "grad_norm": 0.5924715399742126, + "learning_rate": 3.999979886727471e-05, + "loss": 0.0366, + "step": 159 + }, + { + "epoch": 0.30843373493975906, + "grad_norm": 0.7359845638275146, + "learning_rate": 3.999954745231624e-05, + "loss": 0.0437, + "step": 160 + }, + { + "epoch": 0.3103614457831325, + "grad_norm": 0.7866976857185364, + "learning_rate": 3.999919547314426e-05, + "loss": 0.0363, + "step": 161 + }, + { + "epoch": 0.312289156626506, + "grad_norm": 0.7425745129585266, + "learning_rate": 3.999874293152863e-05, + "loss": 0.0259, + "step": 162 + }, + { + "epoch": 0.31421686746987953, + "grad_norm": 1.8922245502471924, + "learning_rate": 3.9998189829744885e-05, + "loss": 0.0341, + "step": 163 + }, + { + "epoch": 0.316144578313253, + "grad_norm": 0.7908634543418884, + "learning_rate": 3.99975361705742e-05, + "loss": 0.0424, + "step": 164 + }, + { + "epoch": 0.3180722891566265, + "grad_norm": 2.047368049621582, + "learning_rate": 3.999678195730337e-05, + "loss": 0.0535, + "step": 165 + }, + { + "epoch": 0.32, + "grad_norm": 0.5702639222145081, + "learning_rate": 3.999592719372484e-05, + "loss": 0.0284, + "step": 166 + }, + { + "epoch": 0.3219277108433735, + "grad_norm": 0.45015648007392883, + "learning_rate": 3.9994971884136636e-05, + "loss": 0.0313, + "step": 167 + }, + { + "epoch": 0.32385542168674697, + "grad_norm": 4.094679355621338, + "learning_rate": 3.9993916033342355e-05, + "loss": 0.0524, + "step": 168 + }, + { + "epoch": 0.3257831325301205, + "grad_norm": 0.800846517086029, + "learning_rate": 3.999275964665117e-05, + "loss": 0.0282, + "step": 169 + }, + { + "epoch": 0.327710843373494, + "grad_norm": 0.47881078720092773, + "learning_rate": 3.999150272987776e-05, + "loss": 0.0293, + "step": 170 + }, + { + "epoch": 0.3296385542168675, + "grad_norm": 0.5716657638549805, + "learning_rate": 3.999014528934232e-05, + "loss": 0.0221, + "step": 171 + }, + { + "epoch": 0.33156626506024095, + "grad_norm": 0.6333311200141907, + "learning_rate": 3.998868733187048e-05, + "loss": 0.0302, + "step": 172 + }, + { + "epoch": 0.33349397590361446, + "grad_norm": 6.642521858215332, + "learning_rate": 3.998712886479335e-05, + "loss": 0.0364, + "step": 173 + }, + { + "epoch": 0.33542168674698797, + "grad_norm": 0.7515506148338318, + "learning_rate": 3.998546989594739e-05, + "loss": 0.0296, + "step": 174 + }, + { + "epoch": 0.3373493975903614, + "grad_norm": 1.0728015899658203, + "learning_rate": 3.998371043367445e-05, + "loss": 0.0549, + "step": 175 + }, + { + "epoch": 0.33927710843373493, + "grad_norm": 1.3025579452514648, + "learning_rate": 3.998185048682166e-05, + "loss": 0.0577, + "step": 176 + }, + { + "epoch": 0.34120481927710844, + "grad_norm": 1.0962958335876465, + "learning_rate": 3.997989006474144e-05, + "loss": 0.0313, + "step": 177 + }, + { + "epoch": 0.34313253012048195, + "grad_norm": 0.7064313292503357, + "learning_rate": 3.997782917729143e-05, + "loss": 0.0309, + "step": 178 + }, + { + "epoch": 0.3450602409638554, + "grad_norm": 0.43374207615852356, + "learning_rate": 3.997566783483445e-05, + "loss": 0.0166, + "step": 179 + }, + { + "epoch": 0.3469879518072289, + "grad_norm": 0.7236390113830566, + "learning_rate": 3.9973406048238413e-05, + "loss": 0.0254, + "step": 180 + }, + { + "epoch": 0.3489156626506024, + "grad_norm": 0.5041500926017761, + "learning_rate": 3.9971043828876334e-05, + "loss": 0.0239, + "step": 181 + }, + { + "epoch": 0.35084337349397593, + "grad_norm": 1.2744532823562622, + "learning_rate": 3.9968581188626204e-05, + "loss": 0.0404, + "step": 182 + }, + { + "epoch": 0.3527710843373494, + "grad_norm": 0.45845362544059753, + "learning_rate": 3.996601813987098e-05, + "loss": 0.0127, + "step": 183 + }, + { + "epoch": 0.3546987951807229, + "grad_norm": 0.4426881968975067, + "learning_rate": 3.996335469549852e-05, + "loss": 0.0176, + "step": 184 + }, + { + "epoch": 0.3566265060240964, + "grad_norm": 1.0030732154846191, + "learning_rate": 3.9960590868901465e-05, + "loss": 0.0457, + "step": 185 + }, + { + "epoch": 0.35855421686746985, + "grad_norm": 0.6428582668304443, + "learning_rate": 3.995772667397725e-05, + "loss": 0.0271, + "step": 186 + }, + { + "epoch": 0.36048192771084336, + "grad_norm": 0.5335744619369507, + "learning_rate": 3.995476212512795e-05, + "loss": 0.0297, + "step": 187 + }, + { + "epoch": 0.3624096385542169, + "grad_norm": 0.6995761394500732, + "learning_rate": 3.99516972372603e-05, + "loss": 0.0322, + "step": 188 + }, + { + "epoch": 0.3643373493975904, + "grad_norm": 0.765511155128479, + "learning_rate": 3.9948532025785546e-05, + "loss": 0.0253, + "step": 189 + }, + { + "epoch": 0.36626506024096384, + "grad_norm": 0.6165828108787537, + "learning_rate": 3.9945266506619403e-05, + "loss": 0.0355, + "step": 190 + }, + { + "epoch": 0.36819277108433734, + "grad_norm": 0.851970911026001, + "learning_rate": 3.994190069618195e-05, + "loss": 0.056, + "step": 191 + }, + { + "epoch": 0.37012048192771085, + "grad_norm": 0.9850023984909058, + "learning_rate": 3.993843461139757e-05, + "loss": 0.0415, + "step": 192 + }, + { + "epoch": 0.37204819277108436, + "grad_norm": 0.7455295324325562, + "learning_rate": 3.9934868269694886e-05, + "loss": 0.0379, + "step": 193 + }, + { + "epoch": 0.3739759036144578, + "grad_norm": 1.159469723701477, + "learning_rate": 3.9931201689006595e-05, + "loss": 0.0237, + "step": 194 + }, + { + "epoch": 0.3759036144578313, + "grad_norm": 0.5490080118179321, + "learning_rate": 3.992743488776947e-05, + "loss": 0.024, + "step": 195 + }, + { + "epoch": 0.37783132530120483, + "grad_norm": 1.279831886291504, + "learning_rate": 3.992356788492421e-05, + "loss": 0.0273, + "step": 196 + }, + { + "epoch": 0.3797590361445783, + "grad_norm": 0.859104335308075, + "learning_rate": 3.9919600699915355e-05, + "loss": 0.0411, + "step": 197 + }, + { + "epoch": 0.3816867469879518, + "grad_norm": 1.2525300979614258, + "learning_rate": 3.991553335269119e-05, + "loss": 0.0857, + "step": 198 + }, + { + "epoch": 0.3836144578313253, + "grad_norm": 0.4924193024635315, + "learning_rate": 3.991136586370367e-05, + "loss": 0.0294, + "step": 199 + }, + { + "epoch": 0.3855421686746988, + "grad_norm": 1.417190670967102, + "learning_rate": 3.990709825390828e-05, + "loss": 0.0395, + "step": 200 + }, + { + "epoch": 0.38746987951807227, + "grad_norm": 0.6172056198120117, + "learning_rate": 3.9902730544763936e-05, + "loss": 0.0194, + "step": 201 + }, + { + "epoch": 0.3893975903614458, + "grad_norm": 0.7292149662971497, + "learning_rate": 3.989826275823291e-05, + "loss": 0.0381, + "step": 202 + }, + { + "epoch": 0.3913253012048193, + "grad_norm": 0.5949816107749939, + "learning_rate": 3.989369491678067e-05, + "loss": 0.0254, + "step": 203 + }, + { + "epoch": 0.3932530120481928, + "grad_norm": 0.6012582182884216, + "learning_rate": 3.988902704337582e-05, + "loss": 0.048, + "step": 204 + }, + { + "epoch": 0.39518072289156625, + "grad_norm": 0.6273590922355652, + "learning_rate": 3.9884259161489936e-05, + "loss": 0.0268, + "step": 205 + }, + { + "epoch": 0.39710843373493976, + "grad_norm": 0.9615244269371033, + "learning_rate": 3.987939129509746e-05, + "loss": 0.0192, + "step": 206 + }, + { + "epoch": 0.39903614457831327, + "grad_norm": 0.6009241342544556, + "learning_rate": 3.9874423468675624e-05, + "loss": 0.0362, + "step": 207 + }, + { + "epoch": 0.4009638554216867, + "grad_norm": 0.411335289478302, + "learning_rate": 3.9869355707204266e-05, + "loss": 0.017, + "step": 208 + }, + { + "epoch": 0.40289156626506023, + "grad_norm": 0.6151527166366577, + "learning_rate": 3.986418803616573e-05, + "loss": 0.0283, + "step": 209 + }, + { + "epoch": 0.40481927710843374, + "grad_norm": 0.33808204531669617, + "learning_rate": 3.985892048154474e-05, + "loss": 0.0158, + "step": 210 + }, + { + "epoch": 0.40674698795180725, + "grad_norm": 0.5464187860488892, + "learning_rate": 3.9853553069828284e-05, + "loss": 0.0292, + "step": 211 + }, + { + "epoch": 0.4086746987951807, + "grad_norm": 0.6658390760421753, + "learning_rate": 3.984808582800543e-05, + "loss": 0.0281, + "step": 212 + }, + { + "epoch": 0.4106024096385542, + "grad_norm": 0.4253764748573303, + "learning_rate": 3.984251878356726e-05, + "loss": 0.031, + "step": 213 + }, + { + "epoch": 0.4125301204819277, + "grad_norm": 0.32309481501579285, + "learning_rate": 3.983685196450667e-05, + "loss": 0.0166, + "step": 214 + }, + { + "epoch": 0.41445783132530123, + "grad_norm": 0.43756410479545593, + "learning_rate": 3.9831085399318265e-05, + "loss": 0.0326, + "step": 215 + }, + { + "epoch": 0.4163855421686747, + "grad_norm": 0.264046847820282, + "learning_rate": 3.982521911699822e-05, + "loss": 0.0118, + "step": 216 + }, + { + "epoch": 0.4183132530120482, + "grad_norm": 0.8630897402763367, + "learning_rate": 3.9819253147044084e-05, + "loss": 0.0246, + "step": 217 + }, + { + "epoch": 0.4202409638554217, + "grad_norm": 0.6923379898071289, + "learning_rate": 3.98131875194547e-05, + "loss": 0.036, + "step": 218 + }, + { + "epoch": 0.42216867469879515, + "grad_norm": 0.5874778628349304, + "learning_rate": 3.9807022264730024e-05, + "loss": 0.0255, + "step": 219 + }, + { + "epoch": 0.42409638554216866, + "grad_norm": 0.394336074590683, + "learning_rate": 3.980075741387094e-05, + "loss": 0.0187, + "step": 220 + }, + { + "epoch": 0.4260240963855422, + "grad_norm": 0.6300327777862549, + "learning_rate": 3.979439299837915e-05, + "loss": 0.0214, + "step": 221 + }, + { + "epoch": 0.4279518072289157, + "grad_norm": 0.5200467109680176, + "learning_rate": 3.978792905025702e-05, + "loss": 0.0628, + "step": 222 + }, + { + "epoch": 0.42987951807228914, + "grad_norm": 0.5713880062103271, + "learning_rate": 3.978136560200735e-05, + "loss": 0.0302, + "step": 223 + }, + { + "epoch": 0.43180722891566264, + "grad_norm": 0.5345383286476135, + "learning_rate": 3.977470268663331e-05, + "loss": 0.0125, + "step": 224 + }, + { + "epoch": 0.43373493975903615, + "grad_norm": 0.5378350019454956, + "learning_rate": 3.976794033763819e-05, + "loss": 0.0246, + "step": 225 + }, + { + "epoch": 0.43566265060240966, + "grad_norm": 0.5554935336112976, + "learning_rate": 3.9761078589025276e-05, + "loss": 0.0212, + "step": 226 + }, + { + "epoch": 0.4375903614457831, + "grad_norm": 0.2832634747028351, + "learning_rate": 3.9754117475297664e-05, + "loss": 0.0125, + "step": 227 + }, + { + "epoch": 0.4395180722891566, + "grad_norm": 1.2910150289535522, + "learning_rate": 3.97470570314581e-05, + "loss": 0.0364, + "step": 228 + }, + { + "epoch": 0.44144578313253013, + "grad_norm": 0.3731018602848053, + "learning_rate": 3.973989729300878e-05, + "loss": 0.0128, + "step": 229 + }, + { + "epoch": 0.4433734939759036, + "grad_norm": 0.9433871507644653, + "learning_rate": 3.9732638295951195e-05, + "loss": 0.0367, + "step": 230 + }, + { + "epoch": 0.4453012048192771, + "grad_norm": 1.0779197216033936, + "learning_rate": 3.972528007678594e-05, + "loss": 0.0667, + "step": 231 + }, + { + "epoch": 0.4472289156626506, + "grad_norm": 1.7009105682373047, + "learning_rate": 3.9717822672512516e-05, + "loss": 0.0655, + "step": 232 + }, + { + "epoch": 0.4491566265060241, + "grad_norm": 0.5646032094955444, + "learning_rate": 3.971026612062919e-05, + "loss": 0.064, + "step": 233 + }, + { + "epoch": 0.45108433734939757, + "grad_norm": 0.44474121928215027, + "learning_rate": 3.970261045913274e-05, + "loss": 0.0206, + "step": 234 + }, + { + "epoch": 0.4530120481927711, + "grad_norm": 1.3969277143478394, + "learning_rate": 3.969485572651833e-05, + "loss": 0.0486, + "step": 235 + }, + { + "epoch": 0.4549397590361446, + "grad_norm": 0.6401994228363037, + "learning_rate": 3.968700196177925e-05, + "loss": 0.0262, + "step": 236 + }, + { + "epoch": 0.4568674698795181, + "grad_norm": 0.7091913223266602, + "learning_rate": 3.96790492044068e-05, + "loss": 0.014, + "step": 237 + }, + { + "epoch": 0.45879518072289155, + "grad_norm": 0.6561547517776489, + "learning_rate": 3.967099749439002e-05, + "loss": 0.0482, + "step": 238 + }, + { + "epoch": 0.46072289156626506, + "grad_norm": 0.6924155354499817, + "learning_rate": 3.966284687221551e-05, + "loss": 0.0289, + "step": 239 + }, + { + "epoch": 0.46265060240963857, + "grad_norm": 0.5868663787841797, + "learning_rate": 3.9654597378867256e-05, + "loss": 0.0331, + "step": 240 + }, + { + "epoch": 0.464578313253012, + "grad_norm": 0.7930939793586731, + "learning_rate": 3.964624905582637e-05, + "loss": 0.0925, + "step": 241 + }, + { + "epoch": 0.46650602409638553, + "grad_norm": 0.4888836145401001, + "learning_rate": 3.9637801945070944e-05, + "loss": 0.015, + "step": 242 + }, + { + "epoch": 0.46843373493975904, + "grad_norm": 0.7820287346839905, + "learning_rate": 3.962925608907579e-05, + "loss": 0.0382, + "step": 243 + }, + { + "epoch": 0.47036144578313255, + "grad_norm": 0.4914316236972809, + "learning_rate": 3.962061153081224e-05, + "loss": 0.0257, + "step": 244 + }, + { + "epoch": 0.472289156626506, + "grad_norm": 0.5681505799293518, + "learning_rate": 3.961186831374793e-05, + "loss": 0.0551, + "step": 245 + }, + { + "epoch": 0.4742168674698795, + "grad_norm": 0.5049723386764526, + "learning_rate": 3.9603026481846616e-05, + "loss": 0.0186, + "step": 246 + }, + { + "epoch": 0.476144578313253, + "grad_norm": 0.5034119486808777, + "learning_rate": 3.959408607956787e-05, + "loss": 0.024, + "step": 247 + }, + { + "epoch": 0.47807228915662653, + "grad_norm": 0.4543336033821106, + "learning_rate": 3.958504715186695e-05, + "loss": 0.0256, + "step": 248 + }, + { + "epoch": 0.48, + "grad_norm": 0.5595743656158447, + "learning_rate": 3.957590974419452e-05, + "loss": 0.0222, + "step": 249 + }, + { + "epoch": 0.4819277108433735, + "grad_norm": 0.5701581239700317, + "learning_rate": 3.956667390249642e-05, + "loss": 0.0334, + "step": 250 + }, + { + "epoch": 0.483855421686747, + "grad_norm": 0.53755784034729, + "learning_rate": 3.9557339673213474e-05, + "loss": 0.0345, + "step": 251 + }, + { + "epoch": 0.4857831325301205, + "grad_norm": 0.4368877112865448, + "learning_rate": 3.95479071032812e-05, + "loss": 0.0183, + "step": 252 + }, + { + "epoch": 0.48771084337349396, + "grad_norm": 0.7972906827926636, + "learning_rate": 3.953837624012963e-05, + "loss": 0.0337, + "step": 253 + }, + { + "epoch": 0.48963855421686747, + "grad_norm": 0.6148451566696167, + "learning_rate": 3.9528747131683023e-05, + "loss": 0.0524, + "step": 254 + }, + { + "epoch": 0.491566265060241, + "grad_norm": 0.500840961933136, + "learning_rate": 3.9519019826359676e-05, + "loss": 0.0248, + "step": 255 + }, + { + "epoch": 0.49349397590361443, + "grad_norm": 0.5536255240440369, + "learning_rate": 3.9509194373071624e-05, + "loss": 0.0219, + "step": 256 + }, + { + "epoch": 0.49542168674698794, + "grad_norm": 0.6873176097869873, + "learning_rate": 3.9499270821224444e-05, + "loss": 0.0312, + "step": 257 + }, + { + "epoch": 0.49734939759036145, + "grad_norm": 0.37207168340682983, + "learning_rate": 3.9489249220716974e-05, + "loss": 0.0149, + "step": 258 + }, + { + "epoch": 0.49927710843373496, + "grad_norm": 0.4458799660205841, + "learning_rate": 3.947912962194107e-05, + "loss": 0.0214, + "step": 259 + }, + { + "epoch": 0.5012048192771085, + "grad_norm": 0.4272724390029907, + "learning_rate": 3.9468912075781345e-05, + "loss": 0.0263, + "step": 260 + }, + { + "epoch": 0.503132530120482, + "grad_norm": 0.5245792269706726, + "learning_rate": 3.945859663361496e-05, + "loss": 0.0103, + "step": 261 + }, + { + "epoch": 0.5050602409638554, + "grad_norm": 0.8799260854721069, + "learning_rate": 3.9448183347311284e-05, + "loss": 0.0292, + "step": 262 + }, + { + "epoch": 0.5069879518072289, + "grad_norm": 0.5996833443641663, + "learning_rate": 3.943767226923171e-05, + "loss": 0.0306, + "step": 263 + }, + { + "epoch": 0.5089156626506024, + "grad_norm": 0.6044682860374451, + "learning_rate": 3.942706345222935e-05, + "loss": 0.0218, + "step": 264 + }, + { + "epoch": 0.5108433734939759, + "grad_norm": 0.4770200848579407, + "learning_rate": 3.941635694964878e-05, + "loss": 0.0226, + "step": 265 + }, + { + "epoch": 0.5127710843373494, + "grad_norm": 0.5605704188346863, + "learning_rate": 3.940555281532576e-05, + "loss": 0.0354, + "step": 266 + }, + { + "epoch": 0.5146987951807229, + "grad_norm": 0.46532443165779114, + "learning_rate": 3.939465110358699e-05, + "loss": 0.0223, + "step": 267 + }, + { + "epoch": 0.5166265060240964, + "grad_norm": 0.5190595388412476, + "learning_rate": 3.93836518692498e-05, + "loss": 0.0219, + "step": 268 + }, + { + "epoch": 0.5185542168674698, + "grad_norm": 0.5767757892608643, + "learning_rate": 3.937255516762193e-05, + "loss": 0.0294, + "step": 269 + }, + { + "epoch": 0.5204819277108433, + "grad_norm": 0.4543164372444153, + "learning_rate": 3.936136105450119e-05, + "loss": 0.0244, + "step": 270 + }, + { + "epoch": 0.5224096385542168, + "grad_norm": 0.4155154526233673, + "learning_rate": 3.9350069586175195e-05, + "loss": 0.02, + "step": 271 + }, + { + "epoch": 0.5243373493975904, + "grad_norm": 0.5470768213272095, + "learning_rate": 3.933868081942113e-05, + "loss": 0.0187, + "step": 272 + }, + { + "epoch": 0.5262650602409639, + "grad_norm": 0.9491772651672363, + "learning_rate": 3.9327194811505406e-05, + "loss": 0.0337, + "step": 273 + }, + { + "epoch": 0.5281927710843374, + "grad_norm": 0.9313873052597046, + "learning_rate": 3.93156116201834e-05, + "loss": 0.0573, + "step": 274 + }, + { + "epoch": 0.5301204819277109, + "grad_norm": 0.7181005477905273, + "learning_rate": 3.930393130369915e-05, + "loss": 0.0405, + "step": 275 + }, + { + "epoch": 0.5320481927710843, + "grad_norm": 0.34231385588645935, + "learning_rate": 3.9292153920785076e-05, + "loss": 0.0153, + "step": 276 + }, + { + "epoch": 0.5339759036144578, + "grad_norm": 0.6899610161781311, + "learning_rate": 3.928027953066168e-05, + "loss": 0.0338, + "step": 277 + }, + { + "epoch": 0.5359036144578313, + "grad_norm": 0.7509781718254089, + "learning_rate": 3.926830819303726e-05, + "loss": 0.0416, + "step": 278 + }, + { + "epoch": 0.5378313253012048, + "grad_norm": 0.6326774954795837, + "learning_rate": 3.925623996810757e-05, + "loss": 0.0293, + "step": 279 + }, + { + "epoch": 0.5397590361445783, + "grad_norm": 0.5543203353881836, + "learning_rate": 3.924407491655557e-05, + "loss": 0.0263, + "step": 280 + }, + { + "epoch": 0.5416867469879518, + "grad_norm": 0.5367572903633118, + "learning_rate": 3.9231813099551086e-05, + "loss": 0.0276, + "step": 281 + }, + { + "epoch": 0.5436144578313253, + "grad_norm": 0.3143869638442993, + "learning_rate": 3.921945457875051e-05, + "loss": 0.0146, + "step": 282 + }, + { + "epoch": 0.5455421686746988, + "grad_norm": 0.47403043508529663, + "learning_rate": 3.920699941629649e-05, + "loss": 0.0267, + "step": 283 + }, + { + "epoch": 0.5474698795180722, + "grad_norm": 0.5082595348358154, + "learning_rate": 3.919444767481763e-05, + "loss": 0.0183, + "step": 284 + }, + { + "epoch": 0.5493975903614458, + "grad_norm": 0.747949481010437, + "learning_rate": 3.918179941742816e-05, + "loss": 0.0412, + "step": 285 + }, + { + "epoch": 0.5513253012048193, + "grad_norm": 0.6553886532783508, + "learning_rate": 3.916905470772762e-05, + "loss": 0.0505, + "step": 286 + }, + { + "epoch": 0.5532530120481928, + "grad_norm": 0.3838176131248474, + "learning_rate": 3.9156213609800545e-05, + "loss": 0.0156, + "step": 287 + }, + { + "epoch": 0.5551807228915663, + "grad_norm": 0.7427731156349182, + "learning_rate": 3.914327618821614e-05, + "loss": 0.0278, + "step": 288 + }, + { + "epoch": 0.5571084337349398, + "grad_norm": 0.2612821161746979, + "learning_rate": 3.913024250802796e-05, + "loss": 0.0101, + "step": 289 + }, + { + "epoch": 0.5590361445783133, + "grad_norm": 0.3799416124820709, + "learning_rate": 3.911711263477357e-05, + "loss": 0.0168, + "step": 290 + }, + { + "epoch": 0.5609638554216867, + "grad_norm": 0.5053854584693909, + "learning_rate": 3.910388663447425e-05, + "loss": 0.0249, + "step": 291 + }, + { + "epoch": 0.5628915662650602, + "grad_norm": 0.38095012307167053, + "learning_rate": 3.909056457363461e-05, + "loss": 0.0156, + "step": 292 + }, + { + "epoch": 0.5648192771084337, + "grad_norm": 0.4477892220020294, + "learning_rate": 3.907714651924229e-05, + "loss": 0.0309, + "step": 293 + }, + { + "epoch": 0.5667469879518072, + "grad_norm": 0.5875864624977112, + "learning_rate": 3.906363253876763e-05, + "loss": 0.0287, + "step": 294 + }, + { + "epoch": 0.5686746987951807, + "grad_norm": 0.522990882396698, + "learning_rate": 3.90500227001633e-05, + "loss": 0.0318, + "step": 295 + }, + { + "epoch": 0.5706024096385542, + "grad_norm": 0.4153876304626465, + "learning_rate": 3.9036317071863994e-05, + "loss": 0.0192, + "step": 296 + }, + { + "epoch": 0.5725301204819278, + "grad_norm": 0.4675769507884979, + "learning_rate": 3.902251572278605e-05, + "loss": 0.067, + "step": 297 + }, + { + "epoch": 0.5744578313253013, + "grad_norm": 0.35778650641441345, + "learning_rate": 3.900861872232713e-05, + "loss": 0.0197, + "step": 298 + }, + { + "epoch": 0.5763855421686747, + "grad_norm": 0.7382330894470215, + "learning_rate": 3.899462614036587e-05, + "loss": 0.0283, + "step": 299 + }, + { + "epoch": 0.5783132530120482, + "grad_norm": 0.41268599033355713, + "learning_rate": 3.89805380472615e-05, + "loss": 0.0207, + "step": 300 + }, + { + "epoch": 0.5802409638554217, + "grad_norm": 1.2013020515441895, + "learning_rate": 3.8966354513853535e-05, + "loss": 0.0301, + "step": 301 + }, + { + "epoch": 0.5821686746987952, + "grad_norm": 0.424757719039917, + "learning_rate": 3.895207561146137e-05, + "loss": 0.022, + "step": 302 + }, + { + "epoch": 0.5840963855421687, + "grad_norm": 0.4196677505970001, + "learning_rate": 3.893770141188396e-05, + "loss": 0.0424, + "step": 303 + }, + { + "epoch": 0.5860240963855422, + "grad_norm": 0.8644190430641174, + "learning_rate": 3.892323198739946e-05, + "loss": 0.08, + "step": 304 + }, + { + "epoch": 0.5879518072289157, + "grad_norm": 0.5645135045051575, + "learning_rate": 3.890866741076482e-05, + "loss": 0.0152, + "step": 305 + }, + { + "epoch": 0.5898795180722891, + "grad_norm": 0.5218387246131897, + "learning_rate": 3.889400775521545e-05, + "loss": 0.0205, + "step": 306 + }, + { + "epoch": 0.5918072289156626, + "grad_norm": 0.39709413051605225, + "learning_rate": 3.8879253094464865e-05, + "loss": 0.0233, + "step": 307 + }, + { + "epoch": 0.5937349397590361, + "grad_norm": 0.3572910726070404, + "learning_rate": 3.8864403502704285e-05, + "loss": 0.0198, + "step": 308 + }, + { + "epoch": 0.5956626506024096, + "grad_norm": 0.382709264755249, + "learning_rate": 3.8849459054602274e-05, + "loss": 0.0176, + "step": 309 + }, + { + "epoch": 0.5975903614457831, + "grad_norm": 3.4527227878570557, + "learning_rate": 3.883441982530436e-05, + "loss": 0.0239, + "step": 310 + }, + { + "epoch": 0.5995180722891567, + "grad_norm": 0.4467569589614868, + "learning_rate": 3.8819285890432674e-05, + "loss": 0.0284, + "step": 311 + }, + { + "epoch": 0.6014457831325302, + "grad_norm": 0.44513460993766785, + "learning_rate": 3.880405732608555e-05, + "loss": 0.0233, + "step": 312 + }, + { + "epoch": 0.6033734939759036, + "grad_norm": 0.8029689192771912, + "learning_rate": 3.8788734208837155e-05, + "loss": 0.0433, + "step": 313 + }, + { + "epoch": 0.6053012048192771, + "grad_norm": 0.7291454076766968, + "learning_rate": 3.877331661573709e-05, + "loss": 0.043, + "step": 314 + }, + { + "epoch": 0.6072289156626506, + "grad_norm": 0.6050467491149902, + "learning_rate": 3.8757804624310006e-05, + "loss": 0.0377, + "step": 315 + }, + { + "epoch": 0.6091566265060241, + "grad_norm": 0.6714366674423218, + "learning_rate": 3.874219831255524e-05, + "loss": 0.046, + "step": 316 + }, + { + "epoch": 0.6110843373493976, + "grad_norm": 0.336037278175354, + "learning_rate": 3.8726497758946394e-05, + "loss": 0.0149, + "step": 317 + }, + { + "epoch": 0.6130120481927711, + "grad_norm": 0.3057402968406677, + "learning_rate": 3.871070304243094e-05, + "loss": 0.014, + "step": 318 + }, + { + "epoch": 0.6149397590361446, + "grad_norm": 0.4537644684314728, + "learning_rate": 3.8694814242429834e-05, + "loss": 0.0503, + "step": 319 + }, + { + "epoch": 0.6168674698795181, + "grad_norm": 0.45573824644088745, + "learning_rate": 3.8678831438837116e-05, + "loss": 0.021, + "step": 320 + }, + { + "epoch": 0.6187951807228915, + "grad_norm": 0.30729591846466064, + "learning_rate": 3.866275471201952e-05, + "loss": 0.0163, + "step": 321 + }, + { + "epoch": 0.620722891566265, + "grad_norm": 0.7614850401878357, + "learning_rate": 3.8646584142816036e-05, + "loss": 0.0347, + "step": 322 + }, + { + "epoch": 0.6226506024096385, + "grad_norm": 0.5323611497879028, + "learning_rate": 3.863031981253754e-05, + "loss": 0.0201, + "step": 323 + }, + { + "epoch": 0.624578313253012, + "grad_norm": 0.34426453709602356, + "learning_rate": 3.861396180296635e-05, + "loss": 0.0243, + "step": 324 + }, + { + "epoch": 0.6265060240963856, + "grad_norm": 0.621636152267456, + "learning_rate": 3.859751019635585e-05, + "loss": 0.0166, + "step": 325 + }, + { + "epoch": 0.6284337349397591, + "grad_norm": 0.549324095249176, + "learning_rate": 3.858096507543006e-05, + "loss": 0.0274, + "step": 326 + }, + { + "epoch": 0.6303614457831326, + "grad_norm": 0.358426570892334, + "learning_rate": 3.8564326523383214e-05, + "loss": 0.0207, + "step": 327 + }, + { + "epoch": 0.632289156626506, + "grad_norm": 0.3639723062515259, + "learning_rate": 3.8547594623879346e-05, + "loss": 0.0297, + "step": 328 + }, + { + "epoch": 0.6342168674698795, + "grad_norm": 0.3402212858200073, + "learning_rate": 3.853076946105188e-05, + "loss": 0.0258, + "step": 329 + }, + { + "epoch": 0.636144578313253, + "grad_norm": 0.4083027243614197, + "learning_rate": 3.85138511195032e-05, + "loss": 0.0351, + "step": 330 + }, + { + "epoch": 0.6380722891566265, + "grad_norm": 0.43532121181488037, + "learning_rate": 3.84968396843042e-05, + "loss": 0.0388, + "step": 331 + }, + { + "epoch": 0.64, + "grad_norm": 0.35353463888168335, + "learning_rate": 3.8479735240993904e-05, + "loss": 0.0203, + "step": 332 + }, + { + "epoch": 0.6419277108433735, + "grad_norm": 0.350149929523468, + "learning_rate": 3.846253787557901e-05, + "loss": 0.0261, + "step": 333 + }, + { + "epoch": 0.643855421686747, + "grad_norm": 0.7665389180183411, + "learning_rate": 3.844524767453344e-05, + "loss": 0.0108, + "step": 334 + }, + { + "epoch": 0.6457831325301204, + "grad_norm": 0.44621360301971436, + "learning_rate": 3.842786472479795e-05, + "loss": 0.0282, + "step": 335 + }, + { + "epoch": 0.6477108433734939, + "grad_norm": 0.7787201404571533, + "learning_rate": 3.841038911377962e-05, + "loss": 0.0216, + "step": 336 + }, + { + "epoch": 0.6496385542168674, + "grad_norm": 0.48260653018951416, + "learning_rate": 3.839282092935153e-05, + "loss": 0.0234, + "step": 337 + }, + { + "epoch": 0.651566265060241, + "grad_norm": 0.4987852871417999, + "learning_rate": 3.837516025985219e-05, + "loss": 0.0515, + "step": 338 + }, + { + "epoch": 0.6534939759036145, + "grad_norm": 0.9030266404151917, + "learning_rate": 3.835740719408517e-05, + "loss": 0.0508, + "step": 339 + }, + { + "epoch": 0.655421686746988, + "grad_norm": 0.6381701231002808, + "learning_rate": 3.833956182131867e-05, + "loss": 0.0405, + "step": 340 + }, + { + "epoch": 0.6573493975903615, + "grad_norm": 0.42828986048698425, + "learning_rate": 3.832162423128499e-05, + "loss": 0.024, + "step": 341 + }, + { + "epoch": 0.659277108433735, + "grad_norm": 0.38725873827934265, + "learning_rate": 3.8303594514180164e-05, + "loss": 0.0199, + "step": 342 + }, + { + "epoch": 0.6612048192771084, + "grad_norm": 0.23280498385429382, + "learning_rate": 3.828547276066346e-05, + "loss": 0.0101, + "step": 343 + }, + { + "epoch": 0.6631325301204819, + "grad_norm": 0.7298216819763184, + "learning_rate": 3.8267259061856925e-05, + "loss": 0.0455, + "step": 344 + }, + { + "epoch": 0.6650602409638554, + "grad_norm": 0.5975687503814697, + "learning_rate": 3.824895350934496e-05, + "loss": 0.0372, + "step": 345 + }, + { + "epoch": 0.6669879518072289, + "grad_norm": 0.6295403242111206, + "learning_rate": 3.823055619517381e-05, + "loss": 0.0362, + "step": 346 + }, + { + "epoch": 0.6689156626506024, + "grad_norm": 0.5086020827293396, + "learning_rate": 3.821206721185115e-05, + "loss": 0.0368, + "step": 347 + }, + { + "epoch": 0.6708433734939759, + "grad_norm": 0.34506168961524963, + "learning_rate": 3.819348665234557e-05, + "loss": 0.0178, + "step": 348 + }, + { + "epoch": 0.6727710843373494, + "grad_norm": 1.309940218925476, + "learning_rate": 3.817481461008617e-05, + "loss": 0.024, + "step": 349 + }, + { + "epoch": 0.6746987951807228, + "grad_norm": 0.4074770510196686, + "learning_rate": 3.815605117896204e-05, + "loss": 0.0262, + "step": 350 + }, + { + "epoch": 0.6766265060240964, + "grad_norm": 0.48525840044021606, + "learning_rate": 3.8137196453321775e-05, + "loss": 0.0209, + "step": 351 + }, + { + "epoch": 0.6785542168674699, + "grad_norm": 0.7199739217758179, + "learning_rate": 3.811825052797308e-05, + "loss": 0.0396, + "step": 352 + }, + { + "epoch": 0.6804819277108434, + "grad_norm": 0.519540011882782, + "learning_rate": 3.8099213498182196e-05, + "loss": 0.0453, + "step": 353 + }, + { + "epoch": 0.6824096385542169, + "grad_norm": 0.9738391041755676, + "learning_rate": 3.808008545967349e-05, + "loss": 0.0317, + "step": 354 + }, + { + "epoch": 0.6843373493975904, + "grad_norm": 1.888344407081604, + "learning_rate": 3.8060866508628953e-05, + "loss": 0.0452, + "step": 355 + }, + { + "epoch": 0.6862650602409639, + "grad_norm": 0.48989811539649963, + "learning_rate": 3.8041556741687695e-05, + "loss": 0.0315, + "step": 356 + }, + { + "epoch": 0.6881927710843373, + "grad_norm": 0.3764645457267761, + "learning_rate": 3.8022156255945496e-05, + "loss": 0.0269, + "step": 357 + }, + { + "epoch": 0.6901204819277108, + "grad_norm": 0.46409738063812256, + "learning_rate": 3.800266514895429e-05, + "loss": 0.0171, + "step": 358 + }, + { + "epoch": 0.6920481927710843, + "grad_norm": 0.41091030836105347, + "learning_rate": 3.7983083518721695e-05, + "loss": 0.0167, + "step": 359 + }, + { + "epoch": 0.6939759036144578, + "grad_norm": 0.8375523090362549, + "learning_rate": 3.79634114637105e-05, + "loss": 0.0342, + "step": 360 + }, + { + "epoch": 0.6959036144578313, + "grad_norm": 1.7053394317626953, + "learning_rate": 3.794364908283817e-05, + "loss": 0.02, + "step": 361 + }, + { + "epoch": 0.6978313253012048, + "grad_norm": 0.4163115918636322, + "learning_rate": 3.792379647547637e-05, + "loss": 0.0138, + "step": 362 + }, + { + "epoch": 0.6997590361445784, + "grad_norm": 0.388751745223999, + "learning_rate": 3.790385374145046e-05, + "loss": 0.0172, + "step": 363 + }, + { + "epoch": 0.7016867469879519, + "grad_norm": 0.5584064722061157, + "learning_rate": 3.7883820981038966e-05, + "loss": 0.0254, + "step": 364 + }, + { + "epoch": 0.7036144578313253, + "grad_norm": 1.394264817237854, + "learning_rate": 3.7863698294973114e-05, + "loss": 0.037, + "step": 365 + }, + { + "epoch": 0.7055421686746988, + "grad_norm": 0.46280744671821594, + "learning_rate": 3.78434857844363e-05, + "loss": 0.0234, + "step": 366 + }, + { + "epoch": 0.7074698795180723, + "grad_norm": 0.39548924565315247, + "learning_rate": 3.782318355106358e-05, + "loss": 0.0164, + "step": 367 + }, + { + "epoch": 0.7093975903614458, + "grad_norm": 0.7307773232460022, + "learning_rate": 3.780279169694118e-05, + "loss": 0.0192, + "step": 368 + }, + { + "epoch": 0.7113253012048193, + "grad_norm": 0.28035807609558105, + "learning_rate": 3.778231032460594e-05, + "loss": 0.0131, + "step": 369 + }, + { + "epoch": 0.7132530120481928, + "grad_norm": 0.8376953601837158, + "learning_rate": 3.776173953704486e-05, + "loss": 0.0291, + "step": 370 + }, + { + "epoch": 0.7151807228915663, + "grad_norm": 0.7356843948364258, + "learning_rate": 3.774107943769454e-05, + "loss": 0.0214, + "step": 371 + }, + { + "epoch": 0.7171084337349397, + "grad_norm": 0.41503390669822693, + "learning_rate": 3.772033013044064e-05, + "loss": 0.0221, + "step": 372 + }, + { + "epoch": 0.7190361445783132, + "grad_norm": 0.35732385516166687, + "learning_rate": 3.7699491719617436e-05, + "loss": 0.015, + "step": 373 + }, + { + "epoch": 0.7209638554216867, + "grad_norm": 0.283778578042984, + "learning_rate": 3.76785643100072e-05, + "loss": 0.0146, + "step": 374 + }, + { + "epoch": 0.7228915662650602, + "grad_norm": 0.3219413459300995, + "learning_rate": 3.765754800683974e-05, + "loss": 0.015, + "step": 375 + }, + { + "epoch": 0.7248192771084337, + "grad_norm": 0.610431432723999, + "learning_rate": 3.7636442915791856e-05, + "loss": 0.0326, + "step": 376 + }, + { + "epoch": 0.7267469879518073, + "grad_norm": 4.944870948791504, + "learning_rate": 3.7615249142986784e-05, + "loss": 0.0432, + "step": 377 + }, + { + "epoch": 0.7286746987951808, + "grad_norm": 0.4894593060016632, + "learning_rate": 3.7593966794993696e-05, + "loss": 0.0174, + "step": 378 + }, + { + "epoch": 0.7306024096385542, + "grad_norm": 0.4211325943470001, + "learning_rate": 3.757259597882714e-05, + "loss": 0.023, + "step": 379 + }, + { + "epoch": 0.7325301204819277, + "grad_norm": 0.33621737360954285, + "learning_rate": 3.755113680194651e-05, + "loss": 0.0201, + "step": 380 + }, + { + "epoch": 0.7344578313253012, + "grad_norm": 0.5799694657325745, + "learning_rate": 3.7529589372255514e-05, + "loss": 0.0173, + "step": 381 + }, + { + "epoch": 0.7363855421686747, + "grad_norm": 0.5172572731971741, + "learning_rate": 3.750795379810162e-05, + "loss": 0.0284, + "step": 382 + }, + { + "epoch": 0.7383132530120482, + "grad_norm": 0.5715453028678894, + "learning_rate": 3.748623018827552e-05, + "loss": 0.0194, + "step": 383 + }, + { + "epoch": 0.7402409638554217, + "grad_norm": 0.5284178256988525, + "learning_rate": 3.746441865201056e-05, + "loss": 0.0247, + "step": 384 + }, + { + "epoch": 0.7421686746987952, + "grad_norm": 0.37828654050827026, + "learning_rate": 3.744251929898223e-05, + "loss": 0.0097, + "step": 385 + }, + { + "epoch": 0.7440963855421687, + "grad_norm": 0.3252779543399811, + "learning_rate": 3.742053223930758e-05, + "loss": 0.0238, + "step": 386 + }, + { + "epoch": 0.7460240963855421, + "grad_norm": 0.6031543612480164, + "learning_rate": 3.7398457583544674e-05, + "loss": 0.0332, + "step": 387 + }, + { + "epoch": 0.7479518072289156, + "grad_norm": 0.23846614360809326, + "learning_rate": 3.737629544269206e-05, + "loss": 0.0122, + "step": 388 + }, + { + "epoch": 0.7498795180722891, + "grad_norm": 0.5274029970169067, + "learning_rate": 3.7354045928188155e-05, + "loss": 0.0324, + "step": 389 + }, + { + "epoch": 0.7518072289156627, + "grad_norm": 0.4672217071056366, + "learning_rate": 3.733170915191075e-05, + "loss": 0.0196, + "step": 390 + }, + { + "epoch": 0.7537349397590362, + "grad_norm": 0.29819396138191223, + "learning_rate": 3.730928522617639e-05, + "loss": 0.0131, + "step": 391 + }, + { + "epoch": 0.7556626506024097, + "grad_norm": 0.43824997544288635, + "learning_rate": 3.7286774263739855e-05, + "loss": 0.0238, + "step": 392 + }, + { + "epoch": 0.7575903614457832, + "grad_norm": 0.2822072505950928, + "learning_rate": 3.726417637779357e-05, + "loss": 0.0314, + "step": 393 + }, + { + "epoch": 0.7595180722891566, + "grad_norm": 0.43815648555755615, + "learning_rate": 3.7241491681967044e-05, + "loss": 0.0144, + "step": 394 + }, + { + "epoch": 0.7614457831325301, + "grad_norm": 0.37194815278053284, + "learning_rate": 3.721872029032628e-05, + "loss": 0.0286, + "step": 395 + }, + { + "epoch": 0.7633734939759036, + "grad_norm": 0.7319737672805786, + "learning_rate": 3.719586231737322e-05, + "loss": 0.0427, + "step": 396 + }, + { + "epoch": 0.7653012048192771, + "grad_norm": 0.5870066285133362, + "learning_rate": 3.717291787804517e-05, + "loss": 0.0138, + "step": 397 + }, + { + "epoch": 0.7672289156626506, + "grad_norm": 0.6574277281761169, + "learning_rate": 3.7149887087714225e-05, + "loss": 0.061, + "step": 398 + }, + { + "epoch": 0.7691566265060241, + "grad_norm": 0.5467348694801331, + "learning_rate": 3.712677006218666e-05, + "loss": 0.022, + "step": 399 + }, + { + "epoch": 0.7710843373493976, + "grad_norm": 0.3589288890361786, + "learning_rate": 3.710356691770238e-05, + "loss": 0.0161, + "step": 400 + }, + { + "epoch": 0.7730120481927711, + "grad_norm": 0.574630618095398, + "learning_rate": 3.708027777093433e-05, + "loss": 0.0285, + "step": 401 + }, + { + "epoch": 0.7749397590361445, + "grad_norm": 0.39048445224761963, + "learning_rate": 3.70569027389879e-05, + "loss": 0.012, + "step": 402 + }, + { + "epoch": 0.776867469879518, + "grad_norm": 0.34803536534309387, + "learning_rate": 3.703344193940032e-05, + "loss": 0.0155, + "step": 403 + }, + { + "epoch": 0.7787951807228916, + "grad_norm": 1.188948392868042, + "learning_rate": 3.700989549014011e-05, + "loss": 0.0617, + "step": 404 + }, + { + "epoch": 0.7807228915662651, + "grad_norm": 0.473157674074173, + "learning_rate": 3.698626350960646e-05, + "loss": 0.0298, + "step": 405 + }, + { + "epoch": 0.7826506024096386, + "grad_norm": 0.42009076476097107, + "learning_rate": 3.6962546116628634e-05, + "loss": 0.03, + "step": 406 + }, + { + "epoch": 0.7845783132530121, + "grad_norm": 0.6334308981895447, + "learning_rate": 3.693874343046537e-05, + "loss": 0.0107, + "step": 407 + }, + { + "epoch": 0.7865060240963856, + "grad_norm": 0.35594677925109863, + "learning_rate": 3.6914855570804314e-05, + "loss": 0.0174, + "step": 408 + }, + { + "epoch": 0.788433734939759, + "grad_norm": 0.28985708951950073, + "learning_rate": 3.689088265776136e-05, + "loss": 0.0149, + "step": 409 + }, + { + "epoch": 0.7903614457831325, + "grad_norm": 0.3981950581073761, + "learning_rate": 3.686682481188011e-05, + "loss": 0.019, + "step": 410 + }, + { + "epoch": 0.792289156626506, + "grad_norm": 0.48819583654403687, + "learning_rate": 3.6842682154131193e-05, + "loss": 0.0217, + "step": 411 + }, + { + "epoch": 0.7942168674698795, + "grad_norm": 0.42819952964782715, + "learning_rate": 3.681845480591174e-05, + "loss": 0.0198, + "step": 412 + }, + { + "epoch": 0.796144578313253, + "grad_norm": 0.48591694235801697, + "learning_rate": 3.6794142889044727e-05, + "loss": 0.0253, + "step": 413 + }, + { + "epoch": 0.7980722891566265, + "grad_norm": 0.4730607271194458, + "learning_rate": 3.676974652577835e-05, + "loss": 0.0329, + "step": 414 + }, + { + "epoch": 0.8, + "grad_norm": 0.5390865802764893, + "learning_rate": 3.6745265838785434e-05, + "loss": 0.0479, + "step": 415 + }, + { + "epoch": 0.8019277108433734, + "grad_norm": 0.6377891302108765, + "learning_rate": 3.672070095116283e-05, + "loss": 0.019, + "step": 416 + }, + { + "epoch": 0.803855421686747, + "grad_norm": 0.8984615206718445, + "learning_rate": 3.669605198643075e-05, + "loss": 0.0444, + "step": 417 + }, + { + "epoch": 0.8057831325301205, + "grad_norm": 0.4913877546787262, + "learning_rate": 3.667131906853219e-05, + "loss": 0.031, + "step": 418 + }, + { + "epoch": 0.807710843373494, + "grad_norm": 0.37894028425216675, + "learning_rate": 3.664650232183229e-05, + "loss": 0.0195, + "step": 419 + }, + { + "epoch": 0.8096385542168675, + "grad_norm": 0.3644949495792389, + "learning_rate": 3.66216018711177e-05, + "loss": 0.018, + "step": 420 + }, + { + "epoch": 0.811566265060241, + "grad_norm": 0.414440393447876, + "learning_rate": 3.659661784159597e-05, + "loss": 0.0188, + "step": 421 + }, + { + "epoch": 0.8134939759036145, + "grad_norm": 0.49220341444015503, + "learning_rate": 3.65715503588949e-05, + "loss": 0.016, + "step": 422 + }, + { + "epoch": 0.815421686746988, + "grad_norm": 1.0939836502075195, + "learning_rate": 3.654639954906193e-05, + "loss": 0.0758, + "step": 423 + }, + { + "epoch": 0.8173493975903614, + "grad_norm": 0.43222442269325256, + "learning_rate": 3.652116553856349e-05, + "loss": 0.0308, + "step": 424 + }, + { + "epoch": 0.8192771084337349, + "grad_norm": 0.5081896185874939, + "learning_rate": 3.649584845428438e-05, + "loss": 0.0493, + "step": 425 + }, + { + "epoch": 0.8212048192771084, + "grad_norm": 0.9811948537826538, + "learning_rate": 3.64704484235271e-05, + "loss": 0.019, + "step": 426 + }, + { + "epoch": 0.8231325301204819, + "grad_norm": 0.31656572222709656, + "learning_rate": 3.6444965574011255e-05, + "loss": 0.0135, + "step": 427 + }, + { + "epoch": 0.8250602409638554, + "grad_norm": 0.7844433188438416, + "learning_rate": 3.641940003387289e-05, + "loss": 0.0402, + "step": 428 + }, + { + "epoch": 0.826987951807229, + "grad_norm": 0.3353273570537567, + "learning_rate": 3.6393751931663814e-05, + "loss": 0.0132, + "step": 429 + }, + { + "epoch": 0.8289156626506025, + "grad_norm": 0.7253058552742004, + "learning_rate": 3.6368021396351015e-05, + "loss": 0.0296, + "step": 430 + }, + { + "epoch": 0.8308433734939759, + "grad_norm": 0.45300304889678955, + "learning_rate": 3.634220855731598e-05, + "loss": 0.0258, + "step": 431 + }, + { + "epoch": 0.8327710843373494, + "grad_norm": 0.3480473458766937, + "learning_rate": 3.631631354435403e-05, + "loss": 0.0099, + "step": 432 + }, + { + "epoch": 0.8346987951807229, + "grad_norm": 2.1114516258239746, + "learning_rate": 3.62903364876737e-05, + "loss": 0.0457, + "step": 433 + }, + { + "epoch": 0.8366265060240964, + "grad_norm": 0.5649561882019043, + "learning_rate": 3.626427751789606e-05, + "loss": 0.0444, + "step": 434 + }, + { + "epoch": 0.8385542168674699, + "grad_norm": 0.3864995539188385, + "learning_rate": 3.623813676605405e-05, + "loss": 0.0223, + "step": 435 + }, + { + "epoch": 0.8404819277108434, + "grad_norm": 1.2134298086166382, + "learning_rate": 3.621191436359186e-05, + "loss": 0.0353, + "step": 436 + }, + { + "epoch": 0.8424096385542169, + "grad_norm": 0.4403415024280548, + "learning_rate": 3.6185610442364246e-05, + "loss": 0.0216, + "step": 437 + }, + { + "epoch": 0.8443373493975903, + "grad_norm": 0.6050297021865845, + "learning_rate": 3.6159225134635846e-05, + "loss": 0.0433, + "step": 438 + }, + { + "epoch": 0.8462650602409638, + "grad_norm": 0.7951678037643433, + "learning_rate": 3.6132758573080556e-05, + "loss": 0.031, + "step": 439 + }, + { + "epoch": 0.8481927710843373, + "grad_norm": 0.4991949796676636, + "learning_rate": 3.6106210890780834e-05, + "loss": 0.0313, + "step": 440 + }, + { + "epoch": 0.8501204819277108, + "grad_norm": 0.47951385378837585, + "learning_rate": 3.607958222122704e-05, + "loss": 0.0218, + "step": 441 + }, + { + "epoch": 0.8520481927710843, + "grad_norm": 0.7345194220542908, + "learning_rate": 3.6052872698316755e-05, + "loss": 0.0239, + "step": 442 + }, + { + "epoch": 0.8539759036144579, + "grad_norm": 1.4814884662628174, + "learning_rate": 3.602608245635414e-05, + "loss": 0.0127, + "step": 443 + }, + { + "epoch": 0.8559036144578314, + "grad_norm": 2.4240877628326416, + "learning_rate": 3.599921163004922e-05, + "loss": 0.0618, + "step": 444 + }, + { + "epoch": 0.8578313253012049, + "grad_norm": 0.41523510217666626, + "learning_rate": 3.5972260354517216e-05, + "loss": 0.0283, + "step": 445 + }, + { + "epoch": 0.8597590361445783, + "grad_norm": 0.5577677488327026, + "learning_rate": 3.594522876527791e-05, + "loss": 0.0271, + "step": 446 + }, + { + "epoch": 0.8616867469879518, + "grad_norm": 0.5829064846038818, + "learning_rate": 3.591811699825487e-05, + "loss": 0.0169, + "step": 447 + }, + { + "epoch": 0.8636144578313253, + "grad_norm": 0.4478822350502014, + "learning_rate": 3.5890925189774886e-05, + "loss": 0.0239, + "step": 448 + }, + { + "epoch": 0.8655421686746988, + "grad_norm": 0.3498048782348633, + "learning_rate": 3.586365347656718e-05, + "loss": 0.0137, + "step": 449 + }, + { + "epoch": 0.8674698795180723, + "grad_norm": 0.6571130156517029, + "learning_rate": 3.583630199576278e-05, + "loss": 0.027, + "step": 450 + }, + { + "epoch": 0.8693975903614458, + "grad_norm": 0.344970166683197, + "learning_rate": 3.58088708848938e-05, + "loss": 0.0167, + "step": 451 + }, + { + "epoch": 0.8713253012048193, + "grad_norm": 0.34611570835113525, + "learning_rate": 3.5781360281892775e-05, + "loss": 0.0468, + "step": 452 + }, + { + "epoch": 0.8732530120481927, + "grad_norm": 0.66157066822052, + "learning_rate": 3.575377032509194e-05, + "loss": 0.0344, + "step": 453 + }, + { + "epoch": 0.8751807228915662, + "grad_norm": 0.3676326870918274, + "learning_rate": 3.5726101153222534e-05, + "loss": 0.0366, + "step": 454 + }, + { + "epoch": 0.8771084337349397, + "grad_norm": 0.5958423018455505, + "learning_rate": 3.569835290541414e-05, + "loss": 0.0382, + "step": 455 + }, + { + "epoch": 0.8790361445783132, + "grad_norm": 0.36787471175193787, + "learning_rate": 3.567052572119397e-05, + "loss": 0.018, + "step": 456 + }, + { + "epoch": 0.8809638554216868, + "grad_norm": 0.9478234052658081, + "learning_rate": 3.564261974048611e-05, + "loss": 0.0179, + "step": 457 + }, + { + "epoch": 0.8828915662650603, + "grad_norm": 0.3337579369544983, + "learning_rate": 3.56146351036109e-05, + "loss": 0.0147, + "step": 458 + }, + { + "epoch": 0.8848192771084338, + "grad_norm": 0.4984932243824005, + "learning_rate": 3.558657195128416e-05, + "loss": 0.0224, + "step": 459 + }, + { + "epoch": 0.8867469879518072, + "grad_norm": 0.36718735098838806, + "learning_rate": 3.555843042461653e-05, + "loss": 0.0202, + "step": 460 + }, + { + "epoch": 0.8886746987951807, + "grad_norm": 0.4081745445728302, + "learning_rate": 3.553021066511274e-05, + "loss": 0.0288, + "step": 461 + }, + { + "epoch": 0.8906024096385542, + "grad_norm": 0.3233242332935333, + "learning_rate": 3.55019128146709e-05, + "loss": 0.0362, + "step": 462 + }, + { + "epoch": 0.8925301204819277, + "grad_norm": 0.6560158729553223, + "learning_rate": 3.547353701558178e-05, + "loss": 0.038, + "step": 463 + }, + { + "epoch": 0.8944578313253012, + "grad_norm": 0.47668641805648804, + "learning_rate": 3.544508341052811e-05, + "loss": 0.0399, + "step": 464 + }, + { + "epoch": 0.8963855421686747, + "grad_norm": 0.45512664318084717, + "learning_rate": 3.541655214258383e-05, + "loss": 0.022, + "step": 465 + }, + { + "epoch": 0.8983132530120482, + "grad_norm": 0.8410730361938477, + "learning_rate": 3.538794335521343e-05, + "loss": 0.0315, + "step": 466 + }, + { + "epoch": 0.9002409638554217, + "grad_norm": 0.4872909486293793, + "learning_rate": 3.535925719227117e-05, + "loss": 0.0152, + "step": 467 + }, + { + "epoch": 0.9021686746987951, + "grad_norm": 0.45623311400413513, + "learning_rate": 3.533049379800038e-05, + "loss": 0.0305, + "step": 468 + }, + { + "epoch": 0.9040963855421686, + "grad_norm": 0.43087029457092285, + "learning_rate": 3.530165331703275e-05, + "loss": 0.0131, + "step": 469 + }, + { + "epoch": 0.9060240963855422, + "grad_norm": 0.4610525369644165, + "learning_rate": 3.527273589438756e-05, + "loss": 0.0187, + "step": 470 + }, + { + "epoch": 0.9079518072289157, + "grad_norm": 0.3356114327907562, + "learning_rate": 3.5243741675471006e-05, + "loss": 0.0185, + "step": 471 + }, + { + "epoch": 0.9098795180722892, + "grad_norm": 0.9065960049629211, + "learning_rate": 3.5214670806075426e-05, + "loss": 0.0433, + "step": 472 + }, + { + "epoch": 0.9118072289156627, + "grad_norm": 0.3652578294277191, + "learning_rate": 3.518552343237858e-05, + "loss": 0.02, + "step": 473 + }, + { + "epoch": 0.9137349397590362, + "grad_norm": 0.32377883791923523, + "learning_rate": 3.5156299700942916e-05, + "loss": 0.0165, + "step": 474 + }, + { + "epoch": 0.9156626506024096, + "grad_norm": 0.2431817352771759, + "learning_rate": 3.512699975871485e-05, + "loss": 0.0172, + "step": 475 + }, + { + "epoch": 0.9175903614457831, + "grad_norm": 0.6390707492828369, + "learning_rate": 3.509762375302399e-05, + "loss": 0.0356, + "step": 476 + }, + { + "epoch": 0.9195180722891566, + "grad_norm": 0.2283092886209488, + "learning_rate": 3.506817183158243e-05, + "loss": 0.0088, + "step": 477 + }, + { + "epoch": 0.9214457831325301, + "grad_norm": 0.5053914189338684, + "learning_rate": 3.5038644142483966e-05, + "loss": 0.0389, + "step": 478 + }, + { + "epoch": 0.9233734939759036, + "grad_norm": 0.2567576467990875, + "learning_rate": 3.500904083420342e-05, + "loss": 0.0155, + "step": 479 + }, + { + "epoch": 0.9253012048192771, + "grad_norm": 0.6852384209632874, + "learning_rate": 3.497936205559583e-05, + "loss": 0.0247, + "step": 480 + }, + { + "epoch": 0.9272289156626506, + "grad_norm": 0.36403414607048035, + "learning_rate": 3.494960795589572e-05, + "loss": 0.023, + "step": 481 + }, + { + "epoch": 0.929156626506024, + "grad_norm": 0.506554901599884, + "learning_rate": 3.491977868471635e-05, + "loss": 0.0273, + "step": 482 + }, + { + "epoch": 0.9310843373493976, + "grad_norm": 0.38329923152923584, + "learning_rate": 3.4889874392048985e-05, + "loss": 0.0169, + "step": 483 + }, + { + "epoch": 0.9330120481927711, + "grad_norm": 0.2805836498737335, + "learning_rate": 3.48598952282621e-05, + "loss": 0.0105, + "step": 484 + }, + { + "epoch": 0.9349397590361446, + "grad_norm": 0.6315302848815918, + "learning_rate": 3.482984134410067e-05, + "loss": 0.0289, + "step": 485 + }, + { + "epoch": 0.9368674698795181, + "grad_norm": 0.6431388854980469, + "learning_rate": 3.479971289068537e-05, + "loss": 0.0311, + "step": 486 + }, + { + "epoch": 0.9387951807228916, + "grad_norm": 0.9794723391532898, + "learning_rate": 3.476951001951184e-05, + "loss": 0.0452, + "step": 487 + }, + { + "epoch": 0.9407228915662651, + "grad_norm": 0.7984824180603027, + "learning_rate": 3.473923288244991e-05, + "loss": 0.0689, + "step": 488 + }, + { + "epoch": 0.9426506024096386, + "grad_norm": 0.46362006664276123, + "learning_rate": 3.470888163174286e-05, + "loss": 0.0241, + "step": 489 + }, + { + "epoch": 0.944578313253012, + "grad_norm": 0.5051195025444031, + "learning_rate": 3.467845642000661e-05, + "loss": 0.0228, + "step": 490 + }, + { + "epoch": 0.9465060240963855, + "grad_norm": 0.3082812428474426, + "learning_rate": 3.4647957400229004e-05, + "loss": 0.0144, + "step": 491 + }, + { + "epoch": 0.948433734939759, + "grad_norm": 0.2691391110420227, + "learning_rate": 3.461738472576902e-05, + "loss": 0.0167, + "step": 492 + }, + { + "epoch": 0.9503614457831325, + "grad_norm": 0.5627671480178833, + "learning_rate": 3.458673855035597e-05, + "loss": 0.031, + "step": 493 + }, + { + "epoch": 0.952289156626506, + "grad_norm": 0.4571435749530792, + "learning_rate": 3.455601902808876e-05, + "loss": 0.0191, + "step": 494 + }, + { + "epoch": 0.9542168674698795, + "grad_norm": 1.0117709636688232, + "learning_rate": 3.452522631343515e-05, + "loss": 0.0192, + "step": 495 + }, + { + "epoch": 0.9561445783132531, + "grad_norm": 0.28375712037086487, + "learning_rate": 3.449436056123086e-05, + "loss": 0.0159, + "step": 496 + }, + { + "epoch": 0.9580722891566265, + "grad_norm": 0.26381856203079224, + "learning_rate": 3.446342192667893e-05, + "loss": 0.0113, + "step": 497 + }, + { + "epoch": 0.96, + "grad_norm": 0.49317577481269836, + "learning_rate": 3.443241056534884e-05, + "loss": 0.0332, + "step": 498 + }, + { + "epoch": 0.9619277108433735, + "grad_norm": 0.28884485363960266, + "learning_rate": 3.440132663317579e-05, + "loss": 0.0117, + "step": 499 + }, + { + "epoch": 0.963855421686747, + "grad_norm": 0.36255285143852234, + "learning_rate": 3.4370170286459864e-05, + "loss": 0.0169, + "step": 500 + }, + { + "epoch": 0.9657831325301205, + "grad_norm": 0.4265049993991852, + "learning_rate": 3.433894168186529e-05, + "loss": 0.0217, + "step": 501 + }, + { + "epoch": 0.967710843373494, + "grad_norm": 0.8169426321983337, + "learning_rate": 3.430764097641962e-05, + "loss": 0.0207, + "step": 502 + }, + { + "epoch": 0.9696385542168675, + "grad_norm": 1.866077184677124, + "learning_rate": 3.427626832751296e-05, + "loss": 0.0381, + "step": 503 + }, + { + "epoch": 0.971566265060241, + "grad_norm": 0.33124980330467224, + "learning_rate": 3.424482389289716e-05, + "loss": 0.0245, + "step": 504 + }, + { + "epoch": 0.9734939759036144, + "grad_norm": 0.37479540705680847, + "learning_rate": 3.4213307830685055e-05, + "loss": 0.0164, + "step": 505 + }, + { + "epoch": 0.9754216867469879, + "grad_norm": 0.39738863706588745, + "learning_rate": 3.4181720299349615e-05, + "loss": 0.0297, + "step": 506 + }, + { + "epoch": 0.9773493975903614, + "grad_norm": 0.2567287087440491, + "learning_rate": 3.4150061457723205e-05, + "loss": 0.0102, + "step": 507 + }, + { + "epoch": 0.9792771084337349, + "grad_norm": 0.6230517029762268, + "learning_rate": 3.411833146499675e-05, + "loss": 0.0243, + "step": 508 + }, + { + "epoch": 0.9812048192771085, + "grad_norm": 0.44843971729278564, + "learning_rate": 3.408653048071894e-05, + "loss": 0.0357, + "step": 509 + }, + { + "epoch": 0.983132530120482, + "grad_norm": 1.0569655895233154, + "learning_rate": 3.405465866479546e-05, + "loss": 0.037, + "step": 510 + }, + { + "epoch": 0.9850602409638555, + "grad_norm": 0.29000964760780334, + "learning_rate": 3.402271617748812e-05, + "loss": 0.0129, + "step": 511 + }, + { + "epoch": 0.9869879518072289, + "grad_norm": 2.1627447605133057, + "learning_rate": 3.399070317941413e-05, + "loss": 0.0442, + "step": 512 + }, + { + "epoch": 0.9889156626506024, + "grad_norm": 0.27371272444725037, + "learning_rate": 3.395861983154522e-05, + "loss": 0.0119, + "step": 513 + }, + { + "epoch": 0.9908433734939759, + "grad_norm": 0.4117226302623749, + "learning_rate": 3.392646629520688e-05, + "loss": 0.0455, + "step": 514 + }, + { + "epoch": 0.9927710843373494, + "grad_norm": 0.5098996758460999, + "learning_rate": 3.389424273207752e-05, + "loss": 0.0203, + "step": 515 + }, + { + "epoch": 0.9946987951807229, + "grad_norm": 0.5192157626152039, + "learning_rate": 3.386194930418767e-05, + "loss": 0.0329, + "step": 516 + }, + { + "epoch": 0.9966265060240964, + "grad_norm": 0.18757697939872742, + "learning_rate": 3.382958617391915e-05, + "loss": 0.0065, + "step": 517 + }, + { + "epoch": 0.9985542168674699, + "grad_norm": 0.3334413170814514, + "learning_rate": 3.3797153504004296e-05, + "loss": 0.0266, + "step": 518 + }, + { + "epoch": 1.0, + "grad_norm": 0.4152225852012634, + "learning_rate": 3.3764651457525095e-05, + "loss": 0.0169, + "step": 519 + }, + { + "epoch": 1.0019277108433735, + "grad_norm": 0.43535247445106506, + "learning_rate": 3.373208019791237e-05, + "loss": 0.0221, + "step": 520 + }, + { + "epoch": 1.003855421686747, + "grad_norm": 0.39292722940444946, + "learning_rate": 3.3699439888945e-05, + "loss": 0.0211, + "step": 521 + }, + { + "epoch": 1.0057831325301205, + "grad_norm": 0.19566713273525238, + "learning_rate": 3.366673069474904e-05, + "loss": 0.0069, + "step": 522 + }, + { + "epoch": 1.007710843373494, + "grad_norm": 0.5101853609085083, + "learning_rate": 3.3633952779796914e-05, + "loss": 0.0191, + "step": 523 + }, + { + "epoch": 1.0096385542168675, + "grad_norm": 0.999434769153595, + "learning_rate": 3.360110630890664e-05, + "loss": 0.0196, + "step": 524 + }, + { + "epoch": 1.011566265060241, + "grad_norm": 0.4646223783493042, + "learning_rate": 3.356819144724092e-05, + "loss": 0.0328, + "step": 525 + }, + { + "epoch": 1.0134939759036146, + "grad_norm": 0.3132480978965759, + "learning_rate": 3.3535208360306354e-05, + "loss": 0.0203, + "step": 526 + }, + { + "epoch": 1.0154216867469879, + "grad_norm": 0.3038032352924347, + "learning_rate": 3.350215721395261e-05, + "loss": 0.0122, + "step": 527 + }, + { + "epoch": 1.0173493975903614, + "grad_norm": 0.45082882046699524, + "learning_rate": 3.346903817437157e-05, + "loss": 0.0437, + "step": 528 + }, + { + "epoch": 1.0192771084337349, + "grad_norm": 0.26917046308517456, + "learning_rate": 3.343585140809651e-05, + "loss": 0.013, + "step": 529 + }, + { + "epoch": 1.0212048192771084, + "grad_norm": 0.23869264125823975, + "learning_rate": 3.3402597082001276e-05, + "loss": 0.008, + "step": 530 + }, + { + "epoch": 1.0231325301204819, + "grad_norm": 0.31315353512763977, + "learning_rate": 3.3369275363299394e-05, + "loss": 0.0078, + "step": 531 + }, + { + "epoch": 1.0250602409638554, + "grad_norm": 0.4780346751213074, + "learning_rate": 3.333588641954327e-05, + "loss": 0.0225, + "step": 532 + }, + { + "epoch": 1.026987951807229, + "grad_norm": 0.2920368015766144, + "learning_rate": 3.330243041862336e-05, + "loss": 0.0118, + "step": 533 + }, + { + "epoch": 1.0289156626506024, + "grad_norm": 0.543669581413269, + "learning_rate": 3.326890752876728e-05, + "loss": 0.0338, + "step": 534 + }, + { + "epoch": 1.030843373493976, + "grad_norm": 0.4288000464439392, + "learning_rate": 3.323531791853901e-05, + "loss": 0.0341, + "step": 535 + }, + { + "epoch": 1.0327710843373494, + "grad_norm": 0.26600322127342224, + "learning_rate": 3.3201661756838e-05, + "loss": 0.0184, + "step": 536 + }, + { + "epoch": 1.034698795180723, + "grad_norm": 0.290937602519989, + "learning_rate": 3.316793921289835e-05, + "loss": 0.0152, + "step": 537 + }, + { + "epoch": 1.0366265060240965, + "grad_norm": 0.7621443271636963, + "learning_rate": 3.313415045628795e-05, + "loss": 0.0326, + "step": 538 + }, + { + "epoch": 1.03855421686747, + "grad_norm": 0.5581283569335938, + "learning_rate": 3.3100295656907646e-05, + "loss": 0.0164, + "step": 539 + }, + { + "epoch": 1.0404819277108435, + "grad_norm": 0.20930901169776917, + "learning_rate": 3.306637498499034e-05, + "loss": 0.0091, + "step": 540 + }, + { + "epoch": 1.0424096385542168, + "grad_norm": 0.46212059259414673, + "learning_rate": 3.303238861110018e-05, + "loss": 0.0118, + "step": 541 + }, + { + "epoch": 1.0443373493975903, + "grad_norm": 0.38259151577949524, + "learning_rate": 3.299833670613168e-05, + "loss": 0.0081, + "step": 542 + }, + { + "epoch": 1.0462650602409638, + "grad_norm": 0.4888618290424347, + "learning_rate": 3.2964219441308865e-05, + "loss": 0.0138, + "step": 543 + }, + { + "epoch": 1.0481927710843373, + "grad_norm": 0.32103127241134644, + "learning_rate": 3.2930036988184425e-05, + "loss": 0.0171, + "step": 544 + }, + { + "epoch": 1.0501204819277108, + "grad_norm": 0.27787327766418457, + "learning_rate": 3.28957895186388e-05, + "loss": 0.0106, + "step": 545 + }, + { + "epoch": 1.0520481927710843, + "grad_norm": 0.35597777366638184, + "learning_rate": 3.2861477204879395e-05, + "loss": 0.0123, + "step": 546 + }, + { + "epoch": 1.0539759036144578, + "grad_norm": 0.3619804084300995, + "learning_rate": 3.2827100219439656e-05, + "loss": 0.0088, + "step": 547 + }, + { + "epoch": 1.0559036144578313, + "grad_norm": 0.2525513470172882, + "learning_rate": 3.279265873517822e-05, + "loss": 0.0179, + "step": 548 + }, + { + "epoch": 1.0578313253012048, + "grad_norm": 0.3910020887851715, + "learning_rate": 3.275815292527804e-05, + "loss": 0.0142, + "step": 549 + }, + { + "epoch": 1.0597590361445783, + "grad_norm": 0.30515050888061523, + "learning_rate": 3.2723582963245526e-05, + "loss": 0.0123, + "step": 550 + }, + { + "epoch": 1.0616867469879518, + "grad_norm": 0.21708644926548004, + "learning_rate": 3.2688949022909665e-05, + "loss": 0.0098, + "step": 551 + }, + { + "epoch": 1.0636144578313254, + "grad_norm": 0.23307719826698303, + "learning_rate": 3.265425127842114e-05, + "loss": 0.0097, + "step": 552 + }, + { + "epoch": 1.0655421686746989, + "grad_norm": 0.676654577255249, + "learning_rate": 3.261948990425147e-05, + "loss": 0.0227, + "step": 553 + }, + { + "epoch": 1.0674698795180724, + "grad_norm": 0.4593975841999054, + "learning_rate": 3.258466507519213e-05, + "loss": 0.047, + "step": 554 + }, + { + "epoch": 1.0693975903614459, + "grad_norm": 0.19405829906463623, + "learning_rate": 3.254977696635366e-05, + "loss": 0.0314, + "step": 555 + }, + { + "epoch": 1.0713253012048192, + "grad_norm": 0.14563389122486115, + "learning_rate": 3.2514825753164774e-05, + "loss": 0.0046, + "step": 556 + }, + { + "epoch": 1.0732530120481927, + "grad_norm": 0.2642340064048767, + "learning_rate": 3.247981161137153e-05, + "loss": 0.022, + "step": 557 + }, + { + "epoch": 1.0751807228915662, + "grad_norm": 0.17274761199951172, + "learning_rate": 3.2444734717036386e-05, + "loss": 0.0134, + "step": 558 + }, + { + "epoch": 1.0771084337349397, + "grad_norm": 0.44354626536369324, + "learning_rate": 3.240959524653735e-05, + "loss": 0.0211, + "step": 559 + }, + { + "epoch": 1.0790361445783132, + "grad_norm": 0.2806888818740845, + "learning_rate": 3.237439337656708e-05, + "loss": 0.0141, + "step": 560 + }, + { + "epoch": 1.0809638554216867, + "grad_norm": 0.21679501235485077, + "learning_rate": 3.2339129284131994e-05, + "loss": 0.019, + "step": 561 + }, + { + "epoch": 1.0828915662650602, + "grad_norm": 0.3040260076522827, + "learning_rate": 3.2303803146551386e-05, + "loss": 0.0249, + "step": 562 + }, + { + "epoch": 1.0848192771084337, + "grad_norm": 0.2793775200843811, + "learning_rate": 3.226841514145656e-05, + "loss": 0.0088, + "step": 563 + }, + { + "epoch": 1.0867469879518072, + "grad_norm": 0.149955615401268, + "learning_rate": 3.223296544678987e-05, + "loss": 0.0054, + "step": 564 + }, + { + "epoch": 1.0886746987951808, + "grad_norm": 0.22166767716407776, + "learning_rate": 3.219745424080389e-05, + "loss": 0.0109, + "step": 565 + }, + { + "epoch": 1.0906024096385543, + "grad_norm": 0.22399431467056274, + "learning_rate": 3.2161881702060476e-05, + "loss": 0.0106, + "step": 566 + }, + { + "epoch": 1.0925301204819278, + "grad_norm": 0.18537986278533936, + "learning_rate": 3.2126248009429905e-05, + "loss": 0.0077, + "step": 567 + }, + { + "epoch": 1.0944578313253013, + "grad_norm": 0.24511495232582092, + "learning_rate": 3.2090553342089935e-05, + "loss": 0.0093, + "step": 568 + }, + { + "epoch": 1.0963855421686748, + "grad_norm": 0.4766045808792114, + "learning_rate": 3.205479787952494e-05, + "loss": 0.036, + "step": 569 + }, + { + "epoch": 1.0983132530120483, + "grad_norm": 0.1425715535879135, + "learning_rate": 3.201898180152499e-05, + "loss": 0.0085, + "step": 570 + }, + { + "epoch": 1.1002409638554216, + "grad_norm": 0.1909666359424591, + "learning_rate": 3.1983105288184945e-05, + "loss": 0.0081, + "step": 571 + }, + { + "epoch": 1.102168674698795, + "grad_norm": 0.44077104330062866, + "learning_rate": 3.194716851990355e-05, + "loss": 0.017, + "step": 572 + }, + { + "epoch": 1.1040963855421686, + "grad_norm": 0.5757400989532471, + "learning_rate": 3.191117167738253e-05, + "loss": 0.021, + "step": 573 + }, + { + "epoch": 1.106024096385542, + "grad_norm": 0.1977701038122177, + "learning_rate": 3.1875114941625705e-05, + "loss": 0.0096, + "step": 574 + }, + { + "epoch": 1.1079518072289156, + "grad_norm": 0.3524581491947174, + "learning_rate": 3.1838998493938026e-05, + "loss": 0.0118, + "step": 575 + }, + { + "epoch": 1.1098795180722891, + "grad_norm": 0.3301331698894501, + "learning_rate": 3.180282251592472e-05, + "loss": 0.0094, + "step": 576 + }, + { + "epoch": 1.1118072289156626, + "grad_norm": 0.2774488925933838, + "learning_rate": 3.1766587189490336e-05, + "loss": 0.0131, + "step": 577 + }, + { + "epoch": 1.1137349397590361, + "grad_norm": 1.732595443725586, + "learning_rate": 3.173029269683785e-05, + "loss": 0.0445, + "step": 578 + }, + { + "epoch": 1.1156626506024097, + "grad_norm": 0.28746843338012695, + "learning_rate": 3.169393922046776e-05, + "loss": 0.0116, + "step": 579 + }, + { + "epoch": 1.1175903614457832, + "grad_norm": 0.2952995002269745, + "learning_rate": 3.165752694317713e-05, + "loss": 0.0116, + "step": 580 + }, + { + "epoch": 1.1195180722891567, + "grad_norm": 0.2938575744628906, + "learning_rate": 3.16210560480587e-05, + "loss": 0.013, + "step": 581 + }, + { + "epoch": 1.1214457831325302, + "grad_norm": 0.22283495962619781, + "learning_rate": 3.158452671849998e-05, + "loss": 0.0052, + "step": 582 + }, + { + "epoch": 1.1233734939759037, + "grad_norm": 0.6272858381271362, + "learning_rate": 3.154793913818226e-05, + "loss": 0.0182, + "step": 583 + }, + { + "epoch": 1.1253012048192772, + "grad_norm": 0.479753702878952, + "learning_rate": 3.1511293491079804e-05, + "loss": 0.0146, + "step": 584 + }, + { + "epoch": 1.1272289156626507, + "grad_norm": 0.31104400753974915, + "learning_rate": 3.1474589961458786e-05, + "loss": 0.0139, + "step": 585 + }, + { + "epoch": 1.129156626506024, + "grad_norm": 0.4932832419872284, + "learning_rate": 3.1437828733876477e-05, + "loss": 0.0236, + "step": 586 + }, + { + "epoch": 1.1310843373493975, + "grad_norm": 0.222808837890625, + "learning_rate": 3.140100999318025e-05, + "loss": 0.0084, + "step": 587 + }, + { + "epoch": 1.133012048192771, + "grad_norm": 0.4515356719493866, + "learning_rate": 3.136413392450668e-05, + "loss": 0.0215, + "step": 588 + }, + { + "epoch": 1.1349397590361445, + "grad_norm": 0.39302268624305725, + "learning_rate": 3.132720071328061e-05, + "loss": 0.0154, + "step": 589 + }, + { + "epoch": 1.136867469879518, + "grad_norm": 0.43382835388183594, + "learning_rate": 3.1290210545214205e-05, + "loss": 0.0088, + "step": 590 + }, + { + "epoch": 1.1387951807228915, + "grad_norm": 0.18707136809825897, + "learning_rate": 3.125316360630602e-05, + "loss": 0.0126, + "step": 591 + }, + { + "epoch": 1.140722891566265, + "grad_norm": 0.5688219666481018, + "learning_rate": 3.121606008284011e-05, + "loss": 0.0147, + "step": 592 + }, + { + "epoch": 1.1426506024096386, + "grad_norm": 0.3321833312511444, + "learning_rate": 3.1178900161385005e-05, + "loss": 0.0119, + "step": 593 + }, + { + "epoch": 1.144578313253012, + "grad_norm": 0.3738424777984619, + "learning_rate": 3.114168402879286e-05, + "loss": 0.0158, + "step": 594 + }, + { + "epoch": 1.1465060240963856, + "grad_norm": 0.2386978417634964, + "learning_rate": 3.110441187219846e-05, + "loss": 0.0107, + "step": 595 + }, + { + "epoch": 1.148433734939759, + "grad_norm": 0.2165699452161789, + "learning_rate": 3.10670838790183e-05, + "loss": 0.0079, + "step": 596 + }, + { + "epoch": 1.1503614457831326, + "grad_norm": 0.25952696800231934, + "learning_rate": 3.102970023694965e-05, + "loss": 0.0147, + "step": 597 + }, + { + "epoch": 1.152289156626506, + "grad_norm": 0.21448305249214172, + "learning_rate": 3.099226113396959e-05, + "loss": 0.0099, + "step": 598 + }, + { + "epoch": 1.1542168674698796, + "grad_norm": 0.37226060032844543, + "learning_rate": 3.095476675833405e-05, + "loss": 0.0214, + "step": 599 + }, + { + "epoch": 1.1561445783132531, + "grad_norm": 0.29637983441352844, + "learning_rate": 3.0917217298576955e-05, + "loss": 0.0118, + "step": 600 + }, + { + "epoch": 1.1580722891566264, + "grad_norm": 0.18535609543323517, + "learning_rate": 3.0879612943509154e-05, + "loss": 0.0086, + "step": 601 + }, + { + "epoch": 1.16, + "grad_norm": 0.25874125957489014, + "learning_rate": 3.0841953882217536e-05, + "loss": 0.0088, + "step": 602 + }, + { + "epoch": 1.1619277108433734, + "grad_norm": 0.46092745661735535, + "learning_rate": 3.08042403040641e-05, + "loss": 0.0241, + "step": 603 + }, + { + "epoch": 1.163855421686747, + "grad_norm": 0.27023249864578247, + "learning_rate": 3.076647239868494e-05, + "loss": 0.0154, + "step": 604 + }, + { + "epoch": 1.1657831325301204, + "grad_norm": 0.445157527923584, + "learning_rate": 3.072865035598933e-05, + "loss": 0.0197, + "step": 605 + }, + { + "epoch": 1.167710843373494, + "grad_norm": 0.18097272515296936, + "learning_rate": 3.06907743661588e-05, + "loss": 0.0093, + "step": 606 + }, + { + "epoch": 1.1696385542168675, + "grad_norm": 0.22469942271709442, + "learning_rate": 3.065284461964609e-05, + "loss": 0.0171, + "step": 607 + }, + { + "epoch": 1.171566265060241, + "grad_norm": 0.20190906524658203, + "learning_rate": 3.061486130717428e-05, + "loss": 0.008, + "step": 608 + }, + { + "epoch": 1.1734939759036145, + "grad_norm": 0.18294145166873932, + "learning_rate": 3.057682461973579e-05, + "loss": 0.0155, + "step": 609 + }, + { + "epoch": 1.175421686746988, + "grad_norm": 0.34203943610191345, + "learning_rate": 3.053873474859143e-05, + "loss": 0.0212, + "step": 610 + }, + { + "epoch": 1.1773493975903615, + "grad_norm": 0.49073582887649536, + "learning_rate": 3.050059188526942e-05, + "loss": 0.019, + "step": 611 + }, + { + "epoch": 1.179277108433735, + "grad_norm": 0.3537680506706238, + "learning_rate": 3.046239622156446e-05, + "loss": 0.0147, + "step": 612 + }, + { + "epoch": 1.1812048192771085, + "grad_norm": 0.2584632635116577, + "learning_rate": 3.042414794953674e-05, + "loss": 0.0088, + "step": 613 + }, + { + "epoch": 1.1831325301204818, + "grad_norm": 0.3529360890388489, + "learning_rate": 3.0385847261510975e-05, + "loss": 0.0187, + "step": 614 + }, + { + "epoch": 1.1850602409638555, + "grad_norm": 0.3331570327281952, + "learning_rate": 3.0347494350075465e-05, + "loss": 0.0124, + "step": 615 + }, + { + "epoch": 1.1869879518072288, + "grad_norm": 0.2223527580499649, + "learning_rate": 3.0309089408081074e-05, + "loss": 0.01, + "step": 616 + }, + { + "epoch": 1.1889156626506023, + "grad_norm": 0.21985746920108795, + "learning_rate": 3.027063262864032e-05, + "loss": 0.0087, + "step": 617 + }, + { + "epoch": 1.1908433734939758, + "grad_norm": 0.2989653944969177, + "learning_rate": 3.023212420512637e-05, + "loss": 0.0137, + "step": 618 + }, + { + "epoch": 1.1927710843373494, + "grad_norm": 0.17423275113105774, + "learning_rate": 3.0193564331172074e-05, + "loss": 0.0056, + "step": 619 + }, + { + "epoch": 1.1946987951807229, + "grad_norm": 1.0992127656936646, + "learning_rate": 3.0154953200668976e-05, + "loss": 0.0274, + "step": 620 + }, + { + "epoch": 1.1966265060240964, + "grad_norm": 0.21641989052295685, + "learning_rate": 3.011629100776638e-05, + "loss": 0.0151, + "step": 621 + }, + { + "epoch": 1.1985542168674699, + "grad_norm": 0.4558199644088745, + "learning_rate": 3.007757794687033e-05, + "loss": 0.0424, + "step": 622 + }, + { + "epoch": 1.2004819277108434, + "grad_norm": 0.42380189895629883, + "learning_rate": 3.003881421264266e-05, + "loss": 0.0079, + "step": 623 + }, + { + "epoch": 1.202409638554217, + "grad_norm": 0.28791171312332153, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0142, + "step": 624 + }, + { + "epoch": 1.2043373493975904, + "grad_norm": 0.3906581997871399, + "learning_rate": 2.996113550411281e-05, + "loss": 0.0251, + "step": 625 + }, + { + "epoch": 1.206265060240964, + "grad_norm": 0.47848746180534363, + "learning_rate": 2.9922220920404375e-05, + "loss": 0.0137, + "step": 626 + }, + { + "epoch": 1.2081927710843374, + "grad_norm": 0.22666941583156586, + "learning_rate": 2.9883256444549862e-05, + "loss": 0.0105, + "step": 627 + }, + { + "epoch": 1.210120481927711, + "grad_norm": 0.18968136608600616, + "learning_rate": 2.984424227247529e-05, + "loss": 0.0089, + "step": 628 + }, + { + "epoch": 1.2120481927710842, + "grad_norm": 0.28732606768608093, + "learning_rate": 2.980517860035656e-05, + "loss": 0.0253, + "step": 629 + }, + { + "epoch": 1.213975903614458, + "grad_norm": 0.21131543815135956, + "learning_rate": 2.9766065624618518e-05, + "loss": 0.0134, + "step": 630 + }, + { + "epoch": 1.2159036144578312, + "grad_norm": 0.7594877481460571, + "learning_rate": 2.972690354193388e-05, + "loss": 0.0157, + "step": 631 + }, + { + "epoch": 1.2178313253012047, + "grad_norm": 0.730291485786438, + "learning_rate": 2.96876925492223e-05, + "loss": 0.0204, + "step": 632 + }, + { + "epoch": 1.2197590361445783, + "grad_norm": 0.20333674550056458, + "learning_rate": 2.9648432843649382e-05, + "loss": 0.0114, + "step": 633 + }, + { + "epoch": 1.2216867469879518, + "grad_norm": 0.5680793523788452, + "learning_rate": 2.960912462262566e-05, + "loss": 0.0146, + "step": 634 + }, + { + "epoch": 1.2236144578313253, + "grad_norm": 0.4591079354286194, + "learning_rate": 2.9569768083805618e-05, + "loss": 0.0112, + "step": 635 + }, + { + "epoch": 1.2255421686746988, + "grad_norm": 0.3793511390686035, + "learning_rate": 2.953036342508671e-05, + "loss": 0.0377, + "step": 636 + }, + { + "epoch": 1.2274698795180723, + "grad_norm": 1.118723750114441, + "learning_rate": 2.9490910844608346e-05, + "loss": 0.0432, + "step": 637 + }, + { + "epoch": 1.2293975903614458, + "grad_norm": 0.36990776658058167, + "learning_rate": 2.9451410540750887e-05, + "loss": 0.0203, + "step": 638 + }, + { + "epoch": 1.2313253012048193, + "grad_norm": 0.930397629737854, + "learning_rate": 2.94118627121347e-05, + "loss": 0.0311, + "step": 639 + }, + { + "epoch": 1.2332530120481928, + "grad_norm": 0.2347625195980072, + "learning_rate": 2.9372267557619075e-05, + "loss": 0.0168, + "step": 640 + }, + { + "epoch": 1.2351807228915663, + "grad_norm": 0.3720332384109497, + "learning_rate": 2.933262527630131e-05, + "loss": 0.0136, + "step": 641 + }, + { + "epoch": 1.2371084337349398, + "grad_norm": 0.4871984124183655, + "learning_rate": 2.929293606751565e-05, + "loss": 0.0339, + "step": 642 + }, + { + "epoch": 1.2390361445783133, + "grad_norm": 0.35853689908981323, + "learning_rate": 2.9253200130832322e-05, + "loss": 0.0095, + "step": 643 + }, + { + "epoch": 1.2409638554216866, + "grad_norm": 0.42003703117370605, + "learning_rate": 2.92134176660565e-05, + "loss": 0.0142, + "step": 644 + }, + { + "epoch": 1.2428915662650604, + "grad_norm": 0.3854500651359558, + "learning_rate": 2.9173588873227338e-05, + "loss": 0.0209, + "step": 645 + }, + { + "epoch": 1.2448192771084337, + "grad_norm": 0.24665917456150055, + "learning_rate": 2.913371395261691e-05, + "loss": 0.0087, + "step": 646 + }, + { + "epoch": 1.2467469879518072, + "grad_norm": 0.41571593284606934, + "learning_rate": 2.9093793104729268e-05, + "loss": 0.0164, + "step": 647 + }, + { + "epoch": 1.2486746987951807, + "grad_norm": 0.4597891569137573, + "learning_rate": 2.9053826530299377e-05, + "loss": 0.0138, + "step": 648 + }, + { + "epoch": 1.2506024096385542, + "grad_norm": 0.43345385789871216, + "learning_rate": 2.901381443029215e-05, + "loss": 0.0353, + "step": 649 + }, + { + "epoch": 1.2525301204819277, + "grad_norm": 0.3706768751144409, + "learning_rate": 2.897375700590141e-05, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 1.2544578313253012, + "grad_norm": 0.30305296182632446, + "learning_rate": 2.8933654458548873e-05, + "loss": 0.0123, + "step": 651 + }, + { + "epoch": 1.2563855421686747, + "grad_norm": 0.2042127549648285, + "learning_rate": 2.8893506989883167e-05, + "loss": 0.0099, + "step": 652 + }, + { + "epoch": 1.2583132530120482, + "grad_norm": 0.20524422824382782, + "learning_rate": 2.8853314801778784e-05, + "loss": 0.0097, + "step": 653 + }, + { + "epoch": 1.2602409638554217, + "grad_norm": 0.2351921945810318, + "learning_rate": 2.8813078096335093e-05, + "loss": 0.0091, + "step": 654 + }, + { + "epoch": 1.2621686746987952, + "grad_norm": 0.34547340869903564, + "learning_rate": 2.87727970758753e-05, + "loss": 0.0088, + "step": 655 + }, + { + "epoch": 1.2640963855421687, + "grad_norm": 0.35163217782974243, + "learning_rate": 2.8732471942945443e-05, + "loss": 0.0145, + "step": 656 + }, + { + "epoch": 1.266024096385542, + "grad_norm": 1.715137243270874, + "learning_rate": 2.8692102900313378e-05, + "loss": 0.0198, + "step": 657 + }, + { + "epoch": 1.2679518072289158, + "grad_norm": 0.2860178053379059, + "learning_rate": 2.8651690150967748e-05, + "loss": 0.0085, + "step": 658 + }, + { + "epoch": 1.269879518072289, + "grad_norm": 0.21175967156887054, + "learning_rate": 2.8611233898116967e-05, + "loss": 0.0071, + "step": 659 + }, + { + "epoch": 1.2718072289156628, + "grad_norm": 0.33726972341537476, + "learning_rate": 2.85707343451882e-05, + "loss": 0.012, + "step": 660 + }, + { + "epoch": 1.273734939759036, + "grad_norm": 0.2138456553220749, + "learning_rate": 2.853019169582635e-05, + "loss": 0.0092, + "step": 661 + }, + { + "epoch": 1.2756626506024096, + "grad_norm": 0.2304934412240982, + "learning_rate": 2.8489606153892997e-05, + "loss": 0.0144, + "step": 662 + }, + { + "epoch": 1.277590361445783, + "grad_norm": 0.2691061794757843, + "learning_rate": 2.8448977923465425e-05, + "loss": 0.0121, + "step": 663 + }, + { + "epoch": 1.2795180722891566, + "grad_norm": 0.35254305601119995, + "learning_rate": 2.840830720883555e-05, + "loss": 0.0125, + "step": 664 + }, + { + "epoch": 1.28144578313253, + "grad_norm": 0.36552608013153076, + "learning_rate": 2.836759421450893e-05, + "loss": 0.021, + "step": 665 + }, + { + "epoch": 1.2833734939759036, + "grad_norm": 0.37177154421806335, + "learning_rate": 2.83268391452037e-05, + "loss": 0.0216, + "step": 666 + }, + { + "epoch": 1.2853012048192771, + "grad_norm": 0.20932547748088837, + "learning_rate": 2.828604220584958e-05, + "loss": 0.0077, + "step": 667 + }, + { + "epoch": 1.2872289156626506, + "grad_norm": 0.5158557295799255, + "learning_rate": 2.824520360158681e-05, + "loss": 0.0394, + "step": 668 + }, + { + "epoch": 1.2891566265060241, + "grad_norm": 0.22623969614505768, + "learning_rate": 2.820432353776515e-05, + "loss": 0.0087, + "step": 669 + }, + { + "epoch": 1.2910843373493976, + "grad_norm": 0.2996046245098114, + "learning_rate": 2.8163402219942822e-05, + "loss": 0.01, + "step": 670 + }, + { + "epoch": 1.2930120481927712, + "grad_norm": 0.24957989156246185, + "learning_rate": 2.8122439853885488e-05, + "loss": 0.0127, + "step": 671 + }, + { + "epoch": 1.2949397590361444, + "grad_norm": 0.2636559307575226, + "learning_rate": 2.8081436645565216e-05, + "loss": 0.0128, + "step": 672 + }, + { + "epoch": 1.2968674698795182, + "grad_norm": 0.3531591296195984, + "learning_rate": 2.804039280115944e-05, + "loss": 0.0199, + "step": 673 + }, + { + "epoch": 1.2987951807228915, + "grad_norm": 0.3682299852371216, + "learning_rate": 2.7999308527049927e-05, + "loss": 0.0088, + "step": 674 + }, + { + "epoch": 1.3007228915662652, + "grad_norm": 0.19555217027664185, + "learning_rate": 2.795818402982174e-05, + "loss": 0.0084, + "step": 675 + }, + { + "epoch": 1.3026506024096385, + "grad_norm": 0.2864912450313568, + "learning_rate": 2.7917019516262186e-05, + "loss": 0.0154, + "step": 676 + }, + { + "epoch": 1.304578313253012, + "grad_norm": 0.2211237996816635, + "learning_rate": 2.78758151933598e-05, + "loss": 0.0078, + "step": 677 + }, + { + "epoch": 1.3065060240963855, + "grad_norm": 0.13646945357322693, + "learning_rate": 2.7834571268303294e-05, + "loss": 0.0058, + "step": 678 + }, + { + "epoch": 1.308433734939759, + "grad_norm": 0.16530285775661469, + "learning_rate": 2.779328794848049e-05, + "loss": 0.007, + "step": 679 + }, + { + "epoch": 1.3103614457831325, + "grad_norm": 0.2145693302154541, + "learning_rate": 2.7751965441477325e-05, + "loss": 0.0203, + "step": 680 + }, + { + "epoch": 1.312289156626506, + "grad_norm": 0.24273739755153656, + "learning_rate": 2.771060395507677e-05, + "loss": 0.0106, + "step": 681 + }, + { + "epoch": 1.3142168674698795, + "grad_norm": 0.20430618524551392, + "learning_rate": 2.7669203697257794e-05, + "loss": 0.0122, + "step": 682 + }, + { + "epoch": 1.316144578313253, + "grad_norm": 0.2502615749835968, + "learning_rate": 2.7627764876194335e-05, + "loss": 0.0101, + "step": 683 + }, + { + "epoch": 1.3180722891566266, + "grad_norm": 0.287239670753479, + "learning_rate": 2.7586287700254214e-05, + "loss": 0.0203, + "step": 684 + }, + { + "epoch": 1.32, + "grad_norm": 0.16239754855632782, + "learning_rate": 2.7544772377998147e-05, + "loss": 0.0084, + "step": 685 + }, + { + "epoch": 1.3219277108433736, + "grad_norm": 0.27174142003059387, + "learning_rate": 2.7503219118178636e-05, + "loss": 0.008, + "step": 686 + }, + { + "epoch": 1.3238554216867469, + "grad_norm": 0.12878240644931793, + "learning_rate": 2.7461628129738954e-05, + "loss": 0.0053, + "step": 687 + }, + { + "epoch": 1.3257831325301206, + "grad_norm": 0.16112515330314636, + "learning_rate": 2.7419999621812086e-05, + "loss": 0.0059, + "step": 688 + }, + { + "epoch": 1.3277108433734939, + "grad_norm": 0.2398834228515625, + "learning_rate": 2.7378333803719672e-05, + "loss": 0.0095, + "step": 689 + }, + { + "epoch": 1.3296385542168676, + "grad_norm": 0.18516193330287933, + "learning_rate": 2.733663088497097e-05, + "loss": 0.0071, + "step": 690 + }, + { + "epoch": 1.331566265060241, + "grad_norm": 0.2974924147129059, + "learning_rate": 2.7294891075261785e-05, + "loss": 0.0227, + "step": 691 + }, + { + "epoch": 1.3334939759036144, + "grad_norm": 0.12931054830551147, + "learning_rate": 2.7253114584473418e-05, + "loss": 0.0039, + "step": 692 + }, + { + "epoch": 1.335421686746988, + "grad_norm": 0.16319474577903748, + "learning_rate": 2.7211301622671623e-05, + "loss": 0.008, + "step": 693 + }, + { + "epoch": 1.3373493975903614, + "grad_norm": 0.27622169256210327, + "learning_rate": 2.7169452400105533e-05, + "loss": 0.0238, + "step": 694 + }, + { + "epoch": 1.339277108433735, + "grad_norm": 0.45309779047966003, + "learning_rate": 2.712756712720663e-05, + "loss": 0.0439, + "step": 695 + }, + { + "epoch": 1.3412048192771084, + "grad_norm": 0.2469855099916458, + "learning_rate": 2.708564601458765e-05, + "loss": 0.0085, + "step": 696 + }, + { + "epoch": 1.343132530120482, + "grad_norm": 0.4245856702327728, + "learning_rate": 2.7043689273041535e-05, + "loss": 0.0097, + "step": 697 + }, + { + "epoch": 1.3450602409638555, + "grad_norm": 0.26796087622642517, + "learning_rate": 2.7001697113540414e-05, + "loss": 0.0119, + "step": 698 + }, + { + "epoch": 1.346987951807229, + "grad_norm": 0.3569283187389374, + "learning_rate": 2.6959669747234482e-05, + "loss": 0.0096, + "step": 699 + }, + { + "epoch": 1.3489156626506025, + "grad_norm": 0.7038524150848389, + "learning_rate": 2.6917607385450973e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 1.350843373493976, + "grad_norm": 0.23568563163280487, + "learning_rate": 2.687551023969308e-05, + "loss": 0.0112, + "step": 701 + }, + { + "epoch": 1.3527710843373493, + "grad_norm": 0.20338499546051025, + "learning_rate": 2.6833378521638935e-05, + "loss": 0.0092, + "step": 702 + }, + { + "epoch": 1.354698795180723, + "grad_norm": 4.22187614440918, + "learning_rate": 2.679121244314046e-05, + "loss": 0.0314, + "step": 703 + }, + { + "epoch": 1.3566265060240963, + "grad_norm": 0.2542206048965454, + "learning_rate": 2.674901221622239e-05, + "loss": 0.0158, + "step": 704 + }, + { + "epoch": 1.3585542168674698, + "grad_norm": 0.49705010652542114, + "learning_rate": 2.670677805308116e-05, + "loss": 0.0162, + "step": 705 + }, + { + "epoch": 1.3604819277108433, + "grad_norm": 0.17502115666866302, + "learning_rate": 2.666451016608383e-05, + "loss": 0.0074, + "step": 706 + }, + { + "epoch": 1.3624096385542168, + "grad_norm": 0.21738742291927338, + "learning_rate": 2.6622208767767075e-05, + "loss": 0.0135, + "step": 707 + }, + { + "epoch": 1.3643373493975903, + "grad_norm": 0.3309847414493561, + "learning_rate": 2.6579874070836032e-05, + "loss": 0.0107, + "step": 708 + }, + { + "epoch": 1.3662650602409638, + "grad_norm": 0.10706827789545059, + "learning_rate": 2.6537506288163303e-05, + "loss": 0.0043, + "step": 709 + }, + { + "epoch": 1.3681927710843373, + "grad_norm": 0.173640176653862, + "learning_rate": 2.6495105632787835e-05, + "loss": 0.0092, + "step": 710 + }, + { + "epoch": 1.3701204819277109, + "grad_norm": 0.2636397182941437, + "learning_rate": 2.6452672317913893e-05, + "loss": 0.0097, + "step": 711 + }, + { + "epoch": 1.3720481927710844, + "grad_norm": 0.28485360741615295, + "learning_rate": 2.6410206556909943e-05, + "loss": 0.0193, + "step": 712 + }, + { + "epoch": 1.3739759036144579, + "grad_norm": 0.23210027813911438, + "learning_rate": 2.636770856330761e-05, + "loss": 0.0229, + "step": 713 + }, + { + "epoch": 1.3759036144578314, + "grad_norm": 0.13388316333293915, + "learning_rate": 2.6325178550800596e-05, + "loss": 0.004, + "step": 714 + }, + { + "epoch": 1.377831325301205, + "grad_norm": 0.5131422877311707, + "learning_rate": 2.6282616733243603e-05, + "loss": 0.0137, + "step": 715 + }, + { + "epoch": 1.3797590361445784, + "grad_norm": 0.3243267834186554, + "learning_rate": 2.6240023324651258e-05, + "loss": 0.0153, + "step": 716 + }, + { + "epoch": 1.3816867469879517, + "grad_norm": 0.1440611034631729, + "learning_rate": 2.619739853919704e-05, + "loss": 0.0031, + "step": 717 + }, + { + "epoch": 1.3836144578313254, + "grad_norm": 0.30346596240997314, + "learning_rate": 2.6154742591212196e-05, + "loss": 0.0109, + "step": 718 + }, + { + "epoch": 1.3855421686746987, + "grad_norm": 0.19109240174293518, + "learning_rate": 2.611205569518468e-05, + "loss": 0.0094, + "step": 719 + }, + { + "epoch": 1.3874698795180722, + "grad_norm": 0.28636518120765686, + "learning_rate": 2.6069338065758056e-05, + "loss": 0.0123, + "step": 720 + }, + { + "epoch": 1.3893975903614457, + "grad_norm": 0.28083911538124084, + "learning_rate": 2.6026589917730416e-05, + "loss": 0.0104, + "step": 721 + }, + { + "epoch": 1.3913253012048192, + "grad_norm": 0.36553966999053955, + "learning_rate": 2.5983811466053327e-05, + "loss": 0.0143, + "step": 722 + }, + { + "epoch": 1.3932530120481927, + "grad_norm": 0.23317205905914307, + "learning_rate": 2.5941002925830708e-05, + "loss": 0.011, + "step": 723 + }, + { + "epoch": 1.3951807228915662, + "grad_norm": 0.3825171887874603, + "learning_rate": 2.589816451231781e-05, + "loss": 0.0098, + "step": 724 + }, + { + "epoch": 1.3971084337349398, + "grad_norm": 0.19916608929634094, + "learning_rate": 2.585529644092006e-05, + "loss": 0.0094, + "step": 725 + }, + { + "epoch": 1.3990361445783133, + "grad_norm": 0.19990523159503937, + "learning_rate": 2.5812398927192027e-05, + "loss": 0.0128, + "step": 726 + }, + { + "epoch": 1.4009638554216868, + "grad_norm": 0.34662899374961853, + "learning_rate": 2.5769472186836347e-05, + "loss": 0.0091, + "step": 727 + }, + { + "epoch": 1.4028915662650603, + "grad_norm": 0.23481112718582153, + "learning_rate": 2.5726516435702583e-05, + "loss": 0.0154, + "step": 728 + }, + { + "epoch": 1.4048192771084338, + "grad_norm": 0.1846667379140854, + "learning_rate": 2.5683531889786194e-05, + "loss": 0.0088, + "step": 729 + }, + { + "epoch": 1.4067469879518073, + "grad_norm": 0.16717663407325745, + "learning_rate": 2.564051876522742e-05, + "loss": 0.0083, + "step": 730 + }, + { + "epoch": 1.4086746987951808, + "grad_norm": 0.4116475284099579, + "learning_rate": 2.5597477278310202e-05, + "loss": 0.0179, + "step": 731 + }, + { + "epoch": 1.410602409638554, + "grad_norm": 0.171807661652565, + "learning_rate": 2.5554407645461115e-05, + "loss": 0.0063, + "step": 732 + }, + { + "epoch": 1.4125301204819278, + "grad_norm": 0.1954439878463745, + "learning_rate": 2.5511310083248243e-05, + "loss": 0.017, + "step": 733 + }, + { + "epoch": 1.4144578313253011, + "grad_norm": 0.37158989906311035, + "learning_rate": 2.5468184808380104e-05, + "loss": 0.0173, + "step": 734 + }, + { + "epoch": 1.4163855421686746, + "grad_norm": 0.2001633644104004, + "learning_rate": 2.542503203770458e-05, + "loss": 0.0165, + "step": 735 + }, + { + "epoch": 1.4183132530120481, + "grad_norm": 0.45673373341560364, + "learning_rate": 2.53818519882078e-05, + "loss": 0.0185, + "step": 736 + }, + { + "epoch": 1.4202409638554216, + "grad_norm": 0.3838701546192169, + "learning_rate": 2.5338644877013067e-05, + "loss": 0.0134, + "step": 737 + }, + { + "epoch": 1.4221686746987952, + "grad_norm": 0.32032477855682373, + "learning_rate": 2.5295410921379745e-05, + "loss": 0.0143, + "step": 738 + }, + { + "epoch": 1.4240963855421687, + "grad_norm": 0.4594039022922516, + "learning_rate": 2.52521503387022e-05, + "loss": 0.0193, + "step": 739 + }, + { + "epoch": 1.4260240963855422, + "grad_norm": 0.3889620900154114, + "learning_rate": 2.5208863346508667e-05, + "loss": 0.0114, + "step": 740 + }, + { + "epoch": 1.4279518072289157, + "grad_norm": 0.33153319358825684, + "learning_rate": 2.5165550162460203e-05, + "loss": 0.0102, + "step": 741 + }, + { + "epoch": 1.4298795180722892, + "grad_norm": 0.7269518375396729, + "learning_rate": 2.5122211004349536e-05, + "loss": 0.0215, + "step": 742 + }, + { + "epoch": 1.4318072289156627, + "grad_norm": 0.31653261184692383, + "learning_rate": 2.5078846090100023e-05, + "loss": 0.0115, + "step": 743 + }, + { + "epoch": 1.4337349397590362, + "grad_norm": 0.20620353519916534, + "learning_rate": 2.5035455637764518e-05, + "loss": 0.0153, + "step": 744 + }, + { + "epoch": 1.4356626506024097, + "grad_norm": 0.17266008257865906, + "learning_rate": 2.4992039865524297e-05, + "loss": 0.0069, + "step": 745 + }, + { + "epoch": 1.4375903614457832, + "grad_norm": 0.24760811030864716, + "learning_rate": 2.494859899168795e-05, + "loss": 0.0108, + "step": 746 + }, + { + "epoch": 1.4395180722891565, + "grad_norm": 0.2584865391254425, + "learning_rate": 2.4905133234690282e-05, + "loss": 0.0095, + "step": 747 + }, + { + "epoch": 1.4414457831325302, + "grad_norm": 0.48847514390945435, + "learning_rate": 2.486164281309122e-05, + "loss": 0.0181, + "step": 748 + }, + { + "epoch": 1.4433734939759035, + "grad_norm": 0.42942047119140625, + "learning_rate": 2.4818127945574717e-05, + "loss": 0.025, + "step": 749 + }, + { + "epoch": 1.445301204819277, + "grad_norm": 0.23713800311088562, + "learning_rate": 2.4774588850947648e-05, + "loss": 0.0085, + "step": 750 + }, + { + "epoch": 1.4472289156626506, + "grad_norm": 0.8797569870948792, + "learning_rate": 2.473102574813871e-05, + "loss": 0.0097, + "step": 751 + }, + { + "epoch": 1.449156626506024, + "grad_norm": 0.2744862735271454, + "learning_rate": 2.4687438856197302e-05, + "loss": 0.0122, + "step": 752 + }, + { + "epoch": 1.4510843373493976, + "grad_norm": 0.12747010588645935, + "learning_rate": 2.4643828394292478e-05, + "loss": 0.0056, + "step": 753 + }, + { + "epoch": 1.453012048192771, + "grad_norm": 0.37376829981803894, + "learning_rate": 2.4600194581711775e-05, + "loss": 0.0052, + "step": 754 + }, + { + "epoch": 1.4549397590361446, + "grad_norm": 0.2536911368370056, + "learning_rate": 2.4556537637860176e-05, + "loss": 0.0113, + "step": 755 + }, + { + "epoch": 1.456867469879518, + "grad_norm": 0.25950780510902405, + "learning_rate": 2.451285778225894e-05, + "loss": 0.0099, + "step": 756 + }, + { + "epoch": 1.4587951807228916, + "grad_norm": 0.19535955786705017, + "learning_rate": 2.4469155234544565e-05, + "loss": 0.0069, + "step": 757 + }, + { + "epoch": 1.4607228915662651, + "grad_norm": 0.22816115617752075, + "learning_rate": 2.442543021446764e-05, + "loss": 0.0088, + "step": 758 + }, + { + "epoch": 1.4626506024096386, + "grad_norm": 0.3363986313343048, + "learning_rate": 2.4381682941891755e-05, + "loss": 0.0182, + "step": 759 + }, + { + "epoch": 1.464578313253012, + "grad_norm": 0.21492891013622284, + "learning_rate": 2.4337913636792382e-05, + "loss": 0.0069, + "step": 760 + }, + { + "epoch": 1.4665060240963856, + "grad_norm": 0.6070862412452698, + "learning_rate": 2.429412251925579e-05, + "loss": 0.0406, + "step": 761 + }, + { + "epoch": 1.468433734939759, + "grad_norm": 2.6469690799713135, + "learning_rate": 2.425030980947793e-05, + "loss": 0.0205, + "step": 762 + }, + { + "epoch": 1.4703614457831327, + "grad_norm": 0.30909740924835205, + "learning_rate": 2.420647572776332e-05, + "loss": 0.0136, + "step": 763 + }, + { + "epoch": 1.472289156626506, + "grad_norm": 0.6639553904533386, + "learning_rate": 2.416262049452395e-05, + "loss": 0.011, + "step": 764 + }, + { + "epoch": 1.4742168674698795, + "grad_norm": 0.2919616997241974, + "learning_rate": 2.4118744330278147e-05, + "loss": 0.0131, + "step": 765 + }, + { + "epoch": 1.476144578313253, + "grad_norm": 0.5232429504394531, + "learning_rate": 2.4074847455649523e-05, + "loss": 0.0138, + "step": 766 + }, + { + "epoch": 1.4780722891566265, + "grad_norm": 5.630630970001221, + "learning_rate": 2.403093009136579e-05, + "loss": 0.0264, + "step": 767 + }, + { + "epoch": 1.48, + "grad_norm": 0.33234721422195435, + "learning_rate": 2.3986992458257707e-05, + "loss": 0.0111, + "step": 768 + }, + { + "epoch": 1.4819277108433735, + "grad_norm": 0.28444772958755493, + "learning_rate": 2.3943034777257945e-05, + "loss": 0.0144, + "step": 769 + }, + { + "epoch": 1.483855421686747, + "grad_norm": 0.16229979693889618, + "learning_rate": 2.38990572694e-05, + "loss": 0.0062, + "step": 770 + }, + { + "epoch": 1.4857831325301205, + "grad_norm": 0.27474716305732727, + "learning_rate": 2.385506015581704e-05, + "loss": 0.0172, + "step": 771 + }, + { + "epoch": 1.487710843373494, + "grad_norm": 0.246526300907135, + "learning_rate": 2.381104365774083e-05, + "loss": 0.012, + "step": 772 + }, + { + "epoch": 1.4896385542168675, + "grad_norm": 0.282047837972641, + "learning_rate": 2.37670079965006e-05, + "loss": 0.0116, + "step": 773 + }, + { + "epoch": 1.491566265060241, + "grad_norm": 0.2878139317035675, + "learning_rate": 2.3722953393521944e-05, + "loss": 0.0147, + "step": 774 + }, + { + "epoch": 1.4934939759036143, + "grad_norm": 0.5586277842521667, + "learning_rate": 2.367888007032571e-05, + "loss": 0.0111, + "step": 775 + }, + { + "epoch": 1.495421686746988, + "grad_norm": 0.562160313129425, + "learning_rate": 2.3634788248526846e-05, + "loss": 0.0061, + "step": 776 + }, + { + "epoch": 1.4973493975903613, + "grad_norm": 0.3452005982398987, + "learning_rate": 2.3590678149833356e-05, + "loss": 0.0205, + "step": 777 + }, + { + "epoch": 1.499277108433735, + "grad_norm": 0.7757686376571655, + "learning_rate": 2.3546549996045114e-05, + "loss": 0.0273, + "step": 778 + }, + { + "epoch": 1.5012048192771084, + "grad_norm": 0.19530551135540009, + "learning_rate": 2.3502404009052812e-05, + "loss": 0.0083, + "step": 779 + }, + { + "epoch": 1.503132530120482, + "grad_norm": 0.2586531639099121, + "learning_rate": 2.3458240410836775e-05, + "loss": 0.0122, + "step": 780 + }, + { + "epoch": 1.5050602409638554, + "grad_norm": 0.30063286423683167, + "learning_rate": 2.3414059423465924e-05, + "loss": 0.0083, + "step": 781 + }, + { + "epoch": 1.5069879518072289, + "grad_norm": 0.18663185834884644, + "learning_rate": 2.3369861269096575e-05, + "loss": 0.0104, + "step": 782 + }, + { + "epoch": 1.5089156626506024, + "grad_norm": 0.4405941069126129, + "learning_rate": 2.3325646169971416e-05, + "loss": 0.0264, + "step": 783 + }, + { + "epoch": 1.510843373493976, + "grad_norm": 0.2947913110256195, + "learning_rate": 2.3281414348418294e-05, + "loss": 0.0107, + "step": 784 + }, + { + "epoch": 1.5127710843373494, + "grad_norm": 0.23813778162002563, + "learning_rate": 2.3237166026849158e-05, + "loss": 0.0084, + "step": 785 + }, + { + "epoch": 1.514698795180723, + "grad_norm": 0.33380329608917236, + "learning_rate": 2.3192901427758932e-05, + "loss": 0.0111, + "step": 786 + }, + { + "epoch": 1.5166265060240964, + "grad_norm": 0.3736988306045532, + "learning_rate": 2.314862077372438e-05, + "loss": 0.0135, + "step": 787 + }, + { + "epoch": 1.5185542168674697, + "grad_norm": 0.3785395920276642, + "learning_rate": 2.3104324287402996e-05, + "loss": 0.0265, + "step": 788 + }, + { + "epoch": 1.5204819277108435, + "grad_norm": 0.3359154462814331, + "learning_rate": 2.3060012191531885e-05, + "loss": 0.0127, + "step": 789 + }, + { + "epoch": 1.5224096385542167, + "grad_norm": 0.720753014087677, + "learning_rate": 2.301568470892664e-05, + "loss": 0.0134, + "step": 790 + }, + { + "epoch": 1.5243373493975905, + "grad_norm": 0.36473193764686584, + "learning_rate": 2.297134206248024e-05, + "loss": 0.0318, + "step": 791 + }, + { + "epoch": 1.5262650602409638, + "grad_norm": 0.29987087845802307, + "learning_rate": 2.2926984475161884e-05, + "loss": 0.008, + "step": 792 + }, + { + "epoch": 1.5281927710843375, + "grad_norm": 0.2883112132549286, + "learning_rate": 2.2882612170015914e-05, + "loss": 0.0125, + "step": 793 + }, + { + "epoch": 1.5301204819277108, + "grad_norm": 0.28983229398727417, + "learning_rate": 2.2838225370160682e-05, + "loss": 0.0155, + "step": 794 + }, + { + "epoch": 1.5320481927710843, + "grad_norm": 0.47236886620521545, + "learning_rate": 2.2793824298787414e-05, + "loss": 0.0132, + "step": 795 + }, + { + "epoch": 1.5339759036144578, + "grad_norm": 0.8328865170478821, + "learning_rate": 2.2749409179159104e-05, + "loss": 0.026, + "step": 796 + }, + { + "epoch": 1.5359036144578313, + "grad_norm": 0.3129172623157501, + "learning_rate": 2.2704980234609396e-05, + "loss": 0.0099, + "step": 797 + }, + { + "epoch": 1.5378313253012048, + "grad_norm": 0.22284500300884247, + "learning_rate": 2.2660537688541416e-05, + "loss": 0.009, + "step": 798 + }, + { + "epoch": 1.5397590361445783, + "grad_norm": 0.3346405625343323, + "learning_rate": 2.2616081764426726e-05, + "loss": 0.0077, + "step": 799 + }, + { + "epoch": 1.5416867469879518, + "grad_norm": 0.2923565208911896, + "learning_rate": 2.2571612685804124e-05, + "loss": 0.0119, + "step": 800 + }, + { + "epoch": 1.5436144578313253, + "grad_norm": 0.1921311914920807, + "learning_rate": 2.252713067627857e-05, + "loss": 0.0083, + "step": 801 + }, + { + "epoch": 1.5455421686746988, + "grad_norm": 0.23221106827259064, + "learning_rate": 2.2482635959520044e-05, + "loss": 0.0049, + "step": 802 + }, + { + "epoch": 1.5474698795180721, + "grad_norm": 0.6340724229812622, + "learning_rate": 2.243812875926241e-05, + "loss": 0.0273, + "step": 803 + }, + { + "epoch": 1.5493975903614459, + "grad_norm": 0.2699439823627472, + "learning_rate": 2.2393609299302314e-05, + "loss": 0.0108, + "step": 804 + }, + { + "epoch": 1.5513253012048192, + "grad_norm": 0.2005189210176468, + "learning_rate": 2.2349077803498052e-05, + "loss": 0.0076, + "step": 805 + }, + { + "epoch": 1.5532530120481929, + "grad_norm": 0.39668548107147217, + "learning_rate": 2.230453449576842e-05, + "loss": 0.0135, + "step": 806 + }, + { + "epoch": 1.5551807228915662, + "grad_norm": 0.2406950294971466, + "learning_rate": 2.2259979600091635e-05, + "loss": 0.0094, + "step": 807 + }, + { + "epoch": 1.55710843373494, + "grad_norm": 0.30363157391548157, + "learning_rate": 2.2215413340504158e-05, + "loss": 0.0178, + "step": 808 + }, + { + "epoch": 1.5590361445783132, + "grad_norm": 0.19508181512355804, + "learning_rate": 2.2170835941099605e-05, + "loss": 0.0069, + "step": 809 + }, + { + "epoch": 1.5609638554216867, + "grad_norm": 0.734106719493866, + "learning_rate": 2.2126247626027615e-05, + "loss": 0.0319, + "step": 810 + }, + { + "epoch": 1.5628915662650602, + "grad_norm": 0.2591583728790283, + "learning_rate": 2.208164861949268e-05, + "loss": 0.0168, + "step": 811 + }, + { + "epoch": 1.5648192771084337, + "grad_norm": 0.2386734038591385, + "learning_rate": 2.20370391457531e-05, + "loss": 0.0041, + "step": 812 + }, + { + "epoch": 1.5667469879518072, + "grad_norm": 0.1675218939781189, + "learning_rate": 2.1992419429119764e-05, + "loss": 0.0078, + "step": 813 + }, + { + "epoch": 1.5686746987951807, + "grad_norm": 0.45591506361961365, + "learning_rate": 2.1947789693955097e-05, + "loss": 0.0166, + "step": 814 + }, + { + "epoch": 1.5706024096385542, + "grad_norm": 0.46940621733665466, + "learning_rate": 2.190315016467188e-05, + "loss": 0.0176, + "step": 815 + }, + { + "epoch": 1.5725301204819278, + "grad_norm": 0.2294205278158188, + "learning_rate": 2.1858501065732146e-05, + "loss": 0.0102, + "step": 816 + }, + { + "epoch": 1.5744578313253013, + "grad_norm": 0.28922322392463684, + "learning_rate": 2.181384262164606e-05, + "loss": 0.0111, + "step": 817 + }, + { + "epoch": 1.5763855421686745, + "grad_norm": 0.19650064408779144, + "learning_rate": 2.1769175056970765e-05, + "loss": 0.0076, + "step": 818 + }, + { + "epoch": 1.5783132530120483, + "grad_norm": 0.19538825750350952, + "learning_rate": 2.172449859630927e-05, + "loss": 0.0118, + "step": 819 + }, + { + "epoch": 1.5802409638554216, + "grad_norm": 0.1900389939546585, + "learning_rate": 2.167981346430931e-05, + "loss": 0.0066, + "step": 820 + }, + { + "epoch": 1.5821686746987953, + "grad_norm": 0.21593710780143738, + "learning_rate": 2.1635119885662235e-05, + "loss": 0.0101, + "step": 821 + }, + { + "epoch": 1.5840963855421686, + "grad_norm": 0.2699289321899414, + "learning_rate": 2.159041808510185e-05, + "loss": 0.0118, + "step": 822 + }, + { + "epoch": 1.5860240963855423, + "grad_norm": 0.31867673993110657, + "learning_rate": 2.1545708287403322e-05, + "loss": 0.0122, + "step": 823 + }, + { + "epoch": 1.5879518072289156, + "grad_norm": 0.2862400412559509, + "learning_rate": 2.1500990717382004e-05, + "loss": 0.0216, + "step": 824 + }, + { + "epoch": 1.589879518072289, + "grad_norm": 0.28482481837272644, + "learning_rate": 2.145626559989237e-05, + "loss": 0.0136, + "step": 825 + }, + { + "epoch": 1.5918072289156626, + "grad_norm": 0.2866958975791931, + "learning_rate": 2.1411533159826803e-05, + "loss": 0.0298, + "step": 826 + }, + { + "epoch": 1.5937349397590361, + "grad_norm": 0.39092838764190674, + "learning_rate": 2.1366793622114533e-05, + "loss": 0.0382, + "step": 827 + }, + { + "epoch": 1.5956626506024096, + "grad_norm": 0.16381537914276123, + "learning_rate": 2.1322047211720468e-05, + "loss": 0.0074, + "step": 828 + }, + { + "epoch": 1.5975903614457831, + "grad_norm": 0.22146940231323242, + "learning_rate": 2.1277294153644083e-05, + "loss": 0.0103, + "step": 829 + }, + { + "epoch": 1.5995180722891567, + "grad_norm": 0.2155209183692932, + "learning_rate": 2.123253467291827e-05, + "loss": 0.0095, + "step": 830 + }, + { + "epoch": 1.6014457831325302, + "grad_norm": 0.41510409116744995, + "learning_rate": 2.118776899460822e-05, + "loss": 0.0457, + "step": 831 + }, + { + "epoch": 1.6033734939759037, + "grad_norm": 0.19718150794506073, + "learning_rate": 2.1142997343810293e-05, + "loss": 0.0192, + "step": 832 + }, + { + "epoch": 1.605301204819277, + "grad_norm": 0.40924403071403503, + "learning_rate": 2.1098219945650865e-05, + "loss": 0.0278, + "step": 833 + }, + { + "epoch": 1.6072289156626507, + "grad_norm": 0.18657824397087097, + "learning_rate": 2.105343702528524e-05, + "loss": 0.0076, + "step": 834 + }, + { + "epoch": 1.609156626506024, + "grad_norm": 0.1727641075849533, + "learning_rate": 2.100864880789645e-05, + "loss": 0.0076, + "step": 835 + }, + { + "epoch": 1.6110843373493977, + "grad_norm": 0.18138745427131653, + "learning_rate": 2.0963855518694203e-05, + "loss": 0.005, + "step": 836 + }, + { + "epoch": 1.613012048192771, + "grad_norm": 0.19173955917358398, + "learning_rate": 2.0919057382913675e-05, + "loss": 0.0084, + "step": 837 + }, + { + "epoch": 1.6149397590361447, + "grad_norm": 0.3812403380870819, + "learning_rate": 2.0874254625814435e-05, + "loss": 0.009, + "step": 838 + }, + { + "epoch": 1.616867469879518, + "grad_norm": 0.2009759545326233, + "learning_rate": 2.0829447472679285e-05, + "loss": 0.0098, + "step": 839 + }, + { + "epoch": 1.6187951807228915, + "grad_norm": 0.48703446984291077, + "learning_rate": 2.0784636148813124e-05, + "loss": 0.0099, + "step": 840 + }, + { + "epoch": 1.620722891566265, + "grad_norm": 0.28995075821876526, + "learning_rate": 2.0739820879541827e-05, + "loss": 0.0075, + "step": 841 + }, + { + "epoch": 1.6226506024096385, + "grad_norm": 0.2130059450864792, + "learning_rate": 2.069500189021111e-05, + "loss": 0.007, + "step": 842 + }, + { + "epoch": 1.624578313253012, + "grad_norm": 0.252524733543396, + "learning_rate": 2.0650179406185397e-05, + "loss": 0.0249, + "step": 843 + }, + { + "epoch": 1.6265060240963856, + "grad_norm": 0.23069098591804504, + "learning_rate": 2.060535365284668e-05, + "loss": 0.0084, + "step": 844 + }, + { + "epoch": 1.628433734939759, + "grad_norm": 0.25051403045654297, + "learning_rate": 2.056052485559338e-05, + "loss": 0.0071, + "step": 845 + }, + { + "epoch": 1.6303614457831326, + "grad_norm": 0.27664798498153687, + "learning_rate": 2.051569323983924e-05, + "loss": 0.0198, + "step": 846 + }, + { + "epoch": 1.632289156626506, + "grad_norm": 0.2954922318458557, + "learning_rate": 2.047085903101218e-05, + "loss": 0.006, + "step": 847 + }, + { + "epoch": 1.6342168674698794, + "grad_norm": 0.28477591276168823, + "learning_rate": 2.0426022454553137e-05, + "loss": 0.0147, + "step": 848 + }, + { + "epoch": 1.636144578313253, + "grad_norm": 0.2785305678844452, + "learning_rate": 2.0381183735914968e-05, + "loss": 0.0117, + "step": 849 + }, + { + "epoch": 1.6380722891566264, + "grad_norm": 0.2500309348106384, + "learning_rate": 2.0336343100561295e-05, + "loss": 0.008, + "step": 850 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.18932047486305237, + "learning_rate": 2.0291500773965392e-05, + "loss": 0.0256, + "step": 851 + }, + { + "epoch": 1.6419277108433734, + "grad_norm": 0.6396257877349854, + "learning_rate": 2.0246656981609013e-05, + "loss": 0.0141, + "step": 852 + }, + { + "epoch": 1.6438554216867471, + "grad_norm": 0.5072891116142273, + "learning_rate": 2.02018119489813e-05, + "loss": 0.008, + "step": 853 + }, + { + "epoch": 1.6457831325301204, + "grad_norm": 0.2920839488506317, + "learning_rate": 2.0156965901577635e-05, + "loss": 0.0085, + "step": 854 + }, + { + "epoch": 1.647710843373494, + "grad_norm": 0.1391262263059616, + "learning_rate": 2.011211906489848e-05, + "loss": 0.0078, + "step": 855 + }, + { + "epoch": 1.6496385542168674, + "grad_norm": 0.29620468616485596, + "learning_rate": 2.00672716644483e-05, + "loss": 0.0109, + "step": 856 + }, + { + "epoch": 1.651566265060241, + "grad_norm": 0.13946573436260223, + "learning_rate": 2.002242392573436e-05, + "loss": 0.0076, + "step": 857 + }, + { + "epoch": 1.6534939759036145, + "grad_norm": 0.9766128659248352, + "learning_rate": 1.997757607426565e-05, + "loss": 0.0309, + "step": 858 + }, + { + "epoch": 1.655421686746988, + "grad_norm": 0.18002203106880188, + "learning_rate": 1.9932728335551702e-05, + "loss": 0.0072, + "step": 859 + }, + { + "epoch": 1.6573493975903615, + "grad_norm": 0.28073111176490784, + "learning_rate": 1.988788093510152e-05, + "loss": 0.0246, + "step": 860 + }, + { + "epoch": 1.659277108433735, + "grad_norm": 0.1919957399368286, + "learning_rate": 1.9843034098422375e-05, + "loss": 0.0087, + "step": 861 + }, + { + "epoch": 1.6612048192771085, + "grad_norm": 0.1825258433818817, + "learning_rate": 1.9798188051018705e-05, + "loss": 0.0092, + "step": 862 + }, + { + "epoch": 1.6631325301204818, + "grad_norm": 0.32412952184677124, + "learning_rate": 1.9753343018390997e-05, + "loss": 0.0118, + "step": 863 + }, + { + "epoch": 1.6650602409638555, + "grad_norm": 0.12828563153743744, + "learning_rate": 1.9708499226034618e-05, + "loss": 0.0056, + "step": 864 + }, + { + "epoch": 1.6669879518072288, + "grad_norm": 0.18647560477256775, + "learning_rate": 1.966365689943871e-05, + "loss": 0.0094, + "step": 865 + }, + { + "epoch": 1.6689156626506025, + "grad_norm": 0.19835828244686127, + "learning_rate": 1.9618816264085042e-05, + "loss": 0.0097, + "step": 866 + }, + { + "epoch": 1.6708433734939758, + "grad_norm": 0.22364282608032227, + "learning_rate": 1.957397754544687e-05, + "loss": 0.0062, + "step": 867 + }, + { + "epoch": 1.6727710843373496, + "grad_norm": 0.29420018196105957, + "learning_rate": 1.952914096898783e-05, + "loss": 0.0182, + "step": 868 + }, + { + "epoch": 1.6746987951807228, + "grad_norm": 0.2149929702281952, + "learning_rate": 1.9484306760160766e-05, + "loss": 0.0125, + "step": 869 + }, + { + "epoch": 1.6766265060240964, + "grad_norm": 0.16844330728054047, + "learning_rate": 1.9439475144406623e-05, + "loss": 0.0074, + "step": 870 + }, + { + "epoch": 1.6785542168674699, + "grad_norm": 0.5010282397270203, + "learning_rate": 1.9394646347153334e-05, + "loss": 0.0213, + "step": 871 + }, + { + "epoch": 1.6804819277108434, + "grad_norm": 0.29847195744514465, + "learning_rate": 1.9349820593814606e-05, + "loss": 0.0173, + "step": 872 + }, + { + "epoch": 1.6824096385542169, + "grad_norm": 0.23835812509059906, + "learning_rate": 1.930499810978889e-05, + "loss": 0.011, + "step": 873 + }, + { + "epoch": 1.6843373493975904, + "grad_norm": 0.3269020617008209, + "learning_rate": 1.9260179120458177e-05, + "loss": 0.0285, + "step": 874 + }, + { + "epoch": 1.686265060240964, + "grad_norm": 0.2142144739627838, + "learning_rate": 1.9215363851186883e-05, + "loss": 0.0146, + "step": 875 + }, + { + "epoch": 1.6881927710843372, + "grad_norm": 0.3098377585411072, + "learning_rate": 1.9170552527320725e-05, + "loss": 0.0104, + "step": 876 + }, + { + "epoch": 1.690120481927711, + "grad_norm": 0.22504115104675293, + "learning_rate": 1.9125745374185568e-05, + "loss": 0.0091, + "step": 877 + }, + { + "epoch": 1.6920481927710842, + "grad_norm": 0.20633333921432495, + "learning_rate": 1.908094261708633e-05, + "loss": 0.0097, + "step": 878 + }, + { + "epoch": 1.693975903614458, + "grad_norm": 1.179566502571106, + "learning_rate": 1.9036144481305807e-05, + "loss": 0.0143, + "step": 879 + }, + { + "epoch": 1.6959036144578312, + "grad_norm": 0.15525613725185394, + "learning_rate": 1.8991351192103554e-05, + "loss": 0.0062, + "step": 880 + }, + { + "epoch": 1.697831325301205, + "grad_norm": 0.15966367721557617, + "learning_rate": 1.8946562974714763e-05, + "loss": 0.0048, + "step": 881 + }, + { + "epoch": 1.6997590361445782, + "grad_norm": 0.18902607262134552, + "learning_rate": 1.890178005434914e-05, + "loss": 0.0124, + "step": 882 + }, + { + "epoch": 1.701686746987952, + "grad_norm": 0.21692413091659546, + "learning_rate": 1.885700265618971e-05, + "loss": 0.0135, + "step": 883 + }, + { + "epoch": 1.7036144578313253, + "grad_norm": 0.38948455452919006, + "learning_rate": 1.8812231005391786e-05, + "loss": 0.0365, + "step": 884 + }, + { + "epoch": 1.7055421686746988, + "grad_norm": 0.2483491599559784, + "learning_rate": 1.8767465327081736e-05, + "loss": 0.0202, + "step": 885 + }, + { + "epoch": 1.7074698795180723, + "grad_norm": 0.15305832028388977, + "learning_rate": 1.872270584635592e-05, + "loss": 0.0035, + "step": 886 + }, + { + "epoch": 1.7093975903614458, + "grad_norm": 0.17794466018676758, + "learning_rate": 1.867795278827954e-05, + "loss": 0.0157, + "step": 887 + }, + { + "epoch": 1.7113253012048193, + "grad_norm": 0.1938813328742981, + "learning_rate": 1.863320637788547e-05, + "loss": 0.0071, + "step": 888 + }, + { + "epoch": 1.7132530120481928, + "grad_norm": 0.27061617374420166, + "learning_rate": 1.8588466840173207e-05, + "loss": 0.0347, + "step": 889 + }, + { + "epoch": 1.7151807228915663, + "grad_norm": 0.1541014313697815, + "learning_rate": 1.8543734400107637e-05, + "loss": 0.006, + "step": 890 + }, + { + "epoch": 1.7171084337349396, + "grad_norm": 0.1436876654624939, + "learning_rate": 1.8499009282617996e-05, + "loss": 0.0059, + "step": 891 + }, + { + "epoch": 1.7190361445783133, + "grad_norm": 1.0573723316192627, + "learning_rate": 1.8454291712596688e-05, + "loss": 0.008, + "step": 892 + }, + { + "epoch": 1.7209638554216866, + "grad_norm": 0.15406259894371033, + "learning_rate": 1.8409581914898157e-05, + "loss": 0.0061, + "step": 893 + }, + { + "epoch": 1.7228915662650603, + "grad_norm": 0.24822913110256195, + "learning_rate": 1.836488011433777e-05, + "loss": 0.0085, + "step": 894 + }, + { + "epoch": 1.7248192771084336, + "grad_norm": 0.21049316227436066, + "learning_rate": 1.83201865356907e-05, + "loss": 0.0075, + "step": 895 + }, + { + "epoch": 1.7267469879518074, + "grad_norm": 0.24159866571426392, + "learning_rate": 1.8275501403690733e-05, + "loss": 0.0156, + "step": 896 + }, + { + "epoch": 1.7286746987951807, + "grad_norm": 0.3191063106060028, + "learning_rate": 1.823082494302924e-05, + "loss": 0.0218, + "step": 897 + }, + { + "epoch": 1.7306024096385542, + "grad_norm": 0.20296362042427063, + "learning_rate": 1.8186157378353945e-05, + "loss": 0.0126, + "step": 898 + }, + { + "epoch": 1.7325301204819277, + "grad_norm": 0.1905524581670761, + "learning_rate": 1.8141498934267858e-05, + "loss": 0.0131, + "step": 899 + }, + { + "epoch": 1.7344578313253012, + "grad_norm": 0.5350520610809326, + "learning_rate": 1.809684983532813e-05, + "loss": 0.0115, + "step": 900 + }, + { + "epoch": 1.7363855421686747, + "grad_norm": 0.17144092917442322, + "learning_rate": 1.8052210306044907e-05, + "loss": 0.0113, + "step": 901 + }, + { + "epoch": 1.7383132530120482, + "grad_norm": 0.11777982115745544, + "learning_rate": 1.8007580570880236e-05, + "loss": 0.0058, + "step": 902 + }, + { + "epoch": 1.7402409638554217, + "grad_norm": 0.2078275978565216, + "learning_rate": 1.7962960854246908e-05, + "loss": 0.0106, + "step": 903 + }, + { + "epoch": 1.7421686746987952, + "grad_norm": 0.2550877630710602, + "learning_rate": 1.791835138050732e-05, + "loss": 0.0076, + "step": 904 + }, + { + "epoch": 1.7440963855421687, + "grad_norm": 0.11553912609815598, + "learning_rate": 1.7873752373972395e-05, + "loss": 0.0038, + "step": 905 + }, + { + "epoch": 1.746024096385542, + "grad_norm": 0.10724586248397827, + "learning_rate": 1.7829164058900398e-05, + "loss": 0.0043, + "step": 906 + }, + { + "epoch": 1.7479518072289157, + "grad_norm": 0.30152231454849243, + "learning_rate": 1.7784586659495845e-05, + "loss": 0.0099, + "step": 907 + }, + { + "epoch": 1.749879518072289, + "grad_norm": 0.18372933566570282, + "learning_rate": 1.7740020399908372e-05, + "loss": 0.0074, + "step": 908 + }, + { + "epoch": 1.7518072289156628, + "grad_norm": 0.35184428095817566, + "learning_rate": 1.7695465504231586e-05, + "loss": 0.0184, + "step": 909 + }, + { + "epoch": 1.753734939759036, + "grad_norm": 0.15083615481853485, + "learning_rate": 1.765092219650196e-05, + "loss": 0.0061, + "step": 910 + }, + { + "epoch": 1.7556626506024098, + "grad_norm": 0.2599961459636688, + "learning_rate": 1.7606390700697693e-05, + "loss": 0.0101, + "step": 911 + }, + { + "epoch": 1.757590361445783, + "grad_norm": 0.10829206556081772, + "learning_rate": 1.7561871240737595e-05, + "loss": 0.0034, + "step": 912 + }, + { + "epoch": 1.7595180722891566, + "grad_norm": 0.38098782300949097, + "learning_rate": 1.7517364040479966e-05, + "loss": 0.0384, + "step": 913 + }, + { + "epoch": 1.76144578313253, + "grad_norm": 0.14975085854530334, + "learning_rate": 1.7472869323721432e-05, + "loss": 0.0055, + "step": 914 + }, + { + "epoch": 1.7633734939759036, + "grad_norm": 0.4151444733142853, + "learning_rate": 1.742838731419588e-05, + "loss": 0.0307, + "step": 915 + }, + { + "epoch": 1.765301204819277, + "grad_norm": 0.22238481044769287, + "learning_rate": 1.738391823557328e-05, + "loss": 0.0059, + "step": 916 + }, + { + "epoch": 1.7672289156626506, + "grad_norm": 0.23386356234550476, + "learning_rate": 1.7339462311458587e-05, + "loss": 0.0113, + "step": 917 + }, + { + "epoch": 1.7691566265060241, + "grad_norm": 0.21911191940307617, + "learning_rate": 1.7295019765390618e-05, + "loss": 0.0071, + "step": 918 + }, + { + "epoch": 1.7710843373493976, + "grad_norm": 0.343159943819046, + "learning_rate": 1.7250590820840903e-05, + "loss": 0.0144, + "step": 919 + }, + { + "epoch": 1.7730120481927711, + "grad_norm": 0.32204556465148926, + "learning_rate": 1.720617570121259e-05, + "loss": 0.0131, + "step": 920 + }, + { + "epoch": 1.7749397590361444, + "grad_norm": 0.4105585515499115, + "learning_rate": 1.7161774629839328e-05, + "loss": 0.0148, + "step": 921 + }, + { + "epoch": 1.7768674698795182, + "grad_norm": 0.16380974650382996, + "learning_rate": 1.7117387829984093e-05, + "loss": 0.0066, + "step": 922 + }, + { + "epoch": 1.7787951807228914, + "grad_norm": 0.22920913994312286, + "learning_rate": 1.707301552483813e-05, + "loss": 0.0105, + "step": 923 + }, + { + "epoch": 1.7807228915662652, + "grad_norm": 0.2075149267911911, + "learning_rate": 1.7028657937519767e-05, + "loss": 0.0104, + "step": 924 + }, + { + "epoch": 1.7826506024096385, + "grad_norm": 0.44439977407455444, + "learning_rate": 1.6984315291073355e-05, + "loss": 0.0134, + "step": 925 + }, + { + "epoch": 1.7845783132530122, + "grad_norm": 0.24068203568458557, + "learning_rate": 1.6939987808468125e-05, + "loss": 0.0078, + "step": 926 + }, + { + "epoch": 1.7865060240963855, + "grad_norm": 0.34044349193573, + "learning_rate": 1.689567571259701e-05, + "loss": 0.0108, + "step": 927 + }, + { + "epoch": 1.788433734939759, + "grad_norm": 0.34082743525505066, + "learning_rate": 1.6851379226275624e-05, + "loss": 0.0266, + "step": 928 + }, + { + "epoch": 1.7903614457831325, + "grad_norm": 0.19490115344524384, + "learning_rate": 1.6807098572241075e-05, + "loss": 0.0109, + "step": 929 + }, + { + "epoch": 1.792289156626506, + "grad_norm": 0.16208237409591675, + "learning_rate": 1.6762833973150846e-05, + "loss": 0.0113, + "step": 930 + }, + { + "epoch": 1.7942168674698795, + "grad_norm": 0.35555699467658997, + "learning_rate": 1.671858565158172e-05, + "loss": 0.0196, + "step": 931 + }, + { + "epoch": 1.796144578313253, + "grad_norm": 0.1600857824087143, + "learning_rate": 1.6674353830028587e-05, + "loss": 0.0089, + "step": 932 + }, + { + "epoch": 1.7980722891566265, + "grad_norm": 0.1699574887752533, + "learning_rate": 1.663013873090342e-05, + "loss": 0.0074, + "step": 933 + }, + { + "epoch": 1.8, + "grad_norm": 0.2472933828830719, + "learning_rate": 1.6585940576534086e-05, + "loss": 0.0063, + "step": 934 + }, + { + "epoch": 1.8019277108433736, + "grad_norm": 0.23491555452346802, + "learning_rate": 1.654175958916323e-05, + "loss": 0.0101, + "step": 935 + }, + { + "epoch": 1.8038554216867468, + "grad_norm": 0.28635191917419434, + "learning_rate": 1.6497595990947195e-05, + "loss": 0.0131, + "step": 936 + }, + { + "epoch": 1.8057831325301206, + "grad_norm": 0.15400712192058563, + "learning_rate": 1.645345000395489e-05, + "loss": 0.0068, + "step": 937 + }, + { + "epoch": 1.8077108433734939, + "grad_norm": 0.18223172426223755, + "learning_rate": 1.6409321850166647e-05, + "loss": 0.0094, + "step": 938 + }, + { + "epoch": 1.8096385542168676, + "grad_norm": 0.2789457142353058, + "learning_rate": 1.636521175147316e-05, + "loss": 0.0202, + "step": 939 + }, + { + "epoch": 1.8115662650602409, + "grad_norm": 0.4267627000808716, + "learning_rate": 1.6321119929674297e-05, + "loss": 0.0176, + "step": 940 + }, + { + "epoch": 1.8134939759036146, + "grad_norm": 0.3021615445613861, + "learning_rate": 1.6277046606478056e-05, + "loss": 0.0085, + "step": 941 + }, + { + "epoch": 1.815421686746988, + "grad_norm": 0.3724934756755829, + "learning_rate": 1.6232992003499405e-05, + "loss": 0.0474, + "step": 942 + }, + { + "epoch": 1.8173493975903614, + "grad_norm": 0.20904326438903809, + "learning_rate": 1.6188956342259177e-05, + "loss": 0.0078, + "step": 943 + }, + { + "epoch": 1.819277108433735, + "grad_norm": 0.31168171763420105, + "learning_rate": 1.614493984418297e-05, + "loss": 0.0174, + "step": 944 + }, + { + "epoch": 1.8212048192771084, + "grad_norm": 0.21273556351661682, + "learning_rate": 1.6100942730600003e-05, + "loss": 0.0054, + "step": 945 + }, + { + "epoch": 1.823132530120482, + "grad_norm": 0.16991695761680603, + "learning_rate": 1.6056965222742055e-05, + "loss": 0.0063, + "step": 946 + }, + { + "epoch": 1.8250602409638554, + "grad_norm": 0.22762684524059296, + "learning_rate": 1.6013007541742303e-05, + "loss": 0.0234, + "step": 947 + }, + { + "epoch": 1.826987951807229, + "grad_norm": 0.20128795504570007, + "learning_rate": 1.596906990863422e-05, + "loss": 0.0095, + "step": 948 + }, + { + "epoch": 1.8289156626506025, + "grad_norm": 0.30772027373313904, + "learning_rate": 1.592515254435048e-05, + "loss": 0.0356, + "step": 949 + }, + { + "epoch": 1.830843373493976, + "grad_norm": 0.12954631447792053, + "learning_rate": 1.5881255669721857e-05, + "loss": 0.008, + "step": 950 + }, + { + "epoch": 1.8327710843373493, + "grad_norm": 0.7787145972251892, + "learning_rate": 1.5837379505476054e-05, + "loss": 0.0108, + "step": 951 + }, + { + "epoch": 1.834698795180723, + "grad_norm": 0.1683879941701889, + "learning_rate": 1.5793524272236683e-05, + "loss": 0.006, + "step": 952 + }, + { + "epoch": 1.8366265060240963, + "grad_norm": 0.16475361585617065, + "learning_rate": 1.5749690190522076e-05, + "loss": 0.0065, + "step": 953 + }, + { + "epoch": 1.83855421686747, + "grad_norm": 0.211905375123024, + "learning_rate": 1.5705877480744214e-05, + "loss": 0.0092, + "step": 954 + }, + { + "epoch": 1.8404819277108433, + "grad_norm": 0.23850117623806, + "learning_rate": 1.5662086363207628e-05, + "loss": 0.012, + "step": 955 + }, + { + "epoch": 1.842409638554217, + "grad_norm": 0.19100065529346466, + "learning_rate": 1.561831705810825e-05, + "loss": 0.0113, + "step": 956 + }, + { + "epoch": 1.8443373493975903, + "grad_norm": 0.3635985255241394, + "learning_rate": 1.557456978553236e-05, + "loss": 0.0168, + "step": 957 + }, + { + "epoch": 1.8462650602409638, + "grad_norm": 0.16449116170406342, + "learning_rate": 1.553084476545544e-05, + "loss": 0.0042, + "step": 958 + }, + { + "epoch": 1.8481927710843373, + "grad_norm": 0.566093385219574, + "learning_rate": 1.5487142217741062e-05, + "loss": 0.0145, + "step": 959 + }, + { + "epoch": 1.8501204819277108, + "grad_norm": 0.15960252285003662, + "learning_rate": 1.5443462362139834e-05, + "loss": 0.0059, + "step": 960 + }, + { + "epoch": 1.8520481927710843, + "grad_norm": 0.40773797035217285, + "learning_rate": 1.539980541828823e-05, + "loss": 0.0257, + "step": 961 + }, + { + "epoch": 1.8539759036144579, + "grad_norm": 0.4802496135234833, + "learning_rate": 1.5356171605707522e-05, + "loss": 0.0111, + "step": 962 + }, + { + "epoch": 1.8559036144578314, + "grad_norm": 0.15745794773101807, + "learning_rate": 1.5312561143802704e-05, + "loss": 0.0049, + "step": 963 + }, + { + "epoch": 1.8578313253012049, + "grad_norm": 0.15139251947402954, + "learning_rate": 1.5268974251861298e-05, + "loss": 0.0077, + "step": 964 + }, + { + "epoch": 1.8597590361445784, + "grad_norm": 0.2188841849565506, + "learning_rate": 1.5225411149052356e-05, + "loss": 0.017, + "step": 965 + }, + { + "epoch": 1.8616867469879517, + "grad_norm": 0.10853131115436554, + "learning_rate": 1.5181872054425287e-05, + "loss": 0.0049, + "step": 966 + }, + { + "epoch": 1.8636144578313254, + "grad_norm": 0.8254880905151367, + "learning_rate": 1.5138357186908785e-05, + "loss": 0.0317, + "step": 967 + }, + { + "epoch": 1.8655421686746987, + "grad_norm": 0.2989620566368103, + "learning_rate": 1.5094866765309728e-05, + "loss": 0.0126, + "step": 968 + }, + { + "epoch": 1.8674698795180724, + "grad_norm": 0.16411150991916656, + "learning_rate": 1.5051401008312054e-05, + "loss": 0.0101, + "step": 969 + }, + { + "epoch": 1.8693975903614457, + "grad_norm": 0.2861763834953308, + "learning_rate": 1.5007960134475706e-05, + "loss": 0.0155, + "step": 970 + }, + { + "epoch": 1.8713253012048194, + "grad_norm": 0.24879588186740875, + "learning_rate": 1.4964544362235487e-05, + "loss": 0.0187, + "step": 971 + }, + { + "epoch": 1.8732530120481927, + "grad_norm": 0.2433672398328781, + "learning_rate": 1.4921153909899983e-05, + "loss": 0.0084, + "step": 972 + }, + { + "epoch": 1.8751807228915662, + "grad_norm": 0.15097154676914215, + "learning_rate": 1.487778899565047e-05, + "loss": 0.007, + "step": 973 + }, + { + "epoch": 1.8771084337349397, + "grad_norm": 0.1629047691822052, + "learning_rate": 1.4834449837539806e-05, + "loss": 0.0058, + "step": 974 + }, + { + "epoch": 1.8790361445783132, + "grad_norm": 0.9937071204185486, + "learning_rate": 1.4791136653491333e-05, + "loss": 0.0323, + "step": 975 + }, + { + "epoch": 1.8809638554216868, + "grad_norm": 0.19555562734603882, + "learning_rate": 1.4747849661297808e-05, + "loss": 0.0126, + "step": 976 + }, + { + "epoch": 1.8828915662650603, + "grad_norm": 0.16147711873054504, + "learning_rate": 1.470458907862026e-05, + "loss": 0.0067, + "step": 977 + }, + { + "epoch": 1.8848192771084338, + "grad_norm": 0.2730027735233307, + "learning_rate": 1.4661355122986945e-05, + "loss": 0.0147, + "step": 978 + }, + { + "epoch": 1.886746987951807, + "grad_norm": 0.13759832084178925, + "learning_rate": 1.4618148011792206e-05, + "loss": 0.0038, + "step": 979 + }, + { + "epoch": 1.8886746987951808, + "grad_norm": 0.33516690135002136, + "learning_rate": 1.4574967962295419e-05, + "loss": 0.0139, + "step": 980 + }, + { + "epoch": 1.890602409638554, + "grad_norm": 0.2345741093158722, + "learning_rate": 1.4531815191619903e-05, + "loss": 0.0094, + "step": 981 + }, + { + "epoch": 1.8925301204819278, + "grad_norm": 0.14681044220924377, + "learning_rate": 1.4488689916751762e-05, + "loss": 0.0065, + "step": 982 + }, + { + "epoch": 1.894457831325301, + "grad_norm": 0.21143914759159088, + "learning_rate": 1.4445592354538885e-05, + "loss": 0.0057, + "step": 983 + }, + { + "epoch": 1.8963855421686748, + "grad_norm": 0.3109160363674164, + "learning_rate": 1.44025227216898e-05, + "loss": 0.0142, + "step": 984 + }, + { + "epoch": 1.8983132530120481, + "grad_norm": 0.24301907420158386, + "learning_rate": 1.435948123477259e-05, + "loss": 0.012, + "step": 985 + }, + { + "epoch": 1.9002409638554218, + "grad_norm": 0.19817675650119781, + "learning_rate": 1.431646811021382e-05, + "loss": 0.0097, + "step": 986 + }, + { + "epoch": 1.9021686746987951, + "grad_norm": 0.13464932143688202, + "learning_rate": 1.4273483564297425e-05, + "loss": 0.0046, + "step": 987 + }, + { + "epoch": 1.9040963855421686, + "grad_norm": 0.1698642522096634, + "learning_rate": 1.4230527813163656e-05, + "loss": 0.0038, + "step": 988 + }, + { + "epoch": 1.9060240963855422, + "grad_norm": 0.19395388662815094, + "learning_rate": 1.4187601072807975e-05, + "loss": 0.0123, + "step": 989 + }, + { + "epoch": 1.9079518072289157, + "grad_norm": 0.2093188613653183, + "learning_rate": 1.4144703559079948e-05, + "loss": 0.0093, + "step": 990 + }, + { + "epoch": 1.9098795180722892, + "grad_norm": 0.1529311090707779, + "learning_rate": 1.4101835487682198e-05, + "loss": 0.0051, + "step": 991 + }, + { + "epoch": 1.9118072289156627, + "grad_norm": 0.18725350499153137, + "learning_rate": 1.4058997074169299e-05, + "loss": 0.0083, + "step": 992 + }, + { + "epoch": 1.9137349397590362, + "grad_norm": 0.15601560473442078, + "learning_rate": 1.401618853394668e-05, + "loss": 0.0086, + "step": 993 + }, + { + "epoch": 1.9156626506024095, + "grad_norm": 0.23890644311904907, + "learning_rate": 1.3973410082269591e-05, + "loss": 0.015, + "step": 994 + }, + { + "epoch": 1.9175903614457832, + "grad_norm": 0.2442619949579239, + "learning_rate": 1.3930661934241947e-05, + "loss": 0.0089, + "step": 995 + }, + { + "epoch": 1.9195180722891565, + "grad_norm": 0.1540212482213974, + "learning_rate": 1.388794430481532e-05, + "loss": 0.0072, + "step": 996 + }, + { + "epoch": 1.9214457831325302, + "grad_norm": 0.1359291970729828, + "learning_rate": 1.3845257408787807e-05, + "loss": 0.0131, + "step": 997 + }, + { + "epoch": 1.9233734939759035, + "grad_norm": 0.25486138463020325, + "learning_rate": 1.3802601460802967e-05, + "loss": 0.0198, + "step": 998 + }, + { + "epoch": 1.9253012048192772, + "grad_norm": 0.28815609216690063, + "learning_rate": 1.3759976675348754e-05, + "loss": 0.014, + "step": 999 + }, + { + "epoch": 1.9272289156626505, + "grad_norm": 0.15648497641086578, + "learning_rate": 1.3717383266756403e-05, + "loss": 0.0065, + "step": 1000 + }, + { + "epoch": 1.929156626506024, + "grad_norm": 0.16912540793418884, + "learning_rate": 1.367482144919941e-05, + "loss": 0.0059, + "step": 1001 + }, + { + "epoch": 1.9310843373493976, + "grad_norm": 0.16896723210811615, + "learning_rate": 1.3632291436692397e-05, + "loss": 0.0054, + "step": 1002 + }, + { + "epoch": 1.933012048192771, + "grad_norm": 0.20287497341632843, + "learning_rate": 1.3589793443090064e-05, + "loss": 0.0097, + "step": 1003 + }, + { + "epoch": 1.9349397590361446, + "grad_norm": 0.14804276823997498, + "learning_rate": 1.3547327682086114e-05, + "loss": 0.0125, + "step": 1004 + }, + { + "epoch": 1.936867469879518, + "grad_norm": 0.23820064961910248, + "learning_rate": 1.3504894367212171e-05, + "loss": 0.0131, + "step": 1005 + }, + { + "epoch": 1.9387951807228916, + "grad_norm": 0.25607362389564514, + "learning_rate": 1.34624937118367e-05, + "loss": 0.0115, + "step": 1006 + }, + { + "epoch": 1.940722891566265, + "grad_norm": 0.37233737111091614, + "learning_rate": 1.3420125929163976e-05, + "loss": 0.0309, + "step": 1007 + }, + { + "epoch": 1.9426506024096386, + "grad_norm": 0.19426730275154114, + "learning_rate": 1.3377791232232929e-05, + "loss": 0.0078, + "step": 1008 + }, + { + "epoch": 1.944578313253012, + "grad_norm": 0.2784160077571869, + "learning_rate": 1.333548983391617e-05, + "loss": 0.0142, + "step": 1009 + }, + { + "epoch": 1.9465060240963856, + "grad_norm": 0.11407195776700974, + "learning_rate": 1.3293221946918853e-05, + "loss": 0.0035, + "step": 1010 + }, + { + "epoch": 1.948433734939759, + "grad_norm": 0.3965436816215515, + "learning_rate": 1.325098778377762e-05, + "loss": 0.0242, + "step": 1011 + }, + { + "epoch": 1.9503614457831326, + "grad_norm": 0.18520519137382507, + "learning_rate": 1.3208787556859543e-05, + "loss": 0.0096, + "step": 1012 + }, + { + "epoch": 1.952289156626506, + "grad_norm": 0.2783315181732178, + "learning_rate": 1.3166621478361075e-05, + "loss": 0.0103, + "step": 1013 + }, + { + "epoch": 1.9542168674698797, + "grad_norm": 0.22714459896087646, + "learning_rate": 1.3124489760306917e-05, + "loss": 0.0078, + "step": 1014 + }, + { + "epoch": 1.956144578313253, + "grad_norm": 0.1257915049791336, + "learning_rate": 1.3082392614549036e-05, + "loss": 0.0077, + "step": 1015 + }, + { + "epoch": 1.9580722891566265, + "grad_norm": 0.15592887997627258, + "learning_rate": 1.3040330252765526e-05, + "loss": 0.0106, + "step": 1016 + }, + { + "epoch": 1.96, + "grad_norm": 0.19295449554920197, + "learning_rate": 1.2998302886459586e-05, + "loss": 0.0082, + "step": 1017 + }, + { + "epoch": 1.9619277108433735, + "grad_norm": 0.15544794499874115, + "learning_rate": 1.2956310726958472e-05, + "loss": 0.0068, + "step": 1018 + }, + { + "epoch": 1.963855421686747, + "grad_norm": 0.25899502635002136, + "learning_rate": 1.291435398541236e-05, + "loss": 0.0086, + "step": 1019 + }, + { + "epoch": 1.9657831325301205, + "grad_norm": 0.34639033675193787, + "learning_rate": 1.2872432872793379e-05, + "loss": 0.0116, + "step": 1020 + }, + { + "epoch": 1.967710843373494, + "grad_norm": 0.1628410518169403, + "learning_rate": 1.283054759989447e-05, + "loss": 0.0055, + "step": 1021 + }, + { + "epoch": 1.9696385542168675, + "grad_norm": 0.9273788928985596, + "learning_rate": 1.2788698377328385e-05, + "loss": 0.0264, + "step": 1022 + }, + { + "epoch": 1.971566265060241, + "grad_norm": 0.163126140832901, + "learning_rate": 1.2746885415526594e-05, + "loss": 0.0046, + "step": 1023 + }, + { + "epoch": 1.9734939759036143, + "grad_norm": 0.1475439816713333, + "learning_rate": 1.2705108924738223e-05, + "loss": 0.0056, + "step": 1024 + }, + { + "epoch": 1.975421686746988, + "grad_norm": 0.1654318869113922, + "learning_rate": 1.2663369115029034e-05, + "loss": 0.0056, + "step": 1025 + }, + { + "epoch": 1.9773493975903613, + "grad_norm": 0.20536045730113983, + "learning_rate": 1.2621666196280333e-05, + "loss": 0.0101, + "step": 1026 + }, + { + "epoch": 1.979277108433735, + "grad_norm": 0.19256474077701569, + "learning_rate": 1.258000037818792e-05, + "loss": 0.0059, + "step": 1027 + }, + { + "epoch": 1.9812048192771083, + "grad_norm": 0.2605120539665222, + "learning_rate": 1.2538371870261053e-05, + "loss": 0.0115, + "step": 1028 + }, + { + "epoch": 1.983132530120482, + "grad_norm": 0.14840295910835266, + "learning_rate": 1.249678088182137e-05, + "loss": 0.0046, + "step": 1029 + }, + { + "epoch": 1.9850602409638554, + "grad_norm": 0.17585207521915436, + "learning_rate": 1.2455227622001851e-05, + "loss": 0.0086, + "step": 1030 + }, + { + "epoch": 1.9869879518072289, + "grad_norm": 0.11044781655073166, + "learning_rate": 1.241371229974579e-05, + "loss": 0.0034, + "step": 1031 + }, + { + "epoch": 1.9889156626506024, + "grad_norm": 0.25584840774536133, + "learning_rate": 1.2372235123805672e-05, + "loss": 0.0245, + "step": 1032 + }, + { + "epoch": 1.9908433734939759, + "grad_norm": 0.25962474942207336, + "learning_rate": 1.2330796302742211e-05, + "loss": 0.0104, + "step": 1033 + }, + { + "epoch": 1.9927710843373494, + "grad_norm": 0.33408522605895996, + "learning_rate": 1.2289396044923238e-05, + "loss": 0.0176, + "step": 1034 + }, + { + "epoch": 1.994698795180723, + "grad_norm": 0.479950487613678, + "learning_rate": 1.2248034558522682e-05, + "loss": 0.0113, + "step": 1035 + }, + { + "epoch": 1.9966265060240964, + "grad_norm": 0.16567294299602509, + "learning_rate": 1.2206712051519518e-05, + "loss": 0.0036, + "step": 1036 + }, + { + "epoch": 1.99855421686747, + "grad_norm": 0.19343771040439606, + "learning_rate": 1.2165428731696713e-05, + "loss": 0.0077, + "step": 1037 + }, + { + "epoch": 2.0, + "grad_norm": 0.22895601391792297, + "learning_rate": 1.2124184806640202e-05, + "loss": 0.0114, + "step": 1038 + }, + { + "epoch": 2.0019277108433733, + "grad_norm": 0.15838384628295898, + "learning_rate": 1.208298048373782e-05, + "loss": 0.0043, + "step": 1039 + }, + { + "epoch": 2.003855421686747, + "grad_norm": 0.681065559387207, + "learning_rate": 1.2041815970178268e-05, + "loss": 0.0214, + "step": 1040 + }, + { + "epoch": 2.0057831325301203, + "grad_norm": 0.3357350528240204, + "learning_rate": 1.2000691472950081e-05, + "loss": 0.0079, + "step": 1041 + }, + { + "epoch": 2.007710843373494, + "grad_norm": 0.15238308906555176, + "learning_rate": 1.1959607198840568e-05, + "loss": 0.0041, + "step": 1042 + }, + { + "epoch": 2.0096385542168673, + "grad_norm": 0.11763229966163635, + "learning_rate": 1.1918563354434784e-05, + "loss": 0.0033, + "step": 1043 + }, + { + "epoch": 2.011566265060241, + "grad_norm": 0.3759301006793976, + "learning_rate": 1.1877560146114515e-05, + "loss": 0.0128, + "step": 1044 + }, + { + "epoch": 2.0134939759036143, + "grad_norm": 0.1143188625574112, + "learning_rate": 1.1836597780057183e-05, + "loss": 0.0078, + "step": 1045 + }, + { + "epoch": 2.015421686746988, + "grad_norm": 0.20059260725975037, + "learning_rate": 1.179567646223485e-05, + "loss": 0.0149, + "step": 1046 + }, + { + "epoch": 2.0173493975903614, + "grad_norm": 0.15569567680358887, + "learning_rate": 1.1754796398413196e-05, + "loss": 0.0038, + "step": 1047 + }, + { + "epoch": 2.019277108433735, + "grad_norm": 0.1153278723359108, + "learning_rate": 1.1713957794150423e-05, + "loss": 0.0041, + "step": 1048 + }, + { + "epoch": 2.0212048192771084, + "grad_norm": 0.1838717758655548, + "learning_rate": 1.1673160854796307e-05, + "loss": 0.0041, + "step": 1049 + }, + { + "epoch": 2.023132530120482, + "grad_norm": 0.12264502793550491, + "learning_rate": 1.1632405785491077e-05, + "loss": 0.0043, + "step": 1050 + }, + { + "epoch": 2.0250602409638554, + "grad_norm": 0.14363229274749756, + "learning_rate": 1.159169279116445e-05, + "loss": 0.0066, + "step": 1051 + }, + { + "epoch": 2.026987951807229, + "grad_norm": 0.1316995471715927, + "learning_rate": 1.1551022076534585e-05, + "loss": 0.0024, + "step": 1052 + }, + { + "epoch": 2.0289156626506024, + "grad_norm": 0.13392619788646698, + "learning_rate": 1.1510393846107001e-05, + "loss": 0.0051, + "step": 1053 + }, + { + "epoch": 2.0308433734939757, + "grad_norm": 3.0086817741394043, + "learning_rate": 1.1469808304173658e-05, + "loss": 0.0334, + "step": 1054 + }, + { + "epoch": 2.0327710843373494, + "grad_norm": 0.17756076157093048, + "learning_rate": 1.1429265654811803e-05, + "loss": 0.0068, + "step": 1055 + }, + { + "epoch": 2.0346987951807227, + "grad_norm": 0.13250532746315002, + "learning_rate": 1.1388766101883038e-05, + "loss": 0.0087, + "step": 1056 + }, + { + "epoch": 2.0366265060240965, + "grad_norm": 0.3534089922904968, + "learning_rate": 1.1348309849032257e-05, + "loss": 0.0076, + "step": 1057 + }, + { + "epoch": 2.0385542168674697, + "grad_norm": 0.11939049512147903, + "learning_rate": 1.1307897099686627e-05, + "loss": 0.0029, + "step": 1058 + }, + { + "epoch": 2.0404819277108435, + "grad_norm": 0.11862517893314362, + "learning_rate": 1.1267528057054562e-05, + "loss": 0.0062, + "step": 1059 + }, + { + "epoch": 2.0424096385542168, + "grad_norm": 0.1539212018251419, + "learning_rate": 1.1227202924124704e-05, + "loss": 0.0067, + "step": 1060 + }, + { + "epoch": 2.0443373493975905, + "grad_norm": 0.17163440585136414, + "learning_rate": 1.118692190366491e-05, + "loss": 0.0055, + "step": 1061 + }, + { + "epoch": 2.0462650602409638, + "grad_norm": 0.12304897606372833, + "learning_rate": 1.1146685198221222e-05, + "loss": 0.0036, + "step": 1062 + }, + { + "epoch": 2.0481927710843375, + "grad_norm": 0.17319051921367645, + "learning_rate": 1.1106493010116842e-05, + "loss": 0.0058, + "step": 1063 + }, + { + "epoch": 2.050120481927711, + "grad_norm": 0.2242443859577179, + "learning_rate": 1.1066345541451127e-05, + "loss": 0.0059, + "step": 1064 + }, + { + "epoch": 2.0520481927710845, + "grad_norm": 0.09533938020467758, + "learning_rate": 1.1026242994098597e-05, + "loss": 0.0033, + "step": 1065 + }, + { + "epoch": 2.053975903614458, + "grad_norm": 0.11697929352521896, + "learning_rate": 1.0986185569707852e-05, + "loss": 0.0038, + "step": 1066 + }, + { + "epoch": 2.0559036144578315, + "grad_norm": 0.2563149333000183, + "learning_rate": 1.0946173469700625e-05, + "loss": 0.0158, + "step": 1067 + }, + { + "epoch": 2.057831325301205, + "grad_norm": 0.21836932003498077, + "learning_rate": 1.0906206895270739e-05, + "loss": 0.0085, + "step": 1068 + }, + { + "epoch": 2.059759036144578, + "grad_norm": 0.1798071414232254, + "learning_rate": 1.0866286047383094e-05, + "loss": 0.0053, + "step": 1069 + }, + { + "epoch": 2.061686746987952, + "grad_norm": 0.08937730640172958, + "learning_rate": 1.0826411126772675e-05, + "loss": 0.0025, + "step": 1070 + }, + { + "epoch": 2.063614457831325, + "grad_norm": 0.0942138060927391, + "learning_rate": 1.0786582333943499e-05, + "loss": 0.0017, + "step": 1071 + }, + { + "epoch": 2.065542168674699, + "grad_norm": 0.13076582551002502, + "learning_rate": 1.0746799869167679e-05, + "loss": 0.0033, + "step": 1072 + }, + { + "epoch": 2.067469879518072, + "grad_norm": 0.0993233174085617, + "learning_rate": 1.0707063932484357e-05, + "loss": 0.0046, + "step": 1073 + }, + { + "epoch": 2.069397590361446, + "grad_norm": 0.3046741485595703, + "learning_rate": 1.0667374723698698e-05, + "loss": 0.009, + "step": 1074 + }, + { + "epoch": 2.071325301204819, + "grad_norm": 0.12197669595479965, + "learning_rate": 1.0627732442380932e-05, + "loss": 0.0034, + "step": 1075 + }, + { + "epoch": 2.073253012048193, + "grad_norm": 0.12721140682697296, + "learning_rate": 1.058813728786531e-05, + "loss": 0.0048, + "step": 1076 + }, + { + "epoch": 2.075180722891566, + "grad_norm": 0.10011966526508331, + "learning_rate": 1.0548589459249112e-05, + "loss": 0.0026, + "step": 1077 + }, + { + "epoch": 2.07710843373494, + "grad_norm": 0.3314201831817627, + "learning_rate": 1.0509089155391661e-05, + "loss": 0.0284, + "step": 1078 + }, + { + "epoch": 2.079036144578313, + "grad_norm": 0.32739701867103577, + "learning_rate": 1.0469636574913288e-05, + "loss": 0.0088, + "step": 1079 + }, + { + "epoch": 2.080963855421687, + "grad_norm": 0.13805675506591797, + "learning_rate": 1.043023191619438e-05, + "loss": 0.0042, + "step": 1080 + }, + { + "epoch": 2.0828915662650602, + "grad_norm": 0.14789745211601257, + "learning_rate": 1.039087537737435e-05, + "loss": 0.0037, + "step": 1081 + }, + { + "epoch": 2.0848192771084335, + "grad_norm": 0.15518991649150848, + "learning_rate": 1.0351567156350617e-05, + "loss": 0.0044, + "step": 1082 + }, + { + "epoch": 2.0867469879518072, + "grad_norm": 0.08380113542079926, + "learning_rate": 1.0312307450777706e-05, + "loss": 0.0019, + "step": 1083 + }, + { + "epoch": 2.0886746987951805, + "grad_norm": 0.17892400920391083, + "learning_rate": 1.027309645806613e-05, + "loss": 0.0065, + "step": 1084 + }, + { + "epoch": 2.0906024096385543, + "grad_norm": 0.5497608780860901, + "learning_rate": 1.0233934375381489e-05, + "loss": 0.0238, + "step": 1085 + }, + { + "epoch": 2.0925301204819275, + "grad_norm": 1.0189186334609985, + "learning_rate": 1.019482139964344e-05, + "loss": 0.0092, + "step": 1086 + }, + { + "epoch": 2.0944578313253013, + "grad_norm": 0.12144117057323456, + "learning_rate": 1.015575772752472e-05, + "loss": 0.0038, + "step": 1087 + }, + { + "epoch": 2.0963855421686746, + "grad_norm": 0.1115315854549408, + "learning_rate": 1.0116743555450148e-05, + "loss": 0.0024, + "step": 1088 + }, + { + "epoch": 2.0983132530120483, + "grad_norm": 0.22671759128570557, + "learning_rate": 1.0077779079595631e-05, + "loss": 0.0136, + "step": 1089 + }, + { + "epoch": 2.1002409638554216, + "grad_norm": 2.0009827613830566, + "learning_rate": 1.003886449588719e-05, + "loss": 0.0493, + "step": 1090 + }, + { + "epoch": 2.1021686746987953, + "grad_norm": 0.11907301843166351, + "learning_rate": 1.0000000000000006e-05, + "loss": 0.0034, + "step": 1091 + }, + { + "epoch": 2.1040963855421686, + "grad_norm": 0.31257638335227966, + "learning_rate": 9.961185787357346e-06, + "loss": 0.0129, + "step": 1092 + }, + { + "epoch": 2.1060240963855423, + "grad_norm": 0.11033743619918823, + "learning_rate": 9.922422053129674e-06, + "loss": 0.0184, + "step": 1093 + }, + { + "epoch": 2.1079518072289156, + "grad_norm": 0.2575698494911194, + "learning_rate": 9.883708992233626e-06, + "loss": 0.0054, + "step": 1094 + }, + { + "epoch": 2.1098795180722894, + "grad_norm": 0.12921132147312164, + "learning_rate": 9.845046799331029e-06, + "loss": 0.0037, + "step": 1095 + }, + { + "epoch": 2.1118072289156626, + "grad_norm": 0.21405921876430511, + "learning_rate": 9.806435668827941e-06, + "loss": 0.006, + "step": 1096 + }, + { + "epoch": 2.113734939759036, + "grad_norm": 0.12929430603981018, + "learning_rate": 9.76787579487363e-06, + "loss": 0.0049, + "step": 1097 + }, + { + "epoch": 2.1156626506024097, + "grad_norm": 0.1793181151151657, + "learning_rate": 9.729367371359681e-06, + "loss": 0.0086, + "step": 1098 + }, + { + "epoch": 2.117590361445783, + "grad_norm": 0.2182074338197708, + "learning_rate": 9.690910591918936e-06, + "loss": 0.0106, + "step": 1099 + }, + { + "epoch": 2.1195180722891567, + "grad_norm": 0.0705680400133133, + "learning_rate": 9.652505649924547e-06, + "loss": 0.0012, + "step": 1100 + }, + { + "epoch": 2.12144578313253, + "grad_norm": 0.10509738326072693, + "learning_rate": 9.614152738489021e-06, + "loss": 0.0048, + "step": 1101 + }, + { + "epoch": 2.1233734939759037, + "grad_norm": 0.13775436580181122, + "learning_rate": 9.575852050463268e-06, + "loss": 0.0089, + "step": 1102 + }, + { + "epoch": 2.125301204819277, + "grad_norm": 0.15230101346969604, + "learning_rate": 9.537603778435545e-06, + "loss": 0.0065, + "step": 1103 + }, + { + "epoch": 2.1272289156626507, + "grad_norm": 0.24702346324920654, + "learning_rate": 9.499408114730583e-06, + "loss": 0.016, + "step": 1104 + }, + { + "epoch": 2.129156626506024, + "grad_norm": 0.1082577034831047, + "learning_rate": 9.461265251408575e-06, + "loss": 0.0036, + "step": 1105 + }, + { + "epoch": 2.1310843373493977, + "grad_norm": 0.1063847690820694, + "learning_rate": 9.423175380264211e-06, + "loss": 0.0037, + "step": 1106 + }, + { + "epoch": 2.133012048192771, + "grad_norm": 0.07686953246593475, + "learning_rate": 9.385138692825729e-06, + "loss": 0.0031, + "step": 1107 + }, + { + "epoch": 2.1349397590361447, + "grad_norm": 0.2046380341053009, + "learning_rate": 9.347155380353912e-06, + "loss": 0.0087, + "step": 1108 + }, + { + "epoch": 2.136867469879518, + "grad_norm": 0.1341692954301834, + "learning_rate": 9.30922563384121e-06, + "loss": 0.0045, + "step": 1109 + }, + { + "epoch": 2.1387951807228918, + "grad_norm": 0.09870535880327225, + "learning_rate": 9.271349644010672e-06, + "loss": 0.003, + "step": 1110 + }, + { + "epoch": 2.140722891566265, + "grad_norm": 0.18708615005016327, + "learning_rate": 9.233527601315069e-06, + "loss": 0.0042, + "step": 1111 + }, + { + "epoch": 2.1426506024096383, + "grad_norm": 0.5175634026527405, + "learning_rate": 9.195759695935907e-06, + "loss": 0.0173, + "step": 1112 + }, + { + "epoch": 2.144578313253012, + "grad_norm": 0.14939036965370178, + "learning_rate": 9.158046117782464e-06, + "loss": 0.0031, + "step": 1113 + }, + { + "epoch": 2.1465060240963854, + "grad_norm": 0.2837410569190979, + "learning_rate": 9.120387056490851e-06, + "loss": 0.0097, + "step": 1114 + }, + { + "epoch": 2.148433734939759, + "grad_norm": 0.11088677495718002, + "learning_rate": 9.082782701423047e-06, + "loss": 0.0026, + "step": 1115 + }, + { + "epoch": 2.1503614457831324, + "grad_norm": 0.07785166054964066, + "learning_rate": 9.045233241665947e-06, + "loss": 0.0019, + "step": 1116 + }, + { + "epoch": 2.152289156626506, + "grad_norm": 0.17568141222000122, + "learning_rate": 9.007738866030427e-06, + "loss": 0.0039, + "step": 1117 + }, + { + "epoch": 2.1542168674698794, + "grad_norm": 0.12652266025543213, + "learning_rate": 8.970299763050356e-06, + "loss": 0.0033, + "step": 1118 + }, + { + "epoch": 2.156144578313253, + "grad_norm": 0.16801467537879944, + "learning_rate": 8.932916120981695e-06, + "loss": 0.0076, + "step": 1119 + }, + { + "epoch": 2.1580722891566264, + "grad_norm": 0.18313169479370117, + "learning_rate": 8.895588127801545e-06, + "loss": 0.0052, + "step": 1120 + }, + { + "epoch": 2.16, + "grad_norm": 0.07546049356460571, + "learning_rate": 8.858315971207146e-06, + "loss": 0.0022, + "step": 1121 + }, + { + "epoch": 2.1619277108433734, + "grad_norm": 0.4039839208126068, + "learning_rate": 8.821099838614996e-06, + "loss": 0.0203, + "step": 1122 + }, + { + "epoch": 2.163855421686747, + "grad_norm": 0.09244243055582047, + "learning_rate": 8.783939917159897e-06, + "loss": 0.002, + "step": 1123 + }, + { + "epoch": 2.1657831325301204, + "grad_norm": 0.18327835202217102, + "learning_rate": 8.746836393693978e-06, + "loss": 0.0055, + "step": 1124 + }, + { + "epoch": 2.167710843373494, + "grad_norm": 0.22010307013988495, + "learning_rate": 8.709789454785809e-06, + "loss": 0.0077, + "step": 1125 + }, + { + "epoch": 2.1696385542168675, + "grad_norm": 0.09438297897577286, + "learning_rate": 8.67279928671939e-06, + "loss": 0.0032, + "step": 1126 + }, + { + "epoch": 2.1715662650602408, + "grad_norm": 0.20782770216464996, + "learning_rate": 8.635866075493318e-06, + "loss": 0.0028, + "step": 1127 + }, + { + "epoch": 2.1734939759036145, + "grad_norm": 0.1958685964345932, + "learning_rate": 8.598990006819756e-06, + "loss": 0.0047, + "step": 1128 + }, + { + "epoch": 2.1754216867469878, + "grad_norm": 0.06459935009479523, + "learning_rate": 8.562171266123528e-06, + "loss": 0.0015, + "step": 1129 + }, + { + "epoch": 2.1773493975903615, + "grad_norm": 0.33486708998680115, + "learning_rate": 8.525410038541218e-06, + "loss": 0.0094, + "step": 1130 + }, + { + "epoch": 2.179277108433735, + "grad_norm": 0.5755940079689026, + "learning_rate": 8.488706508920202e-06, + "loss": 0.0067, + "step": 1131 + }, + { + "epoch": 2.1812048192771085, + "grad_norm": 0.10840924829244614, + "learning_rate": 8.452060861817738e-06, + "loss": 0.0082, + "step": 1132 + }, + { + "epoch": 2.183132530120482, + "grad_norm": 0.18611350655555725, + "learning_rate": 8.415473281500037e-06, + "loss": 0.0059, + "step": 1133 + }, + { + "epoch": 2.1850602409638555, + "grad_norm": 0.11245249956846237, + "learning_rate": 8.378943951941301e-06, + "loss": 0.0107, + "step": 1134 + }, + { + "epoch": 2.186987951807229, + "grad_norm": 0.12284426391124725, + "learning_rate": 8.342473056822873e-06, + "loss": 0.0025, + "step": 1135 + }, + { + "epoch": 2.1889156626506026, + "grad_norm": 0.12542888522148132, + "learning_rate": 8.306060779532245e-06, + "loss": 0.0059, + "step": 1136 + }, + { + "epoch": 2.190843373493976, + "grad_norm": 0.1287655532360077, + "learning_rate": 8.26970730316215e-06, + "loss": 0.0022, + "step": 1137 + }, + { + "epoch": 2.1927710843373496, + "grad_norm": 0.1818632185459137, + "learning_rate": 8.233412810509669e-06, + "loss": 0.0131, + "step": 1138 + }, + { + "epoch": 2.194698795180723, + "grad_norm": 0.09687745571136475, + "learning_rate": 8.197177484075284e-06, + "loss": 0.0025, + "step": 1139 + }, + { + "epoch": 2.1966265060240966, + "grad_norm": 0.16103452444076538, + "learning_rate": 8.161001506061979e-06, + "loss": 0.0031, + "step": 1140 + }, + { + "epoch": 2.19855421686747, + "grad_norm": 0.2711680233478546, + "learning_rate": 8.124885058374302e-06, + "loss": 0.0034, + "step": 1141 + }, + { + "epoch": 2.200481927710843, + "grad_norm": 0.17613105475902557, + "learning_rate": 8.088828322617473e-06, + "loss": 0.0044, + "step": 1142 + }, + { + "epoch": 2.202409638554217, + "grad_norm": 0.2298487424850464, + "learning_rate": 8.052831480096464e-06, + "loss": 0.0168, + "step": 1143 + }, + { + "epoch": 2.20433734939759, + "grad_norm": 0.17042206227779388, + "learning_rate": 8.016894711815067e-06, + "loss": 0.007, + "step": 1144 + }, + { + "epoch": 2.206265060240964, + "grad_norm": 0.2830466628074646, + "learning_rate": 7.98101819847501e-06, + "loss": 0.0091, + "step": 1145 + }, + { + "epoch": 2.208192771084337, + "grad_norm": 0.22089065611362457, + "learning_rate": 7.945202120475063e-06, + "loss": 0.0046, + "step": 1146 + }, + { + "epoch": 2.210120481927711, + "grad_norm": 0.1716073900461197, + "learning_rate": 7.909446657910072e-06, + "loss": 0.0032, + "step": 1147 + }, + { + "epoch": 2.212048192771084, + "grad_norm": 0.16140373051166534, + "learning_rate": 7.873751990570104e-06, + "loss": 0.0057, + "step": 1148 + }, + { + "epoch": 2.213975903614458, + "grad_norm": 0.1671605408191681, + "learning_rate": 7.838118297939529e-06, + "loss": 0.0039, + "step": 1149 + }, + { + "epoch": 2.2159036144578312, + "grad_norm": 0.10933005809783936, + "learning_rate": 7.802545759196117e-06, + "loss": 0.005, + "step": 1150 + }, + { + "epoch": 2.217831325301205, + "grad_norm": 0.07819998264312744, + "learning_rate": 7.76703455321014e-06, + "loss": 0.0025, + "step": 1151 + }, + { + "epoch": 2.2197590361445783, + "grad_norm": 0.36211854219436646, + "learning_rate": 7.73158485854344e-06, + "loss": 0.0151, + "step": 1152 + }, + { + "epoch": 2.221686746987952, + "grad_norm": 0.09098304808139801, + "learning_rate": 7.696196853448612e-06, + "loss": 0.0027, + "step": 1153 + }, + { + "epoch": 2.2236144578313253, + "grad_norm": 0.17442144453525543, + "learning_rate": 7.660870715868018e-06, + "loss": 0.006, + "step": 1154 + }, + { + "epoch": 2.225542168674699, + "grad_norm": 0.09785338491201401, + "learning_rate": 7.625606623432933e-06, + "loss": 0.0041, + "step": 1155 + }, + { + "epoch": 2.2274698795180723, + "grad_norm": 0.19399888813495636, + "learning_rate": 7.590404753462653e-06, + "loss": 0.0125, + "step": 1156 + }, + { + "epoch": 2.2293975903614456, + "grad_norm": 0.11080623418092728, + "learning_rate": 7.55526528296362e-06, + "loss": 0.0022, + "step": 1157 + }, + { + "epoch": 2.2313253012048193, + "grad_norm": 0.14067359268665314, + "learning_rate": 7.520188388628473e-06, + "loss": 0.0123, + "step": 1158 + }, + { + "epoch": 2.2332530120481926, + "grad_norm": 0.14533625543117523, + "learning_rate": 7.485174246835227e-06, + "loss": 0.0039, + "step": 1159 + }, + { + "epoch": 2.2351807228915663, + "grad_norm": 0.1253812462091446, + "learning_rate": 7.4502230336463466e-06, + "loss": 0.003, + "step": 1160 + }, + { + "epoch": 2.2371084337349396, + "grad_norm": 0.12766572833061218, + "learning_rate": 7.415334924807869e-06, + "loss": 0.0044, + "step": 1161 + }, + { + "epoch": 2.2390361445783133, + "grad_norm": 0.11985791474580765, + "learning_rate": 7.380510095748535e-06, + "loss": 0.0071, + "step": 1162 + }, + { + "epoch": 2.2409638554216866, + "grad_norm": 0.15505346655845642, + "learning_rate": 7.3457487215788605e-06, + "loss": 0.0046, + "step": 1163 + }, + { + "epoch": 2.2428915662650604, + "grad_norm": 0.18983210623264313, + "learning_rate": 7.311050977090343e-06, + "loss": 0.0079, + "step": 1164 + }, + { + "epoch": 2.2448192771084337, + "grad_norm": 0.19279207289218903, + "learning_rate": 7.276417036754479e-06, + "loss": 0.0042, + "step": 1165 + }, + { + "epoch": 2.2467469879518074, + "grad_norm": 0.21539707481861115, + "learning_rate": 7.241847074721964e-06, + "loss": 0.0087, + "step": 1166 + }, + { + "epoch": 2.2486746987951807, + "grad_norm": 0.07004354894161224, + "learning_rate": 7.207341264821783e-06, + "loss": 0.002, + "step": 1167 + }, + { + "epoch": 2.2506024096385544, + "grad_norm": 0.2203039526939392, + "learning_rate": 7.172899780560345e-06, + "loss": 0.0069, + "step": 1168 + }, + { + "epoch": 2.2525301204819277, + "grad_norm": 0.12474718689918518, + "learning_rate": 7.138522795120606e-06, + "loss": 0.0122, + "step": 1169 + }, + { + "epoch": 2.2544578313253014, + "grad_norm": 0.09078995883464813, + "learning_rate": 7.104210481361204e-06, + "loss": 0.0025, + "step": 1170 + }, + { + "epoch": 2.2563855421686747, + "grad_norm": 0.141757071018219, + "learning_rate": 7.069963011815584e-06, + "loss": 0.0039, + "step": 1171 + }, + { + "epoch": 2.258313253012048, + "grad_norm": 0.14944659173488617, + "learning_rate": 7.035780558691141e-06, + "loss": 0.0025, + "step": 1172 + }, + { + "epoch": 2.2602409638554217, + "grad_norm": 0.06723666191101074, + "learning_rate": 7.001663293868328e-06, + "loss": 0.0014, + "step": 1173 + }, + { + "epoch": 2.262168674698795, + "grad_norm": 0.11966485530138016, + "learning_rate": 6.967611388899826e-06, + "loss": 0.0067, + "step": 1174 + }, + { + "epoch": 2.2640963855421687, + "grad_norm": 0.08943185210227966, + "learning_rate": 6.933625015009666e-06, + "loss": 0.0036, + "step": 1175 + }, + { + "epoch": 2.266024096385542, + "grad_norm": 0.04511453956365585, + "learning_rate": 6.899704343092359e-06, + "loss": 0.0014, + "step": 1176 + }, + { + "epoch": 2.2679518072289158, + "grad_norm": 0.1867951601743698, + "learning_rate": 6.865849543712058e-06, + "loss": 0.009, + "step": 1177 + }, + { + "epoch": 2.269879518072289, + "grad_norm": 0.23791250586509705, + "learning_rate": 6.832060787101658e-06, + "loss": 0.0117, + "step": 1178 + }, + { + "epoch": 2.271807228915663, + "grad_norm": 0.13210316002368927, + "learning_rate": 6.798338243162008e-06, + "loss": 0.0024, + "step": 1179 + }, + { + "epoch": 2.273734939759036, + "grad_norm": 0.1601375937461853, + "learning_rate": 6.764682081461002e-06, + "loss": 0.013, + "step": 1180 + }, + { + "epoch": 2.27566265060241, + "grad_norm": 0.21996766328811646, + "learning_rate": 6.73109247123273e-06, + "loss": 0.0074, + "step": 1181 + }, + { + "epoch": 2.277590361445783, + "grad_norm": 0.15780030190944672, + "learning_rate": 6.6975695813766465e-06, + "loss": 0.0052, + "step": 1182 + }, + { + "epoch": 2.279518072289157, + "grad_norm": 0.18146437406539917, + "learning_rate": 6.664113580456739e-06, + "loss": 0.0265, + "step": 1183 + }, + { + "epoch": 2.28144578313253, + "grad_norm": 0.12033495306968689, + "learning_rate": 6.630724636700618e-06, + "loss": 0.0026, + "step": 1184 + }, + { + "epoch": 2.283373493975904, + "grad_norm": 0.25268155336380005, + "learning_rate": 6.59740291799873e-06, + "loss": 0.0046, + "step": 1185 + }, + { + "epoch": 2.285301204819277, + "grad_norm": 0.19043004512786865, + "learning_rate": 6.564148591903488e-06, + "loss": 0.0063, + "step": 1186 + }, + { + "epoch": 2.2872289156626504, + "grad_norm": 0.06894923001527786, + "learning_rate": 6.530961825628432e-06, + "loss": 0.0012, + "step": 1187 + }, + { + "epoch": 2.289156626506024, + "grad_norm": 0.16378818452358246, + "learning_rate": 6.4978427860474015e-06, + "loss": 0.0048, + "step": 1188 + }, + { + "epoch": 2.2910843373493974, + "grad_norm": 0.11130444705486298, + "learning_rate": 6.464791639693648e-06, + "loss": 0.0049, + "step": 1189 + }, + { + "epoch": 2.293012048192771, + "grad_norm": 0.10573417693376541, + "learning_rate": 6.431808552759083e-06, + "loss": 0.0019, + "step": 1190 + }, + { + "epoch": 2.2949397590361444, + "grad_norm": 0.13344882428646088, + "learning_rate": 6.398893691093367e-06, + "loss": 0.0033, + "step": 1191 + }, + { + "epoch": 2.296867469879518, + "grad_norm": 0.12659135460853577, + "learning_rate": 6.366047220203088e-06, + "loss": 0.0032, + "step": 1192 + }, + { + "epoch": 2.2987951807228915, + "grad_norm": 0.10152821987867355, + "learning_rate": 6.333269305250971e-06, + "loss": 0.0027, + "step": 1193 + }, + { + "epoch": 2.300722891566265, + "grad_norm": 0.1889944225549698, + "learning_rate": 6.300560111055006e-06, + "loss": 0.0062, + "step": 1194 + }, + { + "epoch": 2.3026506024096385, + "grad_norm": 2.3101227283477783, + "learning_rate": 6.2679198020876275e-06, + "loss": 0.0113, + "step": 1195 + }, + { + "epoch": 2.304578313253012, + "grad_norm": 0.6224933862686157, + "learning_rate": 6.235348542474908e-06, + "loss": 0.0273, + "step": 1196 + }, + { + "epoch": 2.3065060240963855, + "grad_norm": 0.1908419281244278, + "learning_rate": 6.202846495995705e-06, + "loss": 0.0056, + "step": 1197 + }, + { + "epoch": 2.3084337349397592, + "grad_norm": 0.10968491435050964, + "learning_rate": 6.170413826080856e-06, + "loss": 0.0034, + "step": 1198 + }, + { + "epoch": 2.3103614457831325, + "grad_norm": 0.23200668394565582, + "learning_rate": 6.138050695812343e-06, + "loss": 0.0042, + "step": 1199 + }, + { + "epoch": 2.3122891566265062, + "grad_norm": 0.12442032992839813, + "learning_rate": 6.105757267922481e-06, + "loss": 0.0045, + "step": 1200 + }, + { + "epoch": 2.3142168674698795, + "grad_norm": 0.14563624560832977, + "learning_rate": 6.073533704793122e-06, + "loss": 0.0035, + "step": 1201 + }, + { + "epoch": 2.316144578313253, + "grad_norm": 0.11523722857236862, + "learning_rate": 6.04138016845478e-06, + "loss": 0.0088, + "step": 1202 + }, + { + "epoch": 2.3180722891566266, + "grad_norm": 0.2000943422317505, + "learning_rate": 6.009296820585871e-06, + "loss": 0.0059, + "step": 1203 + }, + { + "epoch": 2.32, + "grad_norm": 0.10698592662811279, + "learning_rate": 5.977283822511879e-06, + "loss": 0.0028, + "step": 1204 + }, + { + "epoch": 2.3219277108433736, + "grad_norm": 0.1533137410879135, + "learning_rate": 5.945341335204547e-06, + "loss": 0.0044, + "step": 1205 + }, + { + "epoch": 2.323855421686747, + "grad_norm": 0.1235835999250412, + "learning_rate": 5.9134695192810695e-06, + "loss": 0.0043, + "step": 1206 + }, + { + "epoch": 2.3257831325301206, + "grad_norm": 0.1916925013065338, + "learning_rate": 5.8816685350032575e-06, + "loss": 0.0066, + "step": 1207 + }, + { + "epoch": 2.327710843373494, + "grad_norm": 0.08812380582094193, + "learning_rate": 5.849938542276801e-06, + "loss": 0.0022, + "step": 1208 + }, + { + "epoch": 2.3296385542168676, + "grad_norm": 0.13387660682201385, + "learning_rate": 5.818279700650393e-06, + "loss": 0.0037, + "step": 1209 + }, + { + "epoch": 2.331566265060241, + "grad_norm": 0.2309022694826126, + "learning_rate": 5.786692169314954e-06, + "loss": 0.0049, + "step": 1210 + }, + { + "epoch": 2.3334939759036146, + "grad_norm": 0.09956549853086472, + "learning_rate": 5.755176107102833e-06, + "loss": 0.002, + "step": 1211 + }, + { + "epoch": 2.335421686746988, + "grad_norm": 0.06035687029361725, + "learning_rate": 5.723731672487043e-06, + "loss": 0.002, + "step": 1212 + }, + { + "epoch": 2.337349397590361, + "grad_norm": 0.06850237399339676, + "learning_rate": 5.69235902358038e-06, + "loss": 0.0013, + "step": 1213 + }, + { + "epoch": 2.339277108433735, + "grad_norm": 0.12068171054124832, + "learning_rate": 5.661058318134711e-06, + "loss": 0.0041, + "step": 1214 + }, + { + "epoch": 2.3412048192771087, + "grad_norm": 0.13146616518497467, + "learning_rate": 5.6298297135401355e-06, + "loss": 0.0022, + "step": 1215 + }, + { + "epoch": 2.343132530120482, + "grad_norm": 0.15160737931728363, + "learning_rate": 5.598673366824212e-06, + "loss": 0.0036, + "step": 1216 + }, + { + "epoch": 2.3450602409638552, + "grad_norm": 0.26196014881134033, + "learning_rate": 5.567589434651164e-06, + "loss": 0.0151, + "step": 1217 + }, + { + "epoch": 2.346987951807229, + "grad_norm": 0.12898831069469452, + "learning_rate": 5.536578073321073e-06, + "loss": 0.006, + "step": 1218 + }, + { + "epoch": 2.3489156626506023, + "grad_norm": 0.11385104805231094, + "learning_rate": 5.505639438769146e-06, + "loss": 0.0052, + "step": 1219 + }, + { + "epoch": 2.350843373493976, + "grad_norm": 0.14569509029388428, + "learning_rate": 5.47477368656486e-06, + "loss": 0.0048, + "step": 1220 + }, + { + "epoch": 2.3527710843373493, + "grad_norm": 0.12406075745820999, + "learning_rate": 5.443980971911238e-06, + "loss": 0.0028, + "step": 1221 + }, + { + "epoch": 2.354698795180723, + "grad_norm": 0.3730498254299164, + "learning_rate": 5.413261449644039e-06, + "loss": 0.0043, + "step": 1222 + }, + { + "epoch": 2.3566265060240963, + "grad_norm": 0.1449914574623108, + "learning_rate": 5.382615274230987e-06, + "loss": 0.0075, + "step": 1223 + }, + { + "epoch": 2.35855421686747, + "grad_norm": 0.20739100873470306, + "learning_rate": 5.352042599770995e-06, + "loss": 0.0061, + "step": 1224 + }, + { + "epoch": 2.3604819277108433, + "grad_norm": 0.05786775052547455, + "learning_rate": 5.321543579993398e-06, + "loss": 0.0015, + "step": 1225 + }, + { + "epoch": 2.362409638554217, + "grad_norm": 0.09043122828006744, + "learning_rate": 5.2911183682571446e-06, + "loss": 0.0034, + "step": 1226 + }, + { + "epoch": 2.3643373493975903, + "grad_norm": 0.2685496211051941, + "learning_rate": 5.260767117550094e-06, + "loss": 0.0076, + "step": 1227 + }, + { + "epoch": 2.3662650602409636, + "grad_norm": 0.17694126069545746, + "learning_rate": 5.230489980488165e-06, + "loss": 0.0148, + "step": 1228 + }, + { + "epoch": 2.3681927710843373, + "grad_norm": 0.11609307676553726, + "learning_rate": 5.200287109314633e-06, + "loss": 0.0049, + "step": 1229 + }, + { + "epoch": 2.370120481927711, + "grad_norm": 0.1257704645395279, + "learning_rate": 5.1701586558993285e-06, + "loss": 0.0031, + "step": 1230 + }, + { + "epoch": 2.3720481927710844, + "grad_norm": 0.27177703380584717, + "learning_rate": 5.140104771737899e-06, + "loss": 0.0058, + "step": 1231 + }, + { + "epoch": 2.3739759036144576, + "grad_norm": 0.13928169012069702, + "learning_rate": 5.110125607951024e-06, + "loss": 0.0051, + "step": 1232 + }, + { + "epoch": 2.3759036144578314, + "grad_norm": 0.679577648639679, + "learning_rate": 5.0802213152836514e-06, + "loss": 0.0173, + "step": 1233 + }, + { + "epoch": 2.3778313253012047, + "grad_norm": 0.16769403219223022, + "learning_rate": 5.0503920441042845e-06, + "loss": 0.0045, + "step": 1234 + }, + { + "epoch": 2.3797590361445784, + "grad_norm": 0.09427493065595627, + "learning_rate": 5.0206379444041764e-06, + "loss": 0.0024, + "step": 1235 + }, + { + "epoch": 2.3816867469879517, + "grad_norm": 0.33908671140670776, + "learning_rate": 4.990959165796585e-06, + "loss": 0.0088, + "step": 1236 + }, + { + "epoch": 2.3836144578313254, + "grad_norm": 0.18106943368911743, + "learning_rate": 4.961355857516034e-06, + "loss": 0.0094, + "step": 1237 + }, + { + "epoch": 2.3855421686746987, + "grad_norm": 0.5833203196525574, + "learning_rate": 4.931828168417583e-06, + "loss": 0.0086, + "step": 1238 + }, + { + "epoch": 2.3874698795180724, + "grad_norm": 0.09108569473028183, + "learning_rate": 4.902376246976015e-06, + "loss": 0.0014, + "step": 1239 + }, + { + "epoch": 2.3893975903614457, + "grad_norm": 0.10596407204866409, + "learning_rate": 4.873000241285153e-06, + "loss": 0.0043, + "step": 1240 + }, + { + "epoch": 2.3913253012048195, + "grad_norm": 0.10775511711835861, + "learning_rate": 4.8437002990570835e-06, + "loss": 0.0014, + "step": 1241 + }, + { + "epoch": 2.3932530120481927, + "grad_norm": 0.9646345973014832, + "learning_rate": 4.8144765676214245e-06, + "loss": 0.0525, + "step": 1242 + }, + { + "epoch": 2.395180722891566, + "grad_norm": 0.20530278980731964, + "learning_rate": 4.7853291939245814e-06, + "loss": 0.008, + "step": 1243 + }, + { + "epoch": 2.3971084337349398, + "grad_norm": 0.1682119369506836, + "learning_rate": 4.756258324528995e-06, + "loss": 0.0044, + "step": 1244 + }, + { + "epoch": 2.3990361445783135, + "grad_norm": 0.45536917448043823, + "learning_rate": 4.727264105612439e-06, + "loss": 0.0186, + "step": 1245 + }, + { + "epoch": 2.4009638554216868, + "grad_norm": 0.3017471730709076, + "learning_rate": 4.698346682967258e-06, + "loss": 0.0106, + "step": 1246 + }, + { + "epoch": 2.40289156626506, + "grad_norm": 0.1226554661989212, + "learning_rate": 4.669506201999625e-06, + "loss": 0.0035, + "step": 1247 + }, + { + "epoch": 2.404819277108434, + "grad_norm": 0.13750068843364716, + "learning_rate": 4.640742807728837e-06, + "loss": 0.0038, + "step": 1248 + }, + { + "epoch": 2.406746987951807, + "grad_norm": 0.11531024426221848, + "learning_rate": 4.612056644786575e-06, + "loss": 0.0021, + "step": 1249 + }, + { + "epoch": 2.408674698795181, + "grad_norm": 0.1143675372004509, + "learning_rate": 4.583447857416175e-06, + "loss": 0.0028, + "step": 1250 + }, + { + "epoch": 2.410602409638554, + "grad_norm": 0.0914216861128807, + "learning_rate": 4.554916589471898e-06, + "loss": 0.0027, + "step": 1251 + }, + { + "epoch": 2.412530120481928, + "grad_norm": 0.18339012563228607, + "learning_rate": 4.526462984418221e-06, + "loss": 0.0037, + "step": 1252 + }, + { + "epoch": 2.414457831325301, + "grad_norm": 0.11073138564825058, + "learning_rate": 4.498087185329105e-06, + "loss": 0.003, + "step": 1253 + }, + { + "epoch": 2.416385542168675, + "grad_norm": 0.20792435109615326, + "learning_rate": 4.469789334887265e-06, + "loss": 0.009, + "step": 1254 + }, + { + "epoch": 2.418313253012048, + "grad_norm": 0.09485629945993423, + "learning_rate": 4.441569575383471e-06, + "loss": 0.0033, + "step": 1255 + }, + { + "epoch": 2.420240963855422, + "grad_norm": 0.11831793934106827, + "learning_rate": 4.413428048715851e-06, + "loss": 0.0021, + "step": 1256 + }, + { + "epoch": 2.422168674698795, + "grad_norm": 0.11818034201860428, + "learning_rate": 4.38536489638911e-06, + "loss": 0.0041, + "step": 1257 + }, + { + "epoch": 2.4240963855421684, + "grad_norm": 0.2583082616329193, + "learning_rate": 4.3573802595138945e-06, + "loss": 0.0039, + "step": 1258 + }, + { + "epoch": 2.426024096385542, + "grad_norm": 0.3120201826095581, + "learning_rate": 4.329474278806034e-06, + "loss": 0.0087, + "step": 1259 + }, + { + "epoch": 2.427951807228916, + "grad_norm": 0.1258879452943802, + "learning_rate": 4.301647094585855e-06, + "loss": 0.0046, + "step": 1260 + }, + { + "epoch": 2.429879518072289, + "grad_norm": 0.15144586563110352, + "learning_rate": 4.273898846777473e-06, + "loss": 0.0054, + "step": 1261 + }, + { + "epoch": 2.4318072289156625, + "grad_norm": 0.15615184605121613, + "learning_rate": 4.246229674908067e-06, + "loss": 0.0072, + "step": 1262 + }, + { + "epoch": 2.433734939759036, + "grad_norm": 0.09690173715353012, + "learning_rate": 4.218639718107225e-06, + "loss": 0.003, + "step": 1263 + }, + { + "epoch": 2.4356626506024095, + "grad_norm": 0.23884955048561096, + "learning_rate": 4.1911291151062e-06, + "loss": 0.0109, + "step": 1264 + }, + { + "epoch": 2.4375903614457832, + "grad_norm": 0.0905768945813179, + "learning_rate": 4.163698004237222e-06, + "loss": 0.0027, + "step": 1265 + }, + { + "epoch": 2.4395180722891565, + "grad_norm": 0.09168912470340729, + "learning_rate": 4.136346523432821e-06, + "loss": 0.0018, + "step": 1266 + }, + { + "epoch": 2.4414457831325302, + "grad_norm": 0.17878012359142303, + "learning_rate": 4.109074810225118e-06, + "loss": 0.0048, + "step": 1267 + }, + { + "epoch": 2.4433734939759035, + "grad_norm": 0.09913790971040726, + "learning_rate": 4.08188300174513e-06, + "loss": 0.0021, + "step": 1268 + }, + { + "epoch": 2.4453012048192773, + "grad_norm": 0.16615812480449677, + "learning_rate": 4.054771234722106e-06, + "loss": 0.0066, + "step": 1269 + }, + { + "epoch": 2.4472289156626506, + "grad_norm": 0.09618276357650757, + "learning_rate": 4.027739645482784e-06, + "loss": 0.0043, + "step": 1270 + }, + { + "epoch": 2.4491566265060243, + "grad_norm": 0.33473479747772217, + "learning_rate": 4.0007883699507855e-06, + "loss": 0.0236, + "step": 1271 + }, + { + "epoch": 2.4510843373493976, + "grad_norm": 0.15051880478858948, + "learning_rate": 3.973917543645867e-06, + "loss": 0.0068, + "step": 1272 + }, + { + "epoch": 2.453012048192771, + "grad_norm": 0.24134816229343414, + "learning_rate": 3.947127301683249e-06, + "loss": 0.0194, + "step": 1273 + }, + { + "epoch": 2.4549397590361446, + "grad_norm": 0.10495353490114212, + "learning_rate": 3.920417778772967e-06, + "loss": 0.0042, + "step": 1274 + }, + { + "epoch": 2.4568674698795183, + "grad_norm": 0.2294938713312149, + "learning_rate": 3.893789109219171e-06, + "loss": 0.0224, + "step": 1275 + }, + { + "epoch": 2.4587951807228916, + "grad_norm": 0.13710513710975647, + "learning_rate": 3.867241426919446e-06, + "loss": 0.0046, + "step": 1276 + }, + { + "epoch": 2.460722891566265, + "grad_norm": 0.06754808127880096, + "learning_rate": 3.840774865364157e-06, + "loss": 0.0019, + "step": 1277 + }, + { + "epoch": 2.4626506024096386, + "grad_norm": 0.24797780811786652, + "learning_rate": 3.8143895576357605e-06, + "loss": 0.0063, + "step": 1278 + }, + { + "epoch": 2.464578313253012, + "grad_norm": 0.1476449817419052, + "learning_rate": 3.788085636408143e-06, + "loss": 0.0055, + "step": 1279 + }, + { + "epoch": 2.4665060240963856, + "grad_norm": 0.22397096455097198, + "learning_rate": 3.7618632339459616e-06, + "loss": 0.0164, + "step": 1280 + }, + { + "epoch": 2.468433734939759, + "grad_norm": 0.21596969664096832, + "learning_rate": 3.7357224821039497e-06, + "loss": 0.0112, + "step": 1281 + }, + { + "epoch": 2.4703614457831327, + "grad_norm": 0.2775099575519562, + "learning_rate": 3.7096635123263068e-06, + "loss": 0.0112, + "step": 1282 + }, + { + "epoch": 2.472289156626506, + "grad_norm": 0.07963326573371887, + "learning_rate": 3.683686455645974e-06, + "loss": 0.0013, + "step": 1283 + }, + { + "epoch": 2.4742168674698797, + "grad_norm": 0.1253802627325058, + "learning_rate": 3.6577914426840266e-06, + "loss": 0.0038, + "step": 1284 + }, + { + "epoch": 2.476144578313253, + "grad_norm": 0.10258597880601883, + "learning_rate": 3.631978603648989e-06, + "loss": 0.0023, + "step": 1285 + }, + { + "epoch": 2.4780722891566267, + "grad_norm": 0.17102380096912384, + "learning_rate": 3.6062480683361935e-06, + "loss": 0.0025, + "step": 1286 + }, + { + "epoch": 2.48, + "grad_norm": 0.09547360241413116, + "learning_rate": 3.580599966127123e-06, + "loss": 0.003, + "step": 1287 + }, + { + "epoch": 2.4819277108433733, + "grad_norm": 0.08008653670549393, + "learning_rate": 3.5550344259887438e-06, + "loss": 0.0023, + "step": 1288 + }, + { + "epoch": 2.483855421686747, + "grad_norm": 0.07712296396493912, + "learning_rate": 3.5295515764729003e-06, + "loss": 0.0015, + "step": 1289 + }, + { + "epoch": 2.4857831325301207, + "grad_norm": 0.21118703484535217, + "learning_rate": 3.5041515457156303e-06, + "loss": 0.0041, + "step": 1290 + }, + { + "epoch": 2.487710843373494, + "grad_norm": 0.10772393643856049, + "learning_rate": 3.4788344614365155e-06, + "loss": 0.0029, + "step": 1291 + }, + { + "epoch": 2.4896385542168673, + "grad_norm": 0.2353268563747406, + "learning_rate": 3.453600450938073e-06, + "loss": 0.0072, + "step": 1292 + }, + { + "epoch": 2.491566265060241, + "grad_norm": 0.2897944152355194, + "learning_rate": 3.428449641105107e-06, + "loss": 0.0205, + "step": 1293 + }, + { + "epoch": 2.4934939759036143, + "grad_norm": 0.19756680727005005, + "learning_rate": 3.4033821584040383e-06, + "loss": 0.0065, + "step": 1294 + }, + { + "epoch": 2.495421686746988, + "grad_norm": 0.13538534939289093, + "learning_rate": 3.378398128882305e-06, + "loss": 0.0025, + "step": 1295 + }, + { + "epoch": 2.4973493975903613, + "grad_norm": 0.2301637977361679, + "learning_rate": 3.3534976781677142e-06, + "loss": 0.0071, + "step": 1296 + }, + { + "epoch": 2.499277108433735, + "grad_norm": 0.0965796634554863, + "learning_rate": 3.3286809314678137e-06, + "loss": 0.0024, + "step": 1297 + }, + { + "epoch": 2.5012048192771084, + "grad_norm": 0.0777980163693428, + "learning_rate": 3.30394801356926e-06, + "loss": 0.0013, + "step": 1298 + }, + { + "epoch": 2.503132530120482, + "grad_norm": 0.3157603442668915, + "learning_rate": 3.279299048837177e-06, + "loss": 0.0228, + "step": 1299 + }, + { + "epoch": 2.5050602409638554, + "grad_norm": 0.15660233795642853, + "learning_rate": 3.2547341612145654e-06, + "loss": 0.0056, + "step": 1300 + }, + { + "epoch": 2.506987951807229, + "grad_norm": 0.21655581891536713, + "learning_rate": 3.2302534742216586e-06, + "loss": 0.0081, + "step": 1301 + }, + { + "epoch": 2.5089156626506024, + "grad_norm": 0.09475889801979065, + "learning_rate": 3.205857110955277e-06, + "loss": 0.0029, + "step": 1302 + }, + { + "epoch": 2.5108433734939757, + "grad_norm": 0.13174696266651154, + "learning_rate": 3.18154519408826e-06, + "loss": 0.0059, + "step": 1303 + }, + { + "epoch": 2.5127710843373494, + "grad_norm": 0.10386355221271515, + "learning_rate": 3.1573178458688102e-06, + "loss": 0.0042, + "step": 1304 + }, + { + "epoch": 2.514698795180723, + "grad_norm": 0.12700854241847992, + "learning_rate": 3.133175188119899e-06, + "loss": 0.0041, + "step": 1305 + }, + { + "epoch": 2.5166265060240964, + "grad_norm": 0.1617022454738617, + "learning_rate": 3.109117342238639e-06, + "loss": 0.0053, + "step": 1306 + }, + { + "epoch": 2.5185542168674697, + "grad_norm": 0.8668884038925171, + "learning_rate": 3.085144429195688e-06, + "loss": 0.0084, + "step": 1307 + }, + { + "epoch": 2.5204819277108435, + "grad_norm": 0.22429344058036804, + "learning_rate": 3.061256569534634e-06, + "loss": 0.0053, + "step": 1308 + }, + { + "epoch": 2.5224096385542167, + "grad_norm": 0.08967582136392593, + "learning_rate": 3.037453883371375e-06, + "loss": 0.0018, + "step": 1309 + }, + { + "epoch": 2.5243373493975905, + "grad_norm": 0.1251695454120636, + "learning_rate": 3.0137364903935464e-06, + "loss": 0.0037, + "step": 1310 + }, + { + "epoch": 2.5262650602409638, + "grad_norm": 0.09026174992322922, + "learning_rate": 2.990104509859897e-06, + "loss": 0.0024, + "step": 1311 + }, + { + "epoch": 2.5281927710843375, + "grad_norm": 0.34319114685058594, + "learning_rate": 2.966558060599689e-06, + "loss": 0.0063, + "step": 1312 + }, + { + "epoch": 2.5301204819277108, + "grad_norm": 0.20300136506557465, + "learning_rate": 2.9430972610121087e-06, + "loss": 0.0054, + "step": 1313 + }, + { + "epoch": 2.532048192771084, + "grad_norm": 0.19160760939121246, + "learning_rate": 2.9197222290656737e-06, + "loss": 0.0095, + "step": 1314 + }, + { + "epoch": 2.533975903614458, + "grad_norm": 0.18991442024707794, + "learning_rate": 2.8964330822976227e-06, + "loss": 0.006, + "step": 1315 + }, + { + "epoch": 2.5359036144578315, + "grad_norm": 0.1801903396844864, + "learning_rate": 2.873229937813349e-06, + "loss": 0.0067, + "step": 1316 + }, + { + "epoch": 2.537831325301205, + "grad_norm": 0.07068303227424622, + "learning_rate": 2.850112912285783e-06, + "loss": 0.0015, + "step": 1317 + }, + { + "epoch": 2.539759036144578, + "grad_norm": 0.1404612809419632, + "learning_rate": 2.8270821219548296e-06, + "loss": 0.0036, + "step": 1318 + }, + { + "epoch": 2.541686746987952, + "grad_norm": 0.12199504673480988, + "learning_rate": 2.8041376826267862e-06, + "loss": 0.0068, + "step": 1319 + }, + { + "epoch": 2.5436144578313256, + "grad_norm": 0.2167249619960785, + "learning_rate": 2.7812797096737253e-06, + "loss": 0.0048, + "step": 1320 + }, + { + "epoch": 2.545542168674699, + "grad_norm": 0.07466506212949753, + "learning_rate": 2.7585083180329575e-06, + "loss": 0.0017, + "step": 1321 + }, + { + "epoch": 2.547469879518072, + "grad_norm": 0.11736353486776352, + "learning_rate": 2.7358236222064283e-06, + "loss": 0.003, + "step": 1322 + }, + { + "epoch": 2.549397590361446, + "grad_norm": 0.16602204740047455, + "learning_rate": 2.7132257362601453e-06, + "loss": 0.005, + "step": 1323 + }, + { + "epoch": 2.551325301204819, + "grad_norm": 0.15473629534244537, + "learning_rate": 2.6907147738236193e-06, + "loss": 0.0077, + "step": 1324 + }, + { + "epoch": 2.553253012048193, + "grad_norm": 0.07868973910808563, + "learning_rate": 2.6682908480892567e-06, + "loss": 0.0013, + "step": 1325 + }, + { + "epoch": 2.555180722891566, + "grad_norm": 0.2137845754623413, + "learning_rate": 2.645954071811847e-06, + "loss": 0.0092, + "step": 1326 + }, + { + "epoch": 2.55710843373494, + "grad_norm": 0.11191053688526154, + "learning_rate": 2.623704557307949e-06, + "loss": 0.0031, + "step": 1327 + }, + { + "epoch": 2.559036144578313, + "grad_norm": 0.3080642521381378, + "learning_rate": 2.6015424164553295e-06, + "loss": 0.0104, + "step": 1328 + }, + { + "epoch": 2.5609638554216865, + "grad_norm": 0.08816439658403397, + "learning_rate": 2.579467760692427e-06, + "loss": 0.004, + "step": 1329 + }, + { + "epoch": 2.56289156626506, + "grad_norm": 0.17154981195926666, + "learning_rate": 2.557480701017776e-06, + "loss": 0.0035, + "step": 1330 + }, + { + "epoch": 2.564819277108434, + "grad_norm": 0.09479143470525742, + "learning_rate": 2.5355813479894464e-06, + "loss": 0.0034, + "step": 1331 + }, + { + "epoch": 2.5667469879518072, + "grad_norm": 0.26139333844184875, + "learning_rate": 2.513769811724487e-06, + "loss": 0.0076, + "step": 1332 + }, + { + "epoch": 2.5686746987951805, + "grad_norm": 0.16864238679409027, + "learning_rate": 2.4920462018983816e-06, + "loss": 0.0046, + "step": 1333 + }, + { + "epoch": 2.5706024096385542, + "grad_norm": 0.1133158802986145, + "learning_rate": 2.4704106277444884e-06, + "loss": 0.0034, + "step": 1334 + }, + { + "epoch": 2.572530120481928, + "grad_norm": 0.27522334456443787, + "learning_rate": 2.4488631980534995e-06, + "loss": 0.0127, + "step": 1335 + }, + { + "epoch": 2.5744578313253013, + "grad_norm": 0.13547387719154358, + "learning_rate": 2.427404021172868e-06, + "loss": 0.0031, + "step": 1336 + }, + { + "epoch": 2.5763855421686745, + "grad_norm": 0.13478629291057587, + "learning_rate": 2.406033205006313e-06, + "loss": 0.0039, + "step": 1337 + }, + { + "epoch": 2.5783132530120483, + "grad_norm": 0.11515481770038605, + "learning_rate": 2.3847508570132226e-06, + "loss": 0.0029, + "step": 1338 + }, + { + "epoch": 2.5802409638554216, + "grad_norm": 0.21657171845436096, + "learning_rate": 2.36355708420815e-06, + "loss": 0.011, + "step": 1339 + }, + { + "epoch": 2.5821686746987953, + "grad_norm": 0.11441601067781448, + "learning_rate": 2.342451993160262e-06, + "loss": 0.006, + "step": 1340 + }, + { + "epoch": 2.5840963855421686, + "grad_norm": 0.13475841283798218, + "learning_rate": 2.3214356899928036e-06, + "loss": 0.0051, + "step": 1341 + }, + { + "epoch": 2.5860240963855423, + "grad_norm": 0.053035832941532135, + "learning_rate": 2.300508280382572e-06, + "loss": 0.0012, + "step": 1342 + }, + { + "epoch": 2.5879518072289156, + "grad_norm": 0.12467508763074875, + "learning_rate": 2.279669869559358e-06, + "loss": 0.0024, + "step": 1343 + }, + { + "epoch": 2.589879518072289, + "grad_norm": 0.10572273284196854, + "learning_rate": 2.2589205623054646e-06, + "loss": 0.0024, + "step": 1344 + }, + { + "epoch": 2.5918072289156626, + "grad_norm": 0.17056365311145782, + "learning_rate": 2.238260462955142e-06, + "loss": 0.0064, + "step": 1345 + }, + { + "epoch": 2.5937349397590364, + "grad_norm": 0.07940494269132614, + "learning_rate": 2.2176896753940637e-06, + "loss": 0.0012, + "step": 1346 + }, + { + "epoch": 2.5956626506024096, + "grad_norm": 0.10416694730520248, + "learning_rate": 2.1972083030588244e-06, + "loss": 0.0092, + "step": 1347 + }, + { + "epoch": 2.597590361445783, + "grad_norm": 0.2384328842163086, + "learning_rate": 2.176816448936423e-06, + "loss": 0.0067, + "step": 1348 + }, + { + "epoch": 2.5995180722891567, + "grad_norm": 0.14279082417488098, + "learning_rate": 2.156514215563703e-06, + "loss": 0.0059, + "step": 1349 + }, + { + "epoch": 2.6014457831325304, + "grad_norm": 0.08462683111429214, + "learning_rate": 2.1363017050268886e-06, + "loss": 0.0021, + "step": 1350 + }, + { + "epoch": 2.6033734939759037, + "grad_norm": 0.09768491238355637, + "learning_rate": 2.1161790189610377e-06, + "loss": 0.0038, + "step": 1351 + }, + { + "epoch": 2.605301204819277, + "grad_norm": 0.25498896837234497, + "learning_rate": 2.0961462585495474e-06, + "loss": 0.0114, + "step": 1352 + }, + { + "epoch": 2.6072289156626507, + "grad_norm": 0.15635675191879272, + "learning_rate": 2.076203524523637e-06, + "loss": 0.0054, + "step": 1353 + }, + { + "epoch": 2.609156626506024, + "grad_norm": 0.11619213968515396, + "learning_rate": 2.056350917161836e-06, + "loss": 0.007, + "step": 1354 + }, + { + "epoch": 2.6110843373493977, + "grad_norm": 0.18085338175296783, + "learning_rate": 2.0365885362895053e-06, + "loss": 0.0061, + "step": 1355 + }, + { + "epoch": 2.613012048192771, + "grad_norm": 0.14492927491664886, + "learning_rate": 2.016916481278306e-06, + "loss": 0.0114, + "step": 1356 + }, + { + "epoch": 2.6149397590361447, + "grad_norm": 0.21257621049880981, + "learning_rate": 1.997334851045709e-06, + "loss": 0.0057, + "step": 1357 + }, + { + "epoch": 2.616867469879518, + "grad_norm": 0.11539656668901443, + "learning_rate": 1.9778437440545085e-06, + "loss": 0.0071, + "step": 1358 + }, + { + "epoch": 2.6187951807228913, + "grad_norm": 0.1642933189868927, + "learning_rate": 1.95844325831231e-06, + "loss": 0.0054, + "step": 1359 + }, + { + "epoch": 2.620722891566265, + "grad_norm": 0.10779479146003723, + "learning_rate": 1.9391334913710545e-06, + "loss": 0.0028, + "step": 1360 + }, + { + "epoch": 2.6226506024096388, + "grad_norm": 0.14295366406440735, + "learning_rate": 1.9199145403265175e-06, + "loss": 0.0048, + "step": 1361 + }, + { + "epoch": 2.624578313253012, + "grad_norm": 0.13454844057559967, + "learning_rate": 1.9007865018178107e-06, + "loss": 0.0072, + "step": 1362 + }, + { + "epoch": 2.6265060240963853, + "grad_norm": 0.778252363204956, + "learning_rate": 1.8817494720269302e-06, + "loss": 0.0071, + "step": 1363 + }, + { + "epoch": 2.628433734939759, + "grad_norm": 0.11488679051399231, + "learning_rate": 1.8628035466782268e-06, + "loss": 0.0038, + "step": 1364 + }, + { + "epoch": 2.630361445783133, + "grad_norm": 0.15560875833034515, + "learning_rate": 1.8439488210379687e-06, + "loss": 0.0043, + "step": 1365 + }, + { + "epoch": 2.632289156626506, + "grad_norm": 0.10538071393966675, + "learning_rate": 1.8251853899138306e-06, + "loss": 0.0041, + "step": 1366 + }, + { + "epoch": 2.6342168674698794, + "grad_norm": 0.12866193056106567, + "learning_rate": 1.8065133476544306e-06, + "loss": 0.0034, + "step": 1367 + }, + { + "epoch": 2.636144578313253, + "grad_norm": 0.2045469433069229, + "learning_rate": 1.7879327881488584e-06, + "loss": 0.0141, + "step": 1368 + }, + { + "epoch": 2.6380722891566264, + "grad_norm": 0.12423976510763168, + "learning_rate": 1.769443804826194e-06, + "loss": 0.0047, + "step": 1369 + }, + { + "epoch": 2.64, + "grad_norm": 0.1007109209895134, + "learning_rate": 1.751046490655046e-06, + "loss": 0.0031, + "step": 1370 + }, + { + "epoch": 2.6419277108433734, + "grad_norm": 0.0681275874376297, + "learning_rate": 1.7327409381430804e-06, + "loss": 0.0019, + "step": 1371 + }, + { + "epoch": 2.643855421686747, + "grad_norm": 0.1645517498254776, + "learning_rate": 1.7145272393365498e-06, + "loss": 0.0035, + "step": 1372 + }, + { + "epoch": 2.6457831325301204, + "grad_norm": 0.13689427077770233, + "learning_rate": 1.6964054858198386e-06, + "loss": 0.0086, + "step": 1373 + }, + { + "epoch": 2.6477108433734937, + "grad_norm": 0.10440093278884888, + "learning_rate": 1.6783757687150149e-06, + "loss": 0.0019, + "step": 1374 + }, + { + "epoch": 2.6496385542168674, + "grad_norm": 0.1142532229423523, + "learning_rate": 1.6604381786813383e-06, + "loss": 0.0047, + "step": 1375 + }, + { + "epoch": 2.651566265060241, + "grad_norm": 0.10430166125297546, + "learning_rate": 1.6425928059148312e-06, + "loss": 0.0027, + "step": 1376 + }, + { + "epoch": 2.6534939759036145, + "grad_norm": 0.2315254956483841, + "learning_rate": 1.624839740147819e-06, + "loss": 0.0071, + "step": 1377 + }, + { + "epoch": 2.6554216867469878, + "grad_norm": 0.15356265008449554, + "learning_rate": 1.6071790706484746e-06, + "loss": 0.0109, + "step": 1378 + }, + { + "epoch": 2.6573493975903615, + "grad_norm": 0.1332363784313202, + "learning_rate": 1.589610886220383e-06, + "loss": 0.0046, + "step": 1379 + }, + { + "epoch": 2.659277108433735, + "grad_norm": 0.18892519176006317, + "learning_rate": 1.5721352752020602e-06, + "loss": 0.0138, + "step": 1380 + }, + { + "epoch": 2.6612048192771085, + "grad_norm": 0.10537895560264587, + "learning_rate": 1.5547523254665598e-06, + "loss": 0.0066, + "step": 1381 + }, + { + "epoch": 2.663132530120482, + "grad_norm": 0.1308947205543518, + "learning_rate": 1.5374621244209965e-06, + "loss": 0.0039, + "step": 1382 + }, + { + "epoch": 2.6650602409638555, + "grad_norm": 0.11358808726072311, + "learning_rate": 1.5202647590060983e-06, + "loss": 0.0029, + "step": 1383 + }, + { + "epoch": 2.666987951807229, + "grad_norm": 0.12029009312391281, + "learning_rate": 1.5031603156958064e-06, + "loss": 0.0032, + "step": 1384 + }, + { + "epoch": 2.6689156626506025, + "grad_norm": 0.36994072794914246, + "learning_rate": 1.4861488804968093e-06, + "loss": 0.024, + "step": 1385 + }, + { + "epoch": 2.670843373493976, + "grad_norm": 0.1263083666563034, + "learning_rate": 1.4692305389481232e-06, + "loss": 0.0047, + "step": 1386 + }, + { + "epoch": 2.6727710843373496, + "grad_norm": 0.15056709945201874, + "learning_rate": 1.452405376120658e-06, + "loss": 0.0014, + "step": 1387 + }, + { + "epoch": 2.674698795180723, + "grad_norm": 0.10418888181447983, + "learning_rate": 1.4356734766167925e-06, + "loss": 0.0035, + "step": 1388 + }, + { + "epoch": 2.676626506024096, + "grad_norm": 0.12220565974712372, + "learning_rate": 1.4190349245699443e-06, + "loss": 0.0063, + "step": 1389 + }, + { + "epoch": 2.67855421686747, + "grad_norm": 0.14774753153324127, + "learning_rate": 1.402489803644156e-06, + "loss": 0.008, + "step": 1390 + }, + { + "epoch": 2.6804819277108436, + "grad_norm": 0.14384198188781738, + "learning_rate": 1.3860381970336544e-06, + "loss": 0.0039, + "step": 1391 + }, + { + "epoch": 2.682409638554217, + "grad_norm": 0.10995055735111237, + "learning_rate": 1.3696801874624698e-06, + "loss": 0.0028, + "step": 1392 + }, + { + "epoch": 2.68433734939759, + "grad_norm": 0.12208505719900131, + "learning_rate": 1.353415857183966e-06, + "loss": 0.0029, + "step": 1393 + }, + { + "epoch": 2.686265060240964, + "grad_norm": 0.16018439829349518, + "learning_rate": 1.337245287980482e-06, + "loss": 0.0068, + "step": 1394 + }, + { + "epoch": 2.688192771084337, + "grad_norm": 5.2112274169921875, + "learning_rate": 1.3211685611628844e-06, + "loss": 0.1645, + "step": 1395 + }, + { + "epoch": 2.690120481927711, + "grad_norm": 0.12426120787858963, + "learning_rate": 1.3051857575701732e-06, + "loss": 0.0044, + "step": 1396 + }, + { + "epoch": 2.692048192771084, + "grad_norm": 0.13931375741958618, + "learning_rate": 1.2892969575690685e-06, + "loss": 0.0035, + "step": 1397 + }, + { + "epoch": 2.693975903614458, + "grad_norm": 0.1804540753364563, + "learning_rate": 1.273502241053608e-06, + "loss": 0.0108, + "step": 1398 + }, + { + "epoch": 2.695903614457831, + "grad_norm": 0.12313607335090637, + "learning_rate": 1.2578016874447596e-06, + "loss": 0.0073, + "step": 1399 + }, + { + "epoch": 2.697831325301205, + "grad_norm": 0.1301470398902893, + "learning_rate": 1.2421953756899985e-06, + "loss": 0.0037, + "step": 1400 + }, + { + "epoch": 2.6997590361445782, + "grad_norm": 0.12769126892089844, + "learning_rate": 1.226683384262919e-06, + "loss": 0.0041, + "step": 1401 + }, + { + "epoch": 2.701686746987952, + "grad_norm": 0.20923997461795807, + "learning_rate": 1.21126579116285e-06, + "loss": 0.0101, + "step": 1402 + }, + { + "epoch": 2.7036144578313253, + "grad_norm": 0.09334482997655869, + "learning_rate": 1.1959426739144497e-06, + "loss": 0.0022, + "step": 1403 + }, + { + "epoch": 2.7055421686746985, + "grad_norm": 0.06848987936973572, + "learning_rate": 1.1807141095673291e-06, + "loss": 0.0013, + "step": 1404 + }, + { + "epoch": 2.7074698795180723, + "grad_norm": 0.14552196860313416, + "learning_rate": 1.1655801746956463e-06, + "loss": 0.0066, + "step": 1405 + }, + { + "epoch": 2.709397590361446, + "grad_norm": 0.11259587109088898, + "learning_rate": 1.1505409453977334e-06, + "loss": 0.0045, + "step": 1406 + }, + { + "epoch": 2.7113253012048193, + "grad_norm": 0.23408068716526031, + "learning_rate": 1.135596497295719e-06, + "loss": 0.0181, + "step": 1407 + }, + { + "epoch": 2.7132530120481926, + "grad_norm": 0.1483619660139084, + "learning_rate": 1.1207469055351395e-06, + "loss": 0.0042, + "step": 1408 + }, + { + "epoch": 2.7151807228915663, + "grad_norm": 0.1170588880777359, + "learning_rate": 1.105992244784555e-06, + "loss": 0.0059, + "step": 1409 + }, + { + "epoch": 2.7171084337349396, + "grad_norm": 0.15649215877056122, + "learning_rate": 1.0913325892351857e-06, + "loss": 0.0023, + "step": 1410 + }, + { + "epoch": 2.7190361445783133, + "grad_norm": 0.0980108231306076, + "learning_rate": 1.0767680126005443e-06, + "loss": 0.0019, + "step": 1411 + }, + { + "epoch": 2.7209638554216866, + "grad_norm": 0.14913050830364227, + "learning_rate": 1.0622985881160396e-06, + "loss": 0.0018, + "step": 1412 + }, + { + "epoch": 2.7228915662650603, + "grad_norm": 0.0827481672167778, + "learning_rate": 1.0479243885386347e-06, + "loss": 0.0023, + "step": 1413 + }, + { + "epoch": 2.7248192771084336, + "grad_norm": 0.15648555755615234, + "learning_rate": 1.0336454861464706e-06, + "loss": 0.0033, + "step": 1414 + }, + { + "epoch": 2.7267469879518074, + "grad_norm": 0.10614357888698578, + "learning_rate": 1.0194619527385007e-06, + "loss": 0.0029, + "step": 1415 + }, + { + "epoch": 2.7286746987951807, + "grad_norm": 0.07111652940511703, + "learning_rate": 1.0053738596341355e-06, + "loss": 0.0026, + "step": 1416 + }, + { + "epoch": 2.7306024096385544, + "grad_norm": 0.11736573278903961, + "learning_rate": 9.91381277672867e-07, + "loss": 0.005, + "step": 1417 + }, + { + "epoch": 2.7325301204819277, + "grad_norm": 0.18440629541873932, + "learning_rate": 9.774842772139537e-07, + "loss": 0.0038, + "step": 1418 + }, + { + "epoch": 2.734457831325301, + "grad_norm": 0.11000041663646698, + "learning_rate": 9.636829281360116e-07, + "loss": 0.0034, + "step": 1419 + }, + { + "epoch": 2.7363855421686747, + "grad_norm": 0.15212605893611908, + "learning_rate": 9.499772998367018e-07, + "loss": 0.0038, + "step": 1420 + }, + { + "epoch": 2.7383132530120484, + "grad_norm": 0.07784705609083176, + "learning_rate": 9.36367461232377e-07, + "loss": 0.002, + "step": 1421 + }, + { + "epoch": 2.7402409638554217, + "grad_norm": 0.1096726506948471, + "learning_rate": 9.22853480757715e-07, + "loss": 0.0028, + "step": 1422 + }, + { + "epoch": 2.742168674698795, + "grad_norm": 0.17528535425662994, + "learning_rate": 9.094354263653971e-07, + "loss": 0.0065, + "step": 1423 + }, + { + "epoch": 2.7440963855421687, + "grad_norm": 0.09263470768928528, + "learning_rate": 8.961133655257548e-07, + "loss": 0.0031, + "step": 1424 + }, + { + "epoch": 2.746024096385542, + "grad_norm": 0.14822180569171906, + "learning_rate": 8.828873652264303e-07, + "loss": 0.0043, + "step": 1425 + }, + { + "epoch": 2.7479518072289157, + "grad_norm": 0.11577019095420837, + "learning_rate": 8.697574919720497e-07, + "loss": 0.004, + "step": 1426 + }, + { + "epoch": 2.749879518072289, + "grad_norm": 0.11681873351335526, + "learning_rate": 8.567238117838683e-07, + "loss": 0.0035, + "step": 1427 + }, + { + "epoch": 2.7518072289156628, + "grad_norm": 0.1191524937748909, + "learning_rate": 8.437863901994592e-07, + "loss": 0.0022, + "step": 1428 + }, + { + "epoch": 2.753734939759036, + "grad_norm": 0.1528361737728119, + "learning_rate": 8.309452922723849e-07, + "loss": 0.0042, + "step": 1429 + }, + { + "epoch": 2.75566265060241, + "grad_norm": 0.42052382230758667, + "learning_rate": 8.18200582571842e-07, + "loss": 0.0149, + "step": 1430 + }, + { + "epoch": 2.757590361445783, + "grad_norm": 0.13524137437343597, + "learning_rate": 8.055523251823705e-07, + "loss": 0.0029, + "step": 1431 + }, + { + "epoch": 2.759518072289157, + "grad_norm": 0.0980493426322937, + "learning_rate": 7.930005837035138e-07, + "loss": 0.0036, + "step": 1432 + }, + { + "epoch": 2.76144578313253, + "grad_norm": 0.17335453629493713, + "learning_rate": 7.805454212494967e-07, + "loss": 0.0066, + "step": 1433 + }, + { + "epoch": 2.7633734939759034, + "grad_norm": 0.13746409118175507, + "learning_rate": 7.681869004489218e-07, + "loss": 0.0066, + "step": 1434 + }, + { + "epoch": 2.765301204819277, + "grad_norm": 0.18556399643421173, + "learning_rate": 7.559250834444332e-07, + "loss": 0.0073, + "step": 1435 + }, + { + "epoch": 2.767228915662651, + "grad_norm": 0.09743557125329971, + "learning_rate": 7.437600318924332e-07, + "loss": 0.0023, + "step": 1436 + }, + { + "epoch": 2.769156626506024, + "grad_norm": 0.10671001672744751, + "learning_rate": 7.316918069627488e-07, + "loss": 0.003, + "step": 1437 + }, + { + "epoch": 2.7710843373493974, + "grad_norm": 0.10671380162239075, + "learning_rate": 7.197204693383231e-07, + "loss": 0.0021, + "step": 1438 + }, + { + "epoch": 2.773012048192771, + "grad_norm": 0.06824454665184021, + "learning_rate": 7.078460792149311e-07, + "loss": 0.0017, + "step": 1439 + }, + { + "epoch": 2.7749397590361444, + "grad_norm": 0.12668560445308685, + "learning_rate": 6.960686963008556e-07, + "loss": 0.0035, + "step": 1440 + }, + { + "epoch": 2.776867469879518, + "grad_norm": 0.10260980576276779, + "learning_rate": 6.843883798166029e-07, + "loss": 0.0027, + "step": 1441 + }, + { + "epoch": 2.7787951807228914, + "grad_norm": 0.09880302101373672, + "learning_rate": 6.728051884945941e-07, + "loss": 0.0029, + "step": 1442 + }, + { + "epoch": 2.780722891566265, + "grad_norm": 0.305993914604187, + "learning_rate": 6.613191805788699e-07, + "loss": 0.0112, + "step": 1443 + }, + { + "epoch": 2.7826506024096385, + "grad_norm": 0.10707511752843857, + "learning_rate": 6.499304138248064e-07, + "loss": 0.0062, + "step": 1444 + }, + { + "epoch": 2.784578313253012, + "grad_norm": 0.0986943170428276, + "learning_rate": 6.386389454988195e-07, + "loss": 0.0021, + "step": 1445 + }, + { + "epoch": 2.7865060240963855, + "grad_norm": 0.1458776742219925, + "learning_rate": 6.274448323780724e-07, + "loss": 0.0094, + "step": 1446 + }, + { + "epoch": 2.788433734939759, + "grad_norm": 0.09657061100006104, + "learning_rate": 6.163481307501995e-07, + "loss": 0.0026, + "step": 1447 + }, + { + "epoch": 2.7903614457831325, + "grad_norm": 0.1462988704442978, + "learning_rate": 6.053488964130183e-07, + "loss": 0.0075, + "step": 1448 + }, + { + "epoch": 2.792289156626506, + "grad_norm": 0.15330864489078522, + "learning_rate": 5.94447184674245e-07, + "loss": 0.0067, + "step": 1449 + }, + { + "epoch": 2.7942168674698795, + "grad_norm": 0.1513473242521286, + "learning_rate": 5.836430503512236e-07, + "loss": 0.0106, + "step": 1450 + }, + { + "epoch": 2.7961445783132532, + "grad_norm": 0.2151842713356018, + "learning_rate": 5.729365477706505e-07, + "loss": 0.0062, + "step": 1451 + }, + { + "epoch": 2.7980722891566265, + "grad_norm": 0.13624203205108643, + "learning_rate": 5.623277307682929e-07, + "loss": 0.0045, + "step": 1452 + }, + { + "epoch": 2.8, + "grad_norm": 0.12075261026620865, + "learning_rate": 5.518166526887214e-07, + "loss": 0.0073, + "step": 1453 + }, + { + "epoch": 2.8019277108433736, + "grad_norm": 0.11320624500513077, + "learning_rate": 5.41403366385047e-07, + "loss": 0.002, + "step": 1454 + }, + { + "epoch": 2.803855421686747, + "grad_norm": 0.08470363914966583, + "learning_rate": 5.310879242186606e-07, + "loss": 0.0021, + "step": 1455 + }, + { + "epoch": 2.8057831325301206, + "grad_norm": 0.15221907198429108, + "learning_rate": 5.208703780589419e-07, + "loss": 0.0019, + "step": 1456 + }, + { + "epoch": 2.807710843373494, + "grad_norm": 0.12709103524684906, + "learning_rate": 5.107507792830335e-07, + "loss": 0.0052, + "step": 1457 + }, + { + "epoch": 2.8096385542168676, + "grad_norm": 0.10888515412807465, + "learning_rate": 5.007291787755586e-07, + "loss": 0.0023, + "step": 1458 + }, + { + "epoch": 2.811566265060241, + "grad_norm": 0.25710970163345337, + "learning_rate": 4.908056269283789e-07, + "loss": 0.0073, + "step": 1459 + }, + { + "epoch": 2.8134939759036146, + "grad_norm": 0.08488702774047852, + "learning_rate": 4.809801736403308e-07, + "loss": 0.0016, + "step": 1460 + }, + { + "epoch": 2.815421686746988, + "grad_norm": 0.1282006949186325, + "learning_rate": 4.7125286831698034e-07, + "loss": 0.0035, + "step": 1461 + }, + { + "epoch": 2.8173493975903616, + "grad_norm": 0.08955442905426025, + "learning_rate": 4.6162375987037766e-07, + "loss": 0.004, + "step": 1462 + }, + { + "epoch": 2.819277108433735, + "grad_norm": 0.11310838907957077, + "learning_rate": 4.520928967188054e-07, + "loss": 0.0022, + "step": 1463 + }, + { + "epoch": 2.821204819277108, + "grad_norm": 0.15055686235427856, + "learning_rate": 4.426603267865326e-07, + "loss": 0.0042, + "step": 1464 + }, + { + "epoch": 2.823132530120482, + "grad_norm": 0.14379452168941498, + "learning_rate": 4.333260975035769e-07, + "loss": 0.0089, + "step": 1465 + }, + { + "epoch": 2.8250602409638557, + "grad_norm": 0.1795361489057541, + "learning_rate": 4.240902558054827e-07, + "loss": 0.013, + "step": 1466 + }, + { + "epoch": 2.826987951807229, + "grad_norm": 0.06829468160867691, + "learning_rate": 4.1495284813305003e-07, + "loss": 0.0018, + "step": 1467 + }, + { + "epoch": 2.8289156626506022, + "grad_norm": 0.35213515162467957, + "learning_rate": 4.0591392043213275e-07, + "loss": 0.0144, + "step": 1468 + }, + { + "epoch": 2.830843373493976, + "grad_norm": 0.11828093230724335, + "learning_rate": 3.969735181533918e-07, + "loss": 0.0028, + "step": 1469 + }, + { + "epoch": 2.8327710843373493, + "grad_norm": 0.13286921381950378, + "learning_rate": 3.881316862520712e-07, + "loss": 0.0042, + "step": 1470 + }, + { + "epoch": 2.834698795180723, + "grad_norm": 0.10271132737398148, + "learning_rate": 3.7938846918776917e-07, + "loss": 0.0047, + "step": 1471 + }, + { + "epoch": 2.8366265060240963, + "grad_norm": 0.09422904253005981, + "learning_rate": 3.707439109242139e-07, + "loss": 0.0061, + "step": 1472 + } + ], + "logging_steps": 1, + "max_steps": 1557, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 92, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.9327468408898847e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1472/training_args.bin b/checkpoint-1472/training_args.bin new file mode 100644 index 0000000..ecc7b6b --- /dev/null +++ b/checkpoint-1472/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342dfb3c86216e436950100c79812c54066d5572c4e9975b0f133c067f061bcf +size 7825 diff --git a/checkpoint-1557/chat_template.jinja b/checkpoint-1557/chat_template.jinja new file mode 100644 index 0000000..1bad6a0 --- /dev/null +++ b/checkpoint-1557/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-1557/config.json b/checkpoint-1557/config.json new file mode 100644 index 0000000..f8bf41e --- /dev/null +++ b/checkpoint-1557/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128009, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.56.2", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-1557/generation_config.json b/checkpoint-1557/generation_config.json new file mode 100644 index 0000000..2152026 --- /dev/null +++ b/checkpoint-1557/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128009, + 128001, + 128008, + 128009 + ], + "pad_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.56.2" +} diff --git a/checkpoint-1557/model.safetensors b/checkpoint-1557/model.safetensors new file mode 100644 index 0000000..9832a64 --- /dev/null +++ b/checkpoint-1557/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:310cc3b4c5cd95f89d0465923767832cc5bad4d98a9c532e2504b385a5fb6698 +size 2996982344 diff --git a/checkpoint-1557/special_tokens_map.json b/checkpoint-1557/special_tokens_map.json new file mode 100644 index 0000000..14daf45 --- /dev/null +++ b/checkpoint-1557/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/checkpoint-1557/tokenizer.json b/checkpoint-1557/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/checkpoint-1557/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-1557/tokenizer_config.json b/checkpoint-1557/tokenizer_config.json new file mode 100644 index 0000000..d1e1ea9 --- /dev/null +++ b/checkpoint-1557/tokenizer_config.json @@ -0,0 +1,2068 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-1557/trainer_state.json b/checkpoint-1557/trainer_state.json new file mode 100644 index 0000000..aa56f87 --- /dev/null +++ b/checkpoint-1557/trainer_state.json @@ -0,0 +1,10933 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1557, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0019277108433734939, + "grad_norm": 2.8518834114074707, + "learning_rate": 0.0, + "loss": 0.0891, + "step": 1 + }, + { + "epoch": 0.0038554216867469878, + "grad_norm": 1.8441249132156372, + "learning_rate": 2.564102564102564e-07, + "loss": 0.0539, + "step": 2 + }, + { + "epoch": 0.005783132530120482, + "grad_norm": 2.8263237476348877, + "learning_rate": 5.128205128205128e-07, + "loss": 0.099, + "step": 3 + }, + { + "epoch": 0.0077108433734939755, + "grad_norm": 2.5051236152648926, + "learning_rate": 7.692307692307694e-07, + "loss": 0.0789, + "step": 4 + }, + { + "epoch": 0.00963855421686747, + "grad_norm": 2.6903438568115234, + "learning_rate": 1.0256410256410257e-06, + "loss": 0.0881, + "step": 5 + }, + { + "epoch": 0.011566265060240964, + "grad_norm": 2.6205761432647705, + "learning_rate": 1.282051282051282e-06, + "loss": 0.0776, + "step": 6 + }, + { + "epoch": 0.013493975903614458, + "grad_norm": 2.6309337615966797, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.0827, + "step": 7 + }, + { + "epoch": 0.015421686746987951, + "grad_norm": 1.5427855253219604, + "learning_rate": 1.794871794871795e-06, + "loss": 0.0577, + "step": 8 + }, + { + "epoch": 0.017349397590361446, + "grad_norm": 1.0973446369171143, + "learning_rate": 2.0512820512820513e-06, + "loss": 0.04, + "step": 9 + }, + { + "epoch": 0.01927710843373494, + "grad_norm": 1.3253350257873535, + "learning_rate": 2.307692307692308e-06, + "loss": 0.0506, + "step": 10 + }, + { + "epoch": 0.021204819277108433, + "grad_norm": 1.588739037513733, + "learning_rate": 2.564102564102564e-06, + "loss": 0.0874, + "step": 11 + }, + { + "epoch": 0.02313253012048193, + "grad_norm": 1.4987014532089233, + "learning_rate": 2.8205128205128207e-06, + "loss": 0.0597, + "step": 12 + }, + { + "epoch": 0.02506024096385542, + "grad_norm": 1.6571592092514038, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.0559, + "step": 13 + }, + { + "epoch": 0.026987951807228915, + "grad_norm": 1.8860628604888916, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0688, + "step": 14 + }, + { + "epoch": 0.02891566265060241, + "grad_norm": 1.3202295303344727, + "learning_rate": 3.58974358974359e-06, + "loss": 0.0433, + "step": 15 + }, + { + "epoch": 0.030843373493975902, + "grad_norm": 1.5870612859725952, + "learning_rate": 3.846153846153847e-06, + "loss": 0.0695, + "step": 16 + }, + { + "epoch": 0.0327710843373494, + "grad_norm": 0.9192284345626831, + "learning_rate": 4.102564102564103e-06, + "loss": 0.0392, + "step": 17 + }, + { + "epoch": 0.03469879518072289, + "grad_norm": 0.7950155735015869, + "learning_rate": 4.358974358974359e-06, + "loss": 0.0351, + "step": 18 + }, + { + "epoch": 0.03662650602409639, + "grad_norm": 0.8854314684867859, + "learning_rate": 4.615384615384616e-06, + "loss": 0.0356, + "step": 19 + }, + { + "epoch": 0.03855421686746988, + "grad_norm": 0.9546788930892944, + "learning_rate": 4.871794871794872e-06, + "loss": 0.0427, + "step": 20 + }, + { + "epoch": 0.04048192771084337, + "grad_norm": 0.6315903663635254, + "learning_rate": 5.128205128205128e-06, + "loss": 0.0397, + "step": 21 + }, + { + "epoch": 0.042409638554216866, + "grad_norm": 0.9230924844741821, + "learning_rate": 5.384615384615385e-06, + "loss": 0.0481, + "step": 22 + }, + { + "epoch": 0.04433734939759036, + "grad_norm": 0.711546003818512, + "learning_rate": 5.641025641025641e-06, + "loss": 0.0479, + "step": 23 + }, + { + "epoch": 0.04626506024096386, + "grad_norm": 0.5288046598434448, + "learning_rate": 5.897435897435898e-06, + "loss": 0.0182, + "step": 24 + }, + { + "epoch": 0.04819277108433735, + "grad_norm": 0.9420496225357056, + "learning_rate": 6.153846153846155e-06, + "loss": 0.0389, + "step": 25 + }, + { + "epoch": 0.05012048192771084, + "grad_norm": 0.5001983046531677, + "learning_rate": 6.410256410256412e-06, + "loss": 0.0268, + "step": 26 + }, + { + "epoch": 0.052048192771084335, + "grad_norm": 0.8084653615951538, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0367, + "step": 27 + }, + { + "epoch": 0.05397590361445783, + "grad_norm": 0.7195103764533997, + "learning_rate": 6.923076923076923e-06, + "loss": 0.0251, + "step": 28 + }, + { + "epoch": 0.055903614457831326, + "grad_norm": 0.529958963394165, + "learning_rate": 7.17948717948718e-06, + "loss": 0.0289, + "step": 29 + }, + { + "epoch": 0.05783132530120482, + "grad_norm": 0.795376181602478, + "learning_rate": 7.435897435897437e-06, + "loss": 0.043, + "step": 30 + }, + { + "epoch": 0.059759036144578316, + "grad_norm": 0.6366249918937683, + "learning_rate": 7.692307692307694e-06, + "loss": 0.029, + "step": 31 + }, + { + "epoch": 0.061686746987951804, + "grad_norm": 0.5414115190505981, + "learning_rate": 7.948717948717949e-06, + "loss": 0.0365, + "step": 32 + }, + { + "epoch": 0.0636144578313253, + "grad_norm": 0.9350972175598145, + "learning_rate": 8.205128205128205e-06, + "loss": 0.0283, + "step": 33 + }, + { + "epoch": 0.0655421686746988, + "grad_norm": 0.5660741925239563, + "learning_rate": 8.461538461538462e-06, + "loss": 0.0234, + "step": 34 + }, + { + "epoch": 0.06746987951807229, + "grad_norm": 0.5623988509178162, + "learning_rate": 8.717948717948719e-06, + "loss": 0.0307, + "step": 35 + }, + { + "epoch": 0.06939759036144579, + "grad_norm": 0.5260195732116699, + "learning_rate": 8.974358974358976e-06, + "loss": 0.0264, + "step": 36 + }, + { + "epoch": 0.07132530120481928, + "grad_norm": 0.4934785068035126, + "learning_rate": 9.230769230769232e-06, + "loss": 0.0224, + "step": 37 + }, + { + "epoch": 0.07325301204819278, + "grad_norm": 0.4797322154045105, + "learning_rate": 9.487179487179487e-06, + "loss": 0.0163, + "step": 38 + }, + { + "epoch": 0.07518072289156627, + "grad_norm": 0.4739217460155487, + "learning_rate": 9.743589743589744e-06, + "loss": 0.0165, + "step": 39 + }, + { + "epoch": 0.07710843373493977, + "grad_norm": 0.4527677595615387, + "learning_rate": 1e-05, + "loss": 0.0163, + "step": 40 + }, + { + "epoch": 0.07903614457831325, + "grad_norm": 0.6241316795349121, + "learning_rate": 1.0256410256410256e-05, + "loss": 0.0302, + "step": 41 + }, + { + "epoch": 0.08096385542168674, + "grad_norm": 0.639043927192688, + "learning_rate": 1.0512820512820514e-05, + "loss": 0.0312, + "step": 42 + }, + { + "epoch": 0.08289156626506024, + "grad_norm": 0.5121409296989441, + "learning_rate": 1.076923076923077e-05, + "loss": 0.0256, + "step": 43 + }, + { + "epoch": 0.08481927710843373, + "grad_norm": 0.6340477466583252, + "learning_rate": 1.1025641025641028e-05, + "loss": 0.04, + "step": 44 + }, + { + "epoch": 0.08674698795180723, + "grad_norm": 0.5260409712791443, + "learning_rate": 1.1282051282051283e-05, + "loss": 0.0282, + "step": 45 + }, + { + "epoch": 0.08867469879518072, + "grad_norm": 0.6390711069107056, + "learning_rate": 1.1538461538461538e-05, + "loss": 0.0243, + "step": 46 + }, + { + "epoch": 0.09060240963855422, + "grad_norm": 0.46469295024871826, + "learning_rate": 1.1794871794871796e-05, + "loss": 0.0208, + "step": 47 + }, + { + "epoch": 0.09253012048192771, + "grad_norm": 0.8711516857147217, + "learning_rate": 1.2051282051282051e-05, + "loss": 0.0291, + "step": 48 + }, + { + "epoch": 0.09445783132530121, + "grad_norm": 0.9164300560951233, + "learning_rate": 1.230769230769231e-05, + "loss": 0.0342, + "step": 49 + }, + { + "epoch": 0.0963855421686747, + "grad_norm": 0.5401139259338379, + "learning_rate": 1.2564102564102565e-05, + "loss": 0.0185, + "step": 50 + }, + { + "epoch": 0.0983132530120482, + "grad_norm": 0.44393008947372437, + "learning_rate": 1.2820512820512823e-05, + "loss": 0.0228, + "step": 51 + }, + { + "epoch": 0.10024096385542168, + "grad_norm": 0.3855767846107483, + "learning_rate": 1.3076923076923078e-05, + "loss": 0.0176, + "step": 52 + }, + { + "epoch": 0.10216867469879518, + "grad_norm": 0.8561235070228577, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.0433, + "step": 53 + }, + { + "epoch": 0.10409638554216867, + "grad_norm": 0.768002450466156, + "learning_rate": 1.3589743589743592e-05, + "loss": 0.0245, + "step": 54 + }, + { + "epoch": 0.10602409638554217, + "grad_norm": 0.4559759497642517, + "learning_rate": 1.3846153846153847e-05, + "loss": 0.0224, + "step": 55 + }, + { + "epoch": 0.10795180722891566, + "grad_norm": 0.6203847527503967, + "learning_rate": 1.4102564102564105e-05, + "loss": 0.0296, + "step": 56 + }, + { + "epoch": 0.10987951807228916, + "grad_norm": 0.6651368141174316, + "learning_rate": 1.435897435897436e-05, + "loss": 0.0336, + "step": 57 + }, + { + "epoch": 0.11180722891566265, + "grad_norm": 0.377734512090683, + "learning_rate": 1.4615384615384615e-05, + "loss": 0.0196, + "step": 58 + }, + { + "epoch": 0.11373493975903615, + "grad_norm": 0.687568724155426, + "learning_rate": 1.4871794871794874e-05, + "loss": 0.0207, + "step": 59 + }, + { + "epoch": 0.11566265060240964, + "grad_norm": 0.7905604243278503, + "learning_rate": 1.5128205128205129e-05, + "loss": 0.047, + "step": 60 + }, + { + "epoch": 0.11759036144578314, + "grad_norm": 0.7938196063041687, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.0198, + "step": 61 + }, + { + "epoch": 0.11951807228915663, + "grad_norm": 0.41340553760528564, + "learning_rate": 1.5641025641025644e-05, + "loss": 0.0161, + "step": 62 + }, + { + "epoch": 0.12144578313253013, + "grad_norm": 0.5668172240257263, + "learning_rate": 1.5897435897435897e-05, + "loss": 0.0275, + "step": 63 + }, + { + "epoch": 0.12337349397590361, + "grad_norm": 0.48333367705345154, + "learning_rate": 1.6153846153846154e-05, + "loss": 0.0137, + "step": 64 + }, + { + "epoch": 0.12530120481927712, + "grad_norm": 0.6843933463096619, + "learning_rate": 1.641025641025641e-05, + "loss": 0.0294, + "step": 65 + }, + { + "epoch": 0.1272289156626506, + "grad_norm": 0.7789272665977478, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.0401, + "step": 66 + }, + { + "epoch": 0.1291566265060241, + "grad_norm": 0.6203492879867554, + "learning_rate": 1.6923076923076924e-05, + "loss": 0.0292, + "step": 67 + }, + { + "epoch": 0.1310843373493976, + "grad_norm": 0.5940662622451782, + "learning_rate": 1.717948717948718e-05, + "loss": 0.0178, + "step": 68 + }, + { + "epoch": 0.13301204819277107, + "grad_norm": 0.35504868626594543, + "learning_rate": 1.7435897435897438e-05, + "loss": 0.0129, + "step": 69 + }, + { + "epoch": 0.13493975903614458, + "grad_norm": 0.8796699643135071, + "learning_rate": 1.7692307692307694e-05, + "loss": 0.034, + "step": 70 + }, + { + "epoch": 0.13686746987951806, + "grad_norm": 0.967444896697998, + "learning_rate": 1.794871794871795e-05, + "loss": 0.0266, + "step": 71 + }, + { + "epoch": 0.13879518072289157, + "grad_norm": 0.4428526759147644, + "learning_rate": 1.8205128205128208e-05, + "loss": 0.0223, + "step": 72 + }, + { + "epoch": 0.14072289156626505, + "grad_norm": 0.42897751927375793, + "learning_rate": 1.8461538461538465e-05, + "loss": 0.0187, + "step": 73 + }, + { + "epoch": 0.14265060240963856, + "grad_norm": 0.5100914835929871, + "learning_rate": 1.8717948717948718e-05, + "loss": 0.0164, + "step": 74 + }, + { + "epoch": 0.14457831325301204, + "grad_norm": 0.6028861999511719, + "learning_rate": 1.8974358974358975e-05, + "loss": 0.0164, + "step": 75 + }, + { + "epoch": 0.14650602409638555, + "grad_norm": 0.6187024116516113, + "learning_rate": 1.923076923076923e-05, + "loss": 0.0296, + "step": 76 + }, + { + "epoch": 0.14843373493975903, + "grad_norm": 0.4822489619255066, + "learning_rate": 1.9487179487179488e-05, + "loss": 0.0148, + "step": 77 + }, + { + "epoch": 0.15036144578313254, + "grad_norm": 0.7231149673461914, + "learning_rate": 1.9743589743589745e-05, + "loss": 0.0395, + "step": 78 + }, + { + "epoch": 0.15228915662650602, + "grad_norm": 0.8409642577171326, + "learning_rate": 2e-05, + "loss": 0.0446, + "step": 79 + }, + { + "epoch": 0.15421686746987953, + "grad_norm": 0.4883500039577484, + "learning_rate": 2.025641025641026e-05, + "loss": 0.0206, + "step": 80 + }, + { + "epoch": 0.156144578313253, + "grad_norm": 0.6287479400634766, + "learning_rate": 2.0512820512820512e-05, + "loss": 0.0333, + "step": 81 + }, + { + "epoch": 0.1580722891566265, + "grad_norm": 0.5041632652282715, + "learning_rate": 2.0769230769230772e-05, + "loss": 0.0414, + "step": 82 + }, + { + "epoch": 0.16, + "grad_norm": 0.5103405117988586, + "learning_rate": 2.102564102564103e-05, + "loss": 0.045, + "step": 83 + }, + { + "epoch": 0.16192771084337348, + "grad_norm": 0.493161678314209, + "learning_rate": 2.1282051282051285e-05, + "loss": 0.021, + "step": 84 + }, + { + "epoch": 0.163855421686747, + "grad_norm": 0.908843994140625, + "learning_rate": 2.153846153846154e-05, + "loss": 0.0389, + "step": 85 + }, + { + "epoch": 0.16578313253012048, + "grad_norm": 0.5067003965377808, + "learning_rate": 2.1794871794871795e-05, + "loss": 0.0272, + "step": 86 + }, + { + "epoch": 0.16771084337349398, + "grad_norm": 0.5791381597518921, + "learning_rate": 2.2051282051282056e-05, + "loss": 0.0368, + "step": 87 + }, + { + "epoch": 0.16963855421686747, + "grad_norm": 0.7056036591529846, + "learning_rate": 2.230769230769231e-05, + "loss": 0.0284, + "step": 88 + }, + { + "epoch": 0.17156626506024097, + "grad_norm": 0.6563822031021118, + "learning_rate": 2.2564102564102566e-05, + "loss": 0.0646, + "step": 89 + }, + { + "epoch": 0.17349397590361446, + "grad_norm": 0.9483286142349243, + "learning_rate": 2.2820512820512822e-05, + "loss": 0.0439, + "step": 90 + }, + { + "epoch": 0.17542168674698796, + "grad_norm": 0.370664119720459, + "learning_rate": 2.3076923076923076e-05, + "loss": 0.0109, + "step": 91 + }, + { + "epoch": 0.17734939759036145, + "grad_norm": 0.9776477813720703, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.0458, + "step": 92 + }, + { + "epoch": 0.17927710843373493, + "grad_norm": 0.45710092782974243, + "learning_rate": 2.3589743589743593e-05, + "loss": 0.0212, + "step": 93 + }, + { + "epoch": 0.18120481927710844, + "grad_norm": 0.8623896837234497, + "learning_rate": 2.384615384615385e-05, + "loss": 0.0215, + "step": 94 + }, + { + "epoch": 0.18313253012048192, + "grad_norm": 0.55814528465271, + "learning_rate": 2.4102564102564103e-05, + "loss": 0.0218, + "step": 95 + }, + { + "epoch": 0.18506024096385543, + "grad_norm": 0.49882641434669495, + "learning_rate": 2.435897435897436e-05, + "loss": 0.0268, + "step": 96 + }, + { + "epoch": 0.1869879518072289, + "grad_norm": 0.3508654534816742, + "learning_rate": 2.461538461538462e-05, + "loss": 0.0172, + "step": 97 + }, + { + "epoch": 0.18891566265060242, + "grad_norm": 0.601170003414154, + "learning_rate": 2.4871794871794873e-05, + "loss": 0.0208, + "step": 98 + }, + { + "epoch": 0.1908433734939759, + "grad_norm": 1.1748133897781372, + "learning_rate": 2.512820512820513e-05, + "loss": 0.0259, + "step": 99 + }, + { + "epoch": 0.1927710843373494, + "grad_norm": 0.46370384097099304, + "learning_rate": 2.5384615384615386e-05, + "loss": 0.0242, + "step": 100 + }, + { + "epoch": 0.1946987951807229, + "grad_norm": 0.525010883808136, + "learning_rate": 2.5641025641025646e-05, + "loss": 0.0188, + "step": 101 + }, + { + "epoch": 0.1966265060240964, + "grad_norm": 0.766501784324646, + "learning_rate": 2.58974358974359e-05, + "loss": 0.0584, + "step": 102 + }, + { + "epoch": 0.19855421686746988, + "grad_norm": 0.3572964370250702, + "learning_rate": 2.6153846153846157e-05, + "loss": 0.0131, + "step": 103 + }, + { + "epoch": 0.20048192771084336, + "grad_norm": 0.6467130780220032, + "learning_rate": 2.6410256410256413e-05, + "loss": 0.0231, + "step": 104 + }, + { + "epoch": 0.20240963855421687, + "grad_norm": 1.1852102279663086, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.027, + "step": 105 + }, + { + "epoch": 0.20433734939759035, + "grad_norm": 2.3659932613372803, + "learning_rate": 2.6923076923076927e-05, + "loss": 0.0224, + "step": 106 + }, + { + "epoch": 0.20626506024096386, + "grad_norm": 0.5343687534332275, + "learning_rate": 2.7179487179487183e-05, + "loss": 0.0198, + "step": 107 + }, + { + "epoch": 0.20819277108433734, + "grad_norm": 1.852160096168518, + "learning_rate": 2.7435897435897437e-05, + "loss": 0.032, + "step": 108 + }, + { + "epoch": 0.21012048192771085, + "grad_norm": 0.47291702032089233, + "learning_rate": 2.7692307692307694e-05, + "loss": 0.0117, + "step": 109 + }, + { + "epoch": 0.21204819277108433, + "grad_norm": 0.7623187899589539, + "learning_rate": 2.794871794871795e-05, + "loss": 0.0337, + "step": 110 + }, + { + "epoch": 0.21397590361445784, + "grad_norm": 0.5272570848464966, + "learning_rate": 2.820512820512821e-05, + "loss": 0.0131, + "step": 111 + }, + { + "epoch": 0.21590361445783132, + "grad_norm": 0.5568500757217407, + "learning_rate": 2.8461538461538464e-05, + "loss": 0.0233, + "step": 112 + }, + { + "epoch": 0.21783132530120483, + "grad_norm": 0.4008469879627228, + "learning_rate": 2.871794871794872e-05, + "loss": 0.0204, + "step": 113 + }, + { + "epoch": 0.2197590361445783, + "grad_norm": 0.4888612926006317, + "learning_rate": 2.8974358974358977e-05, + "loss": 0.016, + "step": 114 + }, + { + "epoch": 0.2216867469879518, + "grad_norm": 0.44903355836868286, + "learning_rate": 2.923076923076923e-05, + "loss": 0.0135, + "step": 115 + }, + { + "epoch": 0.2236144578313253, + "grad_norm": 0.9266762733459473, + "learning_rate": 2.948717948717949e-05, + "loss": 0.0233, + "step": 116 + }, + { + "epoch": 0.22554216867469878, + "grad_norm": 0.5352638959884644, + "learning_rate": 2.9743589743589747e-05, + "loss": 0.0198, + "step": 117 + }, + { + "epoch": 0.2274698795180723, + "grad_norm": 0.6051343679428101, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0246, + "step": 118 + }, + { + "epoch": 0.22939759036144577, + "grad_norm": 0.9971133470535278, + "learning_rate": 3.0256410256410257e-05, + "loss": 0.025, + "step": 119 + }, + { + "epoch": 0.23132530120481928, + "grad_norm": 0.704236626625061, + "learning_rate": 3.0512820512820514e-05, + "loss": 0.031, + "step": 120 + }, + { + "epoch": 0.23325301204819276, + "grad_norm": 0.6137097477912903, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.0519, + "step": 121 + }, + { + "epoch": 0.23518072289156627, + "grad_norm": 0.7396159768104553, + "learning_rate": 3.102564102564103e-05, + "loss": 0.0325, + "step": 122 + }, + { + "epoch": 0.23710843373493976, + "grad_norm": 1.3282053470611572, + "learning_rate": 3.128205128205129e-05, + "loss": 0.0252, + "step": 123 + }, + { + "epoch": 0.23903614457831326, + "grad_norm": 0.5220731496810913, + "learning_rate": 3.153846153846154e-05, + "loss": 0.0262, + "step": 124 + }, + { + "epoch": 0.24096385542168675, + "grad_norm": 0.5357242822647095, + "learning_rate": 3.1794871794871795e-05, + "loss": 0.0243, + "step": 125 + }, + { + "epoch": 0.24289156626506025, + "grad_norm": 0.48207753896713257, + "learning_rate": 3.205128205128206e-05, + "loss": 0.0178, + "step": 126 + }, + { + "epoch": 0.24481927710843374, + "grad_norm": 0.552988588809967, + "learning_rate": 3.230769230769231e-05, + "loss": 0.023, + "step": 127 + }, + { + "epoch": 0.24674698795180722, + "grad_norm": 1.7962840795516968, + "learning_rate": 3.2564102564102565e-05, + "loss": 0.032, + "step": 128 + }, + { + "epoch": 0.24867469879518073, + "grad_norm": 1.6404600143432617, + "learning_rate": 3.282051282051282e-05, + "loss": 0.0231, + "step": 129 + }, + { + "epoch": 0.25060240963855424, + "grad_norm": 0.39142486453056335, + "learning_rate": 3.307692307692308e-05, + "loss": 0.0147, + "step": 130 + }, + { + "epoch": 0.2525301204819277, + "grad_norm": 1.3272887468338013, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0439, + "step": 131 + }, + { + "epoch": 0.2544578313253012, + "grad_norm": 1.5122811794281006, + "learning_rate": 3.358974358974359e-05, + "loss": 0.0282, + "step": 132 + }, + { + "epoch": 0.2563855421686747, + "grad_norm": 1.8542430400848389, + "learning_rate": 3.384615384615385e-05, + "loss": 0.0515, + "step": 133 + }, + { + "epoch": 0.2583132530120482, + "grad_norm": 4.059277534484863, + "learning_rate": 3.4102564102564105e-05, + "loss": 0.0781, + "step": 134 + }, + { + "epoch": 0.26024096385542167, + "grad_norm": 0.6206214427947998, + "learning_rate": 3.435897435897436e-05, + "loss": 0.0306, + "step": 135 + }, + { + "epoch": 0.2621686746987952, + "grad_norm": 0.4575510323047638, + "learning_rate": 3.461538461538462e-05, + "loss": 0.0154, + "step": 136 + }, + { + "epoch": 0.2640963855421687, + "grad_norm": 1.1556978225708008, + "learning_rate": 3.4871794871794875e-05, + "loss": 0.0235, + "step": 137 + }, + { + "epoch": 0.26602409638554214, + "grad_norm": 0.6975051760673523, + "learning_rate": 3.512820512820513e-05, + "loss": 0.0453, + "step": 138 + }, + { + "epoch": 0.26795180722891565, + "grad_norm": 0.8686623573303223, + "learning_rate": 3.538461538461539e-05, + "loss": 0.0427, + "step": 139 + }, + { + "epoch": 0.26987951807228916, + "grad_norm": 2.0681848526000977, + "learning_rate": 3.5641025641025646e-05, + "loss": 0.04, + "step": 140 + }, + { + "epoch": 0.27180722891566267, + "grad_norm": 0.4397984445095062, + "learning_rate": 3.58974358974359e-05, + "loss": 0.0188, + "step": 141 + }, + { + "epoch": 0.2737349397590361, + "grad_norm": 0.5871334075927734, + "learning_rate": 3.615384615384616e-05, + "loss": 0.0253, + "step": 142 + }, + { + "epoch": 0.27566265060240963, + "grad_norm": 1.1078568696975708, + "learning_rate": 3.6410256410256416e-05, + "loss": 0.0316, + "step": 143 + }, + { + "epoch": 0.27759036144578314, + "grad_norm": 0.5691841840744019, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.0266, + "step": 144 + }, + { + "epoch": 0.27951807228915665, + "grad_norm": 0.7896255254745483, + "learning_rate": 3.692307692307693e-05, + "loss": 0.0281, + "step": 145 + }, + { + "epoch": 0.2814457831325301, + "grad_norm": 0.9988337159156799, + "learning_rate": 3.7179487179487186e-05, + "loss": 0.0295, + "step": 146 + }, + { + "epoch": 0.2833734939759036, + "grad_norm": 0.9811834692955017, + "learning_rate": 3.7435897435897436e-05, + "loss": 0.0322, + "step": 147 + }, + { + "epoch": 0.2853012048192771, + "grad_norm": 0.6503105759620667, + "learning_rate": 3.769230769230769e-05, + "loss": 0.0266, + "step": 148 + }, + { + "epoch": 0.28722891566265063, + "grad_norm": 1.9164355993270874, + "learning_rate": 3.794871794871795e-05, + "loss": 0.0677, + "step": 149 + }, + { + "epoch": 0.2891566265060241, + "grad_norm": 1.1724557876586914, + "learning_rate": 3.820512820512821e-05, + "loss": 0.0324, + "step": 150 + }, + { + "epoch": 0.2910843373493976, + "grad_norm": 0.8482469916343689, + "learning_rate": 3.846153846153846e-05, + "loss": 0.0259, + "step": 151 + }, + { + "epoch": 0.2930120481927711, + "grad_norm": 0.8572830557823181, + "learning_rate": 3.871794871794872e-05, + "loss": 0.0358, + "step": 152 + }, + { + "epoch": 0.29493975903614456, + "grad_norm": 0.6630825400352478, + "learning_rate": 3.8974358974358976e-05, + "loss": 0.0447, + "step": 153 + }, + { + "epoch": 0.29686746987951806, + "grad_norm": 0.9197093844413757, + "learning_rate": 3.923076923076923e-05, + "loss": 0.0409, + "step": 154 + }, + { + "epoch": 0.2987951807228916, + "grad_norm": 0.6976819634437561, + "learning_rate": 3.948717948717949e-05, + "loss": 0.0317, + "step": 155 + }, + { + "epoch": 0.3007228915662651, + "grad_norm": 0.7353514432907104, + "learning_rate": 3.9743589743589747e-05, + "loss": 0.0306, + "step": 156 + }, + { + "epoch": 0.30265060240963854, + "grad_norm": 0.5730232000350952, + "learning_rate": 4e-05, + "loss": 0.0324, + "step": 157 + }, + { + "epoch": 0.30457831325301205, + "grad_norm": 0.7852078676223755, + "learning_rate": 3.999994971675547e-05, + "loss": 0.0354, + "step": 158 + }, + { + "epoch": 0.30650602409638555, + "grad_norm": 0.5924715399742126, + "learning_rate": 3.999979886727471e-05, + "loss": 0.0366, + "step": 159 + }, + { + "epoch": 0.30843373493975906, + "grad_norm": 0.7359845638275146, + "learning_rate": 3.999954745231624e-05, + "loss": 0.0437, + "step": 160 + }, + { + "epoch": 0.3103614457831325, + "grad_norm": 0.7866976857185364, + "learning_rate": 3.999919547314426e-05, + "loss": 0.0363, + "step": 161 + }, + { + "epoch": 0.312289156626506, + "grad_norm": 0.7425745129585266, + "learning_rate": 3.999874293152863e-05, + "loss": 0.0259, + "step": 162 + }, + { + "epoch": 0.31421686746987953, + "grad_norm": 1.8922245502471924, + "learning_rate": 3.9998189829744885e-05, + "loss": 0.0341, + "step": 163 + }, + { + "epoch": 0.316144578313253, + "grad_norm": 0.7908634543418884, + "learning_rate": 3.99975361705742e-05, + "loss": 0.0424, + "step": 164 + }, + { + "epoch": 0.3180722891566265, + "grad_norm": 2.047368049621582, + "learning_rate": 3.999678195730337e-05, + "loss": 0.0535, + "step": 165 + }, + { + "epoch": 0.32, + "grad_norm": 0.5702639222145081, + "learning_rate": 3.999592719372484e-05, + "loss": 0.0284, + "step": 166 + }, + { + "epoch": 0.3219277108433735, + "grad_norm": 0.45015648007392883, + "learning_rate": 3.9994971884136636e-05, + "loss": 0.0313, + "step": 167 + }, + { + "epoch": 0.32385542168674697, + "grad_norm": 4.094679355621338, + "learning_rate": 3.9993916033342355e-05, + "loss": 0.0524, + "step": 168 + }, + { + "epoch": 0.3257831325301205, + "grad_norm": 0.800846517086029, + "learning_rate": 3.999275964665117e-05, + "loss": 0.0282, + "step": 169 + }, + { + "epoch": 0.327710843373494, + "grad_norm": 0.47881078720092773, + "learning_rate": 3.999150272987776e-05, + "loss": 0.0293, + "step": 170 + }, + { + "epoch": 0.3296385542168675, + "grad_norm": 0.5716657638549805, + "learning_rate": 3.999014528934232e-05, + "loss": 0.0221, + "step": 171 + }, + { + "epoch": 0.33156626506024095, + "grad_norm": 0.6333311200141907, + "learning_rate": 3.998868733187048e-05, + "loss": 0.0302, + "step": 172 + }, + { + "epoch": 0.33349397590361446, + "grad_norm": 6.642521858215332, + "learning_rate": 3.998712886479335e-05, + "loss": 0.0364, + "step": 173 + }, + { + "epoch": 0.33542168674698797, + "grad_norm": 0.7515506148338318, + "learning_rate": 3.998546989594739e-05, + "loss": 0.0296, + "step": 174 + }, + { + "epoch": 0.3373493975903614, + "grad_norm": 1.0728015899658203, + "learning_rate": 3.998371043367445e-05, + "loss": 0.0549, + "step": 175 + }, + { + "epoch": 0.33927710843373493, + "grad_norm": 1.3025579452514648, + "learning_rate": 3.998185048682166e-05, + "loss": 0.0577, + "step": 176 + }, + { + "epoch": 0.34120481927710844, + "grad_norm": 1.0962958335876465, + "learning_rate": 3.997989006474144e-05, + "loss": 0.0313, + "step": 177 + }, + { + "epoch": 0.34313253012048195, + "grad_norm": 0.7064313292503357, + "learning_rate": 3.997782917729143e-05, + "loss": 0.0309, + "step": 178 + }, + { + "epoch": 0.3450602409638554, + "grad_norm": 0.43374207615852356, + "learning_rate": 3.997566783483445e-05, + "loss": 0.0166, + "step": 179 + }, + { + "epoch": 0.3469879518072289, + "grad_norm": 0.7236390113830566, + "learning_rate": 3.9973406048238413e-05, + "loss": 0.0254, + "step": 180 + }, + { + "epoch": 0.3489156626506024, + "grad_norm": 0.5041500926017761, + "learning_rate": 3.9971043828876334e-05, + "loss": 0.0239, + "step": 181 + }, + { + "epoch": 0.35084337349397593, + "grad_norm": 1.2744532823562622, + "learning_rate": 3.9968581188626204e-05, + "loss": 0.0404, + "step": 182 + }, + { + "epoch": 0.3527710843373494, + "grad_norm": 0.45845362544059753, + "learning_rate": 3.996601813987098e-05, + "loss": 0.0127, + "step": 183 + }, + { + "epoch": 0.3546987951807229, + "grad_norm": 0.4426881968975067, + "learning_rate": 3.996335469549852e-05, + "loss": 0.0176, + "step": 184 + }, + { + "epoch": 0.3566265060240964, + "grad_norm": 1.0030732154846191, + "learning_rate": 3.9960590868901465e-05, + "loss": 0.0457, + "step": 185 + }, + { + "epoch": 0.35855421686746985, + "grad_norm": 0.6428582668304443, + "learning_rate": 3.995772667397725e-05, + "loss": 0.0271, + "step": 186 + }, + { + "epoch": 0.36048192771084336, + "grad_norm": 0.5335744619369507, + "learning_rate": 3.995476212512795e-05, + "loss": 0.0297, + "step": 187 + }, + { + "epoch": 0.3624096385542169, + "grad_norm": 0.6995761394500732, + "learning_rate": 3.99516972372603e-05, + "loss": 0.0322, + "step": 188 + }, + { + "epoch": 0.3643373493975904, + "grad_norm": 0.765511155128479, + "learning_rate": 3.9948532025785546e-05, + "loss": 0.0253, + "step": 189 + }, + { + "epoch": 0.36626506024096384, + "grad_norm": 0.6165828108787537, + "learning_rate": 3.9945266506619403e-05, + "loss": 0.0355, + "step": 190 + }, + { + "epoch": 0.36819277108433734, + "grad_norm": 0.851970911026001, + "learning_rate": 3.994190069618195e-05, + "loss": 0.056, + "step": 191 + }, + { + "epoch": 0.37012048192771085, + "grad_norm": 0.9850023984909058, + "learning_rate": 3.993843461139757e-05, + "loss": 0.0415, + "step": 192 + }, + { + "epoch": 0.37204819277108436, + "grad_norm": 0.7455295324325562, + "learning_rate": 3.9934868269694886e-05, + "loss": 0.0379, + "step": 193 + }, + { + "epoch": 0.3739759036144578, + "grad_norm": 1.159469723701477, + "learning_rate": 3.9931201689006595e-05, + "loss": 0.0237, + "step": 194 + }, + { + "epoch": 0.3759036144578313, + "grad_norm": 0.5490080118179321, + "learning_rate": 3.992743488776947e-05, + "loss": 0.024, + "step": 195 + }, + { + "epoch": 0.37783132530120483, + "grad_norm": 1.279831886291504, + "learning_rate": 3.992356788492421e-05, + "loss": 0.0273, + "step": 196 + }, + { + "epoch": 0.3797590361445783, + "grad_norm": 0.859104335308075, + "learning_rate": 3.9919600699915355e-05, + "loss": 0.0411, + "step": 197 + }, + { + "epoch": 0.3816867469879518, + "grad_norm": 1.2525300979614258, + "learning_rate": 3.991553335269119e-05, + "loss": 0.0857, + "step": 198 + }, + { + "epoch": 0.3836144578313253, + "grad_norm": 0.4924193024635315, + "learning_rate": 3.991136586370367e-05, + "loss": 0.0294, + "step": 199 + }, + { + "epoch": 0.3855421686746988, + "grad_norm": 1.417190670967102, + "learning_rate": 3.990709825390828e-05, + "loss": 0.0395, + "step": 200 + }, + { + "epoch": 0.38746987951807227, + "grad_norm": 0.6172056198120117, + "learning_rate": 3.9902730544763936e-05, + "loss": 0.0194, + "step": 201 + }, + { + "epoch": 0.3893975903614458, + "grad_norm": 0.7292149662971497, + "learning_rate": 3.989826275823291e-05, + "loss": 0.0381, + "step": 202 + }, + { + "epoch": 0.3913253012048193, + "grad_norm": 0.5949816107749939, + "learning_rate": 3.989369491678067e-05, + "loss": 0.0254, + "step": 203 + }, + { + "epoch": 0.3932530120481928, + "grad_norm": 0.6012582182884216, + "learning_rate": 3.988902704337582e-05, + "loss": 0.048, + "step": 204 + }, + { + "epoch": 0.39518072289156625, + "grad_norm": 0.6273590922355652, + "learning_rate": 3.9884259161489936e-05, + "loss": 0.0268, + "step": 205 + }, + { + "epoch": 0.39710843373493976, + "grad_norm": 0.9615244269371033, + "learning_rate": 3.987939129509746e-05, + "loss": 0.0192, + "step": 206 + }, + { + "epoch": 0.39903614457831327, + "grad_norm": 0.6009241342544556, + "learning_rate": 3.9874423468675624e-05, + "loss": 0.0362, + "step": 207 + }, + { + "epoch": 0.4009638554216867, + "grad_norm": 0.411335289478302, + "learning_rate": 3.9869355707204266e-05, + "loss": 0.017, + "step": 208 + }, + { + "epoch": 0.40289156626506023, + "grad_norm": 0.6151527166366577, + "learning_rate": 3.986418803616573e-05, + "loss": 0.0283, + "step": 209 + }, + { + "epoch": 0.40481927710843374, + "grad_norm": 0.33808204531669617, + "learning_rate": 3.985892048154474e-05, + "loss": 0.0158, + "step": 210 + }, + { + "epoch": 0.40674698795180725, + "grad_norm": 0.5464187860488892, + "learning_rate": 3.9853553069828284e-05, + "loss": 0.0292, + "step": 211 + }, + { + "epoch": 0.4086746987951807, + "grad_norm": 0.6658390760421753, + "learning_rate": 3.984808582800543e-05, + "loss": 0.0281, + "step": 212 + }, + { + "epoch": 0.4106024096385542, + "grad_norm": 0.4253764748573303, + "learning_rate": 3.984251878356726e-05, + "loss": 0.031, + "step": 213 + }, + { + "epoch": 0.4125301204819277, + "grad_norm": 0.32309481501579285, + "learning_rate": 3.983685196450667e-05, + "loss": 0.0166, + "step": 214 + }, + { + "epoch": 0.41445783132530123, + "grad_norm": 0.43756410479545593, + "learning_rate": 3.9831085399318265e-05, + "loss": 0.0326, + "step": 215 + }, + { + "epoch": 0.4163855421686747, + "grad_norm": 0.264046847820282, + "learning_rate": 3.982521911699822e-05, + "loss": 0.0118, + "step": 216 + }, + { + "epoch": 0.4183132530120482, + "grad_norm": 0.8630897402763367, + "learning_rate": 3.9819253147044084e-05, + "loss": 0.0246, + "step": 217 + }, + { + "epoch": 0.4202409638554217, + "grad_norm": 0.6923379898071289, + "learning_rate": 3.98131875194547e-05, + "loss": 0.036, + "step": 218 + }, + { + "epoch": 0.42216867469879515, + "grad_norm": 0.5874778628349304, + "learning_rate": 3.9807022264730024e-05, + "loss": 0.0255, + "step": 219 + }, + { + "epoch": 0.42409638554216866, + "grad_norm": 0.394336074590683, + "learning_rate": 3.980075741387094e-05, + "loss": 0.0187, + "step": 220 + }, + { + "epoch": 0.4260240963855422, + "grad_norm": 0.6300327777862549, + "learning_rate": 3.979439299837915e-05, + "loss": 0.0214, + "step": 221 + }, + { + "epoch": 0.4279518072289157, + "grad_norm": 0.5200467109680176, + "learning_rate": 3.978792905025702e-05, + "loss": 0.0628, + "step": 222 + }, + { + "epoch": 0.42987951807228914, + "grad_norm": 0.5713880062103271, + "learning_rate": 3.978136560200735e-05, + "loss": 0.0302, + "step": 223 + }, + { + "epoch": 0.43180722891566264, + "grad_norm": 0.5345383286476135, + "learning_rate": 3.977470268663331e-05, + "loss": 0.0125, + "step": 224 + }, + { + "epoch": 0.43373493975903615, + "grad_norm": 0.5378350019454956, + "learning_rate": 3.976794033763819e-05, + "loss": 0.0246, + "step": 225 + }, + { + "epoch": 0.43566265060240966, + "grad_norm": 0.5554935336112976, + "learning_rate": 3.9761078589025276e-05, + "loss": 0.0212, + "step": 226 + }, + { + "epoch": 0.4375903614457831, + "grad_norm": 0.2832634747028351, + "learning_rate": 3.9754117475297664e-05, + "loss": 0.0125, + "step": 227 + }, + { + "epoch": 0.4395180722891566, + "grad_norm": 1.2910150289535522, + "learning_rate": 3.97470570314581e-05, + "loss": 0.0364, + "step": 228 + }, + { + "epoch": 0.44144578313253013, + "grad_norm": 0.3731018602848053, + "learning_rate": 3.973989729300878e-05, + "loss": 0.0128, + "step": 229 + }, + { + "epoch": 0.4433734939759036, + "grad_norm": 0.9433871507644653, + "learning_rate": 3.9732638295951195e-05, + "loss": 0.0367, + "step": 230 + }, + { + "epoch": 0.4453012048192771, + "grad_norm": 1.0779197216033936, + "learning_rate": 3.972528007678594e-05, + "loss": 0.0667, + "step": 231 + }, + { + "epoch": 0.4472289156626506, + "grad_norm": 1.7009105682373047, + "learning_rate": 3.9717822672512516e-05, + "loss": 0.0655, + "step": 232 + }, + { + "epoch": 0.4491566265060241, + "grad_norm": 0.5646032094955444, + "learning_rate": 3.971026612062919e-05, + "loss": 0.064, + "step": 233 + }, + { + "epoch": 0.45108433734939757, + "grad_norm": 0.44474121928215027, + "learning_rate": 3.970261045913274e-05, + "loss": 0.0206, + "step": 234 + }, + { + "epoch": 0.4530120481927711, + "grad_norm": 1.3969277143478394, + "learning_rate": 3.969485572651833e-05, + "loss": 0.0486, + "step": 235 + }, + { + "epoch": 0.4549397590361446, + "grad_norm": 0.6401994228363037, + "learning_rate": 3.968700196177925e-05, + "loss": 0.0262, + "step": 236 + }, + { + "epoch": 0.4568674698795181, + "grad_norm": 0.7091913223266602, + "learning_rate": 3.96790492044068e-05, + "loss": 0.014, + "step": 237 + }, + { + "epoch": 0.45879518072289155, + "grad_norm": 0.6561547517776489, + "learning_rate": 3.967099749439002e-05, + "loss": 0.0482, + "step": 238 + }, + { + "epoch": 0.46072289156626506, + "grad_norm": 0.6924155354499817, + "learning_rate": 3.966284687221551e-05, + "loss": 0.0289, + "step": 239 + }, + { + "epoch": 0.46265060240963857, + "grad_norm": 0.5868663787841797, + "learning_rate": 3.9654597378867256e-05, + "loss": 0.0331, + "step": 240 + }, + { + "epoch": 0.464578313253012, + "grad_norm": 0.7930939793586731, + "learning_rate": 3.964624905582637e-05, + "loss": 0.0925, + "step": 241 + }, + { + "epoch": 0.46650602409638553, + "grad_norm": 0.4888836145401001, + "learning_rate": 3.9637801945070944e-05, + "loss": 0.015, + "step": 242 + }, + { + "epoch": 0.46843373493975904, + "grad_norm": 0.7820287346839905, + "learning_rate": 3.962925608907579e-05, + "loss": 0.0382, + "step": 243 + }, + { + "epoch": 0.47036144578313255, + "grad_norm": 0.4914316236972809, + "learning_rate": 3.962061153081224e-05, + "loss": 0.0257, + "step": 244 + }, + { + "epoch": 0.472289156626506, + "grad_norm": 0.5681505799293518, + "learning_rate": 3.961186831374793e-05, + "loss": 0.0551, + "step": 245 + }, + { + "epoch": 0.4742168674698795, + "grad_norm": 0.5049723386764526, + "learning_rate": 3.9603026481846616e-05, + "loss": 0.0186, + "step": 246 + }, + { + "epoch": 0.476144578313253, + "grad_norm": 0.5034119486808777, + "learning_rate": 3.959408607956787e-05, + "loss": 0.024, + "step": 247 + }, + { + "epoch": 0.47807228915662653, + "grad_norm": 0.4543336033821106, + "learning_rate": 3.958504715186695e-05, + "loss": 0.0256, + "step": 248 + }, + { + "epoch": 0.48, + "grad_norm": 0.5595743656158447, + "learning_rate": 3.957590974419452e-05, + "loss": 0.0222, + "step": 249 + }, + { + "epoch": 0.4819277108433735, + "grad_norm": 0.5701581239700317, + "learning_rate": 3.956667390249642e-05, + "loss": 0.0334, + "step": 250 + }, + { + "epoch": 0.483855421686747, + "grad_norm": 0.53755784034729, + "learning_rate": 3.9557339673213474e-05, + "loss": 0.0345, + "step": 251 + }, + { + "epoch": 0.4857831325301205, + "grad_norm": 0.4368877112865448, + "learning_rate": 3.95479071032812e-05, + "loss": 0.0183, + "step": 252 + }, + { + "epoch": 0.48771084337349396, + "grad_norm": 0.7972906827926636, + "learning_rate": 3.953837624012963e-05, + "loss": 0.0337, + "step": 253 + }, + { + "epoch": 0.48963855421686747, + "grad_norm": 0.6148451566696167, + "learning_rate": 3.9528747131683023e-05, + "loss": 0.0524, + "step": 254 + }, + { + "epoch": 0.491566265060241, + "grad_norm": 0.500840961933136, + "learning_rate": 3.9519019826359676e-05, + "loss": 0.0248, + "step": 255 + }, + { + "epoch": 0.49349397590361443, + "grad_norm": 0.5536255240440369, + "learning_rate": 3.9509194373071624e-05, + "loss": 0.0219, + "step": 256 + }, + { + "epoch": 0.49542168674698794, + "grad_norm": 0.6873176097869873, + "learning_rate": 3.9499270821224444e-05, + "loss": 0.0312, + "step": 257 + }, + { + "epoch": 0.49734939759036145, + "grad_norm": 0.37207168340682983, + "learning_rate": 3.9489249220716974e-05, + "loss": 0.0149, + "step": 258 + }, + { + "epoch": 0.49927710843373496, + "grad_norm": 0.4458799660205841, + "learning_rate": 3.947912962194107e-05, + "loss": 0.0214, + "step": 259 + }, + { + "epoch": 0.5012048192771085, + "grad_norm": 0.4272724390029907, + "learning_rate": 3.9468912075781345e-05, + "loss": 0.0263, + "step": 260 + }, + { + "epoch": 0.503132530120482, + "grad_norm": 0.5245792269706726, + "learning_rate": 3.945859663361496e-05, + "loss": 0.0103, + "step": 261 + }, + { + "epoch": 0.5050602409638554, + "grad_norm": 0.8799260854721069, + "learning_rate": 3.9448183347311284e-05, + "loss": 0.0292, + "step": 262 + }, + { + "epoch": 0.5069879518072289, + "grad_norm": 0.5996833443641663, + "learning_rate": 3.943767226923171e-05, + "loss": 0.0306, + "step": 263 + }, + { + "epoch": 0.5089156626506024, + "grad_norm": 0.6044682860374451, + "learning_rate": 3.942706345222935e-05, + "loss": 0.0218, + "step": 264 + }, + { + "epoch": 0.5108433734939759, + "grad_norm": 0.4770200848579407, + "learning_rate": 3.941635694964878e-05, + "loss": 0.0226, + "step": 265 + }, + { + "epoch": 0.5127710843373494, + "grad_norm": 0.5605704188346863, + "learning_rate": 3.940555281532576e-05, + "loss": 0.0354, + "step": 266 + }, + { + "epoch": 0.5146987951807229, + "grad_norm": 0.46532443165779114, + "learning_rate": 3.939465110358699e-05, + "loss": 0.0223, + "step": 267 + }, + { + "epoch": 0.5166265060240964, + "grad_norm": 0.5190595388412476, + "learning_rate": 3.93836518692498e-05, + "loss": 0.0219, + "step": 268 + }, + { + "epoch": 0.5185542168674698, + "grad_norm": 0.5767757892608643, + "learning_rate": 3.937255516762193e-05, + "loss": 0.0294, + "step": 269 + }, + { + "epoch": 0.5204819277108433, + "grad_norm": 0.4543164372444153, + "learning_rate": 3.936136105450119e-05, + "loss": 0.0244, + "step": 270 + }, + { + "epoch": 0.5224096385542168, + "grad_norm": 0.4155154526233673, + "learning_rate": 3.9350069586175195e-05, + "loss": 0.02, + "step": 271 + }, + { + "epoch": 0.5243373493975904, + "grad_norm": 0.5470768213272095, + "learning_rate": 3.933868081942113e-05, + "loss": 0.0187, + "step": 272 + }, + { + "epoch": 0.5262650602409639, + "grad_norm": 0.9491772651672363, + "learning_rate": 3.9327194811505406e-05, + "loss": 0.0337, + "step": 273 + }, + { + "epoch": 0.5281927710843374, + "grad_norm": 0.9313873052597046, + "learning_rate": 3.93156116201834e-05, + "loss": 0.0573, + "step": 274 + }, + { + "epoch": 0.5301204819277109, + "grad_norm": 0.7181005477905273, + "learning_rate": 3.930393130369915e-05, + "loss": 0.0405, + "step": 275 + }, + { + "epoch": 0.5320481927710843, + "grad_norm": 0.34231385588645935, + "learning_rate": 3.9292153920785076e-05, + "loss": 0.0153, + "step": 276 + }, + { + "epoch": 0.5339759036144578, + "grad_norm": 0.6899610161781311, + "learning_rate": 3.928027953066168e-05, + "loss": 0.0338, + "step": 277 + }, + { + "epoch": 0.5359036144578313, + "grad_norm": 0.7509781718254089, + "learning_rate": 3.926830819303726e-05, + "loss": 0.0416, + "step": 278 + }, + { + "epoch": 0.5378313253012048, + "grad_norm": 0.6326774954795837, + "learning_rate": 3.925623996810757e-05, + "loss": 0.0293, + "step": 279 + }, + { + "epoch": 0.5397590361445783, + "grad_norm": 0.5543203353881836, + "learning_rate": 3.924407491655557e-05, + "loss": 0.0263, + "step": 280 + }, + { + "epoch": 0.5416867469879518, + "grad_norm": 0.5367572903633118, + "learning_rate": 3.9231813099551086e-05, + "loss": 0.0276, + "step": 281 + }, + { + "epoch": 0.5436144578313253, + "grad_norm": 0.3143869638442993, + "learning_rate": 3.921945457875051e-05, + "loss": 0.0146, + "step": 282 + }, + { + "epoch": 0.5455421686746988, + "grad_norm": 0.47403043508529663, + "learning_rate": 3.920699941629649e-05, + "loss": 0.0267, + "step": 283 + }, + { + "epoch": 0.5474698795180722, + "grad_norm": 0.5082595348358154, + "learning_rate": 3.919444767481763e-05, + "loss": 0.0183, + "step": 284 + }, + { + "epoch": 0.5493975903614458, + "grad_norm": 0.747949481010437, + "learning_rate": 3.918179941742816e-05, + "loss": 0.0412, + "step": 285 + }, + { + "epoch": 0.5513253012048193, + "grad_norm": 0.6553886532783508, + "learning_rate": 3.916905470772762e-05, + "loss": 0.0505, + "step": 286 + }, + { + "epoch": 0.5532530120481928, + "grad_norm": 0.3838176131248474, + "learning_rate": 3.9156213609800545e-05, + "loss": 0.0156, + "step": 287 + }, + { + "epoch": 0.5551807228915663, + "grad_norm": 0.7427731156349182, + "learning_rate": 3.914327618821614e-05, + "loss": 0.0278, + "step": 288 + }, + { + "epoch": 0.5571084337349398, + "grad_norm": 0.2612821161746979, + "learning_rate": 3.913024250802796e-05, + "loss": 0.0101, + "step": 289 + }, + { + "epoch": 0.5590361445783133, + "grad_norm": 0.3799416124820709, + "learning_rate": 3.911711263477357e-05, + "loss": 0.0168, + "step": 290 + }, + { + "epoch": 0.5609638554216867, + "grad_norm": 0.5053854584693909, + "learning_rate": 3.910388663447425e-05, + "loss": 0.0249, + "step": 291 + }, + { + "epoch": 0.5628915662650602, + "grad_norm": 0.38095012307167053, + "learning_rate": 3.909056457363461e-05, + "loss": 0.0156, + "step": 292 + }, + { + "epoch": 0.5648192771084337, + "grad_norm": 0.4477892220020294, + "learning_rate": 3.907714651924229e-05, + "loss": 0.0309, + "step": 293 + }, + { + "epoch": 0.5667469879518072, + "grad_norm": 0.5875864624977112, + "learning_rate": 3.906363253876763e-05, + "loss": 0.0287, + "step": 294 + }, + { + "epoch": 0.5686746987951807, + "grad_norm": 0.522990882396698, + "learning_rate": 3.90500227001633e-05, + "loss": 0.0318, + "step": 295 + }, + { + "epoch": 0.5706024096385542, + "grad_norm": 0.4153876304626465, + "learning_rate": 3.9036317071863994e-05, + "loss": 0.0192, + "step": 296 + }, + { + "epoch": 0.5725301204819278, + "grad_norm": 0.4675769507884979, + "learning_rate": 3.902251572278605e-05, + "loss": 0.067, + "step": 297 + }, + { + "epoch": 0.5744578313253013, + "grad_norm": 0.35778650641441345, + "learning_rate": 3.900861872232713e-05, + "loss": 0.0197, + "step": 298 + }, + { + "epoch": 0.5763855421686747, + "grad_norm": 0.7382330894470215, + "learning_rate": 3.899462614036587e-05, + "loss": 0.0283, + "step": 299 + }, + { + "epoch": 0.5783132530120482, + "grad_norm": 0.41268599033355713, + "learning_rate": 3.89805380472615e-05, + "loss": 0.0207, + "step": 300 + }, + { + "epoch": 0.5802409638554217, + "grad_norm": 1.2013020515441895, + "learning_rate": 3.8966354513853535e-05, + "loss": 0.0301, + "step": 301 + }, + { + "epoch": 0.5821686746987952, + "grad_norm": 0.424757719039917, + "learning_rate": 3.895207561146137e-05, + "loss": 0.022, + "step": 302 + }, + { + "epoch": 0.5840963855421687, + "grad_norm": 0.4196677505970001, + "learning_rate": 3.893770141188396e-05, + "loss": 0.0424, + "step": 303 + }, + { + "epoch": 0.5860240963855422, + "grad_norm": 0.8644190430641174, + "learning_rate": 3.892323198739946e-05, + "loss": 0.08, + "step": 304 + }, + { + "epoch": 0.5879518072289157, + "grad_norm": 0.5645135045051575, + "learning_rate": 3.890866741076482e-05, + "loss": 0.0152, + "step": 305 + }, + { + "epoch": 0.5898795180722891, + "grad_norm": 0.5218387246131897, + "learning_rate": 3.889400775521545e-05, + "loss": 0.0205, + "step": 306 + }, + { + "epoch": 0.5918072289156626, + "grad_norm": 0.39709413051605225, + "learning_rate": 3.8879253094464865e-05, + "loss": 0.0233, + "step": 307 + }, + { + "epoch": 0.5937349397590361, + "grad_norm": 0.3572910726070404, + "learning_rate": 3.8864403502704285e-05, + "loss": 0.0198, + "step": 308 + }, + { + "epoch": 0.5956626506024096, + "grad_norm": 0.382709264755249, + "learning_rate": 3.8849459054602274e-05, + "loss": 0.0176, + "step": 309 + }, + { + "epoch": 0.5975903614457831, + "grad_norm": 3.4527227878570557, + "learning_rate": 3.883441982530436e-05, + "loss": 0.0239, + "step": 310 + }, + { + "epoch": 0.5995180722891567, + "grad_norm": 0.4467569589614868, + "learning_rate": 3.8819285890432674e-05, + "loss": 0.0284, + "step": 311 + }, + { + "epoch": 0.6014457831325302, + "grad_norm": 0.44513460993766785, + "learning_rate": 3.880405732608555e-05, + "loss": 0.0233, + "step": 312 + }, + { + "epoch": 0.6033734939759036, + "grad_norm": 0.8029689192771912, + "learning_rate": 3.8788734208837155e-05, + "loss": 0.0433, + "step": 313 + }, + { + "epoch": 0.6053012048192771, + "grad_norm": 0.7291454076766968, + "learning_rate": 3.877331661573709e-05, + "loss": 0.043, + "step": 314 + }, + { + "epoch": 0.6072289156626506, + "grad_norm": 0.6050467491149902, + "learning_rate": 3.8757804624310006e-05, + "loss": 0.0377, + "step": 315 + }, + { + "epoch": 0.6091566265060241, + "grad_norm": 0.6714366674423218, + "learning_rate": 3.874219831255524e-05, + "loss": 0.046, + "step": 316 + }, + { + "epoch": 0.6110843373493976, + "grad_norm": 0.336037278175354, + "learning_rate": 3.8726497758946394e-05, + "loss": 0.0149, + "step": 317 + }, + { + "epoch": 0.6130120481927711, + "grad_norm": 0.3057402968406677, + "learning_rate": 3.871070304243094e-05, + "loss": 0.014, + "step": 318 + }, + { + "epoch": 0.6149397590361446, + "grad_norm": 0.4537644684314728, + "learning_rate": 3.8694814242429834e-05, + "loss": 0.0503, + "step": 319 + }, + { + "epoch": 0.6168674698795181, + "grad_norm": 0.45573824644088745, + "learning_rate": 3.8678831438837116e-05, + "loss": 0.021, + "step": 320 + }, + { + "epoch": 0.6187951807228915, + "grad_norm": 0.30729591846466064, + "learning_rate": 3.866275471201952e-05, + "loss": 0.0163, + "step": 321 + }, + { + "epoch": 0.620722891566265, + "grad_norm": 0.7614850401878357, + "learning_rate": 3.8646584142816036e-05, + "loss": 0.0347, + "step": 322 + }, + { + "epoch": 0.6226506024096385, + "grad_norm": 0.5323611497879028, + "learning_rate": 3.863031981253754e-05, + "loss": 0.0201, + "step": 323 + }, + { + "epoch": 0.624578313253012, + "grad_norm": 0.34426453709602356, + "learning_rate": 3.861396180296635e-05, + "loss": 0.0243, + "step": 324 + }, + { + "epoch": 0.6265060240963856, + "grad_norm": 0.621636152267456, + "learning_rate": 3.859751019635585e-05, + "loss": 0.0166, + "step": 325 + }, + { + "epoch": 0.6284337349397591, + "grad_norm": 0.549324095249176, + "learning_rate": 3.858096507543006e-05, + "loss": 0.0274, + "step": 326 + }, + { + "epoch": 0.6303614457831326, + "grad_norm": 0.358426570892334, + "learning_rate": 3.8564326523383214e-05, + "loss": 0.0207, + "step": 327 + }, + { + "epoch": 0.632289156626506, + "grad_norm": 0.3639723062515259, + "learning_rate": 3.8547594623879346e-05, + "loss": 0.0297, + "step": 328 + }, + { + "epoch": 0.6342168674698795, + "grad_norm": 0.3402212858200073, + "learning_rate": 3.853076946105188e-05, + "loss": 0.0258, + "step": 329 + }, + { + "epoch": 0.636144578313253, + "grad_norm": 0.4083027243614197, + "learning_rate": 3.85138511195032e-05, + "loss": 0.0351, + "step": 330 + }, + { + "epoch": 0.6380722891566265, + "grad_norm": 0.43532121181488037, + "learning_rate": 3.84968396843042e-05, + "loss": 0.0388, + "step": 331 + }, + { + "epoch": 0.64, + "grad_norm": 0.35353463888168335, + "learning_rate": 3.8479735240993904e-05, + "loss": 0.0203, + "step": 332 + }, + { + "epoch": 0.6419277108433735, + "grad_norm": 0.350149929523468, + "learning_rate": 3.846253787557901e-05, + "loss": 0.0261, + "step": 333 + }, + { + "epoch": 0.643855421686747, + "grad_norm": 0.7665389180183411, + "learning_rate": 3.844524767453344e-05, + "loss": 0.0108, + "step": 334 + }, + { + "epoch": 0.6457831325301204, + "grad_norm": 0.44621360301971436, + "learning_rate": 3.842786472479795e-05, + "loss": 0.0282, + "step": 335 + }, + { + "epoch": 0.6477108433734939, + "grad_norm": 0.7787201404571533, + "learning_rate": 3.841038911377962e-05, + "loss": 0.0216, + "step": 336 + }, + { + "epoch": 0.6496385542168674, + "grad_norm": 0.48260653018951416, + "learning_rate": 3.839282092935153e-05, + "loss": 0.0234, + "step": 337 + }, + { + "epoch": 0.651566265060241, + "grad_norm": 0.4987852871417999, + "learning_rate": 3.837516025985219e-05, + "loss": 0.0515, + "step": 338 + }, + { + "epoch": 0.6534939759036145, + "grad_norm": 0.9030266404151917, + "learning_rate": 3.835740719408517e-05, + "loss": 0.0508, + "step": 339 + }, + { + "epoch": 0.655421686746988, + "grad_norm": 0.6381701231002808, + "learning_rate": 3.833956182131867e-05, + "loss": 0.0405, + "step": 340 + }, + { + "epoch": 0.6573493975903615, + "grad_norm": 0.42828986048698425, + "learning_rate": 3.832162423128499e-05, + "loss": 0.024, + "step": 341 + }, + { + "epoch": 0.659277108433735, + "grad_norm": 0.38725873827934265, + "learning_rate": 3.8303594514180164e-05, + "loss": 0.0199, + "step": 342 + }, + { + "epoch": 0.6612048192771084, + "grad_norm": 0.23280498385429382, + "learning_rate": 3.828547276066346e-05, + "loss": 0.0101, + "step": 343 + }, + { + "epoch": 0.6631325301204819, + "grad_norm": 0.7298216819763184, + "learning_rate": 3.8267259061856925e-05, + "loss": 0.0455, + "step": 344 + }, + { + "epoch": 0.6650602409638554, + "grad_norm": 0.5975687503814697, + "learning_rate": 3.824895350934496e-05, + "loss": 0.0372, + "step": 345 + }, + { + "epoch": 0.6669879518072289, + "grad_norm": 0.6295403242111206, + "learning_rate": 3.823055619517381e-05, + "loss": 0.0362, + "step": 346 + }, + { + "epoch": 0.6689156626506024, + "grad_norm": 0.5086020827293396, + "learning_rate": 3.821206721185115e-05, + "loss": 0.0368, + "step": 347 + }, + { + "epoch": 0.6708433734939759, + "grad_norm": 0.34506168961524963, + "learning_rate": 3.819348665234557e-05, + "loss": 0.0178, + "step": 348 + }, + { + "epoch": 0.6727710843373494, + "grad_norm": 1.309940218925476, + "learning_rate": 3.817481461008617e-05, + "loss": 0.024, + "step": 349 + }, + { + "epoch": 0.6746987951807228, + "grad_norm": 0.4074770510196686, + "learning_rate": 3.815605117896204e-05, + "loss": 0.0262, + "step": 350 + }, + { + "epoch": 0.6766265060240964, + "grad_norm": 0.48525840044021606, + "learning_rate": 3.8137196453321775e-05, + "loss": 0.0209, + "step": 351 + }, + { + "epoch": 0.6785542168674699, + "grad_norm": 0.7199739217758179, + "learning_rate": 3.811825052797308e-05, + "loss": 0.0396, + "step": 352 + }, + { + "epoch": 0.6804819277108434, + "grad_norm": 0.519540011882782, + "learning_rate": 3.8099213498182196e-05, + "loss": 0.0453, + "step": 353 + }, + { + "epoch": 0.6824096385542169, + "grad_norm": 0.9738391041755676, + "learning_rate": 3.808008545967349e-05, + "loss": 0.0317, + "step": 354 + }, + { + "epoch": 0.6843373493975904, + "grad_norm": 1.888344407081604, + "learning_rate": 3.8060866508628953e-05, + "loss": 0.0452, + "step": 355 + }, + { + "epoch": 0.6862650602409639, + "grad_norm": 0.48989811539649963, + "learning_rate": 3.8041556741687695e-05, + "loss": 0.0315, + "step": 356 + }, + { + "epoch": 0.6881927710843373, + "grad_norm": 0.3764645457267761, + "learning_rate": 3.8022156255945496e-05, + "loss": 0.0269, + "step": 357 + }, + { + "epoch": 0.6901204819277108, + "grad_norm": 0.46409738063812256, + "learning_rate": 3.800266514895429e-05, + "loss": 0.0171, + "step": 358 + }, + { + "epoch": 0.6920481927710843, + "grad_norm": 0.41091030836105347, + "learning_rate": 3.7983083518721695e-05, + "loss": 0.0167, + "step": 359 + }, + { + "epoch": 0.6939759036144578, + "grad_norm": 0.8375523090362549, + "learning_rate": 3.79634114637105e-05, + "loss": 0.0342, + "step": 360 + }, + { + "epoch": 0.6959036144578313, + "grad_norm": 1.7053394317626953, + "learning_rate": 3.794364908283817e-05, + "loss": 0.02, + "step": 361 + }, + { + "epoch": 0.6978313253012048, + "grad_norm": 0.4163115918636322, + "learning_rate": 3.792379647547637e-05, + "loss": 0.0138, + "step": 362 + }, + { + "epoch": 0.6997590361445784, + "grad_norm": 0.388751745223999, + "learning_rate": 3.790385374145046e-05, + "loss": 0.0172, + "step": 363 + }, + { + "epoch": 0.7016867469879519, + "grad_norm": 0.5584064722061157, + "learning_rate": 3.7883820981038966e-05, + "loss": 0.0254, + "step": 364 + }, + { + "epoch": 0.7036144578313253, + "grad_norm": 1.394264817237854, + "learning_rate": 3.7863698294973114e-05, + "loss": 0.037, + "step": 365 + }, + { + "epoch": 0.7055421686746988, + "grad_norm": 0.46280744671821594, + "learning_rate": 3.78434857844363e-05, + "loss": 0.0234, + "step": 366 + }, + { + "epoch": 0.7074698795180723, + "grad_norm": 0.39548924565315247, + "learning_rate": 3.782318355106358e-05, + "loss": 0.0164, + "step": 367 + }, + { + "epoch": 0.7093975903614458, + "grad_norm": 0.7307773232460022, + "learning_rate": 3.780279169694118e-05, + "loss": 0.0192, + "step": 368 + }, + { + "epoch": 0.7113253012048193, + "grad_norm": 0.28035807609558105, + "learning_rate": 3.778231032460594e-05, + "loss": 0.0131, + "step": 369 + }, + { + "epoch": 0.7132530120481928, + "grad_norm": 0.8376953601837158, + "learning_rate": 3.776173953704486e-05, + "loss": 0.0291, + "step": 370 + }, + { + "epoch": 0.7151807228915663, + "grad_norm": 0.7356843948364258, + "learning_rate": 3.774107943769454e-05, + "loss": 0.0214, + "step": 371 + }, + { + "epoch": 0.7171084337349397, + "grad_norm": 0.41503390669822693, + "learning_rate": 3.772033013044064e-05, + "loss": 0.0221, + "step": 372 + }, + { + "epoch": 0.7190361445783132, + "grad_norm": 0.35732385516166687, + "learning_rate": 3.7699491719617436e-05, + "loss": 0.015, + "step": 373 + }, + { + "epoch": 0.7209638554216867, + "grad_norm": 0.283778578042984, + "learning_rate": 3.76785643100072e-05, + "loss": 0.0146, + "step": 374 + }, + { + "epoch": 0.7228915662650602, + "grad_norm": 0.3219413459300995, + "learning_rate": 3.765754800683974e-05, + "loss": 0.015, + "step": 375 + }, + { + "epoch": 0.7248192771084337, + "grad_norm": 0.610431432723999, + "learning_rate": 3.7636442915791856e-05, + "loss": 0.0326, + "step": 376 + }, + { + "epoch": 0.7267469879518073, + "grad_norm": 4.944870948791504, + "learning_rate": 3.7615249142986784e-05, + "loss": 0.0432, + "step": 377 + }, + { + "epoch": 0.7286746987951808, + "grad_norm": 0.4894593060016632, + "learning_rate": 3.7593966794993696e-05, + "loss": 0.0174, + "step": 378 + }, + { + "epoch": 0.7306024096385542, + "grad_norm": 0.4211325943470001, + "learning_rate": 3.757259597882714e-05, + "loss": 0.023, + "step": 379 + }, + { + "epoch": 0.7325301204819277, + "grad_norm": 0.33621737360954285, + "learning_rate": 3.755113680194651e-05, + "loss": 0.0201, + "step": 380 + }, + { + "epoch": 0.7344578313253012, + "grad_norm": 0.5799694657325745, + "learning_rate": 3.7529589372255514e-05, + "loss": 0.0173, + "step": 381 + }, + { + "epoch": 0.7363855421686747, + "grad_norm": 0.5172572731971741, + "learning_rate": 3.750795379810162e-05, + "loss": 0.0284, + "step": 382 + }, + { + "epoch": 0.7383132530120482, + "grad_norm": 0.5715453028678894, + "learning_rate": 3.748623018827552e-05, + "loss": 0.0194, + "step": 383 + }, + { + "epoch": 0.7402409638554217, + "grad_norm": 0.5284178256988525, + "learning_rate": 3.746441865201056e-05, + "loss": 0.0247, + "step": 384 + }, + { + "epoch": 0.7421686746987952, + "grad_norm": 0.37828654050827026, + "learning_rate": 3.744251929898223e-05, + "loss": 0.0097, + "step": 385 + }, + { + "epoch": 0.7440963855421687, + "grad_norm": 0.3252779543399811, + "learning_rate": 3.742053223930758e-05, + "loss": 0.0238, + "step": 386 + }, + { + "epoch": 0.7460240963855421, + "grad_norm": 0.6031543612480164, + "learning_rate": 3.7398457583544674e-05, + "loss": 0.0332, + "step": 387 + }, + { + "epoch": 0.7479518072289156, + "grad_norm": 0.23846614360809326, + "learning_rate": 3.737629544269206e-05, + "loss": 0.0122, + "step": 388 + }, + { + "epoch": 0.7498795180722891, + "grad_norm": 0.5274029970169067, + "learning_rate": 3.7354045928188155e-05, + "loss": 0.0324, + "step": 389 + }, + { + "epoch": 0.7518072289156627, + "grad_norm": 0.4672217071056366, + "learning_rate": 3.733170915191075e-05, + "loss": 0.0196, + "step": 390 + }, + { + "epoch": 0.7537349397590362, + "grad_norm": 0.29819396138191223, + "learning_rate": 3.730928522617639e-05, + "loss": 0.0131, + "step": 391 + }, + { + "epoch": 0.7556626506024097, + "grad_norm": 0.43824997544288635, + "learning_rate": 3.7286774263739855e-05, + "loss": 0.0238, + "step": 392 + }, + { + "epoch": 0.7575903614457832, + "grad_norm": 0.2822072505950928, + "learning_rate": 3.726417637779357e-05, + "loss": 0.0314, + "step": 393 + }, + { + "epoch": 0.7595180722891566, + "grad_norm": 0.43815648555755615, + "learning_rate": 3.7241491681967044e-05, + "loss": 0.0144, + "step": 394 + }, + { + "epoch": 0.7614457831325301, + "grad_norm": 0.37194815278053284, + "learning_rate": 3.721872029032628e-05, + "loss": 0.0286, + "step": 395 + }, + { + "epoch": 0.7633734939759036, + "grad_norm": 0.7319737672805786, + "learning_rate": 3.719586231737322e-05, + "loss": 0.0427, + "step": 396 + }, + { + "epoch": 0.7653012048192771, + "grad_norm": 0.5870066285133362, + "learning_rate": 3.717291787804517e-05, + "loss": 0.0138, + "step": 397 + }, + { + "epoch": 0.7672289156626506, + "grad_norm": 0.6574277281761169, + "learning_rate": 3.7149887087714225e-05, + "loss": 0.061, + "step": 398 + }, + { + "epoch": 0.7691566265060241, + "grad_norm": 0.5467348694801331, + "learning_rate": 3.712677006218666e-05, + "loss": 0.022, + "step": 399 + }, + { + "epoch": 0.7710843373493976, + "grad_norm": 0.3589288890361786, + "learning_rate": 3.710356691770238e-05, + "loss": 0.0161, + "step": 400 + }, + { + "epoch": 0.7730120481927711, + "grad_norm": 0.574630618095398, + "learning_rate": 3.708027777093433e-05, + "loss": 0.0285, + "step": 401 + }, + { + "epoch": 0.7749397590361445, + "grad_norm": 0.39048445224761963, + "learning_rate": 3.70569027389879e-05, + "loss": 0.012, + "step": 402 + }, + { + "epoch": 0.776867469879518, + "grad_norm": 0.34803536534309387, + "learning_rate": 3.703344193940032e-05, + "loss": 0.0155, + "step": 403 + }, + { + "epoch": 0.7787951807228916, + "grad_norm": 1.188948392868042, + "learning_rate": 3.700989549014011e-05, + "loss": 0.0617, + "step": 404 + }, + { + "epoch": 0.7807228915662651, + "grad_norm": 0.473157674074173, + "learning_rate": 3.698626350960646e-05, + "loss": 0.0298, + "step": 405 + }, + { + "epoch": 0.7826506024096386, + "grad_norm": 0.42009076476097107, + "learning_rate": 3.6962546116628634e-05, + "loss": 0.03, + "step": 406 + }, + { + "epoch": 0.7845783132530121, + "grad_norm": 0.6334308981895447, + "learning_rate": 3.693874343046537e-05, + "loss": 0.0107, + "step": 407 + }, + { + "epoch": 0.7865060240963856, + "grad_norm": 0.35594677925109863, + "learning_rate": 3.6914855570804314e-05, + "loss": 0.0174, + "step": 408 + }, + { + "epoch": 0.788433734939759, + "grad_norm": 0.28985708951950073, + "learning_rate": 3.689088265776136e-05, + "loss": 0.0149, + "step": 409 + }, + { + "epoch": 0.7903614457831325, + "grad_norm": 0.3981950581073761, + "learning_rate": 3.686682481188011e-05, + "loss": 0.019, + "step": 410 + }, + { + "epoch": 0.792289156626506, + "grad_norm": 0.48819583654403687, + "learning_rate": 3.6842682154131193e-05, + "loss": 0.0217, + "step": 411 + }, + { + "epoch": 0.7942168674698795, + "grad_norm": 0.42819952964782715, + "learning_rate": 3.681845480591174e-05, + "loss": 0.0198, + "step": 412 + }, + { + "epoch": 0.796144578313253, + "grad_norm": 0.48591694235801697, + "learning_rate": 3.6794142889044727e-05, + "loss": 0.0253, + "step": 413 + }, + { + "epoch": 0.7980722891566265, + "grad_norm": 0.4730607271194458, + "learning_rate": 3.676974652577835e-05, + "loss": 0.0329, + "step": 414 + }, + { + "epoch": 0.8, + "grad_norm": 0.5390865802764893, + "learning_rate": 3.6745265838785434e-05, + "loss": 0.0479, + "step": 415 + }, + { + "epoch": 0.8019277108433734, + "grad_norm": 0.6377891302108765, + "learning_rate": 3.672070095116283e-05, + "loss": 0.019, + "step": 416 + }, + { + "epoch": 0.803855421686747, + "grad_norm": 0.8984615206718445, + "learning_rate": 3.669605198643075e-05, + "loss": 0.0444, + "step": 417 + }, + { + "epoch": 0.8057831325301205, + "grad_norm": 0.4913877546787262, + "learning_rate": 3.667131906853219e-05, + "loss": 0.031, + "step": 418 + }, + { + "epoch": 0.807710843373494, + "grad_norm": 0.37894028425216675, + "learning_rate": 3.664650232183229e-05, + "loss": 0.0195, + "step": 419 + }, + { + "epoch": 0.8096385542168675, + "grad_norm": 0.3644949495792389, + "learning_rate": 3.66216018711177e-05, + "loss": 0.018, + "step": 420 + }, + { + "epoch": 0.811566265060241, + "grad_norm": 0.414440393447876, + "learning_rate": 3.659661784159597e-05, + "loss": 0.0188, + "step": 421 + }, + { + "epoch": 0.8134939759036145, + "grad_norm": 0.49220341444015503, + "learning_rate": 3.65715503588949e-05, + "loss": 0.016, + "step": 422 + }, + { + "epoch": 0.815421686746988, + "grad_norm": 1.0939836502075195, + "learning_rate": 3.654639954906193e-05, + "loss": 0.0758, + "step": 423 + }, + { + "epoch": 0.8173493975903614, + "grad_norm": 0.43222442269325256, + "learning_rate": 3.652116553856349e-05, + "loss": 0.0308, + "step": 424 + }, + { + "epoch": 0.8192771084337349, + "grad_norm": 0.5081896185874939, + "learning_rate": 3.649584845428438e-05, + "loss": 0.0493, + "step": 425 + }, + { + "epoch": 0.8212048192771084, + "grad_norm": 0.9811948537826538, + "learning_rate": 3.64704484235271e-05, + "loss": 0.019, + "step": 426 + }, + { + "epoch": 0.8231325301204819, + "grad_norm": 0.31656572222709656, + "learning_rate": 3.6444965574011255e-05, + "loss": 0.0135, + "step": 427 + }, + { + "epoch": 0.8250602409638554, + "grad_norm": 0.7844433188438416, + "learning_rate": 3.641940003387289e-05, + "loss": 0.0402, + "step": 428 + }, + { + "epoch": 0.826987951807229, + "grad_norm": 0.3353273570537567, + "learning_rate": 3.6393751931663814e-05, + "loss": 0.0132, + "step": 429 + }, + { + "epoch": 0.8289156626506025, + "grad_norm": 0.7253058552742004, + "learning_rate": 3.6368021396351015e-05, + "loss": 0.0296, + "step": 430 + }, + { + "epoch": 0.8308433734939759, + "grad_norm": 0.45300304889678955, + "learning_rate": 3.634220855731598e-05, + "loss": 0.0258, + "step": 431 + }, + { + "epoch": 0.8327710843373494, + "grad_norm": 0.3480473458766937, + "learning_rate": 3.631631354435403e-05, + "loss": 0.0099, + "step": 432 + }, + { + "epoch": 0.8346987951807229, + "grad_norm": 2.1114516258239746, + "learning_rate": 3.62903364876737e-05, + "loss": 0.0457, + "step": 433 + }, + { + "epoch": 0.8366265060240964, + "grad_norm": 0.5649561882019043, + "learning_rate": 3.626427751789606e-05, + "loss": 0.0444, + "step": 434 + }, + { + "epoch": 0.8385542168674699, + "grad_norm": 0.3864995539188385, + "learning_rate": 3.623813676605405e-05, + "loss": 0.0223, + "step": 435 + }, + { + "epoch": 0.8404819277108434, + "grad_norm": 1.2134298086166382, + "learning_rate": 3.621191436359186e-05, + "loss": 0.0353, + "step": 436 + }, + { + "epoch": 0.8424096385542169, + "grad_norm": 0.4403415024280548, + "learning_rate": 3.6185610442364246e-05, + "loss": 0.0216, + "step": 437 + }, + { + "epoch": 0.8443373493975903, + "grad_norm": 0.6050297021865845, + "learning_rate": 3.6159225134635846e-05, + "loss": 0.0433, + "step": 438 + }, + { + "epoch": 0.8462650602409638, + "grad_norm": 0.7951678037643433, + "learning_rate": 3.6132758573080556e-05, + "loss": 0.031, + "step": 439 + }, + { + "epoch": 0.8481927710843373, + "grad_norm": 0.4991949796676636, + "learning_rate": 3.6106210890780834e-05, + "loss": 0.0313, + "step": 440 + }, + { + "epoch": 0.8501204819277108, + "grad_norm": 0.47951385378837585, + "learning_rate": 3.607958222122704e-05, + "loss": 0.0218, + "step": 441 + }, + { + "epoch": 0.8520481927710843, + "grad_norm": 0.7345194220542908, + "learning_rate": 3.6052872698316755e-05, + "loss": 0.0239, + "step": 442 + }, + { + "epoch": 0.8539759036144579, + "grad_norm": 1.4814884662628174, + "learning_rate": 3.602608245635414e-05, + "loss": 0.0127, + "step": 443 + }, + { + "epoch": 0.8559036144578314, + "grad_norm": 2.4240877628326416, + "learning_rate": 3.599921163004922e-05, + "loss": 0.0618, + "step": 444 + }, + { + "epoch": 0.8578313253012049, + "grad_norm": 0.41523510217666626, + "learning_rate": 3.5972260354517216e-05, + "loss": 0.0283, + "step": 445 + }, + { + "epoch": 0.8597590361445783, + "grad_norm": 0.5577677488327026, + "learning_rate": 3.594522876527791e-05, + "loss": 0.0271, + "step": 446 + }, + { + "epoch": 0.8616867469879518, + "grad_norm": 0.5829064846038818, + "learning_rate": 3.591811699825487e-05, + "loss": 0.0169, + "step": 447 + }, + { + "epoch": 0.8636144578313253, + "grad_norm": 0.4478822350502014, + "learning_rate": 3.5890925189774886e-05, + "loss": 0.0239, + "step": 448 + }, + { + "epoch": 0.8655421686746988, + "grad_norm": 0.3498048782348633, + "learning_rate": 3.586365347656718e-05, + "loss": 0.0137, + "step": 449 + }, + { + "epoch": 0.8674698795180723, + "grad_norm": 0.6571130156517029, + "learning_rate": 3.583630199576278e-05, + "loss": 0.027, + "step": 450 + }, + { + "epoch": 0.8693975903614458, + "grad_norm": 0.344970166683197, + "learning_rate": 3.58088708848938e-05, + "loss": 0.0167, + "step": 451 + }, + { + "epoch": 0.8713253012048193, + "grad_norm": 0.34611570835113525, + "learning_rate": 3.5781360281892775e-05, + "loss": 0.0468, + "step": 452 + }, + { + "epoch": 0.8732530120481927, + "grad_norm": 0.66157066822052, + "learning_rate": 3.575377032509194e-05, + "loss": 0.0344, + "step": 453 + }, + { + "epoch": 0.8751807228915662, + "grad_norm": 0.3676326870918274, + "learning_rate": 3.5726101153222534e-05, + "loss": 0.0366, + "step": 454 + }, + { + "epoch": 0.8771084337349397, + "grad_norm": 0.5958423018455505, + "learning_rate": 3.569835290541414e-05, + "loss": 0.0382, + "step": 455 + }, + { + "epoch": 0.8790361445783132, + "grad_norm": 0.36787471175193787, + "learning_rate": 3.567052572119397e-05, + "loss": 0.018, + "step": 456 + }, + { + "epoch": 0.8809638554216868, + "grad_norm": 0.9478234052658081, + "learning_rate": 3.564261974048611e-05, + "loss": 0.0179, + "step": 457 + }, + { + "epoch": 0.8828915662650603, + "grad_norm": 0.3337579369544983, + "learning_rate": 3.56146351036109e-05, + "loss": 0.0147, + "step": 458 + }, + { + "epoch": 0.8848192771084338, + "grad_norm": 0.4984932243824005, + "learning_rate": 3.558657195128416e-05, + "loss": 0.0224, + "step": 459 + }, + { + "epoch": 0.8867469879518072, + "grad_norm": 0.36718735098838806, + "learning_rate": 3.555843042461653e-05, + "loss": 0.0202, + "step": 460 + }, + { + "epoch": 0.8886746987951807, + "grad_norm": 0.4081745445728302, + "learning_rate": 3.553021066511274e-05, + "loss": 0.0288, + "step": 461 + }, + { + "epoch": 0.8906024096385542, + "grad_norm": 0.3233242332935333, + "learning_rate": 3.55019128146709e-05, + "loss": 0.0362, + "step": 462 + }, + { + "epoch": 0.8925301204819277, + "grad_norm": 0.6560158729553223, + "learning_rate": 3.547353701558178e-05, + "loss": 0.038, + "step": 463 + }, + { + "epoch": 0.8944578313253012, + "grad_norm": 0.47668641805648804, + "learning_rate": 3.544508341052811e-05, + "loss": 0.0399, + "step": 464 + }, + { + "epoch": 0.8963855421686747, + "grad_norm": 0.45512664318084717, + "learning_rate": 3.541655214258383e-05, + "loss": 0.022, + "step": 465 + }, + { + "epoch": 0.8983132530120482, + "grad_norm": 0.8410730361938477, + "learning_rate": 3.538794335521343e-05, + "loss": 0.0315, + "step": 466 + }, + { + "epoch": 0.9002409638554217, + "grad_norm": 0.4872909486293793, + "learning_rate": 3.535925719227117e-05, + "loss": 0.0152, + "step": 467 + }, + { + "epoch": 0.9021686746987951, + "grad_norm": 0.45623311400413513, + "learning_rate": 3.533049379800038e-05, + "loss": 0.0305, + "step": 468 + }, + { + "epoch": 0.9040963855421686, + "grad_norm": 0.43087029457092285, + "learning_rate": 3.530165331703275e-05, + "loss": 0.0131, + "step": 469 + }, + { + "epoch": 0.9060240963855422, + "grad_norm": 0.4610525369644165, + "learning_rate": 3.527273589438756e-05, + "loss": 0.0187, + "step": 470 + }, + { + "epoch": 0.9079518072289157, + "grad_norm": 0.3356114327907562, + "learning_rate": 3.5243741675471006e-05, + "loss": 0.0185, + "step": 471 + }, + { + "epoch": 0.9098795180722892, + "grad_norm": 0.9065960049629211, + "learning_rate": 3.5214670806075426e-05, + "loss": 0.0433, + "step": 472 + }, + { + "epoch": 0.9118072289156627, + "grad_norm": 0.3652578294277191, + "learning_rate": 3.518552343237858e-05, + "loss": 0.02, + "step": 473 + }, + { + "epoch": 0.9137349397590362, + "grad_norm": 0.32377883791923523, + "learning_rate": 3.5156299700942916e-05, + "loss": 0.0165, + "step": 474 + }, + { + "epoch": 0.9156626506024096, + "grad_norm": 0.2431817352771759, + "learning_rate": 3.512699975871485e-05, + "loss": 0.0172, + "step": 475 + }, + { + "epoch": 0.9175903614457831, + "grad_norm": 0.6390707492828369, + "learning_rate": 3.509762375302399e-05, + "loss": 0.0356, + "step": 476 + }, + { + "epoch": 0.9195180722891566, + "grad_norm": 0.2283092886209488, + "learning_rate": 3.506817183158243e-05, + "loss": 0.0088, + "step": 477 + }, + { + "epoch": 0.9214457831325301, + "grad_norm": 0.5053914189338684, + "learning_rate": 3.5038644142483966e-05, + "loss": 0.0389, + "step": 478 + }, + { + "epoch": 0.9233734939759036, + "grad_norm": 0.2567576467990875, + "learning_rate": 3.500904083420342e-05, + "loss": 0.0155, + "step": 479 + }, + { + "epoch": 0.9253012048192771, + "grad_norm": 0.6852384209632874, + "learning_rate": 3.497936205559583e-05, + "loss": 0.0247, + "step": 480 + }, + { + "epoch": 0.9272289156626506, + "grad_norm": 0.36403414607048035, + "learning_rate": 3.494960795589572e-05, + "loss": 0.023, + "step": 481 + }, + { + "epoch": 0.929156626506024, + "grad_norm": 0.506554901599884, + "learning_rate": 3.491977868471635e-05, + "loss": 0.0273, + "step": 482 + }, + { + "epoch": 0.9310843373493976, + "grad_norm": 0.38329923152923584, + "learning_rate": 3.4889874392048985e-05, + "loss": 0.0169, + "step": 483 + }, + { + "epoch": 0.9330120481927711, + "grad_norm": 0.2805836498737335, + "learning_rate": 3.48598952282621e-05, + "loss": 0.0105, + "step": 484 + }, + { + "epoch": 0.9349397590361446, + "grad_norm": 0.6315302848815918, + "learning_rate": 3.482984134410067e-05, + "loss": 0.0289, + "step": 485 + }, + { + "epoch": 0.9368674698795181, + "grad_norm": 0.6431388854980469, + "learning_rate": 3.479971289068537e-05, + "loss": 0.0311, + "step": 486 + }, + { + "epoch": 0.9387951807228916, + "grad_norm": 0.9794723391532898, + "learning_rate": 3.476951001951184e-05, + "loss": 0.0452, + "step": 487 + }, + { + "epoch": 0.9407228915662651, + "grad_norm": 0.7984824180603027, + "learning_rate": 3.473923288244991e-05, + "loss": 0.0689, + "step": 488 + }, + { + "epoch": 0.9426506024096386, + "grad_norm": 0.46362006664276123, + "learning_rate": 3.470888163174286e-05, + "loss": 0.0241, + "step": 489 + }, + { + "epoch": 0.944578313253012, + "grad_norm": 0.5051195025444031, + "learning_rate": 3.467845642000661e-05, + "loss": 0.0228, + "step": 490 + }, + { + "epoch": 0.9465060240963855, + "grad_norm": 0.3082812428474426, + "learning_rate": 3.4647957400229004e-05, + "loss": 0.0144, + "step": 491 + }, + { + "epoch": 0.948433734939759, + "grad_norm": 0.2691391110420227, + "learning_rate": 3.461738472576902e-05, + "loss": 0.0167, + "step": 492 + }, + { + "epoch": 0.9503614457831325, + "grad_norm": 0.5627671480178833, + "learning_rate": 3.458673855035597e-05, + "loss": 0.031, + "step": 493 + }, + { + "epoch": 0.952289156626506, + "grad_norm": 0.4571435749530792, + "learning_rate": 3.455601902808876e-05, + "loss": 0.0191, + "step": 494 + }, + { + "epoch": 0.9542168674698795, + "grad_norm": 1.0117709636688232, + "learning_rate": 3.452522631343515e-05, + "loss": 0.0192, + "step": 495 + }, + { + "epoch": 0.9561445783132531, + "grad_norm": 0.28375712037086487, + "learning_rate": 3.449436056123086e-05, + "loss": 0.0159, + "step": 496 + }, + { + "epoch": 0.9580722891566265, + "grad_norm": 0.26381856203079224, + "learning_rate": 3.446342192667893e-05, + "loss": 0.0113, + "step": 497 + }, + { + "epoch": 0.96, + "grad_norm": 0.49317577481269836, + "learning_rate": 3.443241056534884e-05, + "loss": 0.0332, + "step": 498 + }, + { + "epoch": 0.9619277108433735, + "grad_norm": 0.28884485363960266, + "learning_rate": 3.440132663317579e-05, + "loss": 0.0117, + "step": 499 + }, + { + "epoch": 0.963855421686747, + "grad_norm": 0.36255285143852234, + "learning_rate": 3.4370170286459864e-05, + "loss": 0.0169, + "step": 500 + }, + { + "epoch": 0.9657831325301205, + "grad_norm": 0.4265049993991852, + "learning_rate": 3.433894168186529e-05, + "loss": 0.0217, + "step": 501 + }, + { + "epoch": 0.967710843373494, + "grad_norm": 0.8169426321983337, + "learning_rate": 3.430764097641962e-05, + "loss": 0.0207, + "step": 502 + }, + { + "epoch": 0.9696385542168675, + "grad_norm": 1.866077184677124, + "learning_rate": 3.427626832751296e-05, + "loss": 0.0381, + "step": 503 + }, + { + "epoch": 0.971566265060241, + "grad_norm": 0.33124980330467224, + "learning_rate": 3.424482389289716e-05, + "loss": 0.0245, + "step": 504 + }, + { + "epoch": 0.9734939759036144, + "grad_norm": 0.37479540705680847, + "learning_rate": 3.4213307830685055e-05, + "loss": 0.0164, + "step": 505 + }, + { + "epoch": 0.9754216867469879, + "grad_norm": 0.39738863706588745, + "learning_rate": 3.4181720299349615e-05, + "loss": 0.0297, + "step": 506 + }, + { + "epoch": 0.9773493975903614, + "grad_norm": 0.2567287087440491, + "learning_rate": 3.4150061457723205e-05, + "loss": 0.0102, + "step": 507 + }, + { + "epoch": 0.9792771084337349, + "grad_norm": 0.6230517029762268, + "learning_rate": 3.411833146499675e-05, + "loss": 0.0243, + "step": 508 + }, + { + "epoch": 0.9812048192771085, + "grad_norm": 0.44843971729278564, + "learning_rate": 3.408653048071894e-05, + "loss": 0.0357, + "step": 509 + }, + { + "epoch": 0.983132530120482, + "grad_norm": 1.0569655895233154, + "learning_rate": 3.405465866479546e-05, + "loss": 0.037, + "step": 510 + }, + { + "epoch": 0.9850602409638555, + "grad_norm": 0.29000964760780334, + "learning_rate": 3.402271617748812e-05, + "loss": 0.0129, + "step": 511 + }, + { + "epoch": 0.9869879518072289, + "grad_norm": 2.1627447605133057, + "learning_rate": 3.399070317941413e-05, + "loss": 0.0442, + "step": 512 + }, + { + "epoch": 0.9889156626506024, + "grad_norm": 0.27371272444725037, + "learning_rate": 3.395861983154522e-05, + "loss": 0.0119, + "step": 513 + }, + { + "epoch": 0.9908433734939759, + "grad_norm": 0.4117226302623749, + "learning_rate": 3.392646629520688e-05, + "loss": 0.0455, + "step": 514 + }, + { + "epoch": 0.9927710843373494, + "grad_norm": 0.5098996758460999, + "learning_rate": 3.389424273207752e-05, + "loss": 0.0203, + "step": 515 + }, + { + "epoch": 0.9946987951807229, + "grad_norm": 0.5192157626152039, + "learning_rate": 3.386194930418767e-05, + "loss": 0.0329, + "step": 516 + }, + { + "epoch": 0.9966265060240964, + "grad_norm": 0.18757697939872742, + "learning_rate": 3.382958617391915e-05, + "loss": 0.0065, + "step": 517 + }, + { + "epoch": 0.9985542168674699, + "grad_norm": 0.3334413170814514, + "learning_rate": 3.3797153504004296e-05, + "loss": 0.0266, + "step": 518 + }, + { + "epoch": 1.0, + "grad_norm": 0.4152225852012634, + "learning_rate": 3.3764651457525095e-05, + "loss": 0.0169, + "step": 519 + }, + { + "epoch": 1.0019277108433735, + "grad_norm": 0.43535247445106506, + "learning_rate": 3.373208019791237e-05, + "loss": 0.0221, + "step": 520 + }, + { + "epoch": 1.003855421686747, + "grad_norm": 0.39292722940444946, + "learning_rate": 3.3699439888945e-05, + "loss": 0.0211, + "step": 521 + }, + { + "epoch": 1.0057831325301205, + "grad_norm": 0.19566713273525238, + "learning_rate": 3.366673069474904e-05, + "loss": 0.0069, + "step": 522 + }, + { + "epoch": 1.007710843373494, + "grad_norm": 0.5101853609085083, + "learning_rate": 3.3633952779796914e-05, + "loss": 0.0191, + "step": 523 + }, + { + "epoch": 1.0096385542168675, + "grad_norm": 0.999434769153595, + "learning_rate": 3.360110630890664e-05, + "loss": 0.0196, + "step": 524 + }, + { + "epoch": 1.011566265060241, + "grad_norm": 0.4646223783493042, + "learning_rate": 3.356819144724092e-05, + "loss": 0.0328, + "step": 525 + }, + { + "epoch": 1.0134939759036146, + "grad_norm": 0.3132480978965759, + "learning_rate": 3.3535208360306354e-05, + "loss": 0.0203, + "step": 526 + }, + { + "epoch": 1.0154216867469879, + "grad_norm": 0.3038032352924347, + "learning_rate": 3.350215721395261e-05, + "loss": 0.0122, + "step": 527 + }, + { + "epoch": 1.0173493975903614, + "grad_norm": 0.45082882046699524, + "learning_rate": 3.346903817437157e-05, + "loss": 0.0437, + "step": 528 + }, + { + "epoch": 1.0192771084337349, + "grad_norm": 0.26917046308517456, + "learning_rate": 3.343585140809651e-05, + "loss": 0.013, + "step": 529 + }, + { + "epoch": 1.0212048192771084, + "grad_norm": 0.23869264125823975, + "learning_rate": 3.3402597082001276e-05, + "loss": 0.008, + "step": 530 + }, + { + "epoch": 1.0231325301204819, + "grad_norm": 0.31315353512763977, + "learning_rate": 3.3369275363299394e-05, + "loss": 0.0078, + "step": 531 + }, + { + "epoch": 1.0250602409638554, + "grad_norm": 0.4780346751213074, + "learning_rate": 3.333588641954327e-05, + "loss": 0.0225, + "step": 532 + }, + { + "epoch": 1.026987951807229, + "grad_norm": 0.2920368015766144, + "learning_rate": 3.330243041862336e-05, + "loss": 0.0118, + "step": 533 + }, + { + "epoch": 1.0289156626506024, + "grad_norm": 0.543669581413269, + "learning_rate": 3.326890752876728e-05, + "loss": 0.0338, + "step": 534 + }, + { + "epoch": 1.030843373493976, + "grad_norm": 0.4288000464439392, + "learning_rate": 3.323531791853901e-05, + "loss": 0.0341, + "step": 535 + }, + { + "epoch": 1.0327710843373494, + "grad_norm": 0.26600322127342224, + "learning_rate": 3.3201661756838e-05, + "loss": 0.0184, + "step": 536 + }, + { + "epoch": 1.034698795180723, + "grad_norm": 0.290937602519989, + "learning_rate": 3.316793921289835e-05, + "loss": 0.0152, + "step": 537 + }, + { + "epoch": 1.0366265060240965, + "grad_norm": 0.7621443271636963, + "learning_rate": 3.313415045628795e-05, + "loss": 0.0326, + "step": 538 + }, + { + "epoch": 1.03855421686747, + "grad_norm": 0.5581283569335938, + "learning_rate": 3.3100295656907646e-05, + "loss": 0.0164, + "step": 539 + }, + { + "epoch": 1.0404819277108435, + "grad_norm": 0.20930901169776917, + "learning_rate": 3.306637498499034e-05, + "loss": 0.0091, + "step": 540 + }, + { + "epoch": 1.0424096385542168, + "grad_norm": 0.46212059259414673, + "learning_rate": 3.303238861110018e-05, + "loss": 0.0118, + "step": 541 + }, + { + "epoch": 1.0443373493975903, + "grad_norm": 0.38259151577949524, + "learning_rate": 3.299833670613168e-05, + "loss": 0.0081, + "step": 542 + }, + { + "epoch": 1.0462650602409638, + "grad_norm": 0.4888618290424347, + "learning_rate": 3.2964219441308865e-05, + "loss": 0.0138, + "step": 543 + }, + { + "epoch": 1.0481927710843373, + "grad_norm": 0.32103127241134644, + "learning_rate": 3.2930036988184425e-05, + "loss": 0.0171, + "step": 544 + }, + { + "epoch": 1.0501204819277108, + "grad_norm": 0.27787327766418457, + "learning_rate": 3.28957895186388e-05, + "loss": 0.0106, + "step": 545 + }, + { + "epoch": 1.0520481927710843, + "grad_norm": 0.35597777366638184, + "learning_rate": 3.2861477204879395e-05, + "loss": 0.0123, + "step": 546 + }, + { + "epoch": 1.0539759036144578, + "grad_norm": 0.3619804084300995, + "learning_rate": 3.2827100219439656e-05, + "loss": 0.0088, + "step": 547 + }, + { + "epoch": 1.0559036144578313, + "grad_norm": 0.2525513470172882, + "learning_rate": 3.279265873517822e-05, + "loss": 0.0179, + "step": 548 + }, + { + "epoch": 1.0578313253012048, + "grad_norm": 0.3910020887851715, + "learning_rate": 3.275815292527804e-05, + "loss": 0.0142, + "step": 549 + }, + { + "epoch": 1.0597590361445783, + "grad_norm": 0.30515050888061523, + "learning_rate": 3.2723582963245526e-05, + "loss": 0.0123, + "step": 550 + }, + { + "epoch": 1.0616867469879518, + "grad_norm": 0.21708644926548004, + "learning_rate": 3.2688949022909665e-05, + "loss": 0.0098, + "step": 551 + }, + { + "epoch": 1.0636144578313254, + "grad_norm": 0.23307719826698303, + "learning_rate": 3.265425127842114e-05, + "loss": 0.0097, + "step": 552 + }, + { + "epoch": 1.0655421686746989, + "grad_norm": 0.676654577255249, + "learning_rate": 3.261948990425147e-05, + "loss": 0.0227, + "step": 553 + }, + { + "epoch": 1.0674698795180724, + "grad_norm": 0.4593975841999054, + "learning_rate": 3.258466507519213e-05, + "loss": 0.047, + "step": 554 + }, + { + "epoch": 1.0693975903614459, + "grad_norm": 0.19405829906463623, + "learning_rate": 3.254977696635366e-05, + "loss": 0.0314, + "step": 555 + }, + { + "epoch": 1.0713253012048192, + "grad_norm": 0.14563389122486115, + "learning_rate": 3.2514825753164774e-05, + "loss": 0.0046, + "step": 556 + }, + { + "epoch": 1.0732530120481927, + "grad_norm": 0.2642340064048767, + "learning_rate": 3.247981161137153e-05, + "loss": 0.022, + "step": 557 + }, + { + "epoch": 1.0751807228915662, + "grad_norm": 0.17274761199951172, + "learning_rate": 3.2444734717036386e-05, + "loss": 0.0134, + "step": 558 + }, + { + "epoch": 1.0771084337349397, + "grad_norm": 0.44354626536369324, + "learning_rate": 3.240959524653735e-05, + "loss": 0.0211, + "step": 559 + }, + { + "epoch": 1.0790361445783132, + "grad_norm": 0.2806888818740845, + "learning_rate": 3.237439337656708e-05, + "loss": 0.0141, + "step": 560 + }, + { + "epoch": 1.0809638554216867, + "grad_norm": 0.21679501235485077, + "learning_rate": 3.2339129284131994e-05, + "loss": 0.019, + "step": 561 + }, + { + "epoch": 1.0828915662650602, + "grad_norm": 0.3040260076522827, + "learning_rate": 3.2303803146551386e-05, + "loss": 0.0249, + "step": 562 + }, + { + "epoch": 1.0848192771084337, + "grad_norm": 0.2793775200843811, + "learning_rate": 3.226841514145656e-05, + "loss": 0.0088, + "step": 563 + }, + { + "epoch": 1.0867469879518072, + "grad_norm": 0.149955615401268, + "learning_rate": 3.223296544678987e-05, + "loss": 0.0054, + "step": 564 + }, + { + "epoch": 1.0886746987951808, + "grad_norm": 0.22166767716407776, + "learning_rate": 3.219745424080389e-05, + "loss": 0.0109, + "step": 565 + }, + { + "epoch": 1.0906024096385543, + "grad_norm": 0.22399431467056274, + "learning_rate": 3.2161881702060476e-05, + "loss": 0.0106, + "step": 566 + }, + { + "epoch": 1.0925301204819278, + "grad_norm": 0.18537986278533936, + "learning_rate": 3.2126248009429905e-05, + "loss": 0.0077, + "step": 567 + }, + { + "epoch": 1.0944578313253013, + "grad_norm": 0.24511495232582092, + "learning_rate": 3.2090553342089935e-05, + "loss": 0.0093, + "step": 568 + }, + { + "epoch": 1.0963855421686748, + "grad_norm": 0.4766045808792114, + "learning_rate": 3.205479787952494e-05, + "loss": 0.036, + "step": 569 + }, + { + "epoch": 1.0983132530120483, + "grad_norm": 0.1425715535879135, + "learning_rate": 3.201898180152499e-05, + "loss": 0.0085, + "step": 570 + }, + { + "epoch": 1.1002409638554216, + "grad_norm": 0.1909666359424591, + "learning_rate": 3.1983105288184945e-05, + "loss": 0.0081, + "step": 571 + }, + { + "epoch": 1.102168674698795, + "grad_norm": 0.44077104330062866, + "learning_rate": 3.194716851990355e-05, + "loss": 0.017, + "step": 572 + }, + { + "epoch": 1.1040963855421686, + "grad_norm": 0.5757400989532471, + "learning_rate": 3.191117167738253e-05, + "loss": 0.021, + "step": 573 + }, + { + "epoch": 1.106024096385542, + "grad_norm": 0.1977701038122177, + "learning_rate": 3.1875114941625705e-05, + "loss": 0.0096, + "step": 574 + }, + { + "epoch": 1.1079518072289156, + "grad_norm": 0.3524581491947174, + "learning_rate": 3.1838998493938026e-05, + "loss": 0.0118, + "step": 575 + }, + { + "epoch": 1.1098795180722891, + "grad_norm": 0.3301331698894501, + "learning_rate": 3.180282251592472e-05, + "loss": 0.0094, + "step": 576 + }, + { + "epoch": 1.1118072289156626, + "grad_norm": 0.2774488925933838, + "learning_rate": 3.1766587189490336e-05, + "loss": 0.0131, + "step": 577 + }, + { + "epoch": 1.1137349397590361, + "grad_norm": 1.732595443725586, + "learning_rate": 3.173029269683785e-05, + "loss": 0.0445, + "step": 578 + }, + { + "epoch": 1.1156626506024097, + "grad_norm": 0.28746843338012695, + "learning_rate": 3.169393922046776e-05, + "loss": 0.0116, + "step": 579 + }, + { + "epoch": 1.1175903614457832, + "grad_norm": 0.2952995002269745, + "learning_rate": 3.165752694317713e-05, + "loss": 0.0116, + "step": 580 + }, + { + "epoch": 1.1195180722891567, + "grad_norm": 0.2938575744628906, + "learning_rate": 3.16210560480587e-05, + "loss": 0.013, + "step": 581 + }, + { + "epoch": 1.1214457831325302, + "grad_norm": 0.22283495962619781, + "learning_rate": 3.158452671849998e-05, + "loss": 0.0052, + "step": 582 + }, + { + "epoch": 1.1233734939759037, + "grad_norm": 0.6272858381271362, + "learning_rate": 3.154793913818226e-05, + "loss": 0.0182, + "step": 583 + }, + { + "epoch": 1.1253012048192772, + "grad_norm": 0.479753702878952, + "learning_rate": 3.1511293491079804e-05, + "loss": 0.0146, + "step": 584 + }, + { + "epoch": 1.1272289156626507, + "grad_norm": 0.31104400753974915, + "learning_rate": 3.1474589961458786e-05, + "loss": 0.0139, + "step": 585 + }, + { + "epoch": 1.129156626506024, + "grad_norm": 0.4932832419872284, + "learning_rate": 3.1437828733876477e-05, + "loss": 0.0236, + "step": 586 + }, + { + "epoch": 1.1310843373493975, + "grad_norm": 0.222808837890625, + "learning_rate": 3.140100999318025e-05, + "loss": 0.0084, + "step": 587 + }, + { + "epoch": 1.133012048192771, + "grad_norm": 0.4515356719493866, + "learning_rate": 3.136413392450668e-05, + "loss": 0.0215, + "step": 588 + }, + { + "epoch": 1.1349397590361445, + "grad_norm": 0.39302268624305725, + "learning_rate": 3.132720071328061e-05, + "loss": 0.0154, + "step": 589 + }, + { + "epoch": 1.136867469879518, + "grad_norm": 0.43382835388183594, + "learning_rate": 3.1290210545214205e-05, + "loss": 0.0088, + "step": 590 + }, + { + "epoch": 1.1387951807228915, + "grad_norm": 0.18707136809825897, + "learning_rate": 3.125316360630602e-05, + "loss": 0.0126, + "step": 591 + }, + { + "epoch": 1.140722891566265, + "grad_norm": 0.5688219666481018, + "learning_rate": 3.121606008284011e-05, + "loss": 0.0147, + "step": 592 + }, + { + "epoch": 1.1426506024096386, + "grad_norm": 0.3321833312511444, + "learning_rate": 3.1178900161385005e-05, + "loss": 0.0119, + "step": 593 + }, + { + "epoch": 1.144578313253012, + "grad_norm": 0.3738424777984619, + "learning_rate": 3.114168402879286e-05, + "loss": 0.0158, + "step": 594 + }, + { + "epoch": 1.1465060240963856, + "grad_norm": 0.2386978417634964, + "learning_rate": 3.110441187219846e-05, + "loss": 0.0107, + "step": 595 + }, + { + "epoch": 1.148433734939759, + "grad_norm": 0.2165699452161789, + "learning_rate": 3.10670838790183e-05, + "loss": 0.0079, + "step": 596 + }, + { + "epoch": 1.1503614457831326, + "grad_norm": 0.25952696800231934, + "learning_rate": 3.102970023694965e-05, + "loss": 0.0147, + "step": 597 + }, + { + "epoch": 1.152289156626506, + "grad_norm": 0.21448305249214172, + "learning_rate": 3.099226113396959e-05, + "loss": 0.0099, + "step": 598 + }, + { + "epoch": 1.1542168674698796, + "grad_norm": 0.37226060032844543, + "learning_rate": 3.095476675833405e-05, + "loss": 0.0214, + "step": 599 + }, + { + "epoch": 1.1561445783132531, + "grad_norm": 0.29637983441352844, + "learning_rate": 3.0917217298576955e-05, + "loss": 0.0118, + "step": 600 + }, + { + "epoch": 1.1580722891566264, + "grad_norm": 0.18535609543323517, + "learning_rate": 3.0879612943509154e-05, + "loss": 0.0086, + "step": 601 + }, + { + "epoch": 1.16, + "grad_norm": 0.25874125957489014, + "learning_rate": 3.0841953882217536e-05, + "loss": 0.0088, + "step": 602 + }, + { + "epoch": 1.1619277108433734, + "grad_norm": 0.46092745661735535, + "learning_rate": 3.08042403040641e-05, + "loss": 0.0241, + "step": 603 + }, + { + "epoch": 1.163855421686747, + "grad_norm": 0.27023249864578247, + "learning_rate": 3.076647239868494e-05, + "loss": 0.0154, + "step": 604 + }, + { + "epoch": 1.1657831325301204, + "grad_norm": 0.445157527923584, + "learning_rate": 3.072865035598933e-05, + "loss": 0.0197, + "step": 605 + }, + { + "epoch": 1.167710843373494, + "grad_norm": 0.18097272515296936, + "learning_rate": 3.06907743661588e-05, + "loss": 0.0093, + "step": 606 + }, + { + "epoch": 1.1696385542168675, + "grad_norm": 0.22469942271709442, + "learning_rate": 3.065284461964609e-05, + "loss": 0.0171, + "step": 607 + }, + { + "epoch": 1.171566265060241, + "grad_norm": 0.20190906524658203, + "learning_rate": 3.061486130717428e-05, + "loss": 0.008, + "step": 608 + }, + { + "epoch": 1.1734939759036145, + "grad_norm": 0.18294145166873932, + "learning_rate": 3.057682461973579e-05, + "loss": 0.0155, + "step": 609 + }, + { + "epoch": 1.175421686746988, + "grad_norm": 0.34203943610191345, + "learning_rate": 3.053873474859143e-05, + "loss": 0.0212, + "step": 610 + }, + { + "epoch": 1.1773493975903615, + "grad_norm": 0.49073582887649536, + "learning_rate": 3.050059188526942e-05, + "loss": 0.019, + "step": 611 + }, + { + "epoch": 1.179277108433735, + "grad_norm": 0.3537680506706238, + "learning_rate": 3.046239622156446e-05, + "loss": 0.0147, + "step": 612 + }, + { + "epoch": 1.1812048192771085, + "grad_norm": 0.2584632635116577, + "learning_rate": 3.042414794953674e-05, + "loss": 0.0088, + "step": 613 + }, + { + "epoch": 1.1831325301204818, + "grad_norm": 0.3529360890388489, + "learning_rate": 3.0385847261510975e-05, + "loss": 0.0187, + "step": 614 + }, + { + "epoch": 1.1850602409638555, + "grad_norm": 0.3331570327281952, + "learning_rate": 3.0347494350075465e-05, + "loss": 0.0124, + "step": 615 + }, + { + "epoch": 1.1869879518072288, + "grad_norm": 0.2223527580499649, + "learning_rate": 3.0309089408081074e-05, + "loss": 0.01, + "step": 616 + }, + { + "epoch": 1.1889156626506023, + "grad_norm": 0.21985746920108795, + "learning_rate": 3.027063262864032e-05, + "loss": 0.0087, + "step": 617 + }, + { + "epoch": 1.1908433734939758, + "grad_norm": 0.2989653944969177, + "learning_rate": 3.023212420512637e-05, + "loss": 0.0137, + "step": 618 + }, + { + "epoch": 1.1927710843373494, + "grad_norm": 0.17423275113105774, + "learning_rate": 3.0193564331172074e-05, + "loss": 0.0056, + "step": 619 + }, + { + "epoch": 1.1946987951807229, + "grad_norm": 1.0992127656936646, + "learning_rate": 3.0154953200668976e-05, + "loss": 0.0274, + "step": 620 + }, + { + "epoch": 1.1966265060240964, + "grad_norm": 0.21641989052295685, + "learning_rate": 3.011629100776638e-05, + "loss": 0.0151, + "step": 621 + }, + { + "epoch": 1.1985542168674699, + "grad_norm": 0.4558199644088745, + "learning_rate": 3.007757794687033e-05, + "loss": 0.0424, + "step": 622 + }, + { + "epoch": 1.2004819277108434, + "grad_norm": 0.42380189895629883, + "learning_rate": 3.003881421264266e-05, + "loss": 0.0079, + "step": 623 + }, + { + "epoch": 1.202409638554217, + "grad_norm": 0.28791171312332153, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0142, + "step": 624 + }, + { + "epoch": 1.2043373493975904, + "grad_norm": 0.3906581997871399, + "learning_rate": 2.996113550411281e-05, + "loss": 0.0251, + "step": 625 + }, + { + "epoch": 1.206265060240964, + "grad_norm": 0.47848746180534363, + "learning_rate": 2.9922220920404375e-05, + "loss": 0.0137, + "step": 626 + }, + { + "epoch": 1.2081927710843374, + "grad_norm": 0.22666941583156586, + "learning_rate": 2.9883256444549862e-05, + "loss": 0.0105, + "step": 627 + }, + { + "epoch": 1.210120481927711, + "grad_norm": 0.18968136608600616, + "learning_rate": 2.984424227247529e-05, + "loss": 0.0089, + "step": 628 + }, + { + "epoch": 1.2120481927710842, + "grad_norm": 0.28732606768608093, + "learning_rate": 2.980517860035656e-05, + "loss": 0.0253, + "step": 629 + }, + { + "epoch": 1.213975903614458, + "grad_norm": 0.21131543815135956, + "learning_rate": 2.9766065624618518e-05, + "loss": 0.0134, + "step": 630 + }, + { + "epoch": 1.2159036144578312, + "grad_norm": 0.7594877481460571, + "learning_rate": 2.972690354193388e-05, + "loss": 0.0157, + "step": 631 + }, + { + "epoch": 1.2178313253012047, + "grad_norm": 0.730291485786438, + "learning_rate": 2.96876925492223e-05, + "loss": 0.0204, + "step": 632 + }, + { + "epoch": 1.2197590361445783, + "grad_norm": 0.20333674550056458, + "learning_rate": 2.9648432843649382e-05, + "loss": 0.0114, + "step": 633 + }, + { + "epoch": 1.2216867469879518, + "grad_norm": 0.5680793523788452, + "learning_rate": 2.960912462262566e-05, + "loss": 0.0146, + "step": 634 + }, + { + "epoch": 1.2236144578313253, + "grad_norm": 0.4591079354286194, + "learning_rate": 2.9569768083805618e-05, + "loss": 0.0112, + "step": 635 + }, + { + "epoch": 1.2255421686746988, + "grad_norm": 0.3793511390686035, + "learning_rate": 2.953036342508671e-05, + "loss": 0.0377, + "step": 636 + }, + { + "epoch": 1.2274698795180723, + "grad_norm": 1.118723750114441, + "learning_rate": 2.9490910844608346e-05, + "loss": 0.0432, + "step": 637 + }, + { + "epoch": 1.2293975903614458, + "grad_norm": 0.36990776658058167, + "learning_rate": 2.9451410540750887e-05, + "loss": 0.0203, + "step": 638 + }, + { + "epoch": 1.2313253012048193, + "grad_norm": 0.930397629737854, + "learning_rate": 2.94118627121347e-05, + "loss": 0.0311, + "step": 639 + }, + { + "epoch": 1.2332530120481928, + "grad_norm": 0.2347625195980072, + "learning_rate": 2.9372267557619075e-05, + "loss": 0.0168, + "step": 640 + }, + { + "epoch": 1.2351807228915663, + "grad_norm": 0.3720332384109497, + "learning_rate": 2.933262527630131e-05, + "loss": 0.0136, + "step": 641 + }, + { + "epoch": 1.2371084337349398, + "grad_norm": 0.4871984124183655, + "learning_rate": 2.929293606751565e-05, + "loss": 0.0339, + "step": 642 + }, + { + "epoch": 1.2390361445783133, + "grad_norm": 0.35853689908981323, + "learning_rate": 2.9253200130832322e-05, + "loss": 0.0095, + "step": 643 + }, + { + "epoch": 1.2409638554216866, + "grad_norm": 0.42003703117370605, + "learning_rate": 2.92134176660565e-05, + "loss": 0.0142, + "step": 644 + }, + { + "epoch": 1.2428915662650604, + "grad_norm": 0.3854500651359558, + "learning_rate": 2.9173588873227338e-05, + "loss": 0.0209, + "step": 645 + }, + { + "epoch": 1.2448192771084337, + "grad_norm": 0.24665917456150055, + "learning_rate": 2.913371395261691e-05, + "loss": 0.0087, + "step": 646 + }, + { + "epoch": 1.2467469879518072, + "grad_norm": 0.41571593284606934, + "learning_rate": 2.9093793104729268e-05, + "loss": 0.0164, + "step": 647 + }, + { + "epoch": 1.2486746987951807, + "grad_norm": 0.4597891569137573, + "learning_rate": 2.9053826530299377e-05, + "loss": 0.0138, + "step": 648 + }, + { + "epoch": 1.2506024096385542, + "grad_norm": 0.43345385789871216, + "learning_rate": 2.901381443029215e-05, + "loss": 0.0353, + "step": 649 + }, + { + "epoch": 1.2525301204819277, + "grad_norm": 0.3706768751144409, + "learning_rate": 2.897375700590141e-05, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 1.2544578313253012, + "grad_norm": 0.30305296182632446, + "learning_rate": 2.8933654458548873e-05, + "loss": 0.0123, + "step": 651 + }, + { + "epoch": 1.2563855421686747, + "grad_norm": 0.2042127549648285, + "learning_rate": 2.8893506989883167e-05, + "loss": 0.0099, + "step": 652 + }, + { + "epoch": 1.2583132530120482, + "grad_norm": 0.20524422824382782, + "learning_rate": 2.8853314801778784e-05, + "loss": 0.0097, + "step": 653 + }, + { + "epoch": 1.2602409638554217, + "grad_norm": 0.2351921945810318, + "learning_rate": 2.8813078096335093e-05, + "loss": 0.0091, + "step": 654 + }, + { + "epoch": 1.2621686746987952, + "grad_norm": 0.34547340869903564, + "learning_rate": 2.87727970758753e-05, + "loss": 0.0088, + "step": 655 + }, + { + "epoch": 1.2640963855421687, + "grad_norm": 0.35163217782974243, + "learning_rate": 2.8732471942945443e-05, + "loss": 0.0145, + "step": 656 + }, + { + "epoch": 1.266024096385542, + "grad_norm": 1.715137243270874, + "learning_rate": 2.8692102900313378e-05, + "loss": 0.0198, + "step": 657 + }, + { + "epoch": 1.2679518072289158, + "grad_norm": 0.2860178053379059, + "learning_rate": 2.8651690150967748e-05, + "loss": 0.0085, + "step": 658 + }, + { + "epoch": 1.269879518072289, + "grad_norm": 0.21175967156887054, + "learning_rate": 2.8611233898116967e-05, + "loss": 0.0071, + "step": 659 + }, + { + "epoch": 1.2718072289156628, + "grad_norm": 0.33726972341537476, + "learning_rate": 2.85707343451882e-05, + "loss": 0.012, + "step": 660 + }, + { + "epoch": 1.273734939759036, + "grad_norm": 0.2138456553220749, + "learning_rate": 2.853019169582635e-05, + "loss": 0.0092, + "step": 661 + }, + { + "epoch": 1.2756626506024096, + "grad_norm": 0.2304934412240982, + "learning_rate": 2.8489606153892997e-05, + "loss": 0.0144, + "step": 662 + }, + { + "epoch": 1.277590361445783, + "grad_norm": 0.2691061794757843, + "learning_rate": 2.8448977923465425e-05, + "loss": 0.0121, + "step": 663 + }, + { + "epoch": 1.2795180722891566, + "grad_norm": 0.35254305601119995, + "learning_rate": 2.840830720883555e-05, + "loss": 0.0125, + "step": 664 + }, + { + "epoch": 1.28144578313253, + "grad_norm": 0.36552608013153076, + "learning_rate": 2.836759421450893e-05, + "loss": 0.021, + "step": 665 + }, + { + "epoch": 1.2833734939759036, + "grad_norm": 0.37177154421806335, + "learning_rate": 2.83268391452037e-05, + "loss": 0.0216, + "step": 666 + }, + { + "epoch": 1.2853012048192771, + "grad_norm": 0.20932547748088837, + "learning_rate": 2.828604220584958e-05, + "loss": 0.0077, + "step": 667 + }, + { + "epoch": 1.2872289156626506, + "grad_norm": 0.5158557295799255, + "learning_rate": 2.824520360158681e-05, + "loss": 0.0394, + "step": 668 + }, + { + "epoch": 1.2891566265060241, + "grad_norm": 0.22623969614505768, + "learning_rate": 2.820432353776515e-05, + "loss": 0.0087, + "step": 669 + }, + { + "epoch": 1.2910843373493976, + "grad_norm": 0.2996046245098114, + "learning_rate": 2.8163402219942822e-05, + "loss": 0.01, + "step": 670 + }, + { + "epoch": 1.2930120481927712, + "grad_norm": 0.24957989156246185, + "learning_rate": 2.8122439853885488e-05, + "loss": 0.0127, + "step": 671 + }, + { + "epoch": 1.2949397590361444, + "grad_norm": 0.2636559307575226, + "learning_rate": 2.8081436645565216e-05, + "loss": 0.0128, + "step": 672 + }, + { + "epoch": 1.2968674698795182, + "grad_norm": 0.3531591296195984, + "learning_rate": 2.804039280115944e-05, + "loss": 0.0199, + "step": 673 + }, + { + "epoch": 1.2987951807228915, + "grad_norm": 0.3682299852371216, + "learning_rate": 2.7999308527049927e-05, + "loss": 0.0088, + "step": 674 + }, + { + "epoch": 1.3007228915662652, + "grad_norm": 0.19555217027664185, + "learning_rate": 2.795818402982174e-05, + "loss": 0.0084, + "step": 675 + }, + { + "epoch": 1.3026506024096385, + "grad_norm": 0.2864912450313568, + "learning_rate": 2.7917019516262186e-05, + "loss": 0.0154, + "step": 676 + }, + { + "epoch": 1.304578313253012, + "grad_norm": 0.2211237996816635, + "learning_rate": 2.78758151933598e-05, + "loss": 0.0078, + "step": 677 + }, + { + "epoch": 1.3065060240963855, + "grad_norm": 0.13646945357322693, + "learning_rate": 2.7834571268303294e-05, + "loss": 0.0058, + "step": 678 + }, + { + "epoch": 1.308433734939759, + "grad_norm": 0.16530285775661469, + "learning_rate": 2.779328794848049e-05, + "loss": 0.007, + "step": 679 + }, + { + "epoch": 1.3103614457831325, + "grad_norm": 0.2145693302154541, + "learning_rate": 2.7751965441477325e-05, + "loss": 0.0203, + "step": 680 + }, + { + "epoch": 1.312289156626506, + "grad_norm": 0.24273739755153656, + "learning_rate": 2.771060395507677e-05, + "loss": 0.0106, + "step": 681 + }, + { + "epoch": 1.3142168674698795, + "grad_norm": 0.20430618524551392, + "learning_rate": 2.7669203697257794e-05, + "loss": 0.0122, + "step": 682 + }, + { + "epoch": 1.316144578313253, + "grad_norm": 0.2502615749835968, + "learning_rate": 2.7627764876194335e-05, + "loss": 0.0101, + "step": 683 + }, + { + "epoch": 1.3180722891566266, + "grad_norm": 0.287239670753479, + "learning_rate": 2.7586287700254214e-05, + "loss": 0.0203, + "step": 684 + }, + { + "epoch": 1.32, + "grad_norm": 0.16239754855632782, + "learning_rate": 2.7544772377998147e-05, + "loss": 0.0084, + "step": 685 + }, + { + "epoch": 1.3219277108433736, + "grad_norm": 0.27174142003059387, + "learning_rate": 2.7503219118178636e-05, + "loss": 0.008, + "step": 686 + }, + { + "epoch": 1.3238554216867469, + "grad_norm": 0.12878240644931793, + "learning_rate": 2.7461628129738954e-05, + "loss": 0.0053, + "step": 687 + }, + { + "epoch": 1.3257831325301206, + "grad_norm": 0.16112515330314636, + "learning_rate": 2.7419999621812086e-05, + "loss": 0.0059, + "step": 688 + }, + { + "epoch": 1.3277108433734939, + "grad_norm": 0.2398834228515625, + "learning_rate": 2.7378333803719672e-05, + "loss": 0.0095, + "step": 689 + }, + { + "epoch": 1.3296385542168676, + "grad_norm": 0.18516193330287933, + "learning_rate": 2.733663088497097e-05, + "loss": 0.0071, + "step": 690 + }, + { + "epoch": 1.331566265060241, + "grad_norm": 0.2974924147129059, + "learning_rate": 2.7294891075261785e-05, + "loss": 0.0227, + "step": 691 + }, + { + "epoch": 1.3334939759036144, + "grad_norm": 0.12931054830551147, + "learning_rate": 2.7253114584473418e-05, + "loss": 0.0039, + "step": 692 + }, + { + "epoch": 1.335421686746988, + "grad_norm": 0.16319474577903748, + "learning_rate": 2.7211301622671623e-05, + "loss": 0.008, + "step": 693 + }, + { + "epoch": 1.3373493975903614, + "grad_norm": 0.27622169256210327, + "learning_rate": 2.7169452400105533e-05, + "loss": 0.0238, + "step": 694 + }, + { + "epoch": 1.339277108433735, + "grad_norm": 0.45309779047966003, + "learning_rate": 2.712756712720663e-05, + "loss": 0.0439, + "step": 695 + }, + { + "epoch": 1.3412048192771084, + "grad_norm": 0.2469855099916458, + "learning_rate": 2.708564601458765e-05, + "loss": 0.0085, + "step": 696 + }, + { + "epoch": 1.343132530120482, + "grad_norm": 0.4245856702327728, + "learning_rate": 2.7043689273041535e-05, + "loss": 0.0097, + "step": 697 + }, + { + "epoch": 1.3450602409638555, + "grad_norm": 0.26796087622642517, + "learning_rate": 2.7001697113540414e-05, + "loss": 0.0119, + "step": 698 + }, + { + "epoch": 1.346987951807229, + "grad_norm": 0.3569283187389374, + "learning_rate": 2.6959669747234482e-05, + "loss": 0.0096, + "step": 699 + }, + { + "epoch": 1.3489156626506025, + "grad_norm": 0.7038524150848389, + "learning_rate": 2.6917607385450973e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 1.350843373493976, + "grad_norm": 0.23568563163280487, + "learning_rate": 2.687551023969308e-05, + "loss": 0.0112, + "step": 701 + }, + { + "epoch": 1.3527710843373493, + "grad_norm": 0.20338499546051025, + "learning_rate": 2.6833378521638935e-05, + "loss": 0.0092, + "step": 702 + }, + { + "epoch": 1.354698795180723, + "grad_norm": 4.22187614440918, + "learning_rate": 2.679121244314046e-05, + "loss": 0.0314, + "step": 703 + }, + { + "epoch": 1.3566265060240963, + "grad_norm": 0.2542206048965454, + "learning_rate": 2.674901221622239e-05, + "loss": 0.0158, + "step": 704 + }, + { + "epoch": 1.3585542168674698, + "grad_norm": 0.49705010652542114, + "learning_rate": 2.670677805308116e-05, + "loss": 0.0162, + "step": 705 + }, + { + "epoch": 1.3604819277108433, + "grad_norm": 0.17502115666866302, + "learning_rate": 2.666451016608383e-05, + "loss": 0.0074, + "step": 706 + }, + { + "epoch": 1.3624096385542168, + "grad_norm": 0.21738742291927338, + "learning_rate": 2.6622208767767075e-05, + "loss": 0.0135, + "step": 707 + }, + { + "epoch": 1.3643373493975903, + "grad_norm": 0.3309847414493561, + "learning_rate": 2.6579874070836032e-05, + "loss": 0.0107, + "step": 708 + }, + { + "epoch": 1.3662650602409638, + "grad_norm": 0.10706827789545059, + "learning_rate": 2.6537506288163303e-05, + "loss": 0.0043, + "step": 709 + }, + { + "epoch": 1.3681927710843373, + "grad_norm": 0.173640176653862, + "learning_rate": 2.6495105632787835e-05, + "loss": 0.0092, + "step": 710 + }, + { + "epoch": 1.3701204819277109, + "grad_norm": 0.2636397182941437, + "learning_rate": 2.6452672317913893e-05, + "loss": 0.0097, + "step": 711 + }, + { + "epoch": 1.3720481927710844, + "grad_norm": 0.28485360741615295, + "learning_rate": 2.6410206556909943e-05, + "loss": 0.0193, + "step": 712 + }, + { + "epoch": 1.3739759036144579, + "grad_norm": 0.23210027813911438, + "learning_rate": 2.636770856330761e-05, + "loss": 0.0229, + "step": 713 + }, + { + "epoch": 1.3759036144578314, + "grad_norm": 0.13388316333293915, + "learning_rate": 2.6325178550800596e-05, + "loss": 0.004, + "step": 714 + }, + { + "epoch": 1.377831325301205, + "grad_norm": 0.5131422877311707, + "learning_rate": 2.6282616733243603e-05, + "loss": 0.0137, + "step": 715 + }, + { + "epoch": 1.3797590361445784, + "grad_norm": 0.3243267834186554, + "learning_rate": 2.6240023324651258e-05, + "loss": 0.0153, + "step": 716 + }, + { + "epoch": 1.3816867469879517, + "grad_norm": 0.1440611034631729, + "learning_rate": 2.619739853919704e-05, + "loss": 0.0031, + "step": 717 + }, + { + "epoch": 1.3836144578313254, + "grad_norm": 0.30346596240997314, + "learning_rate": 2.6154742591212196e-05, + "loss": 0.0109, + "step": 718 + }, + { + "epoch": 1.3855421686746987, + "grad_norm": 0.19109240174293518, + "learning_rate": 2.611205569518468e-05, + "loss": 0.0094, + "step": 719 + }, + { + "epoch": 1.3874698795180722, + "grad_norm": 0.28636518120765686, + "learning_rate": 2.6069338065758056e-05, + "loss": 0.0123, + "step": 720 + }, + { + "epoch": 1.3893975903614457, + "grad_norm": 0.28083911538124084, + "learning_rate": 2.6026589917730416e-05, + "loss": 0.0104, + "step": 721 + }, + { + "epoch": 1.3913253012048192, + "grad_norm": 0.36553966999053955, + "learning_rate": 2.5983811466053327e-05, + "loss": 0.0143, + "step": 722 + }, + { + "epoch": 1.3932530120481927, + "grad_norm": 0.23317205905914307, + "learning_rate": 2.5941002925830708e-05, + "loss": 0.011, + "step": 723 + }, + { + "epoch": 1.3951807228915662, + "grad_norm": 0.3825171887874603, + "learning_rate": 2.589816451231781e-05, + "loss": 0.0098, + "step": 724 + }, + { + "epoch": 1.3971084337349398, + "grad_norm": 0.19916608929634094, + "learning_rate": 2.585529644092006e-05, + "loss": 0.0094, + "step": 725 + }, + { + "epoch": 1.3990361445783133, + "grad_norm": 0.19990523159503937, + "learning_rate": 2.5812398927192027e-05, + "loss": 0.0128, + "step": 726 + }, + { + "epoch": 1.4009638554216868, + "grad_norm": 0.34662899374961853, + "learning_rate": 2.5769472186836347e-05, + "loss": 0.0091, + "step": 727 + }, + { + "epoch": 1.4028915662650603, + "grad_norm": 0.23481112718582153, + "learning_rate": 2.5726516435702583e-05, + "loss": 0.0154, + "step": 728 + }, + { + "epoch": 1.4048192771084338, + "grad_norm": 0.1846667379140854, + "learning_rate": 2.5683531889786194e-05, + "loss": 0.0088, + "step": 729 + }, + { + "epoch": 1.4067469879518073, + "grad_norm": 0.16717663407325745, + "learning_rate": 2.564051876522742e-05, + "loss": 0.0083, + "step": 730 + }, + { + "epoch": 1.4086746987951808, + "grad_norm": 0.4116475284099579, + "learning_rate": 2.5597477278310202e-05, + "loss": 0.0179, + "step": 731 + }, + { + "epoch": 1.410602409638554, + "grad_norm": 0.171807661652565, + "learning_rate": 2.5554407645461115e-05, + "loss": 0.0063, + "step": 732 + }, + { + "epoch": 1.4125301204819278, + "grad_norm": 0.1954439878463745, + "learning_rate": 2.5511310083248243e-05, + "loss": 0.017, + "step": 733 + }, + { + "epoch": 1.4144578313253011, + "grad_norm": 0.37158989906311035, + "learning_rate": 2.5468184808380104e-05, + "loss": 0.0173, + "step": 734 + }, + { + "epoch": 1.4163855421686746, + "grad_norm": 0.2001633644104004, + "learning_rate": 2.542503203770458e-05, + "loss": 0.0165, + "step": 735 + }, + { + "epoch": 1.4183132530120481, + "grad_norm": 0.45673373341560364, + "learning_rate": 2.53818519882078e-05, + "loss": 0.0185, + "step": 736 + }, + { + "epoch": 1.4202409638554216, + "grad_norm": 0.3838701546192169, + "learning_rate": 2.5338644877013067e-05, + "loss": 0.0134, + "step": 737 + }, + { + "epoch": 1.4221686746987952, + "grad_norm": 0.32032477855682373, + "learning_rate": 2.5295410921379745e-05, + "loss": 0.0143, + "step": 738 + }, + { + "epoch": 1.4240963855421687, + "grad_norm": 0.4594039022922516, + "learning_rate": 2.52521503387022e-05, + "loss": 0.0193, + "step": 739 + }, + { + "epoch": 1.4260240963855422, + "grad_norm": 0.3889620900154114, + "learning_rate": 2.5208863346508667e-05, + "loss": 0.0114, + "step": 740 + }, + { + "epoch": 1.4279518072289157, + "grad_norm": 0.33153319358825684, + "learning_rate": 2.5165550162460203e-05, + "loss": 0.0102, + "step": 741 + }, + { + "epoch": 1.4298795180722892, + "grad_norm": 0.7269518375396729, + "learning_rate": 2.5122211004349536e-05, + "loss": 0.0215, + "step": 742 + }, + { + "epoch": 1.4318072289156627, + "grad_norm": 0.31653261184692383, + "learning_rate": 2.5078846090100023e-05, + "loss": 0.0115, + "step": 743 + }, + { + "epoch": 1.4337349397590362, + "grad_norm": 0.20620353519916534, + "learning_rate": 2.5035455637764518e-05, + "loss": 0.0153, + "step": 744 + }, + { + "epoch": 1.4356626506024097, + "grad_norm": 0.17266008257865906, + "learning_rate": 2.4992039865524297e-05, + "loss": 0.0069, + "step": 745 + }, + { + "epoch": 1.4375903614457832, + "grad_norm": 0.24760811030864716, + "learning_rate": 2.494859899168795e-05, + "loss": 0.0108, + "step": 746 + }, + { + "epoch": 1.4395180722891565, + "grad_norm": 0.2584865391254425, + "learning_rate": 2.4905133234690282e-05, + "loss": 0.0095, + "step": 747 + }, + { + "epoch": 1.4414457831325302, + "grad_norm": 0.48847514390945435, + "learning_rate": 2.486164281309122e-05, + "loss": 0.0181, + "step": 748 + }, + { + "epoch": 1.4433734939759035, + "grad_norm": 0.42942047119140625, + "learning_rate": 2.4818127945574717e-05, + "loss": 0.025, + "step": 749 + }, + { + "epoch": 1.445301204819277, + "grad_norm": 0.23713800311088562, + "learning_rate": 2.4774588850947648e-05, + "loss": 0.0085, + "step": 750 + }, + { + "epoch": 1.4472289156626506, + "grad_norm": 0.8797569870948792, + "learning_rate": 2.473102574813871e-05, + "loss": 0.0097, + "step": 751 + }, + { + "epoch": 1.449156626506024, + "grad_norm": 0.2744862735271454, + "learning_rate": 2.4687438856197302e-05, + "loss": 0.0122, + "step": 752 + }, + { + "epoch": 1.4510843373493976, + "grad_norm": 0.12747010588645935, + "learning_rate": 2.4643828394292478e-05, + "loss": 0.0056, + "step": 753 + }, + { + "epoch": 1.453012048192771, + "grad_norm": 0.37376829981803894, + "learning_rate": 2.4600194581711775e-05, + "loss": 0.0052, + "step": 754 + }, + { + "epoch": 1.4549397590361446, + "grad_norm": 0.2536911368370056, + "learning_rate": 2.4556537637860176e-05, + "loss": 0.0113, + "step": 755 + }, + { + "epoch": 1.456867469879518, + "grad_norm": 0.25950780510902405, + "learning_rate": 2.451285778225894e-05, + "loss": 0.0099, + "step": 756 + }, + { + "epoch": 1.4587951807228916, + "grad_norm": 0.19535955786705017, + "learning_rate": 2.4469155234544565e-05, + "loss": 0.0069, + "step": 757 + }, + { + "epoch": 1.4607228915662651, + "grad_norm": 0.22816115617752075, + "learning_rate": 2.442543021446764e-05, + "loss": 0.0088, + "step": 758 + }, + { + "epoch": 1.4626506024096386, + "grad_norm": 0.3363986313343048, + "learning_rate": 2.4381682941891755e-05, + "loss": 0.0182, + "step": 759 + }, + { + "epoch": 1.464578313253012, + "grad_norm": 0.21492891013622284, + "learning_rate": 2.4337913636792382e-05, + "loss": 0.0069, + "step": 760 + }, + { + "epoch": 1.4665060240963856, + "grad_norm": 0.6070862412452698, + "learning_rate": 2.429412251925579e-05, + "loss": 0.0406, + "step": 761 + }, + { + "epoch": 1.468433734939759, + "grad_norm": 2.6469690799713135, + "learning_rate": 2.425030980947793e-05, + "loss": 0.0205, + "step": 762 + }, + { + "epoch": 1.4703614457831327, + "grad_norm": 0.30909740924835205, + "learning_rate": 2.420647572776332e-05, + "loss": 0.0136, + "step": 763 + }, + { + "epoch": 1.472289156626506, + "grad_norm": 0.6639553904533386, + "learning_rate": 2.416262049452395e-05, + "loss": 0.011, + "step": 764 + }, + { + "epoch": 1.4742168674698795, + "grad_norm": 0.2919616997241974, + "learning_rate": 2.4118744330278147e-05, + "loss": 0.0131, + "step": 765 + }, + { + "epoch": 1.476144578313253, + "grad_norm": 0.5232429504394531, + "learning_rate": 2.4074847455649523e-05, + "loss": 0.0138, + "step": 766 + }, + { + "epoch": 1.4780722891566265, + "grad_norm": 5.630630970001221, + "learning_rate": 2.403093009136579e-05, + "loss": 0.0264, + "step": 767 + }, + { + "epoch": 1.48, + "grad_norm": 0.33234721422195435, + "learning_rate": 2.3986992458257707e-05, + "loss": 0.0111, + "step": 768 + }, + { + "epoch": 1.4819277108433735, + "grad_norm": 0.28444772958755493, + "learning_rate": 2.3943034777257945e-05, + "loss": 0.0144, + "step": 769 + }, + { + "epoch": 1.483855421686747, + "grad_norm": 0.16229979693889618, + "learning_rate": 2.38990572694e-05, + "loss": 0.0062, + "step": 770 + }, + { + "epoch": 1.4857831325301205, + "grad_norm": 0.27474716305732727, + "learning_rate": 2.385506015581704e-05, + "loss": 0.0172, + "step": 771 + }, + { + "epoch": 1.487710843373494, + "grad_norm": 0.246526300907135, + "learning_rate": 2.381104365774083e-05, + "loss": 0.012, + "step": 772 + }, + { + "epoch": 1.4896385542168675, + "grad_norm": 0.282047837972641, + "learning_rate": 2.37670079965006e-05, + "loss": 0.0116, + "step": 773 + }, + { + "epoch": 1.491566265060241, + "grad_norm": 0.2878139317035675, + "learning_rate": 2.3722953393521944e-05, + "loss": 0.0147, + "step": 774 + }, + { + "epoch": 1.4934939759036143, + "grad_norm": 0.5586277842521667, + "learning_rate": 2.367888007032571e-05, + "loss": 0.0111, + "step": 775 + }, + { + "epoch": 1.495421686746988, + "grad_norm": 0.562160313129425, + "learning_rate": 2.3634788248526846e-05, + "loss": 0.0061, + "step": 776 + }, + { + "epoch": 1.4973493975903613, + "grad_norm": 0.3452005982398987, + "learning_rate": 2.3590678149833356e-05, + "loss": 0.0205, + "step": 777 + }, + { + "epoch": 1.499277108433735, + "grad_norm": 0.7757686376571655, + "learning_rate": 2.3546549996045114e-05, + "loss": 0.0273, + "step": 778 + }, + { + "epoch": 1.5012048192771084, + "grad_norm": 0.19530551135540009, + "learning_rate": 2.3502404009052812e-05, + "loss": 0.0083, + "step": 779 + }, + { + "epoch": 1.503132530120482, + "grad_norm": 0.2586531639099121, + "learning_rate": 2.3458240410836775e-05, + "loss": 0.0122, + "step": 780 + }, + { + "epoch": 1.5050602409638554, + "grad_norm": 0.30063286423683167, + "learning_rate": 2.3414059423465924e-05, + "loss": 0.0083, + "step": 781 + }, + { + "epoch": 1.5069879518072289, + "grad_norm": 0.18663185834884644, + "learning_rate": 2.3369861269096575e-05, + "loss": 0.0104, + "step": 782 + }, + { + "epoch": 1.5089156626506024, + "grad_norm": 0.4405941069126129, + "learning_rate": 2.3325646169971416e-05, + "loss": 0.0264, + "step": 783 + }, + { + "epoch": 1.510843373493976, + "grad_norm": 0.2947913110256195, + "learning_rate": 2.3281414348418294e-05, + "loss": 0.0107, + "step": 784 + }, + { + "epoch": 1.5127710843373494, + "grad_norm": 0.23813778162002563, + "learning_rate": 2.3237166026849158e-05, + "loss": 0.0084, + "step": 785 + }, + { + "epoch": 1.514698795180723, + "grad_norm": 0.33380329608917236, + "learning_rate": 2.3192901427758932e-05, + "loss": 0.0111, + "step": 786 + }, + { + "epoch": 1.5166265060240964, + "grad_norm": 0.3736988306045532, + "learning_rate": 2.314862077372438e-05, + "loss": 0.0135, + "step": 787 + }, + { + "epoch": 1.5185542168674697, + "grad_norm": 0.3785395920276642, + "learning_rate": 2.3104324287402996e-05, + "loss": 0.0265, + "step": 788 + }, + { + "epoch": 1.5204819277108435, + "grad_norm": 0.3359154462814331, + "learning_rate": 2.3060012191531885e-05, + "loss": 0.0127, + "step": 789 + }, + { + "epoch": 1.5224096385542167, + "grad_norm": 0.720753014087677, + "learning_rate": 2.301568470892664e-05, + "loss": 0.0134, + "step": 790 + }, + { + "epoch": 1.5243373493975905, + "grad_norm": 0.36473193764686584, + "learning_rate": 2.297134206248024e-05, + "loss": 0.0318, + "step": 791 + }, + { + "epoch": 1.5262650602409638, + "grad_norm": 0.29987087845802307, + "learning_rate": 2.2926984475161884e-05, + "loss": 0.008, + "step": 792 + }, + { + "epoch": 1.5281927710843375, + "grad_norm": 0.2883112132549286, + "learning_rate": 2.2882612170015914e-05, + "loss": 0.0125, + "step": 793 + }, + { + "epoch": 1.5301204819277108, + "grad_norm": 0.28983229398727417, + "learning_rate": 2.2838225370160682e-05, + "loss": 0.0155, + "step": 794 + }, + { + "epoch": 1.5320481927710843, + "grad_norm": 0.47236886620521545, + "learning_rate": 2.2793824298787414e-05, + "loss": 0.0132, + "step": 795 + }, + { + "epoch": 1.5339759036144578, + "grad_norm": 0.8328865170478821, + "learning_rate": 2.2749409179159104e-05, + "loss": 0.026, + "step": 796 + }, + { + "epoch": 1.5359036144578313, + "grad_norm": 0.3129172623157501, + "learning_rate": 2.2704980234609396e-05, + "loss": 0.0099, + "step": 797 + }, + { + "epoch": 1.5378313253012048, + "grad_norm": 0.22284500300884247, + "learning_rate": 2.2660537688541416e-05, + "loss": 0.009, + "step": 798 + }, + { + "epoch": 1.5397590361445783, + "grad_norm": 0.3346405625343323, + "learning_rate": 2.2616081764426726e-05, + "loss": 0.0077, + "step": 799 + }, + { + "epoch": 1.5416867469879518, + "grad_norm": 0.2923565208911896, + "learning_rate": 2.2571612685804124e-05, + "loss": 0.0119, + "step": 800 + }, + { + "epoch": 1.5436144578313253, + "grad_norm": 0.1921311914920807, + "learning_rate": 2.252713067627857e-05, + "loss": 0.0083, + "step": 801 + }, + { + "epoch": 1.5455421686746988, + "grad_norm": 0.23221106827259064, + "learning_rate": 2.2482635959520044e-05, + "loss": 0.0049, + "step": 802 + }, + { + "epoch": 1.5474698795180721, + "grad_norm": 0.6340724229812622, + "learning_rate": 2.243812875926241e-05, + "loss": 0.0273, + "step": 803 + }, + { + "epoch": 1.5493975903614459, + "grad_norm": 0.2699439823627472, + "learning_rate": 2.2393609299302314e-05, + "loss": 0.0108, + "step": 804 + }, + { + "epoch": 1.5513253012048192, + "grad_norm": 0.2005189210176468, + "learning_rate": 2.2349077803498052e-05, + "loss": 0.0076, + "step": 805 + }, + { + "epoch": 1.5532530120481929, + "grad_norm": 0.39668548107147217, + "learning_rate": 2.230453449576842e-05, + "loss": 0.0135, + "step": 806 + }, + { + "epoch": 1.5551807228915662, + "grad_norm": 0.2406950294971466, + "learning_rate": 2.2259979600091635e-05, + "loss": 0.0094, + "step": 807 + }, + { + "epoch": 1.55710843373494, + "grad_norm": 0.30363157391548157, + "learning_rate": 2.2215413340504158e-05, + "loss": 0.0178, + "step": 808 + }, + { + "epoch": 1.5590361445783132, + "grad_norm": 0.19508181512355804, + "learning_rate": 2.2170835941099605e-05, + "loss": 0.0069, + "step": 809 + }, + { + "epoch": 1.5609638554216867, + "grad_norm": 0.734106719493866, + "learning_rate": 2.2126247626027615e-05, + "loss": 0.0319, + "step": 810 + }, + { + "epoch": 1.5628915662650602, + "grad_norm": 0.2591583728790283, + "learning_rate": 2.208164861949268e-05, + "loss": 0.0168, + "step": 811 + }, + { + "epoch": 1.5648192771084337, + "grad_norm": 0.2386734038591385, + "learning_rate": 2.20370391457531e-05, + "loss": 0.0041, + "step": 812 + }, + { + "epoch": 1.5667469879518072, + "grad_norm": 0.1675218939781189, + "learning_rate": 2.1992419429119764e-05, + "loss": 0.0078, + "step": 813 + }, + { + "epoch": 1.5686746987951807, + "grad_norm": 0.45591506361961365, + "learning_rate": 2.1947789693955097e-05, + "loss": 0.0166, + "step": 814 + }, + { + "epoch": 1.5706024096385542, + "grad_norm": 0.46940621733665466, + "learning_rate": 2.190315016467188e-05, + "loss": 0.0176, + "step": 815 + }, + { + "epoch": 1.5725301204819278, + "grad_norm": 0.2294205278158188, + "learning_rate": 2.1858501065732146e-05, + "loss": 0.0102, + "step": 816 + }, + { + "epoch": 1.5744578313253013, + "grad_norm": 0.28922322392463684, + "learning_rate": 2.181384262164606e-05, + "loss": 0.0111, + "step": 817 + }, + { + "epoch": 1.5763855421686745, + "grad_norm": 0.19650064408779144, + "learning_rate": 2.1769175056970765e-05, + "loss": 0.0076, + "step": 818 + }, + { + "epoch": 1.5783132530120483, + "grad_norm": 0.19538825750350952, + "learning_rate": 2.172449859630927e-05, + "loss": 0.0118, + "step": 819 + }, + { + "epoch": 1.5802409638554216, + "grad_norm": 0.1900389939546585, + "learning_rate": 2.167981346430931e-05, + "loss": 0.0066, + "step": 820 + }, + { + "epoch": 1.5821686746987953, + "grad_norm": 0.21593710780143738, + "learning_rate": 2.1635119885662235e-05, + "loss": 0.0101, + "step": 821 + }, + { + "epoch": 1.5840963855421686, + "grad_norm": 0.2699289321899414, + "learning_rate": 2.159041808510185e-05, + "loss": 0.0118, + "step": 822 + }, + { + "epoch": 1.5860240963855423, + "grad_norm": 0.31867673993110657, + "learning_rate": 2.1545708287403322e-05, + "loss": 0.0122, + "step": 823 + }, + { + "epoch": 1.5879518072289156, + "grad_norm": 0.2862400412559509, + "learning_rate": 2.1500990717382004e-05, + "loss": 0.0216, + "step": 824 + }, + { + "epoch": 1.589879518072289, + "grad_norm": 0.28482481837272644, + "learning_rate": 2.145626559989237e-05, + "loss": 0.0136, + "step": 825 + }, + { + "epoch": 1.5918072289156626, + "grad_norm": 0.2866958975791931, + "learning_rate": 2.1411533159826803e-05, + "loss": 0.0298, + "step": 826 + }, + { + "epoch": 1.5937349397590361, + "grad_norm": 0.39092838764190674, + "learning_rate": 2.1366793622114533e-05, + "loss": 0.0382, + "step": 827 + }, + { + "epoch": 1.5956626506024096, + "grad_norm": 0.16381537914276123, + "learning_rate": 2.1322047211720468e-05, + "loss": 0.0074, + "step": 828 + }, + { + "epoch": 1.5975903614457831, + "grad_norm": 0.22146940231323242, + "learning_rate": 2.1277294153644083e-05, + "loss": 0.0103, + "step": 829 + }, + { + "epoch": 1.5995180722891567, + "grad_norm": 0.2155209183692932, + "learning_rate": 2.123253467291827e-05, + "loss": 0.0095, + "step": 830 + }, + { + "epoch": 1.6014457831325302, + "grad_norm": 0.41510409116744995, + "learning_rate": 2.118776899460822e-05, + "loss": 0.0457, + "step": 831 + }, + { + "epoch": 1.6033734939759037, + "grad_norm": 0.19718150794506073, + "learning_rate": 2.1142997343810293e-05, + "loss": 0.0192, + "step": 832 + }, + { + "epoch": 1.605301204819277, + "grad_norm": 0.40924403071403503, + "learning_rate": 2.1098219945650865e-05, + "loss": 0.0278, + "step": 833 + }, + { + "epoch": 1.6072289156626507, + "grad_norm": 0.18657824397087097, + "learning_rate": 2.105343702528524e-05, + "loss": 0.0076, + "step": 834 + }, + { + "epoch": 1.609156626506024, + "grad_norm": 0.1727641075849533, + "learning_rate": 2.100864880789645e-05, + "loss": 0.0076, + "step": 835 + }, + { + "epoch": 1.6110843373493977, + "grad_norm": 0.18138745427131653, + "learning_rate": 2.0963855518694203e-05, + "loss": 0.005, + "step": 836 + }, + { + "epoch": 1.613012048192771, + "grad_norm": 0.19173955917358398, + "learning_rate": 2.0919057382913675e-05, + "loss": 0.0084, + "step": 837 + }, + { + "epoch": 1.6149397590361447, + "grad_norm": 0.3812403380870819, + "learning_rate": 2.0874254625814435e-05, + "loss": 0.009, + "step": 838 + }, + { + "epoch": 1.616867469879518, + "grad_norm": 0.2009759545326233, + "learning_rate": 2.0829447472679285e-05, + "loss": 0.0098, + "step": 839 + }, + { + "epoch": 1.6187951807228915, + "grad_norm": 0.48703446984291077, + "learning_rate": 2.0784636148813124e-05, + "loss": 0.0099, + "step": 840 + }, + { + "epoch": 1.620722891566265, + "grad_norm": 0.28995075821876526, + "learning_rate": 2.0739820879541827e-05, + "loss": 0.0075, + "step": 841 + }, + { + "epoch": 1.6226506024096385, + "grad_norm": 0.2130059450864792, + "learning_rate": 2.069500189021111e-05, + "loss": 0.007, + "step": 842 + }, + { + "epoch": 1.624578313253012, + "grad_norm": 0.252524733543396, + "learning_rate": 2.0650179406185397e-05, + "loss": 0.0249, + "step": 843 + }, + { + "epoch": 1.6265060240963856, + "grad_norm": 0.23069098591804504, + "learning_rate": 2.060535365284668e-05, + "loss": 0.0084, + "step": 844 + }, + { + "epoch": 1.628433734939759, + "grad_norm": 0.25051403045654297, + "learning_rate": 2.056052485559338e-05, + "loss": 0.0071, + "step": 845 + }, + { + "epoch": 1.6303614457831326, + "grad_norm": 0.27664798498153687, + "learning_rate": 2.051569323983924e-05, + "loss": 0.0198, + "step": 846 + }, + { + "epoch": 1.632289156626506, + "grad_norm": 0.2954922318458557, + "learning_rate": 2.047085903101218e-05, + "loss": 0.006, + "step": 847 + }, + { + "epoch": 1.6342168674698794, + "grad_norm": 0.28477591276168823, + "learning_rate": 2.0426022454553137e-05, + "loss": 0.0147, + "step": 848 + }, + { + "epoch": 1.636144578313253, + "grad_norm": 0.2785305678844452, + "learning_rate": 2.0381183735914968e-05, + "loss": 0.0117, + "step": 849 + }, + { + "epoch": 1.6380722891566264, + "grad_norm": 0.2500309348106384, + "learning_rate": 2.0336343100561295e-05, + "loss": 0.008, + "step": 850 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.18932047486305237, + "learning_rate": 2.0291500773965392e-05, + "loss": 0.0256, + "step": 851 + }, + { + "epoch": 1.6419277108433734, + "grad_norm": 0.6396257877349854, + "learning_rate": 2.0246656981609013e-05, + "loss": 0.0141, + "step": 852 + }, + { + "epoch": 1.6438554216867471, + "grad_norm": 0.5072891116142273, + "learning_rate": 2.02018119489813e-05, + "loss": 0.008, + "step": 853 + }, + { + "epoch": 1.6457831325301204, + "grad_norm": 0.2920839488506317, + "learning_rate": 2.0156965901577635e-05, + "loss": 0.0085, + "step": 854 + }, + { + "epoch": 1.647710843373494, + "grad_norm": 0.1391262263059616, + "learning_rate": 2.011211906489848e-05, + "loss": 0.0078, + "step": 855 + }, + { + "epoch": 1.6496385542168674, + "grad_norm": 0.29620468616485596, + "learning_rate": 2.00672716644483e-05, + "loss": 0.0109, + "step": 856 + }, + { + "epoch": 1.651566265060241, + "grad_norm": 0.13946573436260223, + "learning_rate": 2.002242392573436e-05, + "loss": 0.0076, + "step": 857 + }, + { + "epoch": 1.6534939759036145, + "grad_norm": 0.9766128659248352, + "learning_rate": 1.997757607426565e-05, + "loss": 0.0309, + "step": 858 + }, + { + "epoch": 1.655421686746988, + "grad_norm": 0.18002203106880188, + "learning_rate": 1.9932728335551702e-05, + "loss": 0.0072, + "step": 859 + }, + { + "epoch": 1.6573493975903615, + "grad_norm": 0.28073111176490784, + "learning_rate": 1.988788093510152e-05, + "loss": 0.0246, + "step": 860 + }, + { + "epoch": 1.659277108433735, + "grad_norm": 0.1919957399368286, + "learning_rate": 1.9843034098422375e-05, + "loss": 0.0087, + "step": 861 + }, + { + "epoch": 1.6612048192771085, + "grad_norm": 0.1825258433818817, + "learning_rate": 1.9798188051018705e-05, + "loss": 0.0092, + "step": 862 + }, + { + "epoch": 1.6631325301204818, + "grad_norm": 0.32412952184677124, + "learning_rate": 1.9753343018390997e-05, + "loss": 0.0118, + "step": 863 + }, + { + "epoch": 1.6650602409638555, + "grad_norm": 0.12828563153743744, + "learning_rate": 1.9708499226034618e-05, + "loss": 0.0056, + "step": 864 + }, + { + "epoch": 1.6669879518072288, + "grad_norm": 0.18647560477256775, + "learning_rate": 1.966365689943871e-05, + "loss": 0.0094, + "step": 865 + }, + { + "epoch": 1.6689156626506025, + "grad_norm": 0.19835828244686127, + "learning_rate": 1.9618816264085042e-05, + "loss": 0.0097, + "step": 866 + }, + { + "epoch": 1.6708433734939758, + "grad_norm": 0.22364282608032227, + "learning_rate": 1.957397754544687e-05, + "loss": 0.0062, + "step": 867 + }, + { + "epoch": 1.6727710843373496, + "grad_norm": 0.29420018196105957, + "learning_rate": 1.952914096898783e-05, + "loss": 0.0182, + "step": 868 + }, + { + "epoch": 1.6746987951807228, + "grad_norm": 0.2149929702281952, + "learning_rate": 1.9484306760160766e-05, + "loss": 0.0125, + "step": 869 + }, + { + "epoch": 1.6766265060240964, + "grad_norm": 0.16844330728054047, + "learning_rate": 1.9439475144406623e-05, + "loss": 0.0074, + "step": 870 + }, + { + "epoch": 1.6785542168674699, + "grad_norm": 0.5010282397270203, + "learning_rate": 1.9394646347153334e-05, + "loss": 0.0213, + "step": 871 + }, + { + "epoch": 1.6804819277108434, + "grad_norm": 0.29847195744514465, + "learning_rate": 1.9349820593814606e-05, + "loss": 0.0173, + "step": 872 + }, + { + "epoch": 1.6824096385542169, + "grad_norm": 0.23835812509059906, + "learning_rate": 1.930499810978889e-05, + "loss": 0.011, + "step": 873 + }, + { + "epoch": 1.6843373493975904, + "grad_norm": 0.3269020617008209, + "learning_rate": 1.9260179120458177e-05, + "loss": 0.0285, + "step": 874 + }, + { + "epoch": 1.686265060240964, + "grad_norm": 0.2142144739627838, + "learning_rate": 1.9215363851186883e-05, + "loss": 0.0146, + "step": 875 + }, + { + "epoch": 1.6881927710843372, + "grad_norm": 0.3098377585411072, + "learning_rate": 1.9170552527320725e-05, + "loss": 0.0104, + "step": 876 + }, + { + "epoch": 1.690120481927711, + "grad_norm": 0.22504115104675293, + "learning_rate": 1.9125745374185568e-05, + "loss": 0.0091, + "step": 877 + }, + { + "epoch": 1.6920481927710842, + "grad_norm": 0.20633333921432495, + "learning_rate": 1.908094261708633e-05, + "loss": 0.0097, + "step": 878 + }, + { + "epoch": 1.693975903614458, + "grad_norm": 1.179566502571106, + "learning_rate": 1.9036144481305807e-05, + "loss": 0.0143, + "step": 879 + }, + { + "epoch": 1.6959036144578312, + "grad_norm": 0.15525613725185394, + "learning_rate": 1.8991351192103554e-05, + "loss": 0.0062, + "step": 880 + }, + { + "epoch": 1.697831325301205, + "grad_norm": 0.15966367721557617, + "learning_rate": 1.8946562974714763e-05, + "loss": 0.0048, + "step": 881 + }, + { + "epoch": 1.6997590361445782, + "grad_norm": 0.18902607262134552, + "learning_rate": 1.890178005434914e-05, + "loss": 0.0124, + "step": 882 + }, + { + "epoch": 1.701686746987952, + "grad_norm": 0.21692413091659546, + "learning_rate": 1.885700265618971e-05, + "loss": 0.0135, + "step": 883 + }, + { + "epoch": 1.7036144578313253, + "grad_norm": 0.38948455452919006, + "learning_rate": 1.8812231005391786e-05, + "loss": 0.0365, + "step": 884 + }, + { + "epoch": 1.7055421686746988, + "grad_norm": 0.2483491599559784, + "learning_rate": 1.8767465327081736e-05, + "loss": 0.0202, + "step": 885 + }, + { + "epoch": 1.7074698795180723, + "grad_norm": 0.15305832028388977, + "learning_rate": 1.872270584635592e-05, + "loss": 0.0035, + "step": 886 + }, + { + "epoch": 1.7093975903614458, + "grad_norm": 0.17794466018676758, + "learning_rate": 1.867795278827954e-05, + "loss": 0.0157, + "step": 887 + }, + { + "epoch": 1.7113253012048193, + "grad_norm": 0.1938813328742981, + "learning_rate": 1.863320637788547e-05, + "loss": 0.0071, + "step": 888 + }, + { + "epoch": 1.7132530120481928, + "grad_norm": 0.27061617374420166, + "learning_rate": 1.8588466840173207e-05, + "loss": 0.0347, + "step": 889 + }, + { + "epoch": 1.7151807228915663, + "grad_norm": 0.1541014313697815, + "learning_rate": 1.8543734400107637e-05, + "loss": 0.006, + "step": 890 + }, + { + "epoch": 1.7171084337349396, + "grad_norm": 0.1436876654624939, + "learning_rate": 1.8499009282617996e-05, + "loss": 0.0059, + "step": 891 + }, + { + "epoch": 1.7190361445783133, + "grad_norm": 1.0573723316192627, + "learning_rate": 1.8454291712596688e-05, + "loss": 0.008, + "step": 892 + }, + { + "epoch": 1.7209638554216866, + "grad_norm": 0.15406259894371033, + "learning_rate": 1.8409581914898157e-05, + "loss": 0.0061, + "step": 893 + }, + { + "epoch": 1.7228915662650603, + "grad_norm": 0.24822913110256195, + "learning_rate": 1.836488011433777e-05, + "loss": 0.0085, + "step": 894 + }, + { + "epoch": 1.7248192771084336, + "grad_norm": 0.21049316227436066, + "learning_rate": 1.83201865356907e-05, + "loss": 0.0075, + "step": 895 + }, + { + "epoch": 1.7267469879518074, + "grad_norm": 0.24159866571426392, + "learning_rate": 1.8275501403690733e-05, + "loss": 0.0156, + "step": 896 + }, + { + "epoch": 1.7286746987951807, + "grad_norm": 0.3191063106060028, + "learning_rate": 1.823082494302924e-05, + "loss": 0.0218, + "step": 897 + }, + { + "epoch": 1.7306024096385542, + "grad_norm": 0.20296362042427063, + "learning_rate": 1.8186157378353945e-05, + "loss": 0.0126, + "step": 898 + }, + { + "epoch": 1.7325301204819277, + "grad_norm": 0.1905524581670761, + "learning_rate": 1.8141498934267858e-05, + "loss": 0.0131, + "step": 899 + }, + { + "epoch": 1.7344578313253012, + "grad_norm": 0.5350520610809326, + "learning_rate": 1.809684983532813e-05, + "loss": 0.0115, + "step": 900 + }, + { + "epoch": 1.7363855421686747, + "grad_norm": 0.17144092917442322, + "learning_rate": 1.8052210306044907e-05, + "loss": 0.0113, + "step": 901 + }, + { + "epoch": 1.7383132530120482, + "grad_norm": 0.11777982115745544, + "learning_rate": 1.8007580570880236e-05, + "loss": 0.0058, + "step": 902 + }, + { + "epoch": 1.7402409638554217, + "grad_norm": 0.2078275978565216, + "learning_rate": 1.7962960854246908e-05, + "loss": 0.0106, + "step": 903 + }, + { + "epoch": 1.7421686746987952, + "grad_norm": 0.2550877630710602, + "learning_rate": 1.791835138050732e-05, + "loss": 0.0076, + "step": 904 + }, + { + "epoch": 1.7440963855421687, + "grad_norm": 0.11553912609815598, + "learning_rate": 1.7873752373972395e-05, + "loss": 0.0038, + "step": 905 + }, + { + "epoch": 1.746024096385542, + "grad_norm": 0.10724586248397827, + "learning_rate": 1.7829164058900398e-05, + "loss": 0.0043, + "step": 906 + }, + { + "epoch": 1.7479518072289157, + "grad_norm": 0.30152231454849243, + "learning_rate": 1.7784586659495845e-05, + "loss": 0.0099, + "step": 907 + }, + { + "epoch": 1.749879518072289, + "grad_norm": 0.18372933566570282, + "learning_rate": 1.7740020399908372e-05, + "loss": 0.0074, + "step": 908 + }, + { + "epoch": 1.7518072289156628, + "grad_norm": 0.35184428095817566, + "learning_rate": 1.7695465504231586e-05, + "loss": 0.0184, + "step": 909 + }, + { + "epoch": 1.753734939759036, + "grad_norm": 0.15083615481853485, + "learning_rate": 1.765092219650196e-05, + "loss": 0.0061, + "step": 910 + }, + { + "epoch": 1.7556626506024098, + "grad_norm": 0.2599961459636688, + "learning_rate": 1.7606390700697693e-05, + "loss": 0.0101, + "step": 911 + }, + { + "epoch": 1.757590361445783, + "grad_norm": 0.10829206556081772, + "learning_rate": 1.7561871240737595e-05, + "loss": 0.0034, + "step": 912 + }, + { + "epoch": 1.7595180722891566, + "grad_norm": 0.38098782300949097, + "learning_rate": 1.7517364040479966e-05, + "loss": 0.0384, + "step": 913 + }, + { + "epoch": 1.76144578313253, + "grad_norm": 0.14975085854530334, + "learning_rate": 1.7472869323721432e-05, + "loss": 0.0055, + "step": 914 + }, + { + "epoch": 1.7633734939759036, + "grad_norm": 0.4151444733142853, + "learning_rate": 1.742838731419588e-05, + "loss": 0.0307, + "step": 915 + }, + { + "epoch": 1.765301204819277, + "grad_norm": 0.22238481044769287, + "learning_rate": 1.738391823557328e-05, + "loss": 0.0059, + "step": 916 + }, + { + "epoch": 1.7672289156626506, + "grad_norm": 0.23386356234550476, + "learning_rate": 1.7339462311458587e-05, + "loss": 0.0113, + "step": 917 + }, + { + "epoch": 1.7691566265060241, + "grad_norm": 0.21911191940307617, + "learning_rate": 1.7295019765390618e-05, + "loss": 0.0071, + "step": 918 + }, + { + "epoch": 1.7710843373493976, + "grad_norm": 0.343159943819046, + "learning_rate": 1.7250590820840903e-05, + "loss": 0.0144, + "step": 919 + }, + { + "epoch": 1.7730120481927711, + "grad_norm": 0.32204556465148926, + "learning_rate": 1.720617570121259e-05, + "loss": 0.0131, + "step": 920 + }, + { + "epoch": 1.7749397590361444, + "grad_norm": 0.4105585515499115, + "learning_rate": 1.7161774629839328e-05, + "loss": 0.0148, + "step": 921 + }, + { + "epoch": 1.7768674698795182, + "grad_norm": 0.16380974650382996, + "learning_rate": 1.7117387829984093e-05, + "loss": 0.0066, + "step": 922 + }, + { + "epoch": 1.7787951807228914, + "grad_norm": 0.22920913994312286, + "learning_rate": 1.707301552483813e-05, + "loss": 0.0105, + "step": 923 + }, + { + "epoch": 1.7807228915662652, + "grad_norm": 0.2075149267911911, + "learning_rate": 1.7028657937519767e-05, + "loss": 0.0104, + "step": 924 + }, + { + "epoch": 1.7826506024096385, + "grad_norm": 0.44439977407455444, + "learning_rate": 1.6984315291073355e-05, + "loss": 0.0134, + "step": 925 + }, + { + "epoch": 1.7845783132530122, + "grad_norm": 0.24068203568458557, + "learning_rate": 1.6939987808468125e-05, + "loss": 0.0078, + "step": 926 + }, + { + "epoch": 1.7865060240963855, + "grad_norm": 0.34044349193573, + "learning_rate": 1.689567571259701e-05, + "loss": 0.0108, + "step": 927 + }, + { + "epoch": 1.788433734939759, + "grad_norm": 0.34082743525505066, + "learning_rate": 1.6851379226275624e-05, + "loss": 0.0266, + "step": 928 + }, + { + "epoch": 1.7903614457831325, + "grad_norm": 0.19490115344524384, + "learning_rate": 1.6807098572241075e-05, + "loss": 0.0109, + "step": 929 + }, + { + "epoch": 1.792289156626506, + "grad_norm": 0.16208237409591675, + "learning_rate": 1.6762833973150846e-05, + "loss": 0.0113, + "step": 930 + }, + { + "epoch": 1.7942168674698795, + "grad_norm": 0.35555699467658997, + "learning_rate": 1.671858565158172e-05, + "loss": 0.0196, + "step": 931 + }, + { + "epoch": 1.796144578313253, + "grad_norm": 0.1600857824087143, + "learning_rate": 1.6674353830028587e-05, + "loss": 0.0089, + "step": 932 + }, + { + "epoch": 1.7980722891566265, + "grad_norm": 0.1699574887752533, + "learning_rate": 1.663013873090342e-05, + "loss": 0.0074, + "step": 933 + }, + { + "epoch": 1.8, + "grad_norm": 0.2472933828830719, + "learning_rate": 1.6585940576534086e-05, + "loss": 0.0063, + "step": 934 + }, + { + "epoch": 1.8019277108433736, + "grad_norm": 0.23491555452346802, + "learning_rate": 1.654175958916323e-05, + "loss": 0.0101, + "step": 935 + }, + { + "epoch": 1.8038554216867468, + "grad_norm": 0.28635191917419434, + "learning_rate": 1.6497595990947195e-05, + "loss": 0.0131, + "step": 936 + }, + { + "epoch": 1.8057831325301206, + "grad_norm": 0.15400712192058563, + "learning_rate": 1.645345000395489e-05, + "loss": 0.0068, + "step": 937 + }, + { + "epoch": 1.8077108433734939, + "grad_norm": 0.18223172426223755, + "learning_rate": 1.6409321850166647e-05, + "loss": 0.0094, + "step": 938 + }, + { + "epoch": 1.8096385542168676, + "grad_norm": 0.2789457142353058, + "learning_rate": 1.636521175147316e-05, + "loss": 0.0202, + "step": 939 + }, + { + "epoch": 1.8115662650602409, + "grad_norm": 0.4267627000808716, + "learning_rate": 1.6321119929674297e-05, + "loss": 0.0176, + "step": 940 + }, + { + "epoch": 1.8134939759036146, + "grad_norm": 0.3021615445613861, + "learning_rate": 1.6277046606478056e-05, + "loss": 0.0085, + "step": 941 + }, + { + "epoch": 1.815421686746988, + "grad_norm": 0.3724934756755829, + "learning_rate": 1.6232992003499405e-05, + "loss": 0.0474, + "step": 942 + }, + { + "epoch": 1.8173493975903614, + "grad_norm": 0.20904326438903809, + "learning_rate": 1.6188956342259177e-05, + "loss": 0.0078, + "step": 943 + }, + { + "epoch": 1.819277108433735, + "grad_norm": 0.31168171763420105, + "learning_rate": 1.614493984418297e-05, + "loss": 0.0174, + "step": 944 + }, + { + "epoch": 1.8212048192771084, + "grad_norm": 0.21273556351661682, + "learning_rate": 1.6100942730600003e-05, + "loss": 0.0054, + "step": 945 + }, + { + "epoch": 1.823132530120482, + "grad_norm": 0.16991695761680603, + "learning_rate": 1.6056965222742055e-05, + "loss": 0.0063, + "step": 946 + }, + { + "epoch": 1.8250602409638554, + "grad_norm": 0.22762684524059296, + "learning_rate": 1.6013007541742303e-05, + "loss": 0.0234, + "step": 947 + }, + { + "epoch": 1.826987951807229, + "grad_norm": 0.20128795504570007, + "learning_rate": 1.596906990863422e-05, + "loss": 0.0095, + "step": 948 + }, + { + "epoch": 1.8289156626506025, + "grad_norm": 0.30772027373313904, + "learning_rate": 1.592515254435048e-05, + "loss": 0.0356, + "step": 949 + }, + { + "epoch": 1.830843373493976, + "grad_norm": 0.12954631447792053, + "learning_rate": 1.5881255669721857e-05, + "loss": 0.008, + "step": 950 + }, + { + "epoch": 1.8327710843373493, + "grad_norm": 0.7787145972251892, + "learning_rate": 1.5837379505476054e-05, + "loss": 0.0108, + "step": 951 + }, + { + "epoch": 1.834698795180723, + "grad_norm": 0.1683879941701889, + "learning_rate": 1.5793524272236683e-05, + "loss": 0.006, + "step": 952 + }, + { + "epoch": 1.8366265060240963, + "grad_norm": 0.16475361585617065, + "learning_rate": 1.5749690190522076e-05, + "loss": 0.0065, + "step": 953 + }, + { + "epoch": 1.83855421686747, + "grad_norm": 0.211905375123024, + "learning_rate": 1.5705877480744214e-05, + "loss": 0.0092, + "step": 954 + }, + { + "epoch": 1.8404819277108433, + "grad_norm": 0.23850117623806, + "learning_rate": 1.5662086363207628e-05, + "loss": 0.012, + "step": 955 + }, + { + "epoch": 1.842409638554217, + "grad_norm": 0.19100065529346466, + "learning_rate": 1.561831705810825e-05, + "loss": 0.0113, + "step": 956 + }, + { + "epoch": 1.8443373493975903, + "grad_norm": 0.3635985255241394, + "learning_rate": 1.557456978553236e-05, + "loss": 0.0168, + "step": 957 + }, + { + "epoch": 1.8462650602409638, + "grad_norm": 0.16449116170406342, + "learning_rate": 1.553084476545544e-05, + "loss": 0.0042, + "step": 958 + }, + { + "epoch": 1.8481927710843373, + "grad_norm": 0.566093385219574, + "learning_rate": 1.5487142217741062e-05, + "loss": 0.0145, + "step": 959 + }, + { + "epoch": 1.8501204819277108, + "grad_norm": 0.15960252285003662, + "learning_rate": 1.5443462362139834e-05, + "loss": 0.0059, + "step": 960 + }, + { + "epoch": 1.8520481927710843, + "grad_norm": 0.40773797035217285, + "learning_rate": 1.539980541828823e-05, + "loss": 0.0257, + "step": 961 + }, + { + "epoch": 1.8539759036144579, + "grad_norm": 0.4802496135234833, + "learning_rate": 1.5356171605707522e-05, + "loss": 0.0111, + "step": 962 + }, + { + "epoch": 1.8559036144578314, + "grad_norm": 0.15745794773101807, + "learning_rate": 1.5312561143802704e-05, + "loss": 0.0049, + "step": 963 + }, + { + "epoch": 1.8578313253012049, + "grad_norm": 0.15139251947402954, + "learning_rate": 1.5268974251861298e-05, + "loss": 0.0077, + "step": 964 + }, + { + "epoch": 1.8597590361445784, + "grad_norm": 0.2188841849565506, + "learning_rate": 1.5225411149052356e-05, + "loss": 0.017, + "step": 965 + }, + { + "epoch": 1.8616867469879517, + "grad_norm": 0.10853131115436554, + "learning_rate": 1.5181872054425287e-05, + "loss": 0.0049, + "step": 966 + }, + { + "epoch": 1.8636144578313254, + "grad_norm": 0.8254880905151367, + "learning_rate": 1.5138357186908785e-05, + "loss": 0.0317, + "step": 967 + }, + { + "epoch": 1.8655421686746987, + "grad_norm": 0.2989620566368103, + "learning_rate": 1.5094866765309728e-05, + "loss": 0.0126, + "step": 968 + }, + { + "epoch": 1.8674698795180724, + "grad_norm": 0.16411150991916656, + "learning_rate": 1.5051401008312054e-05, + "loss": 0.0101, + "step": 969 + }, + { + "epoch": 1.8693975903614457, + "grad_norm": 0.2861763834953308, + "learning_rate": 1.5007960134475706e-05, + "loss": 0.0155, + "step": 970 + }, + { + "epoch": 1.8713253012048194, + "grad_norm": 0.24879588186740875, + "learning_rate": 1.4964544362235487e-05, + "loss": 0.0187, + "step": 971 + }, + { + "epoch": 1.8732530120481927, + "grad_norm": 0.2433672398328781, + "learning_rate": 1.4921153909899983e-05, + "loss": 0.0084, + "step": 972 + }, + { + "epoch": 1.8751807228915662, + "grad_norm": 0.15097154676914215, + "learning_rate": 1.487778899565047e-05, + "loss": 0.007, + "step": 973 + }, + { + "epoch": 1.8771084337349397, + "grad_norm": 0.1629047691822052, + "learning_rate": 1.4834449837539806e-05, + "loss": 0.0058, + "step": 974 + }, + { + "epoch": 1.8790361445783132, + "grad_norm": 0.9937071204185486, + "learning_rate": 1.4791136653491333e-05, + "loss": 0.0323, + "step": 975 + }, + { + "epoch": 1.8809638554216868, + "grad_norm": 0.19555562734603882, + "learning_rate": 1.4747849661297808e-05, + "loss": 0.0126, + "step": 976 + }, + { + "epoch": 1.8828915662650603, + "grad_norm": 0.16147711873054504, + "learning_rate": 1.470458907862026e-05, + "loss": 0.0067, + "step": 977 + }, + { + "epoch": 1.8848192771084338, + "grad_norm": 0.2730027735233307, + "learning_rate": 1.4661355122986945e-05, + "loss": 0.0147, + "step": 978 + }, + { + "epoch": 1.886746987951807, + "grad_norm": 0.13759832084178925, + "learning_rate": 1.4618148011792206e-05, + "loss": 0.0038, + "step": 979 + }, + { + "epoch": 1.8886746987951808, + "grad_norm": 0.33516690135002136, + "learning_rate": 1.4574967962295419e-05, + "loss": 0.0139, + "step": 980 + }, + { + "epoch": 1.890602409638554, + "grad_norm": 0.2345741093158722, + "learning_rate": 1.4531815191619903e-05, + "loss": 0.0094, + "step": 981 + }, + { + "epoch": 1.8925301204819278, + "grad_norm": 0.14681044220924377, + "learning_rate": 1.4488689916751762e-05, + "loss": 0.0065, + "step": 982 + }, + { + "epoch": 1.894457831325301, + "grad_norm": 0.21143914759159088, + "learning_rate": 1.4445592354538885e-05, + "loss": 0.0057, + "step": 983 + }, + { + "epoch": 1.8963855421686748, + "grad_norm": 0.3109160363674164, + "learning_rate": 1.44025227216898e-05, + "loss": 0.0142, + "step": 984 + }, + { + "epoch": 1.8983132530120481, + "grad_norm": 0.24301907420158386, + "learning_rate": 1.435948123477259e-05, + "loss": 0.012, + "step": 985 + }, + { + "epoch": 1.9002409638554218, + "grad_norm": 0.19817675650119781, + "learning_rate": 1.431646811021382e-05, + "loss": 0.0097, + "step": 986 + }, + { + "epoch": 1.9021686746987951, + "grad_norm": 0.13464932143688202, + "learning_rate": 1.4273483564297425e-05, + "loss": 0.0046, + "step": 987 + }, + { + "epoch": 1.9040963855421686, + "grad_norm": 0.1698642522096634, + "learning_rate": 1.4230527813163656e-05, + "loss": 0.0038, + "step": 988 + }, + { + "epoch": 1.9060240963855422, + "grad_norm": 0.19395388662815094, + "learning_rate": 1.4187601072807975e-05, + "loss": 0.0123, + "step": 989 + }, + { + "epoch": 1.9079518072289157, + "grad_norm": 0.2093188613653183, + "learning_rate": 1.4144703559079948e-05, + "loss": 0.0093, + "step": 990 + }, + { + "epoch": 1.9098795180722892, + "grad_norm": 0.1529311090707779, + "learning_rate": 1.4101835487682198e-05, + "loss": 0.0051, + "step": 991 + }, + { + "epoch": 1.9118072289156627, + "grad_norm": 0.18725350499153137, + "learning_rate": 1.4058997074169299e-05, + "loss": 0.0083, + "step": 992 + }, + { + "epoch": 1.9137349397590362, + "grad_norm": 0.15601560473442078, + "learning_rate": 1.401618853394668e-05, + "loss": 0.0086, + "step": 993 + }, + { + "epoch": 1.9156626506024095, + "grad_norm": 0.23890644311904907, + "learning_rate": 1.3973410082269591e-05, + "loss": 0.015, + "step": 994 + }, + { + "epoch": 1.9175903614457832, + "grad_norm": 0.2442619949579239, + "learning_rate": 1.3930661934241947e-05, + "loss": 0.0089, + "step": 995 + }, + { + "epoch": 1.9195180722891565, + "grad_norm": 0.1540212482213974, + "learning_rate": 1.388794430481532e-05, + "loss": 0.0072, + "step": 996 + }, + { + "epoch": 1.9214457831325302, + "grad_norm": 0.1359291970729828, + "learning_rate": 1.3845257408787807e-05, + "loss": 0.0131, + "step": 997 + }, + { + "epoch": 1.9233734939759035, + "grad_norm": 0.25486138463020325, + "learning_rate": 1.3802601460802967e-05, + "loss": 0.0198, + "step": 998 + }, + { + "epoch": 1.9253012048192772, + "grad_norm": 0.28815609216690063, + "learning_rate": 1.3759976675348754e-05, + "loss": 0.014, + "step": 999 + }, + { + "epoch": 1.9272289156626505, + "grad_norm": 0.15648497641086578, + "learning_rate": 1.3717383266756403e-05, + "loss": 0.0065, + "step": 1000 + }, + { + "epoch": 1.929156626506024, + "grad_norm": 0.16912540793418884, + "learning_rate": 1.367482144919941e-05, + "loss": 0.0059, + "step": 1001 + }, + { + "epoch": 1.9310843373493976, + "grad_norm": 0.16896723210811615, + "learning_rate": 1.3632291436692397e-05, + "loss": 0.0054, + "step": 1002 + }, + { + "epoch": 1.933012048192771, + "grad_norm": 0.20287497341632843, + "learning_rate": 1.3589793443090064e-05, + "loss": 0.0097, + "step": 1003 + }, + { + "epoch": 1.9349397590361446, + "grad_norm": 0.14804276823997498, + "learning_rate": 1.3547327682086114e-05, + "loss": 0.0125, + "step": 1004 + }, + { + "epoch": 1.936867469879518, + "grad_norm": 0.23820064961910248, + "learning_rate": 1.3504894367212171e-05, + "loss": 0.0131, + "step": 1005 + }, + { + "epoch": 1.9387951807228916, + "grad_norm": 0.25607362389564514, + "learning_rate": 1.34624937118367e-05, + "loss": 0.0115, + "step": 1006 + }, + { + "epoch": 1.940722891566265, + "grad_norm": 0.37233737111091614, + "learning_rate": 1.3420125929163976e-05, + "loss": 0.0309, + "step": 1007 + }, + { + "epoch": 1.9426506024096386, + "grad_norm": 0.19426730275154114, + "learning_rate": 1.3377791232232929e-05, + "loss": 0.0078, + "step": 1008 + }, + { + "epoch": 1.944578313253012, + "grad_norm": 0.2784160077571869, + "learning_rate": 1.333548983391617e-05, + "loss": 0.0142, + "step": 1009 + }, + { + "epoch": 1.9465060240963856, + "grad_norm": 0.11407195776700974, + "learning_rate": 1.3293221946918853e-05, + "loss": 0.0035, + "step": 1010 + }, + { + "epoch": 1.948433734939759, + "grad_norm": 0.3965436816215515, + "learning_rate": 1.325098778377762e-05, + "loss": 0.0242, + "step": 1011 + }, + { + "epoch": 1.9503614457831326, + "grad_norm": 0.18520519137382507, + "learning_rate": 1.3208787556859543e-05, + "loss": 0.0096, + "step": 1012 + }, + { + "epoch": 1.952289156626506, + "grad_norm": 0.2783315181732178, + "learning_rate": 1.3166621478361075e-05, + "loss": 0.0103, + "step": 1013 + }, + { + "epoch": 1.9542168674698797, + "grad_norm": 0.22714459896087646, + "learning_rate": 1.3124489760306917e-05, + "loss": 0.0078, + "step": 1014 + }, + { + "epoch": 1.956144578313253, + "grad_norm": 0.1257915049791336, + "learning_rate": 1.3082392614549036e-05, + "loss": 0.0077, + "step": 1015 + }, + { + "epoch": 1.9580722891566265, + "grad_norm": 0.15592887997627258, + "learning_rate": 1.3040330252765526e-05, + "loss": 0.0106, + "step": 1016 + }, + { + "epoch": 1.96, + "grad_norm": 0.19295449554920197, + "learning_rate": 1.2998302886459586e-05, + "loss": 0.0082, + "step": 1017 + }, + { + "epoch": 1.9619277108433735, + "grad_norm": 0.15544794499874115, + "learning_rate": 1.2956310726958472e-05, + "loss": 0.0068, + "step": 1018 + }, + { + "epoch": 1.963855421686747, + "grad_norm": 0.25899502635002136, + "learning_rate": 1.291435398541236e-05, + "loss": 0.0086, + "step": 1019 + }, + { + "epoch": 1.9657831325301205, + "grad_norm": 0.34639033675193787, + "learning_rate": 1.2872432872793379e-05, + "loss": 0.0116, + "step": 1020 + }, + { + "epoch": 1.967710843373494, + "grad_norm": 0.1628410518169403, + "learning_rate": 1.283054759989447e-05, + "loss": 0.0055, + "step": 1021 + }, + { + "epoch": 1.9696385542168675, + "grad_norm": 0.9273788928985596, + "learning_rate": 1.2788698377328385e-05, + "loss": 0.0264, + "step": 1022 + }, + { + "epoch": 1.971566265060241, + "grad_norm": 0.163126140832901, + "learning_rate": 1.2746885415526594e-05, + "loss": 0.0046, + "step": 1023 + }, + { + "epoch": 1.9734939759036143, + "grad_norm": 0.1475439816713333, + "learning_rate": 1.2705108924738223e-05, + "loss": 0.0056, + "step": 1024 + }, + { + "epoch": 1.975421686746988, + "grad_norm": 0.1654318869113922, + "learning_rate": 1.2663369115029034e-05, + "loss": 0.0056, + "step": 1025 + }, + { + "epoch": 1.9773493975903613, + "grad_norm": 0.20536045730113983, + "learning_rate": 1.2621666196280333e-05, + "loss": 0.0101, + "step": 1026 + }, + { + "epoch": 1.979277108433735, + "grad_norm": 0.19256474077701569, + "learning_rate": 1.258000037818792e-05, + "loss": 0.0059, + "step": 1027 + }, + { + "epoch": 1.9812048192771083, + "grad_norm": 0.2605120539665222, + "learning_rate": 1.2538371870261053e-05, + "loss": 0.0115, + "step": 1028 + }, + { + "epoch": 1.983132530120482, + "grad_norm": 0.14840295910835266, + "learning_rate": 1.249678088182137e-05, + "loss": 0.0046, + "step": 1029 + }, + { + "epoch": 1.9850602409638554, + "grad_norm": 0.17585207521915436, + "learning_rate": 1.2455227622001851e-05, + "loss": 0.0086, + "step": 1030 + }, + { + "epoch": 1.9869879518072289, + "grad_norm": 0.11044781655073166, + "learning_rate": 1.241371229974579e-05, + "loss": 0.0034, + "step": 1031 + }, + { + "epoch": 1.9889156626506024, + "grad_norm": 0.25584840774536133, + "learning_rate": 1.2372235123805672e-05, + "loss": 0.0245, + "step": 1032 + }, + { + "epoch": 1.9908433734939759, + "grad_norm": 0.25962474942207336, + "learning_rate": 1.2330796302742211e-05, + "loss": 0.0104, + "step": 1033 + }, + { + "epoch": 1.9927710843373494, + "grad_norm": 0.33408522605895996, + "learning_rate": 1.2289396044923238e-05, + "loss": 0.0176, + "step": 1034 + }, + { + "epoch": 1.994698795180723, + "grad_norm": 0.479950487613678, + "learning_rate": 1.2248034558522682e-05, + "loss": 0.0113, + "step": 1035 + }, + { + "epoch": 1.9966265060240964, + "grad_norm": 0.16567294299602509, + "learning_rate": 1.2206712051519518e-05, + "loss": 0.0036, + "step": 1036 + }, + { + "epoch": 1.99855421686747, + "grad_norm": 0.19343771040439606, + "learning_rate": 1.2165428731696713e-05, + "loss": 0.0077, + "step": 1037 + }, + { + "epoch": 2.0, + "grad_norm": 0.22895601391792297, + "learning_rate": 1.2124184806640202e-05, + "loss": 0.0114, + "step": 1038 + }, + { + "epoch": 2.0019277108433733, + "grad_norm": 0.15838384628295898, + "learning_rate": 1.208298048373782e-05, + "loss": 0.0043, + "step": 1039 + }, + { + "epoch": 2.003855421686747, + "grad_norm": 0.681065559387207, + "learning_rate": 1.2041815970178268e-05, + "loss": 0.0214, + "step": 1040 + }, + { + "epoch": 2.0057831325301203, + "grad_norm": 0.3357350528240204, + "learning_rate": 1.2000691472950081e-05, + "loss": 0.0079, + "step": 1041 + }, + { + "epoch": 2.007710843373494, + "grad_norm": 0.15238308906555176, + "learning_rate": 1.1959607198840568e-05, + "loss": 0.0041, + "step": 1042 + }, + { + "epoch": 2.0096385542168673, + "grad_norm": 0.11763229966163635, + "learning_rate": 1.1918563354434784e-05, + "loss": 0.0033, + "step": 1043 + }, + { + "epoch": 2.011566265060241, + "grad_norm": 0.3759301006793976, + "learning_rate": 1.1877560146114515e-05, + "loss": 0.0128, + "step": 1044 + }, + { + "epoch": 2.0134939759036143, + "grad_norm": 0.1143188625574112, + "learning_rate": 1.1836597780057183e-05, + "loss": 0.0078, + "step": 1045 + }, + { + "epoch": 2.015421686746988, + "grad_norm": 0.20059260725975037, + "learning_rate": 1.179567646223485e-05, + "loss": 0.0149, + "step": 1046 + }, + { + "epoch": 2.0173493975903614, + "grad_norm": 0.15569567680358887, + "learning_rate": 1.1754796398413196e-05, + "loss": 0.0038, + "step": 1047 + }, + { + "epoch": 2.019277108433735, + "grad_norm": 0.1153278723359108, + "learning_rate": 1.1713957794150423e-05, + "loss": 0.0041, + "step": 1048 + }, + { + "epoch": 2.0212048192771084, + "grad_norm": 0.1838717758655548, + "learning_rate": 1.1673160854796307e-05, + "loss": 0.0041, + "step": 1049 + }, + { + "epoch": 2.023132530120482, + "grad_norm": 0.12264502793550491, + "learning_rate": 1.1632405785491077e-05, + "loss": 0.0043, + "step": 1050 + }, + { + "epoch": 2.0250602409638554, + "grad_norm": 0.14363229274749756, + "learning_rate": 1.159169279116445e-05, + "loss": 0.0066, + "step": 1051 + }, + { + "epoch": 2.026987951807229, + "grad_norm": 0.1316995471715927, + "learning_rate": 1.1551022076534585e-05, + "loss": 0.0024, + "step": 1052 + }, + { + "epoch": 2.0289156626506024, + "grad_norm": 0.13392619788646698, + "learning_rate": 1.1510393846107001e-05, + "loss": 0.0051, + "step": 1053 + }, + { + "epoch": 2.0308433734939757, + "grad_norm": 3.0086817741394043, + "learning_rate": 1.1469808304173658e-05, + "loss": 0.0334, + "step": 1054 + }, + { + "epoch": 2.0327710843373494, + "grad_norm": 0.17756076157093048, + "learning_rate": 1.1429265654811803e-05, + "loss": 0.0068, + "step": 1055 + }, + { + "epoch": 2.0346987951807227, + "grad_norm": 0.13250532746315002, + "learning_rate": 1.1388766101883038e-05, + "loss": 0.0087, + "step": 1056 + }, + { + "epoch": 2.0366265060240965, + "grad_norm": 0.3534089922904968, + "learning_rate": 1.1348309849032257e-05, + "loss": 0.0076, + "step": 1057 + }, + { + "epoch": 2.0385542168674697, + "grad_norm": 0.11939049512147903, + "learning_rate": 1.1307897099686627e-05, + "loss": 0.0029, + "step": 1058 + }, + { + "epoch": 2.0404819277108435, + "grad_norm": 0.11862517893314362, + "learning_rate": 1.1267528057054562e-05, + "loss": 0.0062, + "step": 1059 + }, + { + "epoch": 2.0424096385542168, + "grad_norm": 0.1539212018251419, + "learning_rate": 1.1227202924124704e-05, + "loss": 0.0067, + "step": 1060 + }, + { + "epoch": 2.0443373493975905, + "grad_norm": 0.17163440585136414, + "learning_rate": 1.118692190366491e-05, + "loss": 0.0055, + "step": 1061 + }, + { + "epoch": 2.0462650602409638, + "grad_norm": 0.12304897606372833, + "learning_rate": 1.1146685198221222e-05, + "loss": 0.0036, + "step": 1062 + }, + { + "epoch": 2.0481927710843375, + "grad_norm": 0.17319051921367645, + "learning_rate": 1.1106493010116842e-05, + "loss": 0.0058, + "step": 1063 + }, + { + "epoch": 2.050120481927711, + "grad_norm": 0.2242443859577179, + "learning_rate": 1.1066345541451127e-05, + "loss": 0.0059, + "step": 1064 + }, + { + "epoch": 2.0520481927710845, + "grad_norm": 0.09533938020467758, + "learning_rate": 1.1026242994098597e-05, + "loss": 0.0033, + "step": 1065 + }, + { + "epoch": 2.053975903614458, + "grad_norm": 0.11697929352521896, + "learning_rate": 1.0986185569707852e-05, + "loss": 0.0038, + "step": 1066 + }, + { + "epoch": 2.0559036144578315, + "grad_norm": 0.2563149333000183, + "learning_rate": 1.0946173469700625e-05, + "loss": 0.0158, + "step": 1067 + }, + { + "epoch": 2.057831325301205, + "grad_norm": 0.21836932003498077, + "learning_rate": 1.0906206895270739e-05, + "loss": 0.0085, + "step": 1068 + }, + { + "epoch": 2.059759036144578, + "grad_norm": 0.1798071414232254, + "learning_rate": 1.0866286047383094e-05, + "loss": 0.0053, + "step": 1069 + }, + { + "epoch": 2.061686746987952, + "grad_norm": 0.08937730640172958, + "learning_rate": 1.0826411126772675e-05, + "loss": 0.0025, + "step": 1070 + }, + { + "epoch": 2.063614457831325, + "grad_norm": 0.0942138060927391, + "learning_rate": 1.0786582333943499e-05, + "loss": 0.0017, + "step": 1071 + }, + { + "epoch": 2.065542168674699, + "grad_norm": 0.13076582551002502, + "learning_rate": 1.0746799869167679e-05, + "loss": 0.0033, + "step": 1072 + }, + { + "epoch": 2.067469879518072, + "grad_norm": 0.0993233174085617, + "learning_rate": 1.0707063932484357e-05, + "loss": 0.0046, + "step": 1073 + }, + { + "epoch": 2.069397590361446, + "grad_norm": 0.3046741485595703, + "learning_rate": 1.0667374723698698e-05, + "loss": 0.009, + "step": 1074 + }, + { + "epoch": 2.071325301204819, + "grad_norm": 0.12197669595479965, + "learning_rate": 1.0627732442380932e-05, + "loss": 0.0034, + "step": 1075 + }, + { + "epoch": 2.073253012048193, + "grad_norm": 0.12721140682697296, + "learning_rate": 1.058813728786531e-05, + "loss": 0.0048, + "step": 1076 + }, + { + "epoch": 2.075180722891566, + "grad_norm": 0.10011966526508331, + "learning_rate": 1.0548589459249112e-05, + "loss": 0.0026, + "step": 1077 + }, + { + "epoch": 2.07710843373494, + "grad_norm": 0.3314201831817627, + "learning_rate": 1.0509089155391661e-05, + "loss": 0.0284, + "step": 1078 + }, + { + "epoch": 2.079036144578313, + "grad_norm": 0.32739701867103577, + "learning_rate": 1.0469636574913288e-05, + "loss": 0.0088, + "step": 1079 + }, + { + "epoch": 2.080963855421687, + "grad_norm": 0.13805675506591797, + "learning_rate": 1.043023191619438e-05, + "loss": 0.0042, + "step": 1080 + }, + { + "epoch": 2.0828915662650602, + "grad_norm": 0.14789745211601257, + "learning_rate": 1.039087537737435e-05, + "loss": 0.0037, + "step": 1081 + }, + { + "epoch": 2.0848192771084335, + "grad_norm": 0.15518991649150848, + "learning_rate": 1.0351567156350617e-05, + "loss": 0.0044, + "step": 1082 + }, + { + "epoch": 2.0867469879518072, + "grad_norm": 0.08380113542079926, + "learning_rate": 1.0312307450777706e-05, + "loss": 0.0019, + "step": 1083 + }, + { + "epoch": 2.0886746987951805, + "grad_norm": 0.17892400920391083, + "learning_rate": 1.027309645806613e-05, + "loss": 0.0065, + "step": 1084 + }, + { + "epoch": 2.0906024096385543, + "grad_norm": 0.5497608780860901, + "learning_rate": 1.0233934375381489e-05, + "loss": 0.0238, + "step": 1085 + }, + { + "epoch": 2.0925301204819275, + "grad_norm": 1.0189186334609985, + "learning_rate": 1.019482139964344e-05, + "loss": 0.0092, + "step": 1086 + }, + { + "epoch": 2.0944578313253013, + "grad_norm": 0.12144117057323456, + "learning_rate": 1.015575772752472e-05, + "loss": 0.0038, + "step": 1087 + }, + { + "epoch": 2.0963855421686746, + "grad_norm": 0.1115315854549408, + "learning_rate": 1.0116743555450148e-05, + "loss": 0.0024, + "step": 1088 + }, + { + "epoch": 2.0983132530120483, + "grad_norm": 0.22671759128570557, + "learning_rate": 1.0077779079595631e-05, + "loss": 0.0136, + "step": 1089 + }, + { + "epoch": 2.1002409638554216, + "grad_norm": 2.0009827613830566, + "learning_rate": 1.003886449588719e-05, + "loss": 0.0493, + "step": 1090 + }, + { + "epoch": 2.1021686746987953, + "grad_norm": 0.11907301843166351, + "learning_rate": 1.0000000000000006e-05, + "loss": 0.0034, + "step": 1091 + }, + { + "epoch": 2.1040963855421686, + "grad_norm": 0.31257638335227966, + "learning_rate": 9.961185787357346e-06, + "loss": 0.0129, + "step": 1092 + }, + { + "epoch": 2.1060240963855423, + "grad_norm": 0.11033743619918823, + "learning_rate": 9.922422053129674e-06, + "loss": 0.0184, + "step": 1093 + }, + { + "epoch": 2.1079518072289156, + "grad_norm": 0.2575698494911194, + "learning_rate": 9.883708992233626e-06, + "loss": 0.0054, + "step": 1094 + }, + { + "epoch": 2.1098795180722894, + "grad_norm": 0.12921132147312164, + "learning_rate": 9.845046799331029e-06, + "loss": 0.0037, + "step": 1095 + }, + { + "epoch": 2.1118072289156626, + "grad_norm": 0.21405921876430511, + "learning_rate": 9.806435668827941e-06, + "loss": 0.006, + "step": 1096 + }, + { + "epoch": 2.113734939759036, + "grad_norm": 0.12929430603981018, + "learning_rate": 9.76787579487363e-06, + "loss": 0.0049, + "step": 1097 + }, + { + "epoch": 2.1156626506024097, + "grad_norm": 0.1793181151151657, + "learning_rate": 9.729367371359681e-06, + "loss": 0.0086, + "step": 1098 + }, + { + "epoch": 2.117590361445783, + "grad_norm": 0.2182074338197708, + "learning_rate": 9.690910591918936e-06, + "loss": 0.0106, + "step": 1099 + }, + { + "epoch": 2.1195180722891567, + "grad_norm": 0.0705680400133133, + "learning_rate": 9.652505649924547e-06, + "loss": 0.0012, + "step": 1100 + }, + { + "epoch": 2.12144578313253, + "grad_norm": 0.10509738326072693, + "learning_rate": 9.614152738489021e-06, + "loss": 0.0048, + "step": 1101 + }, + { + "epoch": 2.1233734939759037, + "grad_norm": 0.13775436580181122, + "learning_rate": 9.575852050463268e-06, + "loss": 0.0089, + "step": 1102 + }, + { + "epoch": 2.125301204819277, + "grad_norm": 0.15230101346969604, + "learning_rate": 9.537603778435545e-06, + "loss": 0.0065, + "step": 1103 + }, + { + "epoch": 2.1272289156626507, + "grad_norm": 0.24702346324920654, + "learning_rate": 9.499408114730583e-06, + "loss": 0.016, + "step": 1104 + }, + { + "epoch": 2.129156626506024, + "grad_norm": 0.1082577034831047, + "learning_rate": 9.461265251408575e-06, + "loss": 0.0036, + "step": 1105 + }, + { + "epoch": 2.1310843373493977, + "grad_norm": 0.1063847690820694, + "learning_rate": 9.423175380264211e-06, + "loss": 0.0037, + "step": 1106 + }, + { + "epoch": 2.133012048192771, + "grad_norm": 0.07686953246593475, + "learning_rate": 9.385138692825729e-06, + "loss": 0.0031, + "step": 1107 + }, + { + "epoch": 2.1349397590361447, + "grad_norm": 0.2046380341053009, + "learning_rate": 9.347155380353912e-06, + "loss": 0.0087, + "step": 1108 + }, + { + "epoch": 2.136867469879518, + "grad_norm": 0.1341692954301834, + "learning_rate": 9.30922563384121e-06, + "loss": 0.0045, + "step": 1109 + }, + { + "epoch": 2.1387951807228918, + "grad_norm": 0.09870535880327225, + "learning_rate": 9.271349644010672e-06, + "loss": 0.003, + "step": 1110 + }, + { + "epoch": 2.140722891566265, + "grad_norm": 0.18708615005016327, + "learning_rate": 9.233527601315069e-06, + "loss": 0.0042, + "step": 1111 + }, + { + "epoch": 2.1426506024096383, + "grad_norm": 0.5175634026527405, + "learning_rate": 9.195759695935907e-06, + "loss": 0.0173, + "step": 1112 + }, + { + "epoch": 2.144578313253012, + "grad_norm": 0.14939036965370178, + "learning_rate": 9.158046117782464e-06, + "loss": 0.0031, + "step": 1113 + }, + { + "epoch": 2.1465060240963854, + "grad_norm": 0.2837410569190979, + "learning_rate": 9.120387056490851e-06, + "loss": 0.0097, + "step": 1114 + }, + { + "epoch": 2.148433734939759, + "grad_norm": 0.11088677495718002, + "learning_rate": 9.082782701423047e-06, + "loss": 0.0026, + "step": 1115 + }, + { + "epoch": 2.1503614457831324, + "grad_norm": 0.07785166054964066, + "learning_rate": 9.045233241665947e-06, + "loss": 0.0019, + "step": 1116 + }, + { + "epoch": 2.152289156626506, + "grad_norm": 0.17568141222000122, + "learning_rate": 9.007738866030427e-06, + "loss": 0.0039, + "step": 1117 + }, + { + "epoch": 2.1542168674698794, + "grad_norm": 0.12652266025543213, + "learning_rate": 8.970299763050356e-06, + "loss": 0.0033, + "step": 1118 + }, + { + "epoch": 2.156144578313253, + "grad_norm": 0.16801467537879944, + "learning_rate": 8.932916120981695e-06, + "loss": 0.0076, + "step": 1119 + }, + { + "epoch": 2.1580722891566264, + "grad_norm": 0.18313169479370117, + "learning_rate": 8.895588127801545e-06, + "loss": 0.0052, + "step": 1120 + }, + { + "epoch": 2.16, + "grad_norm": 0.07546049356460571, + "learning_rate": 8.858315971207146e-06, + "loss": 0.0022, + "step": 1121 + }, + { + "epoch": 2.1619277108433734, + "grad_norm": 0.4039839208126068, + "learning_rate": 8.821099838614996e-06, + "loss": 0.0203, + "step": 1122 + }, + { + "epoch": 2.163855421686747, + "grad_norm": 0.09244243055582047, + "learning_rate": 8.783939917159897e-06, + "loss": 0.002, + "step": 1123 + }, + { + "epoch": 2.1657831325301204, + "grad_norm": 0.18327835202217102, + "learning_rate": 8.746836393693978e-06, + "loss": 0.0055, + "step": 1124 + }, + { + "epoch": 2.167710843373494, + "grad_norm": 0.22010307013988495, + "learning_rate": 8.709789454785809e-06, + "loss": 0.0077, + "step": 1125 + }, + { + "epoch": 2.1696385542168675, + "grad_norm": 0.09438297897577286, + "learning_rate": 8.67279928671939e-06, + "loss": 0.0032, + "step": 1126 + }, + { + "epoch": 2.1715662650602408, + "grad_norm": 0.20782770216464996, + "learning_rate": 8.635866075493318e-06, + "loss": 0.0028, + "step": 1127 + }, + { + "epoch": 2.1734939759036145, + "grad_norm": 0.1958685964345932, + "learning_rate": 8.598990006819756e-06, + "loss": 0.0047, + "step": 1128 + }, + { + "epoch": 2.1754216867469878, + "grad_norm": 0.06459935009479523, + "learning_rate": 8.562171266123528e-06, + "loss": 0.0015, + "step": 1129 + }, + { + "epoch": 2.1773493975903615, + "grad_norm": 0.33486708998680115, + "learning_rate": 8.525410038541218e-06, + "loss": 0.0094, + "step": 1130 + }, + { + "epoch": 2.179277108433735, + "grad_norm": 0.5755940079689026, + "learning_rate": 8.488706508920202e-06, + "loss": 0.0067, + "step": 1131 + }, + { + "epoch": 2.1812048192771085, + "grad_norm": 0.10840924829244614, + "learning_rate": 8.452060861817738e-06, + "loss": 0.0082, + "step": 1132 + }, + { + "epoch": 2.183132530120482, + "grad_norm": 0.18611350655555725, + "learning_rate": 8.415473281500037e-06, + "loss": 0.0059, + "step": 1133 + }, + { + "epoch": 2.1850602409638555, + "grad_norm": 0.11245249956846237, + "learning_rate": 8.378943951941301e-06, + "loss": 0.0107, + "step": 1134 + }, + { + "epoch": 2.186987951807229, + "grad_norm": 0.12284426391124725, + "learning_rate": 8.342473056822873e-06, + "loss": 0.0025, + "step": 1135 + }, + { + "epoch": 2.1889156626506026, + "grad_norm": 0.12542888522148132, + "learning_rate": 8.306060779532245e-06, + "loss": 0.0059, + "step": 1136 + }, + { + "epoch": 2.190843373493976, + "grad_norm": 0.1287655532360077, + "learning_rate": 8.26970730316215e-06, + "loss": 0.0022, + "step": 1137 + }, + { + "epoch": 2.1927710843373496, + "grad_norm": 0.1818632185459137, + "learning_rate": 8.233412810509669e-06, + "loss": 0.0131, + "step": 1138 + }, + { + "epoch": 2.194698795180723, + "grad_norm": 0.09687745571136475, + "learning_rate": 8.197177484075284e-06, + "loss": 0.0025, + "step": 1139 + }, + { + "epoch": 2.1966265060240966, + "grad_norm": 0.16103452444076538, + "learning_rate": 8.161001506061979e-06, + "loss": 0.0031, + "step": 1140 + }, + { + "epoch": 2.19855421686747, + "grad_norm": 0.2711680233478546, + "learning_rate": 8.124885058374302e-06, + "loss": 0.0034, + "step": 1141 + }, + { + "epoch": 2.200481927710843, + "grad_norm": 0.17613105475902557, + "learning_rate": 8.088828322617473e-06, + "loss": 0.0044, + "step": 1142 + }, + { + "epoch": 2.202409638554217, + "grad_norm": 0.2298487424850464, + "learning_rate": 8.052831480096464e-06, + "loss": 0.0168, + "step": 1143 + }, + { + "epoch": 2.20433734939759, + "grad_norm": 0.17042206227779388, + "learning_rate": 8.016894711815067e-06, + "loss": 0.007, + "step": 1144 + }, + { + "epoch": 2.206265060240964, + "grad_norm": 0.2830466628074646, + "learning_rate": 7.98101819847501e-06, + "loss": 0.0091, + "step": 1145 + }, + { + "epoch": 2.208192771084337, + "grad_norm": 0.22089065611362457, + "learning_rate": 7.945202120475063e-06, + "loss": 0.0046, + "step": 1146 + }, + { + "epoch": 2.210120481927711, + "grad_norm": 0.1716073900461197, + "learning_rate": 7.909446657910072e-06, + "loss": 0.0032, + "step": 1147 + }, + { + "epoch": 2.212048192771084, + "grad_norm": 0.16140373051166534, + "learning_rate": 7.873751990570104e-06, + "loss": 0.0057, + "step": 1148 + }, + { + "epoch": 2.213975903614458, + "grad_norm": 0.1671605408191681, + "learning_rate": 7.838118297939529e-06, + "loss": 0.0039, + "step": 1149 + }, + { + "epoch": 2.2159036144578312, + "grad_norm": 0.10933005809783936, + "learning_rate": 7.802545759196117e-06, + "loss": 0.005, + "step": 1150 + }, + { + "epoch": 2.217831325301205, + "grad_norm": 0.07819998264312744, + "learning_rate": 7.76703455321014e-06, + "loss": 0.0025, + "step": 1151 + }, + { + "epoch": 2.2197590361445783, + "grad_norm": 0.36211854219436646, + "learning_rate": 7.73158485854344e-06, + "loss": 0.0151, + "step": 1152 + }, + { + "epoch": 2.221686746987952, + "grad_norm": 0.09098304808139801, + "learning_rate": 7.696196853448612e-06, + "loss": 0.0027, + "step": 1153 + }, + { + "epoch": 2.2236144578313253, + "grad_norm": 0.17442144453525543, + "learning_rate": 7.660870715868018e-06, + "loss": 0.006, + "step": 1154 + }, + { + "epoch": 2.225542168674699, + "grad_norm": 0.09785338491201401, + "learning_rate": 7.625606623432933e-06, + "loss": 0.0041, + "step": 1155 + }, + { + "epoch": 2.2274698795180723, + "grad_norm": 0.19399888813495636, + "learning_rate": 7.590404753462653e-06, + "loss": 0.0125, + "step": 1156 + }, + { + "epoch": 2.2293975903614456, + "grad_norm": 0.11080623418092728, + "learning_rate": 7.55526528296362e-06, + "loss": 0.0022, + "step": 1157 + }, + { + "epoch": 2.2313253012048193, + "grad_norm": 0.14067359268665314, + "learning_rate": 7.520188388628473e-06, + "loss": 0.0123, + "step": 1158 + }, + { + "epoch": 2.2332530120481926, + "grad_norm": 0.14533625543117523, + "learning_rate": 7.485174246835227e-06, + "loss": 0.0039, + "step": 1159 + }, + { + "epoch": 2.2351807228915663, + "grad_norm": 0.1253812462091446, + "learning_rate": 7.4502230336463466e-06, + "loss": 0.003, + "step": 1160 + }, + { + "epoch": 2.2371084337349396, + "grad_norm": 0.12766572833061218, + "learning_rate": 7.415334924807869e-06, + "loss": 0.0044, + "step": 1161 + }, + { + "epoch": 2.2390361445783133, + "grad_norm": 0.11985791474580765, + "learning_rate": 7.380510095748535e-06, + "loss": 0.0071, + "step": 1162 + }, + { + "epoch": 2.2409638554216866, + "grad_norm": 0.15505346655845642, + "learning_rate": 7.3457487215788605e-06, + "loss": 0.0046, + "step": 1163 + }, + { + "epoch": 2.2428915662650604, + "grad_norm": 0.18983210623264313, + "learning_rate": 7.311050977090343e-06, + "loss": 0.0079, + "step": 1164 + }, + { + "epoch": 2.2448192771084337, + "grad_norm": 0.19279207289218903, + "learning_rate": 7.276417036754479e-06, + "loss": 0.0042, + "step": 1165 + }, + { + "epoch": 2.2467469879518074, + "grad_norm": 0.21539707481861115, + "learning_rate": 7.241847074721964e-06, + "loss": 0.0087, + "step": 1166 + }, + { + "epoch": 2.2486746987951807, + "grad_norm": 0.07004354894161224, + "learning_rate": 7.207341264821783e-06, + "loss": 0.002, + "step": 1167 + }, + { + "epoch": 2.2506024096385544, + "grad_norm": 0.2203039526939392, + "learning_rate": 7.172899780560345e-06, + "loss": 0.0069, + "step": 1168 + }, + { + "epoch": 2.2525301204819277, + "grad_norm": 0.12474718689918518, + "learning_rate": 7.138522795120606e-06, + "loss": 0.0122, + "step": 1169 + }, + { + "epoch": 2.2544578313253014, + "grad_norm": 0.09078995883464813, + "learning_rate": 7.104210481361204e-06, + "loss": 0.0025, + "step": 1170 + }, + { + "epoch": 2.2563855421686747, + "grad_norm": 0.141757071018219, + "learning_rate": 7.069963011815584e-06, + "loss": 0.0039, + "step": 1171 + }, + { + "epoch": 2.258313253012048, + "grad_norm": 0.14944659173488617, + "learning_rate": 7.035780558691141e-06, + "loss": 0.0025, + "step": 1172 + }, + { + "epoch": 2.2602409638554217, + "grad_norm": 0.06723666191101074, + "learning_rate": 7.001663293868328e-06, + "loss": 0.0014, + "step": 1173 + }, + { + "epoch": 2.262168674698795, + "grad_norm": 0.11966485530138016, + "learning_rate": 6.967611388899826e-06, + "loss": 0.0067, + "step": 1174 + }, + { + "epoch": 2.2640963855421687, + "grad_norm": 0.08943185210227966, + "learning_rate": 6.933625015009666e-06, + "loss": 0.0036, + "step": 1175 + }, + { + "epoch": 2.266024096385542, + "grad_norm": 0.04511453956365585, + "learning_rate": 6.899704343092359e-06, + "loss": 0.0014, + "step": 1176 + }, + { + "epoch": 2.2679518072289158, + "grad_norm": 0.1867951601743698, + "learning_rate": 6.865849543712058e-06, + "loss": 0.009, + "step": 1177 + }, + { + "epoch": 2.269879518072289, + "grad_norm": 0.23791250586509705, + "learning_rate": 6.832060787101658e-06, + "loss": 0.0117, + "step": 1178 + }, + { + "epoch": 2.271807228915663, + "grad_norm": 0.13210316002368927, + "learning_rate": 6.798338243162008e-06, + "loss": 0.0024, + "step": 1179 + }, + { + "epoch": 2.273734939759036, + "grad_norm": 0.1601375937461853, + "learning_rate": 6.764682081461002e-06, + "loss": 0.013, + "step": 1180 + }, + { + "epoch": 2.27566265060241, + "grad_norm": 0.21996766328811646, + "learning_rate": 6.73109247123273e-06, + "loss": 0.0074, + "step": 1181 + }, + { + "epoch": 2.277590361445783, + "grad_norm": 0.15780030190944672, + "learning_rate": 6.6975695813766465e-06, + "loss": 0.0052, + "step": 1182 + }, + { + "epoch": 2.279518072289157, + "grad_norm": 0.18146437406539917, + "learning_rate": 6.664113580456739e-06, + "loss": 0.0265, + "step": 1183 + }, + { + "epoch": 2.28144578313253, + "grad_norm": 0.12033495306968689, + "learning_rate": 6.630724636700618e-06, + "loss": 0.0026, + "step": 1184 + }, + { + "epoch": 2.283373493975904, + "grad_norm": 0.25268155336380005, + "learning_rate": 6.59740291799873e-06, + "loss": 0.0046, + "step": 1185 + }, + { + "epoch": 2.285301204819277, + "grad_norm": 0.19043004512786865, + "learning_rate": 6.564148591903488e-06, + "loss": 0.0063, + "step": 1186 + }, + { + "epoch": 2.2872289156626504, + "grad_norm": 0.06894923001527786, + "learning_rate": 6.530961825628432e-06, + "loss": 0.0012, + "step": 1187 + }, + { + "epoch": 2.289156626506024, + "grad_norm": 0.16378818452358246, + "learning_rate": 6.4978427860474015e-06, + "loss": 0.0048, + "step": 1188 + }, + { + "epoch": 2.2910843373493974, + "grad_norm": 0.11130444705486298, + "learning_rate": 6.464791639693648e-06, + "loss": 0.0049, + "step": 1189 + }, + { + "epoch": 2.293012048192771, + "grad_norm": 0.10573417693376541, + "learning_rate": 6.431808552759083e-06, + "loss": 0.0019, + "step": 1190 + }, + { + "epoch": 2.2949397590361444, + "grad_norm": 0.13344882428646088, + "learning_rate": 6.398893691093367e-06, + "loss": 0.0033, + "step": 1191 + }, + { + "epoch": 2.296867469879518, + "grad_norm": 0.12659135460853577, + "learning_rate": 6.366047220203088e-06, + "loss": 0.0032, + "step": 1192 + }, + { + "epoch": 2.2987951807228915, + "grad_norm": 0.10152821987867355, + "learning_rate": 6.333269305250971e-06, + "loss": 0.0027, + "step": 1193 + }, + { + "epoch": 2.300722891566265, + "grad_norm": 0.1889944225549698, + "learning_rate": 6.300560111055006e-06, + "loss": 0.0062, + "step": 1194 + }, + { + "epoch": 2.3026506024096385, + "grad_norm": 2.3101227283477783, + "learning_rate": 6.2679198020876275e-06, + "loss": 0.0113, + "step": 1195 + }, + { + "epoch": 2.304578313253012, + "grad_norm": 0.6224933862686157, + "learning_rate": 6.235348542474908e-06, + "loss": 0.0273, + "step": 1196 + }, + { + "epoch": 2.3065060240963855, + "grad_norm": 0.1908419281244278, + "learning_rate": 6.202846495995705e-06, + "loss": 0.0056, + "step": 1197 + }, + { + "epoch": 2.3084337349397592, + "grad_norm": 0.10968491435050964, + "learning_rate": 6.170413826080856e-06, + "loss": 0.0034, + "step": 1198 + }, + { + "epoch": 2.3103614457831325, + "grad_norm": 0.23200668394565582, + "learning_rate": 6.138050695812343e-06, + "loss": 0.0042, + "step": 1199 + }, + { + "epoch": 2.3122891566265062, + "grad_norm": 0.12442032992839813, + "learning_rate": 6.105757267922481e-06, + "loss": 0.0045, + "step": 1200 + }, + { + "epoch": 2.3142168674698795, + "grad_norm": 0.14563624560832977, + "learning_rate": 6.073533704793122e-06, + "loss": 0.0035, + "step": 1201 + }, + { + "epoch": 2.316144578313253, + "grad_norm": 0.11523722857236862, + "learning_rate": 6.04138016845478e-06, + "loss": 0.0088, + "step": 1202 + }, + { + "epoch": 2.3180722891566266, + "grad_norm": 0.2000943422317505, + "learning_rate": 6.009296820585871e-06, + "loss": 0.0059, + "step": 1203 + }, + { + "epoch": 2.32, + "grad_norm": 0.10698592662811279, + "learning_rate": 5.977283822511879e-06, + "loss": 0.0028, + "step": 1204 + }, + { + "epoch": 2.3219277108433736, + "grad_norm": 0.1533137410879135, + "learning_rate": 5.945341335204547e-06, + "loss": 0.0044, + "step": 1205 + }, + { + "epoch": 2.323855421686747, + "grad_norm": 0.1235835999250412, + "learning_rate": 5.9134695192810695e-06, + "loss": 0.0043, + "step": 1206 + }, + { + "epoch": 2.3257831325301206, + "grad_norm": 0.1916925013065338, + "learning_rate": 5.8816685350032575e-06, + "loss": 0.0066, + "step": 1207 + }, + { + "epoch": 2.327710843373494, + "grad_norm": 0.08812380582094193, + "learning_rate": 5.849938542276801e-06, + "loss": 0.0022, + "step": 1208 + }, + { + "epoch": 2.3296385542168676, + "grad_norm": 0.13387660682201385, + "learning_rate": 5.818279700650393e-06, + "loss": 0.0037, + "step": 1209 + }, + { + "epoch": 2.331566265060241, + "grad_norm": 0.2309022694826126, + "learning_rate": 5.786692169314954e-06, + "loss": 0.0049, + "step": 1210 + }, + { + "epoch": 2.3334939759036146, + "grad_norm": 0.09956549853086472, + "learning_rate": 5.755176107102833e-06, + "loss": 0.002, + "step": 1211 + }, + { + "epoch": 2.335421686746988, + "grad_norm": 0.06035687029361725, + "learning_rate": 5.723731672487043e-06, + "loss": 0.002, + "step": 1212 + }, + { + "epoch": 2.337349397590361, + "grad_norm": 0.06850237399339676, + "learning_rate": 5.69235902358038e-06, + "loss": 0.0013, + "step": 1213 + }, + { + "epoch": 2.339277108433735, + "grad_norm": 0.12068171054124832, + "learning_rate": 5.661058318134711e-06, + "loss": 0.0041, + "step": 1214 + }, + { + "epoch": 2.3412048192771087, + "grad_norm": 0.13146616518497467, + "learning_rate": 5.6298297135401355e-06, + "loss": 0.0022, + "step": 1215 + }, + { + "epoch": 2.343132530120482, + "grad_norm": 0.15160737931728363, + "learning_rate": 5.598673366824212e-06, + "loss": 0.0036, + "step": 1216 + }, + { + "epoch": 2.3450602409638552, + "grad_norm": 0.26196014881134033, + "learning_rate": 5.567589434651164e-06, + "loss": 0.0151, + "step": 1217 + }, + { + "epoch": 2.346987951807229, + "grad_norm": 0.12898831069469452, + "learning_rate": 5.536578073321073e-06, + "loss": 0.006, + "step": 1218 + }, + { + "epoch": 2.3489156626506023, + "grad_norm": 0.11385104805231094, + "learning_rate": 5.505639438769146e-06, + "loss": 0.0052, + "step": 1219 + }, + { + "epoch": 2.350843373493976, + "grad_norm": 0.14569509029388428, + "learning_rate": 5.47477368656486e-06, + "loss": 0.0048, + "step": 1220 + }, + { + "epoch": 2.3527710843373493, + "grad_norm": 0.12406075745820999, + "learning_rate": 5.443980971911238e-06, + "loss": 0.0028, + "step": 1221 + }, + { + "epoch": 2.354698795180723, + "grad_norm": 0.3730498254299164, + "learning_rate": 5.413261449644039e-06, + "loss": 0.0043, + "step": 1222 + }, + { + "epoch": 2.3566265060240963, + "grad_norm": 0.1449914574623108, + "learning_rate": 5.382615274230987e-06, + "loss": 0.0075, + "step": 1223 + }, + { + "epoch": 2.35855421686747, + "grad_norm": 0.20739100873470306, + "learning_rate": 5.352042599770995e-06, + "loss": 0.0061, + "step": 1224 + }, + { + "epoch": 2.3604819277108433, + "grad_norm": 0.05786775052547455, + "learning_rate": 5.321543579993398e-06, + "loss": 0.0015, + "step": 1225 + }, + { + "epoch": 2.362409638554217, + "grad_norm": 0.09043122828006744, + "learning_rate": 5.2911183682571446e-06, + "loss": 0.0034, + "step": 1226 + }, + { + "epoch": 2.3643373493975903, + "grad_norm": 0.2685496211051941, + "learning_rate": 5.260767117550094e-06, + "loss": 0.0076, + "step": 1227 + }, + { + "epoch": 2.3662650602409636, + "grad_norm": 0.17694126069545746, + "learning_rate": 5.230489980488165e-06, + "loss": 0.0148, + "step": 1228 + }, + { + "epoch": 2.3681927710843373, + "grad_norm": 0.11609307676553726, + "learning_rate": 5.200287109314633e-06, + "loss": 0.0049, + "step": 1229 + }, + { + "epoch": 2.370120481927711, + "grad_norm": 0.1257704645395279, + "learning_rate": 5.1701586558993285e-06, + "loss": 0.0031, + "step": 1230 + }, + { + "epoch": 2.3720481927710844, + "grad_norm": 0.27177703380584717, + "learning_rate": 5.140104771737899e-06, + "loss": 0.0058, + "step": 1231 + }, + { + "epoch": 2.3739759036144576, + "grad_norm": 0.13928169012069702, + "learning_rate": 5.110125607951024e-06, + "loss": 0.0051, + "step": 1232 + }, + { + "epoch": 2.3759036144578314, + "grad_norm": 0.679577648639679, + "learning_rate": 5.0802213152836514e-06, + "loss": 0.0173, + "step": 1233 + }, + { + "epoch": 2.3778313253012047, + "grad_norm": 0.16769403219223022, + "learning_rate": 5.0503920441042845e-06, + "loss": 0.0045, + "step": 1234 + }, + { + "epoch": 2.3797590361445784, + "grad_norm": 0.09427493065595627, + "learning_rate": 5.0206379444041764e-06, + "loss": 0.0024, + "step": 1235 + }, + { + "epoch": 2.3816867469879517, + "grad_norm": 0.33908671140670776, + "learning_rate": 4.990959165796585e-06, + "loss": 0.0088, + "step": 1236 + }, + { + "epoch": 2.3836144578313254, + "grad_norm": 0.18106943368911743, + "learning_rate": 4.961355857516034e-06, + "loss": 0.0094, + "step": 1237 + }, + { + "epoch": 2.3855421686746987, + "grad_norm": 0.5833203196525574, + "learning_rate": 4.931828168417583e-06, + "loss": 0.0086, + "step": 1238 + }, + { + "epoch": 2.3874698795180724, + "grad_norm": 0.09108569473028183, + "learning_rate": 4.902376246976015e-06, + "loss": 0.0014, + "step": 1239 + }, + { + "epoch": 2.3893975903614457, + "grad_norm": 0.10596407204866409, + "learning_rate": 4.873000241285153e-06, + "loss": 0.0043, + "step": 1240 + }, + { + "epoch": 2.3913253012048195, + "grad_norm": 0.10775511711835861, + "learning_rate": 4.8437002990570835e-06, + "loss": 0.0014, + "step": 1241 + }, + { + "epoch": 2.3932530120481927, + "grad_norm": 0.9646345973014832, + "learning_rate": 4.8144765676214245e-06, + "loss": 0.0525, + "step": 1242 + }, + { + "epoch": 2.395180722891566, + "grad_norm": 0.20530278980731964, + "learning_rate": 4.7853291939245814e-06, + "loss": 0.008, + "step": 1243 + }, + { + "epoch": 2.3971084337349398, + "grad_norm": 0.1682119369506836, + "learning_rate": 4.756258324528995e-06, + "loss": 0.0044, + "step": 1244 + }, + { + "epoch": 2.3990361445783135, + "grad_norm": 0.45536917448043823, + "learning_rate": 4.727264105612439e-06, + "loss": 0.0186, + "step": 1245 + }, + { + "epoch": 2.4009638554216868, + "grad_norm": 0.3017471730709076, + "learning_rate": 4.698346682967258e-06, + "loss": 0.0106, + "step": 1246 + }, + { + "epoch": 2.40289156626506, + "grad_norm": 0.1226554661989212, + "learning_rate": 4.669506201999625e-06, + "loss": 0.0035, + "step": 1247 + }, + { + "epoch": 2.404819277108434, + "grad_norm": 0.13750068843364716, + "learning_rate": 4.640742807728837e-06, + "loss": 0.0038, + "step": 1248 + }, + { + "epoch": 2.406746987951807, + "grad_norm": 0.11531024426221848, + "learning_rate": 4.612056644786575e-06, + "loss": 0.0021, + "step": 1249 + }, + { + "epoch": 2.408674698795181, + "grad_norm": 0.1143675372004509, + "learning_rate": 4.583447857416175e-06, + "loss": 0.0028, + "step": 1250 + }, + { + "epoch": 2.410602409638554, + "grad_norm": 0.0914216861128807, + "learning_rate": 4.554916589471898e-06, + "loss": 0.0027, + "step": 1251 + }, + { + "epoch": 2.412530120481928, + "grad_norm": 0.18339012563228607, + "learning_rate": 4.526462984418221e-06, + "loss": 0.0037, + "step": 1252 + }, + { + "epoch": 2.414457831325301, + "grad_norm": 0.11073138564825058, + "learning_rate": 4.498087185329105e-06, + "loss": 0.003, + "step": 1253 + }, + { + "epoch": 2.416385542168675, + "grad_norm": 0.20792435109615326, + "learning_rate": 4.469789334887265e-06, + "loss": 0.009, + "step": 1254 + }, + { + "epoch": 2.418313253012048, + "grad_norm": 0.09485629945993423, + "learning_rate": 4.441569575383471e-06, + "loss": 0.0033, + "step": 1255 + }, + { + "epoch": 2.420240963855422, + "grad_norm": 0.11831793934106827, + "learning_rate": 4.413428048715851e-06, + "loss": 0.0021, + "step": 1256 + }, + { + "epoch": 2.422168674698795, + "grad_norm": 0.11818034201860428, + "learning_rate": 4.38536489638911e-06, + "loss": 0.0041, + "step": 1257 + }, + { + "epoch": 2.4240963855421684, + "grad_norm": 0.2583082616329193, + "learning_rate": 4.3573802595138945e-06, + "loss": 0.0039, + "step": 1258 + }, + { + "epoch": 2.426024096385542, + "grad_norm": 0.3120201826095581, + "learning_rate": 4.329474278806034e-06, + "loss": 0.0087, + "step": 1259 + }, + { + "epoch": 2.427951807228916, + "grad_norm": 0.1258879452943802, + "learning_rate": 4.301647094585855e-06, + "loss": 0.0046, + "step": 1260 + }, + { + "epoch": 2.429879518072289, + "grad_norm": 0.15144586563110352, + "learning_rate": 4.273898846777473e-06, + "loss": 0.0054, + "step": 1261 + }, + { + "epoch": 2.4318072289156625, + "grad_norm": 0.15615184605121613, + "learning_rate": 4.246229674908067e-06, + "loss": 0.0072, + "step": 1262 + }, + { + "epoch": 2.433734939759036, + "grad_norm": 0.09690173715353012, + "learning_rate": 4.218639718107225e-06, + "loss": 0.003, + "step": 1263 + }, + { + "epoch": 2.4356626506024095, + "grad_norm": 0.23884955048561096, + "learning_rate": 4.1911291151062e-06, + "loss": 0.0109, + "step": 1264 + }, + { + "epoch": 2.4375903614457832, + "grad_norm": 0.0905768945813179, + "learning_rate": 4.163698004237222e-06, + "loss": 0.0027, + "step": 1265 + }, + { + "epoch": 2.4395180722891565, + "grad_norm": 0.09168912470340729, + "learning_rate": 4.136346523432821e-06, + "loss": 0.0018, + "step": 1266 + }, + { + "epoch": 2.4414457831325302, + "grad_norm": 0.17878012359142303, + "learning_rate": 4.109074810225118e-06, + "loss": 0.0048, + "step": 1267 + }, + { + "epoch": 2.4433734939759035, + "grad_norm": 0.09913790971040726, + "learning_rate": 4.08188300174513e-06, + "loss": 0.0021, + "step": 1268 + }, + { + "epoch": 2.4453012048192773, + "grad_norm": 0.16615812480449677, + "learning_rate": 4.054771234722106e-06, + "loss": 0.0066, + "step": 1269 + }, + { + "epoch": 2.4472289156626506, + "grad_norm": 0.09618276357650757, + "learning_rate": 4.027739645482784e-06, + "loss": 0.0043, + "step": 1270 + }, + { + "epoch": 2.4491566265060243, + "grad_norm": 0.33473479747772217, + "learning_rate": 4.0007883699507855e-06, + "loss": 0.0236, + "step": 1271 + }, + { + "epoch": 2.4510843373493976, + "grad_norm": 0.15051880478858948, + "learning_rate": 3.973917543645867e-06, + "loss": 0.0068, + "step": 1272 + }, + { + "epoch": 2.453012048192771, + "grad_norm": 0.24134816229343414, + "learning_rate": 3.947127301683249e-06, + "loss": 0.0194, + "step": 1273 + }, + { + "epoch": 2.4549397590361446, + "grad_norm": 0.10495353490114212, + "learning_rate": 3.920417778772967e-06, + "loss": 0.0042, + "step": 1274 + }, + { + "epoch": 2.4568674698795183, + "grad_norm": 0.2294938713312149, + "learning_rate": 3.893789109219171e-06, + "loss": 0.0224, + "step": 1275 + }, + { + "epoch": 2.4587951807228916, + "grad_norm": 0.13710513710975647, + "learning_rate": 3.867241426919446e-06, + "loss": 0.0046, + "step": 1276 + }, + { + "epoch": 2.460722891566265, + "grad_norm": 0.06754808127880096, + "learning_rate": 3.840774865364157e-06, + "loss": 0.0019, + "step": 1277 + }, + { + "epoch": 2.4626506024096386, + "grad_norm": 0.24797780811786652, + "learning_rate": 3.8143895576357605e-06, + "loss": 0.0063, + "step": 1278 + }, + { + "epoch": 2.464578313253012, + "grad_norm": 0.1476449817419052, + "learning_rate": 3.788085636408143e-06, + "loss": 0.0055, + "step": 1279 + }, + { + "epoch": 2.4665060240963856, + "grad_norm": 0.22397096455097198, + "learning_rate": 3.7618632339459616e-06, + "loss": 0.0164, + "step": 1280 + }, + { + "epoch": 2.468433734939759, + "grad_norm": 0.21596969664096832, + "learning_rate": 3.7357224821039497e-06, + "loss": 0.0112, + "step": 1281 + }, + { + "epoch": 2.4703614457831327, + "grad_norm": 0.2775099575519562, + "learning_rate": 3.7096635123263068e-06, + "loss": 0.0112, + "step": 1282 + }, + { + "epoch": 2.472289156626506, + "grad_norm": 0.07963326573371887, + "learning_rate": 3.683686455645974e-06, + "loss": 0.0013, + "step": 1283 + }, + { + "epoch": 2.4742168674698797, + "grad_norm": 0.1253802627325058, + "learning_rate": 3.6577914426840266e-06, + "loss": 0.0038, + "step": 1284 + }, + { + "epoch": 2.476144578313253, + "grad_norm": 0.10258597880601883, + "learning_rate": 3.631978603648989e-06, + "loss": 0.0023, + "step": 1285 + }, + { + "epoch": 2.4780722891566267, + "grad_norm": 0.17102380096912384, + "learning_rate": 3.6062480683361935e-06, + "loss": 0.0025, + "step": 1286 + }, + { + "epoch": 2.48, + "grad_norm": 0.09547360241413116, + "learning_rate": 3.580599966127123e-06, + "loss": 0.003, + "step": 1287 + }, + { + "epoch": 2.4819277108433733, + "grad_norm": 0.08008653670549393, + "learning_rate": 3.5550344259887438e-06, + "loss": 0.0023, + "step": 1288 + }, + { + "epoch": 2.483855421686747, + "grad_norm": 0.07712296396493912, + "learning_rate": 3.5295515764729003e-06, + "loss": 0.0015, + "step": 1289 + }, + { + "epoch": 2.4857831325301207, + "grad_norm": 0.21118703484535217, + "learning_rate": 3.5041515457156303e-06, + "loss": 0.0041, + "step": 1290 + }, + { + "epoch": 2.487710843373494, + "grad_norm": 0.10772393643856049, + "learning_rate": 3.4788344614365155e-06, + "loss": 0.0029, + "step": 1291 + }, + { + "epoch": 2.4896385542168673, + "grad_norm": 0.2353268563747406, + "learning_rate": 3.453600450938073e-06, + "loss": 0.0072, + "step": 1292 + }, + { + "epoch": 2.491566265060241, + "grad_norm": 0.2897944152355194, + "learning_rate": 3.428449641105107e-06, + "loss": 0.0205, + "step": 1293 + }, + { + "epoch": 2.4934939759036143, + "grad_norm": 0.19756680727005005, + "learning_rate": 3.4033821584040383e-06, + "loss": 0.0065, + "step": 1294 + }, + { + "epoch": 2.495421686746988, + "grad_norm": 0.13538534939289093, + "learning_rate": 3.378398128882305e-06, + "loss": 0.0025, + "step": 1295 + }, + { + "epoch": 2.4973493975903613, + "grad_norm": 0.2301637977361679, + "learning_rate": 3.3534976781677142e-06, + "loss": 0.0071, + "step": 1296 + }, + { + "epoch": 2.499277108433735, + "grad_norm": 0.0965796634554863, + "learning_rate": 3.3286809314678137e-06, + "loss": 0.0024, + "step": 1297 + }, + { + "epoch": 2.5012048192771084, + "grad_norm": 0.0777980163693428, + "learning_rate": 3.30394801356926e-06, + "loss": 0.0013, + "step": 1298 + }, + { + "epoch": 2.503132530120482, + "grad_norm": 0.3157603442668915, + "learning_rate": 3.279299048837177e-06, + "loss": 0.0228, + "step": 1299 + }, + { + "epoch": 2.5050602409638554, + "grad_norm": 0.15660233795642853, + "learning_rate": 3.2547341612145654e-06, + "loss": 0.0056, + "step": 1300 + }, + { + "epoch": 2.506987951807229, + "grad_norm": 0.21655581891536713, + "learning_rate": 3.2302534742216586e-06, + "loss": 0.0081, + "step": 1301 + }, + { + "epoch": 2.5089156626506024, + "grad_norm": 0.09475889801979065, + "learning_rate": 3.205857110955277e-06, + "loss": 0.0029, + "step": 1302 + }, + { + "epoch": 2.5108433734939757, + "grad_norm": 0.13174696266651154, + "learning_rate": 3.18154519408826e-06, + "loss": 0.0059, + "step": 1303 + }, + { + "epoch": 2.5127710843373494, + "grad_norm": 0.10386355221271515, + "learning_rate": 3.1573178458688102e-06, + "loss": 0.0042, + "step": 1304 + }, + { + "epoch": 2.514698795180723, + "grad_norm": 0.12700854241847992, + "learning_rate": 3.133175188119899e-06, + "loss": 0.0041, + "step": 1305 + }, + { + "epoch": 2.5166265060240964, + "grad_norm": 0.1617022454738617, + "learning_rate": 3.109117342238639e-06, + "loss": 0.0053, + "step": 1306 + }, + { + "epoch": 2.5185542168674697, + "grad_norm": 0.8668884038925171, + "learning_rate": 3.085144429195688e-06, + "loss": 0.0084, + "step": 1307 + }, + { + "epoch": 2.5204819277108435, + "grad_norm": 0.22429344058036804, + "learning_rate": 3.061256569534634e-06, + "loss": 0.0053, + "step": 1308 + }, + { + "epoch": 2.5224096385542167, + "grad_norm": 0.08967582136392593, + "learning_rate": 3.037453883371375e-06, + "loss": 0.0018, + "step": 1309 + }, + { + "epoch": 2.5243373493975905, + "grad_norm": 0.1251695454120636, + "learning_rate": 3.0137364903935464e-06, + "loss": 0.0037, + "step": 1310 + }, + { + "epoch": 2.5262650602409638, + "grad_norm": 0.09026174992322922, + "learning_rate": 2.990104509859897e-06, + "loss": 0.0024, + "step": 1311 + }, + { + "epoch": 2.5281927710843375, + "grad_norm": 0.34319114685058594, + "learning_rate": 2.966558060599689e-06, + "loss": 0.0063, + "step": 1312 + }, + { + "epoch": 2.5301204819277108, + "grad_norm": 0.20300136506557465, + "learning_rate": 2.9430972610121087e-06, + "loss": 0.0054, + "step": 1313 + }, + { + "epoch": 2.532048192771084, + "grad_norm": 0.19160760939121246, + "learning_rate": 2.9197222290656737e-06, + "loss": 0.0095, + "step": 1314 + }, + { + "epoch": 2.533975903614458, + "grad_norm": 0.18991442024707794, + "learning_rate": 2.8964330822976227e-06, + "loss": 0.006, + "step": 1315 + }, + { + "epoch": 2.5359036144578315, + "grad_norm": 0.1801903396844864, + "learning_rate": 2.873229937813349e-06, + "loss": 0.0067, + "step": 1316 + }, + { + "epoch": 2.537831325301205, + "grad_norm": 0.07068303227424622, + "learning_rate": 2.850112912285783e-06, + "loss": 0.0015, + "step": 1317 + }, + { + "epoch": 2.539759036144578, + "grad_norm": 0.1404612809419632, + "learning_rate": 2.8270821219548296e-06, + "loss": 0.0036, + "step": 1318 + }, + { + "epoch": 2.541686746987952, + "grad_norm": 0.12199504673480988, + "learning_rate": 2.8041376826267862e-06, + "loss": 0.0068, + "step": 1319 + }, + { + "epoch": 2.5436144578313256, + "grad_norm": 0.2167249619960785, + "learning_rate": 2.7812797096737253e-06, + "loss": 0.0048, + "step": 1320 + }, + { + "epoch": 2.545542168674699, + "grad_norm": 0.07466506212949753, + "learning_rate": 2.7585083180329575e-06, + "loss": 0.0017, + "step": 1321 + }, + { + "epoch": 2.547469879518072, + "grad_norm": 0.11736353486776352, + "learning_rate": 2.7358236222064283e-06, + "loss": 0.003, + "step": 1322 + }, + { + "epoch": 2.549397590361446, + "grad_norm": 0.16602204740047455, + "learning_rate": 2.7132257362601453e-06, + "loss": 0.005, + "step": 1323 + }, + { + "epoch": 2.551325301204819, + "grad_norm": 0.15473629534244537, + "learning_rate": 2.6907147738236193e-06, + "loss": 0.0077, + "step": 1324 + }, + { + "epoch": 2.553253012048193, + "grad_norm": 0.07868973910808563, + "learning_rate": 2.6682908480892567e-06, + "loss": 0.0013, + "step": 1325 + }, + { + "epoch": 2.555180722891566, + "grad_norm": 0.2137845754623413, + "learning_rate": 2.645954071811847e-06, + "loss": 0.0092, + "step": 1326 + }, + { + "epoch": 2.55710843373494, + "grad_norm": 0.11191053688526154, + "learning_rate": 2.623704557307949e-06, + "loss": 0.0031, + "step": 1327 + }, + { + "epoch": 2.559036144578313, + "grad_norm": 0.3080642521381378, + "learning_rate": 2.6015424164553295e-06, + "loss": 0.0104, + "step": 1328 + }, + { + "epoch": 2.5609638554216865, + "grad_norm": 0.08816439658403397, + "learning_rate": 2.579467760692427e-06, + "loss": 0.004, + "step": 1329 + }, + { + "epoch": 2.56289156626506, + "grad_norm": 0.17154981195926666, + "learning_rate": 2.557480701017776e-06, + "loss": 0.0035, + "step": 1330 + }, + { + "epoch": 2.564819277108434, + "grad_norm": 0.09479143470525742, + "learning_rate": 2.5355813479894464e-06, + "loss": 0.0034, + "step": 1331 + }, + { + "epoch": 2.5667469879518072, + "grad_norm": 0.26139333844184875, + "learning_rate": 2.513769811724487e-06, + "loss": 0.0076, + "step": 1332 + }, + { + "epoch": 2.5686746987951805, + "grad_norm": 0.16864238679409027, + "learning_rate": 2.4920462018983816e-06, + "loss": 0.0046, + "step": 1333 + }, + { + "epoch": 2.5706024096385542, + "grad_norm": 0.1133158802986145, + "learning_rate": 2.4704106277444884e-06, + "loss": 0.0034, + "step": 1334 + }, + { + "epoch": 2.572530120481928, + "grad_norm": 0.27522334456443787, + "learning_rate": 2.4488631980534995e-06, + "loss": 0.0127, + "step": 1335 + }, + { + "epoch": 2.5744578313253013, + "grad_norm": 0.13547387719154358, + "learning_rate": 2.427404021172868e-06, + "loss": 0.0031, + "step": 1336 + }, + { + "epoch": 2.5763855421686745, + "grad_norm": 0.13478629291057587, + "learning_rate": 2.406033205006313e-06, + "loss": 0.0039, + "step": 1337 + }, + { + "epoch": 2.5783132530120483, + "grad_norm": 0.11515481770038605, + "learning_rate": 2.3847508570132226e-06, + "loss": 0.0029, + "step": 1338 + }, + { + "epoch": 2.5802409638554216, + "grad_norm": 0.21657171845436096, + "learning_rate": 2.36355708420815e-06, + "loss": 0.011, + "step": 1339 + }, + { + "epoch": 2.5821686746987953, + "grad_norm": 0.11441601067781448, + "learning_rate": 2.342451993160262e-06, + "loss": 0.006, + "step": 1340 + }, + { + "epoch": 2.5840963855421686, + "grad_norm": 0.13475841283798218, + "learning_rate": 2.3214356899928036e-06, + "loss": 0.0051, + "step": 1341 + }, + { + "epoch": 2.5860240963855423, + "grad_norm": 0.053035832941532135, + "learning_rate": 2.300508280382572e-06, + "loss": 0.0012, + "step": 1342 + }, + { + "epoch": 2.5879518072289156, + "grad_norm": 0.12467508763074875, + "learning_rate": 2.279669869559358e-06, + "loss": 0.0024, + "step": 1343 + }, + { + "epoch": 2.589879518072289, + "grad_norm": 0.10572273284196854, + "learning_rate": 2.2589205623054646e-06, + "loss": 0.0024, + "step": 1344 + }, + { + "epoch": 2.5918072289156626, + "grad_norm": 0.17056365311145782, + "learning_rate": 2.238260462955142e-06, + "loss": 0.0064, + "step": 1345 + }, + { + "epoch": 2.5937349397590364, + "grad_norm": 0.07940494269132614, + "learning_rate": 2.2176896753940637e-06, + "loss": 0.0012, + "step": 1346 + }, + { + "epoch": 2.5956626506024096, + "grad_norm": 0.10416694730520248, + "learning_rate": 2.1972083030588244e-06, + "loss": 0.0092, + "step": 1347 + }, + { + "epoch": 2.597590361445783, + "grad_norm": 0.2384328842163086, + "learning_rate": 2.176816448936423e-06, + "loss": 0.0067, + "step": 1348 + }, + { + "epoch": 2.5995180722891567, + "grad_norm": 0.14279082417488098, + "learning_rate": 2.156514215563703e-06, + "loss": 0.0059, + "step": 1349 + }, + { + "epoch": 2.6014457831325304, + "grad_norm": 0.08462683111429214, + "learning_rate": 2.1363017050268886e-06, + "loss": 0.0021, + "step": 1350 + }, + { + "epoch": 2.6033734939759037, + "grad_norm": 0.09768491238355637, + "learning_rate": 2.1161790189610377e-06, + "loss": 0.0038, + "step": 1351 + }, + { + "epoch": 2.605301204819277, + "grad_norm": 0.25498896837234497, + "learning_rate": 2.0961462585495474e-06, + "loss": 0.0114, + "step": 1352 + }, + { + "epoch": 2.6072289156626507, + "grad_norm": 0.15635675191879272, + "learning_rate": 2.076203524523637e-06, + "loss": 0.0054, + "step": 1353 + }, + { + "epoch": 2.609156626506024, + "grad_norm": 0.11619213968515396, + "learning_rate": 2.056350917161836e-06, + "loss": 0.007, + "step": 1354 + }, + { + "epoch": 2.6110843373493977, + "grad_norm": 0.18085338175296783, + "learning_rate": 2.0365885362895053e-06, + "loss": 0.0061, + "step": 1355 + }, + { + "epoch": 2.613012048192771, + "grad_norm": 0.14492927491664886, + "learning_rate": 2.016916481278306e-06, + "loss": 0.0114, + "step": 1356 + }, + { + "epoch": 2.6149397590361447, + "grad_norm": 0.21257621049880981, + "learning_rate": 1.997334851045709e-06, + "loss": 0.0057, + "step": 1357 + }, + { + "epoch": 2.616867469879518, + "grad_norm": 0.11539656668901443, + "learning_rate": 1.9778437440545085e-06, + "loss": 0.0071, + "step": 1358 + }, + { + "epoch": 2.6187951807228913, + "grad_norm": 0.1642933189868927, + "learning_rate": 1.95844325831231e-06, + "loss": 0.0054, + "step": 1359 + }, + { + "epoch": 2.620722891566265, + "grad_norm": 0.10779479146003723, + "learning_rate": 1.9391334913710545e-06, + "loss": 0.0028, + "step": 1360 + }, + { + "epoch": 2.6226506024096388, + "grad_norm": 0.14295366406440735, + "learning_rate": 1.9199145403265175e-06, + "loss": 0.0048, + "step": 1361 + }, + { + "epoch": 2.624578313253012, + "grad_norm": 0.13454844057559967, + "learning_rate": 1.9007865018178107e-06, + "loss": 0.0072, + "step": 1362 + }, + { + "epoch": 2.6265060240963853, + "grad_norm": 0.778252363204956, + "learning_rate": 1.8817494720269302e-06, + "loss": 0.0071, + "step": 1363 + }, + { + "epoch": 2.628433734939759, + "grad_norm": 0.11488679051399231, + "learning_rate": 1.8628035466782268e-06, + "loss": 0.0038, + "step": 1364 + }, + { + "epoch": 2.630361445783133, + "grad_norm": 0.15560875833034515, + "learning_rate": 1.8439488210379687e-06, + "loss": 0.0043, + "step": 1365 + }, + { + "epoch": 2.632289156626506, + "grad_norm": 0.10538071393966675, + "learning_rate": 1.8251853899138306e-06, + "loss": 0.0041, + "step": 1366 + }, + { + "epoch": 2.6342168674698794, + "grad_norm": 0.12866193056106567, + "learning_rate": 1.8065133476544306e-06, + "loss": 0.0034, + "step": 1367 + }, + { + "epoch": 2.636144578313253, + "grad_norm": 0.2045469433069229, + "learning_rate": 1.7879327881488584e-06, + "loss": 0.0141, + "step": 1368 + }, + { + "epoch": 2.6380722891566264, + "grad_norm": 0.12423976510763168, + "learning_rate": 1.769443804826194e-06, + "loss": 0.0047, + "step": 1369 + }, + { + "epoch": 2.64, + "grad_norm": 0.1007109209895134, + "learning_rate": 1.751046490655046e-06, + "loss": 0.0031, + "step": 1370 + }, + { + "epoch": 2.6419277108433734, + "grad_norm": 0.0681275874376297, + "learning_rate": 1.7327409381430804e-06, + "loss": 0.0019, + "step": 1371 + }, + { + "epoch": 2.643855421686747, + "grad_norm": 0.1645517498254776, + "learning_rate": 1.7145272393365498e-06, + "loss": 0.0035, + "step": 1372 + }, + { + "epoch": 2.6457831325301204, + "grad_norm": 0.13689427077770233, + "learning_rate": 1.6964054858198386e-06, + "loss": 0.0086, + "step": 1373 + }, + { + "epoch": 2.6477108433734937, + "grad_norm": 0.10440093278884888, + "learning_rate": 1.6783757687150149e-06, + "loss": 0.0019, + "step": 1374 + }, + { + "epoch": 2.6496385542168674, + "grad_norm": 0.1142532229423523, + "learning_rate": 1.6604381786813383e-06, + "loss": 0.0047, + "step": 1375 + }, + { + "epoch": 2.651566265060241, + "grad_norm": 0.10430166125297546, + "learning_rate": 1.6425928059148312e-06, + "loss": 0.0027, + "step": 1376 + }, + { + "epoch": 2.6534939759036145, + "grad_norm": 0.2315254956483841, + "learning_rate": 1.624839740147819e-06, + "loss": 0.0071, + "step": 1377 + }, + { + "epoch": 2.6554216867469878, + "grad_norm": 0.15356265008449554, + "learning_rate": 1.6071790706484746e-06, + "loss": 0.0109, + "step": 1378 + }, + { + "epoch": 2.6573493975903615, + "grad_norm": 0.1332363784313202, + "learning_rate": 1.589610886220383e-06, + "loss": 0.0046, + "step": 1379 + }, + { + "epoch": 2.659277108433735, + "grad_norm": 0.18892519176006317, + "learning_rate": 1.5721352752020602e-06, + "loss": 0.0138, + "step": 1380 + }, + { + "epoch": 2.6612048192771085, + "grad_norm": 0.10537895560264587, + "learning_rate": 1.5547523254665598e-06, + "loss": 0.0066, + "step": 1381 + }, + { + "epoch": 2.663132530120482, + "grad_norm": 0.1308947205543518, + "learning_rate": 1.5374621244209965e-06, + "loss": 0.0039, + "step": 1382 + }, + { + "epoch": 2.6650602409638555, + "grad_norm": 0.11358808726072311, + "learning_rate": 1.5202647590060983e-06, + "loss": 0.0029, + "step": 1383 + }, + { + "epoch": 2.666987951807229, + "grad_norm": 0.12029009312391281, + "learning_rate": 1.5031603156958064e-06, + "loss": 0.0032, + "step": 1384 + }, + { + "epoch": 2.6689156626506025, + "grad_norm": 0.36994072794914246, + "learning_rate": 1.4861488804968093e-06, + "loss": 0.024, + "step": 1385 + }, + { + "epoch": 2.670843373493976, + "grad_norm": 0.1263083666563034, + "learning_rate": 1.4692305389481232e-06, + "loss": 0.0047, + "step": 1386 + }, + { + "epoch": 2.6727710843373496, + "grad_norm": 0.15056709945201874, + "learning_rate": 1.452405376120658e-06, + "loss": 0.0014, + "step": 1387 + }, + { + "epoch": 2.674698795180723, + "grad_norm": 0.10418888181447983, + "learning_rate": 1.4356734766167925e-06, + "loss": 0.0035, + "step": 1388 + }, + { + "epoch": 2.676626506024096, + "grad_norm": 0.12220565974712372, + "learning_rate": 1.4190349245699443e-06, + "loss": 0.0063, + "step": 1389 + }, + { + "epoch": 2.67855421686747, + "grad_norm": 0.14774753153324127, + "learning_rate": 1.402489803644156e-06, + "loss": 0.008, + "step": 1390 + }, + { + "epoch": 2.6804819277108436, + "grad_norm": 0.14384198188781738, + "learning_rate": 1.3860381970336544e-06, + "loss": 0.0039, + "step": 1391 + }, + { + "epoch": 2.682409638554217, + "grad_norm": 0.10995055735111237, + "learning_rate": 1.3696801874624698e-06, + "loss": 0.0028, + "step": 1392 + }, + { + "epoch": 2.68433734939759, + "grad_norm": 0.12208505719900131, + "learning_rate": 1.353415857183966e-06, + "loss": 0.0029, + "step": 1393 + }, + { + "epoch": 2.686265060240964, + "grad_norm": 0.16018439829349518, + "learning_rate": 1.337245287980482e-06, + "loss": 0.0068, + "step": 1394 + }, + { + "epoch": 2.688192771084337, + "grad_norm": 5.2112274169921875, + "learning_rate": 1.3211685611628844e-06, + "loss": 0.1645, + "step": 1395 + }, + { + "epoch": 2.690120481927711, + "grad_norm": 0.12426120787858963, + "learning_rate": 1.3051857575701732e-06, + "loss": 0.0044, + "step": 1396 + }, + { + "epoch": 2.692048192771084, + "grad_norm": 0.13931375741958618, + "learning_rate": 1.2892969575690685e-06, + "loss": 0.0035, + "step": 1397 + }, + { + "epoch": 2.693975903614458, + "grad_norm": 0.1804540753364563, + "learning_rate": 1.273502241053608e-06, + "loss": 0.0108, + "step": 1398 + }, + { + "epoch": 2.695903614457831, + "grad_norm": 0.12313607335090637, + "learning_rate": 1.2578016874447596e-06, + "loss": 0.0073, + "step": 1399 + }, + { + "epoch": 2.697831325301205, + "grad_norm": 0.1301470398902893, + "learning_rate": 1.2421953756899985e-06, + "loss": 0.0037, + "step": 1400 + }, + { + "epoch": 2.6997590361445782, + "grad_norm": 0.12769126892089844, + "learning_rate": 1.226683384262919e-06, + "loss": 0.0041, + "step": 1401 + }, + { + "epoch": 2.701686746987952, + "grad_norm": 0.20923997461795807, + "learning_rate": 1.21126579116285e-06, + "loss": 0.0101, + "step": 1402 + }, + { + "epoch": 2.7036144578313253, + "grad_norm": 0.09334482997655869, + "learning_rate": 1.1959426739144497e-06, + "loss": 0.0022, + "step": 1403 + }, + { + "epoch": 2.7055421686746985, + "grad_norm": 0.06848987936973572, + "learning_rate": 1.1807141095673291e-06, + "loss": 0.0013, + "step": 1404 + }, + { + "epoch": 2.7074698795180723, + "grad_norm": 0.14552196860313416, + "learning_rate": 1.1655801746956463e-06, + "loss": 0.0066, + "step": 1405 + }, + { + "epoch": 2.709397590361446, + "grad_norm": 0.11259587109088898, + "learning_rate": 1.1505409453977334e-06, + "loss": 0.0045, + "step": 1406 + }, + { + "epoch": 2.7113253012048193, + "grad_norm": 0.23408068716526031, + "learning_rate": 1.135596497295719e-06, + "loss": 0.0181, + "step": 1407 + }, + { + "epoch": 2.7132530120481926, + "grad_norm": 0.1483619660139084, + "learning_rate": 1.1207469055351395e-06, + "loss": 0.0042, + "step": 1408 + }, + { + "epoch": 2.7151807228915663, + "grad_norm": 0.1170588880777359, + "learning_rate": 1.105992244784555e-06, + "loss": 0.0059, + "step": 1409 + }, + { + "epoch": 2.7171084337349396, + "grad_norm": 0.15649215877056122, + "learning_rate": 1.0913325892351857e-06, + "loss": 0.0023, + "step": 1410 + }, + { + "epoch": 2.7190361445783133, + "grad_norm": 0.0980108231306076, + "learning_rate": 1.0767680126005443e-06, + "loss": 0.0019, + "step": 1411 + }, + { + "epoch": 2.7209638554216866, + "grad_norm": 0.14913050830364227, + "learning_rate": 1.0622985881160396e-06, + "loss": 0.0018, + "step": 1412 + }, + { + "epoch": 2.7228915662650603, + "grad_norm": 0.0827481672167778, + "learning_rate": 1.0479243885386347e-06, + "loss": 0.0023, + "step": 1413 + }, + { + "epoch": 2.7248192771084336, + "grad_norm": 0.15648555755615234, + "learning_rate": 1.0336454861464706e-06, + "loss": 0.0033, + "step": 1414 + }, + { + "epoch": 2.7267469879518074, + "grad_norm": 0.10614357888698578, + "learning_rate": 1.0194619527385007e-06, + "loss": 0.0029, + "step": 1415 + }, + { + "epoch": 2.7286746987951807, + "grad_norm": 0.07111652940511703, + "learning_rate": 1.0053738596341355e-06, + "loss": 0.0026, + "step": 1416 + }, + { + "epoch": 2.7306024096385544, + "grad_norm": 0.11736573278903961, + "learning_rate": 9.91381277672867e-07, + "loss": 0.005, + "step": 1417 + }, + { + "epoch": 2.7325301204819277, + "grad_norm": 0.18440629541873932, + "learning_rate": 9.774842772139537e-07, + "loss": 0.0038, + "step": 1418 + }, + { + "epoch": 2.734457831325301, + "grad_norm": 0.11000041663646698, + "learning_rate": 9.636829281360116e-07, + "loss": 0.0034, + "step": 1419 + }, + { + "epoch": 2.7363855421686747, + "grad_norm": 0.15212605893611908, + "learning_rate": 9.499772998367018e-07, + "loss": 0.0038, + "step": 1420 + }, + { + "epoch": 2.7383132530120484, + "grad_norm": 0.07784705609083176, + "learning_rate": 9.36367461232377e-07, + "loss": 0.002, + "step": 1421 + }, + { + "epoch": 2.7402409638554217, + "grad_norm": 0.1096726506948471, + "learning_rate": 9.22853480757715e-07, + "loss": 0.0028, + "step": 1422 + }, + { + "epoch": 2.742168674698795, + "grad_norm": 0.17528535425662994, + "learning_rate": 9.094354263653971e-07, + "loss": 0.0065, + "step": 1423 + }, + { + "epoch": 2.7440963855421687, + "grad_norm": 0.09263470768928528, + "learning_rate": 8.961133655257548e-07, + "loss": 0.0031, + "step": 1424 + }, + { + "epoch": 2.746024096385542, + "grad_norm": 0.14822180569171906, + "learning_rate": 8.828873652264303e-07, + "loss": 0.0043, + "step": 1425 + }, + { + "epoch": 2.7479518072289157, + "grad_norm": 0.11577019095420837, + "learning_rate": 8.697574919720497e-07, + "loss": 0.004, + "step": 1426 + }, + { + "epoch": 2.749879518072289, + "grad_norm": 0.11681873351335526, + "learning_rate": 8.567238117838683e-07, + "loss": 0.0035, + "step": 1427 + }, + { + "epoch": 2.7518072289156628, + "grad_norm": 0.1191524937748909, + "learning_rate": 8.437863901994592e-07, + "loss": 0.0022, + "step": 1428 + }, + { + "epoch": 2.753734939759036, + "grad_norm": 0.1528361737728119, + "learning_rate": 8.309452922723849e-07, + "loss": 0.0042, + "step": 1429 + }, + { + "epoch": 2.75566265060241, + "grad_norm": 0.42052382230758667, + "learning_rate": 8.18200582571842e-07, + "loss": 0.0149, + "step": 1430 + }, + { + "epoch": 2.757590361445783, + "grad_norm": 0.13524137437343597, + "learning_rate": 8.055523251823705e-07, + "loss": 0.0029, + "step": 1431 + }, + { + "epoch": 2.759518072289157, + "grad_norm": 0.0980493426322937, + "learning_rate": 7.930005837035138e-07, + "loss": 0.0036, + "step": 1432 + }, + { + "epoch": 2.76144578313253, + "grad_norm": 0.17335453629493713, + "learning_rate": 7.805454212494967e-07, + "loss": 0.0066, + "step": 1433 + }, + { + "epoch": 2.7633734939759034, + "grad_norm": 0.13746409118175507, + "learning_rate": 7.681869004489218e-07, + "loss": 0.0066, + "step": 1434 + }, + { + "epoch": 2.765301204819277, + "grad_norm": 0.18556399643421173, + "learning_rate": 7.559250834444332e-07, + "loss": 0.0073, + "step": 1435 + }, + { + "epoch": 2.767228915662651, + "grad_norm": 0.09743557125329971, + "learning_rate": 7.437600318924332e-07, + "loss": 0.0023, + "step": 1436 + }, + { + "epoch": 2.769156626506024, + "grad_norm": 0.10671001672744751, + "learning_rate": 7.316918069627488e-07, + "loss": 0.003, + "step": 1437 + }, + { + "epoch": 2.7710843373493974, + "grad_norm": 0.10671380162239075, + "learning_rate": 7.197204693383231e-07, + "loss": 0.0021, + "step": 1438 + }, + { + "epoch": 2.773012048192771, + "grad_norm": 0.06824454665184021, + "learning_rate": 7.078460792149311e-07, + "loss": 0.0017, + "step": 1439 + }, + { + "epoch": 2.7749397590361444, + "grad_norm": 0.12668560445308685, + "learning_rate": 6.960686963008556e-07, + "loss": 0.0035, + "step": 1440 + }, + { + "epoch": 2.776867469879518, + "grad_norm": 0.10260980576276779, + "learning_rate": 6.843883798166029e-07, + "loss": 0.0027, + "step": 1441 + }, + { + "epoch": 2.7787951807228914, + "grad_norm": 0.09880302101373672, + "learning_rate": 6.728051884945941e-07, + "loss": 0.0029, + "step": 1442 + }, + { + "epoch": 2.780722891566265, + "grad_norm": 0.305993914604187, + "learning_rate": 6.613191805788699e-07, + "loss": 0.0112, + "step": 1443 + }, + { + "epoch": 2.7826506024096385, + "grad_norm": 0.10707511752843857, + "learning_rate": 6.499304138248064e-07, + "loss": 0.0062, + "step": 1444 + }, + { + "epoch": 2.784578313253012, + "grad_norm": 0.0986943170428276, + "learning_rate": 6.386389454988195e-07, + "loss": 0.0021, + "step": 1445 + }, + { + "epoch": 2.7865060240963855, + "grad_norm": 0.1458776742219925, + "learning_rate": 6.274448323780724e-07, + "loss": 0.0094, + "step": 1446 + }, + { + "epoch": 2.788433734939759, + "grad_norm": 0.09657061100006104, + "learning_rate": 6.163481307501995e-07, + "loss": 0.0026, + "step": 1447 + }, + { + "epoch": 2.7903614457831325, + "grad_norm": 0.1462988704442978, + "learning_rate": 6.053488964130183e-07, + "loss": 0.0075, + "step": 1448 + }, + { + "epoch": 2.792289156626506, + "grad_norm": 0.15330864489078522, + "learning_rate": 5.94447184674245e-07, + "loss": 0.0067, + "step": 1449 + }, + { + "epoch": 2.7942168674698795, + "grad_norm": 0.1513473242521286, + "learning_rate": 5.836430503512236e-07, + "loss": 0.0106, + "step": 1450 + }, + { + "epoch": 2.7961445783132532, + "grad_norm": 0.2151842713356018, + "learning_rate": 5.729365477706505e-07, + "loss": 0.0062, + "step": 1451 + }, + { + "epoch": 2.7980722891566265, + "grad_norm": 0.13624203205108643, + "learning_rate": 5.623277307682929e-07, + "loss": 0.0045, + "step": 1452 + }, + { + "epoch": 2.8, + "grad_norm": 0.12075261026620865, + "learning_rate": 5.518166526887214e-07, + "loss": 0.0073, + "step": 1453 + }, + { + "epoch": 2.8019277108433736, + "grad_norm": 0.11320624500513077, + "learning_rate": 5.41403366385047e-07, + "loss": 0.002, + "step": 1454 + }, + { + "epoch": 2.803855421686747, + "grad_norm": 0.08470363914966583, + "learning_rate": 5.310879242186606e-07, + "loss": 0.0021, + "step": 1455 + }, + { + "epoch": 2.8057831325301206, + "grad_norm": 0.15221907198429108, + "learning_rate": 5.208703780589419e-07, + "loss": 0.0019, + "step": 1456 + }, + { + "epoch": 2.807710843373494, + "grad_norm": 0.12709103524684906, + "learning_rate": 5.107507792830335e-07, + "loss": 0.0052, + "step": 1457 + }, + { + "epoch": 2.8096385542168676, + "grad_norm": 0.10888515412807465, + "learning_rate": 5.007291787755586e-07, + "loss": 0.0023, + "step": 1458 + }, + { + "epoch": 2.811566265060241, + "grad_norm": 0.25710970163345337, + "learning_rate": 4.908056269283789e-07, + "loss": 0.0073, + "step": 1459 + }, + { + "epoch": 2.8134939759036146, + "grad_norm": 0.08488702774047852, + "learning_rate": 4.809801736403308e-07, + "loss": 0.0016, + "step": 1460 + }, + { + "epoch": 2.815421686746988, + "grad_norm": 0.1282006949186325, + "learning_rate": 4.7125286831698034e-07, + "loss": 0.0035, + "step": 1461 + }, + { + "epoch": 2.8173493975903616, + "grad_norm": 0.08955442905426025, + "learning_rate": 4.6162375987037766e-07, + "loss": 0.004, + "step": 1462 + }, + { + "epoch": 2.819277108433735, + "grad_norm": 0.11310838907957077, + "learning_rate": 4.520928967188054e-07, + "loss": 0.0022, + "step": 1463 + }, + { + "epoch": 2.821204819277108, + "grad_norm": 0.15055686235427856, + "learning_rate": 4.426603267865326e-07, + "loss": 0.0042, + "step": 1464 + }, + { + "epoch": 2.823132530120482, + "grad_norm": 0.14379452168941498, + "learning_rate": 4.333260975035769e-07, + "loss": 0.0089, + "step": 1465 + }, + { + "epoch": 2.8250602409638557, + "grad_norm": 0.1795361489057541, + "learning_rate": 4.240902558054827e-07, + "loss": 0.013, + "step": 1466 + }, + { + "epoch": 2.826987951807229, + "grad_norm": 0.06829468160867691, + "learning_rate": 4.1495284813305003e-07, + "loss": 0.0018, + "step": 1467 + }, + { + "epoch": 2.8289156626506022, + "grad_norm": 0.35213515162467957, + "learning_rate": 4.0591392043213275e-07, + "loss": 0.0144, + "step": 1468 + }, + { + "epoch": 2.830843373493976, + "grad_norm": 0.11828093230724335, + "learning_rate": 3.969735181533918e-07, + "loss": 0.0028, + "step": 1469 + }, + { + "epoch": 2.8327710843373493, + "grad_norm": 0.13286921381950378, + "learning_rate": 3.881316862520712e-07, + "loss": 0.0042, + "step": 1470 + }, + { + "epoch": 2.834698795180723, + "grad_norm": 0.10271132737398148, + "learning_rate": 3.7938846918776917e-07, + "loss": 0.0047, + "step": 1471 + }, + { + "epoch": 2.8366265060240963, + "grad_norm": 0.09422904253005981, + "learning_rate": 3.707439109242139e-07, + "loss": 0.0061, + "step": 1472 + }, + { + "epoch": 2.83855421686747, + "grad_norm": 0.10817123204469681, + "learning_rate": 3.6219805492905934e-07, + "loss": 0.0029, + "step": 1473 + }, + { + "epoch": 2.8404819277108433, + "grad_norm": 0.10254565626382828, + "learning_rate": 3.53750944173632e-07, + "loss": 0.0044, + "step": 1474 + }, + { + "epoch": 2.842409638554217, + "grad_norm": 0.11423154920339584, + "learning_rate": 3.45402621132751e-07, + "loss": 0.0059, + "step": 1475 + }, + { + "epoch": 2.8443373493975903, + "grad_norm": 0.15620556473731995, + "learning_rate": 3.3715312778449305e-07, + "loss": 0.005, + "step": 1476 + }, + { + "epoch": 2.846265060240964, + "grad_norm": 0.1081036925315857, + "learning_rate": 3.2900250560998546e-07, + "loss": 0.004, + "step": 1477 + }, + { + "epoch": 2.8481927710843373, + "grad_norm": 0.38650745153427124, + "learning_rate": 3.209507955932001e-07, + "loss": 0.0076, + "step": 1478 + }, + { + "epoch": 2.8501204819277106, + "grad_norm": 0.1864783614873886, + "learning_rate": 3.129980382207509e-07, + "loss": 0.0092, + "step": 1479 + }, + { + "epoch": 2.8520481927710843, + "grad_norm": 0.1458069533109665, + "learning_rate": 3.05144273481679e-07, + "loss": 0.0058, + "step": 1480 + }, + { + "epoch": 2.853975903614458, + "grad_norm": 0.14836257696151733, + "learning_rate": 2.9738954086726334e-07, + "loss": 0.014, + "step": 1481 + }, + { + "epoch": 2.8559036144578314, + "grad_norm": 0.10147511214017868, + "learning_rate": 2.8973387937081485e-07, + "loss": 0.0047, + "step": 1482 + }, + { + "epoch": 2.8578313253012047, + "grad_norm": 0.13740235567092896, + "learning_rate": 2.821773274874828e-07, + "loss": 0.0028, + "step": 1483 + }, + { + "epoch": 2.8597590361445784, + "grad_norm": 0.16089461743831635, + "learning_rate": 2.7471992321406624e-07, + "loss": 0.0168, + "step": 1484 + }, + { + "epoch": 2.8616867469879517, + "grad_norm": 0.0599152147769928, + "learning_rate": 2.6736170404880744e-07, + "loss": 0.0017, + "step": 1485 + }, + { + "epoch": 2.8636144578313254, + "grad_norm": 0.148875430226326, + "learning_rate": 2.6010270699122096e-07, + "loss": 0.0045, + "step": 1486 + }, + { + "epoch": 2.8655421686746987, + "grad_norm": 0.26763641834259033, + "learning_rate": 2.529429685419027e-07, + "loss": 0.007, + "step": 1487 + }, + { + "epoch": 2.8674698795180724, + "grad_norm": 0.1743084192276001, + "learning_rate": 2.458825247023389e-07, + "loss": 0.0112, + "step": 1488 + }, + { + "epoch": 2.8693975903614457, + "grad_norm": 0.21380828320980072, + "learning_rate": 2.3892141097473063e-07, + "loss": 0.0103, + "step": 1489 + }, + { + "epoch": 2.8713253012048194, + "grad_norm": 2.185253620147705, + "learning_rate": 2.3205966236181433e-07, + "loss": 0.0195, + "step": 1490 + }, + { + "epoch": 2.8732530120481927, + "grad_norm": 0.11854024976491928, + "learning_rate": 2.252973133666947e-07, + "loss": 0.0034, + "step": 1491 + }, + { + "epoch": 2.8751807228915665, + "grad_norm": 0.36487653851509094, + "learning_rate": 2.1863439799265195e-07, + "loss": 0.0063, + "step": 1492 + }, + { + "epoch": 2.8771084337349397, + "grad_norm": 0.1029730811715126, + "learning_rate": 2.1207094974298847e-07, + "loss": 0.0049, + "step": 1493 + }, + { + "epoch": 2.879036144578313, + "grad_norm": 0.10066278278827667, + "learning_rate": 2.056070016208489e-07, + "loss": 0.0021, + "step": 1494 + }, + { + "epoch": 2.8809638554216868, + "grad_norm": 0.21477262675762177, + "learning_rate": 1.9924258612906256e-07, + "loss": 0.0052, + "step": 1495 + }, + { + "epoch": 2.8828915662650605, + "grad_norm": 0.29007601737976074, + "learning_rate": 1.929777352699791e-07, + "loss": 0.0065, + "step": 1496 + }, + { + "epoch": 2.8848192771084338, + "grad_norm": 0.32320499420166016, + "learning_rate": 1.8681248054529754e-07, + "loss": 0.0334, + "step": 1497 + }, + { + "epoch": 2.886746987951807, + "grad_norm": 0.12790757417678833, + "learning_rate": 1.8074685295591754e-07, + "loss": 0.0034, + "step": 1498 + }, + { + "epoch": 2.888674698795181, + "grad_norm": 0.12194570153951645, + "learning_rate": 1.7478088300178608e-07, + "loss": 0.0038, + "step": 1499 + }, + { + "epoch": 2.890602409638554, + "grad_norm": 0.13514107465744019, + "learning_rate": 1.6891460068173548e-07, + "loss": 0.0042, + "step": 1500 + }, + { + "epoch": 2.892530120481928, + "grad_norm": 0.09762352705001831, + "learning_rate": 1.631480354933346e-07, + "loss": 0.0016, + "step": 1501 + }, + { + "epoch": 2.894457831325301, + "grad_norm": 0.10607658326625824, + "learning_rate": 1.5748121643274661e-07, + "loss": 0.0062, + "step": 1502 + }, + { + "epoch": 2.896385542168675, + "grad_norm": 0.0920143872499466, + "learning_rate": 1.519141719945738e-07, + "loss": 0.0025, + "step": 1503 + }, + { + "epoch": 2.898313253012048, + "grad_norm": 0.17520834505558014, + "learning_rate": 1.4644693017172418e-07, + "loss": 0.0045, + "step": 1504 + }, + { + "epoch": 2.900240963855422, + "grad_norm": 0.49769192934036255, + "learning_rate": 1.4107951845526267e-07, + "loss": 0.0059, + "step": 1505 + }, + { + "epoch": 2.902168674698795, + "grad_norm": 0.06354644149541855, + "learning_rate": 1.3581196383427586e-07, + "loss": 0.0021, + "step": 1506 + }, + { + "epoch": 2.904096385542169, + "grad_norm": 0.09340358525514603, + "learning_rate": 1.3064429279573853e-07, + "loss": 0.0036, + "step": 1507 + }, + { + "epoch": 2.906024096385542, + "grad_norm": 0.06073952466249466, + "learning_rate": 1.255765313243762e-07, + "loss": 0.001, + "step": 1508 + }, + { + "epoch": 2.9079518072289154, + "grad_norm": 0.1323407143354416, + "learning_rate": 1.206087049025384e-07, + "loss": 0.008, + "step": 1509 + }, + { + "epoch": 2.909879518072289, + "grad_norm": 0.18533159792423248, + "learning_rate": 1.1574083851007e-07, + "loss": 0.0086, + "step": 1510 + }, + { + "epoch": 2.911807228915663, + "grad_norm": 0.09885486960411072, + "learning_rate": 1.1097295662418018e-07, + "loss": 0.0023, + "step": 1511 + }, + { + "epoch": 2.913734939759036, + "grad_norm": 0.08286528289318085, + "learning_rate": 1.0630508321932687e-07, + "loss": 0.0029, + "step": 1512 + }, + { + "epoch": 2.9156626506024095, + "grad_norm": 0.1265413761138916, + "learning_rate": 1.0173724176709254e-07, + "loss": 0.003, + "step": 1513 + }, + { + "epoch": 2.917590361445783, + "grad_norm": 0.0776480957865715, + "learning_rate": 9.726945523606646e-08, + "loss": 0.0013, + "step": 1514 + }, + { + "epoch": 2.9195180722891565, + "grad_norm": 0.14106431603431702, + "learning_rate": 9.290174609172697e-08, + "loss": 0.0204, + "step": 1515 + }, + { + "epoch": 2.9214457831325302, + "grad_norm": 0.10813348740339279, + "learning_rate": 8.863413629633277e-08, + "loss": 0.0026, + "step": 1516 + }, + { + "epoch": 2.9233734939759035, + "grad_norm": 0.11505429446697235, + "learning_rate": 8.446664730881182e-08, + "loss": 0.0038, + "step": 1517 + }, + { + "epoch": 2.9253012048192772, + "grad_norm": 0.18488599359989166, + "learning_rate": 8.039930008465257e-08, + "loss": 0.0094, + "step": 1518 + }, + { + "epoch": 2.9272289156626505, + "grad_norm": 0.19229602813720703, + "learning_rate": 7.643211507579296e-08, + "loss": 0.0062, + "step": 1519 + }, + { + "epoch": 2.929156626506024, + "grad_norm": 0.0876188799738884, + "learning_rate": 7.25651122305293e-08, + "loss": 0.0024, + "step": 1520 + }, + { + "epoch": 2.9310843373493976, + "grad_norm": 0.15103434026241302, + "learning_rate": 6.87983109934054e-08, + "loss": 0.0056, + "step": 1521 + }, + { + "epoch": 2.9330120481927713, + "grad_norm": 0.1714266538619995, + "learning_rate": 6.51317303051191e-08, + "loss": 0.0047, + "step": 1522 + }, + { + "epoch": 2.9349397590361446, + "grad_norm": 0.30670225620269775, + "learning_rate": 6.156538860242922e-08, + "loss": 0.0111, + "step": 1523 + }, + { + "epoch": 2.936867469879518, + "grad_norm": 0.13250356912612915, + "learning_rate": 5.809930381805773e-08, + "loss": 0.0033, + "step": 1524 + }, + { + "epoch": 2.9387951807228916, + "grad_norm": 0.10350223630666733, + "learning_rate": 5.4733493380603183e-08, + "loss": 0.0028, + "step": 1525 + }, + { + "epoch": 2.9407228915662653, + "grad_norm": 0.1638195812702179, + "learning_rate": 5.1467974214456374e-08, + "loss": 0.0037, + "step": 1526 + }, + { + "epoch": 2.9426506024096386, + "grad_norm": 0.11159276962280273, + "learning_rate": 4.830276273970258e-08, + "loss": 0.003, + "step": 1527 + }, + { + "epoch": 2.944578313253012, + "grad_norm": 0.09866586327552795, + "learning_rate": 4.5237874872052776e-08, + "loss": 0.0032, + "step": 1528 + }, + { + "epoch": 2.9465060240963856, + "grad_norm": 0.17825454473495483, + "learning_rate": 4.227332602275924e-08, + "loss": 0.0105, + "step": 1529 + }, + { + "epoch": 2.948433734939759, + "grad_norm": 0.10379356890916824, + "learning_rate": 3.940913109853561e-08, + "loss": 0.0055, + "step": 1530 + }, + { + "epoch": 2.9503614457831326, + "grad_norm": 0.23834416270256042, + "learning_rate": 3.66453045014814e-08, + "loss": 0.0044, + "step": 1531 + }, + { + "epoch": 2.952289156626506, + "grad_norm": 0.11515571922063828, + "learning_rate": 3.398186012901539e-08, + "loss": 0.0042, + "step": 1532 + }, + { + "epoch": 2.9542168674698797, + "grad_norm": 0.14170049130916595, + "learning_rate": 3.141881137379788e-08, + "loss": 0.0073, + "step": 1533 + }, + { + "epoch": 2.956144578313253, + "grad_norm": 0.237248957157135, + "learning_rate": 2.8956171123670774e-08, + "loss": 0.0055, + "step": 1534 + }, + { + "epoch": 2.9580722891566262, + "grad_norm": 0.07076071947813034, + "learning_rate": 2.6593951761588744e-08, + "loss": 0.0016, + "step": 1535 + }, + { + "epoch": 2.96, + "grad_norm": 0.1100577786564827, + "learning_rate": 2.4332165165557032e-08, + "loss": 0.0026, + "step": 1536 + }, + { + "epoch": 2.9619277108433737, + "grad_norm": 0.11576279252767563, + "learning_rate": 2.2170822708573736e-08, + "loss": 0.0036, + "step": 1537 + }, + { + "epoch": 2.963855421686747, + "grad_norm": 0.2067718207836151, + "learning_rate": 2.0109935258565415e-08, + "loss": 0.0063, + "step": 1538 + }, + { + "epoch": 2.9657831325301203, + "grad_norm": 0.15040244162082672, + "learning_rate": 1.8149513178347122e-08, + "loss": 0.0081, + "step": 1539 + }, + { + "epoch": 2.967710843373494, + "grad_norm": 0.14071759581565857, + "learning_rate": 1.6289566325555783e-08, + "loss": 0.006, + "step": 1540 + }, + { + "epoch": 2.9696385542168677, + "grad_norm": 0.32527413964271545, + "learning_rate": 1.4530104052610239e-08, + "loss": 0.0021, + "step": 1541 + }, + { + "epoch": 2.971566265060241, + "grad_norm": 0.06794515997171402, + "learning_rate": 1.2871135206651287e-08, + "loss": 0.0016, + "step": 1542 + }, + { + "epoch": 2.9734939759036143, + "grad_norm": 0.08525913208723068, + "learning_rate": 1.1312668129519477e-08, + "loss": 0.0023, + "step": 1543 + }, + { + "epoch": 2.975421686746988, + "grad_norm": 0.14025282859802246, + "learning_rate": 9.854710657688504e-09, + "loss": 0.0025, + "step": 1544 + }, + { + "epoch": 2.9773493975903613, + "grad_norm": 0.15709802508354187, + "learning_rate": 8.497270122242996e-09, + "loss": 0.0038, + "step": 1545 + }, + { + "epoch": 2.979277108433735, + "grad_norm": 0.1520087569952011, + "learning_rate": 7.240353348834106e-09, + "loss": 0.0027, + "step": 1546 + }, + { + "epoch": 2.9812048192771083, + "grad_norm": 0.13271088898181915, + "learning_rate": 6.083966657646212e-09, + "loss": 0.003, + "step": 1547 + }, + { + "epoch": 2.983132530120482, + "grad_norm": 0.0962211862206459, + "learning_rate": 5.028115863370265e-09, + "loss": 0.0021, + "step": 1548 + }, + { + "epoch": 2.9850602409638554, + "grad_norm": 0.11485985666513443, + "learning_rate": 4.072806275163821e-09, + "loss": 0.0039, + "step": 1549 + }, + { + "epoch": 2.9869879518072286, + "grad_norm": 0.15437521040439606, + "learning_rate": 3.2180426966332833e-09, + "loss": 0.0048, + "step": 1550 + }, + { + "epoch": 2.9889156626506024, + "grad_norm": 0.09884651750326157, + "learning_rate": 2.4638294258072513e-09, + "loss": 0.0032, + "step": 1551 + }, + { + "epoch": 2.990843373493976, + "grad_norm": 0.30931419134140015, + "learning_rate": 1.810170255116539e-09, + "loss": 0.0038, + "step": 1552 + }, + { + "epoch": 2.9927710843373494, + "grad_norm": 0.3311678469181061, + "learning_rate": 1.2570684713719695e-09, + "loss": 0.0247, + "step": 1553 + }, + { + "epoch": 2.9946987951807227, + "grad_norm": 0.13150249421596527, + "learning_rate": 8.045268557443919e-10, + "loss": 0.0029, + "step": 1554 + }, + { + "epoch": 2.9966265060240964, + "grad_norm": 0.10827342420816422, + "learning_rate": 4.5254768376468137e-10, + "loss": 0.0119, + "step": 1555 + }, + { + "epoch": 2.99855421686747, + "grad_norm": 0.10358250141143799, + "learning_rate": 2.011327252948725e-10, + "loss": 0.0038, + "step": 1556 + }, + { + "epoch": 3.0, + "grad_norm": 0.09550733864307404, + "learning_rate": 5.028324453482114e-11, + "loss": 0.0016, + "step": 1557 + } + ], + "logging_steps": 1, + "max_steps": 1557, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 92, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.043435500286509e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1557/training_args.bin b/checkpoint-1557/training_args.bin new file mode 100644 index 0000000..ecc7b6b --- /dev/null +++ b/checkpoint-1557/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342dfb3c86216e436950100c79812c54066d5572c4e9975b0f133c067f061bcf +size 7825 diff --git a/checkpoint-828/chat_template.jinja b/checkpoint-828/chat_template.jinja new file mode 100644 index 0000000..1bad6a0 --- /dev/null +++ b/checkpoint-828/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-828/config.json b/checkpoint-828/config.json new file mode 100644 index 0000000..f8bf41e --- /dev/null +++ b/checkpoint-828/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128009, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.56.2", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-828/generation_config.json b/checkpoint-828/generation_config.json new file mode 100644 index 0000000..2152026 --- /dev/null +++ b/checkpoint-828/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128009, + 128001, + 128008, + 128009 + ], + "pad_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.56.2" +} diff --git a/checkpoint-828/model.safetensors b/checkpoint-828/model.safetensors new file mode 100644 index 0000000..48c81c1 --- /dev/null +++ b/checkpoint-828/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c84faa2b72d69248f9f5cff305994d87f81ea546c9efec12e454a12ca90b46d6 +size 2996982344 diff --git a/checkpoint-828/special_tokens_map.json b/checkpoint-828/special_tokens_map.json new file mode 100644 index 0000000..14daf45 --- /dev/null +++ b/checkpoint-828/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/checkpoint-828/tokenizer.json b/checkpoint-828/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/checkpoint-828/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-828/tokenizer_config.json b/checkpoint-828/tokenizer_config.json new file mode 100644 index 0000000..d1e1ea9 --- /dev/null +++ b/checkpoint-828/tokenizer_config.json @@ -0,0 +1,2068 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-828/trainer_state.json b/checkpoint-828/trainer_state.json new file mode 100644 index 0000000..a69cea0 --- /dev/null +++ b/checkpoint-828/trainer_state.json @@ -0,0 +1,5830 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5956626506024096, + "eval_steps": 500, + "global_step": 828, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0019277108433734939, + "grad_norm": 2.8518834114074707, + "learning_rate": 0.0, + "loss": 0.0891, + "step": 1 + }, + { + "epoch": 0.0038554216867469878, + "grad_norm": 1.8441249132156372, + "learning_rate": 2.564102564102564e-07, + "loss": 0.0539, + "step": 2 + }, + { + "epoch": 0.005783132530120482, + "grad_norm": 2.8263237476348877, + "learning_rate": 5.128205128205128e-07, + "loss": 0.099, + "step": 3 + }, + { + "epoch": 0.0077108433734939755, + "grad_norm": 2.5051236152648926, + "learning_rate": 7.692307692307694e-07, + "loss": 0.0789, + "step": 4 + }, + { + "epoch": 0.00963855421686747, + "grad_norm": 2.6903438568115234, + "learning_rate": 1.0256410256410257e-06, + "loss": 0.0881, + "step": 5 + }, + { + "epoch": 0.011566265060240964, + "grad_norm": 2.6205761432647705, + "learning_rate": 1.282051282051282e-06, + "loss": 0.0776, + "step": 6 + }, + { + "epoch": 0.013493975903614458, + "grad_norm": 2.6309337615966797, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.0827, + "step": 7 + }, + { + "epoch": 0.015421686746987951, + "grad_norm": 1.5427855253219604, + "learning_rate": 1.794871794871795e-06, + "loss": 0.0577, + "step": 8 + }, + { + "epoch": 0.017349397590361446, + "grad_norm": 1.0973446369171143, + "learning_rate": 2.0512820512820513e-06, + "loss": 0.04, + "step": 9 + }, + { + "epoch": 0.01927710843373494, + "grad_norm": 1.3253350257873535, + "learning_rate": 2.307692307692308e-06, + "loss": 0.0506, + "step": 10 + }, + { + "epoch": 0.021204819277108433, + "grad_norm": 1.588739037513733, + "learning_rate": 2.564102564102564e-06, + "loss": 0.0874, + "step": 11 + }, + { + "epoch": 0.02313253012048193, + "grad_norm": 1.4987014532089233, + "learning_rate": 2.8205128205128207e-06, + "loss": 0.0597, + "step": 12 + }, + { + "epoch": 0.02506024096385542, + "grad_norm": 1.6571592092514038, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.0559, + "step": 13 + }, + { + "epoch": 0.026987951807228915, + "grad_norm": 1.8860628604888916, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0688, + "step": 14 + }, + { + "epoch": 0.02891566265060241, + "grad_norm": 1.3202295303344727, + "learning_rate": 3.58974358974359e-06, + "loss": 0.0433, + "step": 15 + }, + { + "epoch": 0.030843373493975902, + "grad_norm": 1.5870612859725952, + "learning_rate": 3.846153846153847e-06, + "loss": 0.0695, + "step": 16 + }, + { + "epoch": 0.0327710843373494, + "grad_norm": 0.9192284345626831, + "learning_rate": 4.102564102564103e-06, + "loss": 0.0392, + "step": 17 + }, + { + "epoch": 0.03469879518072289, + "grad_norm": 0.7950155735015869, + "learning_rate": 4.358974358974359e-06, + "loss": 0.0351, + "step": 18 + }, + { + "epoch": 0.03662650602409639, + "grad_norm": 0.8854314684867859, + "learning_rate": 4.615384615384616e-06, + "loss": 0.0356, + "step": 19 + }, + { + "epoch": 0.03855421686746988, + "grad_norm": 0.9546788930892944, + "learning_rate": 4.871794871794872e-06, + "loss": 0.0427, + "step": 20 + }, + { + "epoch": 0.04048192771084337, + "grad_norm": 0.6315903663635254, + "learning_rate": 5.128205128205128e-06, + "loss": 0.0397, + "step": 21 + }, + { + "epoch": 0.042409638554216866, + "grad_norm": 0.9230924844741821, + "learning_rate": 5.384615384615385e-06, + "loss": 0.0481, + "step": 22 + }, + { + "epoch": 0.04433734939759036, + "grad_norm": 0.711546003818512, + "learning_rate": 5.641025641025641e-06, + "loss": 0.0479, + "step": 23 + }, + { + "epoch": 0.04626506024096386, + "grad_norm": 0.5288046598434448, + "learning_rate": 5.897435897435898e-06, + "loss": 0.0182, + "step": 24 + }, + { + "epoch": 0.04819277108433735, + "grad_norm": 0.9420496225357056, + "learning_rate": 6.153846153846155e-06, + "loss": 0.0389, + "step": 25 + }, + { + "epoch": 0.05012048192771084, + "grad_norm": 0.5001983046531677, + "learning_rate": 6.410256410256412e-06, + "loss": 0.0268, + "step": 26 + }, + { + "epoch": 0.052048192771084335, + "grad_norm": 0.8084653615951538, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0367, + "step": 27 + }, + { + "epoch": 0.05397590361445783, + "grad_norm": 0.7195103764533997, + "learning_rate": 6.923076923076923e-06, + "loss": 0.0251, + "step": 28 + }, + { + "epoch": 0.055903614457831326, + "grad_norm": 0.529958963394165, + "learning_rate": 7.17948717948718e-06, + "loss": 0.0289, + "step": 29 + }, + { + "epoch": 0.05783132530120482, + "grad_norm": 0.795376181602478, + "learning_rate": 7.435897435897437e-06, + "loss": 0.043, + "step": 30 + }, + { + "epoch": 0.059759036144578316, + "grad_norm": 0.6366249918937683, + "learning_rate": 7.692307692307694e-06, + "loss": 0.029, + "step": 31 + }, + { + "epoch": 0.061686746987951804, + "grad_norm": 0.5414115190505981, + "learning_rate": 7.948717948717949e-06, + "loss": 0.0365, + "step": 32 + }, + { + "epoch": 0.0636144578313253, + "grad_norm": 0.9350972175598145, + "learning_rate": 8.205128205128205e-06, + "loss": 0.0283, + "step": 33 + }, + { + "epoch": 0.0655421686746988, + "grad_norm": 0.5660741925239563, + "learning_rate": 8.461538461538462e-06, + "loss": 0.0234, + "step": 34 + }, + { + "epoch": 0.06746987951807229, + "grad_norm": 0.5623988509178162, + "learning_rate": 8.717948717948719e-06, + "loss": 0.0307, + "step": 35 + }, + { + "epoch": 0.06939759036144579, + "grad_norm": 0.5260195732116699, + "learning_rate": 8.974358974358976e-06, + "loss": 0.0264, + "step": 36 + }, + { + "epoch": 0.07132530120481928, + "grad_norm": 0.4934785068035126, + "learning_rate": 9.230769230769232e-06, + "loss": 0.0224, + "step": 37 + }, + { + "epoch": 0.07325301204819278, + "grad_norm": 0.4797322154045105, + "learning_rate": 9.487179487179487e-06, + "loss": 0.0163, + "step": 38 + }, + { + "epoch": 0.07518072289156627, + "grad_norm": 0.4739217460155487, + "learning_rate": 9.743589743589744e-06, + "loss": 0.0165, + "step": 39 + }, + { + "epoch": 0.07710843373493977, + "grad_norm": 0.4527677595615387, + "learning_rate": 1e-05, + "loss": 0.0163, + "step": 40 + }, + { + "epoch": 0.07903614457831325, + "grad_norm": 0.6241316795349121, + "learning_rate": 1.0256410256410256e-05, + "loss": 0.0302, + "step": 41 + }, + { + "epoch": 0.08096385542168674, + "grad_norm": 0.639043927192688, + "learning_rate": 1.0512820512820514e-05, + "loss": 0.0312, + "step": 42 + }, + { + "epoch": 0.08289156626506024, + "grad_norm": 0.5121409296989441, + "learning_rate": 1.076923076923077e-05, + "loss": 0.0256, + "step": 43 + }, + { + "epoch": 0.08481927710843373, + "grad_norm": 0.6340477466583252, + "learning_rate": 1.1025641025641028e-05, + "loss": 0.04, + "step": 44 + }, + { + "epoch": 0.08674698795180723, + "grad_norm": 0.5260409712791443, + "learning_rate": 1.1282051282051283e-05, + "loss": 0.0282, + "step": 45 + }, + { + "epoch": 0.08867469879518072, + "grad_norm": 0.6390711069107056, + "learning_rate": 1.1538461538461538e-05, + "loss": 0.0243, + "step": 46 + }, + { + "epoch": 0.09060240963855422, + "grad_norm": 0.46469295024871826, + "learning_rate": 1.1794871794871796e-05, + "loss": 0.0208, + "step": 47 + }, + { + "epoch": 0.09253012048192771, + "grad_norm": 0.8711516857147217, + "learning_rate": 1.2051282051282051e-05, + "loss": 0.0291, + "step": 48 + }, + { + "epoch": 0.09445783132530121, + "grad_norm": 0.9164300560951233, + "learning_rate": 1.230769230769231e-05, + "loss": 0.0342, + "step": 49 + }, + { + "epoch": 0.0963855421686747, + "grad_norm": 0.5401139259338379, + "learning_rate": 1.2564102564102565e-05, + "loss": 0.0185, + "step": 50 + }, + { + "epoch": 0.0983132530120482, + "grad_norm": 0.44393008947372437, + "learning_rate": 1.2820512820512823e-05, + "loss": 0.0228, + "step": 51 + }, + { + "epoch": 0.10024096385542168, + "grad_norm": 0.3855767846107483, + "learning_rate": 1.3076923076923078e-05, + "loss": 0.0176, + "step": 52 + }, + { + "epoch": 0.10216867469879518, + "grad_norm": 0.8561235070228577, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.0433, + "step": 53 + }, + { + "epoch": 0.10409638554216867, + "grad_norm": 0.768002450466156, + "learning_rate": 1.3589743589743592e-05, + "loss": 0.0245, + "step": 54 + }, + { + "epoch": 0.10602409638554217, + "grad_norm": 0.4559759497642517, + "learning_rate": 1.3846153846153847e-05, + "loss": 0.0224, + "step": 55 + }, + { + "epoch": 0.10795180722891566, + "grad_norm": 0.6203847527503967, + "learning_rate": 1.4102564102564105e-05, + "loss": 0.0296, + "step": 56 + }, + { + "epoch": 0.10987951807228916, + "grad_norm": 0.6651368141174316, + "learning_rate": 1.435897435897436e-05, + "loss": 0.0336, + "step": 57 + }, + { + "epoch": 0.11180722891566265, + "grad_norm": 0.377734512090683, + "learning_rate": 1.4615384615384615e-05, + "loss": 0.0196, + "step": 58 + }, + { + "epoch": 0.11373493975903615, + "grad_norm": 0.687568724155426, + "learning_rate": 1.4871794871794874e-05, + "loss": 0.0207, + "step": 59 + }, + { + "epoch": 0.11566265060240964, + "grad_norm": 0.7905604243278503, + "learning_rate": 1.5128205128205129e-05, + "loss": 0.047, + "step": 60 + }, + { + "epoch": 0.11759036144578314, + "grad_norm": 0.7938196063041687, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.0198, + "step": 61 + }, + { + "epoch": 0.11951807228915663, + "grad_norm": 0.41340553760528564, + "learning_rate": 1.5641025641025644e-05, + "loss": 0.0161, + "step": 62 + }, + { + "epoch": 0.12144578313253013, + "grad_norm": 0.5668172240257263, + "learning_rate": 1.5897435897435897e-05, + "loss": 0.0275, + "step": 63 + }, + { + "epoch": 0.12337349397590361, + "grad_norm": 0.48333367705345154, + "learning_rate": 1.6153846153846154e-05, + "loss": 0.0137, + "step": 64 + }, + { + "epoch": 0.12530120481927712, + "grad_norm": 0.6843933463096619, + "learning_rate": 1.641025641025641e-05, + "loss": 0.0294, + "step": 65 + }, + { + "epoch": 0.1272289156626506, + "grad_norm": 0.7789272665977478, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.0401, + "step": 66 + }, + { + "epoch": 0.1291566265060241, + "grad_norm": 0.6203492879867554, + "learning_rate": 1.6923076923076924e-05, + "loss": 0.0292, + "step": 67 + }, + { + "epoch": 0.1310843373493976, + "grad_norm": 0.5940662622451782, + "learning_rate": 1.717948717948718e-05, + "loss": 0.0178, + "step": 68 + }, + { + "epoch": 0.13301204819277107, + "grad_norm": 0.35504868626594543, + "learning_rate": 1.7435897435897438e-05, + "loss": 0.0129, + "step": 69 + }, + { + "epoch": 0.13493975903614458, + "grad_norm": 0.8796699643135071, + "learning_rate": 1.7692307692307694e-05, + "loss": 0.034, + "step": 70 + }, + { + "epoch": 0.13686746987951806, + "grad_norm": 0.967444896697998, + "learning_rate": 1.794871794871795e-05, + "loss": 0.0266, + "step": 71 + }, + { + "epoch": 0.13879518072289157, + "grad_norm": 0.4428526759147644, + "learning_rate": 1.8205128205128208e-05, + "loss": 0.0223, + "step": 72 + }, + { + "epoch": 0.14072289156626505, + "grad_norm": 0.42897751927375793, + "learning_rate": 1.8461538461538465e-05, + "loss": 0.0187, + "step": 73 + }, + { + "epoch": 0.14265060240963856, + "grad_norm": 0.5100914835929871, + "learning_rate": 1.8717948717948718e-05, + "loss": 0.0164, + "step": 74 + }, + { + "epoch": 0.14457831325301204, + "grad_norm": 0.6028861999511719, + "learning_rate": 1.8974358974358975e-05, + "loss": 0.0164, + "step": 75 + }, + { + "epoch": 0.14650602409638555, + "grad_norm": 0.6187024116516113, + "learning_rate": 1.923076923076923e-05, + "loss": 0.0296, + "step": 76 + }, + { + "epoch": 0.14843373493975903, + "grad_norm": 0.4822489619255066, + "learning_rate": 1.9487179487179488e-05, + "loss": 0.0148, + "step": 77 + }, + { + "epoch": 0.15036144578313254, + "grad_norm": 0.7231149673461914, + "learning_rate": 1.9743589743589745e-05, + "loss": 0.0395, + "step": 78 + }, + { + "epoch": 0.15228915662650602, + "grad_norm": 0.8409642577171326, + "learning_rate": 2e-05, + "loss": 0.0446, + "step": 79 + }, + { + "epoch": 0.15421686746987953, + "grad_norm": 0.4883500039577484, + "learning_rate": 2.025641025641026e-05, + "loss": 0.0206, + "step": 80 + }, + { + "epoch": 0.156144578313253, + "grad_norm": 0.6287479400634766, + "learning_rate": 2.0512820512820512e-05, + "loss": 0.0333, + "step": 81 + }, + { + "epoch": 0.1580722891566265, + "grad_norm": 0.5041632652282715, + "learning_rate": 2.0769230769230772e-05, + "loss": 0.0414, + "step": 82 + }, + { + "epoch": 0.16, + "grad_norm": 0.5103405117988586, + "learning_rate": 2.102564102564103e-05, + "loss": 0.045, + "step": 83 + }, + { + "epoch": 0.16192771084337348, + "grad_norm": 0.493161678314209, + "learning_rate": 2.1282051282051285e-05, + "loss": 0.021, + "step": 84 + }, + { + "epoch": 0.163855421686747, + "grad_norm": 0.908843994140625, + "learning_rate": 2.153846153846154e-05, + "loss": 0.0389, + "step": 85 + }, + { + "epoch": 0.16578313253012048, + "grad_norm": 0.5067003965377808, + "learning_rate": 2.1794871794871795e-05, + "loss": 0.0272, + "step": 86 + }, + { + "epoch": 0.16771084337349398, + "grad_norm": 0.5791381597518921, + "learning_rate": 2.2051282051282056e-05, + "loss": 0.0368, + "step": 87 + }, + { + "epoch": 0.16963855421686747, + "grad_norm": 0.7056036591529846, + "learning_rate": 2.230769230769231e-05, + "loss": 0.0284, + "step": 88 + }, + { + "epoch": 0.17156626506024097, + "grad_norm": 0.6563822031021118, + "learning_rate": 2.2564102564102566e-05, + "loss": 0.0646, + "step": 89 + }, + { + "epoch": 0.17349397590361446, + "grad_norm": 0.9483286142349243, + "learning_rate": 2.2820512820512822e-05, + "loss": 0.0439, + "step": 90 + }, + { + "epoch": 0.17542168674698796, + "grad_norm": 0.370664119720459, + "learning_rate": 2.3076923076923076e-05, + "loss": 0.0109, + "step": 91 + }, + { + "epoch": 0.17734939759036145, + "grad_norm": 0.9776477813720703, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.0458, + "step": 92 + }, + { + "epoch": 0.17927710843373493, + "grad_norm": 0.45710092782974243, + "learning_rate": 2.3589743589743593e-05, + "loss": 0.0212, + "step": 93 + }, + { + "epoch": 0.18120481927710844, + "grad_norm": 0.8623896837234497, + "learning_rate": 2.384615384615385e-05, + "loss": 0.0215, + "step": 94 + }, + { + "epoch": 0.18313253012048192, + "grad_norm": 0.55814528465271, + "learning_rate": 2.4102564102564103e-05, + "loss": 0.0218, + "step": 95 + }, + { + "epoch": 0.18506024096385543, + "grad_norm": 0.49882641434669495, + "learning_rate": 2.435897435897436e-05, + "loss": 0.0268, + "step": 96 + }, + { + "epoch": 0.1869879518072289, + "grad_norm": 0.3508654534816742, + "learning_rate": 2.461538461538462e-05, + "loss": 0.0172, + "step": 97 + }, + { + "epoch": 0.18891566265060242, + "grad_norm": 0.601170003414154, + "learning_rate": 2.4871794871794873e-05, + "loss": 0.0208, + "step": 98 + }, + { + "epoch": 0.1908433734939759, + "grad_norm": 1.1748133897781372, + "learning_rate": 2.512820512820513e-05, + "loss": 0.0259, + "step": 99 + }, + { + "epoch": 0.1927710843373494, + "grad_norm": 0.46370384097099304, + "learning_rate": 2.5384615384615386e-05, + "loss": 0.0242, + "step": 100 + }, + { + "epoch": 0.1946987951807229, + "grad_norm": 0.525010883808136, + "learning_rate": 2.5641025641025646e-05, + "loss": 0.0188, + "step": 101 + }, + { + "epoch": 0.1966265060240964, + "grad_norm": 0.766501784324646, + "learning_rate": 2.58974358974359e-05, + "loss": 0.0584, + "step": 102 + }, + { + "epoch": 0.19855421686746988, + "grad_norm": 0.3572964370250702, + "learning_rate": 2.6153846153846157e-05, + "loss": 0.0131, + "step": 103 + }, + { + "epoch": 0.20048192771084336, + "grad_norm": 0.6467130780220032, + "learning_rate": 2.6410256410256413e-05, + "loss": 0.0231, + "step": 104 + }, + { + "epoch": 0.20240963855421687, + "grad_norm": 1.1852102279663086, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.027, + "step": 105 + }, + { + "epoch": 0.20433734939759035, + "grad_norm": 2.3659932613372803, + "learning_rate": 2.6923076923076927e-05, + "loss": 0.0224, + "step": 106 + }, + { + "epoch": 0.20626506024096386, + "grad_norm": 0.5343687534332275, + "learning_rate": 2.7179487179487183e-05, + "loss": 0.0198, + "step": 107 + }, + { + "epoch": 0.20819277108433734, + "grad_norm": 1.852160096168518, + "learning_rate": 2.7435897435897437e-05, + "loss": 0.032, + "step": 108 + }, + { + "epoch": 0.21012048192771085, + "grad_norm": 0.47291702032089233, + "learning_rate": 2.7692307692307694e-05, + "loss": 0.0117, + "step": 109 + }, + { + "epoch": 0.21204819277108433, + "grad_norm": 0.7623187899589539, + "learning_rate": 2.794871794871795e-05, + "loss": 0.0337, + "step": 110 + }, + { + "epoch": 0.21397590361445784, + "grad_norm": 0.5272570848464966, + "learning_rate": 2.820512820512821e-05, + "loss": 0.0131, + "step": 111 + }, + { + "epoch": 0.21590361445783132, + "grad_norm": 0.5568500757217407, + "learning_rate": 2.8461538461538464e-05, + "loss": 0.0233, + "step": 112 + }, + { + "epoch": 0.21783132530120483, + "grad_norm": 0.4008469879627228, + "learning_rate": 2.871794871794872e-05, + "loss": 0.0204, + "step": 113 + }, + { + "epoch": 0.2197590361445783, + "grad_norm": 0.4888612926006317, + "learning_rate": 2.8974358974358977e-05, + "loss": 0.016, + "step": 114 + }, + { + "epoch": 0.2216867469879518, + "grad_norm": 0.44903355836868286, + "learning_rate": 2.923076923076923e-05, + "loss": 0.0135, + "step": 115 + }, + { + "epoch": 0.2236144578313253, + "grad_norm": 0.9266762733459473, + "learning_rate": 2.948717948717949e-05, + "loss": 0.0233, + "step": 116 + }, + { + "epoch": 0.22554216867469878, + "grad_norm": 0.5352638959884644, + "learning_rate": 2.9743589743589747e-05, + "loss": 0.0198, + "step": 117 + }, + { + "epoch": 0.2274698795180723, + "grad_norm": 0.6051343679428101, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0246, + "step": 118 + }, + { + "epoch": 0.22939759036144577, + "grad_norm": 0.9971133470535278, + "learning_rate": 3.0256410256410257e-05, + "loss": 0.025, + "step": 119 + }, + { + "epoch": 0.23132530120481928, + "grad_norm": 0.704236626625061, + "learning_rate": 3.0512820512820514e-05, + "loss": 0.031, + "step": 120 + }, + { + "epoch": 0.23325301204819276, + "grad_norm": 0.6137097477912903, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.0519, + "step": 121 + }, + { + "epoch": 0.23518072289156627, + "grad_norm": 0.7396159768104553, + "learning_rate": 3.102564102564103e-05, + "loss": 0.0325, + "step": 122 + }, + { + "epoch": 0.23710843373493976, + "grad_norm": 1.3282053470611572, + "learning_rate": 3.128205128205129e-05, + "loss": 0.0252, + "step": 123 + }, + { + "epoch": 0.23903614457831326, + "grad_norm": 0.5220731496810913, + "learning_rate": 3.153846153846154e-05, + "loss": 0.0262, + "step": 124 + }, + { + "epoch": 0.24096385542168675, + "grad_norm": 0.5357242822647095, + "learning_rate": 3.1794871794871795e-05, + "loss": 0.0243, + "step": 125 + }, + { + "epoch": 0.24289156626506025, + "grad_norm": 0.48207753896713257, + "learning_rate": 3.205128205128206e-05, + "loss": 0.0178, + "step": 126 + }, + { + "epoch": 0.24481927710843374, + "grad_norm": 0.552988588809967, + "learning_rate": 3.230769230769231e-05, + "loss": 0.023, + "step": 127 + }, + { + "epoch": 0.24674698795180722, + "grad_norm": 1.7962840795516968, + "learning_rate": 3.2564102564102565e-05, + "loss": 0.032, + "step": 128 + }, + { + "epoch": 0.24867469879518073, + "grad_norm": 1.6404600143432617, + "learning_rate": 3.282051282051282e-05, + "loss": 0.0231, + "step": 129 + }, + { + "epoch": 0.25060240963855424, + "grad_norm": 0.39142486453056335, + "learning_rate": 3.307692307692308e-05, + "loss": 0.0147, + "step": 130 + }, + { + "epoch": 0.2525301204819277, + "grad_norm": 1.3272887468338013, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0439, + "step": 131 + }, + { + "epoch": 0.2544578313253012, + "grad_norm": 1.5122811794281006, + "learning_rate": 3.358974358974359e-05, + "loss": 0.0282, + "step": 132 + }, + { + "epoch": 0.2563855421686747, + "grad_norm": 1.8542430400848389, + "learning_rate": 3.384615384615385e-05, + "loss": 0.0515, + "step": 133 + }, + { + "epoch": 0.2583132530120482, + "grad_norm": 4.059277534484863, + "learning_rate": 3.4102564102564105e-05, + "loss": 0.0781, + "step": 134 + }, + { + "epoch": 0.26024096385542167, + "grad_norm": 0.6206214427947998, + "learning_rate": 3.435897435897436e-05, + "loss": 0.0306, + "step": 135 + }, + { + "epoch": 0.2621686746987952, + "grad_norm": 0.4575510323047638, + "learning_rate": 3.461538461538462e-05, + "loss": 0.0154, + "step": 136 + }, + { + "epoch": 0.2640963855421687, + "grad_norm": 1.1556978225708008, + "learning_rate": 3.4871794871794875e-05, + "loss": 0.0235, + "step": 137 + }, + { + "epoch": 0.26602409638554214, + "grad_norm": 0.6975051760673523, + "learning_rate": 3.512820512820513e-05, + "loss": 0.0453, + "step": 138 + }, + { + "epoch": 0.26795180722891565, + "grad_norm": 0.8686623573303223, + "learning_rate": 3.538461538461539e-05, + "loss": 0.0427, + "step": 139 + }, + { + "epoch": 0.26987951807228916, + "grad_norm": 2.0681848526000977, + "learning_rate": 3.5641025641025646e-05, + "loss": 0.04, + "step": 140 + }, + { + "epoch": 0.27180722891566267, + "grad_norm": 0.4397984445095062, + "learning_rate": 3.58974358974359e-05, + "loss": 0.0188, + "step": 141 + }, + { + "epoch": 0.2737349397590361, + "grad_norm": 0.5871334075927734, + "learning_rate": 3.615384615384616e-05, + "loss": 0.0253, + "step": 142 + }, + { + "epoch": 0.27566265060240963, + "grad_norm": 1.1078568696975708, + "learning_rate": 3.6410256410256416e-05, + "loss": 0.0316, + "step": 143 + }, + { + "epoch": 0.27759036144578314, + "grad_norm": 0.5691841840744019, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.0266, + "step": 144 + }, + { + "epoch": 0.27951807228915665, + "grad_norm": 0.7896255254745483, + "learning_rate": 3.692307692307693e-05, + "loss": 0.0281, + "step": 145 + }, + { + "epoch": 0.2814457831325301, + "grad_norm": 0.9988337159156799, + "learning_rate": 3.7179487179487186e-05, + "loss": 0.0295, + "step": 146 + }, + { + "epoch": 0.2833734939759036, + "grad_norm": 0.9811834692955017, + "learning_rate": 3.7435897435897436e-05, + "loss": 0.0322, + "step": 147 + }, + { + "epoch": 0.2853012048192771, + "grad_norm": 0.6503105759620667, + "learning_rate": 3.769230769230769e-05, + "loss": 0.0266, + "step": 148 + }, + { + "epoch": 0.28722891566265063, + "grad_norm": 1.9164355993270874, + "learning_rate": 3.794871794871795e-05, + "loss": 0.0677, + "step": 149 + }, + { + "epoch": 0.2891566265060241, + "grad_norm": 1.1724557876586914, + "learning_rate": 3.820512820512821e-05, + "loss": 0.0324, + "step": 150 + }, + { + "epoch": 0.2910843373493976, + "grad_norm": 0.8482469916343689, + "learning_rate": 3.846153846153846e-05, + "loss": 0.0259, + "step": 151 + }, + { + "epoch": 0.2930120481927711, + "grad_norm": 0.8572830557823181, + "learning_rate": 3.871794871794872e-05, + "loss": 0.0358, + "step": 152 + }, + { + "epoch": 0.29493975903614456, + "grad_norm": 0.6630825400352478, + "learning_rate": 3.8974358974358976e-05, + "loss": 0.0447, + "step": 153 + }, + { + "epoch": 0.29686746987951806, + "grad_norm": 0.9197093844413757, + "learning_rate": 3.923076923076923e-05, + "loss": 0.0409, + "step": 154 + }, + { + "epoch": 0.2987951807228916, + "grad_norm": 0.6976819634437561, + "learning_rate": 3.948717948717949e-05, + "loss": 0.0317, + "step": 155 + }, + { + "epoch": 0.3007228915662651, + "grad_norm": 0.7353514432907104, + "learning_rate": 3.9743589743589747e-05, + "loss": 0.0306, + "step": 156 + }, + { + "epoch": 0.30265060240963854, + "grad_norm": 0.5730232000350952, + "learning_rate": 4e-05, + "loss": 0.0324, + "step": 157 + }, + { + "epoch": 0.30457831325301205, + "grad_norm": 0.7852078676223755, + "learning_rate": 3.999994971675547e-05, + "loss": 0.0354, + "step": 158 + }, + { + "epoch": 0.30650602409638555, + "grad_norm": 0.5924715399742126, + "learning_rate": 3.999979886727471e-05, + "loss": 0.0366, + "step": 159 + }, + { + "epoch": 0.30843373493975906, + "grad_norm": 0.7359845638275146, + "learning_rate": 3.999954745231624e-05, + "loss": 0.0437, + "step": 160 + }, + { + "epoch": 0.3103614457831325, + "grad_norm": 0.7866976857185364, + "learning_rate": 3.999919547314426e-05, + "loss": 0.0363, + "step": 161 + }, + { + "epoch": 0.312289156626506, + "grad_norm": 0.7425745129585266, + "learning_rate": 3.999874293152863e-05, + "loss": 0.0259, + "step": 162 + }, + { + "epoch": 0.31421686746987953, + "grad_norm": 1.8922245502471924, + "learning_rate": 3.9998189829744885e-05, + "loss": 0.0341, + "step": 163 + }, + { + "epoch": 0.316144578313253, + "grad_norm": 0.7908634543418884, + "learning_rate": 3.99975361705742e-05, + "loss": 0.0424, + "step": 164 + }, + { + "epoch": 0.3180722891566265, + "grad_norm": 2.047368049621582, + "learning_rate": 3.999678195730337e-05, + "loss": 0.0535, + "step": 165 + }, + { + "epoch": 0.32, + "grad_norm": 0.5702639222145081, + "learning_rate": 3.999592719372484e-05, + "loss": 0.0284, + "step": 166 + }, + { + "epoch": 0.3219277108433735, + "grad_norm": 0.45015648007392883, + "learning_rate": 3.9994971884136636e-05, + "loss": 0.0313, + "step": 167 + }, + { + "epoch": 0.32385542168674697, + "grad_norm": 4.094679355621338, + "learning_rate": 3.9993916033342355e-05, + "loss": 0.0524, + "step": 168 + }, + { + "epoch": 0.3257831325301205, + "grad_norm": 0.800846517086029, + "learning_rate": 3.999275964665117e-05, + "loss": 0.0282, + "step": 169 + }, + { + "epoch": 0.327710843373494, + "grad_norm": 0.47881078720092773, + "learning_rate": 3.999150272987776e-05, + "loss": 0.0293, + "step": 170 + }, + { + "epoch": 0.3296385542168675, + "grad_norm": 0.5716657638549805, + "learning_rate": 3.999014528934232e-05, + "loss": 0.0221, + "step": 171 + }, + { + "epoch": 0.33156626506024095, + "grad_norm": 0.6333311200141907, + "learning_rate": 3.998868733187048e-05, + "loss": 0.0302, + "step": 172 + }, + { + "epoch": 0.33349397590361446, + "grad_norm": 6.642521858215332, + "learning_rate": 3.998712886479335e-05, + "loss": 0.0364, + "step": 173 + }, + { + "epoch": 0.33542168674698797, + "grad_norm": 0.7515506148338318, + "learning_rate": 3.998546989594739e-05, + "loss": 0.0296, + "step": 174 + }, + { + "epoch": 0.3373493975903614, + "grad_norm": 1.0728015899658203, + "learning_rate": 3.998371043367445e-05, + "loss": 0.0549, + "step": 175 + }, + { + "epoch": 0.33927710843373493, + "grad_norm": 1.3025579452514648, + "learning_rate": 3.998185048682166e-05, + "loss": 0.0577, + "step": 176 + }, + { + "epoch": 0.34120481927710844, + "grad_norm": 1.0962958335876465, + "learning_rate": 3.997989006474144e-05, + "loss": 0.0313, + "step": 177 + }, + { + "epoch": 0.34313253012048195, + "grad_norm": 0.7064313292503357, + "learning_rate": 3.997782917729143e-05, + "loss": 0.0309, + "step": 178 + }, + { + "epoch": 0.3450602409638554, + "grad_norm": 0.43374207615852356, + "learning_rate": 3.997566783483445e-05, + "loss": 0.0166, + "step": 179 + }, + { + "epoch": 0.3469879518072289, + "grad_norm": 0.7236390113830566, + "learning_rate": 3.9973406048238413e-05, + "loss": 0.0254, + "step": 180 + }, + { + "epoch": 0.3489156626506024, + "grad_norm": 0.5041500926017761, + "learning_rate": 3.9971043828876334e-05, + "loss": 0.0239, + "step": 181 + }, + { + "epoch": 0.35084337349397593, + "grad_norm": 1.2744532823562622, + "learning_rate": 3.9968581188626204e-05, + "loss": 0.0404, + "step": 182 + }, + { + "epoch": 0.3527710843373494, + "grad_norm": 0.45845362544059753, + "learning_rate": 3.996601813987098e-05, + "loss": 0.0127, + "step": 183 + }, + { + "epoch": 0.3546987951807229, + "grad_norm": 0.4426881968975067, + "learning_rate": 3.996335469549852e-05, + "loss": 0.0176, + "step": 184 + }, + { + "epoch": 0.3566265060240964, + "grad_norm": 1.0030732154846191, + "learning_rate": 3.9960590868901465e-05, + "loss": 0.0457, + "step": 185 + }, + { + "epoch": 0.35855421686746985, + "grad_norm": 0.6428582668304443, + "learning_rate": 3.995772667397725e-05, + "loss": 0.0271, + "step": 186 + }, + { + "epoch": 0.36048192771084336, + "grad_norm": 0.5335744619369507, + "learning_rate": 3.995476212512795e-05, + "loss": 0.0297, + "step": 187 + }, + { + "epoch": 0.3624096385542169, + "grad_norm": 0.6995761394500732, + "learning_rate": 3.99516972372603e-05, + "loss": 0.0322, + "step": 188 + }, + { + "epoch": 0.3643373493975904, + "grad_norm": 0.765511155128479, + "learning_rate": 3.9948532025785546e-05, + "loss": 0.0253, + "step": 189 + }, + { + "epoch": 0.36626506024096384, + "grad_norm": 0.6165828108787537, + "learning_rate": 3.9945266506619403e-05, + "loss": 0.0355, + "step": 190 + }, + { + "epoch": 0.36819277108433734, + "grad_norm": 0.851970911026001, + "learning_rate": 3.994190069618195e-05, + "loss": 0.056, + "step": 191 + }, + { + "epoch": 0.37012048192771085, + "grad_norm": 0.9850023984909058, + "learning_rate": 3.993843461139757e-05, + "loss": 0.0415, + "step": 192 + }, + { + "epoch": 0.37204819277108436, + "grad_norm": 0.7455295324325562, + "learning_rate": 3.9934868269694886e-05, + "loss": 0.0379, + "step": 193 + }, + { + "epoch": 0.3739759036144578, + "grad_norm": 1.159469723701477, + "learning_rate": 3.9931201689006595e-05, + "loss": 0.0237, + "step": 194 + }, + { + "epoch": 0.3759036144578313, + "grad_norm": 0.5490080118179321, + "learning_rate": 3.992743488776947e-05, + "loss": 0.024, + "step": 195 + }, + { + "epoch": 0.37783132530120483, + "grad_norm": 1.279831886291504, + "learning_rate": 3.992356788492421e-05, + "loss": 0.0273, + "step": 196 + }, + { + "epoch": 0.3797590361445783, + "grad_norm": 0.859104335308075, + "learning_rate": 3.9919600699915355e-05, + "loss": 0.0411, + "step": 197 + }, + { + "epoch": 0.3816867469879518, + "grad_norm": 1.2525300979614258, + "learning_rate": 3.991553335269119e-05, + "loss": 0.0857, + "step": 198 + }, + { + "epoch": 0.3836144578313253, + "grad_norm": 0.4924193024635315, + "learning_rate": 3.991136586370367e-05, + "loss": 0.0294, + "step": 199 + }, + { + "epoch": 0.3855421686746988, + "grad_norm": 1.417190670967102, + "learning_rate": 3.990709825390828e-05, + "loss": 0.0395, + "step": 200 + }, + { + "epoch": 0.38746987951807227, + "grad_norm": 0.6172056198120117, + "learning_rate": 3.9902730544763936e-05, + "loss": 0.0194, + "step": 201 + }, + { + "epoch": 0.3893975903614458, + "grad_norm": 0.7292149662971497, + "learning_rate": 3.989826275823291e-05, + "loss": 0.0381, + "step": 202 + }, + { + "epoch": 0.3913253012048193, + "grad_norm": 0.5949816107749939, + "learning_rate": 3.989369491678067e-05, + "loss": 0.0254, + "step": 203 + }, + { + "epoch": 0.3932530120481928, + "grad_norm": 0.6012582182884216, + "learning_rate": 3.988902704337582e-05, + "loss": 0.048, + "step": 204 + }, + { + "epoch": 0.39518072289156625, + "grad_norm": 0.6273590922355652, + "learning_rate": 3.9884259161489936e-05, + "loss": 0.0268, + "step": 205 + }, + { + "epoch": 0.39710843373493976, + "grad_norm": 0.9615244269371033, + "learning_rate": 3.987939129509746e-05, + "loss": 0.0192, + "step": 206 + }, + { + "epoch": 0.39903614457831327, + "grad_norm": 0.6009241342544556, + "learning_rate": 3.9874423468675624e-05, + "loss": 0.0362, + "step": 207 + }, + { + "epoch": 0.4009638554216867, + "grad_norm": 0.411335289478302, + "learning_rate": 3.9869355707204266e-05, + "loss": 0.017, + "step": 208 + }, + { + "epoch": 0.40289156626506023, + "grad_norm": 0.6151527166366577, + "learning_rate": 3.986418803616573e-05, + "loss": 0.0283, + "step": 209 + }, + { + "epoch": 0.40481927710843374, + "grad_norm": 0.33808204531669617, + "learning_rate": 3.985892048154474e-05, + "loss": 0.0158, + "step": 210 + }, + { + "epoch": 0.40674698795180725, + "grad_norm": 0.5464187860488892, + "learning_rate": 3.9853553069828284e-05, + "loss": 0.0292, + "step": 211 + }, + { + "epoch": 0.4086746987951807, + "grad_norm": 0.6658390760421753, + "learning_rate": 3.984808582800543e-05, + "loss": 0.0281, + "step": 212 + }, + { + "epoch": 0.4106024096385542, + "grad_norm": 0.4253764748573303, + "learning_rate": 3.984251878356726e-05, + "loss": 0.031, + "step": 213 + }, + { + "epoch": 0.4125301204819277, + "grad_norm": 0.32309481501579285, + "learning_rate": 3.983685196450667e-05, + "loss": 0.0166, + "step": 214 + }, + { + "epoch": 0.41445783132530123, + "grad_norm": 0.43756410479545593, + "learning_rate": 3.9831085399318265e-05, + "loss": 0.0326, + "step": 215 + }, + { + "epoch": 0.4163855421686747, + "grad_norm": 0.264046847820282, + "learning_rate": 3.982521911699822e-05, + "loss": 0.0118, + "step": 216 + }, + { + "epoch": 0.4183132530120482, + "grad_norm": 0.8630897402763367, + "learning_rate": 3.9819253147044084e-05, + "loss": 0.0246, + "step": 217 + }, + { + "epoch": 0.4202409638554217, + "grad_norm": 0.6923379898071289, + "learning_rate": 3.98131875194547e-05, + "loss": 0.036, + "step": 218 + }, + { + "epoch": 0.42216867469879515, + "grad_norm": 0.5874778628349304, + "learning_rate": 3.9807022264730024e-05, + "loss": 0.0255, + "step": 219 + }, + { + "epoch": 0.42409638554216866, + "grad_norm": 0.394336074590683, + "learning_rate": 3.980075741387094e-05, + "loss": 0.0187, + "step": 220 + }, + { + "epoch": 0.4260240963855422, + "grad_norm": 0.6300327777862549, + "learning_rate": 3.979439299837915e-05, + "loss": 0.0214, + "step": 221 + }, + { + "epoch": 0.4279518072289157, + "grad_norm": 0.5200467109680176, + "learning_rate": 3.978792905025702e-05, + "loss": 0.0628, + "step": 222 + }, + { + "epoch": 0.42987951807228914, + "grad_norm": 0.5713880062103271, + "learning_rate": 3.978136560200735e-05, + "loss": 0.0302, + "step": 223 + }, + { + "epoch": 0.43180722891566264, + "grad_norm": 0.5345383286476135, + "learning_rate": 3.977470268663331e-05, + "loss": 0.0125, + "step": 224 + }, + { + "epoch": 0.43373493975903615, + "grad_norm": 0.5378350019454956, + "learning_rate": 3.976794033763819e-05, + "loss": 0.0246, + "step": 225 + }, + { + "epoch": 0.43566265060240966, + "grad_norm": 0.5554935336112976, + "learning_rate": 3.9761078589025276e-05, + "loss": 0.0212, + "step": 226 + }, + { + "epoch": 0.4375903614457831, + "grad_norm": 0.2832634747028351, + "learning_rate": 3.9754117475297664e-05, + "loss": 0.0125, + "step": 227 + }, + { + "epoch": 0.4395180722891566, + "grad_norm": 1.2910150289535522, + "learning_rate": 3.97470570314581e-05, + "loss": 0.0364, + "step": 228 + }, + { + "epoch": 0.44144578313253013, + "grad_norm": 0.3731018602848053, + "learning_rate": 3.973989729300878e-05, + "loss": 0.0128, + "step": 229 + }, + { + "epoch": 0.4433734939759036, + "grad_norm": 0.9433871507644653, + "learning_rate": 3.9732638295951195e-05, + "loss": 0.0367, + "step": 230 + }, + { + "epoch": 0.4453012048192771, + "grad_norm": 1.0779197216033936, + "learning_rate": 3.972528007678594e-05, + "loss": 0.0667, + "step": 231 + }, + { + "epoch": 0.4472289156626506, + "grad_norm": 1.7009105682373047, + "learning_rate": 3.9717822672512516e-05, + "loss": 0.0655, + "step": 232 + }, + { + "epoch": 0.4491566265060241, + "grad_norm": 0.5646032094955444, + "learning_rate": 3.971026612062919e-05, + "loss": 0.064, + "step": 233 + }, + { + "epoch": 0.45108433734939757, + "grad_norm": 0.44474121928215027, + "learning_rate": 3.970261045913274e-05, + "loss": 0.0206, + "step": 234 + }, + { + "epoch": 0.4530120481927711, + "grad_norm": 1.3969277143478394, + "learning_rate": 3.969485572651833e-05, + "loss": 0.0486, + "step": 235 + }, + { + "epoch": 0.4549397590361446, + "grad_norm": 0.6401994228363037, + "learning_rate": 3.968700196177925e-05, + "loss": 0.0262, + "step": 236 + }, + { + "epoch": 0.4568674698795181, + "grad_norm": 0.7091913223266602, + "learning_rate": 3.96790492044068e-05, + "loss": 0.014, + "step": 237 + }, + { + "epoch": 0.45879518072289155, + "grad_norm": 0.6561547517776489, + "learning_rate": 3.967099749439002e-05, + "loss": 0.0482, + "step": 238 + }, + { + "epoch": 0.46072289156626506, + "grad_norm": 0.6924155354499817, + "learning_rate": 3.966284687221551e-05, + "loss": 0.0289, + "step": 239 + }, + { + "epoch": 0.46265060240963857, + "grad_norm": 0.5868663787841797, + "learning_rate": 3.9654597378867256e-05, + "loss": 0.0331, + "step": 240 + }, + { + "epoch": 0.464578313253012, + "grad_norm": 0.7930939793586731, + "learning_rate": 3.964624905582637e-05, + "loss": 0.0925, + "step": 241 + }, + { + "epoch": 0.46650602409638553, + "grad_norm": 0.4888836145401001, + "learning_rate": 3.9637801945070944e-05, + "loss": 0.015, + "step": 242 + }, + { + "epoch": 0.46843373493975904, + "grad_norm": 0.7820287346839905, + "learning_rate": 3.962925608907579e-05, + "loss": 0.0382, + "step": 243 + }, + { + "epoch": 0.47036144578313255, + "grad_norm": 0.4914316236972809, + "learning_rate": 3.962061153081224e-05, + "loss": 0.0257, + "step": 244 + }, + { + "epoch": 0.472289156626506, + "grad_norm": 0.5681505799293518, + "learning_rate": 3.961186831374793e-05, + "loss": 0.0551, + "step": 245 + }, + { + "epoch": 0.4742168674698795, + "grad_norm": 0.5049723386764526, + "learning_rate": 3.9603026481846616e-05, + "loss": 0.0186, + "step": 246 + }, + { + "epoch": 0.476144578313253, + "grad_norm": 0.5034119486808777, + "learning_rate": 3.959408607956787e-05, + "loss": 0.024, + "step": 247 + }, + { + "epoch": 0.47807228915662653, + "grad_norm": 0.4543336033821106, + "learning_rate": 3.958504715186695e-05, + "loss": 0.0256, + "step": 248 + }, + { + "epoch": 0.48, + "grad_norm": 0.5595743656158447, + "learning_rate": 3.957590974419452e-05, + "loss": 0.0222, + "step": 249 + }, + { + "epoch": 0.4819277108433735, + "grad_norm": 0.5701581239700317, + "learning_rate": 3.956667390249642e-05, + "loss": 0.0334, + "step": 250 + }, + { + "epoch": 0.483855421686747, + "grad_norm": 0.53755784034729, + "learning_rate": 3.9557339673213474e-05, + "loss": 0.0345, + "step": 251 + }, + { + "epoch": 0.4857831325301205, + "grad_norm": 0.4368877112865448, + "learning_rate": 3.95479071032812e-05, + "loss": 0.0183, + "step": 252 + }, + { + "epoch": 0.48771084337349396, + "grad_norm": 0.7972906827926636, + "learning_rate": 3.953837624012963e-05, + "loss": 0.0337, + "step": 253 + }, + { + "epoch": 0.48963855421686747, + "grad_norm": 0.6148451566696167, + "learning_rate": 3.9528747131683023e-05, + "loss": 0.0524, + "step": 254 + }, + { + "epoch": 0.491566265060241, + "grad_norm": 0.500840961933136, + "learning_rate": 3.9519019826359676e-05, + "loss": 0.0248, + "step": 255 + }, + { + "epoch": 0.49349397590361443, + "grad_norm": 0.5536255240440369, + "learning_rate": 3.9509194373071624e-05, + "loss": 0.0219, + "step": 256 + }, + { + "epoch": 0.49542168674698794, + "grad_norm": 0.6873176097869873, + "learning_rate": 3.9499270821224444e-05, + "loss": 0.0312, + "step": 257 + }, + { + "epoch": 0.49734939759036145, + "grad_norm": 0.37207168340682983, + "learning_rate": 3.9489249220716974e-05, + "loss": 0.0149, + "step": 258 + }, + { + "epoch": 0.49927710843373496, + "grad_norm": 0.4458799660205841, + "learning_rate": 3.947912962194107e-05, + "loss": 0.0214, + "step": 259 + }, + { + "epoch": 0.5012048192771085, + "grad_norm": 0.4272724390029907, + "learning_rate": 3.9468912075781345e-05, + "loss": 0.0263, + "step": 260 + }, + { + "epoch": 0.503132530120482, + "grad_norm": 0.5245792269706726, + "learning_rate": 3.945859663361496e-05, + "loss": 0.0103, + "step": 261 + }, + { + "epoch": 0.5050602409638554, + "grad_norm": 0.8799260854721069, + "learning_rate": 3.9448183347311284e-05, + "loss": 0.0292, + "step": 262 + }, + { + "epoch": 0.5069879518072289, + "grad_norm": 0.5996833443641663, + "learning_rate": 3.943767226923171e-05, + "loss": 0.0306, + "step": 263 + }, + { + "epoch": 0.5089156626506024, + "grad_norm": 0.6044682860374451, + "learning_rate": 3.942706345222935e-05, + "loss": 0.0218, + "step": 264 + }, + { + "epoch": 0.5108433734939759, + "grad_norm": 0.4770200848579407, + "learning_rate": 3.941635694964878e-05, + "loss": 0.0226, + "step": 265 + }, + { + "epoch": 0.5127710843373494, + "grad_norm": 0.5605704188346863, + "learning_rate": 3.940555281532576e-05, + "loss": 0.0354, + "step": 266 + }, + { + "epoch": 0.5146987951807229, + "grad_norm": 0.46532443165779114, + "learning_rate": 3.939465110358699e-05, + "loss": 0.0223, + "step": 267 + }, + { + "epoch": 0.5166265060240964, + "grad_norm": 0.5190595388412476, + "learning_rate": 3.93836518692498e-05, + "loss": 0.0219, + "step": 268 + }, + { + "epoch": 0.5185542168674698, + "grad_norm": 0.5767757892608643, + "learning_rate": 3.937255516762193e-05, + "loss": 0.0294, + "step": 269 + }, + { + "epoch": 0.5204819277108433, + "grad_norm": 0.4543164372444153, + "learning_rate": 3.936136105450119e-05, + "loss": 0.0244, + "step": 270 + }, + { + "epoch": 0.5224096385542168, + "grad_norm": 0.4155154526233673, + "learning_rate": 3.9350069586175195e-05, + "loss": 0.02, + "step": 271 + }, + { + "epoch": 0.5243373493975904, + "grad_norm": 0.5470768213272095, + "learning_rate": 3.933868081942113e-05, + "loss": 0.0187, + "step": 272 + }, + { + "epoch": 0.5262650602409639, + "grad_norm": 0.9491772651672363, + "learning_rate": 3.9327194811505406e-05, + "loss": 0.0337, + "step": 273 + }, + { + "epoch": 0.5281927710843374, + "grad_norm": 0.9313873052597046, + "learning_rate": 3.93156116201834e-05, + "loss": 0.0573, + "step": 274 + }, + { + "epoch": 0.5301204819277109, + "grad_norm": 0.7181005477905273, + "learning_rate": 3.930393130369915e-05, + "loss": 0.0405, + "step": 275 + }, + { + "epoch": 0.5320481927710843, + "grad_norm": 0.34231385588645935, + "learning_rate": 3.9292153920785076e-05, + "loss": 0.0153, + "step": 276 + }, + { + "epoch": 0.5339759036144578, + "grad_norm": 0.6899610161781311, + "learning_rate": 3.928027953066168e-05, + "loss": 0.0338, + "step": 277 + }, + { + "epoch": 0.5359036144578313, + "grad_norm": 0.7509781718254089, + "learning_rate": 3.926830819303726e-05, + "loss": 0.0416, + "step": 278 + }, + { + "epoch": 0.5378313253012048, + "grad_norm": 0.6326774954795837, + "learning_rate": 3.925623996810757e-05, + "loss": 0.0293, + "step": 279 + }, + { + "epoch": 0.5397590361445783, + "grad_norm": 0.5543203353881836, + "learning_rate": 3.924407491655557e-05, + "loss": 0.0263, + "step": 280 + }, + { + "epoch": 0.5416867469879518, + "grad_norm": 0.5367572903633118, + "learning_rate": 3.9231813099551086e-05, + "loss": 0.0276, + "step": 281 + }, + { + "epoch": 0.5436144578313253, + "grad_norm": 0.3143869638442993, + "learning_rate": 3.921945457875051e-05, + "loss": 0.0146, + "step": 282 + }, + { + "epoch": 0.5455421686746988, + "grad_norm": 0.47403043508529663, + "learning_rate": 3.920699941629649e-05, + "loss": 0.0267, + "step": 283 + }, + { + "epoch": 0.5474698795180722, + "grad_norm": 0.5082595348358154, + "learning_rate": 3.919444767481763e-05, + "loss": 0.0183, + "step": 284 + }, + { + "epoch": 0.5493975903614458, + "grad_norm": 0.747949481010437, + "learning_rate": 3.918179941742816e-05, + "loss": 0.0412, + "step": 285 + }, + { + "epoch": 0.5513253012048193, + "grad_norm": 0.6553886532783508, + "learning_rate": 3.916905470772762e-05, + "loss": 0.0505, + "step": 286 + }, + { + "epoch": 0.5532530120481928, + "grad_norm": 0.3838176131248474, + "learning_rate": 3.9156213609800545e-05, + "loss": 0.0156, + "step": 287 + }, + { + "epoch": 0.5551807228915663, + "grad_norm": 0.7427731156349182, + "learning_rate": 3.914327618821614e-05, + "loss": 0.0278, + "step": 288 + }, + { + "epoch": 0.5571084337349398, + "grad_norm": 0.2612821161746979, + "learning_rate": 3.913024250802796e-05, + "loss": 0.0101, + "step": 289 + }, + { + "epoch": 0.5590361445783133, + "grad_norm": 0.3799416124820709, + "learning_rate": 3.911711263477357e-05, + "loss": 0.0168, + "step": 290 + }, + { + "epoch": 0.5609638554216867, + "grad_norm": 0.5053854584693909, + "learning_rate": 3.910388663447425e-05, + "loss": 0.0249, + "step": 291 + }, + { + "epoch": 0.5628915662650602, + "grad_norm": 0.38095012307167053, + "learning_rate": 3.909056457363461e-05, + "loss": 0.0156, + "step": 292 + }, + { + "epoch": 0.5648192771084337, + "grad_norm": 0.4477892220020294, + "learning_rate": 3.907714651924229e-05, + "loss": 0.0309, + "step": 293 + }, + { + "epoch": 0.5667469879518072, + "grad_norm": 0.5875864624977112, + "learning_rate": 3.906363253876763e-05, + "loss": 0.0287, + "step": 294 + }, + { + "epoch": 0.5686746987951807, + "grad_norm": 0.522990882396698, + "learning_rate": 3.90500227001633e-05, + "loss": 0.0318, + "step": 295 + }, + { + "epoch": 0.5706024096385542, + "grad_norm": 0.4153876304626465, + "learning_rate": 3.9036317071863994e-05, + "loss": 0.0192, + "step": 296 + }, + { + "epoch": 0.5725301204819278, + "grad_norm": 0.4675769507884979, + "learning_rate": 3.902251572278605e-05, + "loss": 0.067, + "step": 297 + }, + { + "epoch": 0.5744578313253013, + "grad_norm": 0.35778650641441345, + "learning_rate": 3.900861872232713e-05, + "loss": 0.0197, + "step": 298 + }, + { + "epoch": 0.5763855421686747, + "grad_norm": 0.7382330894470215, + "learning_rate": 3.899462614036587e-05, + "loss": 0.0283, + "step": 299 + }, + { + "epoch": 0.5783132530120482, + "grad_norm": 0.41268599033355713, + "learning_rate": 3.89805380472615e-05, + "loss": 0.0207, + "step": 300 + }, + { + "epoch": 0.5802409638554217, + "grad_norm": 1.2013020515441895, + "learning_rate": 3.8966354513853535e-05, + "loss": 0.0301, + "step": 301 + }, + { + "epoch": 0.5821686746987952, + "grad_norm": 0.424757719039917, + "learning_rate": 3.895207561146137e-05, + "loss": 0.022, + "step": 302 + }, + { + "epoch": 0.5840963855421687, + "grad_norm": 0.4196677505970001, + "learning_rate": 3.893770141188396e-05, + "loss": 0.0424, + "step": 303 + }, + { + "epoch": 0.5860240963855422, + "grad_norm": 0.8644190430641174, + "learning_rate": 3.892323198739946e-05, + "loss": 0.08, + "step": 304 + }, + { + "epoch": 0.5879518072289157, + "grad_norm": 0.5645135045051575, + "learning_rate": 3.890866741076482e-05, + "loss": 0.0152, + "step": 305 + }, + { + "epoch": 0.5898795180722891, + "grad_norm": 0.5218387246131897, + "learning_rate": 3.889400775521545e-05, + "loss": 0.0205, + "step": 306 + }, + { + "epoch": 0.5918072289156626, + "grad_norm": 0.39709413051605225, + "learning_rate": 3.8879253094464865e-05, + "loss": 0.0233, + "step": 307 + }, + { + "epoch": 0.5937349397590361, + "grad_norm": 0.3572910726070404, + "learning_rate": 3.8864403502704285e-05, + "loss": 0.0198, + "step": 308 + }, + { + "epoch": 0.5956626506024096, + "grad_norm": 0.382709264755249, + "learning_rate": 3.8849459054602274e-05, + "loss": 0.0176, + "step": 309 + }, + { + "epoch": 0.5975903614457831, + "grad_norm": 3.4527227878570557, + "learning_rate": 3.883441982530436e-05, + "loss": 0.0239, + "step": 310 + }, + { + "epoch": 0.5995180722891567, + "grad_norm": 0.4467569589614868, + "learning_rate": 3.8819285890432674e-05, + "loss": 0.0284, + "step": 311 + }, + { + "epoch": 0.6014457831325302, + "grad_norm": 0.44513460993766785, + "learning_rate": 3.880405732608555e-05, + "loss": 0.0233, + "step": 312 + }, + { + "epoch": 0.6033734939759036, + "grad_norm": 0.8029689192771912, + "learning_rate": 3.8788734208837155e-05, + "loss": 0.0433, + "step": 313 + }, + { + "epoch": 0.6053012048192771, + "grad_norm": 0.7291454076766968, + "learning_rate": 3.877331661573709e-05, + "loss": 0.043, + "step": 314 + }, + { + "epoch": 0.6072289156626506, + "grad_norm": 0.6050467491149902, + "learning_rate": 3.8757804624310006e-05, + "loss": 0.0377, + "step": 315 + }, + { + "epoch": 0.6091566265060241, + "grad_norm": 0.6714366674423218, + "learning_rate": 3.874219831255524e-05, + "loss": 0.046, + "step": 316 + }, + { + "epoch": 0.6110843373493976, + "grad_norm": 0.336037278175354, + "learning_rate": 3.8726497758946394e-05, + "loss": 0.0149, + "step": 317 + }, + { + "epoch": 0.6130120481927711, + "grad_norm": 0.3057402968406677, + "learning_rate": 3.871070304243094e-05, + "loss": 0.014, + "step": 318 + }, + { + "epoch": 0.6149397590361446, + "grad_norm": 0.4537644684314728, + "learning_rate": 3.8694814242429834e-05, + "loss": 0.0503, + "step": 319 + }, + { + "epoch": 0.6168674698795181, + "grad_norm": 0.45573824644088745, + "learning_rate": 3.8678831438837116e-05, + "loss": 0.021, + "step": 320 + }, + { + "epoch": 0.6187951807228915, + "grad_norm": 0.30729591846466064, + "learning_rate": 3.866275471201952e-05, + "loss": 0.0163, + "step": 321 + }, + { + "epoch": 0.620722891566265, + "grad_norm": 0.7614850401878357, + "learning_rate": 3.8646584142816036e-05, + "loss": 0.0347, + "step": 322 + }, + { + "epoch": 0.6226506024096385, + "grad_norm": 0.5323611497879028, + "learning_rate": 3.863031981253754e-05, + "loss": 0.0201, + "step": 323 + }, + { + "epoch": 0.624578313253012, + "grad_norm": 0.34426453709602356, + "learning_rate": 3.861396180296635e-05, + "loss": 0.0243, + "step": 324 + }, + { + "epoch": 0.6265060240963856, + "grad_norm": 0.621636152267456, + "learning_rate": 3.859751019635585e-05, + "loss": 0.0166, + "step": 325 + }, + { + "epoch": 0.6284337349397591, + "grad_norm": 0.549324095249176, + "learning_rate": 3.858096507543006e-05, + "loss": 0.0274, + "step": 326 + }, + { + "epoch": 0.6303614457831326, + "grad_norm": 0.358426570892334, + "learning_rate": 3.8564326523383214e-05, + "loss": 0.0207, + "step": 327 + }, + { + "epoch": 0.632289156626506, + "grad_norm": 0.3639723062515259, + "learning_rate": 3.8547594623879346e-05, + "loss": 0.0297, + "step": 328 + }, + { + "epoch": 0.6342168674698795, + "grad_norm": 0.3402212858200073, + "learning_rate": 3.853076946105188e-05, + "loss": 0.0258, + "step": 329 + }, + { + "epoch": 0.636144578313253, + "grad_norm": 0.4083027243614197, + "learning_rate": 3.85138511195032e-05, + "loss": 0.0351, + "step": 330 + }, + { + "epoch": 0.6380722891566265, + "grad_norm": 0.43532121181488037, + "learning_rate": 3.84968396843042e-05, + "loss": 0.0388, + "step": 331 + }, + { + "epoch": 0.64, + "grad_norm": 0.35353463888168335, + "learning_rate": 3.8479735240993904e-05, + "loss": 0.0203, + "step": 332 + }, + { + "epoch": 0.6419277108433735, + "grad_norm": 0.350149929523468, + "learning_rate": 3.846253787557901e-05, + "loss": 0.0261, + "step": 333 + }, + { + "epoch": 0.643855421686747, + "grad_norm": 0.7665389180183411, + "learning_rate": 3.844524767453344e-05, + "loss": 0.0108, + "step": 334 + }, + { + "epoch": 0.6457831325301204, + "grad_norm": 0.44621360301971436, + "learning_rate": 3.842786472479795e-05, + "loss": 0.0282, + "step": 335 + }, + { + "epoch": 0.6477108433734939, + "grad_norm": 0.7787201404571533, + "learning_rate": 3.841038911377962e-05, + "loss": 0.0216, + "step": 336 + }, + { + "epoch": 0.6496385542168674, + "grad_norm": 0.48260653018951416, + "learning_rate": 3.839282092935153e-05, + "loss": 0.0234, + "step": 337 + }, + { + "epoch": 0.651566265060241, + "grad_norm": 0.4987852871417999, + "learning_rate": 3.837516025985219e-05, + "loss": 0.0515, + "step": 338 + }, + { + "epoch": 0.6534939759036145, + "grad_norm": 0.9030266404151917, + "learning_rate": 3.835740719408517e-05, + "loss": 0.0508, + "step": 339 + }, + { + "epoch": 0.655421686746988, + "grad_norm": 0.6381701231002808, + "learning_rate": 3.833956182131867e-05, + "loss": 0.0405, + "step": 340 + }, + { + "epoch": 0.6573493975903615, + "grad_norm": 0.42828986048698425, + "learning_rate": 3.832162423128499e-05, + "loss": 0.024, + "step": 341 + }, + { + "epoch": 0.659277108433735, + "grad_norm": 0.38725873827934265, + "learning_rate": 3.8303594514180164e-05, + "loss": 0.0199, + "step": 342 + }, + { + "epoch": 0.6612048192771084, + "grad_norm": 0.23280498385429382, + "learning_rate": 3.828547276066346e-05, + "loss": 0.0101, + "step": 343 + }, + { + "epoch": 0.6631325301204819, + "grad_norm": 0.7298216819763184, + "learning_rate": 3.8267259061856925e-05, + "loss": 0.0455, + "step": 344 + }, + { + "epoch": 0.6650602409638554, + "grad_norm": 0.5975687503814697, + "learning_rate": 3.824895350934496e-05, + "loss": 0.0372, + "step": 345 + }, + { + "epoch": 0.6669879518072289, + "grad_norm": 0.6295403242111206, + "learning_rate": 3.823055619517381e-05, + "loss": 0.0362, + "step": 346 + }, + { + "epoch": 0.6689156626506024, + "grad_norm": 0.5086020827293396, + "learning_rate": 3.821206721185115e-05, + "loss": 0.0368, + "step": 347 + }, + { + "epoch": 0.6708433734939759, + "grad_norm": 0.34506168961524963, + "learning_rate": 3.819348665234557e-05, + "loss": 0.0178, + "step": 348 + }, + { + "epoch": 0.6727710843373494, + "grad_norm": 1.309940218925476, + "learning_rate": 3.817481461008617e-05, + "loss": 0.024, + "step": 349 + }, + { + "epoch": 0.6746987951807228, + "grad_norm": 0.4074770510196686, + "learning_rate": 3.815605117896204e-05, + "loss": 0.0262, + "step": 350 + }, + { + "epoch": 0.6766265060240964, + "grad_norm": 0.48525840044021606, + "learning_rate": 3.8137196453321775e-05, + "loss": 0.0209, + "step": 351 + }, + { + "epoch": 0.6785542168674699, + "grad_norm": 0.7199739217758179, + "learning_rate": 3.811825052797308e-05, + "loss": 0.0396, + "step": 352 + }, + { + "epoch": 0.6804819277108434, + "grad_norm": 0.519540011882782, + "learning_rate": 3.8099213498182196e-05, + "loss": 0.0453, + "step": 353 + }, + { + "epoch": 0.6824096385542169, + "grad_norm": 0.9738391041755676, + "learning_rate": 3.808008545967349e-05, + "loss": 0.0317, + "step": 354 + }, + { + "epoch": 0.6843373493975904, + "grad_norm": 1.888344407081604, + "learning_rate": 3.8060866508628953e-05, + "loss": 0.0452, + "step": 355 + }, + { + "epoch": 0.6862650602409639, + "grad_norm": 0.48989811539649963, + "learning_rate": 3.8041556741687695e-05, + "loss": 0.0315, + "step": 356 + }, + { + "epoch": 0.6881927710843373, + "grad_norm": 0.3764645457267761, + "learning_rate": 3.8022156255945496e-05, + "loss": 0.0269, + "step": 357 + }, + { + "epoch": 0.6901204819277108, + "grad_norm": 0.46409738063812256, + "learning_rate": 3.800266514895429e-05, + "loss": 0.0171, + "step": 358 + }, + { + "epoch": 0.6920481927710843, + "grad_norm": 0.41091030836105347, + "learning_rate": 3.7983083518721695e-05, + "loss": 0.0167, + "step": 359 + }, + { + "epoch": 0.6939759036144578, + "grad_norm": 0.8375523090362549, + "learning_rate": 3.79634114637105e-05, + "loss": 0.0342, + "step": 360 + }, + { + "epoch": 0.6959036144578313, + "grad_norm": 1.7053394317626953, + "learning_rate": 3.794364908283817e-05, + "loss": 0.02, + "step": 361 + }, + { + "epoch": 0.6978313253012048, + "grad_norm": 0.4163115918636322, + "learning_rate": 3.792379647547637e-05, + "loss": 0.0138, + "step": 362 + }, + { + "epoch": 0.6997590361445784, + "grad_norm": 0.388751745223999, + "learning_rate": 3.790385374145046e-05, + "loss": 0.0172, + "step": 363 + }, + { + "epoch": 0.7016867469879519, + "grad_norm": 0.5584064722061157, + "learning_rate": 3.7883820981038966e-05, + "loss": 0.0254, + "step": 364 + }, + { + "epoch": 0.7036144578313253, + "grad_norm": 1.394264817237854, + "learning_rate": 3.7863698294973114e-05, + "loss": 0.037, + "step": 365 + }, + { + "epoch": 0.7055421686746988, + "grad_norm": 0.46280744671821594, + "learning_rate": 3.78434857844363e-05, + "loss": 0.0234, + "step": 366 + }, + { + "epoch": 0.7074698795180723, + "grad_norm": 0.39548924565315247, + "learning_rate": 3.782318355106358e-05, + "loss": 0.0164, + "step": 367 + }, + { + "epoch": 0.7093975903614458, + "grad_norm": 0.7307773232460022, + "learning_rate": 3.780279169694118e-05, + "loss": 0.0192, + "step": 368 + }, + { + "epoch": 0.7113253012048193, + "grad_norm": 0.28035807609558105, + "learning_rate": 3.778231032460594e-05, + "loss": 0.0131, + "step": 369 + }, + { + "epoch": 0.7132530120481928, + "grad_norm": 0.8376953601837158, + "learning_rate": 3.776173953704486e-05, + "loss": 0.0291, + "step": 370 + }, + { + "epoch": 0.7151807228915663, + "grad_norm": 0.7356843948364258, + "learning_rate": 3.774107943769454e-05, + "loss": 0.0214, + "step": 371 + }, + { + "epoch": 0.7171084337349397, + "grad_norm": 0.41503390669822693, + "learning_rate": 3.772033013044064e-05, + "loss": 0.0221, + "step": 372 + }, + { + "epoch": 0.7190361445783132, + "grad_norm": 0.35732385516166687, + "learning_rate": 3.7699491719617436e-05, + "loss": 0.015, + "step": 373 + }, + { + "epoch": 0.7209638554216867, + "grad_norm": 0.283778578042984, + "learning_rate": 3.76785643100072e-05, + "loss": 0.0146, + "step": 374 + }, + { + "epoch": 0.7228915662650602, + "grad_norm": 0.3219413459300995, + "learning_rate": 3.765754800683974e-05, + "loss": 0.015, + "step": 375 + }, + { + "epoch": 0.7248192771084337, + "grad_norm": 0.610431432723999, + "learning_rate": 3.7636442915791856e-05, + "loss": 0.0326, + "step": 376 + }, + { + "epoch": 0.7267469879518073, + "grad_norm": 4.944870948791504, + "learning_rate": 3.7615249142986784e-05, + "loss": 0.0432, + "step": 377 + }, + { + "epoch": 0.7286746987951808, + "grad_norm": 0.4894593060016632, + "learning_rate": 3.7593966794993696e-05, + "loss": 0.0174, + "step": 378 + }, + { + "epoch": 0.7306024096385542, + "grad_norm": 0.4211325943470001, + "learning_rate": 3.757259597882714e-05, + "loss": 0.023, + "step": 379 + }, + { + "epoch": 0.7325301204819277, + "grad_norm": 0.33621737360954285, + "learning_rate": 3.755113680194651e-05, + "loss": 0.0201, + "step": 380 + }, + { + "epoch": 0.7344578313253012, + "grad_norm": 0.5799694657325745, + "learning_rate": 3.7529589372255514e-05, + "loss": 0.0173, + "step": 381 + }, + { + "epoch": 0.7363855421686747, + "grad_norm": 0.5172572731971741, + "learning_rate": 3.750795379810162e-05, + "loss": 0.0284, + "step": 382 + }, + { + "epoch": 0.7383132530120482, + "grad_norm": 0.5715453028678894, + "learning_rate": 3.748623018827552e-05, + "loss": 0.0194, + "step": 383 + }, + { + "epoch": 0.7402409638554217, + "grad_norm": 0.5284178256988525, + "learning_rate": 3.746441865201056e-05, + "loss": 0.0247, + "step": 384 + }, + { + "epoch": 0.7421686746987952, + "grad_norm": 0.37828654050827026, + "learning_rate": 3.744251929898223e-05, + "loss": 0.0097, + "step": 385 + }, + { + "epoch": 0.7440963855421687, + "grad_norm": 0.3252779543399811, + "learning_rate": 3.742053223930758e-05, + "loss": 0.0238, + "step": 386 + }, + { + "epoch": 0.7460240963855421, + "grad_norm": 0.6031543612480164, + "learning_rate": 3.7398457583544674e-05, + "loss": 0.0332, + "step": 387 + }, + { + "epoch": 0.7479518072289156, + "grad_norm": 0.23846614360809326, + "learning_rate": 3.737629544269206e-05, + "loss": 0.0122, + "step": 388 + }, + { + "epoch": 0.7498795180722891, + "grad_norm": 0.5274029970169067, + "learning_rate": 3.7354045928188155e-05, + "loss": 0.0324, + "step": 389 + }, + { + "epoch": 0.7518072289156627, + "grad_norm": 0.4672217071056366, + "learning_rate": 3.733170915191075e-05, + "loss": 0.0196, + "step": 390 + }, + { + "epoch": 0.7537349397590362, + "grad_norm": 0.29819396138191223, + "learning_rate": 3.730928522617639e-05, + "loss": 0.0131, + "step": 391 + }, + { + "epoch": 0.7556626506024097, + "grad_norm": 0.43824997544288635, + "learning_rate": 3.7286774263739855e-05, + "loss": 0.0238, + "step": 392 + }, + { + "epoch": 0.7575903614457832, + "grad_norm": 0.2822072505950928, + "learning_rate": 3.726417637779357e-05, + "loss": 0.0314, + "step": 393 + }, + { + "epoch": 0.7595180722891566, + "grad_norm": 0.43815648555755615, + "learning_rate": 3.7241491681967044e-05, + "loss": 0.0144, + "step": 394 + }, + { + "epoch": 0.7614457831325301, + "grad_norm": 0.37194815278053284, + "learning_rate": 3.721872029032628e-05, + "loss": 0.0286, + "step": 395 + }, + { + "epoch": 0.7633734939759036, + "grad_norm": 0.7319737672805786, + "learning_rate": 3.719586231737322e-05, + "loss": 0.0427, + "step": 396 + }, + { + "epoch": 0.7653012048192771, + "grad_norm": 0.5870066285133362, + "learning_rate": 3.717291787804517e-05, + "loss": 0.0138, + "step": 397 + }, + { + "epoch": 0.7672289156626506, + "grad_norm": 0.6574277281761169, + "learning_rate": 3.7149887087714225e-05, + "loss": 0.061, + "step": 398 + }, + { + "epoch": 0.7691566265060241, + "grad_norm": 0.5467348694801331, + "learning_rate": 3.712677006218666e-05, + "loss": 0.022, + "step": 399 + }, + { + "epoch": 0.7710843373493976, + "grad_norm": 0.3589288890361786, + "learning_rate": 3.710356691770238e-05, + "loss": 0.0161, + "step": 400 + }, + { + "epoch": 0.7730120481927711, + "grad_norm": 0.574630618095398, + "learning_rate": 3.708027777093433e-05, + "loss": 0.0285, + "step": 401 + }, + { + "epoch": 0.7749397590361445, + "grad_norm": 0.39048445224761963, + "learning_rate": 3.70569027389879e-05, + "loss": 0.012, + "step": 402 + }, + { + "epoch": 0.776867469879518, + "grad_norm": 0.34803536534309387, + "learning_rate": 3.703344193940032e-05, + "loss": 0.0155, + "step": 403 + }, + { + "epoch": 0.7787951807228916, + "grad_norm": 1.188948392868042, + "learning_rate": 3.700989549014011e-05, + "loss": 0.0617, + "step": 404 + }, + { + "epoch": 0.7807228915662651, + "grad_norm": 0.473157674074173, + "learning_rate": 3.698626350960646e-05, + "loss": 0.0298, + "step": 405 + }, + { + "epoch": 0.7826506024096386, + "grad_norm": 0.42009076476097107, + "learning_rate": 3.6962546116628634e-05, + "loss": 0.03, + "step": 406 + }, + { + "epoch": 0.7845783132530121, + "grad_norm": 0.6334308981895447, + "learning_rate": 3.693874343046537e-05, + "loss": 0.0107, + "step": 407 + }, + { + "epoch": 0.7865060240963856, + "grad_norm": 0.35594677925109863, + "learning_rate": 3.6914855570804314e-05, + "loss": 0.0174, + "step": 408 + }, + { + "epoch": 0.788433734939759, + "grad_norm": 0.28985708951950073, + "learning_rate": 3.689088265776136e-05, + "loss": 0.0149, + "step": 409 + }, + { + "epoch": 0.7903614457831325, + "grad_norm": 0.3981950581073761, + "learning_rate": 3.686682481188011e-05, + "loss": 0.019, + "step": 410 + }, + { + "epoch": 0.792289156626506, + "grad_norm": 0.48819583654403687, + "learning_rate": 3.6842682154131193e-05, + "loss": 0.0217, + "step": 411 + }, + { + "epoch": 0.7942168674698795, + "grad_norm": 0.42819952964782715, + "learning_rate": 3.681845480591174e-05, + "loss": 0.0198, + "step": 412 + }, + { + "epoch": 0.796144578313253, + "grad_norm": 0.48591694235801697, + "learning_rate": 3.6794142889044727e-05, + "loss": 0.0253, + "step": 413 + }, + { + "epoch": 0.7980722891566265, + "grad_norm": 0.4730607271194458, + "learning_rate": 3.676974652577835e-05, + "loss": 0.0329, + "step": 414 + }, + { + "epoch": 0.8, + "grad_norm": 0.5390865802764893, + "learning_rate": 3.6745265838785434e-05, + "loss": 0.0479, + "step": 415 + }, + { + "epoch": 0.8019277108433734, + "grad_norm": 0.6377891302108765, + "learning_rate": 3.672070095116283e-05, + "loss": 0.019, + "step": 416 + }, + { + "epoch": 0.803855421686747, + "grad_norm": 0.8984615206718445, + "learning_rate": 3.669605198643075e-05, + "loss": 0.0444, + "step": 417 + }, + { + "epoch": 0.8057831325301205, + "grad_norm": 0.4913877546787262, + "learning_rate": 3.667131906853219e-05, + "loss": 0.031, + "step": 418 + }, + { + "epoch": 0.807710843373494, + "grad_norm": 0.37894028425216675, + "learning_rate": 3.664650232183229e-05, + "loss": 0.0195, + "step": 419 + }, + { + "epoch": 0.8096385542168675, + "grad_norm": 0.3644949495792389, + "learning_rate": 3.66216018711177e-05, + "loss": 0.018, + "step": 420 + }, + { + "epoch": 0.811566265060241, + "grad_norm": 0.414440393447876, + "learning_rate": 3.659661784159597e-05, + "loss": 0.0188, + "step": 421 + }, + { + "epoch": 0.8134939759036145, + "grad_norm": 0.49220341444015503, + "learning_rate": 3.65715503588949e-05, + "loss": 0.016, + "step": 422 + }, + { + "epoch": 0.815421686746988, + "grad_norm": 1.0939836502075195, + "learning_rate": 3.654639954906193e-05, + "loss": 0.0758, + "step": 423 + }, + { + "epoch": 0.8173493975903614, + "grad_norm": 0.43222442269325256, + "learning_rate": 3.652116553856349e-05, + "loss": 0.0308, + "step": 424 + }, + { + "epoch": 0.8192771084337349, + "grad_norm": 0.5081896185874939, + "learning_rate": 3.649584845428438e-05, + "loss": 0.0493, + "step": 425 + }, + { + "epoch": 0.8212048192771084, + "grad_norm": 0.9811948537826538, + "learning_rate": 3.64704484235271e-05, + "loss": 0.019, + "step": 426 + }, + { + "epoch": 0.8231325301204819, + "grad_norm": 0.31656572222709656, + "learning_rate": 3.6444965574011255e-05, + "loss": 0.0135, + "step": 427 + }, + { + "epoch": 0.8250602409638554, + "grad_norm": 0.7844433188438416, + "learning_rate": 3.641940003387289e-05, + "loss": 0.0402, + "step": 428 + }, + { + "epoch": 0.826987951807229, + "grad_norm": 0.3353273570537567, + "learning_rate": 3.6393751931663814e-05, + "loss": 0.0132, + "step": 429 + }, + { + "epoch": 0.8289156626506025, + "grad_norm": 0.7253058552742004, + "learning_rate": 3.6368021396351015e-05, + "loss": 0.0296, + "step": 430 + }, + { + "epoch": 0.8308433734939759, + "grad_norm": 0.45300304889678955, + "learning_rate": 3.634220855731598e-05, + "loss": 0.0258, + "step": 431 + }, + { + "epoch": 0.8327710843373494, + "grad_norm": 0.3480473458766937, + "learning_rate": 3.631631354435403e-05, + "loss": 0.0099, + "step": 432 + }, + { + "epoch": 0.8346987951807229, + "grad_norm": 2.1114516258239746, + "learning_rate": 3.62903364876737e-05, + "loss": 0.0457, + "step": 433 + }, + { + "epoch": 0.8366265060240964, + "grad_norm": 0.5649561882019043, + "learning_rate": 3.626427751789606e-05, + "loss": 0.0444, + "step": 434 + }, + { + "epoch": 0.8385542168674699, + "grad_norm": 0.3864995539188385, + "learning_rate": 3.623813676605405e-05, + "loss": 0.0223, + "step": 435 + }, + { + "epoch": 0.8404819277108434, + "grad_norm": 1.2134298086166382, + "learning_rate": 3.621191436359186e-05, + "loss": 0.0353, + "step": 436 + }, + { + "epoch": 0.8424096385542169, + "grad_norm": 0.4403415024280548, + "learning_rate": 3.6185610442364246e-05, + "loss": 0.0216, + "step": 437 + }, + { + "epoch": 0.8443373493975903, + "grad_norm": 0.6050297021865845, + "learning_rate": 3.6159225134635846e-05, + "loss": 0.0433, + "step": 438 + }, + { + "epoch": 0.8462650602409638, + "grad_norm": 0.7951678037643433, + "learning_rate": 3.6132758573080556e-05, + "loss": 0.031, + "step": 439 + }, + { + "epoch": 0.8481927710843373, + "grad_norm": 0.4991949796676636, + "learning_rate": 3.6106210890780834e-05, + "loss": 0.0313, + "step": 440 + }, + { + "epoch": 0.8501204819277108, + "grad_norm": 0.47951385378837585, + "learning_rate": 3.607958222122704e-05, + "loss": 0.0218, + "step": 441 + }, + { + "epoch": 0.8520481927710843, + "grad_norm": 0.7345194220542908, + "learning_rate": 3.6052872698316755e-05, + "loss": 0.0239, + "step": 442 + }, + { + "epoch": 0.8539759036144579, + "grad_norm": 1.4814884662628174, + "learning_rate": 3.602608245635414e-05, + "loss": 0.0127, + "step": 443 + }, + { + "epoch": 0.8559036144578314, + "grad_norm": 2.4240877628326416, + "learning_rate": 3.599921163004922e-05, + "loss": 0.0618, + "step": 444 + }, + { + "epoch": 0.8578313253012049, + "grad_norm": 0.41523510217666626, + "learning_rate": 3.5972260354517216e-05, + "loss": 0.0283, + "step": 445 + }, + { + "epoch": 0.8597590361445783, + "grad_norm": 0.5577677488327026, + "learning_rate": 3.594522876527791e-05, + "loss": 0.0271, + "step": 446 + }, + { + "epoch": 0.8616867469879518, + "grad_norm": 0.5829064846038818, + "learning_rate": 3.591811699825487e-05, + "loss": 0.0169, + "step": 447 + }, + { + "epoch": 0.8636144578313253, + "grad_norm": 0.4478822350502014, + "learning_rate": 3.5890925189774886e-05, + "loss": 0.0239, + "step": 448 + }, + { + "epoch": 0.8655421686746988, + "grad_norm": 0.3498048782348633, + "learning_rate": 3.586365347656718e-05, + "loss": 0.0137, + "step": 449 + }, + { + "epoch": 0.8674698795180723, + "grad_norm": 0.6571130156517029, + "learning_rate": 3.583630199576278e-05, + "loss": 0.027, + "step": 450 + }, + { + "epoch": 0.8693975903614458, + "grad_norm": 0.344970166683197, + "learning_rate": 3.58088708848938e-05, + "loss": 0.0167, + "step": 451 + }, + { + "epoch": 0.8713253012048193, + "grad_norm": 0.34611570835113525, + "learning_rate": 3.5781360281892775e-05, + "loss": 0.0468, + "step": 452 + }, + { + "epoch": 0.8732530120481927, + "grad_norm": 0.66157066822052, + "learning_rate": 3.575377032509194e-05, + "loss": 0.0344, + "step": 453 + }, + { + "epoch": 0.8751807228915662, + "grad_norm": 0.3676326870918274, + "learning_rate": 3.5726101153222534e-05, + "loss": 0.0366, + "step": 454 + }, + { + "epoch": 0.8771084337349397, + "grad_norm": 0.5958423018455505, + "learning_rate": 3.569835290541414e-05, + "loss": 0.0382, + "step": 455 + }, + { + "epoch": 0.8790361445783132, + "grad_norm": 0.36787471175193787, + "learning_rate": 3.567052572119397e-05, + "loss": 0.018, + "step": 456 + }, + { + "epoch": 0.8809638554216868, + "grad_norm": 0.9478234052658081, + "learning_rate": 3.564261974048611e-05, + "loss": 0.0179, + "step": 457 + }, + { + "epoch": 0.8828915662650603, + "grad_norm": 0.3337579369544983, + "learning_rate": 3.56146351036109e-05, + "loss": 0.0147, + "step": 458 + }, + { + "epoch": 0.8848192771084338, + "grad_norm": 0.4984932243824005, + "learning_rate": 3.558657195128416e-05, + "loss": 0.0224, + "step": 459 + }, + { + "epoch": 0.8867469879518072, + "grad_norm": 0.36718735098838806, + "learning_rate": 3.555843042461653e-05, + "loss": 0.0202, + "step": 460 + }, + { + "epoch": 0.8886746987951807, + "grad_norm": 0.4081745445728302, + "learning_rate": 3.553021066511274e-05, + "loss": 0.0288, + "step": 461 + }, + { + "epoch": 0.8906024096385542, + "grad_norm": 0.3233242332935333, + "learning_rate": 3.55019128146709e-05, + "loss": 0.0362, + "step": 462 + }, + { + "epoch": 0.8925301204819277, + "grad_norm": 0.6560158729553223, + "learning_rate": 3.547353701558178e-05, + "loss": 0.038, + "step": 463 + }, + { + "epoch": 0.8944578313253012, + "grad_norm": 0.47668641805648804, + "learning_rate": 3.544508341052811e-05, + "loss": 0.0399, + "step": 464 + }, + { + "epoch": 0.8963855421686747, + "grad_norm": 0.45512664318084717, + "learning_rate": 3.541655214258383e-05, + "loss": 0.022, + "step": 465 + }, + { + "epoch": 0.8983132530120482, + "grad_norm": 0.8410730361938477, + "learning_rate": 3.538794335521343e-05, + "loss": 0.0315, + "step": 466 + }, + { + "epoch": 0.9002409638554217, + "grad_norm": 0.4872909486293793, + "learning_rate": 3.535925719227117e-05, + "loss": 0.0152, + "step": 467 + }, + { + "epoch": 0.9021686746987951, + "grad_norm": 0.45623311400413513, + "learning_rate": 3.533049379800038e-05, + "loss": 0.0305, + "step": 468 + }, + { + "epoch": 0.9040963855421686, + "grad_norm": 0.43087029457092285, + "learning_rate": 3.530165331703275e-05, + "loss": 0.0131, + "step": 469 + }, + { + "epoch": 0.9060240963855422, + "grad_norm": 0.4610525369644165, + "learning_rate": 3.527273589438756e-05, + "loss": 0.0187, + "step": 470 + }, + { + "epoch": 0.9079518072289157, + "grad_norm": 0.3356114327907562, + "learning_rate": 3.5243741675471006e-05, + "loss": 0.0185, + "step": 471 + }, + { + "epoch": 0.9098795180722892, + "grad_norm": 0.9065960049629211, + "learning_rate": 3.5214670806075426e-05, + "loss": 0.0433, + "step": 472 + }, + { + "epoch": 0.9118072289156627, + "grad_norm": 0.3652578294277191, + "learning_rate": 3.518552343237858e-05, + "loss": 0.02, + "step": 473 + }, + { + "epoch": 0.9137349397590362, + "grad_norm": 0.32377883791923523, + "learning_rate": 3.5156299700942916e-05, + "loss": 0.0165, + "step": 474 + }, + { + "epoch": 0.9156626506024096, + "grad_norm": 0.2431817352771759, + "learning_rate": 3.512699975871485e-05, + "loss": 0.0172, + "step": 475 + }, + { + "epoch": 0.9175903614457831, + "grad_norm": 0.6390707492828369, + "learning_rate": 3.509762375302399e-05, + "loss": 0.0356, + "step": 476 + }, + { + "epoch": 0.9195180722891566, + "grad_norm": 0.2283092886209488, + "learning_rate": 3.506817183158243e-05, + "loss": 0.0088, + "step": 477 + }, + { + "epoch": 0.9214457831325301, + "grad_norm": 0.5053914189338684, + "learning_rate": 3.5038644142483966e-05, + "loss": 0.0389, + "step": 478 + }, + { + "epoch": 0.9233734939759036, + "grad_norm": 0.2567576467990875, + "learning_rate": 3.500904083420342e-05, + "loss": 0.0155, + "step": 479 + }, + { + "epoch": 0.9253012048192771, + "grad_norm": 0.6852384209632874, + "learning_rate": 3.497936205559583e-05, + "loss": 0.0247, + "step": 480 + }, + { + "epoch": 0.9272289156626506, + "grad_norm": 0.36403414607048035, + "learning_rate": 3.494960795589572e-05, + "loss": 0.023, + "step": 481 + }, + { + "epoch": 0.929156626506024, + "grad_norm": 0.506554901599884, + "learning_rate": 3.491977868471635e-05, + "loss": 0.0273, + "step": 482 + }, + { + "epoch": 0.9310843373493976, + "grad_norm": 0.38329923152923584, + "learning_rate": 3.4889874392048985e-05, + "loss": 0.0169, + "step": 483 + }, + { + "epoch": 0.9330120481927711, + "grad_norm": 0.2805836498737335, + "learning_rate": 3.48598952282621e-05, + "loss": 0.0105, + "step": 484 + }, + { + "epoch": 0.9349397590361446, + "grad_norm": 0.6315302848815918, + "learning_rate": 3.482984134410067e-05, + "loss": 0.0289, + "step": 485 + }, + { + "epoch": 0.9368674698795181, + "grad_norm": 0.6431388854980469, + "learning_rate": 3.479971289068537e-05, + "loss": 0.0311, + "step": 486 + }, + { + "epoch": 0.9387951807228916, + "grad_norm": 0.9794723391532898, + "learning_rate": 3.476951001951184e-05, + "loss": 0.0452, + "step": 487 + }, + { + "epoch": 0.9407228915662651, + "grad_norm": 0.7984824180603027, + "learning_rate": 3.473923288244991e-05, + "loss": 0.0689, + "step": 488 + }, + { + "epoch": 0.9426506024096386, + "grad_norm": 0.46362006664276123, + "learning_rate": 3.470888163174286e-05, + "loss": 0.0241, + "step": 489 + }, + { + "epoch": 0.944578313253012, + "grad_norm": 0.5051195025444031, + "learning_rate": 3.467845642000661e-05, + "loss": 0.0228, + "step": 490 + }, + { + "epoch": 0.9465060240963855, + "grad_norm": 0.3082812428474426, + "learning_rate": 3.4647957400229004e-05, + "loss": 0.0144, + "step": 491 + }, + { + "epoch": 0.948433734939759, + "grad_norm": 0.2691391110420227, + "learning_rate": 3.461738472576902e-05, + "loss": 0.0167, + "step": 492 + }, + { + "epoch": 0.9503614457831325, + "grad_norm": 0.5627671480178833, + "learning_rate": 3.458673855035597e-05, + "loss": 0.031, + "step": 493 + }, + { + "epoch": 0.952289156626506, + "grad_norm": 0.4571435749530792, + "learning_rate": 3.455601902808876e-05, + "loss": 0.0191, + "step": 494 + }, + { + "epoch": 0.9542168674698795, + "grad_norm": 1.0117709636688232, + "learning_rate": 3.452522631343515e-05, + "loss": 0.0192, + "step": 495 + }, + { + "epoch": 0.9561445783132531, + "grad_norm": 0.28375712037086487, + "learning_rate": 3.449436056123086e-05, + "loss": 0.0159, + "step": 496 + }, + { + "epoch": 0.9580722891566265, + "grad_norm": 0.26381856203079224, + "learning_rate": 3.446342192667893e-05, + "loss": 0.0113, + "step": 497 + }, + { + "epoch": 0.96, + "grad_norm": 0.49317577481269836, + "learning_rate": 3.443241056534884e-05, + "loss": 0.0332, + "step": 498 + }, + { + "epoch": 0.9619277108433735, + "grad_norm": 0.28884485363960266, + "learning_rate": 3.440132663317579e-05, + "loss": 0.0117, + "step": 499 + }, + { + "epoch": 0.963855421686747, + "grad_norm": 0.36255285143852234, + "learning_rate": 3.4370170286459864e-05, + "loss": 0.0169, + "step": 500 + }, + { + "epoch": 0.9657831325301205, + "grad_norm": 0.4265049993991852, + "learning_rate": 3.433894168186529e-05, + "loss": 0.0217, + "step": 501 + }, + { + "epoch": 0.967710843373494, + "grad_norm": 0.8169426321983337, + "learning_rate": 3.430764097641962e-05, + "loss": 0.0207, + "step": 502 + }, + { + "epoch": 0.9696385542168675, + "grad_norm": 1.866077184677124, + "learning_rate": 3.427626832751296e-05, + "loss": 0.0381, + "step": 503 + }, + { + "epoch": 0.971566265060241, + "grad_norm": 0.33124980330467224, + "learning_rate": 3.424482389289716e-05, + "loss": 0.0245, + "step": 504 + }, + { + "epoch": 0.9734939759036144, + "grad_norm": 0.37479540705680847, + "learning_rate": 3.4213307830685055e-05, + "loss": 0.0164, + "step": 505 + }, + { + "epoch": 0.9754216867469879, + "grad_norm": 0.39738863706588745, + "learning_rate": 3.4181720299349615e-05, + "loss": 0.0297, + "step": 506 + }, + { + "epoch": 0.9773493975903614, + "grad_norm": 0.2567287087440491, + "learning_rate": 3.4150061457723205e-05, + "loss": 0.0102, + "step": 507 + }, + { + "epoch": 0.9792771084337349, + "grad_norm": 0.6230517029762268, + "learning_rate": 3.411833146499675e-05, + "loss": 0.0243, + "step": 508 + }, + { + "epoch": 0.9812048192771085, + "grad_norm": 0.44843971729278564, + "learning_rate": 3.408653048071894e-05, + "loss": 0.0357, + "step": 509 + }, + { + "epoch": 0.983132530120482, + "grad_norm": 1.0569655895233154, + "learning_rate": 3.405465866479546e-05, + "loss": 0.037, + "step": 510 + }, + { + "epoch": 0.9850602409638555, + "grad_norm": 0.29000964760780334, + "learning_rate": 3.402271617748812e-05, + "loss": 0.0129, + "step": 511 + }, + { + "epoch": 0.9869879518072289, + "grad_norm": 2.1627447605133057, + "learning_rate": 3.399070317941413e-05, + "loss": 0.0442, + "step": 512 + }, + { + "epoch": 0.9889156626506024, + "grad_norm": 0.27371272444725037, + "learning_rate": 3.395861983154522e-05, + "loss": 0.0119, + "step": 513 + }, + { + "epoch": 0.9908433734939759, + "grad_norm": 0.4117226302623749, + "learning_rate": 3.392646629520688e-05, + "loss": 0.0455, + "step": 514 + }, + { + "epoch": 0.9927710843373494, + "grad_norm": 0.5098996758460999, + "learning_rate": 3.389424273207752e-05, + "loss": 0.0203, + "step": 515 + }, + { + "epoch": 0.9946987951807229, + "grad_norm": 0.5192157626152039, + "learning_rate": 3.386194930418767e-05, + "loss": 0.0329, + "step": 516 + }, + { + "epoch": 0.9966265060240964, + "grad_norm": 0.18757697939872742, + "learning_rate": 3.382958617391915e-05, + "loss": 0.0065, + "step": 517 + }, + { + "epoch": 0.9985542168674699, + "grad_norm": 0.3334413170814514, + "learning_rate": 3.3797153504004296e-05, + "loss": 0.0266, + "step": 518 + }, + { + "epoch": 1.0, + "grad_norm": 0.4152225852012634, + "learning_rate": 3.3764651457525095e-05, + "loss": 0.0169, + "step": 519 + }, + { + "epoch": 1.0019277108433735, + "grad_norm": 0.43535247445106506, + "learning_rate": 3.373208019791237e-05, + "loss": 0.0221, + "step": 520 + }, + { + "epoch": 1.003855421686747, + "grad_norm": 0.39292722940444946, + "learning_rate": 3.3699439888945e-05, + "loss": 0.0211, + "step": 521 + }, + { + "epoch": 1.0057831325301205, + "grad_norm": 0.19566713273525238, + "learning_rate": 3.366673069474904e-05, + "loss": 0.0069, + "step": 522 + }, + { + "epoch": 1.007710843373494, + "grad_norm": 0.5101853609085083, + "learning_rate": 3.3633952779796914e-05, + "loss": 0.0191, + "step": 523 + }, + { + "epoch": 1.0096385542168675, + "grad_norm": 0.999434769153595, + "learning_rate": 3.360110630890664e-05, + "loss": 0.0196, + "step": 524 + }, + { + "epoch": 1.011566265060241, + "grad_norm": 0.4646223783493042, + "learning_rate": 3.356819144724092e-05, + "loss": 0.0328, + "step": 525 + }, + { + "epoch": 1.0134939759036146, + "grad_norm": 0.3132480978965759, + "learning_rate": 3.3535208360306354e-05, + "loss": 0.0203, + "step": 526 + }, + { + "epoch": 1.0154216867469879, + "grad_norm": 0.3038032352924347, + "learning_rate": 3.350215721395261e-05, + "loss": 0.0122, + "step": 527 + }, + { + "epoch": 1.0173493975903614, + "grad_norm": 0.45082882046699524, + "learning_rate": 3.346903817437157e-05, + "loss": 0.0437, + "step": 528 + }, + { + "epoch": 1.0192771084337349, + "grad_norm": 0.26917046308517456, + "learning_rate": 3.343585140809651e-05, + "loss": 0.013, + "step": 529 + }, + { + "epoch": 1.0212048192771084, + "grad_norm": 0.23869264125823975, + "learning_rate": 3.3402597082001276e-05, + "loss": 0.008, + "step": 530 + }, + { + "epoch": 1.0231325301204819, + "grad_norm": 0.31315353512763977, + "learning_rate": 3.3369275363299394e-05, + "loss": 0.0078, + "step": 531 + }, + { + "epoch": 1.0250602409638554, + "grad_norm": 0.4780346751213074, + "learning_rate": 3.333588641954327e-05, + "loss": 0.0225, + "step": 532 + }, + { + "epoch": 1.026987951807229, + "grad_norm": 0.2920368015766144, + "learning_rate": 3.330243041862336e-05, + "loss": 0.0118, + "step": 533 + }, + { + "epoch": 1.0289156626506024, + "grad_norm": 0.543669581413269, + "learning_rate": 3.326890752876728e-05, + "loss": 0.0338, + "step": 534 + }, + { + "epoch": 1.030843373493976, + "grad_norm": 0.4288000464439392, + "learning_rate": 3.323531791853901e-05, + "loss": 0.0341, + "step": 535 + }, + { + "epoch": 1.0327710843373494, + "grad_norm": 0.26600322127342224, + "learning_rate": 3.3201661756838e-05, + "loss": 0.0184, + "step": 536 + }, + { + "epoch": 1.034698795180723, + "grad_norm": 0.290937602519989, + "learning_rate": 3.316793921289835e-05, + "loss": 0.0152, + "step": 537 + }, + { + "epoch": 1.0366265060240965, + "grad_norm": 0.7621443271636963, + "learning_rate": 3.313415045628795e-05, + "loss": 0.0326, + "step": 538 + }, + { + "epoch": 1.03855421686747, + "grad_norm": 0.5581283569335938, + "learning_rate": 3.3100295656907646e-05, + "loss": 0.0164, + "step": 539 + }, + { + "epoch": 1.0404819277108435, + "grad_norm": 0.20930901169776917, + "learning_rate": 3.306637498499034e-05, + "loss": 0.0091, + "step": 540 + }, + { + "epoch": 1.0424096385542168, + "grad_norm": 0.46212059259414673, + "learning_rate": 3.303238861110018e-05, + "loss": 0.0118, + "step": 541 + }, + { + "epoch": 1.0443373493975903, + "grad_norm": 0.38259151577949524, + "learning_rate": 3.299833670613168e-05, + "loss": 0.0081, + "step": 542 + }, + { + "epoch": 1.0462650602409638, + "grad_norm": 0.4888618290424347, + "learning_rate": 3.2964219441308865e-05, + "loss": 0.0138, + "step": 543 + }, + { + "epoch": 1.0481927710843373, + "grad_norm": 0.32103127241134644, + "learning_rate": 3.2930036988184425e-05, + "loss": 0.0171, + "step": 544 + }, + { + "epoch": 1.0501204819277108, + "grad_norm": 0.27787327766418457, + "learning_rate": 3.28957895186388e-05, + "loss": 0.0106, + "step": 545 + }, + { + "epoch": 1.0520481927710843, + "grad_norm": 0.35597777366638184, + "learning_rate": 3.2861477204879395e-05, + "loss": 0.0123, + "step": 546 + }, + { + "epoch": 1.0539759036144578, + "grad_norm": 0.3619804084300995, + "learning_rate": 3.2827100219439656e-05, + "loss": 0.0088, + "step": 547 + }, + { + "epoch": 1.0559036144578313, + "grad_norm": 0.2525513470172882, + "learning_rate": 3.279265873517822e-05, + "loss": 0.0179, + "step": 548 + }, + { + "epoch": 1.0578313253012048, + "grad_norm": 0.3910020887851715, + "learning_rate": 3.275815292527804e-05, + "loss": 0.0142, + "step": 549 + }, + { + "epoch": 1.0597590361445783, + "grad_norm": 0.30515050888061523, + "learning_rate": 3.2723582963245526e-05, + "loss": 0.0123, + "step": 550 + }, + { + "epoch": 1.0616867469879518, + "grad_norm": 0.21708644926548004, + "learning_rate": 3.2688949022909665e-05, + "loss": 0.0098, + "step": 551 + }, + { + "epoch": 1.0636144578313254, + "grad_norm": 0.23307719826698303, + "learning_rate": 3.265425127842114e-05, + "loss": 0.0097, + "step": 552 + }, + { + "epoch": 1.0655421686746989, + "grad_norm": 0.676654577255249, + "learning_rate": 3.261948990425147e-05, + "loss": 0.0227, + "step": 553 + }, + { + "epoch": 1.0674698795180724, + "grad_norm": 0.4593975841999054, + "learning_rate": 3.258466507519213e-05, + "loss": 0.047, + "step": 554 + }, + { + "epoch": 1.0693975903614459, + "grad_norm": 0.19405829906463623, + "learning_rate": 3.254977696635366e-05, + "loss": 0.0314, + "step": 555 + }, + { + "epoch": 1.0713253012048192, + "grad_norm": 0.14563389122486115, + "learning_rate": 3.2514825753164774e-05, + "loss": 0.0046, + "step": 556 + }, + { + "epoch": 1.0732530120481927, + "grad_norm": 0.2642340064048767, + "learning_rate": 3.247981161137153e-05, + "loss": 0.022, + "step": 557 + }, + { + "epoch": 1.0751807228915662, + "grad_norm": 0.17274761199951172, + "learning_rate": 3.2444734717036386e-05, + "loss": 0.0134, + "step": 558 + }, + { + "epoch": 1.0771084337349397, + "grad_norm": 0.44354626536369324, + "learning_rate": 3.240959524653735e-05, + "loss": 0.0211, + "step": 559 + }, + { + "epoch": 1.0790361445783132, + "grad_norm": 0.2806888818740845, + "learning_rate": 3.237439337656708e-05, + "loss": 0.0141, + "step": 560 + }, + { + "epoch": 1.0809638554216867, + "grad_norm": 0.21679501235485077, + "learning_rate": 3.2339129284131994e-05, + "loss": 0.019, + "step": 561 + }, + { + "epoch": 1.0828915662650602, + "grad_norm": 0.3040260076522827, + "learning_rate": 3.2303803146551386e-05, + "loss": 0.0249, + "step": 562 + }, + { + "epoch": 1.0848192771084337, + "grad_norm": 0.2793775200843811, + "learning_rate": 3.226841514145656e-05, + "loss": 0.0088, + "step": 563 + }, + { + "epoch": 1.0867469879518072, + "grad_norm": 0.149955615401268, + "learning_rate": 3.223296544678987e-05, + "loss": 0.0054, + "step": 564 + }, + { + "epoch": 1.0886746987951808, + "grad_norm": 0.22166767716407776, + "learning_rate": 3.219745424080389e-05, + "loss": 0.0109, + "step": 565 + }, + { + "epoch": 1.0906024096385543, + "grad_norm": 0.22399431467056274, + "learning_rate": 3.2161881702060476e-05, + "loss": 0.0106, + "step": 566 + }, + { + "epoch": 1.0925301204819278, + "grad_norm": 0.18537986278533936, + "learning_rate": 3.2126248009429905e-05, + "loss": 0.0077, + "step": 567 + }, + { + "epoch": 1.0944578313253013, + "grad_norm": 0.24511495232582092, + "learning_rate": 3.2090553342089935e-05, + "loss": 0.0093, + "step": 568 + }, + { + "epoch": 1.0963855421686748, + "grad_norm": 0.4766045808792114, + "learning_rate": 3.205479787952494e-05, + "loss": 0.036, + "step": 569 + }, + { + "epoch": 1.0983132530120483, + "grad_norm": 0.1425715535879135, + "learning_rate": 3.201898180152499e-05, + "loss": 0.0085, + "step": 570 + }, + { + "epoch": 1.1002409638554216, + "grad_norm": 0.1909666359424591, + "learning_rate": 3.1983105288184945e-05, + "loss": 0.0081, + "step": 571 + }, + { + "epoch": 1.102168674698795, + "grad_norm": 0.44077104330062866, + "learning_rate": 3.194716851990355e-05, + "loss": 0.017, + "step": 572 + }, + { + "epoch": 1.1040963855421686, + "grad_norm": 0.5757400989532471, + "learning_rate": 3.191117167738253e-05, + "loss": 0.021, + "step": 573 + }, + { + "epoch": 1.106024096385542, + "grad_norm": 0.1977701038122177, + "learning_rate": 3.1875114941625705e-05, + "loss": 0.0096, + "step": 574 + }, + { + "epoch": 1.1079518072289156, + "grad_norm": 0.3524581491947174, + "learning_rate": 3.1838998493938026e-05, + "loss": 0.0118, + "step": 575 + }, + { + "epoch": 1.1098795180722891, + "grad_norm": 0.3301331698894501, + "learning_rate": 3.180282251592472e-05, + "loss": 0.0094, + "step": 576 + }, + { + "epoch": 1.1118072289156626, + "grad_norm": 0.2774488925933838, + "learning_rate": 3.1766587189490336e-05, + "loss": 0.0131, + "step": 577 + }, + { + "epoch": 1.1137349397590361, + "grad_norm": 1.732595443725586, + "learning_rate": 3.173029269683785e-05, + "loss": 0.0445, + "step": 578 + }, + { + "epoch": 1.1156626506024097, + "grad_norm": 0.28746843338012695, + "learning_rate": 3.169393922046776e-05, + "loss": 0.0116, + "step": 579 + }, + { + "epoch": 1.1175903614457832, + "grad_norm": 0.2952995002269745, + "learning_rate": 3.165752694317713e-05, + "loss": 0.0116, + "step": 580 + }, + { + "epoch": 1.1195180722891567, + "grad_norm": 0.2938575744628906, + "learning_rate": 3.16210560480587e-05, + "loss": 0.013, + "step": 581 + }, + { + "epoch": 1.1214457831325302, + "grad_norm": 0.22283495962619781, + "learning_rate": 3.158452671849998e-05, + "loss": 0.0052, + "step": 582 + }, + { + "epoch": 1.1233734939759037, + "grad_norm": 0.6272858381271362, + "learning_rate": 3.154793913818226e-05, + "loss": 0.0182, + "step": 583 + }, + { + "epoch": 1.1253012048192772, + "grad_norm": 0.479753702878952, + "learning_rate": 3.1511293491079804e-05, + "loss": 0.0146, + "step": 584 + }, + { + "epoch": 1.1272289156626507, + "grad_norm": 0.31104400753974915, + "learning_rate": 3.1474589961458786e-05, + "loss": 0.0139, + "step": 585 + }, + { + "epoch": 1.129156626506024, + "grad_norm": 0.4932832419872284, + "learning_rate": 3.1437828733876477e-05, + "loss": 0.0236, + "step": 586 + }, + { + "epoch": 1.1310843373493975, + "grad_norm": 0.222808837890625, + "learning_rate": 3.140100999318025e-05, + "loss": 0.0084, + "step": 587 + }, + { + "epoch": 1.133012048192771, + "grad_norm": 0.4515356719493866, + "learning_rate": 3.136413392450668e-05, + "loss": 0.0215, + "step": 588 + }, + { + "epoch": 1.1349397590361445, + "grad_norm": 0.39302268624305725, + "learning_rate": 3.132720071328061e-05, + "loss": 0.0154, + "step": 589 + }, + { + "epoch": 1.136867469879518, + "grad_norm": 0.43382835388183594, + "learning_rate": 3.1290210545214205e-05, + "loss": 0.0088, + "step": 590 + }, + { + "epoch": 1.1387951807228915, + "grad_norm": 0.18707136809825897, + "learning_rate": 3.125316360630602e-05, + "loss": 0.0126, + "step": 591 + }, + { + "epoch": 1.140722891566265, + "grad_norm": 0.5688219666481018, + "learning_rate": 3.121606008284011e-05, + "loss": 0.0147, + "step": 592 + }, + { + "epoch": 1.1426506024096386, + "grad_norm": 0.3321833312511444, + "learning_rate": 3.1178900161385005e-05, + "loss": 0.0119, + "step": 593 + }, + { + "epoch": 1.144578313253012, + "grad_norm": 0.3738424777984619, + "learning_rate": 3.114168402879286e-05, + "loss": 0.0158, + "step": 594 + }, + { + "epoch": 1.1465060240963856, + "grad_norm": 0.2386978417634964, + "learning_rate": 3.110441187219846e-05, + "loss": 0.0107, + "step": 595 + }, + { + "epoch": 1.148433734939759, + "grad_norm": 0.2165699452161789, + "learning_rate": 3.10670838790183e-05, + "loss": 0.0079, + "step": 596 + }, + { + "epoch": 1.1503614457831326, + "grad_norm": 0.25952696800231934, + "learning_rate": 3.102970023694965e-05, + "loss": 0.0147, + "step": 597 + }, + { + "epoch": 1.152289156626506, + "grad_norm": 0.21448305249214172, + "learning_rate": 3.099226113396959e-05, + "loss": 0.0099, + "step": 598 + }, + { + "epoch": 1.1542168674698796, + "grad_norm": 0.37226060032844543, + "learning_rate": 3.095476675833405e-05, + "loss": 0.0214, + "step": 599 + }, + { + "epoch": 1.1561445783132531, + "grad_norm": 0.29637983441352844, + "learning_rate": 3.0917217298576955e-05, + "loss": 0.0118, + "step": 600 + }, + { + "epoch": 1.1580722891566264, + "grad_norm": 0.18535609543323517, + "learning_rate": 3.0879612943509154e-05, + "loss": 0.0086, + "step": 601 + }, + { + "epoch": 1.16, + "grad_norm": 0.25874125957489014, + "learning_rate": 3.0841953882217536e-05, + "loss": 0.0088, + "step": 602 + }, + { + "epoch": 1.1619277108433734, + "grad_norm": 0.46092745661735535, + "learning_rate": 3.08042403040641e-05, + "loss": 0.0241, + "step": 603 + }, + { + "epoch": 1.163855421686747, + "grad_norm": 0.27023249864578247, + "learning_rate": 3.076647239868494e-05, + "loss": 0.0154, + "step": 604 + }, + { + "epoch": 1.1657831325301204, + "grad_norm": 0.445157527923584, + "learning_rate": 3.072865035598933e-05, + "loss": 0.0197, + "step": 605 + }, + { + "epoch": 1.167710843373494, + "grad_norm": 0.18097272515296936, + "learning_rate": 3.06907743661588e-05, + "loss": 0.0093, + "step": 606 + }, + { + "epoch": 1.1696385542168675, + "grad_norm": 0.22469942271709442, + "learning_rate": 3.065284461964609e-05, + "loss": 0.0171, + "step": 607 + }, + { + "epoch": 1.171566265060241, + "grad_norm": 0.20190906524658203, + "learning_rate": 3.061486130717428e-05, + "loss": 0.008, + "step": 608 + }, + { + "epoch": 1.1734939759036145, + "grad_norm": 0.18294145166873932, + "learning_rate": 3.057682461973579e-05, + "loss": 0.0155, + "step": 609 + }, + { + "epoch": 1.175421686746988, + "grad_norm": 0.34203943610191345, + "learning_rate": 3.053873474859143e-05, + "loss": 0.0212, + "step": 610 + }, + { + "epoch": 1.1773493975903615, + "grad_norm": 0.49073582887649536, + "learning_rate": 3.050059188526942e-05, + "loss": 0.019, + "step": 611 + }, + { + "epoch": 1.179277108433735, + "grad_norm": 0.3537680506706238, + "learning_rate": 3.046239622156446e-05, + "loss": 0.0147, + "step": 612 + }, + { + "epoch": 1.1812048192771085, + "grad_norm": 0.2584632635116577, + "learning_rate": 3.042414794953674e-05, + "loss": 0.0088, + "step": 613 + }, + { + "epoch": 1.1831325301204818, + "grad_norm": 0.3529360890388489, + "learning_rate": 3.0385847261510975e-05, + "loss": 0.0187, + "step": 614 + }, + { + "epoch": 1.1850602409638555, + "grad_norm": 0.3331570327281952, + "learning_rate": 3.0347494350075465e-05, + "loss": 0.0124, + "step": 615 + }, + { + "epoch": 1.1869879518072288, + "grad_norm": 0.2223527580499649, + "learning_rate": 3.0309089408081074e-05, + "loss": 0.01, + "step": 616 + }, + { + "epoch": 1.1889156626506023, + "grad_norm": 0.21985746920108795, + "learning_rate": 3.027063262864032e-05, + "loss": 0.0087, + "step": 617 + }, + { + "epoch": 1.1908433734939758, + "grad_norm": 0.2989653944969177, + "learning_rate": 3.023212420512637e-05, + "loss": 0.0137, + "step": 618 + }, + { + "epoch": 1.1927710843373494, + "grad_norm": 0.17423275113105774, + "learning_rate": 3.0193564331172074e-05, + "loss": 0.0056, + "step": 619 + }, + { + "epoch": 1.1946987951807229, + "grad_norm": 1.0992127656936646, + "learning_rate": 3.0154953200668976e-05, + "loss": 0.0274, + "step": 620 + }, + { + "epoch": 1.1966265060240964, + "grad_norm": 0.21641989052295685, + "learning_rate": 3.011629100776638e-05, + "loss": 0.0151, + "step": 621 + }, + { + "epoch": 1.1985542168674699, + "grad_norm": 0.4558199644088745, + "learning_rate": 3.007757794687033e-05, + "loss": 0.0424, + "step": 622 + }, + { + "epoch": 1.2004819277108434, + "grad_norm": 0.42380189895629883, + "learning_rate": 3.003881421264266e-05, + "loss": 0.0079, + "step": 623 + }, + { + "epoch": 1.202409638554217, + "grad_norm": 0.28791171312332153, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0142, + "step": 624 + }, + { + "epoch": 1.2043373493975904, + "grad_norm": 0.3906581997871399, + "learning_rate": 2.996113550411281e-05, + "loss": 0.0251, + "step": 625 + }, + { + "epoch": 1.206265060240964, + "grad_norm": 0.47848746180534363, + "learning_rate": 2.9922220920404375e-05, + "loss": 0.0137, + "step": 626 + }, + { + "epoch": 1.2081927710843374, + "grad_norm": 0.22666941583156586, + "learning_rate": 2.9883256444549862e-05, + "loss": 0.0105, + "step": 627 + }, + { + "epoch": 1.210120481927711, + "grad_norm": 0.18968136608600616, + "learning_rate": 2.984424227247529e-05, + "loss": 0.0089, + "step": 628 + }, + { + "epoch": 1.2120481927710842, + "grad_norm": 0.28732606768608093, + "learning_rate": 2.980517860035656e-05, + "loss": 0.0253, + "step": 629 + }, + { + "epoch": 1.213975903614458, + "grad_norm": 0.21131543815135956, + "learning_rate": 2.9766065624618518e-05, + "loss": 0.0134, + "step": 630 + }, + { + "epoch": 1.2159036144578312, + "grad_norm": 0.7594877481460571, + "learning_rate": 2.972690354193388e-05, + "loss": 0.0157, + "step": 631 + }, + { + "epoch": 1.2178313253012047, + "grad_norm": 0.730291485786438, + "learning_rate": 2.96876925492223e-05, + "loss": 0.0204, + "step": 632 + }, + { + "epoch": 1.2197590361445783, + "grad_norm": 0.20333674550056458, + "learning_rate": 2.9648432843649382e-05, + "loss": 0.0114, + "step": 633 + }, + { + "epoch": 1.2216867469879518, + "grad_norm": 0.5680793523788452, + "learning_rate": 2.960912462262566e-05, + "loss": 0.0146, + "step": 634 + }, + { + "epoch": 1.2236144578313253, + "grad_norm": 0.4591079354286194, + "learning_rate": 2.9569768083805618e-05, + "loss": 0.0112, + "step": 635 + }, + { + "epoch": 1.2255421686746988, + "grad_norm": 0.3793511390686035, + "learning_rate": 2.953036342508671e-05, + "loss": 0.0377, + "step": 636 + }, + { + "epoch": 1.2274698795180723, + "grad_norm": 1.118723750114441, + "learning_rate": 2.9490910844608346e-05, + "loss": 0.0432, + "step": 637 + }, + { + "epoch": 1.2293975903614458, + "grad_norm": 0.36990776658058167, + "learning_rate": 2.9451410540750887e-05, + "loss": 0.0203, + "step": 638 + }, + { + "epoch": 1.2313253012048193, + "grad_norm": 0.930397629737854, + "learning_rate": 2.94118627121347e-05, + "loss": 0.0311, + "step": 639 + }, + { + "epoch": 1.2332530120481928, + "grad_norm": 0.2347625195980072, + "learning_rate": 2.9372267557619075e-05, + "loss": 0.0168, + "step": 640 + }, + { + "epoch": 1.2351807228915663, + "grad_norm": 0.3720332384109497, + "learning_rate": 2.933262527630131e-05, + "loss": 0.0136, + "step": 641 + }, + { + "epoch": 1.2371084337349398, + "grad_norm": 0.4871984124183655, + "learning_rate": 2.929293606751565e-05, + "loss": 0.0339, + "step": 642 + }, + { + "epoch": 1.2390361445783133, + "grad_norm": 0.35853689908981323, + "learning_rate": 2.9253200130832322e-05, + "loss": 0.0095, + "step": 643 + }, + { + "epoch": 1.2409638554216866, + "grad_norm": 0.42003703117370605, + "learning_rate": 2.92134176660565e-05, + "loss": 0.0142, + "step": 644 + }, + { + "epoch": 1.2428915662650604, + "grad_norm": 0.3854500651359558, + "learning_rate": 2.9173588873227338e-05, + "loss": 0.0209, + "step": 645 + }, + { + "epoch": 1.2448192771084337, + "grad_norm": 0.24665917456150055, + "learning_rate": 2.913371395261691e-05, + "loss": 0.0087, + "step": 646 + }, + { + "epoch": 1.2467469879518072, + "grad_norm": 0.41571593284606934, + "learning_rate": 2.9093793104729268e-05, + "loss": 0.0164, + "step": 647 + }, + { + "epoch": 1.2486746987951807, + "grad_norm": 0.4597891569137573, + "learning_rate": 2.9053826530299377e-05, + "loss": 0.0138, + "step": 648 + }, + { + "epoch": 1.2506024096385542, + "grad_norm": 0.43345385789871216, + "learning_rate": 2.901381443029215e-05, + "loss": 0.0353, + "step": 649 + }, + { + "epoch": 1.2525301204819277, + "grad_norm": 0.3706768751144409, + "learning_rate": 2.897375700590141e-05, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 1.2544578313253012, + "grad_norm": 0.30305296182632446, + "learning_rate": 2.8933654458548873e-05, + "loss": 0.0123, + "step": 651 + }, + { + "epoch": 1.2563855421686747, + "grad_norm": 0.2042127549648285, + "learning_rate": 2.8893506989883167e-05, + "loss": 0.0099, + "step": 652 + }, + { + "epoch": 1.2583132530120482, + "grad_norm": 0.20524422824382782, + "learning_rate": 2.8853314801778784e-05, + "loss": 0.0097, + "step": 653 + }, + { + "epoch": 1.2602409638554217, + "grad_norm": 0.2351921945810318, + "learning_rate": 2.8813078096335093e-05, + "loss": 0.0091, + "step": 654 + }, + { + "epoch": 1.2621686746987952, + "grad_norm": 0.34547340869903564, + "learning_rate": 2.87727970758753e-05, + "loss": 0.0088, + "step": 655 + }, + { + "epoch": 1.2640963855421687, + "grad_norm": 0.35163217782974243, + "learning_rate": 2.8732471942945443e-05, + "loss": 0.0145, + "step": 656 + }, + { + "epoch": 1.266024096385542, + "grad_norm": 1.715137243270874, + "learning_rate": 2.8692102900313378e-05, + "loss": 0.0198, + "step": 657 + }, + { + "epoch": 1.2679518072289158, + "grad_norm": 0.2860178053379059, + "learning_rate": 2.8651690150967748e-05, + "loss": 0.0085, + "step": 658 + }, + { + "epoch": 1.269879518072289, + "grad_norm": 0.21175967156887054, + "learning_rate": 2.8611233898116967e-05, + "loss": 0.0071, + "step": 659 + }, + { + "epoch": 1.2718072289156628, + "grad_norm": 0.33726972341537476, + "learning_rate": 2.85707343451882e-05, + "loss": 0.012, + "step": 660 + }, + { + "epoch": 1.273734939759036, + "grad_norm": 0.2138456553220749, + "learning_rate": 2.853019169582635e-05, + "loss": 0.0092, + "step": 661 + }, + { + "epoch": 1.2756626506024096, + "grad_norm": 0.2304934412240982, + "learning_rate": 2.8489606153892997e-05, + "loss": 0.0144, + "step": 662 + }, + { + "epoch": 1.277590361445783, + "grad_norm": 0.2691061794757843, + "learning_rate": 2.8448977923465425e-05, + "loss": 0.0121, + "step": 663 + }, + { + "epoch": 1.2795180722891566, + "grad_norm": 0.35254305601119995, + "learning_rate": 2.840830720883555e-05, + "loss": 0.0125, + "step": 664 + }, + { + "epoch": 1.28144578313253, + "grad_norm": 0.36552608013153076, + "learning_rate": 2.836759421450893e-05, + "loss": 0.021, + "step": 665 + }, + { + "epoch": 1.2833734939759036, + "grad_norm": 0.37177154421806335, + "learning_rate": 2.83268391452037e-05, + "loss": 0.0216, + "step": 666 + }, + { + "epoch": 1.2853012048192771, + "grad_norm": 0.20932547748088837, + "learning_rate": 2.828604220584958e-05, + "loss": 0.0077, + "step": 667 + }, + { + "epoch": 1.2872289156626506, + "grad_norm": 0.5158557295799255, + "learning_rate": 2.824520360158681e-05, + "loss": 0.0394, + "step": 668 + }, + { + "epoch": 1.2891566265060241, + "grad_norm": 0.22623969614505768, + "learning_rate": 2.820432353776515e-05, + "loss": 0.0087, + "step": 669 + }, + { + "epoch": 1.2910843373493976, + "grad_norm": 0.2996046245098114, + "learning_rate": 2.8163402219942822e-05, + "loss": 0.01, + "step": 670 + }, + { + "epoch": 1.2930120481927712, + "grad_norm": 0.24957989156246185, + "learning_rate": 2.8122439853885488e-05, + "loss": 0.0127, + "step": 671 + }, + { + "epoch": 1.2949397590361444, + "grad_norm": 0.2636559307575226, + "learning_rate": 2.8081436645565216e-05, + "loss": 0.0128, + "step": 672 + }, + { + "epoch": 1.2968674698795182, + "grad_norm": 0.3531591296195984, + "learning_rate": 2.804039280115944e-05, + "loss": 0.0199, + "step": 673 + }, + { + "epoch": 1.2987951807228915, + "grad_norm": 0.3682299852371216, + "learning_rate": 2.7999308527049927e-05, + "loss": 0.0088, + "step": 674 + }, + { + "epoch": 1.3007228915662652, + "grad_norm": 0.19555217027664185, + "learning_rate": 2.795818402982174e-05, + "loss": 0.0084, + "step": 675 + }, + { + "epoch": 1.3026506024096385, + "grad_norm": 0.2864912450313568, + "learning_rate": 2.7917019516262186e-05, + "loss": 0.0154, + "step": 676 + }, + { + "epoch": 1.304578313253012, + "grad_norm": 0.2211237996816635, + "learning_rate": 2.78758151933598e-05, + "loss": 0.0078, + "step": 677 + }, + { + "epoch": 1.3065060240963855, + "grad_norm": 0.13646945357322693, + "learning_rate": 2.7834571268303294e-05, + "loss": 0.0058, + "step": 678 + }, + { + "epoch": 1.308433734939759, + "grad_norm": 0.16530285775661469, + "learning_rate": 2.779328794848049e-05, + "loss": 0.007, + "step": 679 + }, + { + "epoch": 1.3103614457831325, + "grad_norm": 0.2145693302154541, + "learning_rate": 2.7751965441477325e-05, + "loss": 0.0203, + "step": 680 + }, + { + "epoch": 1.312289156626506, + "grad_norm": 0.24273739755153656, + "learning_rate": 2.771060395507677e-05, + "loss": 0.0106, + "step": 681 + }, + { + "epoch": 1.3142168674698795, + "grad_norm": 0.20430618524551392, + "learning_rate": 2.7669203697257794e-05, + "loss": 0.0122, + "step": 682 + }, + { + "epoch": 1.316144578313253, + "grad_norm": 0.2502615749835968, + "learning_rate": 2.7627764876194335e-05, + "loss": 0.0101, + "step": 683 + }, + { + "epoch": 1.3180722891566266, + "grad_norm": 0.287239670753479, + "learning_rate": 2.7586287700254214e-05, + "loss": 0.0203, + "step": 684 + }, + { + "epoch": 1.32, + "grad_norm": 0.16239754855632782, + "learning_rate": 2.7544772377998147e-05, + "loss": 0.0084, + "step": 685 + }, + { + "epoch": 1.3219277108433736, + "grad_norm": 0.27174142003059387, + "learning_rate": 2.7503219118178636e-05, + "loss": 0.008, + "step": 686 + }, + { + "epoch": 1.3238554216867469, + "grad_norm": 0.12878240644931793, + "learning_rate": 2.7461628129738954e-05, + "loss": 0.0053, + "step": 687 + }, + { + "epoch": 1.3257831325301206, + "grad_norm": 0.16112515330314636, + "learning_rate": 2.7419999621812086e-05, + "loss": 0.0059, + "step": 688 + }, + { + "epoch": 1.3277108433734939, + "grad_norm": 0.2398834228515625, + "learning_rate": 2.7378333803719672e-05, + "loss": 0.0095, + "step": 689 + }, + { + "epoch": 1.3296385542168676, + "grad_norm": 0.18516193330287933, + "learning_rate": 2.733663088497097e-05, + "loss": 0.0071, + "step": 690 + }, + { + "epoch": 1.331566265060241, + "grad_norm": 0.2974924147129059, + "learning_rate": 2.7294891075261785e-05, + "loss": 0.0227, + "step": 691 + }, + { + "epoch": 1.3334939759036144, + "grad_norm": 0.12931054830551147, + "learning_rate": 2.7253114584473418e-05, + "loss": 0.0039, + "step": 692 + }, + { + "epoch": 1.335421686746988, + "grad_norm": 0.16319474577903748, + "learning_rate": 2.7211301622671623e-05, + "loss": 0.008, + "step": 693 + }, + { + "epoch": 1.3373493975903614, + "grad_norm": 0.27622169256210327, + "learning_rate": 2.7169452400105533e-05, + "loss": 0.0238, + "step": 694 + }, + { + "epoch": 1.339277108433735, + "grad_norm": 0.45309779047966003, + "learning_rate": 2.712756712720663e-05, + "loss": 0.0439, + "step": 695 + }, + { + "epoch": 1.3412048192771084, + "grad_norm": 0.2469855099916458, + "learning_rate": 2.708564601458765e-05, + "loss": 0.0085, + "step": 696 + }, + { + "epoch": 1.343132530120482, + "grad_norm": 0.4245856702327728, + "learning_rate": 2.7043689273041535e-05, + "loss": 0.0097, + "step": 697 + }, + { + "epoch": 1.3450602409638555, + "grad_norm": 0.26796087622642517, + "learning_rate": 2.7001697113540414e-05, + "loss": 0.0119, + "step": 698 + }, + { + "epoch": 1.346987951807229, + "grad_norm": 0.3569283187389374, + "learning_rate": 2.6959669747234482e-05, + "loss": 0.0096, + "step": 699 + }, + { + "epoch": 1.3489156626506025, + "grad_norm": 0.7038524150848389, + "learning_rate": 2.6917607385450973e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 1.350843373493976, + "grad_norm": 0.23568563163280487, + "learning_rate": 2.687551023969308e-05, + "loss": 0.0112, + "step": 701 + }, + { + "epoch": 1.3527710843373493, + "grad_norm": 0.20338499546051025, + "learning_rate": 2.6833378521638935e-05, + "loss": 0.0092, + "step": 702 + }, + { + "epoch": 1.354698795180723, + "grad_norm": 4.22187614440918, + "learning_rate": 2.679121244314046e-05, + "loss": 0.0314, + "step": 703 + }, + { + "epoch": 1.3566265060240963, + "grad_norm": 0.2542206048965454, + "learning_rate": 2.674901221622239e-05, + "loss": 0.0158, + "step": 704 + }, + { + "epoch": 1.3585542168674698, + "grad_norm": 0.49705010652542114, + "learning_rate": 2.670677805308116e-05, + "loss": 0.0162, + "step": 705 + }, + { + "epoch": 1.3604819277108433, + "grad_norm": 0.17502115666866302, + "learning_rate": 2.666451016608383e-05, + "loss": 0.0074, + "step": 706 + }, + { + "epoch": 1.3624096385542168, + "grad_norm": 0.21738742291927338, + "learning_rate": 2.6622208767767075e-05, + "loss": 0.0135, + "step": 707 + }, + { + "epoch": 1.3643373493975903, + "grad_norm": 0.3309847414493561, + "learning_rate": 2.6579874070836032e-05, + "loss": 0.0107, + "step": 708 + }, + { + "epoch": 1.3662650602409638, + "grad_norm": 0.10706827789545059, + "learning_rate": 2.6537506288163303e-05, + "loss": 0.0043, + "step": 709 + }, + { + "epoch": 1.3681927710843373, + "grad_norm": 0.173640176653862, + "learning_rate": 2.6495105632787835e-05, + "loss": 0.0092, + "step": 710 + }, + { + "epoch": 1.3701204819277109, + "grad_norm": 0.2636397182941437, + "learning_rate": 2.6452672317913893e-05, + "loss": 0.0097, + "step": 711 + }, + { + "epoch": 1.3720481927710844, + "grad_norm": 0.28485360741615295, + "learning_rate": 2.6410206556909943e-05, + "loss": 0.0193, + "step": 712 + }, + { + "epoch": 1.3739759036144579, + "grad_norm": 0.23210027813911438, + "learning_rate": 2.636770856330761e-05, + "loss": 0.0229, + "step": 713 + }, + { + "epoch": 1.3759036144578314, + "grad_norm": 0.13388316333293915, + "learning_rate": 2.6325178550800596e-05, + "loss": 0.004, + "step": 714 + }, + { + "epoch": 1.377831325301205, + "grad_norm": 0.5131422877311707, + "learning_rate": 2.6282616733243603e-05, + "loss": 0.0137, + "step": 715 + }, + { + "epoch": 1.3797590361445784, + "grad_norm": 0.3243267834186554, + "learning_rate": 2.6240023324651258e-05, + "loss": 0.0153, + "step": 716 + }, + { + "epoch": 1.3816867469879517, + "grad_norm": 0.1440611034631729, + "learning_rate": 2.619739853919704e-05, + "loss": 0.0031, + "step": 717 + }, + { + "epoch": 1.3836144578313254, + "grad_norm": 0.30346596240997314, + "learning_rate": 2.6154742591212196e-05, + "loss": 0.0109, + "step": 718 + }, + { + "epoch": 1.3855421686746987, + "grad_norm": 0.19109240174293518, + "learning_rate": 2.611205569518468e-05, + "loss": 0.0094, + "step": 719 + }, + { + "epoch": 1.3874698795180722, + "grad_norm": 0.28636518120765686, + "learning_rate": 2.6069338065758056e-05, + "loss": 0.0123, + "step": 720 + }, + { + "epoch": 1.3893975903614457, + "grad_norm": 0.28083911538124084, + "learning_rate": 2.6026589917730416e-05, + "loss": 0.0104, + "step": 721 + }, + { + "epoch": 1.3913253012048192, + "grad_norm": 0.36553966999053955, + "learning_rate": 2.5983811466053327e-05, + "loss": 0.0143, + "step": 722 + }, + { + "epoch": 1.3932530120481927, + "grad_norm": 0.23317205905914307, + "learning_rate": 2.5941002925830708e-05, + "loss": 0.011, + "step": 723 + }, + { + "epoch": 1.3951807228915662, + "grad_norm": 0.3825171887874603, + "learning_rate": 2.589816451231781e-05, + "loss": 0.0098, + "step": 724 + }, + { + "epoch": 1.3971084337349398, + "grad_norm": 0.19916608929634094, + "learning_rate": 2.585529644092006e-05, + "loss": 0.0094, + "step": 725 + }, + { + "epoch": 1.3990361445783133, + "grad_norm": 0.19990523159503937, + "learning_rate": 2.5812398927192027e-05, + "loss": 0.0128, + "step": 726 + }, + { + "epoch": 1.4009638554216868, + "grad_norm": 0.34662899374961853, + "learning_rate": 2.5769472186836347e-05, + "loss": 0.0091, + "step": 727 + }, + { + "epoch": 1.4028915662650603, + "grad_norm": 0.23481112718582153, + "learning_rate": 2.5726516435702583e-05, + "loss": 0.0154, + "step": 728 + }, + { + "epoch": 1.4048192771084338, + "grad_norm": 0.1846667379140854, + "learning_rate": 2.5683531889786194e-05, + "loss": 0.0088, + "step": 729 + }, + { + "epoch": 1.4067469879518073, + "grad_norm": 0.16717663407325745, + "learning_rate": 2.564051876522742e-05, + "loss": 0.0083, + "step": 730 + }, + { + "epoch": 1.4086746987951808, + "grad_norm": 0.4116475284099579, + "learning_rate": 2.5597477278310202e-05, + "loss": 0.0179, + "step": 731 + }, + { + "epoch": 1.410602409638554, + "grad_norm": 0.171807661652565, + "learning_rate": 2.5554407645461115e-05, + "loss": 0.0063, + "step": 732 + }, + { + "epoch": 1.4125301204819278, + "grad_norm": 0.1954439878463745, + "learning_rate": 2.5511310083248243e-05, + "loss": 0.017, + "step": 733 + }, + { + "epoch": 1.4144578313253011, + "grad_norm": 0.37158989906311035, + "learning_rate": 2.5468184808380104e-05, + "loss": 0.0173, + "step": 734 + }, + { + "epoch": 1.4163855421686746, + "grad_norm": 0.2001633644104004, + "learning_rate": 2.542503203770458e-05, + "loss": 0.0165, + "step": 735 + }, + { + "epoch": 1.4183132530120481, + "grad_norm": 0.45673373341560364, + "learning_rate": 2.53818519882078e-05, + "loss": 0.0185, + "step": 736 + }, + { + "epoch": 1.4202409638554216, + "grad_norm": 0.3838701546192169, + "learning_rate": 2.5338644877013067e-05, + "loss": 0.0134, + "step": 737 + }, + { + "epoch": 1.4221686746987952, + "grad_norm": 0.32032477855682373, + "learning_rate": 2.5295410921379745e-05, + "loss": 0.0143, + "step": 738 + }, + { + "epoch": 1.4240963855421687, + "grad_norm": 0.4594039022922516, + "learning_rate": 2.52521503387022e-05, + "loss": 0.0193, + "step": 739 + }, + { + "epoch": 1.4260240963855422, + "grad_norm": 0.3889620900154114, + "learning_rate": 2.5208863346508667e-05, + "loss": 0.0114, + "step": 740 + }, + { + "epoch": 1.4279518072289157, + "grad_norm": 0.33153319358825684, + "learning_rate": 2.5165550162460203e-05, + "loss": 0.0102, + "step": 741 + }, + { + "epoch": 1.4298795180722892, + "grad_norm": 0.7269518375396729, + "learning_rate": 2.5122211004349536e-05, + "loss": 0.0215, + "step": 742 + }, + { + "epoch": 1.4318072289156627, + "grad_norm": 0.31653261184692383, + "learning_rate": 2.5078846090100023e-05, + "loss": 0.0115, + "step": 743 + }, + { + "epoch": 1.4337349397590362, + "grad_norm": 0.20620353519916534, + "learning_rate": 2.5035455637764518e-05, + "loss": 0.0153, + "step": 744 + }, + { + "epoch": 1.4356626506024097, + "grad_norm": 0.17266008257865906, + "learning_rate": 2.4992039865524297e-05, + "loss": 0.0069, + "step": 745 + }, + { + "epoch": 1.4375903614457832, + "grad_norm": 0.24760811030864716, + "learning_rate": 2.494859899168795e-05, + "loss": 0.0108, + "step": 746 + }, + { + "epoch": 1.4395180722891565, + "grad_norm": 0.2584865391254425, + "learning_rate": 2.4905133234690282e-05, + "loss": 0.0095, + "step": 747 + }, + { + "epoch": 1.4414457831325302, + "grad_norm": 0.48847514390945435, + "learning_rate": 2.486164281309122e-05, + "loss": 0.0181, + "step": 748 + }, + { + "epoch": 1.4433734939759035, + "grad_norm": 0.42942047119140625, + "learning_rate": 2.4818127945574717e-05, + "loss": 0.025, + "step": 749 + }, + { + "epoch": 1.445301204819277, + "grad_norm": 0.23713800311088562, + "learning_rate": 2.4774588850947648e-05, + "loss": 0.0085, + "step": 750 + }, + { + "epoch": 1.4472289156626506, + "grad_norm": 0.8797569870948792, + "learning_rate": 2.473102574813871e-05, + "loss": 0.0097, + "step": 751 + }, + { + "epoch": 1.449156626506024, + "grad_norm": 0.2744862735271454, + "learning_rate": 2.4687438856197302e-05, + "loss": 0.0122, + "step": 752 + }, + { + "epoch": 1.4510843373493976, + "grad_norm": 0.12747010588645935, + "learning_rate": 2.4643828394292478e-05, + "loss": 0.0056, + "step": 753 + }, + { + "epoch": 1.453012048192771, + "grad_norm": 0.37376829981803894, + "learning_rate": 2.4600194581711775e-05, + "loss": 0.0052, + "step": 754 + }, + { + "epoch": 1.4549397590361446, + "grad_norm": 0.2536911368370056, + "learning_rate": 2.4556537637860176e-05, + "loss": 0.0113, + "step": 755 + }, + { + "epoch": 1.456867469879518, + "grad_norm": 0.25950780510902405, + "learning_rate": 2.451285778225894e-05, + "loss": 0.0099, + "step": 756 + }, + { + "epoch": 1.4587951807228916, + "grad_norm": 0.19535955786705017, + "learning_rate": 2.4469155234544565e-05, + "loss": 0.0069, + "step": 757 + }, + { + "epoch": 1.4607228915662651, + "grad_norm": 0.22816115617752075, + "learning_rate": 2.442543021446764e-05, + "loss": 0.0088, + "step": 758 + }, + { + "epoch": 1.4626506024096386, + "grad_norm": 0.3363986313343048, + "learning_rate": 2.4381682941891755e-05, + "loss": 0.0182, + "step": 759 + }, + { + "epoch": 1.464578313253012, + "grad_norm": 0.21492891013622284, + "learning_rate": 2.4337913636792382e-05, + "loss": 0.0069, + "step": 760 + }, + { + "epoch": 1.4665060240963856, + "grad_norm": 0.6070862412452698, + "learning_rate": 2.429412251925579e-05, + "loss": 0.0406, + "step": 761 + }, + { + "epoch": 1.468433734939759, + "grad_norm": 2.6469690799713135, + "learning_rate": 2.425030980947793e-05, + "loss": 0.0205, + "step": 762 + }, + { + "epoch": 1.4703614457831327, + "grad_norm": 0.30909740924835205, + "learning_rate": 2.420647572776332e-05, + "loss": 0.0136, + "step": 763 + }, + { + "epoch": 1.472289156626506, + "grad_norm": 0.6639553904533386, + "learning_rate": 2.416262049452395e-05, + "loss": 0.011, + "step": 764 + }, + { + "epoch": 1.4742168674698795, + "grad_norm": 0.2919616997241974, + "learning_rate": 2.4118744330278147e-05, + "loss": 0.0131, + "step": 765 + }, + { + "epoch": 1.476144578313253, + "grad_norm": 0.5232429504394531, + "learning_rate": 2.4074847455649523e-05, + "loss": 0.0138, + "step": 766 + }, + { + "epoch": 1.4780722891566265, + "grad_norm": 5.630630970001221, + "learning_rate": 2.403093009136579e-05, + "loss": 0.0264, + "step": 767 + }, + { + "epoch": 1.48, + "grad_norm": 0.33234721422195435, + "learning_rate": 2.3986992458257707e-05, + "loss": 0.0111, + "step": 768 + }, + { + "epoch": 1.4819277108433735, + "grad_norm": 0.28444772958755493, + "learning_rate": 2.3943034777257945e-05, + "loss": 0.0144, + "step": 769 + }, + { + "epoch": 1.483855421686747, + "grad_norm": 0.16229979693889618, + "learning_rate": 2.38990572694e-05, + "loss": 0.0062, + "step": 770 + }, + { + "epoch": 1.4857831325301205, + "grad_norm": 0.27474716305732727, + "learning_rate": 2.385506015581704e-05, + "loss": 0.0172, + "step": 771 + }, + { + "epoch": 1.487710843373494, + "grad_norm": 0.246526300907135, + "learning_rate": 2.381104365774083e-05, + "loss": 0.012, + "step": 772 + }, + { + "epoch": 1.4896385542168675, + "grad_norm": 0.282047837972641, + "learning_rate": 2.37670079965006e-05, + "loss": 0.0116, + "step": 773 + }, + { + "epoch": 1.491566265060241, + "grad_norm": 0.2878139317035675, + "learning_rate": 2.3722953393521944e-05, + "loss": 0.0147, + "step": 774 + }, + { + "epoch": 1.4934939759036143, + "grad_norm": 0.5586277842521667, + "learning_rate": 2.367888007032571e-05, + "loss": 0.0111, + "step": 775 + }, + { + "epoch": 1.495421686746988, + "grad_norm": 0.562160313129425, + "learning_rate": 2.3634788248526846e-05, + "loss": 0.0061, + "step": 776 + }, + { + "epoch": 1.4973493975903613, + "grad_norm": 0.3452005982398987, + "learning_rate": 2.3590678149833356e-05, + "loss": 0.0205, + "step": 777 + }, + { + "epoch": 1.499277108433735, + "grad_norm": 0.7757686376571655, + "learning_rate": 2.3546549996045114e-05, + "loss": 0.0273, + "step": 778 + }, + { + "epoch": 1.5012048192771084, + "grad_norm": 0.19530551135540009, + "learning_rate": 2.3502404009052812e-05, + "loss": 0.0083, + "step": 779 + }, + { + "epoch": 1.503132530120482, + "grad_norm": 0.2586531639099121, + "learning_rate": 2.3458240410836775e-05, + "loss": 0.0122, + "step": 780 + }, + { + "epoch": 1.5050602409638554, + "grad_norm": 0.30063286423683167, + "learning_rate": 2.3414059423465924e-05, + "loss": 0.0083, + "step": 781 + }, + { + "epoch": 1.5069879518072289, + "grad_norm": 0.18663185834884644, + "learning_rate": 2.3369861269096575e-05, + "loss": 0.0104, + "step": 782 + }, + { + "epoch": 1.5089156626506024, + "grad_norm": 0.4405941069126129, + "learning_rate": 2.3325646169971416e-05, + "loss": 0.0264, + "step": 783 + }, + { + "epoch": 1.510843373493976, + "grad_norm": 0.2947913110256195, + "learning_rate": 2.3281414348418294e-05, + "loss": 0.0107, + "step": 784 + }, + { + "epoch": 1.5127710843373494, + "grad_norm": 0.23813778162002563, + "learning_rate": 2.3237166026849158e-05, + "loss": 0.0084, + "step": 785 + }, + { + "epoch": 1.514698795180723, + "grad_norm": 0.33380329608917236, + "learning_rate": 2.3192901427758932e-05, + "loss": 0.0111, + "step": 786 + }, + { + "epoch": 1.5166265060240964, + "grad_norm": 0.3736988306045532, + "learning_rate": 2.314862077372438e-05, + "loss": 0.0135, + "step": 787 + }, + { + "epoch": 1.5185542168674697, + "grad_norm": 0.3785395920276642, + "learning_rate": 2.3104324287402996e-05, + "loss": 0.0265, + "step": 788 + }, + { + "epoch": 1.5204819277108435, + "grad_norm": 0.3359154462814331, + "learning_rate": 2.3060012191531885e-05, + "loss": 0.0127, + "step": 789 + }, + { + "epoch": 1.5224096385542167, + "grad_norm": 0.720753014087677, + "learning_rate": 2.301568470892664e-05, + "loss": 0.0134, + "step": 790 + }, + { + "epoch": 1.5243373493975905, + "grad_norm": 0.36473193764686584, + "learning_rate": 2.297134206248024e-05, + "loss": 0.0318, + "step": 791 + }, + { + "epoch": 1.5262650602409638, + "grad_norm": 0.29987087845802307, + "learning_rate": 2.2926984475161884e-05, + "loss": 0.008, + "step": 792 + }, + { + "epoch": 1.5281927710843375, + "grad_norm": 0.2883112132549286, + "learning_rate": 2.2882612170015914e-05, + "loss": 0.0125, + "step": 793 + }, + { + "epoch": 1.5301204819277108, + "grad_norm": 0.28983229398727417, + "learning_rate": 2.2838225370160682e-05, + "loss": 0.0155, + "step": 794 + }, + { + "epoch": 1.5320481927710843, + "grad_norm": 0.47236886620521545, + "learning_rate": 2.2793824298787414e-05, + "loss": 0.0132, + "step": 795 + }, + { + "epoch": 1.5339759036144578, + "grad_norm": 0.8328865170478821, + "learning_rate": 2.2749409179159104e-05, + "loss": 0.026, + "step": 796 + }, + { + "epoch": 1.5359036144578313, + "grad_norm": 0.3129172623157501, + "learning_rate": 2.2704980234609396e-05, + "loss": 0.0099, + "step": 797 + }, + { + "epoch": 1.5378313253012048, + "grad_norm": 0.22284500300884247, + "learning_rate": 2.2660537688541416e-05, + "loss": 0.009, + "step": 798 + }, + { + "epoch": 1.5397590361445783, + "grad_norm": 0.3346405625343323, + "learning_rate": 2.2616081764426726e-05, + "loss": 0.0077, + "step": 799 + }, + { + "epoch": 1.5416867469879518, + "grad_norm": 0.2923565208911896, + "learning_rate": 2.2571612685804124e-05, + "loss": 0.0119, + "step": 800 + }, + { + "epoch": 1.5436144578313253, + "grad_norm": 0.1921311914920807, + "learning_rate": 2.252713067627857e-05, + "loss": 0.0083, + "step": 801 + }, + { + "epoch": 1.5455421686746988, + "grad_norm": 0.23221106827259064, + "learning_rate": 2.2482635959520044e-05, + "loss": 0.0049, + "step": 802 + }, + { + "epoch": 1.5474698795180721, + "grad_norm": 0.6340724229812622, + "learning_rate": 2.243812875926241e-05, + "loss": 0.0273, + "step": 803 + }, + { + "epoch": 1.5493975903614459, + "grad_norm": 0.2699439823627472, + "learning_rate": 2.2393609299302314e-05, + "loss": 0.0108, + "step": 804 + }, + { + "epoch": 1.5513253012048192, + "grad_norm": 0.2005189210176468, + "learning_rate": 2.2349077803498052e-05, + "loss": 0.0076, + "step": 805 + }, + { + "epoch": 1.5532530120481929, + "grad_norm": 0.39668548107147217, + "learning_rate": 2.230453449576842e-05, + "loss": 0.0135, + "step": 806 + }, + { + "epoch": 1.5551807228915662, + "grad_norm": 0.2406950294971466, + "learning_rate": 2.2259979600091635e-05, + "loss": 0.0094, + "step": 807 + }, + { + "epoch": 1.55710843373494, + "grad_norm": 0.30363157391548157, + "learning_rate": 2.2215413340504158e-05, + "loss": 0.0178, + "step": 808 + }, + { + "epoch": 1.5590361445783132, + "grad_norm": 0.19508181512355804, + "learning_rate": 2.2170835941099605e-05, + "loss": 0.0069, + "step": 809 + }, + { + "epoch": 1.5609638554216867, + "grad_norm": 0.734106719493866, + "learning_rate": 2.2126247626027615e-05, + "loss": 0.0319, + "step": 810 + }, + { + "epoch": 1.5628915662650602, + "grad_norm": 0.2591583728790283, + "learning_rate": 2.208164861949268e-05, + "loss": 0.0168, + "step": 811 + }, + { + "epoch": 1.5648192771084337, + "grad_norm": 0.2386734038591385, + "learning_rate": 2.20370391457531e-05, + "loss": 0.0041, + "step": 812 + }, + { + "epoch": 1.5667469879518072, + "grad_norm": 0.1675218939781189, + "learning_rate": 2.1992419429119764e-05, + "loss": 0.0078, + "step": 813 + }, + { + "epoch": 1.5686746987951807, + "grad_norm": 0.45591506361961365, + "learning_rate": 2.1947789693955097e-05, + "loss": 0.0166, + "step": 814 + }, + { + "epoch": 1.5706024096385542, + "grad_norm": 0.46940621733665466, + "learning_rate": 2.190315016467188e-05, + "loss": 0.0176, + "step": 815 + }, + { + "epoch": 1.5725301204819278, + "grad_norm": 0.2294205278158188, + "learning_rate": 2.1858501065732146e-05, + "loss": 0.0102, + "step": 816 + }, + { + "epoch": 1.5744578313253013, + "grad_norm": 0.28922322392463684, + "learning_rate": 2.181384262164606e-05, + "loss": 0.0111, + "step": 817 + }, + { + "epoch": 1.5763855421686745, + "grad_norm": 0.19650064408779144, + "learning_rate": 2.1769175056970765e-05, + "loss": 0.0076, + "step": 818 + }, + { + "epoch": 1.5783132530120483, + "grad_norm": 0.19538825750350952, + "learning_rate": 2.172449859630927e-05, + "loss": 0.0118, + "step": 819 + }, + { + "epoch": 1.5802409638554216, + "grad_norm": 0.1900389939546585, + "learning_rate": 2.167981346430931e-05, + "loss": 0.0066, + "step": 820 + }, + { + "epoch": 1.5821686746987953, + "grad_norm": 0.21593710780143738, + "learning_rate": 2.1635119885662235e-05, + "loss": 0.0101, + "step": 821 + }, + { + "epoch": 1.5840963855421686, + "grad_norm": 0.2699289321899414, + "learning_rate": 2.159041808510185e-05, + "loss": 0.0118, + "step": 822 + }, + { + "epoch": 1.5860240963855423, + "grad_norm": 0.31867673993110657, + "learning_rate": 2.1545708287403322e-05, + "loss": 0.0122, + "step": 823 + }, + { + "epoch": 1.5879518072289156, + "grad_norm": 0.2862400412559509, + "learning_rate": 2.1500990717382004e-05, + "loss": 0.0216, + "step": 824 + }, + { + "epoch": 1.589879518072289, + "grad_norm": 0.28482481837272644, + "learning_rate": 2.145626559989237e-05, + "loss": 0.0136, + "step": 825 + }, + { + "epoch": 1.5918072289156626, + "grad_norm": 0.2866958975791931, + "learning_rate": 2.1411533159826803e-05, + "loss": 0.0298, + "step": 826 + }, + { + "epoch": 1.5937349397590361, + "grad_norm": 0.39092838764190674, + "learning_rate": 2.1366793622114533e-05, + "loss": 0.0382, + "step": 827 + }, + { + "epoch": 1.5956626506024096, + "grad_norm": 0.16381537914276123, + "learning_rate": 2.1322047211720468e-05, + "loss": 0.0074, + "step": 828 + } + ], + "logging_steps": 1, + "max_steps": 1557, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 92, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.08743669088623e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-828/training_args.bin b/checkpoint-828/training_args.bin new file mode 100644 index 0000000..ecc7b6b --- /dev/null +++ b/checkpoint-828/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342dfb3c86216e436950100c79812c54066d5572c4e9975b0f133c067f061bcf +size 7825 diff --git a/checkpoint-920/chat_template.jinja b/checkpoint-920/chat_template.jinja new file mode 100644 index 0000000..1bad6a0 --- /dev/null +++ b/checkpoint-920/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-920/config.json b/checkpoint-920/config.json new file mode 100644 index 0000000..f8bf41e --- /dev/null +++ b/checkpoint-920/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128009, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.56.2", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-920/generation_config.json b/checkpoint-920/generation_config.json new file mode 100644 index 0000000..2152026 --- /dev/null +++ b/checkpoint-920/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128009, + 128001, + 128008, + 128009 + ], + "pad_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.56.2" +} diff --git a/checkpoint-920/model.safetensors b/checkpoint-920/model.safetensors new file mode 100644 index 0000000..c304324 --- /dev/null +++ b/checkpoint-920/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c70b3feb9efc4df852b9a94b1d9d7e550d4b9be4bdd7f7daae5a2080caa5f7dc +size 2996982344 diff --git a/checkpoint-920/special_tokens_map.json b/checkpoint-920/special_tokens_map.json new file mode 100644 index 0000000..14daf45 --- /dev/null +++ b/checkpoint-920/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/checkpoint-920/tokenizer.json b/checkpoint-920/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/checkpoint-920/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-920/tokenizer_config.json b/checkpoint-920/tokenizer_config.json new file mode 100644 index 0000000..d1e1ea9 --- /dev/null +++ b/checkpoint-920/tokenizer_config.json @@ -0,0 +1,2068 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-920/trainer_state.json b/checkpoint-920/trainer_state.json new file mode 100644 index 0000000..062f4a1 --- /dev/null +++ b/checkpoint-920/trainer_state.json @@ -0,0 +1,6474 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.7730120481927711, + "eval_steps": 500, + "global_step": 920, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0019277108433734939, + "grad_norm": 2.8518834114074707, + "learning_rate": 0.0, + "loss": 0.0891, + "step": 1 + }, + { + "epoch": 0.0038554216867469878, + "grad_norm": 1.8441249132156372, + "learning_rate": 2.564102564102564e-07, + "loss": 0.0539, + "step": 2 + }, + { + "epoch": 0.005783132530120482, + "grad_norm": 2.8263237476348877, + "learning_rate": 5.128205128205128e-07, + "loss": 0.099, + "step": 3 + }, + { + "epoch": 0.0077108433734939755, + "grad_norm": 2.5051236152648926, + "learning_rate": 7.692307692307694e-07, + "loss": 0.0789, + "step": 4 + }, + { + "epoch": 0.00963855421686747, + "grad_norm": 2.6903438568115234, + "learning_rate": 1.0256410256410257e-06, + "loss": 0.0881, + "step": 5 + }, + { + "epoch": 0.011566265060240964, + "grad_norm": 2.6205761432647705, + "learning_rate": 1.282051282051282e-06, + "loss": 0.0776, + "step": 6 + }, + { + "epoch": 0.013493975903614458, + "grad_norm": 2.6309337615966797, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.0827, + "step": 7 + }, + { + "epoch": 0.015421686746987951, + "grad_norm": 1.5427855253219604, + "learning_rate": 1.794871794871795e-06, + "loss": 0.0577, + "step": 8 + }, + { + "epoch": 0.017349397590361446, + "grad_norm": 1.0973446369171143, + "learning_rate": 2.0512820512820513e-06, + "loss": 0.04, + "step": 9 + }, + { + "epoch": 0.01927710843373494, + "grad_norm": 1.3253350257873535, + "learning_rate": 2.307692307692308e-06, + "loss": 0.0506, + "step": 10 + }, + { + "epoch": 0.021204819277108433, + "grad_norm": 1.588739037513733, + "learning_rate": 2.564102564102564e-06, + "loss": 0.0874, + "step": 11 + }, + { + "epoch": 0.02313253012048193, + "grad_norm": 1.4987014532089233, + "learning_rate": 2.8205128205128207e-06, + "loss": 0.0597, + "step": 12 + }, + { + "epoch": 0.02506024096385542, + "grad_norm": 1.6571592092514038, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.0559, + "step": 13 + }, + { + "epoch": 0.026987951807228915, + "grad_norm": 1.8860628604888916, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0688, + "step": 14 + }, + { + "epoch": 0.02891566265060241, + "grad_norm": 1.3202295303344727, + "learning_rate": 3.58974358974359e-06, + "loss": 0.0433, + "step": 15 + }, + { + "epoch": 0.030843373493975902, + "grad_norm": 1.5870612859725952, + "learning_rate": 3.846153846153847e-06, + "loss": 0.0695, + "step": 16 + }, + { + "epoch": 0.0327710843373494, + "grad_norm": 0.9192284345626831, + "learning_rate": 4.102564102564103e-06, + "loss": 0.0392, + "step": 17 + }, + { + "epoch": 0.03469879518072289, + "grad_norm": 0.7950155735015869, + "learning_rate": 4.358974358974359e-06, + "loss": 0.0351, + "step": 18 + }, + { + "epoch": 0.03662650602409639, + "grad_norm": 0.8854314684867859, + "learning_rate": 4.615384615384616e-06, + "loss": 0.0356, + "step": 19 + }, + { + "epoch": 0.03855421686746988, + "grad_norm": 0.9546788930892944, + "learning_rate": 4.871794871794872e-06, + "loss": 0.0427, + "step": 20 + }, + { + "epoch": 0.04048192771084337, + "grad_norm": 0.6315903663635254, + "learning_rate": 5.128205128205128e-06, + "loss": 0.0397, + "step": 21 + }, + { + "epoch": 0.042409638554216866, + "grad_norm": 0.9230924844741821, + "learning_rate": 5.384615384615385e-06, + "loss": 0.0481, + "step": 22 + }, + { + "epoch": 0.04433734939759036, + "grad_norm": 0.711546003818512, + "learning_rate": 5.641025641025641e-06, + "loss": 0.0479, + "step": 23 + }, + { + "epoch": 0.04626506024096386, + "grad_norm": 0.5288046598434448, + "learning_rate": 5.897435897435898e-06, + "loss": 0.0182, + "step": 24 + }, + { + "epoch": 0.04819277108433735, + "grad_norm": 0.9420496225357056, + "learning_rate": 6.153846153846155e-06, + "loss": 0.0389, + "step": 25 + }, + { + "epoch": 0.05012048192771084, + "grad_norm": 0.5001983046531677, + "learning_rate": 6.410256410256412e-06, + "loss": 0.0268, + "step": 26 + }, + { + "epoch": 0.052048192771084335, + "grad_norm": 0.8084653615951538, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0367, + "step": 27 + }, + { + "epoch": 0.05397590361445783, + "grad_norm": 0.7195103764533997, + "learning_rate": 6.923076923076923e-06, + "loss": 0.0251, + "step": 28 + }, + { + "epoch": 0.055903614457831326, + "grad_norm": 0.529958963394165, + "learning_rate": 7.17948717948718e-06, + "loss": 0.0289, + "step": 29 + }, + { + "epoch": 0.05783132530120482, + "grad_norm": 0.795376181602478, + "learning_rate": 7.435897435897437e-06, + "loss": 0.043, + "step": 30 + }, + { + "epoch": 0.059759036144578316, + "grad_norm": 0.6366249918937683, + "learning_rate": 7.692307692307694e-06, + "loss": 0.029, + "step": 31 + }, + { + "epoch": 0.061686746987951804, + "grad_norm": 0.5414115190505981, + "learning_rate": 7.948717948717949e-06, + "loss": 0.0365, + "step": 32 + }, + { + "epoch": 0.0636144578313253, + "grad_norm": 0.9350972175598145, + "learning_rate": 8.205128205128205e-06, + "loss": 0.0283, + "step": 33 + }, + { + "epoch": 0.0655421686746988, + "grad_norm": 0.5660741925239563, + "learning_rate": 8.461538461538462e-06, + "loss": 0.0234, + "step": 34 + }, + { + "epoch": 0.06746987951807229, + "grad_norm": 0.5623988509178162, + "learning_rate": 8.717948717948719e-06, + "loss": 0.0307, + "step": 35 + }, + { + "epoch": 0.06939759036144579, + "grad_norm": 0.5260195732116699, + "learning_rate": 8.974358974358976e-06, + "loss": 0.0264, + "step": 36 + }, + { + "epoch": 0.07132530120481928, + "grad_norm": 0.4934785068035126, + "learning_rate": 9.230769230769232e-06, + "loss": 0.0224, + "step": 37 + }, + { + "epoch": 0.07325301204819278, + "grad_norm": 0.4797322154045105, + "learning_rate": 9.487179487179487e-06, + "loss": 0.0163, + "step": 38 + }, + { + "epoch": 0.07518072289156627, + "grad_norm": 0.4739217460155487, + "learning_rate": 9.743589743589744e-06, + "loss": 0.0165, + "step": 39 + }, + { + "epoch": 0.07710843373493977, + "grad_norm": 0.4527677595615387, + "learning_rate": 1e-05, + "loss": 0.0163, + "step": 40 + }, + { + "epoch": 0.07903614457831325, + "grad_norm": 0.6241316795349121, + "learning_rate": 1.0256410256410256e-05, + "loss": 0.0302, + "step": 41 + }, + { + "epoch": 0.08096385542168674, + "grad_norm": 0.639043927192688, + "learning_rate": 1.0512820512820514e-05, + "loss": 0.0312, + "step": 42 + }, + { + "epoch": 0.08289156626506024, + "grad_norm": 0.5121409296989441, + "learning_rate": 1.076923076923077e-05, + "loss": 0.0256, + "step": 43 + }, + { + "epoch": 0.08481927710843373, + "grad_norm": 0.6340477466583252, + "learning_rate": 1.1025641025641028e-05, + "loss": 0.04, + "step": 44 + }, + { + "epoch": 0.08674698795180723, + "grad_norm": 0.5260409712791443, + "learning_rate": 1.1282051282051283e-05, + "loss": 0.0282, + "step": 45 + }, + { + "epoch": 0.08867469879518072, + "grad_norm": 0.6390711069107056, + "learning_rate": 1.1538461538461538e-05, + "loss": 0.0243, + "step": 46 + }, + { + "epoch": 0.09060240963855422, + "grad_norm": 0.46469295024871826, + "learning_rate": 1.1794871794871796e-05, + "loss": 0.0208, + "step": 47 + }, + { + "epoch": 0.09253012048192771, + "grad_norm": 0.8711516857147217, + "learning_rate": 1.2051282051282051e-05, + "loss": 0.0291, + "step": 48 + }, + { + "epoch": 0.09445783132530121, + "grad_norm": 0.9164300560951233, + "learning_rate": 1.230769230769231e-05, + "loss": 0.0342, + "step": 49 + }, + { + "epoch": 0.0963855421686747, + "grad_norm": 0.5401139259338379, + "learning_rate": 1.2564102564102565e-05, + "loss": 0.0185, + "step": 50 + }, + { + "epoch": 0.0983132530120482, + "grad_norm": 0.44393008947372437, + "learning_rate": 1.2820512820512823e-05, + "loss": 0.0228, + "step": 51 + }, + { + "epoch": 0.10024096385542168, + "grad_norm": 0.3855767846107483, + "learning_rate": 1.3076923076923078e-05, + "loss": 0.0176, + "step": 52 + }, + { + "epoch": 0.10216867469879518, + "grad_norm": 0.8561235070228577, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.0433, + "step": 53 + }, + { + "epoch": 0.10409638554216867, + "grad_norm": 0.768002450466156, + "learning_rate": 1.3589743589743592e-05, + "loss": 0.0245, + "step": 54 + }, + { + "epoch": 0.10602409638554217, + "grad_norm": 0.4559759497642517, + "learning_rate": 1.3846153846153847e-05, + "loss": 0.0224, + "step": 55 + }, + { + "epoch": 0.10795180722891566, + "grad_norm": 0.6203847527503967, + "learning_rate": 1.4102564102564105e-05, + "loss": 0.0296, + "step": 56 + }, + { + "epoch": 0.10987951807228916, + "grad_norm": 0.6651368141174316, + "learning_rate": 1.435897435897436e-05, + "loss": 0.0336, + "step": 57 + }, + { + "epoch": 0.11180722891566265, + "grad_norm": 0.377734512090683, + "learning_rate": 1.4615384615384615e-05, + "loss": 0.0196, + "step": 58 + }, + { + "epoch": 0.11373493975903615, + "grad_norm": 0.687568724155426, + "learning_rate": 1.4871794871794874e-05, + "loss": 0.0207, + "step": 59 + }, + { + "epoch": 0.11566265060240964, + "grad_norm": 0.7905604243278503, + "learning_rate": 1.5128205128205129e-05, + "loss": 0.047, + "step": 60 + }, + { + "epoch": 0.11759036144578314, + "grad_norm": 0.7938196063041687, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.0198, + "step": 61 + }, + { + "epoch": 0.11951807228915663, + "grad_norm": 0.41340553760528564, + "learning_rate": 1.5641025641025644e-05, + "loss": 0.0161, + "step": 62 + }, + { + "epoch": 0.12144578313253013, + "grad_norm": 0.5668172240257263, + "learning_rate": 1.5897435897435897e-05, + "loss": 0.0275, + "step": 63 + }, + { + "epoch": 0.12337349397590361, + "grad_norm": 0.48333367705345154, + "learning_rate": 1.6153846153846154e-05, + "loss": 0.0137, + "step": 64 + }, + { + "epoch": 0.12530120481927712, + "grad_norm": 0.6843933463096619, + "learning_rate": 1.641025641025641e-05, + "loss": 0.0294, + "step": 65 + }, + { + "epoch": 0.1272289156626506, + "grad_norm": 0.7789272665977478, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.0401, + "step": 66 + }, + { + "epoch": 0.1291566265060241, + "grad_norm": 0.6203492879867554, + "learning_rate": 1.6923076923076924e-05, + "loss": 0.0292, + "step": 67 + }, + { + "epoch": 0.1310843373493976, + "grad_norm": 0.5940662622451782, + "learning_rate": 1.717948717948718e-05, + "loss": 0.0178, + "step": 68 + }, + { + "epoch": 0.13301204819277107, + "grad_norm": 0.35504868626594543, + "learning_rate": 1.7435897435897438e-05, + "loss": 0.0129, + "step": 69 + }, + { + "epoch": 0.13493975903614458, + "grad_norm": 0.8796699643135071, + "learning_rate": 1.7692307692307694e-05, + "loss": 0.034, + "step": 70 + }, + { + "epoch": 0.13686746987951806, + "grad_norm": 0.967444896697998, + "learning_rate": 1.794871794871795e-05, + "loss": 0.0266, + "step": 71 + }, + { + "epoch": 0.13879518072289157, + "grad_norm": 0.4428526759147644, + "learning_rate": 1.8205128205128208e-05, + "loss": 0.0223, + "step": 72 + }, + { + "epoch": 0.14072289156626505, + "grad_norm": 0.42897751927375793, + "learning_rate": 1.8461538461538465e-05, + "loss": 0.0187, + "step": 73 + }, + { + "epoch": 0.14265060240963856, + "grad_norm": 0.5100914835929871, + "learning_rate": 1.8717948717948718e-05, + "loss": 0.0164, + "step": 74 + }, + { + "epoch": 0.14457831325301204, + "grad_norm": 0.6028861999511719, + "learning_rate": 1.8974358974358975e-05, + "loss": 0.0164, + "step": 75 + }, + { + "epoch": 0.14650602409638555, + "grad_norm": 0.6187024116516113, + "learning_rate": 1.923076923076923e-05, + "loss": 0.0296, + "step": 76 + }, + { + "epoch": 0.14843373493975903, + "grad_norm": 0.4822489619255066, + "learning_rate": 1.9487179487179488e-05, + "loss": 0.0148, + "step": 77 + }, + { + "epoch": 0.15036144578313254, + "grad_norm": 0.7231149673461914, + "learning_rate": 1.9743589743589745e-05, + "loss": 0.0395, + "step": 78 + }, + { + "epoch": 0.15228915662650602, + "grad_norm": 0.8409642577171326, + "learning_rate": 2e-05, + "loss": 0.0446, + "step": 79 + }, + { + "epoch": 0.15421686746987953, + "grad_norm": 0.4883500039577484, + "learning_rate": 2.025641025641026e-05, + "loss": 0.0206, + "step": 80 + }, + { + "epoch": 0.156144578313253, + "grad_norm": 0.6287479400634766, + "learning_rate": 2.0512820512820512e-05, + "loss": 0.0333, + "step": 81 + }, + { + "epoch": 0.1580722891566265, + "grad_norm": 0.5041632652282715, + "learning_rate": 2.0769230769230772e-05, + "loss": 0.0414, + "step": 82 + }, + { + "epoch": 0.16, + "grad_norm": 0.5103405117988586, + "learning_rate": 2.102564102564103e-05, + "loss": 0.045, + "step": 83 + }, + { + "epoch": 0.16192771084337348, + "grad_norm": 0.493161678314209, + "learning_rate": 2.1282051282051285e-05, + "loss": 0.021, + "step": 84 + }, + { + "epoch": 0.163855421686747, + "grad_norm": 0.908843994140625, + "learning_rate": 2.153846153846154e-05, + "loss": 0.0389, + "step": 85 + }, + { + "epoch": 0.16578313253012048, + "grad_norm": 0.5067003965377808, + "learning_rate": 2.1794871794871795e-05, + "loss": 0.0272, + "step": 86 + }, + { + "epoch": 0.16771084337349398, + "grad_norm": 0.5791381597518921, + "learning_rate": 2.2051282051282056e-05, + "loss": 0.0368, + "step": 87 + }, + { + "epoch": 0.16963855421686747, + "grad_norm": 0.7056036591529846, + "learning_rate": 2.230769230769231e-05, + "loss": 0.0284, + "step": 88 + }, + { + "epoch": 0.17156626506024097, + "grad_norm": 0.6563822031021118, + "learning_rate": 2.2564102564102566e-05, + "loss": 0.0646, + "step": 89 + }, + { + "epoch": 0.17349397590361446, + "grad_norm": 0.9483286142349243, + "learning_rate": 2.2820512820512822e-05, + "loss": 0.0439, + "step": 90 + }, + { + "epoch": 0.17542168674698796, + "grad_norm": 0.370664119720459, + "learning_rate": 2.3076923076923076e-05, + "loss": 0.0109, + "step": 91 + }, + { + "epoch": 0.17734939759036145, + "grad_norm": 0.9776477813720703, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.0458, + "step": 92 + }, + { + "epoch": 0.17927710843373493, + "grad_norm": 0.45710092782974243, + "learning_rate": 2.3589743589743593e-05, + "loss": 0.0212, + "step": 93 + }, + { + "epoch": 0.18120481927710844, + "grad_norm": 0.8623896837234497, + "learning_rate": 2.384615384615385e-05, + "loss": 0.0215, + "step": 94 + }, + { + "epoch": 0.18313253012048192, + "grad_norm": 0.55814528465271, + "learning_rate": 2.4102564102564103e-05, + "loss": 0.0218, + "step": 95 + }, + { + "epoch": 0.18506024096385543, + "grad_norm": 0.49882641434669495, + "learning_rate": 2.435897435897436e-05, + "loss": 0.0268, + "step": 96 + }, + { + "epoch": 0.1869879518072289, + "grad_norm": 0.3508654534816742, + "learning_rate": 2.461538461538462e-05, + "loss": 0.0172, + "step": 97 + }, + { + "epoch": 0.18891566265060242, + "grad_norm": 0.601170003414154, + "learning_rate": 2.4871794871794873e-05, + "loss": 0.0208, + "step": 98 + }, + { + "epoch": 0.1908433734939759, + "grad_norm": 1.1748133897781372, + "learning_rate": 2.512820512820513e-05, + "loss": 0.0259, + "step": 99 + }, + { + "epoch": 0.1927710843373494, + "grad_norm": 0.46370384097099304, + "learning_rate": 2.5384615384615386e-05, + "loss": 0.0242, + "step": 100 + }, + { + "epoch": 0.1946987951807229, + "grad_norm": 0.525010883808136, + "learning_rate": 2.5641025641025646e-05, + "loss": 0.0188, + "step": 101 + }, + { + "epoch": 0.1966265060240964, + "grad_norm": 0.766501784324646, + "learning_rate": 2.58974358974359e-05, + "loss": 0.0584, + "step": 102 + }, + { + "epoch": 0.19855421686746988, + "grad_norm": 0.3572964370250702, + "learning_rate": 2.6153846153846157e-05, + "loss": 0.0131, + "step": 103 + }, + { + "epoch": 0.20048192771084336, + "grad_norm": 0.6467130780220032, + "learning_rate": 2.6410256410256413e-05, + "loss": 0.0231, + "step": 104 + }, + { + "epoch": 0.20240963855421687, + "grad_norm": 1.1852102279663086, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.027, + "step": 105 + }, + { + "epoch": 0.20433734939759035, + "grad_norm": 2.3659932613372803, + "learning_rate": 2.6923076923076927e-05, + "loss": 0.0224, + "step": 106 + }, + { + "epoch": 0.20626506024096386, + "grad_norm": 0.5343687534332275, + "learning_rate": 2.7179487179487183e-05, + "loss": 0.0198, + "step": 107 + }, + { + "epoch": 0.20819277108433734, + "grad_norm": 1.852160096168518, + "learning_rate": 2.7435897435897437e-05, + "loss": 0.032, + "step": 108 + }, + { + "epoch": 0.21012048192771085, + "grad_norm": 0.47291702032089233, + "learning_rate": 2.7692307692307694e-05, + "loss": 0.0117, + "step": 109 + }, + { + "epoch": 0.21204819277108433, + "grad_norm": 0.7623187899589539, + "learning_rate": 2.794871794871795e-05, + "loss": 0.0337, + "step": 110 + }, + { + "epoch": 0.21397590361445784, + "grad_norm": 0.5272570848464966, + "learning_rate": 2.820512820512821e-05, + "loss": 0.0131, + "step": 111 + }, + { + "epoch": 0.21590361445783132, + "grad_norm": 0.5568500757217407, + "learning_rate": 2.8461538461538464e-05, + "loss": 0.0233, + "step": 112 + }, + { + "epoch": 0.21783132530120483, + "grad_norm": 0.4008469879627228, + "learning_rate": 2.871794871794872e-05, + "loss": 0.0204, + "step": 113 + }, + { + "epoch": 0.2197590361445783, + "grad_norm": 0.4888612926006317, + "learning_rate": 2.8974358974358977e-05, + "loss": 0.016, + "step": 114 + }, + { + "epoch": 0.2216867469879518, + "grad_norm": 0.44903355836868286, + "learning_rate": 2.923076923076923e-05, + "loss": 0.0135, + "step": 115 + }, + { + "epoch": 0.2236144578313253, + "grad_norm": 0.9266762733459473, + "learning_rate": 2.948717948717949e-05, + "loss": 0.0233, + "step": 116 + }, + { + "epoch": 0.22554216867469878, + "grad_norm": 0.5352638959884644, + "learning_rate": 2.9743589743589747e-05, + "loss": 0.0198, + "step": 117 + }, + { + "epoch": 0.2274698795180723, + "grad_norm": 0.6051343679428101, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0246, + "step": 118 + }, + { + "epoch": 0.22939759036144577, + "grad_norm": 0.9971133470535278, + "learning_rate": 3.0256410256410257e-05, + "loss": 0.025, + "step": 119 + }, + { + "epoch": 0.23132530120481928, + "grad_norm": 0.704236626625061, + "learning_rate": 3.0512820512820514e-05, + "loss": 0.031, + "step": 120 + }, + { + "epoch": 0.23325301204819276, + "grad_norm": 0.6137097477912903, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.0519, + "step": 121 + }, + { + "epoch": 0.23518072289156627, + "grad_norm": 0.7396159768104553, + "learning_rate": 3.102564102564103e-05, + "loss": 0.0325, + "step": 122 + }, + { + "epoch": 0.23710843373493976, + "grad_norm": 1.3282053470611572, + "learning_rate": 3.128205128205129e-05, + "loss": 0.0252, + "step": 123 + }, + { + "epoch": 0.23903614457831326, + "grad_norm": 0.5220731496810913, + "learning_rate": 3.153846153846154e-05, + "loss": 0.0262, + "step": 124 + }, + { + "epoch": 0.24096385542168675, + "grad_norm": 0.5357242822647095, + "learning_rate": 3.1794871794871795e-05, + "loss": 0.0243, + "step": 125 + }, + { + "epoch": 0.24289156626506025, + "grad_norm": 0.48207753896713257, + "learning_rate": 3.205128205128206e-05, + "loss": 0.0178, + "step": 126 + }, + { + "epoch": 0.24481927710843374, + "grad_norm": 0.552988588809967, + "learning_rate": 3.230769230769231e-05, + "loss": 0.023, + "step": 127 + }, + { + "epoch": 0.24674698795180722, + "grad_norm": 1.7962840795516968, + "learning_rate": 3.2564102564102565e-05, + "loss": 0.032, + "step": 128 + }, + { + "epoch": 0.24867469879518073, + "grad_norm": 1.6404600143432617, + "learning_rate": 3.282051282051282e-05, + "loss": 0.0231, + "step": 129 + }, + { + "epoch": 0.25060240963855424, + "grad_norm": 0.39142486453056335, + "learning_rate": 3.307692307692308e-05, + "loss": 0.0147, + "step": 130 + }, + { + "epoch": 0.2525301204819277, + "grad_norm": 1.3272887468338013, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0439, + "step": 131 + }, + { + "epoch": 0.2544578313253012, + "grad_norm": 1.5122811794281006, + "learning_rate": 3.358974358974359e-05, + "loss": 0.0282, + "step": 132 + }, + { + "epoch": 0.2563855421686747, + "grad_norm": 1.8542430400848389, + "learning_rate": 3.384615384615385e-05, + "loss": 0.0515, + "step": 133 + }, + { + "epoch": 0.2583132530120482, + "grad_norm": 4.059277534484863, + "learning_rate": 3.4102564102564105e-05, + "loss": 0.0781, + "step": 134 + }, + { + "epoch": 0.26024096385542167, + "grad_norm": 0.6206214427947998, + "learning_rate": 3.435897435897436e-05, + "loss": 0.0306, + "step": 135 + }, + { + "epoch": 0.2621686746987952, + "grad_norm": 0.4575510323047638, + "learning_rate": 3.461538461538462e-05, + "loss": 0.0154, + "step": 136 + }, + { + "epoch": 0.2640963855421687, + "grad_norm": 1.1556978225708008, + "learning_rate": 3.4871794871794875e-05, + "loss": 0.0235, + "step": 137 + }, + { + "epoch": 0.26602409638554214, + "grad_norm": 0.6975051760673523, + "learning_rate": 3.512820512820513e-05, + "loss": 0.0453, + "step": 138 + }, + { + "epoch": 0.26795180722891565, + "grad_norm": 0.8686623573303223, + "learning_rate": 3.538461538461539e-05, + "loss": 0.0427, + "step": 139 + }, + { + "epoch": 0.26987951807228916, + "grad_norm": 2.0681848526000977, + "learning_rate": 3.5641025641025646e-05, + "loss": 0.04, + "step": 140 + }, + { + "epoch": 0.27180722891566267, + "grad_norm": 0.4397984445095062, + "learning_rate": 3.58974358974359e-05, + "loss": 0.0188, + "step": 141 + }, + { + "epoch": 0.2737349397590361, + "grad_norm": 0.5871334075927734, + "learning_rate": 3.615384615384616e-05, + "loss": 0.0253, + "step": 142 + }, + { + "epoch": 0.27566265060240963, + "grad_norm": 1.1078568696975708, + "learning_rate": 3.6410256410256416e-05, + "loss": 0.0316, + "step": 143 + }, + { + "epoch": 0.27759036144578314, + "grad_norm": 0.5691841840744019, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.0266, + "step": 144 + }, + { + "epoch": 0.27951807228915665, + "grad_norm": 0.7896255254745483, + "learning_rate": 3.692307692307693e-05, + "loss": 0.0281, + "step": 145 + }, + { + "epoch": 0.2814457831325301, + "grad_norm": 0.9988337159156799, + "learning_rate": 3.7179487179487186e-05, + "loss": 0.0295, + "step": 146 + }, + { + "epoch": 0.2833734939759036, + "grad_norm": 0.9811834692955017, + "learning_rate": 3.7435897435897436e-05, + "loss": 0.0322, + "step": 147 + }, + { + "epoch": 0.2853012048192771, + "grad_norm": 0.6503105759620667, + "learning_rate": 3.769230769230769e-05, + "loss": 0.0266, + "step": 148 + }, + { + "epoch": 0.28722891566265063, + "grad_norm": 1.9164355993270874, + "learning_rate": 3.794871794871795e-05, + "loss": 0.0677, + "step": 149 + }, + { + "epoch": 0.2891566265060241, + "grad_norm": 1.1724557876586914, + "learning_rate": 3.820512820512821e-05, + "loss": 0.0324, + "step": 150 + }, + { + "epoch": 0.2910843373493976, + "grad_norm": 0.8482469916343689, + "learning_rate": 3.846153846153846e-05, + "loss": 0.0259, + "step": 151 + }, + { + "epoch": 0.2930120481927711, + "grad_norm": 0.8572830557823181, + "learning_rate": 3.871794871794872e-05, + "loss": 0.0358, + "step": 152 + }, + { + "epoch": 0.29493975903614456, + "grad_norm": 0.6630825400352478, + "learning_rate": 3.8974358974358976e-05, + "loss": 0.0447, + "step": 153 + }, + { + "epoch": 0.29686746987951806, + "grad_norm": 0.9197093844413757, + "learning_rate": 3.923076923076923e-05, + "loss": 0.0409, + "step": 154 + }, + { + "epoch": 0.2987951807228916, + "grad_norm": 0.6976819634437561, + "learning_rate": 3.948717948717949e-05, + "loss": 0.0317, + "step": 155 + }, + { + "epoch": 0.3007228915662651, + "grad_norm": 0.7353514432907104, + "learning_rate": 3.9743589743589747e-05, + "loss": 0.0306, + "step": 156 + }, + { + "epoch": 0.30265060240963854, + "grad_norm": 0.5730232000350952, + "learning_rate": 4e-05, + "loss": 0.0324, + "step": 157 + }, + { + "epoch": 0.30457831325301205, + "grad_norm": 0.7852078676223755, + "learning_rate": 3.999994971675547e-05, + "loss": 0.0354, + "step": 158 + }, + { + "epoch": 0.30650602409638555, + "grad_norm": 0.5924715399742126, + "learning_rate": 3.999979886727471e-05, + "loss": 0.0366, + "step": 159 + }, + { + "epoch": 0.30843373493975906, + "grad_norm": 0.7359845638275146, + "learning_rate": 3.999954745231624e-05, + "loss": 0.0437, + "step": 160 + }, + { + "epoch": 0.3103614457831325, + "grad_norm": 0.7866976857185364, + "learning_rate": 3.999919547314426e-05, + "loss": 0.0363, + "step": 161 + }, + { + "epoch": 0.312289156626506, + "grad_norm": 0.7425745129585266, + "learning_rate": 3.999874293152863e-05, + "loss": 0.0259, + "step": 162 + }, + { + "epoch": 0.31421686746987953, + "grad_norm": 1.8922245502471924, + "learning_rate": 3.9998189829744885e-05, + "loss": 0.0341, + "step": 163 + }, + { + "epoch": 0.316144578313253, + "grad_norm": 0.7908634543418884, + "learning_rate": 3.99975361705742e-05, + "loss": 0.0424, + "step": 164 + }, + { + "epoch": 0.3180722891566265, + "grad_norm": 2.047368049621582, + "learning_rate": 3.999678195730337e-05, + "loss": 0.0535, + "step": 165 + }, + { + "epoch": 0.32, + "grad_norm": 0.5702639222145081, + "learning_rate": 3.999592719372484e-05, + "loss": 0.0284, + "step": 166 + }, + { + "epoch": 0.3219277108433735, + "grad_norm": 0.45015648007392883, + "learning_rate": 3.9994971884136636e-05, + "loss": 0.0313, + "step": 167 + }, + { + "epoch": 0.32385542168674697, + "grad_norm": 4.094679355621338, + "learning_rate": 3.9993916033342355e-05, + "loss": 0.0524, + "step": 168 + }, + { + "epoch": 0.3257831325301205, + "grad_norm": 0.800846517086029, + "learning_rate": 3.999275964665117e-05, + "loss": 0.0282, + "step": 169 + }, + { + "epoch": 0.327710843373494, + "grad_norm": 0.47881078720092773, + "learning_rate": 3.999150272987776e-05, + "loss": 0.0293, + "step": 170 + }, + { + "epoch": 0.3296385542168675, + "grad_norm": 0.5716657638549805, + "learning_rate": 3.999014528934232e-05, + "loss": 0.0221, + "step": 171 + }, + { + "epoch": 0.33156626506024095, + "grad_norm": 0.6333311200141907, + "learning_rate": 3.998868733187048e-05, + "loss": 0.0302, + "step": 172 + }, + { + "epoch": 0.33349397590361446, + "grad_norm": 6.642521858215332, + "learning_rate": 3.998712886479335e-05, + "loss": 0.0364, + "step": 173 + }, + { + "epoch": 0.33542168674698797, + "grad_norm": 0.7515506148338318, + "learning_rate": 3.998546989594739e-05, + "loss": 0.0296, + "step": 174 + }, + { + "epoch": 0.3373493975903614, + "grad_norm": 1.0728015899658203, + "learning_rate": 3.998371043367445e-05, + "loss": 0.0549, + "step": 175 + }, + { + "epoch": 0.33927710843373493, + "grad_norm": 1.3025579452514648, + "learning_rate": 3.998185048682166e-05, + "loss": 0.0577, + "step": 176 + }, + { + "epoch": 0.34120481927710844, + "grad_norm": 1.0962958335876465, + "learning_rate": 3.997989006474144e-05, + "loss": 0.0313, + "step": 177 + }, + { + "epoch": 0.34313253012048195, + "grad_norm": 0.7064313292503357, + "learning_rate": 3.997782917729143e-05, + "loss": 0.0309, + "step": 178 + }, + { + "epoch": 0.3450602409638554, + "grad_norm": 0.43374207615852356, + "learning_rate": 3.997566783483445e-05, + "loss": 0.0166, + "step": 179 + }, + { + "epoch": 0.3469879518072289, + "grad_norm": 0.7236390113830566, + "learning_rate": 3.9973406048238413e-05, + "loss": 0.0254, + "step": 180 + }, + { + "epoch": 0.3489156626506024, + "grad_norm": 0.5041500926017761, + "learning_rate": 3.9971043828876334e-05, + "loss": 0.0239, + "step": 181 + }, + { + "epoch": 0.35084337349397593, + "grad_norm": 1.2744532823562622, + "learning_rate": 3.9968581188626204e-05, + "loss": 0.0404, + "step": 182 + }, + { + "epoch": 0.3527710843373494, + "grad_norm": 0.45845362544059753, + "learning_rate": 3.996601813987098e-05, + "loss": 0.0127, + "step": 183 + }, + { + "epoch": 0.3546987951807229, + "grad_norm": 0.4426881968975067, + "learning_rate": 3.996335469549852e-05, + "loss": 0.0176, + "step": 184 + }, + { + "epoch": 0.3566265060240964, + "grad_norm": 1.0030732154846191, + "learning_rate": 3.9960590868901465e-05, + "loss": 0.0457, + "step": 185 + }, + { + "epoch": 0.35855421686746985, + "grad_norm": 0.6428582668304443, + "learning_rate": 3.995772667397725e-05, + "loss": 0.0271, + "step": 186 + }, + { + "epoch": 0.36048192771084336, + "grad_norm": 0.5335744619369507, + "learning_rate": 3.995476212512795e-05, + "loss": 0.0297, + "step": 187 + }, + { + "epoch": 0.3624096385542169, + "grad_norm": 0.6995761394500732, + "learning_rate": 3.99516972372603e-05, + "loss": 0.0322, + "step": 188 + }, + { + "epoch": 0.3643373493975904, + "grad_norm": 0.765511155128479, + "learning_rate": 3.9948532025785546e-05, + "loss": 0.0253, + "step": 189 + }, + { + "epoch": 0.36626506024096384, + "grad_norm": 0.6165828108787537, + "learning_rate": 3.9945266506619403e-05, + "loss": 0.0355, + "step": 190 + }, + { + "epoch": 0.36819277108433734, + "grad_norm": 0.851970911026001, + "learning_rate": 3.994190069618195e-05, + "loss": 0.056, + "step": 191 + }, + { + "epoch": 0.37012048192771085, + "grad_norm": 0.9850023984909058, + "learning_rate": 3.993843461139757e-05, + "loss": 0.0415, + "step": 192 + }, + { + "epoch": 0.37204819277108436, + "grad_norm": 0.7455295324325562, + "learning_rate": 3.9934868269694886e-05, + "loss": 0.0379, + "step": 193 + }, + { + "epoch": 0.3739759036144578, + "grad_norm": 1.159469723701477, + "learning_rate": 3.9931201689006595e-05, + "loss": 0.0237, + "step": 194 + }, + { + "epoch": 0.3759036144578313, + "grad_norm": 0.5490080118179321, + "learning_rate": 3.992743488776947e-05, + "loss": 0.024, + "step": 195 + }, + { + "epoch": 0.37783132530120483, + "grad_norm": 1.279831886291504, + "learning_rate": 3.992356788492421e-05, + "loss": 0.0273, + "step": 196 + }, + { + "epoch": 0.3797590361445783, + "grad_norm": 0.859104335308075, + "learning_rate": 3.9919600699915355e-05, + "loss": 0.0411, + "step": 197 + }, + { + "epoch": 0.3816867469879518, + "grad_norm": 1.2525300979614258, + "learning_rate": 3.991553335269119e-05, + "loss": 0.0857, + "step": 198 + }, + { + "epoch": 0.3836144578313253, + "grad_norm": 0.4924193024635315, + "learning_rate": 3.991136586370367e-05, + "loss": 0.0294, + "step": 199 + }, + { + "epoch": 0.3855421686746988, + "grad_norm": 1.417190670967102, + "learning_rate": 3.990709825390828e-05, + "loss": 0.0395, + "step": 200 + }, + { + "epoch": 0.38746987951807227, + "grad_norm": 0.6172056198120117, + "learning_rate": 3.9902730544763936e-05, + "loss": 0.0194, + "step": 201 + }, + { + "epoch": 0.3893975903614458, + "grad_norm": 0.7292149662971497, + "learning_rate": 3.989826275823291e-05, + "loss": 0.0381, + "step": 202 + }, + { + "epoch": 0.3913253012048193, + "grad_norm": 0.5949816107749939, + "learning_rate": 3.989369491678067e-05, + "loss": 0.0254, + "step": 203 + }, + { + "epoch": 0.3932530120481928, + "grad_norm": 0.6012582182884216, + "learning_rate": 3.988902704337582e-05, + "loss": 0.048, + "step": 204 + }, + { + "epoch": 0.39518072289156625, + "grad_norm": 0.6273590922355652, + "learning_rate": 3.9884259161489936e-05, + "loss": 0.0268, + "step": 205 + }, + { + "epoch": 0.39710843373493976, + "grad_norm": 0.9615244269371033, + "learning_rate": 3.987939129509746e-05, + "loss": 0.0192, + "step": 206 + }, + { + "epoch": 0.39903614457831327, + "grad_norm": 0.6009241342544556, + "learning_rate": 3.9874423468675624e-05, + "loss": 0.0362, + "step": 207 + }, + { + "epoch": 0.4009638554216867, + "grad_norm": 0.411335289478302, + "learning_rate": 3.9869355707204266e-05, + "loss": 0.017, + "step": 208 + }, + { + "epoch": 0.40289156626506023, + "grad_norm": 0.6151527166366577, + "learning_rate": 3.986418803616573e-05, + "loss": 0.0283, + "step": 209 + }, + { + "epoch": 0.40481927710843374, + "grad_norm": 0.33808204531669617, + "learning_rate": 3.985892048154474e-05, + "loss": 0.0158, + "step": 210 + }, + { + "epoch": 0.40674698795180725, + "grad_norm": 0.5464187860488892, + "learning_rate": 3.9853553069828284e-05, + "loss": 0.0292, + "step": 211 + }, + { + "epoch": 0.4086746987951807, + "grad_norm": 0.6658390760421753, + "learning_rate": 3.984808582800543e-05, + "loss": 0.0281, + "step": 212 + }, + { + "epoch": 0.4106024096385542, + "grad_norm": 0.4253764748573303, + "learning_rate": 3.984251878356726e-05, + "loss": 0.031, + "step": 213 + }, + { + "epoch": 0.4125301204819277, + "grad_norm": 0.32309481501579285, + "learning_rate": 3.983685196450667e-05, + "loss": 0.0166, + "step": 214 + }, + { + "epoch": 0.41445783132530123, + "grad_norm": 0.43756410479545593, + "learning_rate": 3.9831085399318265e-05, + "loss": 0.0326, + "step": 215 + }, + { + "epoch": 0.4163855421686747, + "grad_norm": 0.264046847820282, + "learning_rate": 3.982521911699822e-05, + "loss": 0.0118, + "step": 216 + }, + { + "epoch": 0.4183132530120482, + "grad_norm": 0.8630897402763367, + "learning_rate": 3.9819253147044084e-05, + "loss": 0.0246, + "step": 217 + }, + { + "epoch": 0.4202409638554217, + "grad_norm": 0.6923379898071289, + "learning_rate": 3.98131875194547e-05, + "loss": 0.036, + "step": 218 + }, + { + "epoch": 0.42216867469879515, + "grad_norm": 0.5874778628349304, + "learning_rate": 3.9807022264730024e-05, + "loss": 0.0255, + "step": 219 + }, + { + "epoch": 0.42409638554216866, + "grad_norm": 0.394336074590683, + "learning_rate": 3.980075741387094e-05, + "loss": 0.0187, + "step": 220 + }, + { + "epoch": 0.4260240963855422, + "grad_norm": 0.6300327777862549, + "learning_rate": 3.979439299837915e-05, + "loss": 0.0214, + "step": 221 + }, + { + "epoch": 0.4279518072289157, + "grad_norm": 0.5200467109680176, + "learning_rate": 3.978792905025702e-05, + "loss": 0.0628, + "step": 222 + }, + { + "epoch": 0.42987951807228914, + "grad_norm": 0.5713880062103271, + "learning_rate": 3.978136560200735e-05, + "loss": 0.0302, + "step": 223 + }, + { + "epoch": 0.43180722891566264, + "grad_norm": 0.5345383286476135, + "learning_rate": 3.977470268663331e-05, + "loss": 0.0125, + "step": 224 + }, + { + "epoch": 0.43373493975903615, + "grad_norm": 0.5378350019454956, + "learning_rate": 3.976794033763819e-05, + "loss": 0.0246, + "step": 225 + }, + { + "epoch": 0.43566265060240966, + "grad_norm": 0.5554935336112976, + "learning_rate": 3.9761078589025276e-05, + "loss": 0.0212, + "step": 226 + }, + { + "epoch": 0.4375903614457831, + "grad_norm": 0.2832634747028351, + "learning_rate": 3.9754117475297664e-05, + "loss": 0.0125, + "step": 227 + }, + { + "epoch": 0.4395180722891566, + "grad_norm": 1.2910150289535522, + "learning_rate": 3.97470570314581e-05, + "loss": 0.0364, + "step": 228 + }, + { + "epoch": 0.44144578313253013, + "grad_norm": 0.3731018602848053, + "learning_rate": 3.973989729300878e-05, + "loss": 0.0128, + "step": 229 + }, + { + "epoch": 0.4433734939759036, + "grad_norm": 0.9433871507644653, + "learning_rate": 3.9732638295951195e-05, + "loss": 0.0367, + "step": 230 + }, + { + "epoch": 0.4453012048192771, + "grad_norm": 1.0779197216033936, + "learning_rate": 3.972528007678594e-05, + "loss": 0.0667, + "step": 231 + }, + { + "epoch": 0.4472289156626506, + "grad_norm": 1.7009105682373047, + "learning_rate": 3.9717822672512516e-05, + "loss": 0.0655, + "step": 232 + }, + { + "epoch": 0.4491566265060241, + "grad_norm": 0.5646032094955444, + "learning_rate": 3.971026612062919e-05, + "loss": 0.064, + "step": 233 + }, + { + "epoch": 0.45108433734939757, + "grad_norm": 0.44474121928215027, + "learning_rate": 3.970261045913274e-05, + "loss": 0.0206, + "step": 234 + }, + { + "epoch": 0.4530120481927711, + "grad_norm": 1.3969277143478394, + "learning_rate": 3.969485572651833e-05, + "loss": 0.0486, + "step": 235 + }, + { + "epoch": 0.4549397590361446, + "grad_norm": 0.6401994228363037, + "learning_rate": 3.968700196177925e-05, + "loss": 0.0262, + "step": 236 + }, + { + "epoch": 0.4568674698795181, + "grad_norm": 0.7091913223266602, + "learning_rate": 3.96790492044068e-05, + "loss": 0.014, + "step": 237 + }, + { + "epoch": 0.45879518072289155, + "grad_norm": 0.6561547517776489, + "learning_rate": 3.967099749439002e-05, + "loss": 0.0482, + "step": 238 + }, + { + "epoch": 0.46072289156626506, + "grad_norm": 0.6924155354499817, + "learning_rate": 3.966284687221551e-05, + "loss": 0.0289, + "step": 239 + }, + { + "epoch": 0.46265060240963857, + "grad_norm": 0.5868663787841797, + "learning_rate": 3.9654597378867256e-05, + "loss": 0.0331, + "step": 240 + }, + { + "epoch": 0.464578313253012, + "grad_norm": 0.7930939793586731, + "learning_rate": 3.964624905582637e-05, + "loss": 0.0925, + "step": 241 + }, + { + "epoch": 0.46650602409638553, + "grad_norm": 0.4888836145401001, + "learning_rate": 3.9637801945070944e-05, + "loss": 0.015, + "step": 242 + }, + { + "epoch": 0.46843373493975904, + "grad_norm": 0.7820287346839905, + "learning_rate": 3.962925608907579e-05, + "loss": 0.0382, + "step": 243 + }, + { + "epoch": 0.47036144578313255, + "grad_norm": 0.4914316236972809, + "learning_rate": 3.962061153081224e-05, + "loss": 0.0257, + "step": 244 + }, + { + "epoch": 0.472289156626506, + "grad_norm": 0.5681505799293518, + "learning_rate": 3.961186831374793e-05, + "loss": 0.0551, + "step": 245 + }, + { + "epoch": 0.4742168674698795, + "grad_norm": 0.5049723386764526, + "learning_rate": 3.9603026481846616e-05, + "loss": 0.0186, + "step": 246 + }, + { + "epoch": 0.476144578313253, + "grad_norm": 0.5034119486808777, + "learning_rate": 3.959408607956787e-05, + "loss": 0.024, + "step": 247 + }, + { + "epoch": 0.47807228915662653, + "grad_norm": 0.4543336033821106, + "learning_rate": 3.958504715186695e-05, + "loss": 0.0256, + "step": 248 + }, + { + "epoch": 0.48, + "grad_norm": 0.5595743656158447, + "learning_rate": 3.957590974419452e-05, + "loss": 0.0222, + "step": 249 + }, + { + "epoch": 0.4819277108433735, + "grad_norm": 0.5701581239700317, + "learning_rate": 3.956667390249642e-05, + "loss": 0.0334, + "step": 250 + }, + { + "epoch": 0.483855421686747, + "grad_norm": 0.53755784034729, + "learning_rate": 3.9557339673213474e-05, + "loss": 0.0345, + "step": 251 + }, + { + "epoch": 0.4857831325301205, + "grad_norm": 0.4368877112865448, + "learning_rate": 3.95479071032812e-05, + "loss": 0.0183, + "step": 252 + }, + { + "epoch": 0.48771084337349396, + "grad_norm": 0.7972906827926636, + "learning_rate": 3.953837624012963e-05, + "loss": 0.0337, + "step": 253 + }, + { + "epoch": 0.48963855421686747, + "grad_norm": 0.6148451566696167, + "learning_rate": 3.9528747131683023e-05, + "loss": 0.0524, + "step": 254 + }, + { + "epoch": 0.491566265060241, + "grad_norm": 0.500840961933136, + "learning_rate": 3.9519019826359676e-05, + "loss": 0.0248, + "step": 255 + }, + { + "epoch": 0.49349397590361443, + "grad_norm": 0.5536255240440369, + "learning_rate": 3.9509194373071624e-05, + "loss": 0.0219, + "step": 256 + }, + { + "epoch": 0.49542168674698794, + "grad_norm": 0.6873176097869873, + "learning_rate": 3.9499270821224444e-05, + "loss": 0.0312, + "step": 257 + }, + { + "epoch": 0.49734939759036145, + "grad_norm": 0.37207168340682983, + "learning_rate": 3.9489249220716974e-05, + "loss": 0.0149, + "step": 258 + }, + { + "epoch": 0.49927710843373496, + "grad_norm": 0.4458799660205841, + "learning_rate": 3.947912962194107e-05, + "loss": 0.0214, + "step": 259 + }, + { + "epoch": 0.5012048192771085, + "grad_norm": 0.4272724390029907, + "learning_rate": 3.9468912075781345e-05, + "loss": 0.0263, + "step": 260 + }, + { + "epoch": 0.503132530120482, + "grad_norm": 0.5245792269706726, + "learning_rate": 3.945859663361496e-05, + "loss": 0.0103, + "step": 261 + }, + { + "epoch": 0.5050602409638554, + "grad_norm": 0.8799260854721069, + "learning_rate": 3.9448183347311284e-05, + "loss": 0.0292, + "step": 262 + }, + { + "epoch": 0.5069879518072289, + "grad_norm": 0.5996833443641663, + "learning_rate": 3.943767226923171e-05, + "loss": 0.0306, + "step": 263 + }, + { + "epoch": 0.5089156626506024, + "grad_norm": 0.6044682860374451, + "learning_rate": 3.942706345222935e-05, + "loss": 0.0218, + "step": 264 + }, + { + "epoch": 0.5108433734939759, + "grad_norm": 0.4770200848579407, + "learning_rate": 3.941635694964878e-05, + "loss": 0.0226, + "step": 265 + }, + { + "epoch": 0.5127710843373494, + "grad_norm": 0.5605704188346863, + "learning_rate": 3.940555281532576e-05, + "loss": 0.0354, + "step": 266 + }, + { + "epoch": 0.5146987951807229, + "grad_norm": 0.46532443165779114, + "learning_rate": 3.939465110358699e-05, + "loss": 0.0223, + "step": 267 + }, + { + "epoch": 0.5166265060240964, + "grad_norm": 0.5190595388412476, + "learning_rate": 3.93836518692498e-05, + "loss": 0.0219, + "step": 268 + }, + { + "epoch": 0.5185542168674698, + "grad_norm": 0.5767757892608643, + "learning_rate": 3.937255516762193e-05, + "loss": 0.0294, + "step": 269 + }, + { + "epoch": 0.5204819277108433, + "grad_norm": 0.4543164372444153, + "learning_rate": 3.936136105450119e-05, + "loss": 0.0244, + "step": 270 + }, + { + "epoch": 0.5224096385542168, + "grad_norm": 0.4155154526233673, + "learning_rate": 3.9350069586175195e-05, + "loss": 0.02, + "step": 271 + }, + { + "epoch": 0.5243373493975904, + "grad_norm": 0.5470768213272095, + "learning_rate": 3.933868081942113e-05, + "loss": 0.0187, + "step": 272 + }, + { + "epoch": 0.5262650602409639, + "grad_norm": 0.9491772651672363, + "learning_rate": 3.9327194811505406e-05, + "loss": 0.0337, + "step": 273 + }, + { + "epoch": 0.5281927710843374, + "grad_norm": 0.9313873052597046, + "learning_rate": 3.93156116201834e-05, + "loss": 0.0573, + "step": 274 + }, + { + "epoch": 0.5301204819277109, + "grad_norm": 0.7181005477905273, + "learning_rate": 3.930393130369915e-05, + "loss": 0.0405, + "step": 275 + }, + { + "epoch": 0.5320481927710843, + "grad_norm": 0.34231385588645935, + "learning_rate": 3.9292153920785076e-05, + "loss": 0.0153, + "step": 276 + }, + { + "epoch": 0.5339759036144578, + "grad_norm": 0.6899610161781311, + "learning_rate": 3.928027953066168e-05, + "loss": 0.0338, + "step": 277 + }, + { + "epoch": 0.5359036144578313, + "grad_norm": 0.7509781718254089, + "learning_rate": 3.926830819303726e-05, + "loss": 0.0416, + "step": 278 + }, + { + "epoch": 0.5378313253012048, + "grad_norm": 0.6326774954795837, + "learning_rate": 3.925623996810757e-05, + "loss": 0.0293, + "step": 279 + }, + { + "epoch": 0.5397590361445783, + "grad_norm": 0.5543203353881836, + "learning_rate": 3.924407491655557e-05, + "loss": 0.0263, + "step": 280 + }, + { + "epoch": 0.5416867469879518, + "grad_norm": 0.5367572903633118, + "learning_rate": 3.9231813099551086e-05, + "loss": 0.0276, + "step": 281 + }, + { + "epoch": 0.5436144578313253, + "grad_norm": 0.3143869638442993, + "learning_rate": 3.921945457875051e-05, + "loss": 0.0146, + "step": 282 + }, + { + "epoch": 0.5455421686746988, + "grad_norm": 0.47403043508529663, + "learning_rate": 3.920699941629649e-05, + "loss": 0.0267, + "step": 283 + }, + { + "epoch": 0.5474698795180722, + "grad_norm": 0.5082595348358154, + "learning_rate": 3.919444767481763e-05, + "loss": 0.0183, + "step": 284 + }, + { + "epoch": 0.5493975903614458, + "grad_norm": 0.747949481010437, + "learning_rate": 3.918179941742816e-05, + "loss": 0.0412, + "step": 285 + }, + { + "epoch": 0.5513253012048193, + "grad_norm": 0.6553886532783508, + "learning_rate": 3.916905470772762e-05, + "loss": 0.0505, + "step": 286 + }, + { + "epoch": 0.5532530120481928, + "grad_norm": 0.3838176131248474, + "learning_rate": 3.9156213609800545e-05, + "loss": 0.0156, + "step": 287 + }, + { + "epoch": 0.5551807228915663, + "grad_norm": 0.7427731156349182, + "learning_rate": 3.914327618821614e-05, + "loss": 0.0278, + "step": 288 + }, + { + "epoch": 0.5571084337349398, + "grad_norm": 0.2612821161746979, + "learning_rate": 3.913024250802796e-05, + "loss": 0.0101, + "step": 289 + }, + { + "epoch": 0.5590361445783133, + "grad_norm": 0.3799416124820709, + "learning_rate": 3.911711263477357e-05, + "loss": 0.0168, + "step": 290 + }, + { + "epoch": 0.5609638554216867, + "grad_norm": 0.5053854584693909, + "learning_rate": 3.910388663447425e-05, + "loss": 0.0249, + "step": 291 + }, + { + "epoch": 0.5628915662650602, + "grad_norm": 0.38095012307167053, + "learning_rate": 3.909056457363461e-05, + "loss": 0.0156, + "step": 292 + }, + { + "epoch": 0.5648192771084337, + "grad_norm": 0.4477892220020294, + "learning_rate": 3.907714651924229e-05, + "loss": 0.0309, + "step": 293 + }, + { + "epoch": 0.5667469879518072, + "grad_norm": 0.5875864624977112, + "learning_rate": 3.906363253876763e-05, + "loss": 0.0287, + "step": 294 + }, + { + "epoch": 0.5686746987951807, + "grad_norm": 0.522990882396698, + "learning_rate": 3.90500227001633e-05, + "loss": 0.0318, + "step": 295 + }, + { + "epoch": 0.5706024096385542, + "grad_norm": 0.4153876304626465, + "learning_rate": 3.9036317071863994e-05, + "loss": 0.0192, + "step": 296 + }, + { + "epoch": 0.5725301204819278, + "grad_norm": 0.4675769507884979, + "learning_rate": 3.902251572278605e-05, + "loss": 0.067, + "step": 297 + }, + { + "epoch": 0.5744578313253013, + "grad_norm": 0.35778650641441345, + "learning_rate": 3.900861872232713e-05, + "loss": 0.0197, + "step": 298 + }, + { + "epoch": 0.5763855421686747, + "grad_norm": 0.7382330894470215, + "learning_rate": 3.899462614036587e-05, + "loss": 0.0283, + "step": 299 + }, + { + "epoch": 0.5783132530120482, + "grad_norm": 0.41268599033355713, + "learning_rate": 3.89805380472615e-05, + "loss": 0.0207, + "step": 300 + }, + { + "epoch": 0.5802409638554217, + "grad_norm": 1.2013020515441895, + "learning_rate": 3.8966354513853535e-05, + "loss": 0.0301, + "step": 301 + }, + { + "epoch": 0.5821686746987952, + "grad_norm": 0.424757719039917, + "learning_rate": 3.895207561146137e-05, + "loss": 0.022, + "step": 302 + }, + { + "epoch": 0.5840963855421687, + "grad_norm": 0.4196677505970001, + "learning_rate": 3.893770141188396e-05, + "loss": 0.0424, + "step": 303 + }, + { + "epoch": 0.5860240963855422, + "grad_norm": 0.8644190430641174, + "learning_rate": 3.892323198739946e-05, + "loss": 0.08, + "step": 304 + }, + { + "epoch": 0.5879518072289157, + "grad_norm": 0.5645135045051575, + "learning_rate": 3.890866741076482e-05, + "loss": 0.0152, + "step": 305 + }, + { + "epoch": 0.5898795180722891, + "grad_norm": 0.5218387246131897, + "learning_rate": 3.889400775521545e-05, + "loss": 0.0205, + "step": 306 + }, + { + "epoch": 0.5918072289156626, + "grad_norm": 0.39709413051605225, + "learning_rate": 3.8879253094464865e-05, + "loss": 0.0233, + "step": 307 + }, + { + "epoch": 0.5937349397590361, + "grad_norm": 0.3572910726070404, + "learning_rate": 3.8864403502704285e-05, + "loss": 0.0198, + "step": 308 + }, + { + "epoch": 0.5956626506024096, + "grad_norm": 0.382709264755249, + "learning_rate": 3.8849459054602274e-05, + "loss": 0.0176, + "step": 309 + }, + { + "epoch": 0.5975903614457831, + "grad_norm": 3.4527227878570557, + "learning_rate": 3.883441982530436e-05, + "loss": 0.0239, + "step": 310 + }, + { + "epoch": 0.5995180722891567, + "grad_norm": 0.4467569589614868, + "learning_rate": 3.8819285890432674e-05, + "loss": 0.0284, + "step": 311 + }, + { + "epoch": 0.6014457831325302, + "grad_norm": 0.44513460993766785, + "learning_rate": 3.880405732608555e-05, + "loss": 0.0233, + "step": 312 + }, + { + "epoch": 0.6033734939759036, + "grad_norm": 0.8029689192771912, + "learning_rate": 3.8788734208837155e-05, + "loss": 0.0433, + "step": 313 + }, + { + "epoch": 0.6053012048192771, + "grad_norm": 0.7291454076766968, + "learning_rate": 3.877331661573709e-05, + "loss": 0.043, + "step": 314 + }, + { + "epoch": 0.6072289156626506, + "grad_norm": 0.6050467491149902, + "learning_rate": 3.8757804624310006e-05, + "loss": 0.0377, + "step": 315 + }, + { + "epoch": 0.6091566265060241, + "grad_norm": 0.6714366674423218, + "learning_rate": 3.874219831255524e-05, + "loss": 0.046, + "step": 316 + }, + { + "epoch": 0.6110843373493976, + "grad_norm": 0.336037278175354, + "learning_rate": 3.8726497758946394e-05, + "loss": 0.0149, + "step": 317 + }, + { + "epoch": 0.6130120481927711, + "grad_norm": 0.3057402968406677, + "learning_rate": 3.871070304243094e-05, + "loss": 0.014, + "step": 318 + }, + { + "epoch": 0.6149397590361446, + "grad_norm": 0.4537644684314728, + "learning_rate": 3.8694814242429834e-05, + "loss": 0.0503, + "step": 319 + }, + { + "epoch": 0.6168674698795181, + "grad_norm": 0.45573824644088745, + "learning_rate": 3.8678831438837116e-05, + "loss": 0.021, + "step": 320 + }, + { + "epoch": 0.6187951807228915, + "grad_norm": 0.30729591846466064, + "learning_rate": 3.866275471201952e-05, + "loss": 0.0163, + "step": 321 + }, + { + "epoch": 0.620722891566265, + "grad_norm": 0.7614850401878357, + "learning_rate": 3.8646584142816036e-05, + "loss": 0.0347, + "step": 322 + }, + { + "epoch": 0.6226506024096385, + "grad_norm": 0.5323611497879028, + "learning_rate": 3.863031981253754e-05, + "loss": 0.0201, + "step": 323 + }, + { + "epoch": 0.624578313253012, + "grad_norm": 0.34426453709602356, + "learning_rate": 3.861396180296635e-05, + "loss": 0.0243, + "step": 324 + }, + { + "epoch": 0.6265060240963856, + "grad_norm": 0.621636152267456, + "learning_rate": 3.859751019635585e-05, + "loss": 0.0166, + "step": 325 + }, + { + "epoch": 0.6284337349397591, + "grad_norm": 0.549324095249176, + "learning_rate": 3.858096507543006e-05, + "loss": 0.0274, + "step": 326 + }, + { + "epoch": 0.6303614457831326, + "grad_norm": 0.358426570892334, + "learning_rate": 3.8564326523383214e-05, + "loss": 0.0207, + "step": 327 + }, + { + "epoch": 0.632289156626506, + "grad_norm": 0.3639723062515259, + "learning_rate": 3.8547594623879346e-05, + "loss": 0.0297, + "step": 328 + }, + { + "epoch": 0.6342168674698795, + "grad_norm": 0.3402212858200073, + "learning_rate": 3.853076946105188e-05, + "loss": 0.0258, + "step": 329 + }, + { + "epoch": 0.636144578313253, + "grad_norm": 0.4083027243614197, + "learning_rate": 3.85138511195032e-05, + "loss": 0.0351, + "step": 330 + }, + { + "epoch": 0.6380722891566265, + "grad_norm": 0.43532121181488037, + "learning_rate": 3.84968396843042e-05, + "loss": 0.0388, + "step": 331 + }, + { + "epoch": 0.64, + "grad_norm": 0.35353463888168335, + "learning_rate": 3.8479735240993904e-05, + "loss": 0.0203, + "step": 332 + }, + { + "epoch": 0.6419277108433735, + "grad_norm": 0.350149929523468, + "learning_rate": 3.846253787557901e-05, + "loss": 0.0261, + "step": 333 + }, + { + "epoch": 0.643855421686747, + "grad_norm": 0.7665389180183411, + "learning_rate": 3.844524767453344e-05, + "loss": 0.0108, + "step": 334 + }, + { + "epoch": 0.6457831325301204, + "grad_norm": 0.44621360301971436, + "learning_rate": 3.842786472479795e-05, + "loss": 0.0282, + "step": 335 + }, + { + "epoch": 0.6477108433734939, + "grad_norm": 0.7787201404571533, + "learning_rate": 3.841038911377962e-05, + "loss": 0.0216, + "step": 336 + }, + { + "epoch": 0.6496385542168674, + "grad_norm": 0.48260653018951416, + "learning_rate": 3.839282092935153e-05, + "loss": 0.0234, + "step": 337 + }, + { + "epoch": 0.651566265060241, + "grad_norm": 0.4987852871417999, + "learning_rate": 3.837516025985219e-05, + "loss": 0.0515, + "step": 338 + }, + { + "epoch": 0.6534939759036145, + "grad_norm": 0.9030266404151917, + "learning_rate": 3.835740719408517e-05, + "loss": 0.0508, + "step": 339 + }, + { + "epoch": 0.655421686746988, + "grad_norm": 0.6381701231002808, + "learning_rate": 3.833956182131867e-05, + "loss": 0.0405, + "step": 340 + }, + { + "epoch": 0.6573493975903615, + "grad_norm": 0.42828986048698425, + "learning_rate": 3.832162423128499e-05, + "loss": 0.024, + "step": 341 + }, + { + "epoch": 0.659277108433735, + "grad_norm": 0.38725873827934265, + "learning_rate": 3.8303594514180164e-05, + "loss": 0.0199, + "step": 342 + }, + { + "epoch": 0.6612048192771084, + "grad_norm": 0.23280498385429382, + "learning_rate": 3.828547276066346e-05, + "loss": 0.0101, + "step": 343 + }, + { + "epoch": 0.6631325301204819, + "grad_norm": 0.7298216819763184, + "learning_rate": 3.8267259061856925e-05, + "loss": 0.0455, + "step": 344 + }, + { + "epoch": 0.6650602409638554, + "grad_norm": 0.5975687503814697, + "learning_rate": 3.824895350934496e-05, + "loss": 0.0372, + "step": 345 + }, + { + "epoch": 0.6669879518072289, + "grad_norm": 0.6295403242111206, + "learning_rate": 3.823055619517381e-05, + "loss": 0.0362, + "step": 346 + }, + { + "epoch": 0.6689156626506024, + "grad_norm": 0.5086020827293396, + "learning_rate": 3.821206721185115e-05, + "loss": 0.0368, + "step": 347 + }, + { + "epoch": 0.6708433734939759, + "grad_norm": 0.34506168961524963, + "learning_rate": 3.819348665234557e-05, + "loss": 0.0178, + "step": 348 + }, + { + "epoch": 0.6727710843373494, + "grad_norm": 1.309940218925476, + "learning_rate": 3.817481461008617e-05, + "loss": 0.024, + "step": 349 + }, + { + "epoch": 0.6746987951807228, + "grad_norm": 0.4074770510196686, + "learning_rate": 3.815605117896204e-05, + "loss": 0.0262, + "step": 350 + }, + { + "epoch": 0.6766265060240964, + "grad_norm": 0.48525840044021606, + "learning_rate": 3.8137196453321775e-05, + "loss": 0.0209, + "step": 351 + }, + { + "epoch": 0.6785542168674699, + "grad_norm": 0.7199739217758179, + "learning_rate": 3.811825052797308e-05, + "loss": 0.0396, + "step": 352 + }, + { + "epoch": 0.6804819277108434, + "grad_norm": 0.519540011882782, + "learning_rate": 3.8099213498182196e-05, + "loss": 0.0453, + "step": 353 + }, + { + "epoch": 0.6824096385542169, + "grad_norm": 0.9738391041755676, + "learning_rate": 3.808008545967349e-05, + "loss": 0.0317, + "step": 354 + }, + { + "epoch": 0.6843373493975904, + "grad_norm": 1.888344407081604, + "learning_rate": 3.8060866508628953e-05, + "loss": 0.0452, + "step": 355 + }, + { + "epoch": 0.6862650602409639, + "grad_norm": 0.48989811539649963, + "learning_rate": 3.8041556741687695e-05, + "loss": 0.0315, + "step": 356 + }, + { + "epoch": 0.6881927710843373, + "grad_norm": 0.3764645457267761, + "learning_rate": 3.8022156255945496e-05, + "loss": 0.0269, + "step": 357 + }, + { + "epoch": 0.6901204819277108, + "grad_norm": 0.46409738063812256, + "learning_rate": 3.800266514895429e-05, + "loss": 0.0171, + "step": 358 + }, + { + "epoch": 0.6920481927710843, + "grad_norm": 0.41091030836105347, + "learning_rate": 3.7983083518721695e-05, + "loss": 0.0167, + "step": 359 + }, + { + "epoch": 0.6939759036144578, + "grad_norm": 0.8375523090362549, + "learning_rate": 3.79634114637105e-05, + "loss": 0.0342, + "step": 360 + }, + { + "epoch": 0.6959036144578313, + "grad_norm": 1.7053394317626953, + "learning_rate": 3.794364908283817e-05, + "loss": 0.02, + "step": 361 + }, + { + "epoch": 0.6978313253012048, + "grad_norm": 0.4163115918636322, + "learning_rate": 3.792379647547637e-05, + "loss": 0.0138, + "step": 362 + }, + { + "epoch": 0.6997590361445784, + "grad_norm": 0.388751745223999, + "learning_rate": 3.790385374145046e-05, + "loss": 0.0172, + "step": 363 + }, + { + "epoch": 0.7016867469879519, + "grad_norm": 0.5584064722061157, + "learning_rate": 3.7883820981038966e-05, + "loss": 0.0254, + "step": 364 + }, + { + "epoch": 0.7036144578313253, + "grad_norm": 1.394264817237854, + "learning_rate": 3.7863698294973114e-05, + "loss": 0.037, + "step": 365 + }, + { + "epoch": 0.7055421686746988, + "grad_norm": 0.46280744671821594, + "learning_rate": 3.78434857844363e-05, + "loss": 0.0234, + "step": 366 + }, + { + "epoch": 0.7074698795180723, + "grad_norm": 0.39548924565315247, + "learning_rate": 3.782318355106358e-05, + "loss": 0.0164, + "step": 367 + }, + { + "epoch": 0.7093975903614458, + "grad_norm": 0.7307773232460022, + "learning_rate": 3.780279169694118e-05, + "loss": 0.0192, + "step": 368 + }, + { + "epoch": 0.7113253012048193, + "grad_norm": 0.28035807609558105, + "learning_rate": 3.778231032460594e-05, + "loss": 0.0131, + "step": 369 + }, + { + "epoch": 0.7132530120481928, + "grad_norm": 0.8376953601837158, + "learning_rate": 3.776173953704486e-05, + "loss": 0.0291, + "step": 370 + }, + { + "epoch": 0.7151807228915663, + "grad_norm": 0.7356843948364258, + "learning_rate": 3.774107943769454e-05, + "loss": 0.0214, + "step": 371 + }, + { + "epoch": 0.7171084337349397, + "grad_norm": 0.41503390669822693, + "learning_rate": 3.772033013044064e-05, + "loss": 0.0221, + "step": 372 + }, + { + "epoch": 0.7190361445783132, + "grad_norm": 0.35732385516166687, + "learning_rate": 3.7699491719617436e-05, + "loss": 0.015, + "step": 373 + }, + { + "epoch": 0.7209638554216867, + "grad_norm": 0.283778578042984, + "learning_rate": 3.76785643100072e-05, + "loss": 0.0146, + "step": 374 + }, + { + "epoch": 0.7228915662650602, + "grad_norm": 0.3219413459300995, + "learning_rate": 3.765754800683974e-05, + "loss": 0.015, + "step": 375 + }, + { + "epoch": 0.7248192771084337, + "grad_norm": 0.610431432723999, + "learning_rate": 3.7636442915791856e-05, + "loss": 0.0326, + "step": 376 + }, + { + "epoch": 0.7267469879518073, + "grad_norm": 4.944870948791504, + "learning_rate": 3.7615249142986784e-05, + "loss": 0.0432, + "step": 377 + }, + { + "epoch": 0.7286746987951808, + "grad_norm": 0.4894593060016632, + "learning_rate": 3.7593966794993696e-05, + "loss": 0.0174, + "step": 378 + }, + { + "epoch": 0.7306024096385542, + "grad_norm": 0.4211325943470001, + "learning_rate": 3.757259597882714e-05, + "loss": 0.023, + "step": 379 + }, + { + "epoch": 0.7325301204819277, + "grad_norm": 0.33621737360954285, + "learning_rate": 3.755113680194651e-05, + "loss": 0.0201, + "step": 380 + }, + { + "epoch": 0.7344578313253012, + "grad_norm": 0.5799694657325745, + "learning_rate": 3.7529589372255514e-05, + "loss": 0.0173, + "step": 381 + }, + { + "epoch": 0.7363855421686747, + "grad_norm": 0.5172572731971741, + "learning_rate": 3.750795379810162e-05, + "loss": 0.0284, + "step": 382 + }, + { + "epoch": 0.7383132530120482, + "grad_norm": 0.5715453028678894, + "learning_rate": 3.748623018827552e-05, + "loss": 0.0194, + "step": 383 + }, + { + "epoch": 0.7402409638554217, + "grad_norm": 0.5284178256988525, + "learning_rate": 3.746441865201056e-05, + "loss": 0.0247, + "step": 384 + }, + { + "epoch": 0.7421686746987952, + "grad_norm": 0.37828654050827026, + "learning_rate": 3.744251929898223e-05, + "loss": 0.0097, + "step": 385 + }, + { + "epoch": 0.7440963855421687, + "grad_norm": 0.3252779543399811, + "learning_rate": 3.742053223930758e-05, + "loss": 0.0238, + "step": 386 + }, + { + "epoch": 0.7460240963855421, + "grad_norm": 0.6031543612480164, + "learning_rate": 3.7398457583544674e-05, + "loss": 0.0332, + "step": 387 + }, + { + "epoch": 0.7479518072289156, + "grad_norm": 0.23846614360809326, + "learning_rate": 3.737629544269206e-05, + "loss": 0.0122, + "step": 388 + }, + { + "epoch": 0.7498795180722891, + "grad_norm": 0.5274029970169067, + "learning_rate": 3.7354045928188155e-05, + "loss": 0.0324, + "step": 389 + }, + { + "epoch": 0.7518072289156627, + "grad_norm": 0.4672217071056366, + "learning_rate": 3.733170915191075e-05, + "loss": 0.0196, + "step": 390 + }, + { + "epoch": 0.7537349397590362, + "grad_norm": 0.29819396138191223, + "learning_rate": 3.730928522617639e-05, + "loss": 0.0131, + "step": 391 + }, + { + "epoch": 0.7556626506024097, + "grad_norm": 0.43824997544288635, + "learning_rate": 3.7286774263739855e-05, + "loss": 0.0238, + "step": 392 + }, + { + "epoch": 0.7575903614457832, + "grad_norm": 0.2822072505950928, + "learning_rate": 3.726417637779357e-05, + "loss": 0.0314, + "step": 393 + }, + { + "epoch": 0.7595180722891566, + "grad_norm": 0.43815648555755615, + "learning_rate": 3.7241491681967044e-05, + "loss": 0.0144, + "step": 394 + }, + { + "epoch": 0.7614457831325301, + "grad_norm": 0.37194815278053284, + "learning_rate": 3.721872029032628e-05, + "loss": 0.0286, + "step": 395 + }, + { + "epoch": 0.7633734939759036, + "grad_norm": 0.7319737672805786, + "learning_rate": 3.719586231737322e-05, + "loss": 0.0427, + "step": 396 + }, + { + "epoch": 0.7653012048192771, + "grad_norm": 0.5870066285133362, + "learning_rate": 3.717291787804517e-05, + "loss": 0.0138, + "step": 397 + }, + { + "epoch": 0.7672289156626506, + "grad_norm": 0.6574277281761169, + "learning_rate": 3.7149887087714225e-05, + "loss": 0.061, + "step": 398 + }, + { + "epoch": 0.7691566265060241, + "grad_norm": 0.5467348694801331, + "learning_rate": 3.712677006218666e-05, + "loss": 0.022, + "step": 399 + }, + { + "epoch": 0.7710843373493976, + "grad_norm": 0.3589288890361786, + "learning_rate": 3.710356691770238e-05, + "loss": 0.0161, + "step": 400 + }, + { + "epoch": 0.7730120481927711, + "grad_norm": 0.574630618095398, + "learning_rate": 3.708027777093433e-05, + "loss": 0.0285, + "step": 401 + }, + { + "epoch": 0.7749397590361445, + "grad_norm": 0.39048445224761963, + "learning_rate": 3.70569027389879e-05, + "loss": 0.012, + "step": 402 + }, + { + "epoch": 0.776867469879518, + "grad_norm": 0.34803536534309387, + "learning_rate": 3.703344193940032e-05, + "loss": 0.0155, + "step": 403 + }, + { + "epoch": 0.7787951807228916, + "grad_norm": 1.188948392868042, + "learning_rate": 3.700989549014011e-05, + "loss": 0.0617, + "step": 404 + }, + { + "epoch": 0.7807228915662651, + "grad_norm": 0.473157674074173, + "learning_rate": 3.698626350960646e-05, + "loss": 0.0298, + "step": 405 + }, + { + "epoch": 0.7826506024096386, + "grad_norm": 0.42009076476097107, + "learning_rate": 3.6962546116628634e-05, + "loss": 0.03, + "step": 406 + }, + { + "epoch": 0.7845783132530121, + "grad_norm": 0.6334308981895447, + "learning_rate": 3.693874343046537e-05, + "loss": 0.0107, + "step": 407 + }, + { + "epoch": 0.7865060240963856, + "grad_norm": 0.35594677925109863, + "learning_rate": 3.6914855570804314e-05, + "loss": 0.0174, + "step": 408 + }, + { + "epoch": 0.788433734939759, + "grad_norm": 0.28985708951950073, + "learning_rate": 3.689088265776136e-05, + "loss": 0.0149, + "step": 409 + }, + { + "epoch": 0.7903614457831325, + "grad_norm": 0.3981950581073761, + "learning_rate": 3.686682481188011e-05, + "loss": 0.019, + "step": 410 + }, + { + "epoch": 0.792289156626506, + "grad_norm": 0.48819583654403687, + "learning_rate": 3.6842682154131193e-05, + "loss": 0.0217, + "step": 411 + }, + { + "epoch": 0.7942168674698795, + "grad_norm": 0.42819952964782715, + "learning_rate": 3.681845480591174e-05, + "loss": 0.0198, + "step": 412 + }, + { + "epoch": 0.796144578313253, + "grad_norm": 0.48591694235801697, + "learning_rate": 3.6794142889044727e-05, + "loss": 0.0253, + "step": 413 + }, + { + "epoch": 0.7980722891566265, + "grad_norm": 0.4730607271194458, + "learning_rate": 3.676974652577835e-05, + "loss": 0.0329, + "step": 414 + }, + { + "epoch": 0.8, + "grad_norm": 0.5390865802764893, + "learning_rate": 3.6745265838785434e-05, + "loss": 0.0479, + "step": 415 + }, + { + "epoch": 0.8019277108433734, + "grad_norm": 0.6377891302108765, + "learning_rate": 3.672070095116283e-05, + "loss": 0.019, + "step": 416 + }, + { + "epoch": 0.803855421686747, + "grad_norm": 0.8984615206718445, + "learning_rate": 3.669605198643075e-05, + "loss": 0.0444, + "step": 417 + }, + { + "epoch": 0.8057831325301205, + "grad_norm": 0.4913877546787262, + "learning_rate": 3.667131906853219e-05, + "loss": 0.031, + "step": 418 + }, + { + "epoch": 0.807710843373494, + "grad_norm": 0.37894028425216675, + "learning_rate": 3.664650232183229e-05, + "loss": 0.0195, + "step": 419 + }, + { + "epoch": 0.8096385542168675, + "grad_norm": 0.3644949495792389, + "learning_rate": 3.66216018711177e-05, + "loss": 0.018, + "step": 420 + }, + { + "epoch": 0.811566265060241, + "grad_norm": 0.414440393447876, + "learning_rate": 3.659661784159597e-05, + "loss": 0.0188, + "step": 421 + }, + { + "epoch": 0.8134939759036145, + "grad_norm": 0.49220341444015503, + "learning_rate": 3.65715503588949e-05, + "loss": 0.016, + "step": 422 + }, + { + "epoch": 0.815421686746988, + "grad_norm": 1.0939836502075195, + "learning_rate": 3.654639954906193e-05, + "loss": 0.0758, + "step": 423 + }, + { + "epoch": 0.8173493975903614, + "grad_norm": 0.43222442269325256, + "learning_rate": 3.652116553856349e-05, + "loss": 0.0308, + "step": 424 + }, + { + "epoch": 0.8192771084337349, + "grad_norm": 0.5081896185874939, + "learning_rate": 3.649584845428438e-05, + "loss": 0.0493, + "step": 425 + }, + { + "epoch": 0.8212048192771084, + "grad_norm": 0.9811948537826538, + "learning_rate": 3.64704484235271e-05, + "loss": 0.019, + "step": 426 + }, + { + "epoch": 0.8231325301204819, + "grad_norm": 0.31656572222709656, + "learning_rate": 3.6444965574011255e-05, + "loss": 0.0135, + "step": 427 + }, + { + "epoch": 0.8250602409638554, + "grad_norm": 0.7844433188438416, + "learning_rate": 3.641940003387289e-05, + "loss": 0.0402, + "step": 428 + }, + { + "epoch": 0.826987951807229, + "grad_norm": 0.3353273570537567, + "learning_rate": 3.6393751931663814e-05, + "loss": 0.0132, + "step": 429 + }, + { + "epoch": 0.8289156626506025, + "grad_norm": 0.7253058552742004, + "learning_rate": 3.6368021396351015e-05, + "loss": 0.0296, + "step": 430 + }, + { + "epoch": 0.8308433734939759, + "grad_norm": 0.45300304889678955, + "learning_rate": 3.634220855731598e-05, + "loss": 0.0258, + "step": 431 + }, + { + "epoch": 0.8327710843373494, + "grad_norm": 0.3480473458766937, + "learning_rate": 3.631631354435403e-05, + "loss": 0.0099, + "step": 432 + }, + { + "epoch": 0.8346987951807229, + "grad_norm": 2.1114516258239746, + "learning_rate": 3.62903364876737e-05, + "loss": 0.0457, + "step": 433 + }, + { + "epoch": 0.8366265060240964, + "grad_norm": 0.5649561882019043, + "learning_rate": 3.626427751789606e-05, + "loss": 0.0444, + "step": 434 + }, + { + "epoch": 0.8385542168674699, + "grad_norm": 0.3864995539188385, + "learning_rate": 3.623813676605405e-05, + "loss": 0.0223, + "step": 435 + }, + { + "epoch": 0.8404819277108434, + "grad_norm": 1.2134298086166382, + "learning_rate": 3.621191436359186e-05, + "loss": 0.0353, + "step": 436 + }, + { + "epoch": 0.8424096385542169, + "grad_norm": 0.4403415024280548, + "learning_rate": 3.6185610442364246e-05, + "loss": 0.0216, + "step": 437 + }, + { + "epoch": 0.8443373493975903, + "grad_norm": 0.6050297021865845, + "learning_rate": 3.6159225134635846e-05, + "loss": 0.0433, + "step": 438 + }, + { + "epoch": 0.8462650602409638, + "grad_norm": 0.7951678037643433, + "learning_rate": 3.6132758573080556e-05, + "loss": 0.031, + "step": 439 + }, + { + "epoch": 0.8481927710843373, + "grad_norm": 0.4991949796676636, + "learning_rate": 3.6106210890780834e-05, + "loss": 0.0313, + "step": 440 + }, + { + "epoch": 0.8501204819277108, + "grad_norm": 0.47951385378837585, + "learning_rate": 3.607958222122704e-05, + "loss": 0.0218, + "step": 441 + }, + { + "epoch": 0.8520481927710843, + "grad_norm": 0.7345194220542908, + "learning_rate": 3.6052872698316755e-05, + "loss": 0.0239, + "step": 442 + }, + { + "epoch": 0.8539759036144579, + "grad_norm": 1.4814884662628174, + "learning_rate": 3.602608245635414e-05, + "loss": 0.0127, + "step": 443 + }, + { + "epoch": 0.8559036144578314, + "grad_norm": 2.4240877628326416, + "learning_rate": 3.599921163004922e-05, + "loss": 0.0618, + "step": 444 + }, + { + "epoch": 0.8578313253012049, + "grad_norm": 0.41523510217666626, + "learning_rate": 3.5972260354517216e-05, + "loss": 0.0283, + "step": 445 + }, + { + "epoch": 0.8597590361445783, + "grad_norm": 0.5577677488327026, + "learning_rate": 3.594522876527791e-05, + "loss": 0.0271, + "step": 446 + }, + { + "epoch": 0.8616867469879518, + "grad_norm": 0.5829064846038818, + "learning_rate": 3.591811699825487e-05, + "loss": 0.0169, + "step": 447 + }, + { + "epoch": 0.8636144578313253, + "grad_norm": 0.4478822350502014, + "learning_rate": 3.5890925189774886e-05, + "loss": 0.0239, + "step": 448 + }, + { + "epoch": 0.8655421686746988, + "grad_norm": 0.3498048782348633, + "learning_rate": 3.586365347656718e-05, + "loss": 0.0137, + "step": 449 + }, + { + "epoch": 0.8674698795180723, + "grad_norm": 0.6571130156517029, + "learning_rate": 3.583630199576278e-05, + "loss": 0.027, + "step": 450 + }, + { + "epoch": 0.8693975903614458, + "grad_norm": 0.344970166683197, + "learning_rate": 3.58088708848938e-05, + "loss": 0.0167, + "step": 451 + }, + { + "epoch": 0.8713253012048193, + "grad_norm": 0.34611570835113525, + "learning_rate": 3.5781360281892775e-05, + "loss": 0.0468, + "step": 452 + }, + { + "epoch": 0.8732530120481927, + "grad_norm": 0.66157066822052, + "learning_rate": 3.575377032509194e-05, + "loss": 0.0344, + "step": 453 + }, + { + "epoch": 0.8751807228915662, + "grad_norm": 0.3676326870918274, + "learning_rate": 3.5726101153222534e-05, + "loss": 0.0366, + "step": 454 + }, + { + "epoch": 0.8771084337349397, + "grad_norm": 0.5958423018455505, + "learning_rate": 3.569835290541414e-05, + "loss": 0.0382, + "step": 455 + }, + { + "epoch": 0.8790361445783132, + "grad_norm": 0.36787471175193787, + "learning_rate": 3.567052572119397e-05, + "loss": 0.018, + "step": 456 + }, + { + "epoch": 0.8809638554216868, + "grad_norm": 0.9478234052658081, + "learning_rate": 3.564261974048611e-05, + "loss": 0.0179, + "step": 457 + }, + { + "epoch": 0.8828915662650603, + "grad_norm": 0.3337579369544983, + "learning_rate": 3.56146351036109e-05, + "loss": 0.0147, + "step": 458 + }, + { + "epoch": 0.8848192771084338, + "grad_norm": 0.4984932243824005, + "learning_rate": 3.558657195128416e-05, + "loss": 0.0224, + "step": 459 + }, + { + "epoch": 0.8867469879518072, + "grad_norm": 0.36718735098838806, + "learning_rate": 3.555843042461653e-05, + "loss": 0.0202, + "step": 460 + }, + { + "epoch": 0.8886746987951807, + "grad_norm": 0.4081745445728302, + "learning_rate": 3.553021066511274e-05, + "loss": 0.0288, + "step": 461 + }, + { + "epoch": 0.8906024096385542, + "grad_norm": 0.3233242332935333, + "learning_rate": 3.55019128146709e-05, + "loss": 0.0362, + "step": 462 + }, + { + "epoch": 0.8925301204819277, + "grad_norm": 0.6560158729553223, + "learning_rate": 3.547353701558178e-05, + "loss": 0.038, + "step": 463 + }, + { + "epoch": 0.8944578313253012, + "grad_norm": 0.47668641805648804, + "learning_rate": 3.544508341052811e-05, + "loss": 0.0399, + "step": 464 + }, + { + "epoch": 0.8963855421686747, + "grad_norm": 0.45512664318084717, + "learning_rate": 3.541655214258383e-05, + "loss": 0.022, + "step": 465 + }, + { + "epoch": 0.8983132530120482, + "grad_norm": 0.8410730361938477, + "learning_rate": 3.538794335521343e-05, + "loss": 0.0315, + "step": 466 + }, + { + "epoch": 0.9002409638554217, + "grad_norm": 0.4872909486293793, + "learning_rate": 3.535925719227117e-05, + "loss": 0.0152, + "step": 467 + }, + { + "epoch": 0.9021686746987951, + "grad_norm": 0.45623311400413513, + "learning_rate": 3.533049379800038e-05, + "loss": 0.0305, + "step": 468 + }, + { + "epoch": 0.9040963855421686, + "grad_norm": 0.43087029457092285, + "learning_rate": 3.530165331703275e-05, + "loss": 0.0131, + "step": 469 + }, + { + "epoch": 0.9060240963855422, + "grad_norm": 0.4610525369644165, + "learning_rate": 3.527273589438756e-05, + "loss": 0.0187, + "step": 470 + }, + { + "epoch": 0.9079518072289157, + "grad_norm": 0.3356114327907562, + "learning_rate": 3.5243741675471006e-05, + "loss": 0.0185, + "step": 471 + }, + { + "epoch": 0.9098795180722892, + "grad_norm": 0.9065960049629211, + "learning_rate": 3.5214670806075426e-05, + "loss": 0.0433, + "step": 472 + }, + { + "epoch": 0.9118072289156627, + "grad_norm": 0.3652578294277191, + "learning_rate": 3.518552343237858e-05, + "loss": 0.02, + "step": 473 + }, + { + "epoch": 0.9137349397590362, + "grad_norm": 0.32377883791923523, + "learning_rate": 3.5156299700942916e-05, + "loss": 0.0165, + "step": 474 + }, + { + "epoch": 0.9156626506024096, + "grad_norm": 0.2431817352771759, + "learning_rate": 3.512699975871485e-05, + "loss": 0.0172, + "step": 475 + }, + { + "epoch": 0.9175903614457831, + "grad_norm": 0.6390707492828369, + "learning_rate": 3.509762375302399e-05, + "loss": 0.0356, + "step": 476 + }, + { + "epoch": 0.9195180722891566, + "grad_norm": 0.2283092886209488, + "learning_rate": 3.506817183158243e-05, + "loss": 0.0088, + "step": 477 + }, + { + "epoch": 0.9214457831325301, + "grad_norm": 0.5053914189338684, + "learning_rate": 3.5038644142483966e-05, + "loss": 0.0389, + "step": 478 + }, + { + "epoch": 0.9233734939759036, + "grad_norm": 0.2567576467990875, + "learning_rate": 3.500904083420342e-05, + "loss": 0.0155, + "step": 479 + }, + { + "epoch": 0.9253012048192771, + "grad_norm": 0.6852384209632874, + "learning_rate": 3.497936205559583e-05, + "loss": 0.0247, + "step": 480 + }, + { + "epoch": 0.9272289156626506, + "grad_norm": 0.36403414607048035, + "learning_rate": 3.494960795589572e-05, + "loss": 0.023, + "step": 481 + }, + { + "epoch": 0.929156626506024, + "grad_norm": 0.506554901599884, + "learning_rate": 3.491977868471635e-05, + "loss": 0.0273, + "step": 482 + }, + { + "epoch": 0.9310843373493976, + "grad_norm": 0.38329923152923584, + "learning_rate": 3.4889874392048985e-05, + "loss": 0.0169, + "step": 483 + }, + { + "epoch": 0.9330120481927711, + "grad_norm": 0.2805836498737335, + "learning_rate": 3.48598952282621e-05, + "loss": 0.0105, + "step": 484 + }, + { + "epoch": 0.9349397590361446, + "grad_norm": 0.6315302848815918, + "learning_rate": 3.482984134410067e-05, + "loss": 0.0289, + "step": 485 + }, + { + "epoch": 0.9368674698795181, + "grad_norm": 0.6431388854980469, + "learning_rate": 3.479971289068537e-05, + "loss": 0.0311, + "step": 486 + }, + { + "epoch": 0.9387951807228916, + "grad_norm": 0.9794723391532898, + "learning_rate": 3.476951001951184e-05, + "loss": 0.0452, + "step": 487 + }, + { + "epoch": 0.9407228915662651, + "grad_norm": 0.7984824180603027, + "learning_rate": 3.473923288244991e-05, + "loss": 0.0689, + "step": 488 + }, + { + "epoch": 0.9426506024096386, + "grad_norm": 0.46362006664276123, + "learning_rate": 3.470888163174286e-05, + "loss": 0.0241, + "step": 489 + }, + { + "epoch": 0.944578313253012, + "grad_norm": 0.5051195025444031, + "learning_rate": 3.467845642000661e-05, + "loss": 0.0228, + "step": 490 + }, + { + "epoch": 0.9465060240963855, + "grad_norm": 0.3082812428474426, + "learning_rate": 3.4647957400229004e-05, + "loss": 0.0144, + "step": 491 + }, + { + "epoch": 0.948433734939759, + "grad_norm": 0.2691391110420227, + "learning_rate": 3.461738472576902e-05, + "loss": 0.0167, + "step": 492 + }, + { + "epoch": 0.9503614457831325, + "grad_norm": 0.5627671480178833, + "learning_rate": 3.458673855035597e-05, + "loss": 0.031, + "step": 493 + }, + { + "epoch": 0.952289156626506, + "grad_norm": 0.4571435749530792, + "learning_rate": 3.455601902808876e-05, + "loss": 0.0191, + "step": 494 + }, + { + "epoch": 0.9542168674698795, + "grad_norm": 1.0117709636688232, + "learning_rate": 3.452522631343515e-05, + "loss": 0.0192, + "step": 495 + }, + { + "epoch": 0.9561445783132531, + "grad_norm": 0.28375712037086487, + "learning_rate": 3.449436056123086e-05, + "loss": 0.0159, + "step": 496 + }, + { + "epoch": 0.9580722891566265, + "grad_norm": 0.26381856203079224, + "learning_rate": 3.446342192667893e-05, + "loss": 0.0113, + "step": 497 + }, + { + "epoch": 0.96, + "grad_norm": 0.49317577481269836, + "learning_rate": 3.443241056534884e-05, + "loss": 0.0332, + "step": 498 + }, + { + "epoch": 0.9619277108433735, + "grad_norm": 0.28884485363960266, + "learning_rate": 3.440132663317579e-05, + "loss": 0.0117, + "step": 499 + }, + { + "epoch": 0.963855421686747, + "grad_norm": 0.36255285143852234, + "learning_rate": 3.4370170286459864e-05, + "loss": 0.0169, + "step": 500 + }, + { + "epoch": 0.9657831325301205, + "grad_norm": 0.4265049993991852, + "learning_rate": 3.433894168186529e-05, + "loss": 0.0217, + "step": 501 + }, + { + "epoch": 0.967710843373494, + "grad_norm": 0.8169426321983337, + "learning_rate": 3.430764097641962e-05, + "loss": 0.0207, + "step": 502 + }, + { + "epoch": 0.9696385542168675, + "grad_norm": 1.866077184677124, + "learning_rate": 3.427626832751296e-05, + "loss": 0.0381, + "step": 503 + }, + { + "epoch": 0.971566265060241, + "grad_norm": 0.33124980330467224, + "learning_rate": 3.424482389289716e-05, + "loss": 0.0245, + "step": 504 + }, + { + "epoch": 0.9734939759036144, + "grad_norm": 0.37479540705680847, + "learning_rate": 3.4213307830685055e-05, + "loss": 0.0164, + "step": 505 + }, + { + "epoch": 0.9754216867469879, + "grad_norm": 0.39738863706588745, + "learning_rate": 3.4181720299349615e-05, + "loss": 0.0297, + "step": 506 + }, + { + "epoch": 0.9773493975903614, + "grad_norm": 0.2567287087440491, + "learning_rate": 3.4150061457723205e-05, + "loss": 0.0102, + "step": 507 + }, + { + "epoch": 0.9792771084337349, + "grad_norm": 0.6230517029762268, + "learning_rate": 3.411833146499675e-05, + "loss": 0.0243, + "step": 508 + }, + { + "epoch": 0.9812048192771085, + "grad_norm": 0.44843971729278564, + "learning_rate": 3.408653048071894e-05, + "loss": 0.0357, + "step": 509 + }, + { + "epoch": 0.983132530120482, + "grad_norm": 1.0569655895233154, + "learning_rate": 3.405465866479546e-05, + "loss": 0.037, + "step": 510 + }, + { + "epoch": 0.9850602409638555, + "grad_norm": 0.29000964760780334, + "learning_rate": 3.402271617748812e-05, + "loss": 0.0129, + "step": 511 + }, + { + "epoch": 0.9869879518072289, + "grad_norm": 2.1627447605133057, + "learning_rate": 3.399070317941413e-05, + "loss": 0.0442, + "step": 512 + }, + { + "epoch": 0.9889156626506024, + "grad_norm": 0.27371272444725037, + "learning_rate": 3.395861983154522e-05, + "loss": 0.0119, + "step": 513 + }, + { + "epoch": 0.9908433734939759, + "grad_norm": 0.4117226302623749, + "learning_rate": 3.392646629520688e-05, + "loss": 0.0455, + "step": 514 + }, + { + "epoch": 0.9927710843373494, + "grad_norm": 0.5098996758460999, + "learning_rate": 3.389424273207752e-05, + "loss": 0.0203, + "step": 515 + }, + { + "epoch": 0.9946987951807229, + "grad_norm": 0.5192157626152039, + "learning_rate": 3.386194930418767e-05, + "loss": 0.0329, + "step": 516 + }, + { + "epoch": 0.9966265060240964, + "grad_norm": 0.18757697939872742, + "learning_rate": 3.382958617391915e-05, + "loss": 0.0065, + "step": 517 + }, + { + "epoch": 0.9985542168674699, + "grad_norm": 0.3334413170814514, + "learning_rate": 3.3797153504004296e-05, + "loss": 0.0266, + "step": 518 + }, + { + "epoch": 1.0, + "grad_norm": 0.4152225852012634, + "learning_rate": 3.3764651457525095e-05, + "loss": 0.0169, + "step": 519 + }, + { + "epoch": 1.0019277108433735, + "grad_norm": 0.43535247445106506, + "learning_rate": 3.373208019791237e-05, + "loss": 0.0221, + "step": 520 + }, + { + "epoch": 1.003855421686747, + "grad_norm": 0.39292722940444946, + "learning_rate": 3.3699439888945e-05, + "loss": 0.0211, + "step": 521 + }, + { + "epoch": 1.0057831325301205, + "grad_norm": 0.19566713273525238, + "learning_rate": 3.366673069474904e-05, + "loss": 0.0069, + "step": 522 + }, + { + "epoch": 1.007710843373494, + "grad_norm": 0.5101853609085083, + "learning_rate": 3.3633952779796914e-05, + "loss": 0.0191, + "step": 523 + }, + { + "epoch": 1.0096385542168675, + "grad_norm": 0.999434769153595, + "learning_rate": 3.360110630890664e-05, + "loss": 0.0196, + "step": 524 + }, + { + "epoch": 1.011566265060241, + "grad_norm": 0.4646223783493042, + "learning_rate": 3.356819144724092e-05, + "loss": 0.0328, + "step": 525 + }, + { + "epoch": 1.0134939759036146, + "grad_norm": 0.3132480978965759, + "learning_rate": 3.3535208360306354e-05, + "loss": 0.0203, + "step": 526 + }, + { + "epoch": 1.0154216867469879, + "grad_norm": 0.3038032352924347, + "learning_rate": 3.350215721395261e-05, + "loss": 0.0122, + "step": 527 + }, + { + "epoch": 1.0173493975903614, + "grad_norm": 0.45082882046699524, + "learning_rate": 3.346903817437157e-05, + "loss": 0.0437, + "step": 528 + }, + { + "epoch": 1.0192771084337349, + "grad_norm": 0.26917046308517456, + "learning_rate": 3.343585140809651e-05, + "loss": 0.013, + "step": 529 + }, + { + "epoch": 1.0212048192771084, + "grad_norm": 0.23869264125823975, + "learning_rate": 3.3402597082001276e-05, + "loss": 0.008, + "step": 530 + }, + { + "epoch": 1.0231325301204819, + "grad_norm": 0.31315353512763977, + "learning_rate": 3.3369275363299394e-05, + "loss": 0.0078, + "step": 531 + }, + { + "epoch": 1.0250602409638554, + "grad_norm": 0.4780346751213074, + "learning_rate": 3.333588641954327e-05, + "loss": 0.0225, + "step": 532 + }, + { + "epoch": 1.026987951807229, + "grad_norm": 0.2920368015766144, + "learning_rate": 3.330243041862336e-05, + "loss": 0.0118, + "step": 533 + }, + { + "epoch": 1.0289156626506024, + "grad_norm": 0.543669581413269, + "learning_rate": 3.326890752876728e-05, + "loss": 0.0338, + "step": 534 + }, + { + "epoch": 1.030843373493976, + "grad_norm": 0.4288000464439392, + "learning_rate": 3.323531791853901e-05, + "loss": 0.0341, + "step": 535 + }, + { + "epoch": 1.0327710843373494, + "grad_norm": 0.26600322127342224, + "learning_rate": 3.3201661756838e-05, + "loss": 0.0184, + "step": 536 + }, + { + "epoch": 1.034698795180723, + "grad_norm": 0.290937602519989, + "learning_rate": 3.316793921289835e-05, + "loss": 0.0152, + "step": 537 + }, + { + "epoch": 1.0366265060240965, + "grad_norm": 0.7621443271636963, + "learning_rate": 3.313415045628795e-05, + "loss": 0.0326, + "step": 538 + }, + { + "epoch": 1.03855421686747, + "grad_norm": 0.5581283569335938, + "learning_rate": 3.3100295656907646e-05, + "loss": 0.0164, + "step": 539 + }, + { + "epoch": 1.0404819277108435, + "grad_norm": 0.20930901169776917, + "learning_rate": 3.306637498499034e-05, + "loss": 0.0091, + "step": 540 + }, + { + "epoch": 1.0424096385542168, + "grad_norm": 0.46212059259414673, + "learning_rate": 3.303238861110018e-05, + "loss": 0.0118, + "step": 541 + }, + { + "epoch": 1.0443373493975903, + "grad_norm": 0.38259151577949524, + "learning_rate": 3.299833670613168e-05, + "loss": 0.0081, + "step": 542 + }, + { + "epoch": 1.0462650602409638, + "grad_norm": 0.4888618290424347, + "learning_rate": 3.2964219441308865e-05, + "loss": 0.0138, + "step": 543 + }, + { + "epoch": 1.0481927710843373, + "grad_norm": 0.32103127241134644, + "learning_rate": 3.2930036988184425e-05, + "loss": 0.0171, + "step": 544 + }, + { + "epoch": 1.0501204819277108, + "grad_norm": 0.27787327766418457, + "learning_rate": 3.28957895186388e-05, + "loss": 0.0106, + "step": 545 + }, + { + "epoch": 1.0520481927710843, + "grad_norm": 0.35597777366638184, + "learning_rate": 3.2861477204879395e-05, + "loss": 0.0123, + "step": 546 + }, + { + "epoch": 1.0539759036144578, + "grad_norm": 0.3619804084300995, + "learning_rate": 3.2827100219439656e-05, + "loss": 0.0088, + "step": 547 + }, + { + "epoch": 1.0559036144578313, + "grad_norm": 0.2525513470172882, + "learning_rate": 3.279265873517822e-05, + "loss": 0.0179, + "step": 548 + }, + { + "epoch": 1.0578313253012048, + "grad_norm": 0.3910020887851715, + "learning_rate": 3.275815292527804e-05, + "loss": 0.0142, + "step": 549 + }, + { + "epoch": 1.0597590361445783, + "grad_norm": 0.30515050888061523, + "learning_rate": 3.2723582963245526e-05, + "loss": 0.0123, + "step": 550 + }, + { + "epoch": 1.0616867469879518, + "grad_norm": 0.21708644926548004, + "learning_rate": 3.2688949022909665e-05, + "loss": 0.0098, + "step": 551 + }, + { + "epoch": 1.0636144578313254, + "grad_norm": 0.23307719826698303, + "learning_rate": 3.265425127842114e-05, + "loss": 0.0097, + "step": 552 + }, + { + "epoch": 1.0655421686746989, + "grad_norm": 0.676654577255249, + "learning_rate": 3.261948990425147e-05, + "loss": 0.0227, + "step": 553 + }, + { + "epoch": 1.0674698795180724, + "grad_norm": 0.4593975841999054, + "learning_rate": 3.258466507519213e-05, + "loss": 0.047, + "step": 554 + }, + { + "epoch": 1.0693975903614459, + "grad_norm": 0.19405829906463623, + "learning_rate": 3.254977696635366e-05, + "loss": 0.0314, + "step": 555 + }, + { + "epoch": 1.0713253012048192, + "grad_norm": 0.14563389122486115, + "learning_rate": 3.2514825753164774e-05, + "loss": 0.0046, + "step": 556 + }, + { + "epoch": 1.0732530120481927, + "grad_norm": 0.2642340064048767, + "learning_rate": 3.247981161137153e-05, + "loss": 0.022, + "step": 557 + }, + { + "epoch": 1.0751807228915662, + "grad_norm": 0.17274761199951172, + "learning_rate": 3.2444734717036386e-05, + "loss": 0.0134, + "step": 558 + }, + { + "epoch": 1.0771084337349397, + "grad_norm": 0.44354626536369324, + "learning_rate": 3.240959524653735e-05, + "loss": 0.0211, + "step": 559 + }, + { + "epoch": 1.0790361445783132, + "grad_norm": 0.2806888818740845, + "learning_rate": 3.237439337656708e-05, + "loss": 0.0141, + "step": 560 + }, + { + "epoch": 1.0809638554216867, + "grad_norm": 0.21679501235485077, + "learning_rate": 3.2339129284131994e-05, + "loss": 0.019, + "step": 561 + }, + { + "epoch": 1.0828915662650602, + "grad_norm": 0.3040260076522827, + "learning_rate": 3.2303803146551386e-05, + "loss": 0.0249, + "step": 562 + }, + { + "epoch": 1.0848192771084337, + "grad_norm": 0.2793775200843811, + "learning_rate": 3.226841514145656e-05, + "loss": 0.0088, + "step": 563 + }, + { + "epoch": 1.0867469879518072, + "grad_norm": 0.149955615401268, + "learning_rate": 3.223296544678987e-05, + "loss": 0.0054, + "step": 564 + }, + { + "epoch": 1.0886746987951808, + "grad_norm": 0.22166767716407776, + "learning_rate": 3.219745424080389e-05, + "loss": 0.0109, + "step": 565 + }, + { + "epoch": 1.0906024096385543, + "grad_norm": 0.22399431467056274, + "learning_rate": 3.2161881702060476e-05, + "loss": 0.0106, + "step": 566 + }, + { + "epoch": 1.0925301204819278, + "grad_norm": 0.18537986278533936, + "learning_rate": 3.2126248009429905e-05, + "loss": 0.0077, + "step": 567 + }, + { + "epoch": 1.0944578313253013, + "grad_norm": 0.24511495232582092, + "learning_rate": 3.2090553342089935e-05, + "loss": 0.0093, + "step": 568 + }, + { + "epoch": 1.0963855421686748, + "grad_norm": 0.4766045808792114, + "learning_rate": 3.205479787952494e-05, + "loss": 0.036, + "step": 569 + }, + { + "epoch": 1.0983132530120483, + "grad_norm": 0.1425715535879135, + "learning_rate": 3.201898180152499e-05, + "loss": 0.0085, + "step": 570 + }, + { + "epoch": 1.1002409638554216, + "grad_norm": 0.1909666359424591, + "learning_rate": 3.1983105288184945e-05, + "loss": 0.0081, + "step": 571 + }, + { + "epoch": 1.102168674698795, + "grad_norm": 0.44077104330062866, + "learning_rate": 3.194716851990355e-05, + "loss": 0.017, + "step": 572 + }, + { + "epoch": 1.1040963855421686, + "grad_norm": 0.5757400989532471, + "learning_rate": 3.191117167738253e-05, + "loss": 0.021, + "step": 573 + }, + { + "epoch": 1.106024096385542, + "grad_norm": 0.1977701038122177, + "learning_rate": 3.1875114941625705e-05, + "loss": 0.0096, + "step": 574 + }, + { + "epoch": 1.1079518072289156, + "grad_norm": 0.3524581491947174, + "learning_rate": 3.1838998493938026e-05, + "loss": 0.0118, + "step": 575 + }, + { + "epoch": 1.1098795180722891, + "grad_norm": 0.3301331698894501, + "learning_rate": 3.180282251592472e-05, + "loss": 0.0094, + "step": 576 + }, + { + "epoch": 1.1118072289156626, + "grad_norm": 0.2774488925933838, + "learning_rate": 3.1766587189490336e-05, + "loss": 0.0131, + "step": 577 + }, + { + "epoch": 1.1137349397590361, + "grad_norm": 1.732595443725586, + "learning_rate": 3.173029269683785e-05, + "loss": 0.0445, + "step": 578 + }, + { + "epoch": 1.1156626506024097, + "grad_norm": 0.28746843338012695, + "learning_rate": 3.169393922046776e-05, + "loss": 0.0116, + "step": 579 + }, + { + "epoch": 1.1175903614457832, + "grad_norm": 0.2952995002269745, + "learning_rate": 3.165752694317713e-05, + "loss": 0.0116, + "step": 580 + }, + { + "epoch": 1.1195180722891567, + "grad_norm": 0.2938575744628906, + "learning_rate": 3.16210560480587e-05, + "loss": 0.013, + "step": 581 + }, + { + "epoch": 1.1214457831325302, + "grad_norm": 0.22283495962619781, + "learning_rate": 3.158452671849998e-05, + "loss": 0.0052, + "step": 582 + }, + { + "epoch": 1.1233734939759037, + "grad_norm": 0.6272858381271362, + "learning_rate": 3.154793913818226e-05, + "loss": 0.0182, + "step": 583 + }, + { + "epoch": 1.1253012048192772, + "grad_norm": 0.479753702878952, + "learning_rate": 3.1511293491079804e-05, + "loss": 0.0146, + "step": 584 + }, + { + "epoch": 1.1272289156626507, + "grad_norm": 0.31104400753974915, + "learning_rate": 3.1474589961458786e-05, + "loss": 0.0139, + "step": 585 + }, + { + "epoch": 1.129156626506024, + "grad_norm": 0.4932832419872284, + "learning_rate": 3.1437828733876477e-05, + "loss": 0.0236, + "step": 586 + }, + { + "epoch": 1.1310843373493975, + "grad_norm": 0.222808837890625, + "learning_rate": 3.140100999318025e-05, + "loss": 0.0084, + "step": 587 + }, + { + "epoch": 1.133012048192771, + "grad_norm": 0.4515356719493866, + "learning_rate": 3.136413392450668e-05, + "loss": 0.0215, + "step": 588 + }, + { + "epoch": 1.1349397590361445, + "grad_norm": 0.39302268624305725, + "learning_rate": 3.132720071328061e-05, + "loss": 0.0154, + "step": 589 + }, + { + "epoch": 1.136867469879518, + "grad_norm": 0.43382835388183594, + "learning_rate": 3.1290210545214205e-05, + "loss": 0.0088, + "step": 590 + }, + { + "epoch": 1.1387951807228915, + "grad_norm": 0.18707136809825897, + "learning_rate": 3.125316360630602e-05, + "loss": 0.0126, + "step": 591 + }, + { + "epoch": 1.140722891566265, + "grad_norm": 0.5688219666481018, + "learning_rate": 3.121606008284011e-05, + "loss": 0.0147, + "step": 592 + }, + { + "epoch": 1.1426506024096386, + "grad_norm": 0.3321833312511444, + "learning_rate": 3.1178900161385005e-05, + "loss": 0.0119, + "step": 593 + }, + { + "epoch": 1.144578313253012, + "grad_norm": 0.3738424777984619, + "learning_rate": 3.114168402879286e-05, + "loss": 0.0158, + "step": 594 + }, + { + "epoch": 1.1465060240963856, + "grad_norm": 0.2386978417634964, + "learning_rate": 3.110441187219846e-05, + "loss": 0.0107, + "step": 595 + }, + { + "epoch": 1.148433734939759, + "grad_norm": 0.2165699452161789, + "learning_rate": 3.10670838790183e-05, + "loss": 0.0079, + "step": 596 + }, + { + "epoch": 1.1503614457831326, + "grad_norm": 0.25952696800231934, + "learning_rate": 3.102970023694965e-05, + "loss": 0.0147, + "step": 597 + }, + { + "epoch": 1.152289156626506, + "grad_norm": 0.21448305249214172, + "learning_rate": 3.099226113396959e-05, + "loss": 0.0099, + "step": 598 + }, + { + "epoch": 1.1542168674698796, + "grad_norm": 0.37226060032844543, + "learning_rate": 3.095476675833405e-05, + "loss": 0.0214, + "step": 599 + }, + { + "epoch": 1.1561445783132531, + "grad_norm": 0.29637983441352844, + "learning_rate": 3.0917217298576955e-05, + "loss": 0.0118, + "step": 600 + }, + { + "epoch": 1.1580722891566264, + "grad_norm": 0.18535609543323517, + "learning_rate": 3.0879612943509154e-05, + "loss": 0.0086, + "step": 601 + }, + { + "epoch": 1.16, + "grad_norm": 0.25874125957489014, + "learning_rate": 3.0841953882217536e-05, + "loss": 0.0088, + "step": 602 + }, + { + "epoch": 1.1619277108433734, + "grad_norm": 0.46092745661735535, + "learning_rate": 3.08042403040641e-05, + "loss": 0.0241, + "step": 603 + }, + { + "epoch": 1.163855421686747, + "grad_norm": 0.27023249864578247, + "learning_rate": 3.076647239868494e-05, + "loss": 0.0154, + "step": 604 + }, + { + "epoch": 1.1657831325301204, + "grad_norm": 0.445157527923584, + "learning_rate": 3.072865035598933e-05, + "loss": 0.0197, + "step": 605 + }, + { + "epoch": 1.167710843373494, + "grad_norm": 0.18097272515296936, + "learning_rate": 3.06907743661588e-05, + "loss": 0.0093, + "step": 606 + }, + { + "epoch": 1.1696385542168675, + "grad_norm": 0.22469942271709442, + "learning_rate": 3.065284461964609e-05, + "loss": 0.0171, + "step": 607 + }, + { + "epoch": 1.171566265060241, + "grad_norm": 0.20190906524658203, + "learning_rate": 3.061486130717428e-05, + "loss": 0.008, + "step": 608 + }, + { + "epoch": 1.1734939759036145, + "grad_norm": 0.18294145166873932, + "learning_rate": 3.057682461973579e-05, + "loss": 0.0155, + "step": 609 + }, + { + "epoch": 1.175421686746988, + "grad_norm": 0.34203943610191345, + "learning_rate": 3.053873474859143e-05, + "loss": 0.0212, + "step": 610 + }, + { + "epoch": 1.1773493975903615, + "grad_norm": 0.49073582887649536, + "learning_rate": 3.050059188526942e-05, + "loss": 0.019, + "step": 611 + }, + { + "epoch": 1.179277108433735, + "grad_norm": 0.3537680506706238, + "learning_rate": 3.046239622156446e-05, + "loss": 0.0147, + "step": 612 + }, + { + "epoch": 1.1812048192771085, + "grad_norm": 0.2584632635116577, + "learning_rate": 3.042414794953674e-05, + "loss": 0.0088, + "step": 613 + }, + { + "epoch": 1.1831325301204818, + "grad_norm": 0.3529360890388489, + "learning_rate": 3.0385847261510975e-05, + "loss": 0.0187, + "step": 614 + }, + { + "epoch": 1.1850602409638555, + "grad_norm": 0.3331570327281952, + "learning_rate": 3.0347494350075465e-05, + "loss": 0.0124, + "step": 615 + }, + { + "epoch": 1.1869879518072288, + "grad_norm": 0.2223527580499649, + "learning_rate": 3.0309089408081074e-05, + "loss": 0.01, + "step": 616 + }, + { + "epoch": 1.1889156626506023, + "grad_norm": 0.21985746920108795, + "learning_rate": 3.027063262864032e-05, + "loss": 0.0087, + "step": 617 + }, + { + "epoch": 1.1908433734939758, + "grad_norm": 0.2989653944969177, + "learning_rate": 3.023212420512637e-05, + "loss": 0.0137, + "step": 618 + }, + { + "epoch": 1.1927710843373494, + "grad_norm": 0.17423275113105774, + "learning_rate": 3.0193564331172074e-05, + "loss": 0.0056, + "step": 619 + }, + { + "epoch": 1.1946987951807229, + "grad_norm": 1.0992127656936646, + "learning_rate": 3.0154953200668976e-05, + "loss": 0.0274, + "step": 620 + }, + { + "epoch": 1.1966265060240964, + "grad_norm": 0.21641989052295685, + "learning_rate": 3.011629100776638e-05, + "loss": 0.0151, + "step": 621 + }, + { + "epoch": 1.1985542168674699, + "grad_norm": 0.4558199644088745, + "learning_rate": 3.007757794687033e-05, + "loss": 0.0424, + "step": 622 + }, + { + "epoch": 1.2004819277108434, + "grad_norm": 0.42380189895629883, + "learning_rate": 3.003881421264266e-05, + "loss": 0.0079, + "step": 623 + }, + { + "epoch": 1.202409638554217, + "grad_norm": 0.28791171312332153, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0142, + "step": 624 + }, + { + "epoch": 1.2043373493975904, + "grad_norm": 0.3906581997871399, + "learning_rate": 2.996113550411281e-05, + "loss": 0.0251, + "step": 625 + }, + { + "epoch": 1.206265060240964, + "grad_norm": 0.47848746180534363, + "learning_rate": 2.9922220920404375e-05, + "loss": 0.0137, + "step": 626 + }, + { + "epoch": 1.2081927710843374, + "grad_norm": 0.22666941583156586, + "learning_rate": 2.9883256444549862e-05, + "loss": 0.0105, + "step": 627 + }, + { + "epoch": 1.210120481927711, + "grad_norm": 0.18968136608600616, + "learning_rate": 2.984424227247529e-05, + "loss": 0.0089, + "step": 628 + }, + { + "epoch": 1.2120481927710842, + "grad_norm": 0.28732606768608093, + "learning_rate": 2.980517860035656e-05, + "loss": 0.0253, + "step": 629 + }, + { + "epoch": 1.213975903614458, + "grad_norm": 0.21131543815135956, + "learning_rate": 2.9766065624618518e-05, + "loss": 0.0134, + "step": 630 + }, + { + "epoch": 1.2159036144578312, + "grad_norm": 0.7594877481460571, + "learning_rate": 2.972690354193388e-05, + "loss": 0.0157, + "step": 631 + }, + { + "epoch": 1.2178313253012047, + "grad_norm": 0.730291485786438, + "learning_rate": 2.96876925492223e-05, + "loss": 0.0204, + "step": 632 + }, + { + "epoch": 1.2197590361445783, + "grad_norm": 0.20333674550056458, + "learning_rate": 2.9648432843649382e-05, + "loss": 0.0114, + "step": 633 + }, + { + "epoch": 1.2216867469879518, + "grad_norm": 0.5680793523788452, + "learning_rate": 2.960912462262566e-05, + "loss": 0.0146, + "step": 634 + }, + { + "epoch": 1.2236144578313253, + "grad_norm": 0.4591079354286194, + "learning_rate": 2.9569768083805618e-05, + "loss": 0.0112, + "step": 635 + }, + { + "epoch": 1.2255421686746988, + "grad_norm": 0.3793511390686035, + "learning_rate": 2.953036342508671e-05, + "loss": 0.0377, + "step": 636 + }, + { + "epoch": 1.2274698795180723, + "grad_norm": 1.118723750114441, + "learning_rate": 2.9490910844608346e-05, + "loss": 0.0432, + "step": 637 + }, + { + "epoch": 1.2293975903614458, + "grad_norm": 0.36990776658058167, + "learning_rate": 2.9451410540750887e-05, + "loss": 0.0203, + "step": 638 + }, + { + "epoch": 1.2313253012048193, + "grad_norm": 0.930397629737854, + "learning_rate": 2.94118627121347e-05, + "loss": 0.0311, + "step": 639 + }, + { + "epoch": 1.2332530120481928, + "grad_norm": 0.2347625195980072, + "learning_rate": 2.9372267557619075e-05, + "loss": 0.0168, + "step": 640 + }, + { + "epoch": 1.2351807228915663, + "grad_norm": 0.3720332384109497, + "learning_rate": 2.933262527630131e-05, + "loss": 0.0136, + "step": 641 + }, + { + "epoch": 1.2371084337349398, + "grad_norm": 0.4871984124183655, + "learning_rate": 2.929293606751565e-05, + "loss": 0.0339, + "step": 642 + }, + { + "epoch": 1.2390361445783133, + "grad_norm": 0.35853689908981323, + "learning_rate": 2.9253200130832322e-05, + "loss": 0.0095, + "step": 643 + }, + { + "epoch": 1.2409638554216866, + "grad_norm": 0.42003703117370605, + "learning_rate": 2.92134176660565e-05, + "loss": 0.0142, + "step": 644 + }, + { + "epoch": 1.2428915662650604, + "grad_norm": 0.3854500651359558, + "learning_rate": 2.9173588873227338e-05, + "loss": 0.0209, + "step": 645 + }, + { + "epoch": 1.2448192771084337, + "grad_norm": 0.24665917456150055, + "learning_rate": 2.913371395261691e-05, + "loss": 0.0087, + "step": 646 + }, + { + "epoch": 1.2467469879518072, + "grad_norm": 0.41571593284606934, + "learning_rate": 2.9093793104729268e-05, + "loss": 0.0164, + "step": 647 + }, + { + "epoch": 1.2486746987951807, + "grad_norm": 0.4597891569137573, + "learning_rate": 2.9053826530299377e-05, + "loss": 0.0138, + "step": 648 + }, + { + "epoch": 1.2506024096385542, + "grad_norm": 0.43345385789871216, + "learning_rate": 2.901381443029215e-05, + "loss": 0.0353, + "step": 649 + }, + { + "epoch": 1.2525301204819277, + "grad_norm": 0.3706768751144409, + "learning_rate": 2.897375700590141e-05, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 1.2544578313253012, + "grad_norm": 0.30305296182632446, + "learning_rate": 2.8933654458548873e-05, + "loss": 0.0123, + "step": 651 + }, + { + "epoch": 1.2563855421686747, + "grad_norm": 0.2042127549648285, + "learning_rate": 2.8893506989883167e-05, + "loss": 0.0099, + "step": 652 + }, + { + "epoch": 1.2583132530120482, + "grad_norm": 0.20524422824382782, + "learning_rate": 2.8853314801778784e-05, + "loss": 0.0097, + "step": 653 + }, + { + "epoch": 1.2602409638554217, + "grad_norm": 0.2351921945810318, + "learning_rate": 2.8813078096335093e-05, + "loss": 0.0091, + "step": 654 + }, + { + "epoch": 1.2621686746987952, + "grad_norm": 0.34547340869903564, + "learning_rate": 2.87727970758753e-05, + "loss": 0.0088, + "step": 655 + }, + { + "epoch": 1.2640963855421687, + "grad_norm": 0.35163217782974243, + "learning_rate": 2.8732471942945443e-05, + "loss": 0.0145, + "step": 656 + }, + { + "epoch": 1.266024096385542, + "grad_norm": 1.715137243270874, + "learning_rate": 2.8692102900313378e-05, + "loss": 0.0198, + "step": 657 + }, + { + "epoch": 1.2679518072289158, + "grad_norm": 0.2860178053379059, + "learning_rate": 2.8651690150967748e-05, + "loss": 0.0085, + "step": 658 + }, + { + "epoch": 1.269879518072289, + "grad_norm": 0.21175967156887054, + "learning_rate": 2.8611233898116967e-05, + "loss": 0.0071, + "step": 659 + }, + { + "epoch": 1.2718072289156628, + "grad_norm": 0.33726972341537476, + "learning_rate": 2.85707343451882e-05, + "loss": 0.012, + "step": 660 + }, + { + "epoch": 1.273734939759036, + "grad_norm": 0.2138456553220749, + "learning_rate": 2.853019169582635e-05, + "loss": 0.0092, + "step": 661 + }, + { + "epoch": 1.2756626506024096, + "grad_norm": 0.2304934412240982, + "learning_rate": 2.8489606153892997e-05, + "loss": 0.0144, + "step": 662 + }, + { + "epoch": 1.277590361445783, + "grad_norm": 0.2691061794757843, + "learning_rate": 2.8448977923465425e-05, + "loss": 0.0121, + "step": 663 + }, + { + "epoch": 1.2795180722891566, + "grad_norm": 0.35254305601119995, + "learning_rate": 2.840830720883555e-05, + "loss": 0.0125, + "step": 664 + }, + { + "epoch": 1.28144578313253, + "grad_norm": 0.36552608013153076, + "learning_rate": 2.836759421450893e-05, + "loss": 0.021, + "step": 665 + }, + { + "epoch": 1.2833734939759036, + "grad_norm": 0.37177154421806335, + "learning_rate": 2.83268391452037e-05, + "loss": 0.0216, + "step": 666 + }, + { + "epoch": 1.2853012048192771, + "grad_norm": 0.20932547748088837, + "learning_rate": 2.828604220584958e-05, + "loss": 0.0077, + "step": 667 + }, + { + "epoch": 1.2872289156626506, + "grad_norm": 0.5158557295799255, + "learning_rate": 2.824520360158681e-05, + "loss": 0.0394, + "step": 668 + }, + { + "epoch": 1.2891566265060241, + "grad_norm": 0.22623969614505768, + "learning_rate": 2.820432353776515e-05, + "loss": 0.0087, + "step": 669 + }, + { + "epoch": 1.2910843373493976, + "grad_norm": 0.2996046245098114, + "learning_rate": 2.8163402219942822e-05, + "loss": 0.01, + "step": 670 + }, + { + "epoch": 1.2930120481927712, + "grad_norm": 0.24957989156246185, + "learning_rate": 2.8122439853885488e-05, + "loss": 0.0127, + "step": 671 + }, + { + "epoch": 1.2949397590361444, + "grad_norm": 0.2636559307575226, + "learning_rate": 2.8081436645565216e-05, + "loss": 0.0128, + "step": 672 + }, + { + "epoch": 1.2968674698795182, + "grad_norm": 0.3531591296195984, + "learning_rate": 2.804039280115944e-05, + "loss": 0.0199, + "step": 673 + }, + { + "epoch": 1.2987951807228915, + "grad_norm": 0.3682299852371216, + "learning_rate": 2.7999308527049927e-05, + "loss": 0.0088, + "step": 674 + }, + { + "epoch": 1.3007228915662652, + "grad_norm": 0.19555217027664185, + "learning_rate": 2.795818402982174e-05, + "loss": 0.0084, + "step": 675 + }, + { + "epoch": 1.3026506024096385, + "grad_norm": 0.2864912450313568, + "learning_rate": 2.7917019516262186e-05, + "loss": 0.0154, + "step": 676 + }, + { + "epoch": 1.304578313253012, + "grad_norm": 0.2211237996816635, + "learning_rate": 2.78758151933598e-05, + "loss": 0.0078, + "step": 677 + }, + { + "epoch": 1.3065060240963855, + "grad_norm": 0.13646945357322693, + "learning_rate": 2.7834571268303294e-05, + "loss": 0.0058, + "step": 678 + }, + { + "epoch": 1.308433734939759, + "grad_norm": 0.16530285775661469, + "learning_rate": 2.779328794848049e-05, + "loss": 0.007, + "step": 679 + }, + { + "epoch": 1.3103614457831325, + "grad_norm": 0.2145693302154541, + "learning_rate": 2.7751965441477325e-05, + "loss": 0.0203, + "step": 680 + }, + { + "epoch": 1.312289156626506, + "grad_norm": 0.24273739755153656, + "learning_rate": 2.771060395507677e-05, + "loss": 0.0106, + "step": 681 + }, + { + "epoch": 1.3142168674698795, + "grad_norm": 0.20430618524551392, + "learning_rate": 2.7669203697257794e-05, + "loss": 0.0122, + "step": 682 + }, + { + "epoch": 1.316144578313253, + "grad_norm": 0.2502615749835968, + "learning_rate": 2.7627764876194335e-05, + "loss": 0.0101, + "step": 683 + }, + { + "epoch": 1.3180722891566266, + "grad_norm": 0.287239670753479, + "learning_rate": 2.7586287700254214e-05, + "loss": 0.0203, + "step": 684 + }, + { + "epoch": 1.32, + "grad_norm": 0.16239754855632782, + "learning_rate": 2.7544772377998147e-05, + "loss": 0.0084, + "step": 685 + }, + { + "epoch": 1.3219277108433736, + "grad_norm": 0.27174142003059387, + "learning_rate": 2.7503219118178636e-05, + "loss": 0.008, + "step": 686 + }, + { + "epoch": 1.3238554216867469, + "grad_norm": 0.12878240644931793, + "learning_rate": 2.7461628129738954e-05, + "loss": 0.0053, + "step": 687 + }, + { + "epoch": 1.3257831325301206, + "grad_norm": 0.16112515330314636, + "learning_rate": 2.7419999621812086e-05, + "loss": 0.0059, + "step": 688 + }, + { + "epoch": 1.3277108433734939, + "grad_norm": 0.2398834228515625, + "learning_rate": 2.7378333803719672e-05, + "loss": 0.0095, + "step": 689 + }, + { + "epoch": 1.3296385542168676, + "grad_norm": 0.18516193330287933, + "learning_rate": 2.733663088497097e-05, + "loss": 0.0071, + "step": 690 + }, + { + "epoch": 1.331566265060241, + "grad_norm": 0.2974924147129059, + "learning_rate": 2.7294891075261785e-05, + "loss": 0.0227, + "step": 691 + }, + { + "epoch": 1.3334939759036144, + "grad_norm": 0.12931054830551147, + "learning_rate": 2.7253114584473418e-05, + "loss": 0.0039, + "step": 692 + }, + { + "epoch": 1.335421686746988, + "grad_norm": 0.16319474577903748, + "learning_rate": 2.7211301622671623e-05, + "loss": 0.008, + "step": 693 + }, + { + "epoch": 1.3373493975903614, + "grad_norm": 0.27622169256210327, + "learning_rate": 2.7169452400105533e-05, + "loss": 0.0238, + "step": 694 + }, + { + "epoch": 1.339277108433735, + "grad_norm": 0.45309779047966003, + "learning_rate": 2.712756712720663e-05, + "loss": 0.0439, + "step": 695 + }, + { + "epoch": 1.3412048192771084, + "grad_norm": 0.2469855099916458, + "learning_rate": 2.708564601458765e-05, + "loss": 0.0085, + "step": 696 + }, + { + "epoch": 1.343132530120482, + "grad_norm": 0.4245856702327728, + "learning_rate": 2.7043689273041535e-05, + "loss": 0.0097, + "step": 697 + }, + { + "epoch": 1.3450602409638555, + "grad_norm": 0.26796087622642517, + "learning_rate": 2.7001697113540414e-05, + "loss": 0.0119, + "step": 698 + }, + { + "epoch": 1.346987951807229, + "grad_norm": 0.3569283187389374, + "learning_rate": 2.6959669747234482e-05, + "loss": 0.0096, + "step": 699 + }, + { + "epoch": 1.3489156626506025, + "grad_norm": 0.7038524150848389, + "learning_rate": 2.6917607385450973e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 1.350843373493976, + "grad_norm": 0.23568563163280487, + "learning_rate": 2.687551023969308e-05, + "loss": 0.0112, + "step": 701 + }, + { + "epoch": 1.3527710843373493, + "grad_norm": 0.20338499546051025, + "learning_rate": 2.6833378521638935e-05, + "loss": 0.0092, + "step": 702 + }, + { + "epoch": 1.354698795180723, + "grad_norm": 4.22187614440918, + "learning_rate": 2.679121244314046e-05, + "loss": 0.0314, + "step": 703 + }, + { + "epoch": 1.3566265060240963, + "grad_norm": 0.2542206048965454, + "learning_rate": 2.674901221622239e-05, + "loss": 0.0158, + "step": 704 + }, + { + "epoch": 1.3585542168674698, + "grad_norm": 0.49705010652542114, + "learning_rate": 2.670677805308116e-05, + "loss": 0.0162, + "step": 705 + }, + { + "epoch": 1.3604819277108433, + "grad_norm": 0.17502115666866302, + "learning_rate": 2.666451016608383e-05, + "loss": 0.0074, + "step": 706 + }, + { + "epoch": 1.3624096385542168, + "grad_norm": 0.21738742291927338, + "learning_rate": 2.6622208767767075e-05, + "loss": 0.0135, + "step": 707 + }, + { + "epoch": 1.3643373493975903, + "grad_norm": 0.3309847414493561, + "learning_rate": 2.6579874070836032e-05, + "loss": 0.0107, + "step": 708 + }, + { + "epoch": 1.3662650602409638, + "grad_norm": 0.10706827789545059, + "learning_rate": 2.6537506288163303e-05, + "loss": 0.0043, + "step": 709 + }, + { + "epoch": 1.3681927710843373, + "grad_norm": 0.173640176653862, + "learning_rate": 2.6495105632787835e-05, + "loss": 0.0092, + "step": 710 + }, + { + "epoch": 1.3701204819277109, + "grad_norm": 0.2636397182941437, + "learning_rate": 2.6452672317913893e-05, + "loss": 0.0097, + "step": 711 + }, + { + "epoch": 1.3720481927710844, + "grad_norm": 0.28485360741615295, + "learning_rate": 2.6410206556909943e-05, + "loss": 0.0193, + "step": 712 + }, + { + "epoch": 1.3739759036144579, + "grad_norm": 0.23210027813911438, + "learning_rate": 2.636770856330761e-05, + "loss": 0.0229, + "step": 713 + }, + { + "epoch": 1.3759036144578314, + "grad_norm": 0.13388316333293915, + "learning_rate": 2.6325178550800596e-05, + "loss": 0.004, + "step": 714 + }, + { + "epoch": 1.377831325301205, + "grad_norm": 0.5131422877311707, + "learning_rate": 2.6282616733243603e-05, + "loss": 0.0137, + "step": 715 + }, + { + "epoch": 1.3797590361445784, + "grad_norm": 0.3243267834186554, + "learning_rate": 2.6240023324651258e-05, + "loss": 0.0153, + "step": 716 + }, + { + "epoch": 1.3816867469879517, + "grad_norm": 0.1440611034631729, + "learning_rate": 2.619739853919704e-05, + "loss": 0.0031, + "step": 717 + }, + { + "epoch": 1.3836144578313254, + "grad_norm": 0.30346596240997314, + "learning_rate": 2.6154742591212196e-05, + "loss": 0.0109, + "step": 718 + }, + { + "epoch": 1.3855421686746987, + "grad_norm": 0.19109240174293518, + "learning_rate": 2.611205569518468e-05, + "loss": 0.0094, + "step": 719 + }, + { + "epoch": 1.3874698795180722, + "grad_norm": 0.28636518120765686, + "learning_rate": 2.6069338065758056e-05, + "loss": 0.0123, + "step": 720 + }, + { + "epoch": 1.3893975903614457, + "grad_norm": 0.28083911538124084, + "learning_rate": 2.6026589917730416e-05, + "loss": 0.0104, + "step": 721 + }, + { + "epoch": 1.3913253012048192, + "grad_norm": 0.36553966999053955, + "learning_rate": 2.5983811466053327e-05, + "loss": 0.0143, + "step": 722 + }, + { + "epoch": 1.3932530120481927, + "grad_norm": 0.23317205905914307, + "learning_rate": 2.5941002925830708e-05, + "loss": 0.011, + "step": 723 + }, + { + "epoch": 1.3951807228915662, + "grad_norm": 0.3825171887874603, + "learning_rate": 2.589816451231781e-05, + "loss": 0.0098, + "step": 724 + }, + { + "epoch": 1.3971084337349398, + "grad_norm": 0.19916608929634094, + "learning_rate": 2.585529644092006e-05, + "loss": 0.0094, + "step": 725 + }, + { + "epoch": 1.3990361445783133, + "grad_norm": 0.19990523159503937, + "learning_rate": 2.5812398927192027e-05, + "loss": 0.0128, + "step": 726 + }, + { + "epoch": 1.4009638554216868, + "grad_norm": 0.34662899374961853, + "learning_rate": 2.5769472186836347e-05, + "loss": 0.0091, + "step": 727 + }, + { + "epoch": 1.4028915662650603, + "grad_norm": 0.23481112718582153, + "learning_rate": 2.5726516435702583e-05, + "loss": 0.0154, + "step": 728 + }, + { + "epoch": 1.4048192771084338, + "grad_norm": 0.1846667379140854, + "learning_rate": 2.5683531889786194e-05, + "loss": 0.0088, + "step": 729 + }, + { + "epoch": 1.4067469879518073, + "grad_norm": 0.16717663407325745, + "learning_rate": 2.564051876522742e-05, + "loss": 0.0083, + "step": 730 + }, + { + "epoch": 1.4086746987951808, + "grad_norm": 0.4116475284099579, + "learning_rate": 2.5597477278310202e-05, + "loss": 0.0179, + "step": 731 + }, + { + "epoch": 1.410602409638554, + "grad_norm": 0.171807661652565, + "learning_rate": 2.5554407645461115e-05, + "loss": 0.0063, + "step": 732 + }, + { + "epoch": 1.4125301204819278, + "grad_norm": 0.1954439878463745, + "learning_rate": 2.5511310083248243e-05, + "loss": 0.017, + "step": 733 + }, + { + "epoch": 1.4144578313253011, + "grad_norm": 0.37158989906311035, + "learning_rate": 2.5468184808380104e-05, + "loss": 0.0173, + "step": 734 + }, + { + "epoch": 1.4163855421686746, + "grad_norm": 0.2001633644104004, + "learning_rate": 2.542503203770458e-05, + "loss": 0.0165, + "step": 735 + }, + { + "epoch": 1.4183132530120481, + "grad_norm": 0.45673373341560364, + "learning_rate": 2.53818519882078e-05, + "loss": 0.0185, + "step": 736 + }, + { + "epoch": 1.4202409638554216, + "grad_norm": 0.3838701546192169, + "learning_rate": 2.5338644877013067e-05, + "loss": 0.0134, + "step": 737 + }, + { + "epoch": 1.4221686746987952, + "grad_norm": 0.32032477855682373, + "learning_rate": 2.5295410921379745e-05, + "loss": 0.0143, + "step": 738 + }, + { + "epoch": 1.4240963855421687, + "grad_norm": 0.4594039022922516, + "learning_rate": 2.52521503387022e-05, + "loss": 0.0193, + "step": 739 + }, + { + "epoch": 1.4260240963855422, + "grad_norm": 0.3889620900154114, + "learning_rate": 2.5208863346508667e-05, + "loss": 0.0114, + "step": 740 + }, + { + "epoch": 1.4279518072289157, + "grad_norm": 0.33153319358825684, + "learning_rate": 2.5165550162460203e-05, + "loss": 0.0102, + "step": 741 + }, + { + "epoch": 1.4298795180722892, + "grad_norm": 0.7269518375396729, + "learning_rate": 2.5122211004349536e-05, + "loss": 0.0215, + "step": 742 + }, + { + "epoch": 1.4318072289156627, + "grad_norm": 0.31653261184692383, + "learning_rate": 2.5078846090100023e-05, + "loss": 0.0115, + "step": 743 + }, + { + "epoch": 1.4337349397590362, + "grad_norm": 0.20620353519916534, + "learning_rate": 2.5035455637764518e-05, + "loss": 0.0153, + "step": 744 + }, + { + "epoch": 1.4356626506024097, + "grad_norm": 0.17266008257865906, + "learning_rate": 2.4992039865524297e-05, + "loss": 0.0069, + "step": 745 + }, + { + "epoch": 1.4375903614457832, + "grad_norm": 0.24760811030864716, + "learning_rate": 2.494859899168795e-05, + "loss": 0.0108, + "step": 746 + }, + { + "epoch": 1.4395180722891565, + "grad_norm": 0.2584865391254425, + "learning_rate": 2.4905133234690282e-05, + "loss": 0.0095, + "step": 747 + }, + { + "epoch": 1.4414457831325302, + "grad_norm": 0.48847514390945435, + "learning_rate": 2.486164281309122e-05, + "loss": 0.0181, + "step": 748 + }, + { + "epoch": 1.4433734939759035, + "grad_norm": 0.42942047119140625, + "learning_rate": 2.4818127945574717e-05, + "loss": 0.025, + "step": 749 + }, + { + "epoch": 1.445301204819277, + "grad_norm": 0.23713800311088562, + "learning_rate": 2.4774588850947648e-05, + "loss": 0.0085, + "step": 750 + }, + { + "epoch": 1.4472289156626506, + "grad_norm": 0.8797569870948792, + "learning_rate": 2.473102574813871e-05, + "loss": 0.0097, + "step": 751 + }, + { + "epoch": 1.449156626506024, + "grad_norm": 0.2744862735271454, + "learning_rate": 2.4687438856197302e-05, + "loss": 0.0122, + "step": 752 + }, + { + "epoch": 1.4510843373493976, + "grad_norm": 0.12747010588645935, + "learning_rate": 2.4643828394292478e-05, + "loss": 0.0056, + "step": 753 + }, + { + "epoch": 1.453012048192771, + "grad_norm": 0.37376829981803894, + "learning_rate": 2.4600194581711775e-05, + "loss": 0.0052, + "step": 754 + }, + { + "epoch": 1.4549397590361446, + "grad_norm": 0.2536911368370056, + "learning_rate": 2.4556537637860176e-05, + "loss": 0.0113, + "step": 755 + }, + { + "epoch": 1.456867469879518, + "grad_norm": 0.25950780510902405, + "learning_rate": 2.451285778225894e-05, + "loss": 0.0099, + "step": 756 + }, + { + "epoch": 1.4587951807228916, + "grad_norm": 0.19535955786705017, + "learning_rate": 2.4469155234544565e-05, + "loss": 0.0069, + "step": 757 + }, + { + "epoch": 1.4607228915662651, + "grad_norm": 0.22816115617752075, + "learning_rate": 2.442543021446764e-05, + "loss": 0.0088, + "step": 758 + }, + { + "epoch": 1.4626506024096386, + "grad_norm": 0.3363986313343048, + "learning_rate": 2.4381682941891755e-05, + "loss": 0.0182, + "step": 759 + }, + { + "epoch": 1.464578313253012, + "grad_norm": 0.21492891013622284, + "learning_rate": 2.4337913636792382e-05, + "loss": 0.0069, + "step": 760 + }, + { + "epoch": 1.4665060240963856, + "grad_norm": 0.6070862412452698, + "learning_rate": 2.429412251925579e-05, + "loss": 0.0406, + "step": 761 + }, + { + "epoch": 1.468433734939759, + "grad_norm": 2.6469690799713135, + "learning_rate": 2.425030980947793e-05, + "loss": 0.0205, + "step": 762 + }, + { + "epoch": 1.4703614457831327, + "grad_norm": 0.30909740924835205, + "learning_rate": 2.420647572776332e-05, + "loss": 0.0136, + "step": 763 + }, + { + "epoch": 1.472289156626506, + "grad_norm": 0.6639553904533386, + "learning_rate": 2.416262049452395e-05, + "loss": 0.011, + "step": 764 + }, + { + "epoch": 1.4742168674698795, + "grad_norm": 0.2919616997241974, + "learning_rate": 2.4118744330278147e-05, + "loss": 0.0131, + "step": 765 + }, + { + "epoch": 1.476144578313253, + "grad_norm": 0.5232429504394531, + "learning_rate": 2.4074847455649523e-05, + "loss": 0.0138, + "step": 766 + }, + { + "epoch": 1.4780722891566265, + "grad_norm": 5.630630970001221, + "learning_rate": 2.403093009136579e-05, + "loss": 0.0264, + "step": 767 + }, + { + "epoch": 1.48, + "grad_norm": 0.33234721422195435, + "learning_rate": 2.3986992458257707e-05, + "loss": 0.0111, + "step": 768 + }, + { + "epoch": 1.4819277108433735, + "grad_norm": 0.28444772958755493, + "learning_rate": 2.3943034777257945e-05, + "loss": 0.0144, + "step": 769 + }, + { + "epoch": 1.483855421686747, + "grad_norm": 0.16229979693889618, + "learning_rate": 2.38990572694e-05, + "loss": 0.0062, + "step": 770 + }, + { + "epoch": 1.4857831325301205, + "grad_norm": 0.27474716305732727, + "learning_rate": 2.385506015581704e-05, + "loss": 0.0172, + "step": 771 + }, + { + "epoch": 1.487710843373494, + "grad_norm": 0.246526300907135, + "learning_rate": 2.381104365774083e-05, + "loss": 0.012, + "step": 772 + }, + { + "epoch": 1.4896385542168675, + "grad_norm": 0.282047837972641, + "learning_rate": 2.37670079965006e-05, + "loss": 0.0116, + "step": 773 + }, + { + "epoch": 1.491566265060241, + "grad_norm": 0.2878139317035675, + "learning_rate": 2.3722953393521944e-05, + "loss": 0.0147, + "step": 774 + }, + { + "epoch": 1.4934939759036143, + "grad_norm": 0.5586277842521667, + "learning_rate": 2.367888007032571e-05, + "loss": 0.0111, + "step": 775 + }, + { + "epoch": 1.495421686746988, + "grad_norm": 0.562160313129425, + "learning_rate": 2.3634788248526846e-05, + "loss": 0.0061, + "step": 776 + }, + { + "epoch": 1.4973493975903613, + "grad_norm": 0.3452005982398987, + "learning_rate": 2.3590678149833356e-05, + "loss": 0.0205, + "step": 777 + }, + { + "epoch": 1.499277108433735, + "grad_norm": 0.7757686376571655, + "learning_rate": 2.3546549996045114e-05, + "loss": 0.0273, + "step": 778 + }, + { + "epoch": 1.5012048192771084, + "grad_norm": 0.19530551135540009, + "learning_rate": 2.3502404009052812e-05, + "loss": 0.0083, + "step": 779 + }, + { + "epoch": 1.503132530120482, + "grad_norm": 0.2586531639099121, + "learning_rate": 2.3458240410836775e-05, + "loss": 0.0122, + "step": 780 + }, + { + "epoch": 1.5050602409638554, + "grad_norm": 0.30063286423683167, + "learning_rate": 2.3414059423465924e-05, + "loss": 0.0083, + "step": 781 + }, + { + "epoch": 1.5069879518072289, + "grad_norm": 0.18663185834884644, + "learning_rate": 2.3369861269096575e-05, + "loss": 0.0104, + "step": 782 + }, + { + "epoch": 1.5089156626506024, + "grad_norm": 0.4405941069126129, + "learning_rate": 2.3325646169971416e-05, + "loss": 0.0264, + "step": 783 + }, + { + "epoch": 1.510843373493976, + "grad_norm": 0.2947913110256195, + "learning_rate": 2.3281414348418294e-05, + "loss": 0.0107, + "step": 784 + }, + { + "epoch": 1.5127710843373494, + "grad_norm": 0.23813778162002563, + "learning_rate": 2.3237166026849158e-05, + "loss": 0.0084, + "step": 785 + }, + { + "epoch": 1.514698795180723, + "grad_norm": 0.33380329608917236, + "learning_rate": 2.3192901427758932e-05, + "loss": 0.0111, + "step": 786 + }, + { + "epoch": 1.5166265060240964, + "grad_norm": 0.3736988306045532, + "learning_rate": 2.314862077372438e-05, + "loss": 0.0135, + "step": 787 + }, + { + "epoch": 1.5185542168674697, + "grad_norm": 0.3785395920276642, + "learning_rate": 2.3104324287402996e-05, + "loss": 0.0265, + "step": 788 + }, + { + "epoch": 1.5204819277108435, + "grad_norm": 0.3359154462814331, + "learning_rate": 2.3060012191531885e-05, + "loss": 0.0127, + "step": 789 + }, + { + "epoch": 1.5224096385542167, + "grad_norm": 0.720753014087677, + "learning_rate": 2.301568470892664e-05, + "loss": 0.0134, + "step": 790 + }, + { + "epoch": 1.5243373493975905, + "grad_norm": 0.36473193764686584, + "learning_rate": 2.297134206248024e-05, + "loss": 0.0318, + "step": 791 + }, + { + "epoch": 1.5262650602409638, + "grad_norm": 0.29987087845802307, + "learning_rate": 2.2926984475161884e-05, + "loss": 0.008, + "step": 792 + }, + { + "epoch": 1.5281927710843375, + "grad_norm": 0.2883112132549286, + "learning_rate": 2.2882612170015914e-05, + "loss": 0.0125, + "step": 793 + }, + { + "epoch": 1.5301204819277108, + "grad_norm": 0.28983229398727417, + "learning_rate": 2.2838225370160682e-05, + "loss": 0.0155, + "step": 794 + }, + { + "epoch": 1.5320481927710843, + "grad_norm": 0.47236886620521545, + "learning_rate": 2.2793824298787414e-05, + "loss": 0.0132, + "step": 795 + }, + { + "epoch": 1.5339759036144578, + "grad_norm": 0.8328865170478821, + "learning_rate": 2.2749409179159104e-05, + "loss": 0.026, + "step": 796 + }, + { + "epoch": 1.5359036144578313, + "grad_norm": 0.3129172623157501, + "learning_rate": 2.2704980234609396e-05, + "loss": 0.0099, + "step": 797 + }, + { + "epoch": 1.5378313253012048, + "grad_norm": 0.22284500300884247, + "learning_rate": 2.2660537688541416e-05, + "loss": 0.009, + "step": 798 + }, + { + "epoch": 1.5397590361445783, + "grad_norm": 0.3346405625343323, + "learning_rate": 2.2616081764426726e-05, + "loss": 0.0077, + "step": 799 + }, + { + "epoch": 1.5416867469879518, + "grad_norm": 0.2923565208911896, + "learning_rate": 2.2571612685804124e-05, + "loss": 0.0119, + "step": 800 + }, + { + "epoch": 1.5436144578313253, + "grad_norm": 0.1921311914920807, + "learning_rate": 2.252713067627857e-05, + "loss": 0.0083, + "step": 801 + }, + { + "epoch": 1.5455421686746988, + "grad_norm": 0.23221106827259064, + "learning_rate": 2.2482635959520044e-05, + "loss": 0.0049, + "step": 802 + }, + { + "epoch": 1.5474698795180721, + "grad_norm": 0.6340724229812622, + "learning_rate": 2.243812875926241e-05, + "loss": 0.0273, + "step": 803 + }, + { + "epoch": 1.5493975903614459, + "grad_norm": 0.2699439823627472, + "learning_rate": 2.2393609299302314e-05, + "loss": 0.0108, + "step": 804 + }, + { + "epoch": 1.5513253012048192, + "grad_norm": 0.2005189210176468, + "learning_rate": 2.2349077803498052e-05, + "loss": 0.0076, + "step": 805 + }, + { + "epoch": 1.5532530120481929, + "grad_norm": 0.39668548107147217, + "learning_rate": 2.230453449576842e-05, + "loss": 0.0135, + "step": 806 + }, + { + "epoch": 1.5551807228915662, + "grad_norm": 0.2406950294971466, + "learning_rate": 2.2259979600091635e-05, + "loss": 0.0094, + "step": 807 + }, + { + "epoch": 1.55710843373494, + "grad_norm": 0.30363157391548157, + "learning_rate": 2.2215413340504158e-05, + "loss": 0.0178, + "step": 808 + }, + { + "epoch": 1.5590361445783132, + "grad_norm": 0.19508181512355804, + "learning_rate": 2.2170835941099605e-05, + "loss": 0.0069, + "step": 809 + }, + { + "epoch": 1.5609638554216867, + "grad_norm": 0.734106719493866, + "learning_rate": 2.2126247626027615e-05, + "loss": 0.0319, + "step": 810 + }, + { + "epoch": 1.5628915662650602, + "grad_norm": 0.2591583728790283, + "learning_rate": 2.208164861949268e-05, + "loss": 0.0168, + "step": 811 + }, + { + "epoch": 1.5648192771084337, + "grad_norm": 0.2386734038591385, + "learning_rate": 2.20370391457531e-05, + "loss": 0.0041, + "step": 812 + }, + { + "epoch": 1.5667469879518072, + "grad_norm": 0.1675218939781189, + "learning_rate": 2.1992419429119764e-05, + "loss": 0.0078, + "step": 813 + }, + { + "epoch": 1.5686746987951807, + "grad_norm": 0.45591506361961365, + "learning_rate": 2.1947789693955097e-05, + "loss": 0.0166, + "step": 814 + }, + { + "epoch": 1.5706024096385542, + "grad_norm": 0.46940621733665466, + "learning_rate": 2.190315016467188e-05, + "loss": 0.0176, + "step": 815 + }, + { + "epoch": 1.5725301204819278, + "grad_norm": 0.2294205278158188, + "learning_rate": 2.1858501065732146e-05, + "loss": 0.0102, + "step": 816 + }, + { + "epoch": 1.5744578313253013, + "grad_norm": 0.28922322392463684, + "learning_rate": 2.181384262164606e-05, + "loss": 0.0111, + "step": 817 + }, + { + "epoch": 1.5763855421686745, + "grad_norm": 0.19650064408779144, + "learning_rate": 2.1769175056970765e-05, + "loss": 0.0076, + "step": 818 + }, + { + "epoch": 1.5783132530120483, + "grad_norm": 0.19538825750350952, + "learning_rate": 2.172449859630927e-05, + "loss": 0.0118, + "step": 819 + }, + { + "epoch": 1.5802409638554216, + "grad_norm": 0.1900389939546585, + "learning_rate": 2.167981346430931e-05, + "loss": 0.0066, + "step": 820 + }, + { + "epoch": 1.5821686746987953, + "grad_norm": 0.21593710780143738, + "learning_rate": 2.1635119885662235e-05, + "loss": 0.0101, + "step": 821 + }, + { + "epoch": 1.5840963855421686, + "grad_norm": 0.2699289321899414, + "learning_rate": 2.159041808510185e-05, + "loss": 0.0118, + "step": 822 + }, + { + "epoch": 1.5860240963855423, + "grad_norm": 0.31867673993110657, + "learning_rate": 2.1545708287403322e-05, + "loss": 0.0122, + "step": 823 + }, + { + "epoch": 1.5879518072289156, + "grad_norm": 0.2862400412559509, + "learning_rate": 2.1500990717382004e-05, + "loss": 0.0216, + "step": 824 + }, + { + "epoch": 1.589879518072289, + "grad_norm": 0.28482481837272644, + "learning_rate": 2.145626559989237e-05, + "loss": 0.0136, + "step": 825 + }, + { + "epoch": 1.5918072289156626, + "grad_norm": 0.2866958975791931, + "learning_rate": 2.1411533159826803e-05, + "loss": 0.0298, + "step": 826 + }, + { + "epoch": 1.5937349397590361, + "grad_norm": 0.39092838764190674, + "learning_rate": 2.1366793622114533e-05, + "loss": 0.0382, + "step": 827 + }, + { + "epoch": 1.5956626506024096, + "grad_norm": 0.16381537914276123, + "learning_rate": 2.1322047211720468e-05, + "loss": 0.0074, + "step": 828 + }, + { + "epoch": 1.5975903614457831, + "grad_norm": 0.22146940231323242, + "learning_rate": 2.1277294153644083e-05, + "loss": 0.0103, + "step": 829 + }, + { + "epoch": 1.5995180722891567, + "grad_norm": 0.2155209183692932, + "learning_rate": 2.123253467291827e-05, + "loss": 0.0095, + "step": 830 + }, + { + "epoch": 1.6014457831325302, + "grad_norm": 0.41510409116744995, + "learning_rate": 2.118776899460822e-05, + "loss": 0.0457, + "step": 831 + }, + { + "epoch": 1.6033734939759037, + "grad_norm": 0.19718150794506073, + "learning_rate": 2.1142997343810293e-05, + "loss": 0.0192, + "step": 832 + }, + { + "epoch": 1.605301204819277, + "grad_norm": 0.40924403071403503, + "learning_rate": 2.1098219945650865e-05, + "loss": 0.0278, + "step": 833 + }, + { + "epoch": 1.6072289156626507, + "grad_norm": 0.18657824397087097, + "learning_rate": 2.105343702528524e-05, + "loss": 0.0076, + "step": 834 + }, + { + "epoch": 1.609156626506024, + "grad_norm": 0.1727641075849533, + "learning_rate": 2.100864880789645e-05, + "loss": 0.0076, + "step": 835 + }, + { + "epoch": 1.6110843373493977, + "grad_norm": 0.18138745427131653, + "learning_rate": 2.0963855518694203e-05, + "loss": 0.005, + "step": 836 + }, + { + "epoch": 1.613012048192771, + "grad_norm": 0.19173955917358398, + "learning_rate": 2.0919057382913675e-05, + "loss": 0.0084, + "step": 837 + }, + { + "epoch": 1.6149397590361447, + "grad_norm": 0.3812403380870819, + "learning_rate": 2.0874254625814435e-05, + "loss": 0.009, + "step": 838 + }, + { + "epoch": 1.616867469879518, + "grad_norm": 0.2009759545326233, + "learning_rate": 2.0829447472679285e-05, + "loss": 0.0098, + "step": 839 + }, + { + "epoch": 1.6187951807228915, + "grad_norm": 0.48703446984291077, + "learning_rate": 2.0784636148813124e-05, + "loss": 0.0099, + "step": 840 + }, + { + "epoch": 1.620722891566265, + "grad_norm": 0.28995075821876526, + "learning_rate": 2.0739820879541827e-05, + "loss": 0.0075, + "step": 841 + }, + { + "epoch": 1.6226506024096385, + "grad_norm": 0.2130059450864792, + "learning_rate": 2.069500189021111e-05, + "loss": 0.007, + "step": 842 + }, + { + "epoch": 1.624578313253012, + "grad_norm": 0.252524733543396, + "learning_rate": 2.0650179406185397e-05, + "loss": 0.0249, + "step": 843 + }, + { + "epoch": 1.6265060240963856, + "grad_norm": 0.23069098591804504, + "learning_rate": 2.060535365284668e-05, + "loss": 0.0084, + "step": 844 + }, + { + "epoch": 1.628433734939759, + "grad_norm": 0.25051403045654297, + "learning_rate": 2.056052485559338e-05, + "loss": 0.0071, + "step": 845 + }, + { + "epoch": 1.6303614457831326, + "grad_norm": 0.27664798498153687, + "learning_rate": 2.051569323983924e-05, + "loss": 0.0198, + "step": 846 + }, + { + "epoch": 1.632289156626506, + "grad_norm": 0.2954922318458557, + "learning_rate": 2.047085903101218e-05, + "loss": 0.006, + "step": 847 + }, + { + "epoch": 1.6342168674698794, + "grad_norm": 0.28477591276168823, + "learning_rate": 2.0426022454553137e-05, + "loss": 0.0147, + "step": 848 + }, + { + "epoch": 1.636144578313253, + "grad_norm": 0.2785305678844452, + "learning_rate": 2.0381183735914968e-05, + "loss": 0.0117, + "step": 849 + }, + { + "epoch": 1.6380722891566264, + "grad_norm": 0.2500309348106384, + "learning_rate": 2.0336343100561295e-05, + "loss": 0.008, + "step": 850 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.18932047486305237, + "learning_rate": 2.0291500773965392e-05, + "loss": 0.0256, + "step": 851 + }, + { + "epoch": 1.6419277108433734, + "grad_norm": 0.6396257877349854, + "learning_rate": 2.0246656981609013e-05, + "loss": 0.0141, + "step": 852 + }, + { + "epoch": 1.6438554216867471, + "grad_norm": 0.5072891116142273, + "learning_rate": 2.02018119489813e-05, + "loss": 0.008, + "step": 853 + }, + { + "epoch": 1.6457831325301204, + "grad_norm": 0.2920839488506317, + "learning_rate": 2.0156965901577635e-05, + "loss": 0.0085, + "step": 854 + }, + { + "epoch": 1.647710843373494, + "grad_norm": 0.1391262263059616, + "learning_rate": 2.011211906489848e-05, + "loss": 0.0078, + "step": 855 + }, + { + "epoch": 1.6496385542168674, + "grad_norm": 0.29620468616485596, + "learning_rate": 2.00672716644483e-05, + "loss": 0.0109, + "step": 856 + }, + { + "epoch": 1.651566265060241, + "grad_norm": 0.13946573436260223, + "learning_rate": 2.002242392573436e-05, + "loss": 0.0076, + "step": 857 + }, + { + "epoch": 1.6534939759036145, + "grad_norm": 0.9766128659248352, + "learning_rate": 1.997757607426565e-05, + "loss": 0.0309, + "step": 858 + }, + { + "epoch": 1.655421686746988, + "grad_norm": 0.18002203106880188, + "learning_rate": 1.9932728335551702e-05, + "loss": 0.0072, + "step": 859 + }, + { + "epoch": 1.6573493975903615, + "grad_norm": 0.28073111176490784, + "learning_rate": 1.988788093510152e-05, + "loss": 0.0246, + "step": 860 + }, + { + "epoch": 1.659277108433735, + "grad_norm": 0.1919957399368286, + "learning_rate": 1.9843034098422375e-05, + "loss": 0.0087, + "step": 861 + }, + { + "epoch": 1.6612048192771085, + "grad_norm": 0.1825258433818817, + "learning_rate": 1.9798188051018705e-05, + "loss": 0.0092, + "step": 862 + }, + { + "epoch": 1.6631325301204818, + "grad_norm": 0.32412952184677124, + "learning_rate": 1.9753343018390997e-05, + "loss": 0.0118, + "step": 863 + }, + { + "epoch": 1.6650602409638555, + "grad_norm": 0.12828563153743744, + "learning_rate": 1.9708499226034618e-05, + "loss": 0.0056, + "step": 864 + }, + { + "epoch": 1.6669879518072288, + "grad_norm": 0.18647560477256775, + "learning_rate": 1.966365689943871e-05, + "loss": 0.0094, + "step": 865 + }, + { + "epoch": 1.6689156626506025, + "grad_norm": 0.19835828244686127, + "learning_rate": 1.9618816264085042e-05, + "loss": 0.0097, + "step": 866 + }, + { + "epoch": 1.6708433734939758, + "grad_norm": 0.22364282608032227, + "learning_rate": 1.957397754544687e-05, + "loss": 0.0062, + "step": 867 + }, + { + "epoch": 1.6727710843373496, + "grad_norm": 0.29420018196105957, + "learning_rate": 1.952914096898783e-05, + "loss": 0.0182, + "step": 868 + }, + { + "epoch": 1.6746987951807228, + "grad_norm": 0.2149929702281952, + "learning_rate": 1.9484306760160766e-05, + "loss": 0.0125, + "step": 869 + }, + { + "epoch": 1.6766265060240964, + "grad_norm": 0.16844330728054047, + "learning_rate": 1.9439475144406623e-05, + "loss": 0.0074, + "step": 870 + }, + { + "epoch": 1.6785542168674699, + "grad_norm": 0.5010282397270203, + "learning_rate": 1.9394646347153334e-05, + "loss": 0.0213, + "step": 871 + }, + { + "epoch": 1.6804819277108434, + "grad_norm": 0.29847195744514465, + "learning_rate": 1.9349820593814606e-05, + "loss": 0.0173, + "step": 872 + }, + { + "epoch": 1.6824096385542169, + "grad_norm": 0.23835812509059906, + "learning_rate": 1.930499810978889e-05, + "loss": 0.011, + "step": 873 + }, + { + "epoch": 1.6843373493975904, + "grad_norm": 0.3269020617008209, + "learning_rate": 1.9260179120458177e-05, + "loss": 0.0285, + "step": 874 + }, + { + "epoch": 1.686265060240964, + "grad_norm": 0.2142144739627838, + "learning_rate": 1.9215363851186883e-05, + "loss": 0.0146, + "step": 875 + }, + { + "epoch": 1.6881927710843372, + "grad_norm": 0.3098377585411072, + "learning_rate": 1.9170552527320725e-05, + "loss": 0.0104, + "step": 876 + }, + { + "epoch": 1.690120481927711, + "grad_norm": 0.22504115104675293, + "learning_rate": 1.9125745374185568e-05, + "loss": 0.0091, + "step": 877 + }, + { + "epoch": 1.6920481927710842, + "grad_norm": 0.20633333921432495, + "learning_rate": 1.908094261708633e-05, + "loss": 0.0097, + "step": 878 + }, + { + "epoch": 1.693975903614458, + "grad_norm": 1.179566502571106, + "learning_rate": 1.9036144481305807e-05, + "loss": 0.0143, + "step": 879 + }, + { + "epoch": 1.6959036144578312, + "grad_norm": 0.15525613725185394, + "learning_rate": 1.8991351192103554e-05, + "loss": 0.0062, + "step": 880 + }, + { + "epoch": 1.697831325301205, + "grad_norm": 0.15966367721557617, + "learning_rate": 1.8946562974714763e-05, + "loss": 0.0048, + "step": 881 + }, + { + "epoch": 1.6997590361445782, + "grad_norm": 0.18902607262134552, + "learning_rate": 1.890178005434914e-05, + "loss": 0.0124, + "step": 882 + }, + { + "epoch": 1.701686746987952, + "grad_norm": 0.21692413091659546, + "learning_rate": 1.885700265618971e-05, + "loss": 0.0135, + "step": 883 + }, + { + "epoch": 1.7036144578313253, + "grad_norm": 0.38948455452919006, + "learning_rate": 1.8812231005391786e-05, + "loss": 0.0365, + "step": 884 + }, + { + "epoch": 1.7055421686746988, + "grad_norm": 0.2483491599559784, + "learning_rate": 1.8767465327081736e-05, + "loss": 0.0202, + "step": 885 + }, + { + "epoch": 1.7074698795180723, + "grad_norm": 0.15305832028388977, + "learning_rate": 1.872270584635592e-05, + "loss": 0.0035, + "step": 886 + }, + { + "epoch": 1.7093975903614458, + "grad_norm": 0.17794466018676758, + "learning_rate": 1.867795278827954e-05, + "loss": 0.0157, + "step": 887 + }, + { + "epoch": 1.7113253012048193, + "grad_norm": 0.1938813328742981, + "learning_rate": 1.863320637788547e-05, + "loss": 0.0071, + "step": 888 + }, + { + "epoch": 1.7132530120481928, + "grad_norm": 0.27061617374420166, + "learning_rate": 1.8588466840173207e-05, + "loss": 0.0347, + "step": 889 + }, + { + "epoch": 1.7151807228915663, + "grad_norm": 0.1541014313697815, + "learning_rate": 1.8543734400107637e-05, + "loss": 0.006, + "step": 890 + }, + { + "epoch": 1.7171084337349396, + "grad_norm": 0.1436876654624939, + "learning_rate": 1.8499009282617996e-05, + "loss": 0.0059, + "step": 891 + }, + { + "epoch": 1.7190361445783133, + "grad_norm": 1.0573723316192627, + "learning_rate": 1.8454291712596688e-05, + "loss": 0.008, + "step": 892 + }, + { + "epoch": 1.7209638554216866, + "grad_norm": 0.15406259894371033, + "learning_rate": 1.8409581914898157e-05, + "loss": 0.0061, + "step": 893 + }, + { + "epoch": 1.7228915662650603, + "grad_norm": 0.24822913110256195, + "learning_rate": 1.836488011433777e-05, + "loss": 0.0085, + "step": 894 + }, + { + "epoch": 1.7248192771084336, + "grad_norm": 0.21049316227436066, + "learning_rate": 1.83201865356907e-05, + "loss": 0.0075, + "step": 895 + }, + { + "epoch": 1.7267469879518074, + "grad_norm": 0.24159866571426392, + "learning_rate": 1.8275501403690733e-05, + "loss": 0.0156, + "step": 896 + }, + { + "epoch": 1.7286746987951807, + "grad_norm": 0.3191063106060028, + "learning_rate": 1.823082494302924e-05, + "loss": 0.0218, + "step": 897 + }, + { + "epoch": 1.7306024096385542, + "grad_norm": 0.20296362042427063, + "learning_rate": 1.8186157378353945e-05, + "loss": 0.0126, + "step": 898 + }, + { + "epoch": 1.7325301204819277, + "grad_norm": 0.1905524581670761, + "learning_rate": 1.8141498934267858e-05, + "loss": 0.0131, + "step": 899 + }, + { + "epoch": 1.7344578313253012, + "grad_norm": 0.5350520610809326, + "learning_rate": 1.809684983532813e-05, + "loss": 0.0115, + "step": 900 + }, + { + "epoch": 1.7363855421686747, + "grad_norm": 0.17144092917442322, + "learning_rate": 1.8052210306044907e-05, + "loss": 0.0113, + "step": 901 + }, + { + "epoch": 1.7383132530120482, + "grad_norm": 0.11777982115745544, + "learning_rate": 1.8007580570880236e-05, + "loss": 0.0058, + "step": 902 + }, + { + "epoch": 1.7402409638554217, + "grad_norm": 0.2078275978565216, + "learning_rate": 1.7962960854246908e-05, + "loss": 0.0106, + "step": 903 + }, + { + "epoch": 1.7421686746987952, + "grad_norm": 0.2550877630710602, + "learning_rate": 1.791835138050732e-05, + "loss": 0.0076, + "step": 904 + }, + { + "epoch": 1.7440963855421687, + "grad_norm": 0.11553912609815598, + "learning_rate": 1.7873752373972395e-05, + "loss": 0.0038, + "step": 905 + }, + { + "epoch": 1.746024096385542, + "grad_norm": 0.10724586248397827, + "learning_rate": 1.7829164058900398e-05, + "loss": 0.0043, + "step": 906 + }, + { + "epoch": 1.7479518072289157, + "grad_norm": 0.30152231454849243, + "learning_rate": 1.7784586659495845e-05, + "loss": 0.0099, + "step": 907 + }, + { + "epoch": 1.749879518072289, + "grad_norm": 0.18372933566570282, + "learning_rate": 1.7740020399908372e-05, + "loss": 0.0074, + "step": 908 + }, + { + "epoch": 1.7518072289156628, + "grad_norm": 0.35184428095817566, + "learning_rate": 1.7695465504231586e-05, + "loss": 0.0184, + "step": 909 + }, + { + "epoch": 1.753734939759036, + "grad_norm": 0.15083615481853485, + "learning_rate": 1.765092219650196e-05, + "loss": 0.0061, + "step": 910 + }, + { + "epoch": 1.7556626506024098, + "grad_norm": 0.2599961459636688, + "learning_rate": 1.7606390700697693e-05, + "loss": 0.0101, + "step": 911 + }, + { + "epoch": 1.757590361445783, + "grad_norm": 0.10829206556081772, + "learning_rate": 1.7561871240737595e-05, + "loss": 0.0034, + "step": 912 + }, + { + "epoch": 1.7595180722891566, + "grad_norm": 0.38098782300949097, + "learning_rate": 1.7517364040479966e-05, + "loss": 0.0384, + "step": 913 + }, + { + "epoch": 1.76144578313253, + "grad_norm": 0.14975085854530334, + "learning_rate": 1.7472869323721432e-05, + "loss": 0.0055, + "step": 914 + }, + { + "epoch": 1.7633734939759036, + "grad_norm": 0.4151444733142853, + "learning_rate": 1.742838731419588e-05, + "loss": 0.0307, + "step": 915 + }, + { + "epoch": 1.765301204819277, + "grad_norm": 0.22238481044769287, + "learning_rate": 1.738391823557328e-05, + "loss": 0.0059, + "step": 916 + }, + { + "epoch": 1.7672289156626506, + "grad_norm": 0.23386356234550476, + "learning_rate": 1.7339462311458587e-05, + "loss": 0.0113, + "step": 917 + }, + { + "epoch": 1.7691566265060241, + "grad_norm": 0.21911191940307617, + "learning_rate": 1.7295019765390618e-05, + "loss": 0.0071, + "step": 918 + }, + { + "epoch": 1.7710843373493976, + "grad_norm": 0.343159943819046, + "learning_rate": 1.7250590820840903e-05, + "loss": 0.0144, + "step": 919 + }, + { + "epoch": 1.7730120481927711, + "grad_norm": 0.32204556465148926, + "learning_rate": 1.720617570121259e-05, + "loss": 0.0131, + "step": 920 + } + ], + "logging_steps": 1, + "max_steps": 1557, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 92, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2080308880513434e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-920/training_args.bin b/checkpoint-920/training_args.bin new file mode 100644 index 0000000..ecc7b6b --- /dev/null +++ b/checkpoint-920/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342dfb3c86216e436950100c79812c54066d5572c4e9975b0f133c067f061bcf +size 7825 diff --git a/config.json b/config.json new file mode 100644 index 0000000..f8bf41e --- /dev/null +++ b/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128009, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.56.2", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..2152026 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128009, + 128001, + 128008, + 128009 + ], + "pad_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.56.2" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..9832a64 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:310cc3b4c5cd95f89d0465923767832cc5bad4d98a9c532e2504b385a5fb6698 +size 2996982344 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..14daf45 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..d1e1ea9 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2068 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..62f8703 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.0, + "total_flos": 2.043435500286509e+18, + "train_loss": 0.016654981696585226, + "train_runtime": 5294.7714, + "train_samples_per_second": 9.403, + "train_steps_per_second": 0.294 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000..4f90e62 --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,1558 @@ +{"current_steps": 1, "total_steps": 1557, "loss": 0.0891, "lr": 0.0, "epoch": 0.0019277108433734939, "percentage": 0.06, "elapsed_time": "0:00:04", "remaining_time": "1:56:41"} +{"current_steps": 2, "total_steps": 1557, "loss": 0.0539, "lr": 2.564102564102564e-07, "epoch": 0.0038554216867469878, "percentage": 0.13, "elapsed_time": "0:00:07", "remaining_time": "1:42:30"} +{"current_steps": 3, "total_steps": 1557, "loss": 0.099, "lr": 5.128205128205128e-07, "epoch": 0.005783132530120482, "percentage": 0.19, "elapsed_time": "0:00:11", "remaining_time": "1:39:21"} +{"current_steps": 4, "total_steps": 1557, "loss": 0.0789, "lr": 7.692307692307694e-07, "epoch": 0.0077108433734939755, "percentage": 0.26, "elapsed_time": "0:00:15", "remaining_time": "1:38:10"} +{"current_steps": 5, "total_steps": 1557, "loss": 0.0881, "lr": 1.0256410256410257e-06, "epoch": 0.00963855421686747, "percentage": 0.32, "elapsed_time": "0:00:18", "remaining_time": "1:37:16"} +{"current_steps": 6, "total_steps": 1557, "loss": 0.0776, "lr": 1.282051282051282e-06, "epoch": 0.011566265060240964, "percentage": 0.39, "elapsed_time": "0:00:22", "remaining_time": "1:35:25"} +{"current_steps": 7, "total_steps": 1557, "loss": 0.0827, "lr": 1.5384615384615387e-06, "epoch": 0.013493975903614458, "percentage": 0.45, "elapsed_time": "0:00:25", "remaining_time": "1:34:48"} +{"current_steps": 8, "total_steps": 1557, "loss": 0.0577, "lr": 1.794871794871795e-06, "epoch": 0.015421686746987951, "percentage": 0.51, "elapsed_time": "0:00:29", "remaining_time": "1:34:19"} +{"current_steps": 9, "total_steps": 1557, "loss": 0.04, "lr": 2.0512820512820513e-06, "epoch": 0.017349397590361446, "percentage": 0.58, "elapsed_time": "0:00:32", "remaining_time": "1:33:15"} +{"current_steps": 10, "total_steps": 1557, "loss": 0.0506, "lr": 2.307692307692308e-06, "epoch": 0.01927710843373494, "percentage": 0.64, "elapsed_time": "0:00:35", "remaining_time": "1:32:06"} +{"current_steps": 11, "total_steps": 1557, "loss": 0.0874, "lr": 2.564102564102564e-06, "epoch": 0.021204819277108433, "percentage": 0.71, "elapsed_time": "0:00:39", "remaining_time": "1:31:57"} +{"current_steps": 12, "total_steps": 1557, "loss": 0.0597, "lr": 2.8205128205128207e-06, "epoch": 0.02313253012048193, "percentage": 0.77, "elapsed_time": "0:00:42", "remaining_time": "1:31:08"} +{"current_steps": 13, "total_steps": 1557, "loss": 0.0559, "lr": 3.0769230769230774e-06, "epoch": 0.02506024096385542, "percentage": 0.83, "elapsed_time": "0:00:45", "remaining_time": "1:30:49"} +{"current_steps": 14, "total_steps": 1557, "loss": 0.0688, "lr": 3.3333333333333333e-06, "epoch": 0.026987951807228915, "percentage": 0.9, "elapsed_time": "0:00:48", "remaining_time": "1:29:41"} +{"current_steps": 15, "total_steps": 1557, "loss": 0.0433, "lr": 3.58974358974359e-06, "epoch": 0.02891566265060241, "percentage": 0.96, "elapsed_time": "0:00:52", "remaining_time": "1:29:06"} +{"current_steps": 16, "total_steps": 1557, "loss": 0.0695, "lr": 3.846153846153847e-06, "epoch": 0.030843373493975902, "percentage": 1.03, "elapsed_time": "0:00:55", "remaining_time": "1:29:04"} +{"current_steps": 17, "total_steps": 1557, "loss": 0.0392, "lr": 4.102564102564103e-06, "epoch": 0.0327710843373494, "percentage": 1.09, "elapsed_time": "0:00:59", "remaining_time": "1:29:07"} +{"current_steps": 18, "total_steps": 1557, "loss": 0.0351, "lr": 4.358974358974359e-06, "epoch": 0.03469879518072289, "percentage": 1.16, "elapsed_time": "0:01:02", "remaining_time": "1:29:11"} +{"current_steps": 19, "total_steps": 1557, "loss": 0.0356, "lr": 4.615384615384616e-06, "epoch": 0.03662650602409639, "percentage": 1.22, "elapsed_time": "0:01:05", "remaining_time": "1:29:00"} +{"current_steps": 20, "total_steps": 1557, "loss": 0.0427, "lr": 4.871794871794872e-06, "epoch": 0.03855421686746988, "percentage": 1.28, "elapsed_time": "0:01:09", "remaining_time": "1:28:45"} +{"current_steps": 21, "total_steps": 1557, "loss": 0.0397, "lr": 5.128205128205128e-06, "epoch": 0.04048192771084337, "percentage": 1.35, "elapsed_time": "0:01:12", "remaining_time": "1:28:35"} +{"current_steps": 22, "total_steps": 1557, "loss": 0.0481, "lr": 5.384615384615385e-06, "epoch": 0.042409638554216866, "percentage": 1.41, "elapsed_time": "0:01:16", "remaining_time": "1:28:52"} +{"current_steps": 23, "total_steps": 1557, "loss": 0.0479, "lr": 5.641025641025641e-06, "epoch": 0.04433734939759036, "percentage": 1.48, "elapsed_time": "0:01:19", "remaining_time": "1:28:49"} +{"current_steps": 24, "total_steps": 1557, "loss": 0.0182, "lr": 5.897435897435898e-06, "epoch": 0.04626506024096386, "percentage": 1.54, "elapsed_time": "0:01:23", "remaining_time": "1:28:43"} +{"current_steps": 25, "total_steps": 1557, "loss": 0.0389, "lr": 6.153846153846155e-06, "epoch": 0.04819277108433735, "percentage": 1.61, "elapsed_time": "0:01:26", "remaining_time": "1:28:11"} +{"current_steps": 26, "total_steps": 1557, "loss": 0.0268, "lr": 6.410256410256412e-06, "epoch": 0.05012048192771084, "percentage": 1.67, "elapsed_time": "0:01:29", "remaining_time": "1:27:56"} +{"current_steps": 27, "total_steps": 1557, "loss": 0.0367, "lr": 6.666666666666667e-06, "epoch": 0.052048192771084335, "percentage": 1.73, "elapsed_time": "0:01:33", "remaining_time": "1:28:04"} +{"current_steps": 28, "total_steps": 1557, "loss": 0.0251, "lr": 6.923076923076923e-06, "epoch": 0.05397590361445783, "percentage": 1.8, "elapsed_time": "0:01:36", "remaining_time": "1:27:44"} +{"current_steps": 29, "total_steps": 1557, "loss": 0.0289, "lr": 7.17948717948718e-06, "epoch": 0.055903614457831326, "percentage": 1.86, "elapsed_time": "0:01:39", "remaining_time": "1:27:41"} +{"current_steps": 30, "total_steps": 1557, "loss": 0.043, "lr": 7.435897435897437e-06, "epoch": 0.05783132530120482, "percentage": 1.93, "elapsed_time": "0:01:43", "remaining_time": "1:27:22"} +{"current_steps": 31, "total_steps": 1557, "loss": 0.029, "lr": 7.692307692307694e-06, "epoch": 0.059759036144578316, "percentage": 1.99, "elapsed_time": "0:01:46", "remaining_time": "1:27:02"} +{"current_steps": 32, "total_steps": 1557, "loss": 0.0365, "lr": 7.948717948717949e-06, "epoch": 0.061686746987951804, "percentage": 2.06, "elapsed_time": "0:01:49", "remaining_time": "1:27:03"} +{"current_steps": 33, "total_steps": 1557, "loss": 0.0283, "lr": 8.205128205128205e-06, "epoch": 0.0636144578313253, "percentage": 2.12, "elapsed_time": "0:01:52", "remaining_time": "1:26:42"} +{"current_steps": 34, "total_steps": 1557, "loss": 0.0234, "lr": 8.461538461538462e-06, "epoch": 0.0655421686746988, "percentage": 2.18, "elapsed_time": "0:01:55", "remaining_time": "1:26:19"} +{"current_steps": 35, "total_steps": 1557, "loss": 0.0307, "lr": 8.717948717948719e-06, "epoch": 0.06746987951807229, "percentage": 2.25, "elapsed_time": "0:01:59", "remaining_time": "1:26:26"} +{"current_steps": 36, "total_steps": 1557, "loss": 0.0264, "lr": 8.974358974358976e-06, "epoch": 0.06939759036144579, "percentage": 2.31, "elapsed_time": "0:02:02", "remaining_time": "1:26:24"} +{"current_steps": 37, "total_steps": 1557, "loss": 0.0224, "lr": 9.230769230769232e-06, "epoch": 0.07132530120481928, "percentage": 2.38, "elapsed_time": "0:02:06", "remaining_time": "1:26:25"} +{"current_steps": 38, "total_steps": 1557, "loss": 0.0163, "lr": 9.487179487179487e-06, "epoch": 0.07325301204819278, "percentage": 2.44, "elapsed_time": "0:02:09", "remaining_time": "1:26:08"} +{"current_steps": 39, "total_steps": 1557, "loss": 0.0165, "lr": 9.743589743589744e-06, "epoch": 0.07518072289156627, "percentage": 2.5, "elapsed_time": "0:02:12", "remaining_time": "1:26:06"} +{"current_steps": 40, "total_steps": 1557, "loss": 0.0163, "lr": 1e-05, "epoch": 0.07710843373493977, "percentage": 2.57, "elapsed_time": "0:02:16", "remaining_time": "1:26:08"} +{"current_steps": 41, "total_steps": 1557, "loss": 0.0302, "lr": 1.0256410256410256e-05, "epoch": 0.07903614457831325, "percentage": 2.63, "elapsed_time": "0:02:19", "remaining_time": "1:25:56"} +{"current_steps": 42, "total_steps": 1557, "loss": 0.0312, "lr": 1.0512820512820514e-05, "epoch": 0.08096385542168674, "percentage": 2.7, "elapsed_time": "0:02:23", "remaining_time": "1:25:59"} +{"current_steps": 43, "total_steps": 1557, "loss": 0.0256, "lr": 1.076923076923077e-05, "epoch": 0.08289156626506024, "percentage": 2.76, "elapsed_time": "0:02:26", "remaining_time": "1:25:49"} +{"current_steps": 44, "total_steps": 1557, "loss": 0.04, "lr": 1.1025641025641028e-05, "epoch": 0.08481927710843373, "percentage": 2.83, "elapsed_time": "0:02:29", "remaining_time": "1:25:37"} +{"current_steps": 45, "total_steps": 1557, "loss": 0.0282, "lr": 1.1282051282051283e-05, "epoch": 0.08674698795180723, "percentage": 2.89, "elapsed_time": "0:02:32", "remaining_time": "1:25:35"} +{"current_steps": 46, "total_steps": 1557, "loss": 0.0243, "lr": 1.1538461538461538e-05, "epoch": 0.08867469879518072, "percentage": 2.95, "elapsed_time": "0:02:36", "remaining_time": "1:25:38"} +{"current_steps": 47, "total_steps": 1557, "loss": 0.0208, "lr": 1.1794871794871796e-05, "epoch": 0.09060240963855422, "percentage": 3.02, "elapsed_time": "0:02:39", "remaining_time": "1:25:33"} +{"current_steps": 48, "total_steps": 1557, "loss": 0.0291, "lr": 1.2051282051282051e-05, "epoch": 0.09253012048192771, "percentage": 3.08, "elapsed_time": "0:02:42", "remaining_time": "1:25:23"} +{"current_steps": 49, "total_steps": 1557, "loss": 0.0342, "lr": 1.230769230769231e-05, "epoch": 0.09445783132530121, "percentage": 3.15, "elapsed_time": "0:02:45", "remaining_time": "1:25:08"} +{"current_steps": 50, "total_steps": 1557, "loss": 0.0185, "lr": 1.2564102564102565e-05, "epoch": 0.0963855421686747, "percentage": 3.21, "elapsed_time": "0:02:49", "remaining_time": "1:25:03"} +{"current_steps": 51, "total_steps": 1557, "loss": 0.0228, "lr": 1.2820512820512823e-05, "epoch": 0.0983132530120482, "percentage": 3.28, "elapsed_time": "0:02:52", "remaining_time": "1:25:04"} +{"current_steps": 52, "total_steps": 1557, "loss": 0.0176, "lr": 1.3076923076923078e-05, "epoch": 0.10024096385542168, "percentage": 3.34, "elapsed_time": "0:02:55", "remaining_time": "1:24:50"} +{"current_steps": 53, "total_steps": 1557, "loss": 0.0433, "lr": 1.3333333333333333e-05, "epoch": 0.10216867469879518, "percentage": 3.4, "elapsed_time": "0:02:59", "remaining_time": "1:24:43"} +{"current_steps": 54, "total_steps": 1557, "loss": 0.0245, "lr": 1.3589743589743592e-05, "epoch": 0.10409638554216867, "percentage": 3.47, "elapsed_time": "0:03:02", "remaining_time": "1:24:30"} +{"current_steps": 55, "total_steps": 1557, "loss": 0.0224, "lr": 1.3846153846153847e-05, "epoch": 0.10602409638554217, "percentage": 3.53, "elapsed_time": "0:03:05", "remaining_time": "1:24:32"} +{"current_steps": 56, "total_steps": 1557, "loss": 0.0296, "lr": 1.4102564102564105e-05, "epoch": 0.10795180722891566, "percentage": 3.6, "elapsed_time": "0:03:09", "remaining_time": "1:24:28"} +{"current_steps": 57, "total_steps": 1557, "loss": 0.0336, "lr": 1.435897435897436e-05, "epoch": 0.10987951807228916, "percentage": 3.66, "elapsed_time": "0:03:12", "remaining_time": "1:24:22"} +{"current_steps": 58, "total_steps": 1557, "loss": 0.0196, "lr": 1.4615384615384615e-05, "epoch": 0.11180722891566265, "percentage": 3.73, "elapsed_time": "0:03:15", "remaining_time": "1:24:24"} +{"current_steps": 59, "total_steps": 1557, "loss": 0.0207, "lr": 1.4871794871794874e-05, "epoch": 0.11373493975903615, "percentage": 3.79, "elapsed_time": "0:03:19", "remaining_time": "1:24:23"} +{"current_steps": 60, "total_steps": 1557, "loss": 0.047, "lr": 1.5128205128205129e-05, "epoch": 0.11566265060240964, "percentage": 3.85, "elapsed_time": "0:03:22", "remaining_time": "1:24:20"} +{"current_steps": 61, "total_steps": 1557, "loss": 0.0198, "lr": 1.5384615384615387e-05, "epoch": 0.11759036144578314, "percentage": 3.92, "elapsed_time": "0:03:26", "remaining_time": "1:24:18"} +{"current_steps": 62, "total_steps": 1557, "loss": 0.0161, "lr": 1.5641025641025644e-05, "epoch": 0.11951807228915663, "percentage": 3.98, "elapsed_time": "0:03:29", "remaining_time": "1:24:09"} +{"current_steps": 63, "total_steps": 1557, "loss": 0.0275, "lr": 1.5897435897435897e-05, "epoch": 0.12144578313253013, "percentage": 4.05, "elapsed_time": "0:03:32", "remaining_time": "1:24:11"} +{"current_steps": 64, "total_steps": 1557, "loss": 0.0137, "lr": 1.6153846153846154e-05, "epoch": 0.12337349397590361, "percentage": 4.11, "elapsed_time": "0:03:36", "remaining_time": "1:24:04"} +{"current_steps": 65, "total_steps": 1557, "loss": 0.0294, "lr": 1.641025641025641e-05, "epoch": 0.12530120481927712, "percentage": 4.17, "elapsed_time": "0:03:39", "remaining_time": "1:23:57"} +{"current_steps": 66, "total_steps": 1557, "loss": 0.0401, "lr": 1.6666666666666667e-05, "epoch": 0.1272289156626506, "percentage": 4.24, "elapsed_time": "0:03:42", "remaining_time": "1:23:54"} +{"current_steps": 67, "total_steps": 1557, "loss": 0.0292, "lr": 1.6923076923076924e-05, "epoch": 0.1291566265060241, "percentage": 4.3, "elapsed_time": "0:03:46", "remaining_time": "1:23:55"} +{"current_steps": 68, "total_steps": 1557, "loss": 0.0178, "lr": 1.717948717948718e-05, "epoch": 0.1310843373493976, "percentage": 4.37, "elapsed_time": "0:03:49", "remaining_time": "1:23:52"} +{"current_steps": 69, "total_steps": 1557, "loss": 0.0129, "lr": 1.7435897435897438e-05, "epoch": 0.13301204819277107, "percentage": 4.43, "elapsed_time": "0:03:52", "remaining_time": "1:23:37"} +{"current_steps": 70, "total_steps": 1557, "loss": 0.034, "lr": 1.7692307692307694e-05, "epoch": 0.13493975903614458, "percentage": 4.5, "elapsed_time": "0:03:56", "remaining_time": "1:23:34"} +{"current_steps": 71, "total_steps": 1557, "loss": 0.0266, "lr": 1.794871794871795e-05, "epoch": 0.13686746987951806, "percentage": 4.56, "elapsed_time": "0:03:59", "remaining_time": "1:23:34"} +{"current_steps": 72, "total_steps": 1557, "loss": 0.0223, "lr": 1.8205128205128208e-05, "epoch": 0.13879518072289157, "percentage": 4.62, "elapsed_time": "0:04:02", "remaining_time": "1:23:29"} +{"current_steps": 73, "total_steps": 1557, "loss": 0.0187, "lr": 1.8461538461538465e-05, "epoch": 0.14072289156626505, "percentage": 4.69, "elapsed_time": "0:04:06", "remaining_time": "1:23:21"} +{"current_steps": 74, "total_steps": 1557, "loss": 0.0164, "lr": 1.8717948717948718e-05, "epoch": 0.14265060240963856, "percentage": 4.75, "elapsed_time": "0:04:09", "remaining_time": "1:23:20"} +{"current_steps": 75, "total_steps": 1557, "loss": 0.0164, "lr": 1.8974358974358975e-05, "epoch": 0.14457831325301204, "percentage": 4.82, "elapsed_time": "0:04:13", "remaining_time": "1:23:21"} +{"current_steps": 76, "total_steps": 1557, "loss": 0.0296, "lr": 1.923076923076923e-05, "epoch": 0.14650602409638555, "percentage": 4.88, "elapsed_time": "0:04:16", "remaining_time": "1:23:09"} +{"current_steps": 77, "total_steps": 1557, "loss": 0.0148, "lr": 1.9487179487179488e-05, "epoch": 0.14843373493975903, "percentage": 4.95, "elapsed_time": "0:04:19", "remaining_time": "1:23:11"} +{"current_steps": 78, "total_steps": 1557, "loss": 0.0395, "lr": 1.9743589743589745e-05, "epoch": 0.15036144578313254, "percentage": 5.01, "elapsed_time": "0:04:23", "remaining_time": "1:23:09"} +{"current_steps": 79, "total_steps": 1557, "loss": 0.0446, "lr": 2e-05, "epoch": 0.15228915662650602, "percentage": 5.07, "elapsed_time": "0:04:26", "remaining_time": "1:23:11"} +{"current_steps": 80, "total_steps": 1557, "loss": 0.0206, "lr": 2.025641025641026e-05, "epoch": 0.15421686746987953, "percentage": 5.14, "elapsed_time": "0:04:30", "remaining_time": "1:23:06"} +{"current_steps": 81, "total_steps": 1557, "loss": 0.0333, "lr": 2.0512820512820512e-05, "epoch": 0.156144578313253, "percentage": 5.2, "elapsed_time": "0:04:33", "remaining_time": "1:23:04"} +{"current_steps": 82, "total_steps": 1557, "loss": 0.0414, "lr": 2.0769230769230772e-05, "epoch": 0.1580722891566265, "percentage": 5.27, "elapsed_time": "0:04:36", "remaining_time": "1:23:01"} +{"current_steps": 83, "total_steps": 1557, "loss": 0.045, "lr": 2.102564102564103e-05, "epoch": 0.16, "percentage": 5.33, "elapsed_time": "0:04:40", "remaining_time": "1:22:56"} +{"current_steps": 84, "total_steps": 1557, "loss": 0.021, "lr": 2.1282051282051285e-05, "epoch": 0.16192771084337348, "percentage": 5.39, "elapsed_time": "0:04:43", "remaining_time": "1:22:50"} +{"current_steps": 85, "total_steps": 1557, "loss": 0.0389, "lr": 2.153846153846154e-05, "epoch": 0.163855421686747, "percentage": 5.46, "elapsed_time": "0:04:46", "remaining_time": "1:22:46"} +{"current_steps": 86, "total_steps": 1557, "loss": 0.0272, "lr": 2.1794871794871795e-05, "epoch": 0.16578313253012048, "percentage": 5.52, "elapsed_time": "0:04:50", "remaining_time": "1:22:42"} +{"current_steps": 87, "total_steps": 1557, "loss": 0.0368, "lr": 2.2051282051282056e-05, "epoch": 0.16771084337349398, "percentage": 5.59, "elapsed_time": "0:04:53", "remaining_time": "1:22:38"} +{"current_steps": 88, "total_steps": 1557, "loss": 0.0284, "lr": 2.230769230769231e-05, "epoch": 0.16963855421686747, "percentage": 5.65, "elapsed_time": "0:04:57", "remaining_time": "1:22:40"} +{"current_steps": 89, "total_steps": 1557, "loss": 0.0646, "lr": 2.2564102564102566e-05, "epoch": 0.17156626506024097, "percentage": 5.72, "elapsed_time": "0:05:00", "remaining_time": "1:22:37"} +{"current_steps": 90, "total_steps": 1557, "loss": 0.0439, "lr": 2.2820512820512822e-05, "epoch": 0.17349397590361446, "percentage": 5.78, "elapsed_time": "0:05:04", "remaining_time": "1:22:36"} +{"current_steps": 91, "total_steps": 1557, "loss": 0.0109, "lr": 2.3076923076923076e-05, "epoch": 0.17542168674698796, "percentage": 5.84, "elapsed_time": "0:05:07", "remaining_time": "1:22:35"} +{"current_steps": 92, "total_steps": 1557, "loss": 0.0458, "lr": 2.3333333333333336e-05, "epoch": 0.17734939759036145, "percentage": 5.91, "elapsed_time": "0:05:10", "remaining_time": "1:22:27"} +{"current_steps": 93, "total_steps": 1557, "loss": 0.0212, "lr": 2.3589743589743593e-05, "epoch": 0.17927710843373493, "percentage": 5.97, "elapsed_time": "0:05:19", "remaining_time": "1:23:44"} +{"current_steps": 94, "total_steps": 1557, "loss": 0.0215, "lr": 2.384615384615385e-05, "epoch": 0.18120481927710844, "percentage": 6.04, "elapsed_time": "0:05:22", "remaining_time": "1:23:36"} +{"current_steps": 95, "total_steps": 1557, "loss": 0.0218, "lr": 2.4102564102564103e-05, "epoch": 0.18313253012048192, "percentage": 6.1, "elapsed_time": "0:05:25", "remaining_time": "1:23:31"} +{"current_steps": 96, "total_steps": 1557, "loss": 0.0268, "lr": 2.435897435897436e-05, "epoch": 0.18506024096385543, "percentage": 6.17, "elapsed_time": "0:05:29", "remaining_time": "1:23:29"} +{"current_steps": 97, "total_steps": 1557, "loss": 0.0172, "lr": 2.461538461538462e-05, "epoch": 0.1869879518072289, "percentage": 6.23, "elapsed_time": "0:05:32", "remaining_time": "1:23:28"} +{"current_steps": 98, "total_steps": 1557, "loss": 0.0208, "lr": 2.4871794871794873e-05, "epoch": 0.18891566265060242, "percentage": 6.29, "elapsed_time": "0:05:36", "remaining_time": "1:23:23"} +{"current_steps": 99, "total_steps": 1557, "loss": 0.0259, "lr": 2.512820512820513e-05, "epoch": 0.1908433734939759, "percentage": 6.36, "elapsed_time": "0:05:39", "remaining_time": "1:23:16"} +{"current_steps": 100, "total_steps": 1557, "loss": 0.0242, "lr": 2.5384615384615386e-05, "epoch": 0.1927710843373494, "percentage": 6.42, "elapsed_time": "0:05:42", "remaining_time": "1:23:14"} +{"current_steps": 101, "total_steps": 1557, "loss": 0.0188, "lr": 2.5641025641025646e-05, "epoch": 0.1946987951807229, "percentage": 6.49, "elapsed_time": "0:05:46", "remaining_time": "1:23:11"} +{"current_steps": 102, "total_steps": 1557, "loss": 0.0584, "lr": 2.58974358974359e-05, "epoch": 0.1966265060240964, "percentage": 6.55, "elapsed_time": "0:05:49", "remaining_time": "1:23:07"} +{"current_steps": 103, "total_steps": 1557, "loss": 0.0131, "lr": 2.6153846153846157e-05, "epoch": 0.19855421686746988, "percentage": 6.62, "elapsed_time": "0:05:53", "remaining_time": "1:23:05"} +{"current_steps": 104, "total_steps": 1557, "loss": 0.0231, "lr": 2.6410256410256413e-05, "epoch": 0.20048192771084336, "percentage": 6.68, "elapsed_time": "0:05:56", "remaining_time": "1:23:01"} +{"current_steps": 105, "total_steps": 1557, "loss": 0.027, "lr": 2.6666666666666667e-05, "epoch": 0.20240963855421687, "percentage": 6.74, "elapsed_time": "0:06:00", "remaining_time": "1:22:58"} +{"current_steps": 106, "total_steps": 1557, "loss": 0.0224, "lr": 2.6923076923076927e-05, "epoch": 0.20433734939759035, "percentage": 6.81, "elapsed_time": "0:06:03", "remaining_time": "1:22:57"} +{"current_steps": 107, "total_steps": 1557, "loss": 0.0198, "lr": 2.7179487179487183e-05, "epoch": 0.20626506024096386, "percentage": 6.87, "elapsed_time": "0:06:06", "remaining_time": "1:22:50"} +{"current_steps": 108, "total_steps": 1557, "loss": 0.032, "lr": 2.7435897435897437e-05, "epoch": 0.20819277108433734, "percentage": 6.94, "elapsed_time": "0:06:10", "remaining_time": "1:22:46"} +{"current_steps": 109, "total_steps": 1557, "loss": 0.0117, "lr": 2.7692307692307694e-05, "epoch": 0.21012048192771085, "percentage": 7.0, "elapsed_time": "0:06:13", "remaining_time": "1:22:40"} +{"current_steps": 110, "total_steps": 1557, "loss": 0.0337, "lr": 2.794871794871795e-05, "epoch": 0.21204819277108433, "percentage": 7.06, "elapsed_time": "0:06:16", "remaining_time": "1:22:36"} +{"current_steps": 111, "total_steps": 1557, "loss": 0.0131, "lr": 2.820512820512821e-05, "epoch": 0.21397590361445784, "percentage": 7.13, "elapsed_time": "0:06:20", "remaining_time": "1:22:32"} +{"current_steps": 112, "total_steps": 1557, "loss": 0.0233, "lr": 2.8461538461538464e-05, "epoch": 0.21590361445783132, "percentage": 7.19, "elapsed_time": "0:06:23", "remaining_time": "1:22:28"} +{"current_steps": 113, "total_steps": 1557, "loss": 0.0204, "lr": 2.871794871794872e-05, "epoch": 0.21783132530120483, "percentage": 7.26, "elapsed_time": "0:06:27", "remaining_time": "1:22:26"} +{"current_steps": 114, "total_steps": 1557, "loss": 0.016, "lr": 2.8974358974358977e-05, "epoch": 0.2197590361445783, "percentage": 7.32, "elapsed_time": "0:06:30", "remaining_time": "1:22:21"} +{"current_steps": 115, "total_steps": 1557, "loss": 0.0135, "lr": 2.923076923076923e-05, "epoch": 0.2216867469879518, "percentage": 7.39, "elapsed_time": "0:06:33", "remaining_time": "1:22:18"} +{"current_steps": 116, "total_steps": 1557, "loss": 0.0233, "lr": 2.948717948717949e-05, "epoch": 0.2236144578313253, "percentage": 7.45, "elapsed_time": "0:06:37", "remaining_time": "1:22:12"} +{"current_steps": 117, "total_steps": 1557, "loss": 0.0198, "lr": 2.9743589743589747e-05, "epoch": 0.22554216867469878, "percentage": 7.51, "elapsed_time": "0:06:40", "remaining_time": "1:22:09"} +{"current_steps": 118, "total_steps": 1557, "loss": 0.0246, "lr": 3.0000000000000004e-05, "epoch": 0.2274698795180723, "percentage": 7.58, "elapsed_time": "0:06:43", "remaining_time": "1:22:01"} +{"current_steps": 119, "total_steps": 1557, "loss": 0.025, "lr": 3.0256410256410257e-05, "epoch": 0.22939759036144577, "percentage": 7.64, "elapsed_time": "0:06:46", "remaining_time": "1:21:57"} +{"current_steps": 120, "total_steps": 1557, "loss": 0.031, "lr": 3.0512820512820514e-05, "epoch": 0.23132530120481928, "percentage": 7.71, "elapsed_time": "0:06:50", "remaining_time": "1:21:52"} +{"current_steps": 121, "total_steps": 1557, "loss": 0.0519, "lr": 3.0769230769230774e-05, "epoch": 0.23325301204819276, "percentage": 7.77, "elapsed_time": "0:06:53", "remaining_time": "1:21:44"} +{"current_steps": 122, "total_steps": 1557, "loss": 0.0325, "lr": 3.102564102564103e-05, "epoch": 0.23518072289156627, "percentage": 7.84, "elapsed_time": "0:06:56", "remaining_time": "1:21:39"} +{"current_steps": 123, "total_steps": 1557, "loss": 0.0252, "lr": 3.128205128205129e-05, "epoch": 0.23710843373493976, "percentage": 7.9, "elapsed_time": "0:06:59", "remaining_time": "1:21:33"} +{"current_steps": 124, "total_steps": 1557, "loss": 0.0262, "lr": 3.153846153846154e-05, "epoch": 0.23903614457831326, "percentage": 7.96, "elapsed_time": "0:07:02", "remaining_time": "1:21:28"} +{"current_steps": 125, "total_steps": 1557, "loss": 0.0243, "lr": 3.1794871794871795e-05, "epoch": 0.24096385542168675, "percentage": 8.03, "elapsed_time": "0:07:06", "remaining_time": "1:21:22"} +{"current_steps": 126, "total_steps": 1557, "loss": 0.0178, "lr": 3.205128205128206e-05, "epoch": 0.24289156626506025, "percentage": 8.09, "elapsed_time": "0:07:09", "remaining_time": "1:21:18"} +{"current_steps": 127, "total_steps": 1557, "loss": 0.023, "lr": 3.230769230769231e-05, "epoch": 0.24481927710843374, "percentage": 8.16, "elapsed_time": "0:07:12", "remaining_time": "1:21:11"} +{"current_steps": 128, "total_steps": 1557, "loss": 0.032, "lr": 3.2564102564102565e-05, "epoch": 0.24674698795180722, "percentage": 8.22, "elapsed_time": "0:07:16", "remaining_time": "1:21:10"} +{"current_steps": 129, "total_steps": 1557, "loss": 0.0231, "lr": 3.282051282051282e-05, "epoch": 0.24867469879518073, "percentage": 8.29, "elapsed_time": "0:07:19", "remaining_time": "1:21:09"} +{"current_steps": 130, "total_steps": 1557, "loss": 0.0147, "lr": 3.307692307692308e-05, "epoch": 0.25060240963855424, "percentage": 8.35, "elapsed_time": "0:07:23", "remaining_time": "1:21:04"} +{"current_steps": 131, "total_steps": 1557, "loss": 0.0439, "lr": 3.3333333333333335e-05, "epoch": 0.2525301204819277, "percentage": 8.41, "elapsed_time": "0:07:26", "remaining_time": "1:20:59"} +{"current_steps": 132, "total_steps": 1557, "loss": 0.0282, "lr": 3.358974358974359e-05, "epoch": 0.2544578313253012, "percentage": 8.48, "elapsed_time": "0:07:29", "remaining_time": "1:20:56"} +{"current_steps": 133, "total_steps": 1557, "loss": 0.0515, "lr": 3.384615384615385e-05, "epoch": 0.2563855421686747, "percentage": 8.54, "elapsed_time": "0:07:33", "remaining_time": "1:20:53"} +{"current_steps": 134, "total_steps": 1557, "loss": 0.0781, "lr": 3.4102564102564105e-05, "epoch": 0.2583132530120482, "percentage": 8.61, "elapsed_time": "0:07:36", "remaining_time": "1:20:52"} +{"current_steps": 135, "total_steps": 1557, "loss": 0.0306, "lr": 3.435897435897436e-05, "epoch": 0.26024096385542167, "percentage": 8.67, "elapsed_time": "0:07:40", "remaining_time": "1:20:49"} +{"current_steps": 136, "total_steps": 1557, "loss": 0.0154, "lr": 3.461538461538462e-05, "epoch": 0.2621686746987952, "percentage": 8.73, "elapsed_time": "0:07:43", "remaining_time": "1:20:46"} +{"current_steps": 137, "total_steps": 1557, "loss": 0.0235, "lr": 3.4871794871794875e-05, "epoch": 0.2640963855421687, "percentage": 8.8, "elapsed_time": "0:07:47", "remaining_time": "1:20:42"} +{"current_steps": 138, "total_steps": 1557, "loss": 0.0453, "lr": 3.512820512820513e-05, "epoch": 0.26602409638554214, "percentage": 8.86, "elapsed_time": "0:07:50", "remaining_time": "1:20:37"} +{"current_steps": 139, "total_steps": 1557, "loss": 0.0427, "lr": 3.538461538461539e-05, "epoch": 0.26795180722891565, "percentage": 8.93, "elapsed_time": "0:07:53", "remaining_time": "1:20:33"} +{"current_steps": 140, "total_steps": 1557, "loss": 0.04, "lr": 3.5641025641025646e-05, "epoch": 0.26987951807228916, "percentage": 8.99, "elapsed_time": "0:07:57", "remaining_time": "1:20:33"} +{"current_steps": 141, "total_steps": 1557, "loss": 0.0188, "lr": 3.58974358974359e-05, "epoch": 0.27180722891566267, "percentage": 9.06, "elapsed_time": "0:08:00", "remaining_time": "1:20:29"} +{"current_steps": 142, "total_steps": 1557, "loss": 0.0253, "lr": 3.615384615384616e-05, "epoch": 0.2737349397590361, "percentage": 9.12, "elapsed_time": "0:08:04", "remaining_time": "1:20:27"} +{"current_steps": 143, "total_steps": 1557, "loss": 0.0316, "lr": 3.6410256410256416e-05, "epoch": 0.27566265060240963, "percentage": 9.18, "elapsed_time": "0:08:07", "remaining_time": "1:20:22"} +{"current_steps": 144, "total_steps": 1557, "loss": 0.0266, "lr": 3.6666666666666666e-05, "epoch": 0.27759036144578314, "percentage": 9.25, "elapsed_time": "0:08:10", "remaining_time": "1:20:16"} +{"current_steps": 145, "total_steps": 1557, "loss": 0.0281, "lr": 3.692307692307693e-05, "epoch": 0.27951807228915665, "percentage": 9.31, "elapsed_time": "0:08:14", "remaining_time": "1:20:12"} +{"current_steps": 146, "total_steps": 1557, "loss": 0.0295, "lr": 3.7179487179487186e-05, "epoch": 0.2814457831325301, "percentage": 9.38, "elapsed_time": "0:08:17", "remaining_time": "1:20:07"} +{"current_steps": 147, "total_steps": 1557, "loss": 0.0322, "lr": 3.7435897435897436e-05, "epoch": 0.2833734939759036, "percentage": 9.44, "elapsed_time": "0:08:20", "remaining_time": "1:20:02"} +{"current_steps": 148, "total_steps": 1557, "loss": 0.0266, "lr": 3.769230769230769e-05, "epoch": 0.2853012048192771, "percentage": 9.51, "elapsed_time": "0:08:23", "remaining_time": "1:19:57"} +{"current_steps": 149, "total_steps": 1557, "loss": 0.0677, "lr": 3.794871794871795e-05, "epoch": 0.28722891566265063, "percentage": 9.57, "elapsed_time": "0:08:27", "remaining_time": "1:19:55"} +{"current_steps": 150, "total_steps": 1557, "loss": 0.0324, "lr": 3.820512820512821e-05, "epoch": 0.2891566265060241, "percentage": 9.63, "elapsed_time": "0:08:30", "remaining_time": "1:19:52"} +{"current_steps": 151, "total_steps": 1557, "loss": 0.0259, "lr": 3.846153846153846e-05, "epoch": 0.2910843373493976, "percentage": 9.7, "elapsed_time": "0:08:34", "remaining_time": "1:19:47"} +{"current_steps": 152, "total_steps": 1557, "loss": 0.0358, "lr": 3.871794871794872e-05, "epoch": 0.2930120481927711, "percentage": 9.76, "elapsed_time": "0:08:37", "remaining_time": "1:19:43"} +{"current_steps": 153, "total_steps": 1557, "loss": 0.0447, "lr": 3.8974358974358976e-05, "epoch": 0.29493975903614456, "percentage": 9.83, "elapsed_time": "0:08:41", "remaining_time": "1:19:41"} +{"current_steps": 154, "total_steps": 1557, "loss": 0.0409, "lr": 3.923076923076923e-05, "epoch": 0.29686746987951806, "percentage": 9.89, "elapsed_time": "0:08:44", "remaining_time": "1:19:35"} +{"current_steps": 155, "total_steps": 1557, "loss": 0.0317, "lr": 3.948717948717949e-05, "epoch": 0.2987951807228916, "percentage": 9.96, "elapsed_time": "0:08:47", "remaining_time": "1:19:29"} +{"current_steps": 156, "total_steps": 1557, "loss": 0.0306, "lr": 3.9743589743589747e-05, "epoch": 0.3007228915662651, "percentage": 10.02, "elapsed_time": "0:08:50", "remaining_time": "1:19:27"} +{"current_steps": 157, "total_steps": 1557, "loss": 0.0324, "lr": 4e-05, "epoch": 0.30265060240963854, "percentage": 10.08, "elapsed_time": "0:08:54", "remaining_time": "1:19:23"} +{"current_steps": 158, "total_steps": 1557, "loss": 0.0354, "lr": 3.999994971675547e-05, "epoch": 0.30457831325301205, "percentage": 10.15, "elapsed_time": "0:08:57", "remaining_time": "1:19:21"} +{"current_steps": 159, "total_steps": 1557, "loss": 0.0366, "lr": 3.999979886727471e-05, "epoch": 0.30650602409638555, "percentage": 10.21, "elapsed_time": "0:09:01", "remaining_time": "1:19:17"} +{"current_steps": 160, "total_steps": 1557, "loss": 0.0437, "lr": 3.999954745231624e-05, "epoch": 0.30843373493975906, "percentage": 10.28, "elapsed_time": "0:09:04", "remaining_time": "1:19:13"} +{"current_steps": 161, "total_steps": 1557, "loss": 0.0363, "lr": 3.999919547314426e-05, "epoch": 0.3103614457831325, "percentage": 10.34, "elapsed_time": "0:09:07", "remaining_time": "1:19:09"} +{"current_steps": 162, "total_steps": 1557, "loss": 0.0259, "lr": 3.999874293152863e-05, "epoch": 0.312289156626506, "percentage": 10.4, "elapsed_time": "0:09:10", "remaining_time": "1:19:04"} +{"current_steps": 163, "total_steps": 1557, "loss": 0.0341, "lr": 3.9998189829744885e-05, "epoch": 0.31421686746987953, "percentage": 10.47, "elapsed_time": "0:09:14", "remaining_time": "1:18:59"} +{"current_steps": 164, "total_steps": 1557, "loss": 0.0424, "lr": 3.99975361705742e-05, "epoch": 0.316144578313253, "percentage": 10.53, "elapsed_time": "0:09:17", "remaining_time": "1:18:54"} +{"current_steps": 165, "total_steps": 1557, "loss": 0.0535, "lr": 3.999678195730337e-05, "epoch": 0.3180722891566265, "percentage": 10.6, "elapsed_time": "0:09:20", "remaining_time": "1:18:51"} +{"current_steps": 166, "total_steps": 1557, "loss": 0.0284, "lr": 3.999592719372484e-05, "epoch": 0.32, "percentage": 10.66, "elapsed_time": "0:09:24", "remaining_time": "1:18:49"} +{"current_steps": 167, "total_steps": 1557, "loss": 0.0313, "lr": 3.9994971884136636e-05, "epoch": 0.3219277108433735, "percentage": 10.73, "elapsed_time": "0:09:27", "remaining_time": "1:18:46"} +{"current_steps": 168, "total_steps": 1557, "loss": 0.0524, "lr": 3.9993916033342355e-05, "epoch": 0.32385542168674697, "percentage": 10.79, "elapsed_time": "0:09:31", "remaining_time": "1:18:41"} +{"current_steps": 169, "total_steps": 1557, "loss": 0.0282, "lr": 3.999275964665117e-05, "epoch": 0.3257831325301205, "percentage": 10.85, "elapsed_time": "0:09:34", "remaining_time": "1:18:38"} +{"current_steps": 170, "total_steps": 1557, "loss": 0.0293, "lr": 3.999150272987776e-05, "epoch": 0.327710843373494, "percentage": 10.92, "elapsed_time": "0:09:37", "remaining_time": "1:18:32"} +{"current_steps": 171, "total_steps": 1557, "loss": 0.0221, "lr": 3.999014528934232e-05, "epoch": 0.3296385542168675, "percentage": 10.98, "elapsed_time": "0:09:40", "remaining_time": "1:18:28"} +{"current_steps": 172, "total_steps": 1557, "loss": 0.0302, "lr": 3.998868733187048e-05, "epoch": 0.33156626506024095, "percentage": 11.05, "elapsed_time": "0:09:44", "remaining_time": "1:18:24"} +{"current_steps": 173, "total_steps": 1557, "loss": 0.0364, "lr": 3.998712886479335e-05, "epoch": 0.33349397590361446, "percentage": 11.11, "elapsed_time": "0:09:47", "remaining_time": "1:18:20"} +{"current_steps": 174, "total_steps": 1557, "loss": 0.0296, "lr": 3.998546989594739e-05, "epoch": 0.33542168674698797, "percentage": 11.18, "elapsed_time": "0:09:50", "remaining_time": "1:18:16"} +{"current_steps": 175, "total_steps": 1557, "loss": 0.0549, "lr": 3.998371043367445e-05, "epoch": 0.3373493975903614, "percentage": 11.24, "elapsed_time": "0:09:54", "remaining_time": "1:18:12"} +{"current_steps": 176, "total_steps": 1557, "loss": 0.0577, "lr": 3.998185048682166e-05, "epoch": 0.33927710843373493, "percentage": 11.3, "elapsed_time": "0:09:57", "remaining_time": "1:18:07"} +{"current_steps": 177, "total_steps": 1557, "loss": 0.0313, "lr": 3.997989006474144e-05, "epoch": 0.34120481927710844, "percentage": 11.37, "elapsed_time": "0:10:01", "remaining_time": "1:18:05"} +{"current_steps": 178, "total_steps": 1557, "loss": 0.0309, "lr": 3.997782917729143e-05, "epoch": 0.34313253012048195, "percentage": 11.43, "elapsed_time": "0:10:04", "remaining_time": "1:18:04"} +{"current_steps": 179, "total_steps": 1557, "loss": 0.0166, "lr": 3.997566783483445e-05, "epoch": 0.3450602409638554, "percentage": 11.5, "elapsed_time": "0:10:07", "remaining_time": "1:17:56"} +{"current_steps": 180, "total_steps": 1557, "loss": 0.0254, "lr": 3.9973406048238413e-05, "epoch": 0.3469879518072289, "percentage": 11.56, "elapsed_time": "0:10:10", "remaining_time": "1:17:52"} +{"current_steps": 181, "total_steps": 1557, "loss": 0.0239, "lr": 3.9971043828876334e-05, "epoch": 0.3489156626506024, "percentage": 11.62, "elapsed_time": "0:10:14", "remaining_time": "1:17:48"} +{"current_steps": 182, "total_steps": 1557, "loss": 0.0404, "lr": 3.9968581188626204e-05, "epoch": 0.35084337349397593, "percentage": 11.69, "elapsed_time": "0:10:17", "remaining_time": "1:17:42"} +{"current_steps": 183, "total_steps": 1557, "loss": 0.0127, "lr": 3.996601813987098e-05, "epoch": 0.3527710843373494, "percentage": 11.75, "elapsed_time": "0:10:20", "remaining_time": "1:17:38"} +{"current_steps": 184, "total_steps": 1557, "loss": 0.0176, "lr": 3.996335469549852e-05, "epoch": 0.3546987951807229, "percentage": 11.82, "elapsed_time": "0:10:23", "remaining_time": "1:17:31"} +{"current_steps": 185, "total_steps": 1557, "loss": 0.0457, "lr": 3.9960590868901465e-05, "epoch": 0.3566265060240964, "percentage": 11.88, "elapsed_time": "0:10:31", "remaining_time": "1:18:05"} +{"current_steps": 186, "total_steps": 1557, "loss": 0.0271, "lr": 3.995772667397725e-05, "epoch": 0.35855421686746985, "percentage": 11.95, "elapsed_time": "0:10:34", "remaining_time": "1:17:58"} +{"current_steps": 187, "total_steps": 1557, "loss": 0.0297, "lr": 3.995476212512795e-05, "epoch": 0.36048192771084336, "percentage": 12.01, "elapsed_time": "0:10:37", "remaining_time": "1:17:53"} +{"current_steps": 188, "total_steps": 1557, "loss": 0.0322, "lr": 3.99516972372603e-05, "epoch": 0.3624096385542169, "percentage": 12.07, "elapsed_time": "0:10:41", "remaining_time": "1:17:49"} +{"current_steps": 189, "total_steps": 1557, "loss": 0.0253, "lr": 3.9948532025785546e-05, "epoch": 0.3643373493975904, "percentage": 12.14, "elapsed_time": "0:10:44", "remaining_time": "1:17:46"} +{"current_steps": 190, "total_steps": 1557, "loss": 0.0355, "lr": 3.9945266506619403e-05, "epoch": 0.36626506024096384, "percentage": 12.2, "elapsed_time": "0:10:48", "remaining_time": "1:17:44"} +{"current_steps": 191, "total_steps": 1557, "loss": 0.056, "lr": 3.994190069618195e-05, "epoch": 0.36819277108433734, "percentage": 12.27, "elapsed_time": "0:10:51", "remaining_time": "1:17:38"} +{"current_steps": 192, "total_steps": 1557, "loss": 0.0415, "lr": 3.993843461139757e-05, "epoch": 0.37012048192771085, "percentage": 12.33, "elapsed_time": "0:10:54", "remaining_time": "1:17:33"} +{"current_steps": 193, "total_steps": 1557, "loss": 0.0379, "lr": 3.9934868269694886e-05, "epoch": 0.37204819277108436, "percentage": 12.4, "elapsed_time": "0:10:58", "remaining_time": "1:17:31"} +{"current_steps": 194, "total_steps": 1557, "loss": 0.0237, "lr": 3.9931201689006595e-05, "epoch": 0.3739759036144578, "percentage": 12.46, "elapsed_time": "0:11:01", "remaining_time": "1:17:27"} +{"current_steps": 195, "total_steps": 1557, "loss": 0.024, "lr": 3.992743488776947e-05, "epoch": 0.3759036144578313, "percentage": 12.52, "elapsed_time": "0:11:04", "remaining_time": "1:17:22"} +{"current_steps": 196, "total_steps": 1557, "loss": 0.0273, "lr": 3.992356788492421e-05, "epoch": 0.37783132530120483, "percentage": 12.59, "elapsed_time": "0:11:07", "remaining_time": "1:17:14"} +{"current_steps": 197, "total_steps": 1557, "loss": 0.0411, "lr": 3.9919600699915355e-05, "epoch": 0.3797590361445783, "percentage": 12.65, "elapsed_time": "0:11:10", "remaining_time": "1:17:11"} +{"current_steps": 198, "total_steps": 1557, "loss": 0.0857, "lr": 3.991553335269119e-05, "epoch": 0.3816867469879518, "percentage": 12.72, "elapsed_time": "0:11:14", "remaining_time": "1:17:07"} +{"current_steps": 199, "total_steps": 1557, "loss": 0.0294, "lr": 3.991136586370367e-05, "epoch": 0.3836144578313253, "percentage": 12.78, "elapsed_time": "0:11:17", "remaining_time": "1:17:04"} +{"current_steps": 200, "total_steps": 1557, "loss": 0.0395, "lr": 3.990709825390828e-05, "epoch": 0.3855421686746988, "percentage": 12.85, "elapsed_time": "0:11:21", "remaining_time": "1:17:00"} +{"current_steps": 201, "total_steps": 1557, "loss": 0.0194, "lr": 3.9902730544763936e-05, "epoch": 0.38746987951807227, "percentage": 12.91, "elapsed_time": "0:11:24", "remaining_time": "1:16:56"} +{"current_steps": 202, "total_steps": 1557, "loss": 0.0381, "lr": 3.989826275823291e-05, "epoch": 0.3893975903614458, "percentage": 12.97, "elapsed_time": "0:11:27", "remaining_time": "1:16:53"} +{"current_steps": 203, "total_steps": 1557, "loss": 0.0254, "lr": 3.989369491678067e-05, "epoch": 0.3913253012048193, "percentage": 13.04, "elapsed_time": "0:11:31", "remaining_time": "1:16:50"} +{"current_steps": 204, "total_steps": 1557, "loss": 0.048, "lr": 3.988902704337582e-05, "epoch": 0.3932530120481928, "percentage": 13.1, "elapsed_time": "0:11:34", "remaining_time": "1:16:44"} +{"current_steps": 205, "total_steps": 1557, "loss": 0.0268, "lr": 3.9884259161489936e-05, "epoch": 0.39518072289156625, "percentage": 13.17, "elapsed_time": "0:11:37", "remaining_time": "1:16:40"} +{"current_steps": 206, "total_steps": 1557, "loss": 0.0192, "lr": 3.987939129509746e-05, "epoch": 0.39710843373493976, "percentage": 13.23, "elapsed_time": "0:11:41", "remaining_time": "1:16:38"} +{"current_steps": 207, "total_steps": 1557, "loss": 0.0362, "lr": 3.9874423468675624e-05, "epoch": 0.39903614457831327, "percentage": 13.29, "elapsed_time": "0:11:44", "remaining_time": "1:16:34"} +{"current_steps": 208, "total_steps": 1557, "loss": 0.017, "lr": 3.9869355707204266e-05, "epoch": 0.4009638554216867, "percentage": 13.36, "elapsed_time": "0:11:47", "remaining_time": "1:16:30"} +{"current_steps": 209, "total_steps": 1557, "loss": 0.0283, "lr": 3.986418803616573e-05, "epoch": 0.40289156626506023, "percentage": 13.42, "elapsed_time": "0:11:51", "remaining_time": "1:16:25"} +{"current_steps": 210, "total_steps": 1557, "loss": 0.0158, "lr": 3.985892048154474e-05, "epoch": 0.40481927710843374, "percentage": 13.49, "elapsed_time": "0:11:54", "remaining_time": "1:16:22"} +{"current_steps": 211, "total_steps": 1557, "loss": 0.0292, "lr": 3.9853553069828284e-05, "epoch": 0.40674698795180725, "percentage": 13.55, "elapsed_time": "0:11:57", "remaining_time": "1:16:16"} +{"current_steps": 212, "total_steps": 1557, "loss": 0.0281, "lr": 3.984808582800543e-05, "epoch": 0.4086746987951807, "percentage": 13.62, "elapsed_time": "0:12:00", "remaining_time": "1:16:13"} +{"current_steps": 213, "total_steps": 1557, "loss": 0.031, "lr": 3.984251878356726e-05, "epoch": 0.4106024096385542, "percentage": 13.68, "elapsed_time": "0:12:04", "remaining_time": "1:16:11"} +{"current_steps": 214, "total_steps": 1557, "loss": 0.0166, "lr": 3.983685196450667e-05, "epoch": 0.4125301204819277, "percentage": 13.74, "elapsed_time": "0:12:07", "remaining_time": "1:16:07"} +{"current_steps": 215, "total_steps": 1557, "loss": 0.0326, "lr": 3.9831085399318265e-05, "epoch": 0.41445783132530123, "percentage": 13.81, "elapsed_time": "0:12:11", "remaining_time": "1:16:03"} +{"current_steps": 216, "total_steps": 1557, "loss": 0.0118, "lr": 3.982521911699822e-05, "epoch": 0.4163855421686747, "percentage": 13.87, "elapsed_time": "0:12:14", "remaining_time": "1:15:59"} +{"current_steps": 217, "total_steps": 1557, "loss": 0.0246, "lr": 3.9819253147044084e-05, "epoch": 0.4183132530120482, "percentage": 13.94, "elapsed_time": "0:12:17", "remaining_time": "1:15:56"} +{"current_steps": 218, "total_steps": 1557, "loss": 0.036, "lr": 3.98131875194547e-05, "epoch": 0.4202409638554217, "percentage": 14.0, "elapsed_time": "0:12:21", "remaining_time": "1:15:54"} +{"current_steps": 219, "total_steps": 1557, "loss": 0.0255, "lr": 3.9807022264730024e-05, "epoch": 0.42216867469879515, "percentage": 14.07, "elapsed_time": "0:12:24", "remaining_time": "1:15:49"} +{"current_steps": 220, "total_steps": 1557, "loss": 0.0187, "lr": 3.980075741387094e-05, "epoch": 0.42409638554216866, "percentage": 14.13, "elapsed_time": "0:12:27", "remaining_time": "1:15:45"} +{"current_steps": 221, "total_steps": 1557, "loss": 0.0214, "lr": 3.979439299837915e-05, "epoch": 0.4260240963855422, "percentage": 14.19, "elapsed_time": "0:12:31", "remaining_time": "1:15:43"} +{"current_steps": 222, "total_steps": 1557, "loss": 0.0628, "lr": 3.978792905025702e-05, "epoch": 0.4279518072289157, "percentage": 14.26, "elapsed_time": "0:12:34", "remaining_time": "1:15:39"} +{"current_steps": 223, "total_steps": 1557, "loss": 0.0302, "lr": 3.978136560200735e-05, "epoch": 0.42987951807228914, "percentage": 14.32, "elapsed_time": "0:12:38", "remaining_time": "1:15:36"} +{"current_steps": 224, "total_steps": 1557, "loss": 0.0125, "lr": 3.977470268663331e-05, "epoch": 0.43180722891566264, "percentage": 14.39, "elapsed_time": "0:12:41", "remaining_time": "1:15:28"} +{"current_steps": 225, "total_steps": 1557, "loss": 0.0246, "lr": 3.976794033763819e-05, "epoch": 0.43373493975903615, "percentage": 14.45, "elapsed_time": "0:12:44", "remaining_time": "1:15:25"} +{"current_steps": 226, "total_steps": 1557, "loss": 0.0212, "lr": 3.9761078589025276e-05, "epoch": 0.43566265060240966, "percentage": 14.52, "elapsed_time": "0:12:47", "remaining_time": "1:15:21"} +{"current_steps": 227, "total_steps": 1557, "loss": 0.0125, "lr": 3.9754117475297664e-05, "epoch": 0.4375903614457831, "percentage": 14.58, "elapsed_time": "0:12:50", "remaining_time": "1:15:15"} +{"current_steps": 228, "total_steps": 1557, "loss": 0.0364, "lr": 3.97470570314581e-05, "epoch": 0.4395180722891566, "percentage": 14.64, "elapsed_time": "0:12:54", "remaining_time": "1:15:12"} +{"current_steps": 229, "total_steps": 1557, "loss": 0.0128, "lr": 3.973989729300878e-05, "epoch": 0.44144578313253013, "percentage": 14.71, "elapsed_time": "0:12:57", "remaining_time": "1:15:08"} +{"current_steps": 230, "total_steps": 1557, "loss": 0.0367, "lr": 3.9732638295951195e-05, "epoch": 0.4433734939759036, "percentage": 14.77, "elapsed_time": "0:13:00", "remaining_time": "1:15:04"} +{"current_steps": 231, "total_steps": 1557, "loss": 0.0667, "lr": 3.972528007678594e-05, "epoch": 0.4453012048192771, "percentage": 14.84, "elapsed_time": "0:13:04", "remaining_time": "1:15:00"} +{"current_steps": 232, "total_steps": 1557, "loss": 0.0655, "lr": 3.9717822672512516e-05, "epoch": 0.4472289156626506, "percentage": 14.9, "elapsed_time": "0:13:07", "remaining_time": "1:14:58"} +{"current_steps": 233, "total_steps": 1557, "loss": 0.064, "lr": 3.971026612062919e-05, "epoch": 0.4491566265060241, "percentage": 14.96, "elapsed_time": "0:13:11", "remaining_time": "1:14:56"} +{"current_steps": 234, "total_steps": 1557, "loss": 0.0206, "lr": 3.970261045913274e-05, "epoch": 0.45108433734939757, "percentage": 15.03, "elapsed_time": "0:13:14", "remaining_time": "1:14:52"} +{"current_steps": 235, "total_steps": 1557, "loss": 0.0486, "lr": 3.969485572651833e-05, "epoch": 0.4530120481927711, "percentage": 15.09, "elapsed_time": "0:13:17", "remaining_time": "1:14:45"} +{"current_steps": 236, "total_steps": 1557, "loss": 0.0262, "lr": 3.968700196177925e-05, "epoch": 0.4549397590361446, "percentage": 15.16, "elapsed_time": "0:13:20", "remaining_time": "1:14:42"} +{"current_steps": 237, "total_steps": 1557, "loss": 0.014, "lr": 3.96790492044068e-05, "epoch": 0.4568674698795181, "percentage": 15.22, "elapsed_time": "0:13:24", "remaining_time": "1:14:40"} +{"current_steps": 238, "total_steps": 1557, "loss": 0.0482, "lr": 3.967099749439002e-05, "epoch": 0.45879518072289155, "percentage": 15.29, "elapsed_time": "0:13:27", "remaining_time": "1:14:36"} +{"current_steps": 239, "total_steps": 1557, "loss": 0.0289, "lr": 3.966284687221551e-05, "epoch": 0.46072289156626506, "percentage": 15.35, "elapsed_time": "0:13:30", "remaining_time": "1:14:31"} +{"current_steps": 240, "total_steps": 1557, "loss": 0.0331, "lr": 3.9654597378867256e-05, "epoch": 0.46265060240963857, "percentage": 15.41, "elapsed_time": "0:13:34", "remaining_time": "1:14:28"} +{"current_steps": 241, "total_steps": 1557, "loss": 0.0925, "lr": 3.964624905582637e-05, "epoch": 0.464578313253012, "percentage": 15.48, "elapsed_time": "0:13:37", "remaining_time": "1:14:25"} +{"current_steps": 242, "total_steps": 1557, "loss": 0.015, "lr": 3.9637801945070944e-05, "epoch": 0.46650602409638553, "percentage": 15.54, "elapsed_time": "0:13:41", "remaining_time": "1:14:21"} +{"current_steps": 243, "total_steps": 1557, "loss": 0.0382, "lr": 3.962925608907579e-05, "epoch": 0.46843373493975904, "percentage": 15.61, "elapsed_time": "0:13:44", "remaining_time": "1:14:18"} +{"current_steps": 244, "total_steps": 1557, "loss": 0.0257, "lr": 3.962061153081224e-05, "epoch": 0.47036144578313255, "percentage": 15.67, "elapsed_time": "0:13:48", "remaining_time": "1:14:16"} +{"current_steps": 245, "total_steps": 1557, "loss": 0.0551, "lr": 3.961186831374793e-05, "epoch": 0.472289156626506, "percentage": 15.74, "elapsed_time": "0:13:51", "remaining_time": "1:14:13"} +{"current_steps": 246, "total_steps": 1557, "loss": 0.0186, "lr": 3.9603026481846616e-05, "epoch": 0.4742168674698795, "percentage": 15.8, "elapsed_time": "0:13:54", "remaining_time": "1:14:08"} +{"current_steps": 247, "total_steps": 1557, "loss": 0.024, "lr": 3.959408607956787e-05, "epoch": 0.476144578313253, "percentage": 15.86, "elapsed_time": "0:13:57", "remaining_time": "1:14:03"} +{"current_steps": 248, "total_steps": 1557, "loss": 0.0256, "lr": 3.958504715186695e-05, "epoch": 0.47807228915662653, "percentage": 15.93, "elapsed_time": "0:14:01", "remaining_time": "1:13:59"} +{"current_steps": 249, "total_steps": 1557, "loss": 0.0222, "lr": 3.957590974419452e-05, "epoch": 0.48, "percentage": 15.99, "elapsed_time": "0:14:04", "remaining_time": "1:13:55"} +{"current_steps": 250, "total_steps": 1557, "loss": 0.0334, "lr": 3.956667390249642e-05, "epoch": 0.4819277108433735, "percentage": 16.06, "elapsed_time": "0:14:07", "remaining_time": "1:13:52"} +{"current_steps": 251, "total_steps": 1557, "loss": 0.0345, "lr": 3.9557339673213474e-05, "epoch": 0.483855421686747, "percentage": 16.12, "elapsed_time": "0:14:11", "remaining_time": "1:13:48"} +{"current_steps": 252, "total_steps": 1557, "loss": 0.0183, "lr": 3.95479071032812e-05, "epoch": 0.4857831325301205, "percentage": 16.18, "elapsed_time": "0:14:14", "remaining_time": "1:13:44"} +{"current_steps": 253, "total_steps": 1557, "loss": 0.0337, "lr": 3.953837624012963e-05, "epoch": 0.48771084337349396, "percentage": 16.25, "elapsed_time": "0:14:17", "remaining_time": "1:13:41"} +{"current_steps": 254, "total_steps": 1557, "loss": 0.0524, "lr": 3.9528747131683023e-05, "epoch": 0.48963855421686747, "percentage": 16.31, "elapsed_time": "0:14:21", "remaining_time": "1:13:38"} +{"current_steps": 255, "total_steps": 1557, "loss": 0.0248, "lr": 3.9519019826359676e-05, "epoch": 0.491566265060241, "percentage": 16.38, "elapsed_time": "0:14:24", "remaining_time": "1:13:33"} +{"current_steps": 256, "total_steps": 1557, "loss": 0.0219, "lr": 3.9509194373071624e-05, "epoch": 0.49349397590361443, "percentage": 16.44, "elapsed_time": "0:14:27", "remaining_time": "1:13:29"} +{"current_steps": 257, "total_steps": 1557, "loss": 0.0312, "lr": 3.9499270821224444e-05, "epoch": 0.49542168674698794, "percentage": 16.51, "elapsed_time": "0:14:31", "remaining_time": "1:13:27"} +{"current_steps": 258, "total_steps": 1557, "loss": 0.0149, "lr": 3.9489249220716974e-05, "epoch": 0.49734939759036145, "percentage": 16.57, "elapsed_time": "0:14:34", "remaining_time": "1:13:23"} +{"current_steps": 259, "total_steps": 1557, "loss": 0.0214, "lr": 3.947912962194107e-05, "epoch": 0.49927710843373496, "percentage": 16.63, "elapsed_time": "0:14:37", "remaining_time": "1:13:19"} +{"current_steps": 260, "total_steps": 1557, "loss": 0.0263, "lr": 3.9468912075781345e-05, "epoch": 0.5012048192771085, "percentage": 16.7, "elapsed_time": "0:14:41", "remaining_time": "1:13:16"} +{"current_steps": 261, "total_steps": 1557, "loss": 0.0103, "lr": 3.945859663361496e-05, "epoch": 0.503132530120482, "percentage": 16.76, "elapsed_time": "0:14:44", "remaining_time": "1:13:12"} +{"current_steps": 262, "total_steps": 1557, "loss": 0.0292, "lr": 3.9448183347311284e-05, "epoch": 0.5050602409638554, "percentage": 16.83, "elapsed_time": "0:14:48", "remaining_time": "1:13:09"} +{"current_steps": 263, "total_steps": 1557, "loss": 0.0306, "lr": 3.943767226923171e-05, "epoch": 0.5069879518072289, "percentage": 16.89, "elapsed_time": "0:14:51", "remaining_time": "1:13:06"} +{"current_steps": 264, "total_steps": 1557, "loss": 0.0218, "lr": 3.942706345222935e-05, "epoch": 0.5089156626506024, "percentage": 16.96, "elapsed_time": "0:14:55", "remaining_time": "1:13:04"} +{"current_steps": 265, "total_steps": 1557, "loss": 0.0226, "lr": 3.941635694964878e-05, "epoch": 0.5108433734939759, "percentage": 17.02, "elapsed_time": "0:14:58", "remaining_time": "1:12:58"} +{"current_steps": 266, "total_steps": 1557, "loss": 0.0354, "lr": 3.940555281532576e-05, "epoch": 0.5127710843373494, "percentage": 17.08, "elapsed_time": "0:15:01", "remaining_time": "1:12:54"} +{"current_steps": 267, "total_steps": 1557, "loss": 0.0223, "lr": 3.939465110358699e-05, "epoch": 0.5146987951807229, "percentage": 17.15, "elapsed_time": "0:15:04", "remaining_time": "1:12:51"} +{"current_steps": 268, "total_steps": 1557, "loss": 0.0219, "lr": 3.93836518692498e-05, "epoch": 0.5166265060240964, "percentage": 17.21, "elapsed_time": "0:15:08", "remaining_time": "1:12:48"} +{"current_steps": 269, "total_steps": 1557, "loss": 0.0294, "lr": 3.937255516762193e-05, "epoch": 0.5185542168674698, "percentage": 17.28, "elapsed_time": "0:15:11", "remaining_time": "1:12:45"} +{"current_steps": 270, "total_steps": 1557, "loss": 0.0244, "lr": 3.936136105450119e-05, "epoch": 0.5204819277108433, "percentage": 17.34, "elapsed_time": "0:15:15", "remaining_time": "1:12:41"} +{"current_steps": 271, "total_steps": 1557, "loss": 0.02, "lr": 3.9350069586175195e-05, "epoch": 0.5224096385542168, "percentage": 17.41, "elapsed_time": "0:15:18", "remaining_time": "1:12:37"} +{"current_steps": 272, "total_steps": 1557, "loss": 0.0187, "lr": 3.933868081942113e-05, "epoch": 0.5243373493975904, "percentage": 17.47, "elapsed_time": "0:15:21", "remaining_time": "1:12:35"} +{"current_steps": 273, "total_steps": 1557, "loss": 0.0337, "lr": 3.9327194811505406e-05, "epoch": 0.5262650602409639, "percentage": 17.53, "elapsed_time": "0:15:24", "remaining_time": "1:12:30"} +{"current_steps": 274, "total_steps": 1557, "loss": 0.0573, "lr": 3.93156116201834e-05, "epoch": 0.5281927710843374, "percentage": 17.6, "elapsed_time": "0:15:28", "remaining_time": "1:12:26"} +{"current_steps": 275, "total_steps": 1557, "loss": 0.0405, "lr": 3.930393130369915e-05, "epoch": 0.5301204819277109, "percentage": 17.66, "elapsed_time": "0:15:32", "remaining_time": "1:12:24"} +{"current_steps": 276, "total_steps": 1557, "loss": 0.0153, "lr": 3.9292153920785076e-05, "epoch": 0.5320481927710843, "percentage": 17.73, "elapsed_time": "0:15:35", "remaining_time": "1:12:21"} +{"current_steps": 277, "total_steps": 1557, "loss": 0.0338, "lr": 3.928027953066168e-05, "epoch": 0.5339759036144578, "percentage": 17.79, "elapsed_time": "0:15:44", "remaining_time": "1:12:42"} +{"current_steps": 278, "total_steps": 1557, "loss": 0.0416, "lr": 3.926830819303726e-05, "epoch": 0.5359036144578313, "percentage": 17.85, "elapsed_time": "0:15:47", "remaining_time": "1:12:38"} +{"current_steps": 279, "total_steps": 1557, "loss": 0.0293, "lr": 3.925623996810757e-05, "epoch": 0.5378313253012048, "percentage": 17.92, "elapsed_time": "0:15:50", "remaining_time": "1:12:34"} +{"current_steps": 280, "total_steps": 1557, "loss": 0.0263, "lr": 3.924407491655557e-05, "epoch": 0.5397590361445783, "percentage": 17.98, "elapsed_time": "0:15:54", "remaining_time": "1:12:32"} +{"current_steps": 281, "total_steps": 1557, "loss": 0.0276, "lr": 3.9231813099551086e-05, "epoch": 0.5416867469879518, "percentage": 18.05, "elapsed_time": "0:15:57", "remaining_time": "1:12:27"} +{"current_steps": 282, "total_steps": 1557, "loss": 0.0146, "lr": 3.921945457875051e-05, "epoch": 0.5436144578313253, "percentage": 18.11, "elapsed_time": "0:16:01", "remaining_time": "1:12:25"} +{"current_steps": 283, "total_steps": 1557, "loss": 0.0267, "lr": 3.920699941629649e-05, "epoch": 0.5455421686746988, "percentage": 18.18, "elapsed_time": "0:16:04", "remaining_time": "1:12:22"} +{"current_steps": 284, "total_steps": 1557, "loss": 0.0183, "lr": 3.919444767481763e-05, "epoch": 0.5474698795180722, "percentage": 18.24, "elapsed_time": "0:16:07", "remaining_time": "1:12:18"} +{"current_steps": 285, "total_steps": 1557, "loss": 0.0412, "lr": 3.918179941742816e-05, "epoch": 0.5493975903614458, "percentage": 18.3, "elapsed_time": "0:16:11", "remaining_time": "1:12:13"} +{"current_steps": 286, "total_steps": 1557, "loss": 0.0505, "lr": 3.916905470772762e-05, "epoch": 0.5513253012048193, "percentage": 18.37, "elapsed_time": "0:16:14", "remaining_time": "1:12:10"} +{"current_steps": 287, "total_steps": 1557, "loss": 0.0156, "lr": 3.9156213609800545e-05, "epoch": 0.5532530120481928, "percentage": 18.43, "elapsed_time": "0:16:17", "remaining_time": "1:12:07"} +{"current_steps": 288, "total_steps": 1557, "loss": 0.0278, "lr": 3.914327618821614e-05, "epoch": 0.5551807228915663, "percentage": 18.5, "elapsed_time": "0:16:21", "remaining_time": "1:12:04"} +{"current_steps": 289, "total_steps": 1557, "loss": 0.0101, "lr": 3.913024250802796e-05, "epoch": 0.5571084337349398, "percentage": 18.56, "elapsed_time": "0:16:24", "remaining_time": "1:12:00"} +{"current_steps": 290, "total_steps": 1557, "loss": 0.0168, "lr": 3.911711263477357e-05, "epoch": 0.5590361445783133, "percentage": 18.63, "elapsed_time": "0:16:28", "remaining_time": "1:11:58"} +{"current_steps": 291, "total_steps": 1557, "loss": 0.0249, "lr": 3.910388663447425e-05, "epoch": 0.5609638554216867, "percentage": 18.69, "elapsed_time": "0:16:31", "remaining_time": "1:11:53"} +{"current_steps": 292, "total_steps": 1557, "loss": 0.0156, "lr": 3.909056457363461e-05, "epoch": 0.5628915662650602, "percentage": 18.75, "elapsed_time": "0:16:35", "remaining_time": "1:11:50"} +{"current_steps": 293, "total_steps": 1557, "loss": 0.0309, "lr": 3.907714651924229e-05, "epoch": 0.5648192771084337, "percentage": 18.82, "elapsed_time": "0:16:38", "remaining_time": "1:11:45"} +{"current_steps": 294, "total_steps": 1557, "loss": 0.0287, "lr": 3.906363253876763e-05, "epoch": 0.5667469879518072, "percentage": 18.88, "elapsed_time": "0:16:41", "remaining_time": "1:11:41"} +{"current_steps": 295, "total_steps": 1557, "loss": 0.0318, "lr": 3.90500227001633e-05, "epoch": 0.5686746987951807, "percentage": 18.95, "elapsed_time": "0:16:44", "remaining_time": "1:11:39"} +{"current_steps": 296, "total_steps": 1557, "loss": 0.0192, "lr": 3.9036317071863994e-05, "epoch": 0.5706024096385542, "percentage": 19.01, "elapsed_time": "0:16:48", "remaining_time": "1:11:35"} +{"current_steps": 297, "total_steps": 1557, "loss": 0.067, "lr": 3.902251572278605e-05, "epoch": 0.5725301204819278, "percentage": 19.08, "elapsed_time": "0:16:51", "remaining_time": "1:11:32"} +{"current_steps": 298, "total_steps": 1557, "loss": 0.0197, "lr": 3.900861872232713e-05, "epoch": 0.5744578313253013, "percentage": 19.14, "elapsed_time": "0:16:55", "remaining_time": "1:11:29"} +{"current_steps": 299, "total_steps": 1557, "loss": 0.0283, "lr": 3.899462614036587e-05, "epoch": 0.5763855421686747, "percentage": 19.2, "elapsed_time": "0:16:58", "remaining_time": "1:11:26"} +{"current_steps": 300, "total_steps": 1557, "loss": 0.0207, "lr": 3.89805380472615e-05, "epoch": 0.5783132530120482, "percentage": 19.27, "elapsed_time": "0:17:01", "remaining_time": "1:11:21"} +{"current_steps": 301, "total_steps": 1557, "loss": 0.0301, "lr": 3.8966354513853535e-05, "epoch": 0.5802409638554217, "percentage": 19.33, "elapsed_time": "0:17:04", "remaining_time": "1:11:16"} +{"current_steps": 302, "total_steps": 1557, "loss": 0.022, "lr": 3.895207561146137e-05, "epoch": 0.5821686746987952, "percentage": 19.4, "elapsed_time": "0:17:08", "remaining_time": "1:11:12"} +{"current_steps": 303, "total_steps": 1557, "loss": 0.0424, "lr": 3.893770141188396e-05, "epoch": 0.5840963855421687, "percentage": 19.46, "elapsed_time": "0:17:11", "remaining_time": "1:11:08"} +{"current_steps": 304, "total_steps": 1557, "loss": 0.08, "lr": 3.892323198739946e-05, "epoch": 0.5860240963855422, "percentage": 19.52, "elapsed_time": "0:17:14", "remaining_time": "1:11:04"} +{"current_steps": 305, "total_steps": 1557, "loss": 0.0152, "lr": 3.890866741076482e-05, "epoch": 0.5879518072289157, "percentage": 19.59, "elapsed_time": "0:17:17", "remaining_time": "1:11:00"} +{"current_steps": 306, "total_steps": 1557, "loss": 0.0205, "lr": 3.889400775521545e-05, "epoch": 0.5898795180722891, "percentage": 19.65, "elapsed_time": "0:17:21", "remaining_time": "1:10:56"} +{"current_steps": 307, "total_steps": 1557, "loss": 0.0233, "lr": 3.8879253094464865e-05, "epoch": 0.5918072289156626, "percentage": 19.72, "elapsed_time": "0:17:24", "remaining_time": "1:10:52"} +{"current_steps": 308, "total_steps": 1557, "loss": 0.0198, "lr": 3.8864403502704285e-05, "epoch": 0.5937349397590361, "percentage": 19.78, "elapsed_time": "0:17:27", "remaining_time": "1:10:49"} +{"current_steps": 309, "total_steps": 1557, "loss": 0.0176, "lr": 3.8849459054602274e-05, "epoch": 0.5956626506024096, "percentage": 19.85, "elapsed_time": "0:17:31", "remaining_time": "1:10:46"} +{"current_steps": 310, "total_steps": 1557, "loss": 0.0239, "lr": 3.883441982530436e-05, "epoch": 0.5975903614457831, "percentage": 19.91, "elapsed_time": "0:17:34", "remaining_time": "1:10:40"} +{"current_steps": 311, "total_steps": 1557, "loss": 0.0284, "lr": 3.8819285890432674e-05, "epoch": 0.5995180722891567, "percentage": 19.97, "elapsed_time": "0:17:37", "remaining_time": "1:10:36"} +{"current_steps": 312, "total_steps": 1557, "loss": 0.0233, "lr": 3.880405732608555e-05, "epoch": 0.6014457831325302, "percentage": 20.04, "elapsed_time": "0:17:41", "remaining_time": "1:10:33"} +{"current_steps": 313, "total_steps": 1557, "loss": 0.0433, "lr": 3.8788734208837155e-05, "epoch": 0.6033734939759036, "percentage": 20.1, "elapsed_time": "0:17:44", "remaining_time": "1:10:30"} +{"current_steps": 314, "total_steps": 1557, "loss": 0.043, "lr": 3.877331661573709e-05, "epoch": 0.6053012048192771, "percentage": 20.17, "elapsed_time": "0:17:47", "remaining_time": "1:10:27"} +{"current_steps": 315, "total_steps": 1557, "loss": 0.0377, "lr": 3.8757804624310006e-05, "epoch": 0.6072289156626506, "percentage": 20.23, "elapsed_time": "0:17:50", "remaining_time": "1:10:22"} +{"current_steps": 316, "total_steps": 1557, "loss": 0.046, "lr": 3.874219831255524e-05, "epoch": 0.6091566265060241, "percentage": 20.3, "elapsed_time": "0:17:54", "remaining_time": "1:10:18"} +{"current_steps": 317, "total_steps": 1557, "loss": 0.0149, "lr": 3.8726497758946394e-05, "epoch": 0.6110843373493976, "percentage": 20.36, "elapsed_time": "0:17:57", "remaining_time": "1:10:14"} +{"current_steps": 318, "total_steps": 1557, "loss": 0.014, "lr": 3.871070304243094e-05, "epoch": 0.6130120481927711, "percentage": 20.42, "elapsed_time": "0:18:00", "remaining_time": "1:10:11"} +{"current_steps": 319, "total_steps": 1557, "loss": 0.0503, "lr": 3.8694814242429834e-05, "epoch": 0.6149397590361446, "percentage": 20.49, "elapsed_time": "0:18:04", "remaining_time": "1:10:08"} +{"current_steps": 320, "total_steps": 1557, "loss": 0.021, "lr": 3.8678831438837116e-05, "epoch": 0.6168674698795181, "percentage": 20.55, "elapsed_time": "0:18:07", "remaining_time": "1:10:05"} +{"current_steps": 321, "total_steps": 1557, "loss": 0.0163, "lr": 3.866275471201952e-05, "epoch": 0.6187951807228915, "percentage": 20.62, "elapsed_time": "0:18:10", "remaining_time": "1:09:59"} +{"current_steps": 322, "total_steps": 1557, "loss": 0.0347, "lr": 3.8646584142816036e-05, "epoch": 0.620722891566265, "percentage": 20.68, "elapsed_time": "0:18:14", "remaining_time": "1:09:56"} +{"current_steps": 323, "total_steps": 1557, "loss": 0.0201, "lr": 3.863031981253754e-05, "epoch": 0.6226506024096385, "percentage": 20.75, "elapsed_time": "0:18:17", "remaining_time": "1:09:53"} +{"current_steps": 324, "total_steps": 1557, "loss": 0.0243, "lr": 3.861396180296635e-05, "epoch": 0.624578313253012, "percentage": 20.81, "elapsed_time": "0:18:20", "remaining_time": "1:09:49"} +{"current_steps": 325, "total_steps": 1557, "loss": 0.0166, "lr": 3.859751019635585e-05, "epoch": 0.6265060240963856, "percentage": 20.87, "elapsed_time": "0:18:24", "remaining_time": "1:09:46"} +{"current_steps": 326, "total_steps": 1557, "loss": 0.0274, "lr": 3.858096507543006e-05, "epoch": 0.6284337349397591, "percentage": 20.94, "elapsed_time": "0:18:27", "remaining_time": "1:09:42"} +{"current_steps": 327, "total_steps": 1557, "loss": 0.0207, "lr": 3.8564326523383214e-05, "epoch": 0.6303614457831326, "percentage": 21.0, "elapsed_time": "0:18:31", "remaining_time": "1:09:39"} +{"current_steps": 328, "total_steps": 1557, "loss": 0.0297, "lr": 3.8547594623879346e-05, "epoch": 0.632289156626506, "percentage": 21.07, "elapsed_time": "0:18:34", "remaining_time": "1:09:35"} +{"current_steps": 329, "total_steps": 1557, "loss": 0.0258, "lr": 3.853076946105188e-05, "epoch": 0.6342168674698795, "percentage": 21.13, "elapsed_time": "0:18:37", "remaining_time": "1:09:31"} +{"current_steps": 330, "total_steps": 1557, "loss": 0.0351, "lr": 3.85138511195032e-05, "epoch": 0.636144578313253, "percentage": 21.19, "elapsed_time": "0:18:41", "remaining_time": "1:09:28"} +{"current_steps": 331, "total_steps": 1557, "loss": 0.0388, "lr": 3.84968396843042e-05, "epoch": 0.6380722891566265, "percentage": 21.26, "elapsed_time": "0:18:44", "remaining_time": "1:09:24"} +{"current_steps": 332, "total_steps": 1557, "loss": 0.0203, "lr": 3.8479735240993904e-05, "epoch": 0.64, "percentage": 21.32, "elapsed_time": "0:18:48", "remaining_time": "1:09:22"} +{"current_steps": 333, "total_steps": 1557, "loss": 0.0261, "lr": 3.846253787557901e-05, "epoch": 0.6419277108433735, "percentage": 21.39, "elapsed_time": "0:18:51", "remaining_time": "1:09:18"} +{"current_steps": 334, "total_steps": 1557, "loss": 0.0108, "lr": 3.844524767453344e-05, "epoch": 0.643855421686747, "percentage": 21.45, "elapsed_time": "0:18:54", "remaining_time": "1:09:14"} +{"current_steps": 335, "total_steps": 1557, "loss": 0.0282, "lr": 3.842786472479795e-05, "epoch": 0.6457831325301204, "percentage": 21.52, "elapsed_time": "0:18:57", "remaining_time": "1:09:10"} +{"current_steps": 336, "total_steps": 1557, "loss": 0.0216, "lr": 3.841038911377962e-05, "epoch": 0.6477108433734939, "percentage": 21.58, "elapsed_time": "0:19:00", "remaining_time": "1:09:05"} +{"current_steps": 337, "total_steps": 1557, "loss": 0.0234, "lr": 3.839282092935153e-05, "epoch": 0.6496385542168674, "percentage": 21.64, "elapsed_time": "0:19:04", "remaining_time": "1:09:01"} +{"current_steps": 338, "total_steps": 1557, "loss": 0.0515, "lr": 3.837516025985219e-05, "epoch": 0.651566265060241, "percentage": 21.71, "elapsed_time": "0:19:07", "remaining_time": "1:08:58"} +{"current_steps": 339, "total_steps": 1557, "loss": 0.0508, "lr": 3.835740719408517e-05, "epoch": 0.6534939759036145, "percentage": 21.77, "elapsed_time": "0:19:10", "remaining_time": "1:08:54"} +{"current_steps": 340, "total_steps": 1557, "loss": 0.0405, "lr": 3.833956182131867e-05, "epoch": 0.655421686746988, "percentage": 21.84, "elapsed_time": "0:19:14", "remaining_time": "1:08:51"} +{"current_steps": 341, "total_steps": 1557, "loss": 0.024, "lr": 3.832162423128499e-05, "epoch": 0.6573493975903615, "percentage": 21.9, "elapsed_time": "0:19:17", "remaining_time": "1:08:48"} +{"current_steps": 342, "total_steps": 1557, "loss": 0.0199, "lr": 3.8303594514180164e-05, "epoch": 0.659277108433735, "percentage": 21.97, "elapsed_time": "0:19:20", "remaining_time": "1:08:43"} +{"current_steps": 343, "total_steps": 1557, "loss": 0.0101, "lr": 3.828547276066346e-05, "epoch": 0.6612048192771084, "percentage": 22.03, "elapsed_time": "0:19:24", "remaining_time": "1:08:40"} +{"current_steps": 344, "total_steps": 1557, "loss": 0.0455, "lr": 3.8267259061856925e-05, "epoch": 0.6631325301204819, "percentage": 22.09, "elapsed_time": "0:19:27", "remaining_time": "1:08:37"} +{"current_steps": 345, "total_steps": 1557, "loss": 0.0372, "lr": 3.824895350934496e-05, "epoch": 0.6650602409638554, "percentage": 22.16, "elapsed_time": "0:19:30", "remaining_time": "1:08:32"} +{"current_steps": 346, "total_steps": 1557, "loss": 0.0362, "lr": 3.823055619517381e-05, "epoch": 0.6669879518072289, "percentage": 22.22, "elapsed_time": "0:19:34", "remaining_time": "1:08:29"} +{"current_steps": 347, "total_steps": 1557, "loss": 0.0368, "lr": 3.821206721185115e-05, "epoch": 0.6689156626506024, "percentage": 22.29, "elapsed_time": "0:19:37", "remaining_time": "1:08:26"} +{"current_steps": 348, "total_steps": 1557, "loss": 0.0178, "lr": 3.819348665234557e-05, "epoch": 0.6708433734939759, "percentage": 22.35, "elapsed_time": "0:19:40", "remaining_time": "1:08:22"} +{"current_steps": 349, "total_steps": 1557, "loss": 0.024, "lr": 3.817481461008617e-05, "epoch": 0.6727710843373494, "percentage": 22.41, "elapsed_time": "0:19:44", "remaining_time": "1:08:20"} +{"current_steps": 350, "total_steps": 1557, "loss": 0.0262, "lr": 3.815605117896204e-05, "epoch": 0.6746987951807228, "percentage": 22.48, "elapsed_time": "0:19:48", "remaining_time": "1:08:17"} +{"current_steps": 351, "total_steps": 1557, "loss": 0.0209, "lr": 3.8137196453321775e-05, "epoch": 0.6766265060240964, "percentage": 22.54, "elapsed_time": "0:19:50", "remaining_time": "1:08:12"} +{"current_steps": 352, "total_steps": 1557, "loss": 0.0396, "lr": 3.811825052797308e-05, "epoch": 0.6785542168674699, "percentage": 22.61, "elapsed_time": "0:19:54", "remaining_time": "1:08:08"} +{"current_steps": 353, "total_steps": 1557, "loss": 0.0453, "lr": 3.8099213498182196e-05, "epoch": 0.6804819277108434, "percentage": 22.67, "elapsed_time": "0:19:57", "remaining_time": "1:08:04"} +{"current_steps": 354, "total_steps": 1557, "loss": 0.0317, "lr": 3.808008545967349e-05, "epoch": 0.6824096385542169, "percentage": 22.74, "elapsed_time": "0:20:00", "remaining_time": "1:08:00"} +{"current_steps": 355, "total_steps": 1557, "loss": 0.0452, "lr": 3.8060866508628953e-05, "epoch": 0.6843373493975904, "percentage": 22.8, "elapsed_time": "0:20:03", "remaining_time": "1:07:54"} +{"current_steps": 356, "total_steps": 1557, "loss": 0.0315, "lr": 3.8041556741687695e-05, "epoch": 0.6862650602409639, "percentage": 22.86, "elapsed_time": "0:20:07", "remaining_time": "1:07:51"} +{"current_steps": 357, "total_steps": 1557, "loss": 0.0269, "lr": 3.8022156255945496e-05, "epoch": 0.6881927710843373, "percentage": 22.93, "elapsed_time": "0:20:10", "remaining_time": "1:07:49"} +{"current_steps": 358, "total_steps": 1557, "loss": 0.0171, "lr": 3.800266514895429e-05, "epoch": 0.6901204819277108, "percentage": 22.99, "elapsed_time": "0:20:14", "remaining_time": "1:07:46"} +{"current_steps": 359, "total_steps": 1557, "loss": 0.0167, "lr": 3.7983083518721695e-05, "epoch": 0.6920481927710843, "percentage": 23.06, "elapsed_time": "0:20:17", "remaining_time": "1:07:42"} +{"current_steps": 360, "total_steps": 1557, "loss": 0.0342, "lr": 3.79634114637105e-05, "epoch": 0.6939759036144578, "percentage": 23.12, "elapsed_time": "0:20:20", "remaining_time": "1:07:38"} +{"current_steps": 361, "total_steps": 1557, "loss": 0.02, "lr": 3.794364908283817e-05, "epoch": 0.6959036144578313, "percentage": 23.19, "elapsed_time": "0:20:24", "remaining_time": "1:07:35"} +{"current_steps": 362, "total_steps": 1557, "loss": 0.0138, "lr": 3.792379647547637e-05, "epoch": 0.6978313253012048, "percentage": 23.25, "elapsed_time": "0:20:27", "remaining_time": "1:07:31"} +{"current_steps": 363, "total_steps": 1557, "loss": 0.0172, "lr": 3.790385374145046e-05, "epoch": 0.6997590361445784, "percentage": 23.31, "elapsed_time": "0:20:30", "remaining_time": "1:07:28"} +{"current_steps": 364, "total_steps": 1557, "loss": 0.0254, "lr": 3.7883820981038966e-05, "epoch": 0.7016867469879519, "percentage": 23.38, "elapsed_time": "0:20:34", "remaining_time": "1:07:25"} +{"current_steps": 365, "total_steps": 1557, "loss": 0.037, "lr": 3.7863698294973114e-05, "epoch": 0.7036144578313253, "percentage": 23.44, "elapsed_time": "0:20:37", "remaining_time": "1:07:22"} +{"current_steps": 366, "total_steps": 1557, "loss": 0.0234, "lr": 3.78434857844363e-05, "epoch": 0.7055421686746988, "percentage": 23.51, "elapsed_time": "0:20:41", "remaining_time": "1:07:18"} +{"current_steps": 367, "total_steps": 1557, "loss": 0.0164, "lr": 3.782318355106358e-05, "epoch": 0.7074698795180723, "percentage": 23.57, "elapsed_time": "0:20:44", "remaining_time": "1:07:15"} +{"current_steps": 368, "total_steps": 1557, "loss": 0.0192, "lr": 3.780279169694118e-05, "epoch": 0.7093975903614458, "percentage": 23.64, "elapsed_time": "0:20:47", "remaining_time": "1:07:11"} +{"current_steps": 369, "total_steps": 1557, "loss": 0.0131, "lr": 3.778231032460594e-05, "epoch": 0.7113253012048193, "percentage": 23.7, "elapsed_time": "0:20:56", "remaining_time": "1:07:23"} +{"current_steps": 370, "total_steps": 1557, "loss": 0.0291, "lr": 3.776173953704486e-05, "epoch": 0.7132530120481928, "percentage": 23.76, "elapsed_time": "0:20:59", "remaining_time": "1:07:19"} +{"current_steps": 371, "total_steps": 1557, "loss": 0.0214, "lr": 3.774107943769454e-05, "epoch": 0.7151807228915663, "percentage": 23.83, "elapsed_time": "0:21:02", "remaining_time": "1:07:14"} +{"current_steps": 372, "total_steps": 1557, "loss": 0.0221, "lr": 3.772033013044064e-05, "epoch": 0.7171084337349397, "percentage": 23.89, "elapsed_time": "0:21:05", "remaining_time": "1:07:10"} +{"current_steps": 373, "total_steps": 1557, "loss": 0.015, "lr": 3.7699491719617436e-05, "epoch": 0.7190361445783132, "percentage": 23.96, "elapsed_time": "0:21:08", "remaining_time": "1:07:07"} +{"current_steps": 374, "total_steps": 1557, "loss": 0.0146, "lr": 3.76785643100072e-05, "epoch": 0.7209638554216867, "percentage": 24.02, "elapsed_time": "0:21:12", "remaining_time": "1:07:04"} +{"current_steps": 375, "total_steps": 1557, "loss": 0.015, "lr": 3.765754800683974e-05, "epoch": 0.7228915662650602, "percentage": 24.08, "elapsed_time": "0:21:15", "remaining_time": "1:06:59"} +{"current_steps": 376, "total_steps": 1557, "loss": 0.0326, "lr": 3.7636442915791856e-05, "epoch": 0.7248192771084337, "percentage": 24.15, "elapsed_time": "0:21:18", "remaining_time": "1:06:56"} +{"current_steps": 377, "total_steps": 1557, "loss": 0.0432, "lr": 3.7615249142986784e-05, "epoch": 0.7267469879518073, "percentage": 24.21, "elapsed_time": "0:21:22", "remaining_time": "1:06:53"} +{"current_steps": 378, "total_steps": 1557, "loss": 0.0174, "lr": 3.7593966794993696e-05, "epoch": 0.7286746987951808, "percentage": 24.28, "elapsed_time": "0:21:25", "remaining_time": "1:06:49"} +{"current_steps": 379, "total_steps": 1557, "loss": 0.023, "lr": 3.757259597882714e-05, "epoch": 0.7306024096385542, "percentage": 24.34, "elapsed_time": "0:21:28", "remaining_time": "1:06:45"} +{"current_steps": 380, "total_steps": 1557, "loss": 0.0201, "lr": 3.755113680194651e-05, "epoch": 0.7325301204819277, "percentage": 24.41, "elapsed_time": "0:21:32", "remaining_time": "1:06:42"} +{"current_steps": 381, "total_steps": 1557, "loss": 0.0173, "lr": 3.7529589372255514e-05, "epoch": 0.7344578313253012, "percentage": 24.47, "elapsed_time": "0:21:35", "remaining_time": "1:06:37"} +{"current_steps": 382, "total_steps": 1557, "loss": 0.0284, "lr": 3.750795379810162e-05, "epoch": 0.7363855421686747, "percentage": 24.53, "elapsed_time": "0:21:38", "remaining_time": "1:06:33"} +{"current_steps": 383, "total_steps": 1557, "loss": 0.0194, "lr": 3.748623018827552e-05, "epoch": 0.7383132530120482, "percentage": 24.6, "elapsed_time": "0:21:41", "remaining_time": "1:06:30"} +{"current_steps": 384, "total_steps": 1557, "loss": 0.0247, "lr": 3.746441865201056e-05, "epoch": 0.7402409638554217, "percentage": 24.66, "elapsed_time": "0:21:45", "remaining_time": "1:06:26"} +{"current_steps": 385, "total_steps": 1557, "loss": 0.0097, "lr": 3.744251929898223e-05, "epoch": 0.7421686746987952, "percentage": 24.73, "elapsed_time": "0:21:48", "remaining_time": "1:06:23"} +{"current_steps": 386, "total_steps": 1557, "loss": 0.0238, "lr": 3.742053223930758e-05, "epoch": 0.7440963855421687, "percentage": 24.79, "elapsed_time": "0:21:51", "remaining_time": "1:06:19"} +{"current_steps": 387, "total_steps": 1557, "loss": 0.0332, "lr": 3.7398457583544674e-05, "epoch": 0.7460240963855421, "percentage": 24.86, "elapsed_time": "0:21:55", "remaining_time": "1:06:16"} +{"current_steps": 388, "total_steps": 1557, "loss": 0.0122, "lr": 3.737629544269206e-05, "epoch": 0.7479518072289156, "percentage": 24.92, "elapsed_time": "0:21:58", "remaining_time": "1:06:12"} +{"current_steps": 389, "total_steps": 1557, "loss": 0.0324, "lr": 3.7354045928188155e-05, "epoch": 0.7498795180722891, "percentage": 24.98, "elapsed_time": "0:22:02", "remaining_time": "1:06:09"} +{"current_steps": 390, "total_steps": 1557, "loss": 0.0196, "lr": 3.733170915191075e-05, "epoch": 0.7518072289156627, "percentage": 25.05, "elapsed_time": "0:22:05", "remaining_time": "1:06:04"} +{"current_steps": 391, "total_steps": 1557, "loss": 0.0131, "lr": 3.730928522617639e-05, "epoch": 0.7537349397590362, "percentage": 25.11, "elapsed_time": "0:22:08", "remaining_time": "1:06:00"} +{"current_steps": 392, "total_steps": 1557, "loss": 0.0238, "lr": 3.7286774263739855e-05, "epoch": 0.7556626506024097, "percentage": 25.18, "elapsed_time": "0:22:11", "remaining_time": "1:05:57"} +{"current_steps": 393, "total_steps": 1557, "loss": 0.0314, "lr": 3.726417637779357e-05, "epoch": 0.7575903614457832, "percentage": 25.24, "elapsed_time": "0:22:15", "remaining_time": "1:05:54"} +{"current_steps": 394, "total_steps": 1557, "loss": 0.0144, "lr": 3.7241491681967044e-05, "epoch": 0.7595180722891566, "percentage": 25.31, "elapsed_time": "0:22:18", "remaining_time": "1:05:50"} +{"current_steps": 395, "total_steps": 1557, "loss": 0.0286, "lr": 3.721872029032628e-05, "epoch": 0.7614457831325301, "percentage": 25.37, "elapsed_time": "0:22:21", "remaining_time": "1:05:47"} +{"current_steps": 396, "total_steps": 1557, "loss": 0.0427, "lr": 3.719586231737322e-05, "epoch": 0.7633734939759036, "percentage": 25.43, "elapsed_time": "0:22:24", "remaining_time": "1:05:42"} +{"current_steps": 397, "total_steps": 1557, "loss": 0.0138, "lr": 3.717291787804517e-05, "epoch": 0.7653012048192771, "percentage": 25.5, "elapsed_time": "0:22:27", "remaining_time": "1:05:38"} +{"current_steps": 398, "total_steps": 1557, "loss": 0.061, "lr": 3.7149887087714225e-05, "epoch": 0.7672289156626506, "percentage": 25.56, "elapsed_time": "0:22:31", "remaining_time": "1:05:35"} +{"current_steps": 399, "total_steps": 1557, "loss": 0.022, "lr": 3.712677006218666e-05, "epoch": 0.7691566265060241, "percentage": 25.63, "elapsed_time": "0:22:34", "remaining_time": "1:05:30"} +{"current_steps": 400, "total_steps": 1557, "loss": 0.0161, "lr": 3.710356691770238e-05, "epoch": 0.7710843373493976, "percentage": 25.69, "elapsed_time": "0:22:37", "remaining_time": "1:05:26"} +{"current_steps": 401, "total_steps": 1557, "loss": 0.0285, "lr": 3.708027777093433e-05, "epoch": 0.7730120481927711, "percentage": 25.75, "elapsed_time": "0:22:40", "remaining_time": "1:05:22"} +{"current_steps": 402, "total_steps": 1557, "loss": 0.012, "lr": 3.70569027389879e-05, "epoch": 0.7749397590361445, "percentage": 25.82, "elapsed_time": "0:22:44", "remaining_time": "1:05:19"} +{"current_steps": 403, "total_steps": 1557, "loss": 0.0155, "lr": 3.703344193940032e-05, "epoch": 0.776867469879518, "percentage": 25.88, "elapsed_time": "0:22:47", "remaining_time": "1:05:14"} +{"current_steps": 404, "total_steps": 1557, "loss": 0.0617, "lr": 3.700989549014011e-05, "epoch": 0.7787951807228916, "percentage": 25.95, "elapsed_time": "0:22:50", "remaining_time": "1:05:11"} +{"current_steps": 405, "total_steps": 1557, "loss": 0.0298, "lr": 3.698626350960646e-05, "epoch": 0.7807228915662651, "percentage": 26.01, "elapsed_time": "0:22:53", "remaining_time": "1:05:07"} +{"current_steps": 406, "total_steps": 1557, "loss": 0.03, "lr": 3.6962546116628634e-05, "epoch": 0.7826506024096386, "percentage": 26.08, "elapsed_time": "0:22:57", "remaining_time": "1:05:04"} +{"current_steps": 407, "total_steps": 1557, "loss": 0.0107, "lr": 3.693874343046537e-05, "epoch": 0.7845783132530121, "percentage": 26.14, "elapsed_time": "0:23:00", "remaining_time": "1:05:01"} +{"current_steps": 408, "total_steps": 1557, "loss": 0.0174, "lr": 3.6914855570804314e-05, "epoch": 0.7865060240963856, "percentage": 26.2, "elapsed_time": "0:23:03", "remaining_time": "1:04:57"} +{"current_steps": 409, "total_steps": 1557, "loss": 0.0149, "lr": 3.689088265776136e-05, "epoch": 0.788433734939759, "percentage": 26.27, "elapsed_time": "0:23:07", "remaining_time": "1:04:53"} +{"current_steps": 410, "total_steps": 1557, "loss": 0.019, "lr": 3.686682481188011e-05, "epoch": 0.7903614457831325, "percentage": 26.33, "elapsed_time": "0:23:10", "remaining_time": "1:04:50"} +{"current_steps": 411, "total_steps": 1557, "loss": 0.0217, "lr": 3.6842682154131193e-05, "epoch": 0.792289156626506, "percentage": 26.4, "elapsed_time": "0:23:13", "remaining_time": "1:04:46"} +{"current_steps": 412, "total_steps": 1557, "loss": 0.0198, "lr": 3.681845480591174e-05, "epoch": 0.7942168674698795, "percentage": 26.46, "elapsed_time": "0:23:16", "remaining_time": "1:04:41"} +{"current_steps": 413, "total_steps": 1557, "loss": 0.0253, "lr": 3.6794142889044727e-05, "epoch": 0.796144578313253, "percentage": 26.53, "elapsed_time": "0:23:20", "remaining_time": "1:04:38"} +{"current_steps": 414, "total_steps": 1557, "loss": 0.0329, "lr": 3.676974652577835e-05, "epoch": 0.7980722891566265, "percentage": 26.59, "elapsed_time": "0:23:23", "remaining_time": "1:04:35"} +{"current_steps": 415, "total_steps": 1557, "loss": 0.0479, "lr": 3.6745265838785434e-05, "epoch": 0.8, "percentage": 26.65, "elapsed_time": "0:23:26", "remaining_time": "1:04:31"} +{"current_steps": 416, "total_steps": 1557, "loss": 0.019, "lr": 3.672070095116283e-05, "epoch": 0.8019277108433734, "percentage": 26.72, "elapsed_time": "0:23:30", "remaining_time": "1:04:27"} +{"current_steps": 417, "total_steps": 1557, "loss": 0.0444, "lr": 3.669605198643075e-05, "epoch": 0.803855421686747, "percentage": 26.78, "elapsed_time": "0:23:33", "remaining_time": "1:04:24"} +{"current_steps": 418, "total_steps": 1557, "loss": 0.031, "lr": 3.667131906853219e-05, "epoch": 0.8057831325301205, "percentage": 26.85, "elapsed_time": "0:23:37", "remaining_time": "1:04:21"} +{"current_steps": 419, "total_steps": 1557, "loss": 0.0195, "lr": 3.664650232183229e-05, "epoch": 0.807710843373494, "percentage": 26.91, "elapsed_time": "0:23:40", "remaining_time": "1:04:18"} +{"current_steps": 420, "total_steps": 1557, "loss": 0.018, "lr": 3.66216018711177e-05, "epoch": 0.8096385542168675, "percentage": 26.97, "elapsed_time": "0:23:44", "remaining_time": "1:04:15"} +{"current_steps": 421, "total_steps": 1557, "loss": 0.0188, "lr": 3.659661784159597e-05, "epoch": 0.811566265060241, "percentage": 27.04, "elapsed_time": "0:23:47", "remaining_time": "1:04:11"} +{"current_steps": 422, "total_steps": 1557, "loss": 0.016, "lr": 3.65715503588949e-05, "epoch": 0.8134939759036145, "percentage": 27.1, "elapsed_time": "0:23:51", "remaining_time": "1:04:09"} +{"current_steps": 423, "total_steps": 1557, "loss": 0.0758, "lr": 3.654639954906193e-05, "epoch": 0.815421686746988, "percentage": 27.17, "elapsed_time": "0:23:54", "remaining_time": "1:04:05"} +{"current_steps": 424, "total_steps": 1557, "loss": 0.0308, "lr": 3.652116553856349e-05, "epoch": 0.8173493975903614, "percentage": 27.23, "elapsed_time": "0:23:57", "remaining_time": "1:04:01"} +{"current_steps": 425, "total_steps": 1557, "loss": 0.0493, "lr": 3.649584845428438e-05, "epoch": 0.8192771084337349, "percentage": 27.3, "elapsed_time": "0:24:00", "remaining_time": "1:03:57"} +{"current_steps": 426, "total_steps": 1557, "loss": 0.019, "lr": 3.64704484235271e-05, "epoch": 0.8212048192771084, "percentage": 27.36, "elapsed_time": "0:24:04", "remaining_time": "1:03:53"} +{"current_steps": 427, "total_steps": 1557, "loss": 0.0135, "lr": 3.6444965574011255e-05, "epoch": 0.8231325301204819, "percentage": 27.42, "elapsed_time": "0:24:07", "remaining_time": "1:03:50"} +{"current_steps": 428, "total_steps": 1557, "loss": 0.0402, "lr": 3.641940003387289e-05, "epoch": 0.8250602409638554, "percentage": 27.49, "elapsed_time": "0:24:10", "remaining_time": "1:03:46"} +{"current_steps": 429, "total_steps": 1557, "loss": 0.0132, "lr": 3.6393751931663814e-05, "epoch": 0.826987951807229, "percentage": 27.55, "elapsed_time": "0:24:14", "remaining_time": "1:03:43"} +{"current_steps": 430, "total_steps": 1557, "loss": 0.0296, "lr": 3.6368021396351015e-05, "epoch": 0.8289156626506025, "percentage": 27.62, "elapsed_time": "0:24:17", "remaining_time": "1:03:40"} +{"current_steps": 431, "total_steps": 1557, "loss": 0.0258, "lr": 3.634220855731598e-05, "epoch": 0.8308433734939759, "percentage": 27.68, "elapsed_time": "0:24:20", "remaining_time": "1:03:36"} +{"current_steps": 432, "total_steps": 1557, "loss": 0.0099, "lr": 3.631631354435403e-05, "epoch": 0.8327710843373494, "percentage": 27.75, "elapsed_time": "0:24:24", "remaining_time": "1:03:33"} +{"current_steps": 433, "total_steps": 1557, "loss": 0.0457, "lr": 3.62903364876737e-05, "epoch": 0.8346987951807229, "percentage": 27.81, "elapsed_time": "0:24:27", "remaining_time": "1:03:29"} +{"current_steps": 434, "total_steps": 1557, "loss": 0.0444, "lr": 3.626427751789606e-05, "epoch": 0.8366265060240964, "percentage": 27.87, "elapsed_time": "0:24:30", "remaining_time": "1:03:25"} +{"current_steps": 435, "total_steps": 1557, "loss": 0.0223, "lr": 3.623813676605405e-05, "epoch": 0.8385542168674699, "percentage": 27.94, "elapsed_time": "0:24:33", "remaining_time": "1:03:21"} +{"current_steps": 436, "total_steps": 1557, "loss": 0.0353, "lr": 3.621191436359186e-05, "epoch": 0.8404819277108434, "percentage": 28.0, "elapsed_time": "0:24:37", "remaining_time": "1:03:18"} +{"current_steps": 437, "total_steps": 1557, "loss": 0.0216, "lr": 3.6185610442364246e-05, "epoch": 0.8424096385542169, "percentage": 28.07, "elapsed_time": "0:24:40", "remaining_time": "1:03:15"} +{"current_steps": 438, "total_steps": 1557, "loss": 0.0433, "lr": 3.6159225134635846e-05, "epoch": 0.8443373493975903, "percentage": 28.13, "elapsed_time": "0:24:44", "remaining_time": "1:03:12"} +{"current_steps": 439, "total_steps": 1557, "loss": 0.031, "lr": 3.6132758573080556e-05, "epoch": 0.8462650602409638, "percentage": 28.2, "elapsed_time": "0:24:47", "remaining_time": "1:03:08"} +{"current_steps": 440, "total_steps": 1557, "loss": 0.0313, "lr": 3.6106210890780834e-05, "epoch": 0.8481927710843373, "percentage": 28.26, "elapsed_time": "0:24:51", "remaining_time": "1:03:05"} +{"current_steps": 441, "total_steps": 1557, "loss": 0.0218, "lr": 3.607958222122704e-05, "epoch": 0.8501204819277108, "percentage": 28.32, "elapsed_time": "0:24:54", "remaining_time": "1:03:02"} +{"current_steps": 442, "total_steps": 1557, "loss": 0.0239, "lr": 3.6052872698316755e-05, "epoch": 0.8520481927710843, "percentage": 28.39, "elapsed_time": "0:24:57", "remaining_time": "1:02:57"} +{"current_steps": 443, "total_steps": 1557, "loss": 0.0127, "lr": 3.602608245635414e-05, "epoch": 0.8539759036144579, "percentage": 28.45, "elapsed_time": "0:25:00", "remaining_time": "1:02:54"} +{"current_steps": 444, "total_steps": 1557, "loss": 0.0618, "lr": 3.599921163004922e-05, "epoch": 0.8559036144578314, "percentage": 28.52, "elapsed_time": "0:25:04", "remaining_time": "1:02:50"} +{"current_steps": 445, "total_steps": 1557, "loss": 0.0283, "lr": 3.5972260354517216e-05, "epoch": 0.8578313253012049, "percentage": 28.58, "elapsed_time": "0:25:07", "remaining_time": "1:02:47"} +{"current_steps": 446, "total_steps": 1557, "loss": 0.0271, "lr": 3.594522876527791e-05, "epoch": 0.8597590361445783, "percentage": 28.64, "elapsed_time": "0:25:11", "remaining_time": "1:02:44"} +{"current_steps": 447, "total_steps": 1557, "loss": 0.0169, "lr": 3.591811699825487e-05, "epoch": 0.8616867469879518, "percentage": 28.71, "elapsed_time": "0:25:14", "remaining_time": "1:02:40"} +{"current_steps": 448, "total_steps": 1557, "loss": 0.0239, "lr": 3.5890925189774886e-05, "epoch": 0.8636144578313253, "percentage": 28.77, "elapsed_time": "0:25:17", "remaining_time": "1:02:37"} +{"current_steps": 449, "total_steps": 1557, "loss": 0.0137, "lr": 3.586365347656718e-05, "epoch": 0.8655421686746988, "percentage": 28.84, "elapsed_time": "0:25:20", "remaining_time": "1:02:32"} +{"current_steps": 450, "total_steps": 1557, "loss": 0.027, "lr": 3.583630199576278e-05, "epoch": 0.8674698795180723, "percentage": 28.9, "elapsed_time": "0:25:24", "remaining_time": "1:02:29"} +{"current_steps": 451, "total_steps": 1557, "loss": 0.0167, "lr": 3.58088708848938e-05, "epoch": 0.8693975903614458, "percentage": 28.97, "elapsed_time": "0:25:27", "remaining_time": "1:02:25"} +{"current_steps": 452, "total_steps": 1557, "loss": 0.0468, "lr": 3.5781360281892775e-05, "epoch": 0.8713253012048193, "percentage": 29.03, "elapsed_time": "0:25:30", "remaining_time": "1:02:22"} +{"current_steps": 453, "total_steps": 1557, "loss": 0.0344, "lr": 3.575377032509194e-05, "epoch": 0.8732530120481927, "percentage": 29.09, "elapsed_time": "0:25:34", "remaining_time": "1:02:19"} +{"current_steps": 454, "total_steps": 1557, "loss": 0.0366, "lr": 3.5726101153222534e-05, "epoch": 0.8751807228915662, "percentage": 29.16, "elapsed_time": "0:25:37", "remaining_time": "1:02:16"} +{"current_steps": 455, "total_steps": 1557, "loss": 0.0382, "lr": 3.569835290541414e-05, "epoch": 0.8771084337349397, "percentage": 29.22, "elapsed_time": "0:25:40", "remaining_time": "1:02:12"} +{"current_steps": 456, "total_steps": 1557, "loss": 0.018, "lr": 3.567052572119397e-05, "epoch": 0.8790361445783132, "percentage": 29.29, "elapsed_time": "0:25:44", "remaining_time": "1:02:08"} +{"current_steps": 457, "total_steps": 1557, "loss": 0.0179, "lr": 3.564261974048611e-05, "epoch": 0.8809638554216868, "percentage": 29.35, "elapsed_time": "0:25:47", "remaining_time": "1:02:04"} +{"current_steps": 458, "total_steps": 1557, "loss": 0.0147, "lr": 3.56146351036109e-05, "epoch": 0.8828915662650603, "percentage": 29.42, "elapsed_time": "0:25:50", "remaining_time": "1:02:00"} +{"current_steps": 459, "total_steps": 1557, "loss": 0.0224, "lr": 3.558657195128416e-05, "epoch": 0.8848192771084338, "percentage": 29.48, "elapsed_time": "0:25:53", "remaining_time": "1:01:56"} +{"current_steps": 460, "total_steps": 1557, "loss": 0.0202, "lr": 3.555843042461653e-05, "epoch": 0.8867469879518072, "percentage": 29.54, "elapsed_time": "0:25:56", "remaining_time": "1:01:53"} +{"current_steps": 461, "total_steps": 1557, "loss": 0.0288, "lr": 3.553021066511274e-05, "epoch": 0.8886746987951807, "percentage": 29.61, "elapsed_time": "0:26:05", "remaining_time": "1:02:01"} +{"current_steps": 462, "total_steps": 1557, "loss": 0.0362, "lr": 3.55019128146709e-05, "epoch": 0.8906024096385542, "percentage": 29.67, "elapsed_time": "0:26:08", "remaining_time": "1:01:57"} +{"current_steps": 463, "total_steps": 1557, "loss": 0.038, "lr": 3.547353701558178e-05, "epoch": 0.8925301204819277, "percentage": 29.74, "elapsed_time": "0:26:12", "remaining_time": "1:01:54"} +{"current_steps": 464, "total_steps": 1557, "loss": 0.0399, "lr": 3.544508341052811e-05, "epoch": 0.8944578313253012, "percentage": 29.8, "elapsed_time": "0:26:15", "remaining_time": "1:01:51"} +{"current_steps": 465, "total_steps": 1557, "loss": 0.022, "lr": 3.541655214258383e-05, "epoch": 0.8963855421686747, "percentage": 29.87, "elapsed_time": "0:26:19", "remaining_time": "1:01:48"} +{"current_steps": 466, "total_steps": 1557, "loss": 0.0315, "lr": 3.538794335521343e-05, "epoch": 0.8983132530120482, "percentage": 29.93, "elapsed_time": "0:26:22", "remaining_time": "1:01:45"} +{"current_steps": 467, "total_steps": 1557, "loss": 0.0152, "lr": 3.535925719227117e-05, "epoch": 0.9002409638554217, "percentage": 29.99, "elapsed_time": "0:26:25", "remaining_time": "1:01:40"} +{"current_steps": 468, "total_steps": 1557, "loss": 0.0305, "lr": 3.533049379800038e-05, "epoch": 0.9021686746987951, "percentage": 30.06, "elapsed_time": "0:26:29", "remaining_time": "1:01:37"} +{"current_steps": 469, "total_steps": 1557, "loss": 0.0131, "lr": 3.530165331703275e-05, "epoch": 0.9040963855421686, "percentage": 30.12, "elapsed_time": "0:26:32", "remaining_time": "1:01:34"} +{"current_steps": 470, "total_steps": 1557, "loss": 0.0187, "lr": 3.527273589438756e-05, "epoch": 0.9060240963855422, "percentage": 30.19, "elapsed_time": "0:26:36", "remaining_time": "1:01:31"} +{"current_steps": 471, "total_steps": 1557, "loss": 0.0185, "lr": 3.5243741675471006e-05, "epoch": 0.9079518072289157, "percentage": 30.25, "elapsed_time": "0:26:39", "remaining_time": "1:01:28"} +{"current_steps": 472, "total_steps": 1557, "loss": 0.0433, "lr": 3.5214670806075426e-05, "epoch": 0.9098795180722892, "percentage": 30.31, "elapsed_time": "0:26:43", "remaining_time": "1:01:25"} +{"current_steps": 473, "total_steps": 1557, "loss": 0.02, "lr": 3.518552343237858e-05, "epoch": 0.9118072289156627, "percentage": 30.38, "elapsed_time": "0:26:46", "remaining_time": "1:01:21"} +{"current_steps": 474, "total_steps": 1557, "loss": 0.0165, "lr": 3.5156299700942916e-05, "epoch": 0.9137349397590362, "percentage": 30.44, "elapsed_time": "0:26:49", "remaining_time": "1:01:17"} +{"current_steps": 475, "total_steps": 1557, "loss": 0.0172, "lr": 3.512699975871485e-05, "epoch": 0.9156626506024096, "percentage": 30.51, "elapsed_time": "0:26:53", "remaining_time": "1:01:14"} +{"current_steps": 476, "total_steps": 1557, "loss": 0.0356, "lr": 3.509762375302399e-05, "epoch": 0.9175903614457831, "percentage": 30.57, "elapsed_time": "0:26:56", "remaining_time": "1:01:10"} +{"current_steps": 477, "total_steps": 1557, "loss": 0.0088, "lr": 3.506817183158243e-05, "epoch": 0.9195180722891566, "percentage": 30.64, "elapsed_time": "0:26:59", "remaining_time": "1:01:06"} +{"current_steps": 478, "total_steps": 1557, "loss": 0.0389, "lr": 3.5038644142483966e-05, "epoch": 0.9214457831325301, "percentage": 30.7, "elapsed_time": "0:27:02", "remaining_time": "1:01:03"} +{"current_steps": 479, "total_steps": 1557, "loss": 0.0155, "lr": 3.500904083420342e-05, "epoch": 0.9233734939759036, "percentage": 30.76, "elapsed_time": "0:27:06", "remaining_time": "1:01:00"} +{"current_steps": 480, "total_steps": 1557, "loss": 0.0247, "lr": 3.497936205559583e-05, "epoch": 0.9253012048192771, "percentage": 30.83, "elapsed_time": "0:27:09", "remaining_time": "1:00:57"} +{"current_steps": 481, "total_steps": 1557, "loss": 0.023, "lr": 3.494960795589572e-05, "epoch": 0.9272289156626506, "percentage": 30.89, "elapsed_time": "0:27:13", "remaining_time": "1:00:53"} +{"current_steps": 482, "total_steps": 1557, "loss": 0.0273, "lr": 3.491977868471635e-05, "epoch": 0.929156626506024, "percentage": 30.96, "elapsed_time": "0:27:17", "remaining_time": "1:00:51"} +{"current_steps": 483, "total_steps": 1557, "loss": 0.0169, "lr": 3.4889874392048985e-05, "epoch": 0.9310843373493976, "percentage": 31.02, "elapsed_time": "0:27:20", "remaining_time": "1:00:47"} +{"current_steps": 484, "total_steps": 1557, "loss": 0.0105, "lr": 3.48598952282621e-05, "epoch": 0.9330120481927711, "percentage": 31.09, "elapsed_time": "0:27:23", "remaining_time": "1:00:44"} +{"current_steps": 485, "total_steps": 1557, "loss": 0.0289, "lr": 3.482984134410067e-05, "epoch": 0.9349397590361446, "percentage": 31.15, "elapsed_time": "0:27:27", "remaining_time": "1:00:41"} +{"current_steps": 486, "total_steps": 1557, "loss": 0.0311, "lr": 3.479971289068537e-05, "epoch": 0.9368674698795181, "percentage": 31.21, "elapsed_time": "0:27:30", "remaining_time": "1:00:38"} +{"current_steps": 487, "total_steps": 1557, "loss": 0.0452, "lr": 3.476951001951184e-05, "epoch": 0.9387951807228916, "percentage": 31.28, "elapsed_time": "0:27:34", "remaining_time": "1:00:34"} +{"current_steps": 488, "total_steps": 1557, "loss": 0.0689, "lr": 3.473923288244991e-05, "epoch": 0.9407228915662651, "percentage": 31.34, "elapsed_time": "0:27:37", "remaining_time": "1:00:30"} +{"current_steps": 489, "total_steps": 1557, "loss": 0.0241, "lr": 3.470888163174286e-05, "epoch": 0.9426506024096386, "percentage": 31.41, "elapsed_time": "0:27:40", "remaining_time": "1:00:26"} +{"current_steps": 490, "total_steps": 1557, "loss": 0.0228, "lr": 3.467845642000661e-05, "epoch": 0.944578313253012, "percentage": 31.47, "elapsed_time": "0:27:43", "remaining_time": "1:00:23"} +{"current_steps": 491, "total_steps": 1557, "loss": 0.0144, "lr": 3.4647957400229004e-05, "epoch": 0.9465060240963855, "percentage": 31.54, "elapsed_time": "0:27:47", "remaining_time": "1:00:19"} +{"current_steps": 492, "total_steps": 1557, "loss": 0.0167, "lr": 3.461738472576902e-05, "epoch": 0.948433734939759, "percentage": 31.6, "elapsed_time": "0:27:50", "remaining_time": "1:00:17"} +{"current_steps": 493, "total_steps": 1557, "loss": 0.031, "lr": 3.458673855035597e-05, "epoch": 0.9503614457831325, "percentage": 31.66, "elapsed_time": "0:27:54", "remaining_time": "1:00:13"} +{"current_steps": 494, "total_steps": 1557, "loss": 0.0191, "lr": 3.455601902808876e-05, "epoch": 0.952289156626506, "percentage": 31.73, "elapsed_time": "0:27:57", "remaining_time": "1:00:08"} +{"current_steps": 495, "total_steps": 1557, "loss": 0.0192, "lr": 3.452522631343515e-05, "epoch": 0.9542168674698795, "percentage": 31.79, "elapsed_time": "0:28:00", "remaining_time": "1:00:04"} +{"current_steps": 496, "total_steps": 1557, "loss": 0.0159, "lr": 3.449436056123086e-05, "epoch": 0.9561445783132531, "percentage": 31.86, "elapsed_time": "0:28:03", "remaining_time": "1:00:01"} +{"current_steps": 497, "total_steps": 1557, "loss": 0.0113, "lr": 3.446342192667893e-05, "epoch": 0.9580722891566265, "percentage": 31.92, "elapsed_time": "0:28:06", "remaining_time": "0:59:57"} +{"current_steps": 498, "total_steps": 1557, "loss": 0.0332, "lr": 3.443241056534884e-05, "epoch": 0.96, "percentage": 31.98, "elapsed_time": "0:28:10", "remaining_time": "0:59:54"} +{"current_steps": 499, "total_steps": 1557, "loss": 0.0117, "lr": 3.440132663317579e-05, "epoch": 0.9619277108433735, "percentage": 32.05, "elapsed_time": "0:28:13", "remaining_time": "0:59:50"} +{"current_steps": 500, "total_steps": 1557, "loss": 0.0169, "lr": 3.4370170286459864e-05, "epoch": 0.963855421686747, "percentage": 32.11, "elapsed_time": "0:28:16", "remaining_time": "0:59:46"} +{"current_steps": 501, "total_steps": 1557, "loss": 0.0217, "lr": 3.433894168186529e-05, "epoch": 0.9657831325301205, "percentage": 32.18, "elapsed_time": "0:28:19", "remaining_time": "0:59:43"} +{"current_steps": 502, "total_steps": 1557, "loss": 0.0207, "lr": 3.430764097641962e-05, "epoch": 0.967710843373494, "percentage": 32.24, "elapsed_time": "0:28:23", "remaining_time": "0:59:39"} +{"current_steps": 503, "total_steps": 1557, "loss": 0.0381, "lr": 3.427626832751296e-05, "epoch": 0.9696385542168675, "percentage": 32.31, "elapsed_time": "0:28:26", "remaining_time": "0:59:36"} +{"current_steps": 504, "total_steps": 1557, "loss": 0.0245, "lr": 3.424482389289716e-05, "epoch": 0.971566265060241, "percentage": 32.37, "elapsed_time": "0:28:30", "remaining_time": "0:59:32"} +{"current_steps": 505, "total_steps": 1557, "loss": 0.0164, "lr": 3.4213307830685055e-05, "epoch": 0.9734939759036144, "percentage": 32.43, "elapsed_time": "0:28:33", "remaining_time": "0:59:29"} +{"current_steps": 506, "total_steps": 1557, "loss": 0.0297, "lr": 3.4181720299349615e-05, "epoch": 0.9754216867469879, "percentage": 32.5, "elapsed_time": "0:28:37", "remaining_time": "0:59:26"} +{"current_steps": 507, "total_steps": 1557, "loss": 0.0102, "lr": 3.4150061457723205e-05, "epoch": 0.9773493975903614, "percentage": 32.56, "elapsed_time": "0:28:40", "remaining_time": "0:59:22"} +{"current_steps": 508, "total_steps": 1557, "loss": 0.0243, "lr": 3.411833146499675e-05, "epoch": 0.9792771084337349, "percentage": 32.63, "elapsed_time": "0:28:43", "remaining_time": "0:59:18"} +{"current_steps": 509, "total_steps": 1557, "loss": 0.0357, "lr": 3.408653048071894e-05, "epoch": 0.9812048192771085, "percentage": 32.69, "elapsed_time": "0:28:46", "remaining_time": "0:59:15"} +{"current_steps": 510, "total_steps": 1557, "loss": 0.037, "lr": 3.405465866479546e-05, "epoch": 0.983132530120482, "percentage": 32.76, "elapsed_time": "0:28:49", "remaining_time": "0:59:10"} +{"current_steps": 511, "total_steps": 1557, "loss": 0.0129, "lr": 3.402271617748812e-05, "epoch": 0.9850602409638555, "percentage": 32.82, "elapsed_time": "0:28:52", "remaining_time": "0:59:07"} +{"current_steps": 512, "total_steps": 1557, "loss": 0.0442, "lr": 3.399070317941413e-05, "epoch": 0.9869879518072289, "percentage": 32.88, "elapsed_time": "0:28:56", "remaining_time": "0:59:04"} +{"current_steps": 513, "total_steps": 1557, "loss": 0.0119, "lr": 3.395861983154522e-05, "epoch": 0.9889156626506024, "percentage": 32.95, "elapsed_time": "0:28:59", "remaining_time": "0:59:00"} +{"current_steps": 514, "total_steps": 1557, "loss": 0.0455, "lr": 3.392646629520688e-05, "epoch": 0.9908433734939759, "percentage": 33.01, "elapsed_time": "0:29:03", "remaining_time": "0:58:57"} +{"current_steps": 515, "total_steps": 1557, "loss": 0.0203, "lr": 3.389424273207752e-05, "epoch": 0.9927710843373494, "percentage": 33.08, "elapsed_time": "0:29:06", "remaining_time": "0:58:54"} +{"current_steps": 516, "total_steps": 1557, "loss": 0.0329, "lr": 3.386194930418767e-05, "epoch": 0.9946987951807229, "percentage": 33.14, "elapsed_time": "0:29:10", "remaining_time": "0:58:50"} +{"current_steps": 517, "total_steps": 1557, "loss": 0.0065, "lr": 3.382958617391915e-05, "epoch": 0.9966265060240964, "percentage": 33.2, "elapsed_time": "0:29:13", "remaining_time": "0:58:47"} +{"current_steps": 518, "total_steps": 1557, "loss": 0.0266, "lr": 3.3797153504004296e-05, "epoch": 0.9985542168674699, "percentage": 33.27, "elapsed_time": "0:29:16", "remaining_time": "0:58:43"} +{"current_steps": 519, "total_steps": 1557, "loss": 0.0169, "lr": 3.3764651457525095e-05, "epoch": 1.0, "percentage": 33.33, "elapsed_time": "0:29:19", "remaining_time": "0:58:38"} +{"current_steps": 520, "total_steps": 1557, "loss": 0.0221, "lr": 3.373208019791237e-05, "epoch": 1.0019277108433735, "percentage": 33.4, "elapsed_time": "0:29:22", "remaining_time": "0:58:35"} +{"current_steps": 521, "total_steps": 1557, "loss": 0.0211, "lr": 3.3699439888945e-05, "epoch": 1.003855421686747, "percentage": 33.46, "elapsed_time": "0:29:26", "remaining_time": "0:58:32"} +{"current_steps": 522, "total_steps": 1557, "loss": 0.0069, "lr": 3.366673069474904e-05, "epoch": 1.0057831325301205, "percentage": 33.53, "elapsed_time": "0:29:29", "remaining_time": "0:58:28"} +{"current_steps": 523, "total_steps": 1557, "loss": 0.0191, "lr": 3.3633952779796914e-05, "epoch": 1.007710843373494, "percentage": 33.59, "elapsed_time": "0:29:33", "remaining_time": "0:58:25"} +{"current_steps": 524, "total_steps": 1557, "loss": 0.0196, "lr": 3.360110630890664e-05, "epoch": 1.0096385542168675, "percentage": 33.65, "elapsed_time": "0:29:36", "remaining_time": "0:58:21"} +{"current_steps": 525, "total_steps": 1557, "loss": 0.0328, "lr": 3.356819144724092e-05, "epoch": 1.011566265060241, "percentage": 33.72, "elapsed_time": "0:29:39", "remaining_time": "0:58:17"} +{"current_steps": 526, "total_steps": 1557, "loss": 0.0203, "lr": 3.3535208360306354e-05, "epoch": 1.0134939759036146, "percentage": 33.78, "elapsed_time": "0:29:42", "remaining_time": "0:58:14"} +{"current_steps": 527, "total_steps": 1557, "loss": 0.0122, "lr": 3.350215721395261e-05, "epoch": 1.0154216867469879, "percentage": 33.85, "elapsed_time": "0:29:46", "remaining_time": "0:58:11"} +{"current_steps": 528, "total_steps": 1557, "loss": 0.0437, "lr": 3.346903817437157e-05, "epoch": 1.0173493975903614, "percentage": 33.91, "elapsed_time": "0:29:49", "remaining_time": "0:58:07"} +{"current_steps": 529, "total_steps": 1557, "loss": 0.013, "lr": 3.343585140809651e-05, "epoch": 1.0192771084337349, "percentage": 33.98, "elapsed_time": "0:29:52", "remaining_time": "0:58:04"} +{"current_steps": 530, "total_steps": 1557, "loss": 0.008, "lr": 3.3402597082001276e-05, "epoch": 1.0212048192771084, "percentage": 34.04, "elapsed_time": "0:29:56", "remaining_time": "0:58:00"} +{"current_steps": 531, "total_steps": 1557, "loss": 0.0078, "lr": 3.3369275363299394e-05, "epoch": 1.0231325301204819, "percentage": 34.1, "elapsed_time": "0:29:59", "remaining_time": "0:57:57"} +{"current_steps": 532, "total_steps": 1557, "loss": 0.0225, "lr": 3.333588641954327e-05, "epoch": 1.0250602409638554, "percentage": 34.17, "elapsed_time": "0:30:03", "remaining_time": "0:57:53"} +{"current_steps": 533, "total_steps": 1557, "loss": 0.0118, "lr": 3.330243041862336e-05, "epoch": 1.026987951807229, "percentage": 34.23, "elapsed_time": "0:30:06", "remaining_time": "0:57:50"} +{"current_steps": 534, "total_steps": 1557, "loss": 0.0338, "lr": 3.326890752876728e-05, "epoch": 1.0289156626506024, "percentage": 34.3, "elapsed_time": "0:30:09", "remaining_time": "0:57:47"} +{"current_steps": 535, "total_steps": 1557, "loss": 0.0341, "lr": 3.323531791853901e-05, "epoch": 1.030843373493976, "percentage": 34.36, "elapsed_time": "0:30:13", "remaining_time": "0:57:43"} +{"current_steps": 536, "total_steps": 1557, "loss": 0.0184, "lr": 3.3201661756838e-05, "epoch": 1.0327710843373494, "percentage": 34.43, "elapsed_time": "0:30:16", "remaining_time": "0:57:40"} +{"current_steps": 537, "total_steps": 1557, "loss": 0.0152, "lr": 3.316793921289835e-05, "epoch": 1.034698795180723, "percentage": 34.49, "elapsed_time": "0:30:20", "remaining_time": "0:57:37"} +{"current_steps": 538, "total_steps": 1557, "loss": 0.0326, "lr": 3.313415045628795e-05, "epoch": 1.0366265060240965, "percentage": 34.55, "elapsed_time": "0:30:23", "remaining_time": "0:57:34"} +{"current_steps": 539, "total_steps": 1557, "loss": 0.0164, "lr": 3.3100295656907646e-05, "epoch": 1.03855421686747, "percentage": 34.62, "elapsed_time": "0:30:27", "remaining_time": "0:57:31"} +{"current_steps": 540, "total_steps": 1557, "loss": 0.0091, "lr": 3.306637498499034e-05, "epoch": 1.0404819277108435, "percentage": 34.68, "elapsed_time": "0:30:30", "remaining_time": "0:57:27"} +{"current_steps": 541, "total_steps": 1557, "loss": 0.0118, "lr": 3.303238861110018e-05, "epoch": 1.0424096385542168, "percentage": 34.75, "elapsed_time": "0:30:34", "remaining_time": "0:57:24"} +{"current_steps": 542, "total_steps": 1557, "loss": 0.0081, "lr": 3.299833670613168e-05, "epoch": 1.0443373493975903, "percentage": 34.81, "elapsed_time": "0:30:37", "remaining_time": "0:57:20"} +{"current_steps": 543, "total_steps": 1557, "loss": 0.0138, "lr": 3.2964219441308865e-05, "epoch": 1.0462650602409638, "percentage": 34.87, "elapsed_time": "0:30:40", "remaining_time": "0:57:16"} +{"current_steps": 544, "total_steps": 1557, "loss": 0.0171, "lr": 3.2930036988184425e-05, "epoch": 1.0481927710843373, "percentage": 34.94, "elapsed_time": "0:30:43", "remaining_time": "0:57:12"} +{"current_steps": 545, "total_steps": 1557, "loss": 0.0106, "lr": 3.28957895186388e-05, "epoch": 1.0501204819277108, "percentage": 35.0, "elapsed_time": "0:30:47", "remaining_time": "0:57:09"} +{"current_steps": 546, "total_steps": 1557, "loss": 0.0123, "lr": 3.2861477204879395e-05, "epoch": 1.0520481927710843, "percentage": 35.07, "elapsed_time": "0:30:50", "remaining_time": "0:57:06"} +{"current_steps": 547, "total_steps": 1557, "loss": 0.0088, "lr": 3.2827100219439656e-05, "epoch": 1.0539759036144578, "percentage": 35.13, "elapsed_time": "0:30:53", "remaining_time": "0:57:03"} +{"current_steps": 548, "total_steps": 1557, "loss": 0.0179, "lr": 3.279265873517822e-05, "epoch": 1.0559036144578313, "percentage": 35.2, "elapsed_time": "0:30:57", "remaining_time": "0:56:59"} +{"current_steps": 549, "total_steps": 1557, "loss": 0.0142, "lr": 3.275815292527804e-05, "epoch": 1.0578313253012048, "percentage": 35.26, "elapsed_time": "0:31:00", "remaining_time": "0:56:56"} +{"current_steps": 550, "total_steps": 1557, "loss": 0.0123, "lr": 3.2723582963245526e-05, "epoch": 1.0597590361445783, "percentage": 35.32, "elapsed_time": "0:31:04", "remaining_time": "0:56:53"} +{"current_steps": 551, "total_steps": 1557, "loss": 0.0098, "lr": 3.2688949022909665e-05, "epoch": 1.0616867469879518, "percentage": 35.39, "elapsed_time": "0:31:07", "remaining_time": "0:56:49"} +{"current_steps": 552, "total_steps": 1557, "loss": 0.0097, "lr": 3.265425127842114e-05, "epoch": 1.0636144578313254, "percentage": 35.45, "elapsed_time": "0:31:11", "remaining_time": "0:56:46"} +{"current_steps": 553, "total_steps": 1557, "loss": 0.0227, "lr": 3.261948990425147e-05, "epoch": 1.0655421686746989, "percentage": 35.52, "elapsed_time": "0:31:19", "remaining_time": "0:56:52"} +{"current_steps": 554, "total_steps": 1557, "loss": 0.047, "lr": 3.258466507519213e-05, "epoch": 1.0674698795180724, "percentage": 35.58, "elapsed_time": "0:31:23", "remaining_time": "0:56:49"} +{"current_steps": 555, "total_steps": 1557, "loss": 0.0314, "lr": 3.254977696635366e-05, "epoch": 1.0693975903614459, "percentage": 35.65, "elapsed_time": "0:31:26", "remaining_time": "0:56:46"} +{"current_steps": 556, "total_steps": 1557, "loss": 0.0046, "lr": 3.2514825753164774e-05, "epoch": 1.0713253012048192, "percentage": 35.71, "elapsed_time": "0:31:29", "remaining_time": "0:56:42"} +{"current_steps": 557, "total_steps": 1557, "loss": 0.022, "lr": 3.247981161137153e-05, "epoch": 1.0732530120481927, "percentage": 35.77, "elapsed_time": "0:31:32", "remaining_time": "0:56:37"} +{"current_steps": 558, "total_steps": 1557, "loss": 0.0134, "lr": 3.2444734717036386e-05, "epoch": 1.0751807228915662, "percentage": 35.84, "elapsed_time": "0:31:36", "remaining_time": "0:56:34"} +{"current_steps": 559, "total_steps": 1557, "loss": 0.0211, "lr": 3.240959524653735e-05, "epoch": 1.0771084337349397, "percentage": 35.9, "elapsed_time": "0:31:39", "remaining_time": "0:56:31"} +{"current_steps": 560, "total_steps": 1557, "loss": 0.0141, "lr": 3.237439337656708e-05, "epoch": 1.0790361445783132, "percentage": 35.97, "elapsed_time": "0:31:43", "remaining_time": "0:56:28"} +{"current_steps": 561, "total_steps": 1557, "loss": 0.019, "lr": 3.2339129284131994e-05, "epoch": 1.0809638554216867, "percentage": 36.03, "elapsed_time": "0:31:47", "remaining_time": "0:56:25"} +{"current_steps": 562, "total_steps": 1557, "loss": 0.0249, "lr": 3.2303803146551386e-05, "epoch": 1.0828915662650602, "percentage": 36.1, "elapsed_time": "0:31:50", "remaining_time": "0:56:22"} +{"current_steps": 563, "total_steps": 1557, "loss": 0.0088, "lr": 3.226841514145656e-05, "epoch": 1.0848192771084337, "percentage": 36.16, "elapsed_time": "0:31:53", "remaining_time": "0:56:19"} +{"current_steps": 564, "total_steps": 1557, "loss": 0.0054, "lr": 3.223296544678987e-05, "epoch": 1.0867469879518072, "percentage": 36.22, "elapsed_time": "0:31:56", "remaining_time": "0:56:15"} +{"current_steps": 565, "total_steps": 1557, "loss": 0.0109, "lr": 3.219745424080389e-05, "epoch": 1.0886746987951808, "percentage": 36.29, "elapsed_time": "0:32:00", "remaining_time": "0:56:12"} +{"current_steps": 566, "total_steps": 1557, "loss": 0.0106, "lr": 3.2161881702060476e-05, "epoch": 1.0906024096385543, "percentage": 36.35, "elapsed_time": "0:32:04", "remaining_time": "0:56:08"} +{"current_steps": 567, "total_steps": 1557, "loss": 0.0077, "lr": 3.2126248009429905e-05, "epoch": 1.0925301204819278, "percentage": 36.42, "elapsed_time": "0:32:07", "remaining_time": "0:56:05"} +{"current_steps": 568, "total_steps": 1557, "loss": 0.0093, "lr": 3.2090553342089935e-05, "epoch": 1.0944578313253013, "percentage": 36.48, "elapsed_time": "0:32:11", "remaining_time": "0:56:02"} +{"current_steps": 569, "total_steps": 1557, "loss": 0.036, "lr": 3.205479787952494e-05, "epoch": 1.0963855421686748, "percentage": 36.54, "elapsed_time": "0:32:14", "remaining_time": "0:55:59"} +{"current_steps": 570, "total_steps": 1557, "loss": 0.0085, "lr": 3.201898180152499e-05, "epoch": 1.0983132530120483, "percentage": 36.61, "elapsed_time": "0:32:18", "remaining_time": "0:55:56"} +{"current_steps": 571, "total_steps": 1557, "loss": 0.0081, "lr": 3.1983105288184945e-05, "epoch": 1.1002409638554216, "percentage": 36.67, "elapsed_time": "0:32:21", "remaining_time": "0:55:52"} +{"current_steps": 572, "total_steps": 1557, "loss": 0.017, "lr": 3.194716851990355e-05, "epoch": 1.102168674698795, "percentage": 36.74, "elapsed_time": "0:32:24", "remaining_time": "0:55:49"} +{"current_steps": 573, "total_steps": 1557, "loss": 0.021, "lr": 3.191117167738253e-05, "epoch": 1.1040963855421686, "percentage": 36.8, "elapsed_time": "0:32:28", "remaining_time": "0:55:45"} +{"current_steps": 574, "total_steps": 1557, "loss": 0.0096, "lr": 3.1875114941625705e-05, "epoch": 1.106024096385542, "percentage": 36.87, "elapsed_time": "0:32:31", "remaining_time": "0:55:42"} +{"current_steps": 575, "total_steps": 1557, "loss": 0.0118, "lr": 3.1838998493938026e-05, "epoch": 1.1079518072289156, "percentage": 36.93, "elapsed_time": "0:32:35", "remaining_time": "0:55:39"} +{"current_steps": 576, "total_steps": 1557, "loss": 0.0094, "lr": 3.180282251592472e-05, "epoch": 1.1098795180722891, "percentage": 36.99, "elapsed_time": "0:32:38", "remaining_time": "0:55:36"} +{"current_steps": 577, "total_steps": 1557, "loss": 0.0131, "lr": 3.1766587189490336e-05, "epoch": 1.1118072289156626, "percentage": 37.06, "elapsed_time": "0:32:42", "remaining_time": "0:55:32"} +{"current_steps": 578, "total_steps": 1557, "loss": 0.0445, "lr": 3.173029269683785e-05, "epoch": 1.1137349397590361, "percentage": 37.12, "elapsed_time": "0:32:45", "remaining_time": "0:55:29"} +{"current_steps": 579, "total_steps": 1557, "loss": 0.0116, "lr": 3.169393922046776e-05, "epoch": 1.1156626506024097, "percentage": 37.19, "elapsed_time": "0:32:49", "remaining_time": "0:55:26"} +{"current_steps": 580, "total_steps": 1557, "loss": 0.0116, "lr": 3.165752694317713e-05, "epoch": 1.1175903614457832, "percentage": 37.25, "elapsed_time": "0:32:52", "remaining_time": "0:55:23"} +{"current_steps": 581, "total_steps": 1557, "loss": 0.013, "lr": 3.16210560480587e-05, "epoch": 1.1195180722891567, "percentage": 37.32, "elapsed_time": "0:32:55", "remaining_time": "0:55:18"} +{"current_steps": 582, "total_steps": 1557, "loss": 0.0052, "lr": 3.158452671849998e-05, "epoch": 1.1214457831325302, "percentage": 37.38, "elapsed_time": "0:32:58", "remaining_time": "0:55:15"} +{"current_steps": 583, "total_steps": 1557, "loss": 0.0182, "lr": 3.154793913818226e-05, "epoch": 1.1233734939759037, "percentage": 37.44, "elapsed_time": "0:33:02", "remaining_time": "0:55:12"} +{"current_steps": 584, "total_steps": 1557, "loss": 0.0146, "lr": 3.1511293491079804e-05, "epoch": 1.1253012048192772, "percentage": 37.51, "elapsed_time": "0:33:05", "remaining_time": "0:55:08"} +{"current_steps": 585, "total_steps": 1557, "loss": 0.0139, "lr": 3.1474589961458786e-05, "epoch": 1.1272289156626507, "percentage": 37.57, "elapsed_time": "0:33:09", "remaining_time": "0:55:05"} +{"current_steps": 586, "total_steps": 1557, "loss": 0.0236, "lr": 3.1437828733876477e-05, "epoch": 1.129156626506024, "percentage": 37.64, "elapsed_time": "0:33:12", "remaining_time": "0:55:02"} +{"current_steps": 587, "total_steps": 1557, "loss": 0.0084, "lr": 3.140100999318025e-05, "epoch": 1.1310843373493975, "percentage": 37.7, "elapsed_time": "0:33:16", "remaining_time": "0:54:58"} +{"current_steps": 588, "total_steps": 1557, "loss": 0.0215, "lr": 3.136413392450668e-05, "epoch": 1.133012048192771, "percentage": 37.76, "elapsed_time": "0:33:19", "remaining_time": "0:54:54"} +{"current_steps": 589, "total_steps": 1557, "loss": 0.0154, "lr": 3.132720071328061e-05, "epoch": 1.1349397590361445, "percentage": 37.83, "elapsed_time": "0:33:22", "remaining_time": "0:54:51"} +{"current_steps": 590, "total_steps": 1557, "loss": 0.0088, "lr": 3.1290210545214205e-05, "epoch": 1.136867469879518, "percentage": 37.89, "elapsed_time": "0:33:26", "remaining_time": "0:54:48"} +{"current_steps": 591, "total_steps": 1557, "loss": 0.0126, "lr": 3.125316360630602e-05, "epoch": 1.1387951807228915, "percentage": 37.96, "elapsed_time": "0:33:29", "remaining_time": "0:54:45"} +{"current_steps": 592, "total_steps": 1557, "loss": 0.0147, "lr": 3.121606008284011e-05, "epoch": 1.140722891566265, "percentage": 38.02, "elapsed_time": "0:33:33", "remaining_time": "0:54:42"} +{"current_steps": 593, "total_steps": 1557, "loss": 0.0119, "lr": 3.1178900161385005e-05, "epoch": 1.1426506024096386, "percentage": 38.09, "elapsed_time": "0:33:36", "remaining_time": "0:54:38"} +{"current_steps": 594, "total_steps": 1557, "loss": 0.0158, "lr": 3.114168402879286e-05, "epoch": 1.144578313253012, "percentage": 38.15, "elapsed_time": "0:33:40", "remaining_time": "0:54:35"} +{"current_steps": 595, "total_steps": 1557, "loss": 0.0107, "lr": 3.110441187219846e-05, "epoch": 1.1465060240963856, "percentage": 38.21, "elapsed_time": "0:33:43", "remaining_time": "0:54:31"} +{"current_steps": 596, "total_steps": 1557, "loss": 0.0079, "lr": 3.10670838790183e-05, "epoch": 1.148433734939759, "percentage": 38.28, "elapsed_time": "0:33:46", "remaining_time": "0:54:28"} +{"current_steps": 597, "total_steps": 1557, "loss": 0.0147, "lr": 3.102970023694965e-05, "epoch": 1.1503614457831326, "percentage": 38.34, "elapsed_time": "0:33:50", "remaining_time": "0:54:24"} +{"current_steps": 598, "total_steps": 1557, "loss": 0.0099, "lr": 3.099226113396959e-05, "epoch": 1.152289156626506, "percentage": 38.41, "elapsed_time": "0:33:53", "remaining_time": "0:54:20"} +{"current_steps": 599, "total_steps": 1557, "loss": 0.0214, "lr": 3.095476675833405e-05, "epoch": 1.1542168674698796, "percentage": 38.47, "elapsed_time": "0:33:56", "remaining_time": "0:54:17"} +{"current_steps": 600, "total_steps": 1557, "loss": 0.0118, "lr": 3.0917217298576955e-05, "epoch": 1.1561445783132531, "percentage": 38.54, "elapsed_time": "0:33:59", "remaining_time": "0:54:13"} +{"current_steps": 601, "total_steps": 1557, "loss": 0.0086, "lr": 3.0879612943509154e-05, "epoch": 1.1580722891566264, "percentage": 38.6, "elapsed_time": "0:34:03", "remaining_time": "0:54:09"} +{"current_steps": 602, "total_steps": 1557, "loss": 0.0088, "lr": 3.0841953882217536e-05, "epoch": 1.16, "percentage": 38.66, "elapsed_time": "0:34:06", "remaining_time": "0:54:06"} +{"current_steps": 603, "total_steps": 1557, "loss": 0.0241, "lr": 3.08042403040641e-05, "epoch": 1.1619277108433734, "percentage": 38.73, "elapsed_time": "0:34:10", "remaining_time": "0:54:03"} +{"current_steps": 604, "total_steps": 1557, "loss": 0.0154, "lr": 3.076647239868494e-05, "epoch": 1.163855421686747, "percentage": 38.79, "elapsed_time": "0:34:13", "remaining_time": "0:53:59"} +{"current_steps": 605, "total_steps": 1557, "loss": 0.0197, "lr": 3.072865035598933e-05, "epoch": 1.1657831325301204, "percentage": 38.86, "elapsed_time": "0:34:16", "remaining_time": "0:53:56"} +{"current_steps": 606, "total_steps": 1557, "loss": 0.0093, "lr": 3.06907743661588e-05, "epoch": 1.167710843373494, "percentage": 38.92, "elapsed_time": "0:34:19", "remaining_time": "0:53:52"} +{"current_steps": 607, "total_steps": 1557, "loss": 0.0171, "lr": 3.065284461964609e-05, "epoch": 1.1696385542168675, "percentage": 38.99, "elapsed_time": "0:34:23", "remaining_time": "0:53:49"} +{"current_steps": 608, "total_steps": 1557, "loss": 0.008, "lr": 3.061486130717428e-05, "epoch": 1.171566265060241, "percentage": 39.05, "elapsed_time": "0:34:27", "remaining_time": "0:53:46"} +{"current_steps": 609, "total_steps": 1557, "loss": 0.0155, "lr": 3.057682461973579e-05, "epoch": 1.1734939759036145, "percentage": 39.11, "elapsed_time": "0:34:30", "remaining_time": "0:53:43"} +{"current_steps": 610, "total_steps": 1557, "loss": 0.0212, "lr": 3.053873474859143e-05, "epoch": 1.175421686746988, "percentage": 39.18, "elapsed_time": "0:34:33", "remaining_time": "0:53:39"} +{"current_steps": 611, "total_steps": 1557, "loss": 0.019, "lr": 3.050059188526942e-05, "epoch": 1.1773493975903615, "percentage": 39.24, "elapsed_time": "0:34:37", "remaining_time": "0:53:36"} +{"current_steps": 612, "total_steps": 1557, "loss": 0.0147, "lr": 3.046239622156446e-05, "epoch": 1.179277108433735, "percentage": 39.31, "elapsed_time": "0:34:40", "remaining_time": "0:53:32"} +{"current_steps": 613, "total_steps": 1557, "loss": 0.0088, "lr": 3.042414794953674e-05, "epoch": 1.1812048192771085, "percentage": 39.37, "elapsed_time": "0:34:43", "remaining_time": "0:53:28"} +{"current_steps": 614, "total_steps": 1557, "loss": 0.0187, "lr": 3.0385847261510975e-05, "epoch": 1.1831325301204818, "percentage": 39.43, "elapsed_time": "0:34:47", "remaining_time": "0:53:25"} +{"current_steps": 615, "total_steps": 1557, "loss": 0.0124, "lr": 3.0347494350075465e-05, "epoch": 1.1850602409638555, "percentage": 39.5, "elapsed_time": "0:34:50", "remaining_time": "0:53:21"} +{"current_steps": 616, "total_steps": 1557, "loss": 0.01, "lr": 3.0309089408081074e-05, "epoch": 1.1869879518072288, "percentage": 39.56, "elapsed_time": "0:34:53", "remaining_time": "0:53:17"} +{"current_steps": 617, "total_steps": 1557, "loss": 0.0087, "lr": 3.027063262864032e-05, "epoch": 1.1889156626506023, "percentage": 39.63, "elapsed_time": "0:34:56", "remaining_time": "0:53:14"} +{"current_steps": 618, "total_steps": 1557, "loss": 0.0137, "lr": 3.023212420512637e-05, "epoch": 1.1908433734939758, "percentage": 39.69, "elapsed_time": "0:35:00", "remaining_time": "0:53:10"} +{"current_steps": 619, "total_steps": 1557, "loss": 0.0056, "lr": 3.0193564331172074e-05, "epoch": 1.1927710843373494, "percentage": 39.76, "elapsed_time": "0:35:03", "remaining_time": "0:53:07"} +{"current_steps": 620, "total_steps": 1557, "loss": 0.0274, "lr": 3.0154953200668976e-05, "epoch": 1.1946987951807229, "percentage": 39.82, "elapsed_time": "0:35:06", "remaining_time": "0:53:04"} +{"current_steps": 621, "total_steps": 1557, "loss": 0.0151, "lr": 3.011629100776638e-05, "epoch": 1.1966265060240964, "percentage": 39.88, "elapsed_time": "0:35:10", "remaining_time": "0:53:00"} +{"current_steps": 622, "total_steps": 1557, "loss": 0.0424, "lr": 3.007757794687033e-05, "epoch": 1.1985542168674699, "percentage": 39.95, "elapsed_time": "0:35:13", "remaining_time": "0:52:57"} +{"current_steps": 623, "total_steps": 1557, "loss": 0.0079, "lr": 3.003881421264266e-05, "epoch": 1.2004819277108434, "percentage": 40.01, "elapsed_time": "0:35:16", "remaining_time": "0:52:53"} +{"current_steps": 624, "total_steps": 1557, "loss": 0.0142, "lr": 3.0000000000000004e-05, "epoch": 1.202409638554217, "percentage": 40.08, "elapsed_time": "0:35:20", "remaining_time": "0:52:50"} +{"current_steps": 625, "total_steps": 1557, "loss": 0.0251, "lr": 2.996113550411281e-05, "epoch": 1.2043373493975904, "percentage": 40.14, "elapsed_time": "0:35:23", "remaining_time": "0:52:46"} +{"current_steps": 626, "total_steps": 1557, "loss": 0.0137, "lr": 2.9922220920404375e-05, "epoch": 1.206265060240964, "percentage": 40.21, "elapsed_time": "0:35:26", "remaining_time": "0:52:42"} +{"current_steps": 627, "total_steps": 1557, "loss": 0.0105, "lr": 2.9883256444549862e-05, "epoch": 1.2081927710843374, "percentage": 40.27, "elapsed_time": "0:35:30", "remaining_time": "0:52:39"} +{"current_steps": 628, "total_steps": 1557, "loss": 0.0089, "lr": 2.984424227247529e-05, "epoch": 1.210120481927711, "percentage": 40.33, "elapsed_time": "0:35:33", "remaining_time": "0:52:36"} +{"current_steps": 629, "total_steps": 1557, "loss": 0.0253, "lr": 2.980517860035656e-05, "epoch": 1.2120481927710842, "percentage": 40.4, "elapsed_time": "0:35:37", "remaining_time": "0:52:33"} +{"current_steps": 630, "total_steps": 1557, "loss": 0.0134, "lr": 2.9766065624618518e-05, "epoch": 1.213975903614458, "percentage": 40.46, "elapsed_time": "0:35:40", "remaining_time": "0:52:29"} +{"current_steps": 631, "total_steps": 1557, "loss": 0.0157, "lr": 2.972690354193388e-05, "epoch": 1.2159036144578312, "percentage": 40.53, "elapsed_time": "0:35:43", "remaining_time": "0:52:26"} +{"current_steps": 632, "total_steps": 1557, "loss": 0.0204, "lr": 2.96876925492223e-05, "epoch": 1.2178313253012047, "percentage": 40.59, "elapsed_time": "0:35:46", "remaining_time": "0:52:22"} +{"current_steps": 633, "total_steps": 1557, "loss": 0.0114, "lr": 2.9648432843649382e-05, "epoch": 1.2197590361445783, "percentage": 40.66, "elapsed_time": "0:35:50", "remaining_time": "0:52:19"} +{"current_steps": 634, "total_steps": 1557, "loss": 0.0146, "lr": 2.960912462262566e-05, "epoch": 1.2216867469879518, "percentage": 40.72, "elapsed_time": "0:35:54", "remaining_time": "0:52:16"} +{"current_steps": 635, "total_steps": 1557, "loss": 0.0112, "lr": 2.9569768083805618e-05, "epoch": 1.2236144578313253, "percentage": 40.78, "elapsed_time": "0:35:57", "remaining_time": "0:52:12"} +{"current_steps": 636, "total_steps": 1557, "loss": 0.0377, "lr": 2.953036342508671e-05, "epoch": 1.2255421686746988, "percentage": 40.85, "elapsed_time": "0:36:01", "remaining_time": "0:52:09"} +{"current_steps": 637, "total_steps": 1557, "loss": 0.0432, "lr": 2.9490910844608346e-05, "epoch": 1.2274698795180723, "percentage": 40.91, "elapsed_time": "0:36:04", "remaining_time": "0:52:06"} +{"current_steps": 638, "total_steps": 1557, "loss": 0.0203, "lr": 2.9451410540750887e-05, "epoch": 1.2293975903614458, "percentage": 40.98, "elapsed_time": "0:36:07", "remaining_time": "0:52:02"} +{"current_steps": 639, "total_steps": 1557, "loss": 0.0311, "lr": 2.94118627121347e-05, "epoch": 1.2313253012048193, "percentage": 41.04, "elapsed_time": "0:36:11", "remaining_time": "0:51:59"} +{"current_steps": 640, "total_steps": 1557, "loss": 0.0168, "lr": 2.9372267557619075e-05, "epoch": 1.2332530120481928, "percentage": 41.1, "elapsed_time": "0:36:15", "remaining_time": "0:51:56"} +{"current_steps": 641, "total_steps": 1557, "loss": 0.0136, "lr": 2.933262527630131e-05, "epoch": 1.2351807228915663, "percentage": 41.17, "elapsed_time": "0:36:18", "remaining_time": "0:51:52"} +{"current_steps": 642, "total_steps": 1557, "loss": 0.0339, "lr": 2.929293606751565e-05, "epoch": 1.2371084337349398, "percentage": 41.23, "elapsed_time": "0:36:21", "remaining_time": "0:51:49"} +{"current_steps": 643, "total_steps": 1557, "loss": 0.0095, "lr": 2.9253200130832322e-05, "epoch": 1.2390361445783133, "percentage": 41.3, "elapsed_time": "0:36:24", "remaining_time": "0:51:45"} +{"current_steps": 644, "total_steps": 1557, "loss": 0.0142, "lr": 2.92134176660565e-05, "epoch": 1.2409638554216866, "percentage": 41.36, "elapsed_time": "0:36:28", "remaining_time": "0:51:42"} +{"current_steps": 645, "total_steps": 1557, "loss": 0.0209, "lr": 2.9173588873227338e-05, "epoch": 1.2428915662650604, "percentage": 41.43, "elapsed_time": "0:36:36", "remaining_time": "0:51:46"} +{"current_steps": 646, "total_steps": 1557, "loss": 0.0087, "lr": 2.913371395261691e-05, "epoch": 1.2448192771084337, "percentage": 41.49, "elapsed_time": "0:36:39", "remaining_time": "0:51:42"} +{"current_steps": 647, "total_steps": 1557, "loss": 0.0164, "lr": 2.9093793104729268e-05, "epoch": 1.2467469879518072, "percentage": 41.55, "elapsed_time": "0:36:43", "remaining_time": "0:51:39"} +{"current_steps": 648, "total_steps": 1557, "loss": 0.0138, "lr": 2.9053826530299377e-05, "epoch": 1.2486746987951807, "percentage": 41.62, "elapsed_time": "0:36:46", "remaining_time": "0:51:35"} +{"current_steps": 649, "total_steps": 1557, "loss": 0.0353, "lr": 2.901381443029215e-05, "epoch": 1.2506024096385542, "percentage": 41.68, "elapsed_time": "0:36:49", "remaining_time": "0:51:31"} +{"current_steps": 650, "total_steps": 1557, "loss": 0.007, "lr": 2.897375700590141e-05, "epoch": 1.2525301204819277, "percentage": 41.75, "elapsed_time": "0:36:52", "remaining_time": "0:51:27"} +{"current_steps": 651, "total_steps": 1557, "loss": 0.0123, "lr": 2.8933654458548873e-05, "epoch": 1.2544578313253012, "percentage": 41.81, "elapsed_time": "0:36:56", "remaining_time": "0:51:24"} +{"current_steps": 652, "total_steps": 1557, "loss": 0.0099, "lr": 2.8893506989883167e-05, "epoch": 1.2563855421686747, "percentage": 41.88, "elapsed_time": "0:36:59", "remaining_time": "0:51:20"} +{"current_steps": 653, "total_steps": 1557, "loss": 0.0097, "lr": 2.8853314801778784e-05, "epoch": 1.2583132530120482, "percentage": 41.94, "elapsed_time": "0:37:03", "remaining_time": "0:51:17"} +{"current_steps": 654, "total_steps": 1557, "loss": 0.0091, "lr": 2.8813078096335093e-05, "epoch": 1.2602409638554217, "percentage": 42.0, "elapsed_time": "0:37:06", "remaining_time": "0:51:13"} +{"current_steps": 655, "total_steps": 1557, "loss": 0.0088, "lr": 2.87727970758753e-05, "epoch": 1.2621686746987952, "percentage": 42.07, "elapsed_time": "0:37:09", "remaining_time": "0:51:10"} +{"current_steps": 656, "total_steps": 1557, "loss": 0.0145, "lr": 2.8732471942945443e-05, "epoch": 1.2640963855421687, "percentage": 42.13, "elapsed_time": "0:37:12", "remaining_time": "0:51:06"} +{"current_steps": 657, "total_steps": 1557, "loss": 0.0198, "lr": 2.8692102900313378e-05, "epoch": 1.266024096385542, "percentage": 42.2, "elapsed_time": "0:37:16", "remaining_time": "0:51:03"} +{"current_steps": 658, "total_steps": 1557, "loss": 0.0085, "lr": 2.8651690150967748e-05, "epoch": 1.2679518072289158, "percentage": 42.26, "elapsed_time": "0:37:18", "remaining_time": "0:50:59"} +{"current_steps": 659, "total_steps": 1557, "loss": 0.0071, "lr": 2.8611233898116967e-05, "epoch": 1.269879518072289, "percentage": 42.32, "elapsed_time": "0:37:22", "remaining_time": "0:50:55"} +{"current_steps": 660, "total_steps": 1557, "loss": 0.012, "lr": 2.85707343451882e-05, "epoch": 1.2718072289156628, "percentage": 42.39, "elapsed_time": "0:37:26", "remaining_time": "0:50:52"} +{"current_steps": 661, "total_steps": 1557, "loss": 0.0092, "lr": 2.853019169582635e-05, "epoch": 1.273734939759036, "percentage": 42.45, "elapsed_time": "0:37:29", "remaining_time": "0:50:49"} +{"current_steps": 662, "total_steps": 1557, "loss": 0.0144, "lr": 2.8489606153892997e-05, "epoch": 1.2756626506024096, "percentage": 42.52, "elapsed_time": "0:37:33", "remaining_time": "0:50:46"} +{"current_steps": 663, "total_steps": 1557, "loss": 0.0121, "lr": 2.8448977923465425e-05, "epoch": 1.277590361445783, "percentage": 42.58, "elapsed_time": "0:37:36", "remaining_time": "0:50:42"} +{"current_steps": 664, "total_steps": 1557, "loss": 0.0125, "lr": 2.840830720883555e-05, "epoch": 1.2795180722891566, "percentage": 42.65, "elapsed_time": "0:37:40", "remaining_time": "0:50:39"} +{"current_steps": 665, "total_steps": 1557, "loss": 0.021, "lr": 2.836759421450893e-05, "epoch": 1.28144578313253, "percentage": 42.71, "elapsed_time": "0:37:43", "remaining_time": "0:50:36"} +{"current_steps": 666, "total_steps": 1557, "loss": 0.0216, "lr": 2.83268391452037e-05, "epoch": 1.2833734939759036, "percentage": 42.77, "elapsed_time": "0:37:46", "remaining_time": "0:50:32"} +{"current_steps": 667, "total_steps": 1557, "loss": 0.0077, "lr": 2.828604220584958e-05, "epoch": 1.2853012048192771, "percentage": 42.84, "elapsed_time": "0:37:49", "remaining_time": "0:50:28"} +{"current_steps": 668, "total_steps": 1557, "loss": 0.0394, "lr": 2.824520360158681e-05, "epoch": 1.2872289156626506, "percentage": 42.9, "elapsed_time": "0:37:53", "remaining_time": "0:50:25"} +{"current_steps": 669, "total_steps": 1557, "loss": 0.0087, "lr": 2.820432353776515e-05, "epoch": 1.2891566265060241, "percentage": 42.97, "elapsed_time": "0:37:56", "remaining_time": "0:50:21"} +{"current_steps": 670, "total_steps": 1557, "loss": 0.01, "lr": 2.8163402219942822e-05, "epoch": 1.2910843373493976, "percentage": 43.03, "elapsed_time": "0:38:00", "remaining_time": "0:50:18"} +{"current_steps": 671, "total_steps": 1557, "loss": 0.0127, "lr": 2.8122439853885488e-05, "epoch": 1.2930120481927712, "percentage": 43.1, "elapsed_time": "0:38:03", "remaining_time": "0:50:15"} +{"current_steps": 672, "total_steps": 1557, "loss": 0.0128, "lr": 2.8081436645565216e-05, "epoch": 1.2949397590361444, "percentage": 43.16, "elapsed_time": "0:38:06", "remaining_time": "0:50:11"} +{"current_steps": 673, "total_steps": 1557, "loss": 0.0199, "lr": 2.804039280115944e-05, "epoch": 1.2968674698795182, "percentage": 43.22, "elapsed_time": "0:38:09", "remaining_time": "0:50:07"} +{"current_steps": 674, "total_steps": 1557, "loss": 0.0088, "lr": 2.7999308527049927e-05, "epoch": 1.2987951807228915, "percentage": 43.29, "elapsed_time": "0:38:12", "remaining_time": "0:50:03"} +{"current_steps": 675, "total_steps": 1557, "loss": 0.0084, "lr": 2.795818402982174e-05, "epoch": 1.3007228915662652, "percentage": 43.35, "elapsed_time": "0:38:16", "remaining_time": "0:50:00"} +{"current_steps": 676, "total_steps": 1557, "loss": 0.0154, "lr": 2.7917019516262186e-05, "epoch": 1.3026506024096385, "percentage": 43.42, "elapsed_time": "0:38:19", "remaining_time": "0:49:56"} +{"current_steps": 677, "total_steps": 1557, "loss": 0.0078, "lr": 2.78758151933598e-05, "epoch": 1.304578313253012, "percentage": 43.48, "elapsed_time": "0:38:22", "remaining_time": "0:49:53"} +{"current_steps": 678, "total_steps": 1557, "loss": 0.0058, "lr": 2.7834571268303294e-05, "epoch": 1.3065060240963855, "percentage": 43.55, "elapsed_time": "0:38:26", "remaining_time": "0:49:49"} +{"current_steps": 679, "total_steps": 1557, "loss": 0.007, "lr": 2.779328794848049e-05, "epoch": 1.308433734939759, "percentage": 43.61, "elapsed_time": "0:38:29", "remaining_time": "0:49:46"} +{"current_steps": 680, "total_steps": 1557, "loss": 0.0203, "lr": 2.7751965441477325e-05, "epoch": 1.3103614457831325, "percentage": 43.67, "elapsed_time": "0:38:32", "remaining_time": "0:49:42"} +{"current_steps": 681, "total_steps": 1557, "loss": 0.0106, "lr": 2.771060395507677e-05, "epoch": 1.312289156626506, "percentage": 43.74, "elapsed_time": "0:38:35", "remaining_time": "0:49:38"} +{"current_steps": 682, "total_steps": 1557, "loss": 0.0122, "lr": 2.7669203697257794e-05, "epoch": 1.3142168674698795, "percentage": 43.8, "elapsed_time": "0:38:39", "remaining_time": "0:49:35"} +{"current_steps": 683, "total_steps": 1557, "loss": 0.0101, "lr": 2.7627764876194335e-05, "epoch": 1.316144578313253, "percentage": 43.87, "elapsed_time": "0:38:42", "remaining_time": "0:49:31"} +{"current_steps": 684, "total_steps": 1557, "loss": 0.0203, "lr": 2.7586287700254214e-05, "epoch": 1.3180722891566266, "percentage": 43.93, "elapsed_time": "0:38:45", "remaining_time": "0:49:28"} +{"current_steps": 685, "total_steps": 1557, "loss": 0.0084, "lr": 2.7544772377998147e-05, "epoch": 1.32, "percentage": 43.99, "elapsed_time": "0:38:49", "remaining_time": "0:49:24"} +{"current_steps": 686, "total_steps": 1557, "loss": 0.008, "lr": 2.7503219118178636e-05, "epoch": 1.3219277108433736, "percentage": 44.06, "elapsed_time": "0:38:52", "remaining_time": "0:49:21"} +{"current_steps": 687, "total_steps": 1557, "loss": 0.0053, "lr": 2.7461628129738954e-05, "epoch": 1.3238554216867469, "percentage": 44.12, "elapsed_time": "0:38:55", "remaining_time": "0:49:17"} +{"current_steps": 688, "total_steps": 1557, "loss": 0.0059, "lr": 2.7419999621812086e-05, "epoch": 1.3257831325301206, "percentage": 44.19, "elapsed_time": "0:38:59", "remaining_time": "0:49:14"} +{"current_steps": 689, "total_steps": 1557, "loss": 0.0095, "lr": 2.7378333803719672e-05, "epoch": 1.3277108433734939, "percentage": 44.25, "elapsed_time": "0:39:02", "remaining_time": "0:49:11"} +{"current_steps": 690, "total_steps": 1557, "loss": 0.0071, "lr": 2.733663088497097e-05, "epoch": 1.3296385542168676, "percentage": 44.32, "elapsed_time": "0:39:05", "remaining_time": "0:49:07"} +{"current_steps": 691, "total_steps": 1557, "loss": 0.0227, "lr": 2.7294891075261785e-05, "epoch": 1.331566265060241, "percentage": 44.38, "elapsed_time": "0:39:09", "remaining_time": "0:49:04"} +{"current_steps": 692, "total_steps": 1557, "loss": 0.0039, "lr": 2.7253114584473418e-05, "epoch": 1.3334939759036144, "percentage": 44.44, "elapsed_time": "0:39:12", "remaining_time": "0:49:00"} +{"current_steps": 693, "total_steps": 1557, "loss": 0.008, "lr": 2.7211301622671623e-05, "epoch": 1.335421686746988, "percentage": 44.51, "elapsed_time": "0:39:15", "remaining_time": "0:48:56"} +{"current_steps": 694, "total_steps": 1557, "loss": 0.0238, "lr": 2.7169452400105533e-05, "epoch": 1.3373493975903614, "percentage": 44.57, "elapsed_time": "0:39:19", "remaining_time": "0:48:53"} +{"current_steps": 695, "total_steps": 1557, "loss": 0.0439, "lr": 2.712756712720663e-05, "epoch": 1.339277108433735, "percentage": 44.64, "elapsed_time": "0:39:22", "remaining_time": "0:48:50"} +{"current_steps": 696, "total_steps": 1557, "loss": 0.0085, "lr": 2.708564601458765e-05, "epoch": 1.3412048192771084, "percentage": 44.7, "elapsed_time": "0:39:25", "remaining_time": "0:48:46"} +{"current_steps": 697, "total_steps": 1557, "loss": 0.0097, "lr": 2.7043689273041535e-05, "epoch": 1.343132530120482, "percentage": 44.77, "elapsed_time": "0:39:28", "remaining_time": "0:48:42"} +{"current_steps": 698, "total_steps": 1557, "loss": 0.0119, "lr": 2.7001697113540414e-05, "epoch": 1.3450602409638555, "percentage": 44.83, "elapsed_time": "0:39:31", "remaining_time": "0:48:38"} +{"current_steps": 699, "total_steps": 1557, "loss": 0.0096, "lr": 2.6959669747234482e-05, "epoch": 1.346987951807229, "percentage": 44.89, "elapsed_time": "0:39:35", "remaining_time": "0:48:35"} +{"current_steps": 700, "total_steps": 1557, "loss": 0.0317, "lr": 2.6917607385450973e-05, "epoch": 1.3489156626506025, "percentage": 44.96, "elapsed_time": "0:39:38", "remaining_time": "0:48:31"} +{"current_steps": 701, "total_steps": 1557, "loss": 0.0112, "lr": 2.687551023969308e-05, "epoch": 1.350843373493976, "percentage": 45.02, "elapsed_time": "0:39:41", "remaining_time": "0:48:28"} +{"current_steps": 702, "total_steps": 1557, "loss": 0.0092, "lr": 2.6833378521638935e-05, "epoch": 1.3527710843373493, "percentage": 45.09, "elapsed_time": "0:39:45", "remaining_time": "0:48:25"} +{"current_steps": 703, "total_steps": 1557, "loss": 0.0314, "lr": 2.679121244314046e-05, "epoch": 1.354698795180723, "percentage": 45.15, "elapsed_time": "0:39:48", "remaining_time": "0:48:21"} +{"current_steps": 704, "total_steps": 1557, "loss": 0.0158, "lr": 2.674901221622239e-05, "epoch": 1.3566265060240963, "percentage": 45.22, "elapsed_time": "0:39:51", "remaining_time": "0:48:18"} +{"current_steps": 705, "total_steps": 1557, "loss": 0.0162, "lr": 2.670677805308116e-05, "epoch": 1.3585542168674698, "percentage": 45.28, "elapsed_time": "0:39:55", "remaining_time": "0:48:14"} +{"current_steps": 706, "total_steps": 1557, "loss": 0.0074, "lr": 2.666451016608383e-05, "epoch": 1.3604819277108433, "percentage": 45.34, "elapsed_time": "0:39:58", "remaining_time": "0:48:11"} +{"current_steps": 707, "total_steps": 1557, "loss": 0.0135, "lr": 2.6622208767767075e-05, "epoch": 1.3624096385542168, "percentage": 45.41, "elapsed_time": "0:40:02", "remaining_time": "0:48:07"} +{"current_steps": 708, "total_steps": 1557, "loss": 0.0107, "lr": 2.6579874070836032e-05, "epoch": 1.3643373493975903, "percentage": 45.47, "elapsed_time": "0:40:05", "remaining_time": "0:48:04"} +{"current_steps": 709, "total_steps": 1557, "loss": 0.0043, "lr": 2.6537506288163303e-05, "epoch": 1.3662650602409638, "percentage": 45.54, "elapsed_time": "0:40:08", "remaining_time": "0:48:00"} +{"current_steps": 710, "total_steps": 1557, "loss": 0.0092, "lr": 2.6495105632787835e-05, "epoch": 1.3681927710843373, "percentage": 45.6, "elapsed_time": "0:40:12", "remaining_time": "0:47:57"} +{"current_steps": 711, "total_steps": 1557, "loss": 0.0097, "lr": 2.6452672317913893e-05, "epoch": 1.3701204819277109, "percentage": 45.66, "elapsed_time": "0:40:15", "remaining_time": "0:47:54"} +{"current_steps": 712, "total_steps": 1557, "loss": 0.0193, "lr": 2.6410206556909943e-05, "epoch": 1.3720481927710844, "percentage": 45.73, "elapsed_time": "0:40:18", "remaining_time": "0:47:50"} +{"current_steps": 713, "total_steps": 1557, "loss": 0.0229, "lr": 2.636770856330761e-05, "epoch": 1.3739759036144579, "percentage": 45.79, "elapsed_time": "0:40:21", "remaining_time": "0:47:46"} +{"current_steps": 714, "total_steps": 1557, "loss": 0.004, "lr": 2.6325178550800596e-05, "epoch": 1.3759036144578314, "percentage": 45.86, "elapsed_time": "0:40:25", "remaining_time": "0:47:43"} +{"current_steps": 715, "total_steps": 1557, "loss": 0.0137, "lr": 2.6282616733243603e-05, "epoch": 1.377831325301205, "percentage": 45.92, "elapsed_time": "0:40:28", "remaining_time": "0:47:40"} +{"current_steps": 716, "total_steps": 1557, "loss": 0.0153, "lr": 2.6240023324651258e-05, "epoch": 1.3797590361445784, "percentage": 45.99, "elapsed_time": "0:40:32", "remaining_time": "0:47:37"} +{"current_steps": 717, "total_steps": 1557, "loss": 0.0031, "lr": 2.619739853919704e-05, "epoch": 1.3816867469879517, "percentage": 46.05, "elapsed_time": "0:40:35", "remaining_time": "0:47:33"} +{"current_steps": 718, "total_steps": 1557, "loss": 0.0109, "lr": 2.6154742591212196e-05, "epoch": 1.3836144578313254, "percentage": 46.11, "elapsed_time": "0:40:39", "remaining_time": "0:47:30"} +{"current_steps": 719, "total_steps": 1557, "loss": 0.0094, "lr": 2.611205569518468e-05, "epoch": 1.3855421686746987, "percentage": 46.18, "elapsed_time": "0:40:42", "remaining_time": "0:47:26"} +{"current_steps": 720, "total_steps": 1557, "loss": 0.0123, "lr": 2.6069338065758056e-05, "epoch": 1.3874698795180722, "percentage": 46.24, "elapsed_time": "0:40:45", "remaining_time": "0:47:23"} +{"current_steps": 721, "total_steps": 1557, "loss": 0.0104, "lr": 2.6026589917730416e-05, "epoch": 1.3893975903614457, "percentage": 46.31, "elapsed_time": "0:40:48", "remaining_time": "0:47:19"} +{"current_steps": 722, "total_steps": 1557, "loss": 0.0143, "lr": 2.5983811466053327e-05, "epoch": 1.3913253012048192, "percentage": 46.37, "elapsed_time": "0:40:51", "remaining_time": "0:47:15"} +{"current_steps": 723, "total_steps": 1557, "loss": 0.011, "lr": 2.5941002925830708e-05, "epoch": 1.3932530120481927, "percentage": 46.44, "elapsed_time": "0:40:55", "remaining_time": "0:47:11"} +{"current_steps": 724, "total_steps": 1557, "loss": 0.0098, "lr": 2.589816451231781e-05, "epoch": 1.3951807228915662, "percentage": 46.5, "elapsed_time": "0:40:58", "remaining_time": "0:47:08"} +{"current_steps": 725, "total_steps": 1557, "loss": 0.0094, "lr": 2.585529644092006e-05, "epoch": 1.3971084337349398, "percentage": 46.56, "elapsed_time": "0:41:01", "remaining_time": "0:47:04"} +{"current_steps": 726, "total_steps": 1557, "loss": 0.0128, "lr": 2.5812398927192027e-05, "epoch": 1.3990361445783133, "percentage": 46.63, "elapsed_time": "0:41:04", "remaining_time": "0:47:01"} +{"current_steps": 727, "total_steps": 1557, "loss": 0.0091, "lr": 2.5769472186836347e-05, "epoch": 1.4009638554216868, "percentage": 46.69, "elapsed_time": "0:41:08", "remaining_time": "0:46:58"} +{"current_steps": 728, "total_steps": 1557, "loss": 0.0154, "lr": 2.5726516435702583e-05, "epoch": 1.4028915662650603, "percentage": 46.76, "elapsed_time": "0:41:11", "remaining_time": "0:46:54"} +{"current_steps": 729, "total_steps": 1557, "loss": 0.0088, "lr": 2.5683531889786194e-05, "epoch": 1.4048192771084338, "percentage": 46.82, "elapsed_time": "0:41:15", "remaining_time": "0:46:51"} +{"current_steps": 730, "total_steps": 1557, "loss": 0.0083, "lr": 2.564051876522742e-05, "epoch": 1.4067469879518073, "percentage": 46.89, "elapsed_time": "0:41:18", "remaining_time": "0:46:47"} +{"current_steps": 731, "total_steps": 1557, "loss": 0.0179, "lr": 2.5597477278310202e-05, "epoch": 1.4086746987951808, "percentage": 46.95, "elapsed_time": "0:41:21", "remaining_time": "0:46:44"} +{"current_steps": 732, "total_steps": 1557, "loss": 0.0063, "lr": 2.5554407645461115e-05, "epoch": 1.410602409638554, "percentage": 47.01, "elapsed_time": "0:41:24", "remaining_time": "0:46:40"} +{"current_steps": 733, "total_steps": 1557, "loss": 0.017, "lr": 2.5511310083248243e-05, "epoch": 1.4125301204819278, "percentage": 47.08, "elapsed_time": "0:41:28", "remaining_time": "0:46:37"} +{"current_steps": 734, "total_steps": 1557, "loss": 0.0173, "lr": 2.5468184808380104e-05, "epoch": 1.4144578313253011, "percentage": 47.14, "elapsed_time": "0:41:32", "remaining_time": "0:46:34"} +{"current_steps": 735, "total_steps": 1557, "loss": 0.0165, "lr": 2.542503203770458e-05, "epoch": 1.4163855421686746, "percentage": 47.21, "elapsed_time": "0:41:35", "remaining_time": "0:46:31"} +{"current_steps": 736, "total_steps": 1557, "loss": 0.0185, "lr": 2.53818519882078e-05, "epoch": 1.4183132530120481, "percentage": 47.27, "elapsed_time": "0:41:39", "remaining_time": "0:46:27"} +{"current_steps": 737, "total_steps": 1557, "loss": 0.0134, "lr": 2.5338644877013067e-05, "epoch": 1.4202409638554216, "percentage": 47.33, "elapsed_time": "0:41:47", "remaining_time": "0:46:29"} +{"current_steps": 738, "total_steps": 1557, "loss": 0.0143, "lr": 2.5295410921379745e-05, "epoch": 1.4221686746987952, "percentage": 47.4, "elapsed_time": "0:41:50", "remaining_time": "0:46:26"} +{"current_steps": 739, "total_steps": 1557, "loss": 0.0193, "lr": 2.52521503387022e-05, "epoch": 1.4240963855421687, "percentage": 47.46, "elapsed_time": "0:41:54", "remaining_time": "0:46:23"} +{"current_steps": 740, "total_steps": 1557, "loss": 0.0114, "lr": 2.5208863346508667e-05, "epoch": 1.4260240963855422, "percentage": 47.53, "elapsed_time": "0:41:57", "remaining_time": "0:46:19"} +{"current_steps": 741, "total_steps": 1557, "loss": 0.0102, "lr": 2.5165550162460203e-05, "epoch": 1.4279518072289157, "percentage": 47.59, "elapsed_time": "0:42:00", "remaining_time": "0:46:15"} +{"current_steps": 742, "total_steps": 1557, "loss": 0.0215, "lr": 2.5122211004349536e-05, "epoch": 1.4298795180722892, "percentage": 47.66, "elapsed_time": "0:42:03", "remaining_time": "0:46:11"} +{"current_steps": 743, "total_steps": 1557, "loss": 0.0115, "lr": 2.5078846090100023e-05, "epoch": 1.4318072289156627, "percentage": 47.72, "elapsed_time": "0:42:06", "remaining_time": "0:46:08"} +{"current_steps": 744, "total_steps": 1557, "loss": 0.0153, "lr": 2.5035455637764518e-05, "epoch": 1.4337349397590362, "percentage": 47.78, "elapsed_time": "0:42:10", "remaining_time": "0:46:04"} +{"current_steps": 745, "total_steps": 1557, "loss": 0.0069, "lr": 2.4992039865524297e-05, "epoch": 1.4356626506024097, "percentage": 47.85, "elapsed_time": "0:42:13", "remaining_time": "0:46:01"} +{"current_steps": 746, "total_steps": 1557, "loss": 0.0108, "lr": 2.494859899168795e-05, "epoch": 1.4375903614457832, "percentage": 47.91, "elapsed_time": "0:42:16", "remaining_time": "0:45:57"} +{"current_steps": 747, "total_steps": 1557, "loss": 0.0095, "lr": 2.4905133234690282e-05, "epoch": 1.4395180722891565, "percentage": 47.98, "elapsed_time": "0:42:20", "remaining_time": "0:45:54"} +{"current_steps": 748, "total_steps": 1557, "loss": 0.0181, "lr": 2.486164281309122e-05, "epoch": 1.4414457831325302, "percentage": 48.04, "elapsed_time": "0:42:23", "remaining_time": "0:45:51"} +{"current_steps": 749, "total_steps": 1557, "loss": 0.025, "lr": 2.4818127945574717e-05, "epoch": 1.4433734939759035, "percentage": 48.11, "elapsed_time": "0:42:27", "remaining_time": "0:45:47"} +{"current_steps": 750, "total_steps": 1557, "loss": 0.0085, "lr": 2.4774588850947648e-05, "epoch": 1.445301204819277, "percentage": 48.17, "elapsed_time": "0:42:29", "remaining_time": "0:45:43"} +{"current_steps": 751, "total_steps": 1557, "loss": 0.0097, "lr": 2.473102574813871e-05, "epoch": 1.4472289156626506, "percentage": 48.23, "elapsed_time": "0:42:33", "remaining_time": "0:45:40"} +{"current_steps": 752, "total_steps": 1557, "loss": 0.0122, "lr": 2.4687438856197302e-05, "epoch": 1.449156626506024, "percentage": 48.3, "elapsed_time": "0:42:36", "remaining_time": "0:45:37"} +{"current_steps": 753, "total_steps": 1557, "loss": 0.0056, "lr": 2.4643828394292478e-05, "epoch": 1.4510843373493976, "percentage": 48.36, "elapsed_time": "0:42:40", "remaining_time": "0:45:33"} +{"current_steps": 754, "total_steps": 1557, "loss": 0.0052, "lr": 2.4600194581711775e-05, "epoch": 1.453012048192771, "percentage": 48.43, "elapsed_time": "0:42:43", "remaining_time": "0:45:30"} +{"current_steps": 755, "total_steps": 1557, "loss": 0.0113, "lr": 2.4556537637860176e-05, "epoch": 1.4549397590361446, "percentage": 48.49, "elapsed_time": "0:42:47", "remaining_time": "0:45:27"} +{"current_steps": 756, "total_steps": 1557, "loss": 0.0099, "lr": 2.451285778225894e-05, "epoch": 1.456867469879518, "percentage": 48.55, "elapsed_time": "0:42:50", "remaining_time": "0:45:23"} +{"current_steps": 757, "total_steps": 1557, "loss": 0.0069, "lr": 2.4469155234544565e-05, "epoch": 1.4587951807228916, "percentage": 48.62, "elapsed_time": "0:42:53", "remaining_time": "0:45:19"} +{"current_steps": 758, "total_steps": 1557, "loss": 0.0088, "lr": 2.442543021446764e-05, "epoch": 1.4607228915662651, "percentage": 48.68, "elapsed_time": "0:42:56", "remaining_time": "0:45:16"} +{"current_steps": 759, "total_steps": 1557, "loss": 0.0182, "lr": 2.4381682941891755e-05, "epoch": 1.4626506024096386, "percentage": 48.75, "elapsed_time": "0:43:00", "remaining_time": "0:45:12"} +{"current_steps": 760, "total_steps": 1557, "loss": 0.0069, "lr": 2.4337913636792382e-05, "epoch": 1.464578313253012, "percentage": 48.81, "elapsed_time": "0:43:02", "remaining_time": "0:45:08"} +{"current_steps": 761, "total_steps": 1557, "loss": 0.0406, "lr": 2.429412251925579e-05, "epoch": 1.4665060240963856, "percentage": 48.88, "elapsed_time": "0:43:06", "remaining_time": "0:45:05"} +{"current_steps": 762, "total_steps": 1557, "loss": 0.0205, "lr": 2.425030980947793e-05, "epoch": 1.468433734939759, "percentage": 48.94, "elapsed_time": "0:43:09", "remaining_time": "0:45:02"} +{"current_steps": 763, "total_steps": 1557, "loss": 0.0136, "lr": 2.420647572776332e-05, "epoch": 1.4703614457831327, "percentage": 49.0, "elapsed_time": "0:43:13", "remaining_time": "0:44:58"} +{"current_steps": 764, "total_steps": 1557, "loss": 0.011, "lr": 2.416262049452395e-05, "epoch": 1.472289156626506, "percentage": 49.07, "elapsed_time": "0:43:16", "remaining_time": "0:44:55"} +{"current_steps": 765, "total_steps": 1557, "loss": 0.0131, "lr": 2.4118744330278147e-05, "epoch": 1.4742168674698795, "percentage": 49.13, "elapsed_time": "0:43:20", "remaining_time": "0:44:51"} +{"current_steps": 766, "total_steps": 1557, "loss": 0.0138, "lr": 2.4074847455649523e-05, "epoch": 1.476144578313253, "percentage": 49.2, "elapsed_time": "0:43:23", "remaining_time": "0:44:48"} +{"current_steps": 767, "total_steps": 1557, "loss": 0.0264, "lr": 2.403093009136579e-05, "epoch": 1.4780722891566265, "percentage": 49.26, "elapsed_time": "0:43:26", "remaining_time": "0:44:44"} +{"current_steps": 768, "total_steps": 1557, "loss": 0.0111, "lr": 2.3986992458257707e-05, "epoch": 1.48, "percentage": 49.33, "elapsed_time": "0:43:30", "remaining_time": "0:44:41"} +{"current_steps": 769, "total_steps": 1557, "loss": 0.0144, "lr": 2.3943034777257945e-05, "epoch": 1.4819277108433735, "percentage": 49.39, "elapsed_time": "0:43:33", "remaining_time": "0:44:38"} +{"current_steps": 770, "total_steps": 1557, "loss": 0.0062, "lr": 2.38990572694e-05, "epoch": 1.483855421686747, "percentage": 49.45, "elapsed_time": "0:43:37", "remaining_time": "0:44:34"} +{"current_steps": 771, "total_steps": 1557, "loss": 0.0172, "lr": 2.385506015581704e-05, "epoch": 1.4857831325301205, "percentage": 49.52, "elapsed_time": "0:43:40", "remaining_time": "0:44:31"} +{"current_steps": 772, "total_steps": 1557, "loss": 0.012, "lr": 2.381104365774083e-05, "epoch": 1.487710843373494, "percentage": 49.58, "elapsed_time": "0:43:44", "remaining_time": "0:44:28"} +{"current_steps": 773, "total_steps": 1557, "loss": 0.0116, "lr": 2.37670079965006e-05, "epoch": 1.4896385542168675, "percentage": 49.65, "elapsed_time": "0:43:47", "remaining_time": "0:44:25"} +{"current_steps": 774, "total_steps": 1557, "loss": 0.0147, "lr": 2.3722953393521944e-05, "epoch": 1.491566265060241, "percentage": 49.71, "elapsed_time": "0:43:51", "remaining_time": "0:44:21"} +{"current_steps": 775, "total_steps": 1557, "loss": 0.0111, "lr": 2.367888007032571e-05, "epoch": 1.4934939759036143, "percentage": 49.78, "elapsed_time": "0:43:54", "remaining_time": "0:44:18"} +{"current_steps": 776, "total_steps": 1557, "loss": 0.0061, "lr": 2.3634788248526846e-05, "epoch": 1.495421686746988, "percentage": 49.84, "elapsed_time": "0:43:57", "remaining_time": "0:44:14"} +{"current_steps": 777, "total_steps": 1557, "loss": 0.0205, "lr": 2.3590678149833356e-05, "epoch": 1.4973493975903613, "percentage": 49.9, "elapsed_time": "0:44:00", "remaining_time": "0:44:11"} +{"current_steps": 778, "total_steps": 1557, "loss": 0.0273, "lr": 2.3546549996045114e-05, "epoch": 1.499277108433735, "percentage": 49.97, "elapsed_time": "0:44:04", "remaining_time": "0:44:07"} +{"current_steps": 779, "total_steps": 1557, "loss": 0.0083, "lr": 2.3502404009052812e-05, "epoch": 1.5012048192771084, "percentage": 50.03, "elapsed_time": "0:44:07", "remaining_time": "0:44:04"} +{"current_steps": 780, "total_steps": 1557, "loss": 0.0122, "lr": 2.3458240410836775e-05, "epoch": 1.503132530120482, "percentage": 50.1, "elapsed_time": "0:44:11", "remaining_time": "0:44:00"} +{"current_steps": 781, "total_steps": 1557, "loss": 0.0083, "lr": 2.3414059423465924e-05, "epoch": 1.5050602409638554, "percentage": 50.16, "elapsed_time": "0:44:14", "remaining_time": "0:43:57"} +{"current_steps": 782, "total_steps": 1557, "loss": 0.0104, "lr": 2.3369861269096575e-05, "epoch": 1.5069879518072289, "percentage": 50.22, "elapsed_time": "0:44:17", "remaining_time": "0:43:53"} +{"current_steps": 783, "total_steps": 1557, "loss": 0.0264, "lr": 2.3325646169971416e-05, "epoch": 1.5089156626506024, "percentage": 50.29, "elapsed_time": "0:44:21", "remaining_time": "0:43:50"} +{"current_steps": 784, "total_steps": 1557, "loss": 0.0107, "lr": 2.3281414348418294e-05, "epoch": 1.510843373493976, "percentage": 50.35, "elapsed_time": "0:44:24", "remaining_time": "0:43:47"} +{"current_steps": 785, "total_steps": 1557, "loss": 0.0084, "lr": 2.3237166026849158e-05, "epoch": 1.5127710843373494, "percentage": 50.42, "elapsed_time": "0:44:28", "remaining_time": "0:43:43"} +{"current_steps": 786, "total_steps": 1557, "loss": 0.0111, "lr": 2.3192901427758932e-05, "epoch": 1.514698795180723, "percentage": 50.48, "elapsed_time": "0:44:31", "remaining_time": "0:43:40"} +{"current_steps": 787, "total_steps": 1557, "loss": 0.0135, "lr": 2.314862077372438e-05, "epoch": 1.5166265060240964, "percentage": 50.55, "elapsed_time": "0:44:34", "remaining_time": "0:43:36"} +{"current_steps": 788, "total_steps": 1557, "loss": 0.0265, "lr": 2.3104324287402996e-05, "epoch": 1.5185542168674697, "percentage": 50.61, "elapsed_time": "0:44:37", "remaining_time": "0:43:32"} +{"current_steps": 789, "total_steps": 1557, "loss": 0.0127, "lr": 2.3060012191531885e-05, "epoch": 1.5204819277108435, "percentage": 50.67, "elapsed_time": "0:44:40", "remaining_time": "0:43:29"} +{"current_steps": 790, "total_steps": 1557, "loss": 0.0134, "lr": 2.301568470892664e-05, "epoch": 1.5224096385542167, "percentage": 50.74, "elapsed_time": "0:44:43", "remaining_time": "0:43:25"} +{"current_steps": 791, "total_steps": 1557, "loss": 0.0318, "lr": 2.297134206248024e-05, "epoch": 1.5243373493975905, "percentage": 50.8, "elapsed_time": "0:44:46", "remaining_time": "0:43:21"} +{"current_steps": 792, "total_steps": 1557, "loss": 0.008, "lr": 2.2926984475161884e-05, "epoch": 1.5262650602409638, "percentage": 50.87, "elapsed_time": "0:44:49", "remaining_time": "0:43:17"} +{"current_steps": 793, "total_steps": 1557, "loss": 0.0125, "lr": 2.2882612170015914e-05, "epoch": 1.5281927710843375, "percentage": 50.93, "elapsed_time": "0:44:52", "remaining_time": "0:43:14"} +{"current_steps": 794, "total_steps": 1557, "loss": 0.0155, "lr": 2.2838225370160682e-05, "epoch": 1.5301204819277108, "percentage": 51.0, "elapsed_time": "0:44:55", "remaining_time": "0:43:10"} +{"current_steps": 795, "total_steps": 1557, "loss": 0.0132, "lr": 2.2793824298787414e-05, "epoch": 1.5320481927710843, "percentage": 51.06, "elapsed_time": "0:44:58", "remaining_time": "0:43:06"} +{"current_steps": 796, "total_steps": 1557, "loss": 0.026, "lr": 2.2749409179159104e-05, "epoch": 1.5339759036144578, "percentage": 51.12, "elapsed_time": "0:45:02", "remaining_time": "0:43:03"} +{"current_steps": 797, "total_steps": 1557, "loss": 0.0099, "lr": 2.2704980234609396e-05, "epoch": 1.5359036144578313, "percentage": 51.19, "elapsed_time": "0:45:05", "remaining_time": "0:42:59"} +{"current_steps": 798, "total_steps": 1557, "loss": 0.009, "lr": 2.2660537688541416e-05, "epoch": 1.5378313253012048, "percentage": 51.25, "elapsed_time": "0:45:08", "remaining_time": "0:42:56"} +{"current_steps": 799, "total_steps": 1557, "loss": 0.0077, "lr": 2.2616081764426726e-05, "epoch": 1.5397590361445783, "percentage": 51.32, "elapsed_time": "0:45:12", "remaining_time": "0:42:52"} +{"current_steps": 800, "total_steps": 1557, "loss": 0.0119, "lr": 2.2571612685804124e-05, "epoch": 1.5416867469879518, "percentage": 51.38, "elapsed_time": "0:45:15", "remaining_time": "0:42:49"} +{"current_steps": 801, "total_steps": 1557, "loss": 0.0083, "lr": 2.252713067627857e-05, "epoch": 1.5436144578313253, "percentage": 51.45, "elapsed_time": "0:45:18", "remaining_time": "0:42:46"} +{"current_steps": 802, "total_steps": 1557, "loss": 0.0049, "lr": 2.2482635959520044e-05, "epoch": 1.5455421686746988, "percentage": 51.51, "elapsed_time": "0:45:22", "remaining_time": "0:42:42"} +{"current_steps": 803, "total_steps": 1557, "loss": 0.0273, "lr": 2.243812875926241e-05, "epoch": 1.5474698795180721, "percentage": 51.57, "elapsed_time": "0:45:25", "remaining_time": "0:42:39"} +{"current_steps": 804, "total_steps": 1557, "loss": 0.0108, "lr": 2.2393609299302314e-05, "epoch": 1.5493975903614459, "percentage": 51.64, "elapsed_time": "0:45:29", "remaining_time": "0:42:36"} +{"current_steps": 805, "total_steps": 1557, "loss": 0.0076, "lr": 2.2349077803498052e-05, "epoch": 1.5513253012048192, "percentage": 51.7, "elapsed_time": "0:45:32", "remaining_time": "0:42:32"} +{"current_steps": 806, "total_steps": 1557, "loss": 0.0135, "lr": 2.230453449576842e-05, "epoch": 1.5532530120481929, "percentage": 51.77, "elapsed_time": "0:45:36", "remaining_time": "0:42:29"} +{"current_steps": 807, "total_steps": 1557, "loss": 0.0094, "lr": 2.2259979600091635e-05, "epoch": 1.5551807228915662, "percentage": 51.83, "elapsed_time": "0:45:40", "remaining_time": "0:42:26"} +{"current_steps": 808, "total_steps": 1557, "loss": 0.0178, "lr": 2.2215413340504158e-05, "epoch": 1.55710843373494, "percentage": 51.89, "elapsed_time": "0:45:43", "remaining_time": "0:42:23"} +{"current_steps": 809, "total_steps": 1557, "loss": 0.0069, "lr": 2.2170835941099605e-05, "epoch": 1.5590361445783132, "percentage": 51.96, "elapsed_time": "0:45:47", "remaining_time": "0:42:20"} +{"current_steps": 810, "total_steps": 1557, "loss": 0.0319, "lr": 2.2126247626027615e-05, "epoch": 1.5609638554216867, "percentage": 52.02, "elapsed_time": "0:45:50", "remaining_time": "0:42:16"} +{"current_steps": 811, "total_steps": 1557, "loss": 0.0168, "lr": 2.208164861949268e-05, "epoch": 1.5628915662650602, "percentage": 52.09, "elapsed_time": "0:45:54", "remaining_time": "0:42:13"} +{"current_steps": 812, "total_steps": 1557, "loss": 0.0041, "lr": 2.20370391457531e-05, "epoch": 1.5648192771084337, "percentage": 52.15, "elapsed_time": "0:45:57", "remaining_time": "0:42:10"} +{"current_steps": 813, "total_steps": 1557, "loss": 0.0078, "lr": 2.1992419429119764e-05, "epoch": 1.5667469879518072, "percentage": 52.22, "elapsed_time": "0:46:00", "remaining_time": "0:42:06"} +{"current_steps": 814, "total_steps": 1557, "loss": 0.0166, "lr": 2.1947789693955097e-05, "epoch": 1.5686746987951807, "percentage": 52.28, "elapsed_time": "0:46:03", "remaining_time": "0:42:02"} +{"current_steps": 815, "total_steps": 1557, "loss": 0.0176, "lr": 2.190315016467188e-05, "epoch": 1.5706024096385542, "percentage": 52.34, "elapsed_time": "0:46:06", "remaining_time": "0:41:59"} +{"current_steps": 816, "total_steps": 1557, "loss": 0.0102, "lr": 2.1858501065732146e-05, "epoch": 1.5725301204819278, "percentage": 52.41, "elapsed_time": "0:46:10", "remaining_time": "0:41:55"} +{"current_steps": 817, "total_steps": 1557, "loss": 0.0111, "lr": 2.181384262164606e-05, "epoch": 1.5744578313253013, "percentage": 52.47, "elapsed_time": "0:46:13", "remaining_time": "0:41:52"} +{"current_steps": 818, "total_steps": 1557, "loss": 0.0076, "lr": 2.1769175056970765e-05, "epoch": 1.5763855421686745, "percentage": 52.54, "elapsed_time": "0:46:16", "remaining_time": "0:41:48"} +{"current_steps": 819, "total_steps": 1557, "loss": 0.0118, "lr": 2.172449859630927e-05, "epoch": 1.5783132530120483, "percentage": 52.6, "elapsed_time": "0:46:20", "remaining_time": "0:41:45"} +{"current_steps": 820, "total_steps": 1557, "loss": 0.0066, "lr": 2.167981346430931e-05, "epoch": 1.5802409638554216, "percentage": 52.67, "elapsed_time": "0:46:23", "remaining_time": "0:41:41"} +{"current_steps": 821, "total_steps": 1557, "loss": 0.0101, "lr": 2.1635119885662235e-05, "epoch": 1.5821686746987953, "percentage": 52.73, "elapsed_time": "0:46:26", "remaining_time": "0:41:38"} +{"current_steps": 822, "total_steps": 1557, "loss": 0.0118, "lr": 2.159041808510185e-05, "epoch": 1.5840963855421686, "percentage": 52.79, "elapsed_time": "0:46:30", "remaining_time": "0:41:35"} +{"current_steps": 823, "total_steps": 1557, "loss": 0.0122, "lr": 2.1545708287403322e-05, "epoch": 1.5860240963855423, "percentage": 52.86, "elapsed_time": "0:46:33", "remaining_time": "0:41:31"} +{"current_steps": 824, "total_steps": 1557, "loss": 0.0216, "lr": 2.1500990717382004e-05, "epoch": 1.5879518072289156, "percentage": 52.92, "elapsed_time": "0:46:36", "remaining_time": "0:41:27"} +{"current_steps": 825, "total_steps": 1557, "loss": 0.0136, "lr": 2.145626559989237e-05, "epoch": 1.589879518072289, "percentage": 52.99, "elapsed_time": "0:46:39", "remaining_time": "0:41:24"} +{"current_steps": 826, "total_steps": 1557, "loss": 0.0298, "lr": 2.1411533159826803e-05, "epoch": 1.5918072289156626, "percentage": 53.05, "elapsed_time": "0:46:42", "remaining_time": "0:41:20"} +{"current_steps": 827, "total_steps": 1557, "loss": 0.0382, "lr": 2.1366793622114533e-05, "epoch": 1.5937349397590361, "percentage": 53.11, "elapsed_time": "0:46:46", "remaining_time": "0:41:16"} +{"current_steps": 828, "total_steps": 1557, "loss": 0.0074, "lr": 2.1322047211720468e-05, "epoch": 1.5956626506024096, "percentage": 53.18, "elapsed_time": "0:46:49", "remaining_time": "0:41:13"} +{"current_steps": 829, "total_steps": 1557, "loss": 0.0103, "lr": 2.1277294153644083e-05, "epoch": 1.5975903614457831, "percentage": 53.24, "elapsed_time": "0:46:57", "remaining_time": "0:41:14"} +{"current_steps": 830, "total_steps": 1557, "loss": 0.0095, "lr": 2.123253467291827e-05, "epoch": 1.5995180722891567, "percentage": 53.31, "elapsed_time": "0:47:01", "remaining_time": "0:41:11"} +{"current_steps": 831, "total_steps": 1557, "loss": 0.0457, "lr": 2.118776899460822e-05, "epoch": 1.6014457831325302, "percentage": 53.37, "elapsed_time": "0:47:04", "remaining_time": "0:41:07"} +{"current_steps": 832, "total_steps": 1557, "loss": 0.0192, "lr": 2.1142997343810293e-05, "epoch": 1.6033734939759037, "percentage": 53.44, "elapsed_time": "0:47:07", "remaining_time": "0:41:04"} +{"current_steps": 833, "total_steps": 1557, "loss": 0.0278, "lr": 2.1098219945650865e-05, "epoch": 1.605301204819277, "percentage": 53.5, "elapsed_time": "0:47:10", "remaining_time": "0:41:00"} +{"current_steps": 834, "total_steps": 1557, "loss": 0.0076, "lr": 2.105343702528524e-05, "epoch": 1.6072289156626507, "percentage": 53.56, "elapsed_time": "0:47:14", "remaining_time": "0:40:57"} +{"current_steps": 835, "total_steps": 1557, "loss": 0.0076, "lr": 2.100864880789645e-05, "epoch": 1.609156626506024, "percentage": 53.63, "elapsed_time": "0:47:17", "remaining_time": "0:40:53"} +{"current_steps": 836, "total_steps": 1557, "loss": 0.005, "lr": 2.0963855518694203e-05, "epoch": 1.6110843373493977, "percentage": 53.69, "elapsed_time": "0:47:21", "remaining_time": "0:40:50"} +{"current_steps": 837, "total_steps": 1557, "loss": 0.0084, "lr": 2.0919057382913675e-05, "epoch": 1.613012048192771, "percentage": 53.76, "elapsed_time": "0:47:24", "remaining_time": "0:40:46"} +{"current_steps": 838, "total_steps": 1557, "loss": 0.009, "lr": 2.0874254625814435e-05, "epoch": 1.6149397590361447, "percentage": 53.82, "elapsed_time": "0:47:27", "remaining_time": "0:40:42"} +{"current_steps": 839, "total_steps": 1557, "loss": 0.0098, "lr": 2.0829447472679285e-05, "epoch": 1.616867469879518, "percentage": 53.89, "elapsed_time": "0:47:30", "remaining_time": "0:40:39"} +{"current_steps": 840, "total_steps": 1557, "loss": 0.0099, "lr": 2.0784636148813124e-05, "epoch": 1.6187951807228915, "percentage": 53.95, "elapsed_time": "0:47:34", "remaining_time": "0:40:36"} +{"current_steps": 841, "total_steps": 1557, "loss": 0.0075, "lr": 2.0739820879541827e-05, "epoch": 1.620722891566265, "percentage": 54.01, "elapsed_time": "0:47:37", "remaining_time": "0:40:32"} +{"current_steps": 842, "total_steps": 1557, "loss": 0.007, "lr": 2.069500189021111e-05, "epoch": 1.6226506024096385, "percentage": 54.08, "elapsed_time": "0:47:41", "remaining_time": "0:40:29"} +{"current_steps": 843, "total_steps": 1557, "loss": 0.0249, "lr": 2.0650179406185397e-05, "epoch": 1.624578313253012, "percentage": 54.14, "elapsed_time": "0:47:45", "remaining_time": "0:40:26"} +{"current_steps": 844, "total_steps": 1557, "loss": 0.0084, "lr": 2.060535365284668e-05, "epoch": 1.6265060240963856, "percentage": 54.21, "elapsed_time": "0:47:48", "remaining_time": "0:40:23"} +{"current_steps": 845, "total_steps": 1557, "loss": 0.0071, "lr": 2.056052485559338e-05, "epoch": 1.628433734939759, "percentage": 54.27, "elapsed_time": "0:47:51", "remaining_time": "0:40:19"} +{"current_steps": 846, "total_steps": 1557, "loss": 0.0198, "lr": 2.051569323983924e-05, "epoch": 1.6303614457831326, "percentage": 54.34, "elapsed_time": "0:47:55", "remaining_time": "0:40:16"} +{"current_steps": 847, "total_steps": 1557, "loss": 0.006, "lr": 2.047085903101218e-05, "epoch": 1.632289156626506, "percentage": 54.4, "elapsed_time": "0:47:58", "remaining_time": "0:40:13"} +{"current_steps": 848, "total_steps": 1557, "loss": 0.0147, "lr": 2.0426022454553137e-05, "epoch": 1.6342168674698794, "percentage": 54.46, "elapsed_time": "0:48:02", "remaining_time": "0:40:09"} +{"current_steps": 849, "total_steps": 1557, "loss": 0.0117, "lr": 2.0381183735914968e-05, "epoch": 1.636144578313253, "percentage": 54.53, "elapsed_time": "0:48:05", "remaining_time": "0:40:06"} +{"current_steps": 850, "total_steps": 1557, "loss": 0.008, "lr": 2.0336343100561295e-05, "epoch": 1.6380722891566264, "percentage": 54.59, "elapsed_time": "0:48:09", "remaining_time": "0:40:03"} +{"current_steps": 851, "total_steps": 1557, "loss": 0.0256, "lr": 2.0291500773965392e-05, "epoch": 1.6400000000000001, "percentage": 54.66, "elapsed_time": "0:48:12", "remaining_time": "0:39:59"} +{"current_steps": 852, "total_steps": 1557, "loss": 0.0141, "lr": 2.0246656981609013e-05, "epoch": 1.6419277108433734, "percentage": 54.72, "elapsed_time": "0:48:15", "remaining_time": "0:39:56"} +{"current_steps": 853, "total_steps": 1557, "loss": 0.008, "lr": 2.02018119489813e-05, "epoch": 1.6438554216867471, "percentage": 54.78, "elapsed_time": "0:48:18", "remaining_time": "0:39:52"} +{"current_steps": 854, "total_steps": 1557, "loss": 0.0085, "lr": 2.0156965901577635e-05, "epoch": 1.6457831325301204, "percentage": 54.85, "elapsed_time": "0:48:21", "remaining_time": "0:39:48"} +{"current_steps": 855, "total_steps": 1557, "loss": 0.0078, "lr": 2.011211906489848e-05, "epoch": 1.647710843373494, "percentage": 54.91, "elapsed_time": "0:48:25", "remaining_time": "0:39:45"} +{"current_steps": 856, "total_steps": 1557, "loss": 0.0109, "lr": 2.00672716644483e-05, "epoch": 1.6496385542168674, "percentage": 54.98, "elapsed_time": "0:48:28", "remaining_time": "0:39:41"} +{"current_steps": 857, "total_steps": 1557, "loss": 0.0076, "lr": 2.002242392573436e-05, "epoch": 1.651566265060241, "percentage": 55.04, "elapsed_time": "0:48:31", "remaining_time": "0:39:38"} +{"current_steps": 858, "total_steps": 1557, "loss": 0.0309, "lr": 1.997757607426565e-05, "epoch": 1.6534939759036145, "percentage": 55.11, "elapsed_time": "0:48:35", "remaining_time": "0:39:35"} +{"current_steps": 859, "total_steps": 1557, "loss": 0.0072, "lr": 1.9932728335551702e-05, "epoch": 1.655421686746988, "percentage": 55.17, "elapsed_time": "0:48:38", "remaining_time": "0:39:31"} +{"current_steps": 860, "total_steps": 1557, "loss": 0.0246, "lr": 1.988788093510152e-05, "epoch": 1.6573493975903615, "percentage": 55.23, "elapsed_time": "0:48:42", "remaining_time": "0:39:28"} +{"current_steps": 861, "total_steps": 1557, "loss": 0.0087, "lr": 1.9843034098422375e-05, "epoch": 1.659277108433735, "percentage": 55.3, "elapsed_time": "0:48:45", "remaining_time": "0:39:25"} +{"current_steps": 862, "total_steps": 1557, "loss": 0.0092, "lr": 1.9798188051018705e-05, "epoch": 1.6612048192771085, "percentage": 55.36, "elapsed_time": "0:48:48", "remaining_time": "0:39:21"} +{"current_steps": 863, "total_steps": 1557, "loss": 0.0118, "lr": 1.9753343018390997e-05, "epoch": 1.6631325301204818, "percentage": 55.43, "elapsed_time": "0:48:52", "remaining_time": "0:39:18"} +{"current_steps": 864, "total_steps": 1557, "loss": 0.0056, "lr": 1.9708499226034618e-05, "epoch": 1.6650602409638555, "percentage": 55.49, "elapsed_time": "0:48:55", "remaining_time": "0:39:14"} +{"current_steps": 865, "total_steps": 1557, "loss": 0.0094, "lr": 1.966365689943871e-05, "epoch": 1.6669879518072288, "percentage": 55.56, "elapsed_time": "0:48:58", "remaining_time": "0:39:10"} +{"current_steps": 866, "total_steps": 1557, "loss": 0.0097, "lr": 1.9618816264085042e-05, "epoch": 1.6689156626506025, "percentage": 55.62, "elapsed_time": "0:49:02", "remaining_time": "0:39:07"} +{"current_steps": 867, "total_steps": 1557, "loss": 0.0062, "lr": 1.957397754544687e-05, "epoch": 1.6708433734939758, "percentage": 55.68, "elapsed_time": "0:49:05", "remaining_time": "0:39:04"} +{"current_steps": 868, "total_steps": 1557, "loss": 0.0182, "lr": 1.952914096898783e-05, "epoch": 1.6727710843373496, "percentage": 55.75, "elapsed_time": "0:49:08", "remaining_time": "0:39:00"} +{"current_steps": 869, "total_steps": 1557, "loss": 0.0125, "lr": 1.9484306760160766e-05, "epoch": 1.6746987951807228, "percentage": 55.81, "elapsed_time": "0:49:12", "remaining_time": "0:38:57"} +{"current_steps": 870, "total_steps": 1557, "loss": 0.0074, "lr": 1.9439475144406623e-05, "epoch": 1.6766265060240964, "percentage": 55.88, "elapsed_time": "0:49:15", "remaining_time": "0:38:53"} +{"current_steps": 871, "total_steps": 1557, "loss": 0.0213, "lr": 1.9394646347153334e-05, "epoch": 1.6785542168674699, "percentage": 55.94, "elapsed_time": "0:49:19", "remaining_time": "0:38:50"} +{"current_steps": 872, "total_steps": 1557, "loss": 0.0173, "lr": 1.9349820593814606e-05, "epoch": 1.6804819277108434, "percentage": 56.01, "elapsed_time": "0:49:22", "remaining_time": "0:38:46"} +{"current_steps": 873, "total_steps": 1557, "loss": 0.011, "lr": 1.930499810978889e-05, "epoch": 1.6824096385542169, "percentage": 56.07, "elapsed_time": "0:49:25", "remaining_time": "0:38:43"} +{"current_steps": 874, "total_steps": 1557, "loss": 0.0285, "lr": 1.9260179120458177e-05, "epoch": 1.6843373493975904, "percentage": 56.13, "elapsed_time": "0:49:29", "remaining_time": "0:38:40"} +{"current_steps": 875, "total_steps": 1557, "loss": 0.0146, "lr": 1.9215363851186883e-05, "epoch": 1.686265060240964, "percentage": 56.2, "elapsed_time": "0:49:32", "remaining_time": "0:38:36"} +{"current_steps": 876, "total_steps": 1557, "loss": 0.0104, "lr": 1.9170552527320725e-05, "epoch": 1.6881927710843372, "percentage": 56.26, "elapsed_time": "0:49:35", "remaining_time": "0:38:33"} +{"current_steps": 877, "total_steps": 1557, "loss": 0.0091, "lr": 1.9125745374185568e-05, "epoch": 1.690120481927711, "percentage": 56.33, "elapsed_time": "0:49:38", "remaining_time": "0:38:29"} +{"current_steps": 878, "total_steps": 1557, "loss": 0.0097, "lr": 1.908094261708633e-05, "epoch": 1.6920481927710842, "percentage": 56.39, "elapsed_time": "0:49:42", "remaining_time": "0:38:26"} +{"current_steps": 879, "total_steps": 1557, "loss": 0.0143, "lr": 1.9036144481305807e-05, "epoch": 1.693975903614458, "percentage": 56.45, "elapsed_time": "0:49:45", "remaining_time": "0:38:22"} +{"current_steps": 880, "total_steps": 1557, "loss": 0.0062, "lr": 1.8991351192103554e-05, "epoch": 1.6959036144578312, "percentage": 56.52, "elapsed_time": "0:49:48", "remaining_time": "0:38:18"} +{"current_steps": 881, "total_steps": 1557, "loss": 0.0048, "lr": 1.8946562974714763e-05, "epoch": 1.697831325301205, "percentage": 56.58, "elapsed_time": "0:49:51", "remaining_time": "0:38:15"} +{"current_steps": 882, "total_steps": 1557, "loss": 0.0124, "lr": 1.890178005434914e-05, "epoch": 1.6997590361445782, "percentage": 56.65, "elapsed_time": "0:49:55", "remaining_time": "0:38:12"} +{"current_steps": 883, "total_steps": 1557, "loss": 0.0135, "lr": 1.885700265618971e-05, "epoch": 1.701686746987952, "percentage": 56.71, "elapsed_time": "0:49:57", "remaining_time": "0:38:08"} +{"current_steps": 884, "total_steps": 1557, "loss": 0.0365, "lr": 1.8812231005391786e-05, "epoch": 1.7036144578313253, "percentage": 56.78, "elapsed_time": "0:50:01", "remaining_time": "0:38:05"} +{"current_steps": 885, "total_steps": 1557, "loss": 0.0202, "lr": 1.8767465327081736e-05, "epoch": 1.7055421686746988, "percentage": 56.84, "elapsed_time": "0:50:05", "remaining_time": "0:38:01"} +{"current_steps": 886, "total_steps": 1557, "loss": 0.0035, "lr": 1.872270584635592e-05, "epoch": 1.7074698795180723, "percentage": 56.9, "elapsed_time": "0:50:07", "remaining_time": "0:37:57"} +{"current_steps": 887, "total_steps": 1557, "loss": 0.0157, "lr": 1.867795278827954e-05, "epoch": 1.7093975903614458, "percentage": 56.97, "elapsed_time": "0:50:11", "remaining_time": "0:37:54"} +{"current_steps": 888, "total_steps": 1557, "loss": 0.0071, "lr": 1.863320637788547e-05, "epoch": 1.7113253012048193, "percentage": 57.03, "elapsed_time": "0:50:14", "remaining_time": "0:37:51"} +{"current_steps": 889, "total_steps": 1557, "loss": 0.0347, "lr": 1.8588466840173207e-05, "epoch": 1.7132530120481928, "percentage": 57.1, "elapsed_time": "0:50:18", "remaining_time": "0:37:47"} +{"current_steps": 890, "total_steps": 1557, "loss": 0.006, "lr": 1.8543734400107637e-05, "epoch": 1.7151807228915663, "percentage": 57.16, "elapsed_time": "0:50:21", "remaining_time": "0:37:44"} +{"current_steps": 891, "total_steps": 1557, "loss": 0.0059, "lr": 1.8499009282617996e-05, "epoch": 1.7171084337349396, "percentage": 57.23, "elapsed_time": "0:50:24", "remaining_time": "0:37:40"} +{"current_steps": 892, "total_steps": 1557, "loss": 0.008, "lr": 1.8454291712596688e-05, "epoch": 1.7190361445783133, "percentage": 57.29, "elapsed_time": "0:50:28", "remaining_time": "0:37:37"} +{"current_steps": 893, "total_steps": 1557, "loss": 0.0061, "lr": 1.8409581914898157e-05, "epoch": 1.7209638554216866, "percentage": 57.35, "elapsed_time": "0:50:31", "remaining_time": "0:37:34"} +{"current_steps": 894, "total_steps": 1557, "loss": 0.0085, "lr": 1.836488011433777e-05, "epoch": 1.7228915662650603, "percentage": 57.42, "elapsed_time": "0:50:35", "remaining_time": "0:37:31"} +{"current_steps": 895, "total_steps": 1557, "loss": 0.0075, "lr": 1.83201865356907e-05, "epoch": 1.7248192771084336, "percentage": 57.48, "elapsed_time": "0:50:38", "remaining_time": "0:37:27"} +{"current_steps": 896, "total_steps": 1557, "loss": 0.0156, "lr": 1.8275501403690733e-05, "epoch": 1.7267469879518074, "percentage": 57.55, "elapsed_time": "0:50:41", "remaining_time": "0:37:23"} +{"current_steps": 897, "total_steps": 1557, "loss": 0.0218, "lr": 1.823082494302924e-05, "epoch": 1.7286746987951807, "percentage": 57.61, "elapsed_time": "0:50:45", "remaining_time": "0:37:20"} +{"current_steps": 898, "total_steps": 1557, "loss": 0.0126, "lr": 1.8186157378353945e-05, "epoch": 1.7306024096385542, "percentage": 57.68, "elapsed_time": "0:50:48", "remaining_time": "0:37:17"} +{"current_steps": 899, "total_steps": 1557, "loss": 0.0131, "lr": 1.8141498934267858e-05, "epoch": 1.7325301204819277, "percentage": 57.74, "elapsed_time": "0:50:51", "remaining_time": "0:37:13"} +{"current_steps": 900, "total_steps": 1557, "loss": 0.0115, "lr": 1.809684983532813e-05, "epoch": 1.7344578313253012, "percentage": 57.8, "elapsed_time": "0:50:55", "remaining_time": "0:37:10"} +{"current_steps": 901, "total_steps": 1557, "loss": 0.0113, "lr": 1.8052210306044907e-05, "epoch": 1.7363855421686747, "percentage": 57.87, "elapsed_time": "0:50:58", "remaining_time": "0:37:06"} +{"current_steps": 902, "total_steps": 1557, "loss": 0.0058, "lr": 1.8007580570880236e-05, "epoch": 1.7383132530120482, "percentage": 57.93, "elapsed_time": "0:51:02", "remaining_time": "0:37:03"} +{"current_steps": 903, "total_steps": 1557, "loss": 0.0106, "lr": 1.7962960854246908e-05, "epoch": 1.7402409638554217, "percentage": 58.0, "elapsed_time": "0:51:05", "remaining_time": "0:37:00"} +{"current_steps": 904, "total_steps": 1557, "loss": 0.0076, "lr": 1.791835138050732e-05, "epoch": 1.7421686746987952, "percentage": 58.06, "elapsed_time": "0:51:08", "remaining_time": "0:36:56"} +{"current_steps": 905, "total_steps": 1557, "loss": 0.0038, "lr": 1.7873752373972395e-05, "epoch": 1.7440963855421687, "percentage": 58.12, "elapsed_time": "0:51:12", "remaining_time": "0:36:53"} +{"current_steps": 906, "total_steps": 1557, "loss": 0.0043, "lr": 1.7829164058900398e-05, "epoch": 1.746024096385542, "percentage": 58.19, "elapsed_time": "0:51:15", "remaining_time": "0:36:49"} +{"current_steps": 907, "total_steps": 1557, "loss": 0.0099, "lr": 1.7784586659495845e-05, "epoch": 1.7479518072289157, "percentage": 58.25, "elapsed_time": "0:51:18", "remaining_time": "0:36:46"} +{"current_steps": 908, "total_steps": 1557, "loss": 0.0074, "lr": 1.7740020399908372e-05, "epoch": 1.749879518072289, "percentage": 58.32, "elapsed_time": "0:51:22", "remaining_time": "0:36:42"} +{"current_steps": 909, "total_steps": 1557, "loss": 0.0184, "lr": 1.7695465504231586e-05, "epoch": 1.7518072289156628, "percentage": 58.38, "elapsed_time": "0:51:25", "remaining_time": "0:36:39"} +{"current_steps": 910, "total_steps": 1557, "loss": 0.0061, "lr": 1.765092219650196e-05, "epoch": 1.753734939759036, "percentage": 58.45, "elapsed_time": "0:51:28", "remaining_time": "0:36:35"} +{"current_steps": 911, "total_steps": 1557, "loss": 0.0101, "lr": 1.7606390700697693e-05, "epoch": 1.7556626506024098, "percentage": 58.51, "elapsed_time": "0:51:31", "remaining_time": "0:36:32"} +{"current_steps": 912, "total_steps": 1557, "loss": 0.0034, "lr": 1.7561871240737595e-05, "epoch": 1.757590361445783, "percentage": 58.57, "elapsed_time": "0:51:35", "remaining_time": "0:36:28"} +{"current_steps": 913, "total_steps": 1557, "loss": 0.0384, "lr": 1.7517364040479966e-05, "epoch": 1.7595180722891566, "percentage": 58.64, "elapsed_time": "0:51:38", "remaining_time": "0:36:25"} +{"current_steps": 914, "total_steps": 1557, "loss": 0.0055, "lr": 1.7472869323721432e-05, "epoch": 1.76144578313253, "percentage": 58.7, "elapsed_time": "0:51:41", "remaining_time": "0:36:22"} +{"current_steps": 915, "total_steps": 1557, "loss": 0.0307, "lr": 1.742838731419588e-05, "epoch": 1.7633734939759036, "percentage": 58.77, "elapsed_time": "0:51:45", "remaining_time": "0:36:18"} +{"current_steps": 916, "total_steps": 1557, "loss": 0.0059, "lr": 1.738391823557328e-05, "epoch": 1.765301204819277, "percentage": 58.83, "elapsed_time": "0:51:48", "remaining_time": "0:36:15"} +{"current_steps": 917, "total_steps": 1557, "loss": 0.0113, "lr": 1.7339462311458587e-05, "epoch": 1.7672289156626506, "percentage": 58.9, "elapsed_time": "0:51:51", "remaining_time": "0:36:11"} +{"current_steps": 918, "total_steps": 1557, "loss": 0.0071, "lr": 1.7295019765390618e-05, "epoch": 1.7691566265060241, "percentage": 58.96, "elapsed_time": "0:51:55", "remaining_time": "0:36:08"} +{"current_steps": 919, "total_steps": 1557, "loss": 0.0144, "lr": 1.7250590820840903e-05, "epoch": 1.7710843373493976, "percentage": 59.02, "elapsed_time": "0:51:58", "remaining_time": "0:36:04"} +{"current_steps": 920, "total_steps": 1557, "loss": 0.0131, "lr": 1.720617570121259e-05, "epoch": 1.7730120481927711, "percentage": 59.09, "elapsed_time": "0:52:02", "remaining_time": "0:36:01"} +{"current_steps": 921, "total_steps": 1557, "loss": 0.0148, "lr": 1.7161774629839328e-05, "epoch": 1.7749397590361444, "percentage": 59.15, "elapsed_time": "0:52:10", "remaining_time": "0:36:01"} +{"current_steps": 922, "total_steps": 1557, "loss": 0.0066, "lr": 1.7117387829984093e-05, "epoch": 1.7768674698795182, "percentage": 59.22, "elapsed_time": "0:52:13", "remaining_time": "0:35:58"} +{"current_steps": 923, "total_steps": 1557, "loss": 0.0105, "lr": 1.707301552483813e-05, "epoch": 1.7787951807228914, "percentage": 59.28, "elapsed_time": "0:52:16", "remaining_time": "0:35:54"} +{"current_steps": 924, "total_steps": 1557, "loss": 0.0104, "lr": 1.7028657937519767e-05, "epoch": 1.7807228915662652, "percentage": 59.34, "elapsed_time": "0:52:19", "remaining_time": "0:35:50"} +{"current_steps": 925, "total_steps": 1557, "loss": 0.0134, "lr": 1.6984315291073355e-05, "epoch": 1.7826506024096385, "percentage": 59.41, "elapsed_time": "0:52:22", "remaining_time": "0:35:47"} +{"current_steps": 926, "total_steps": 1557, "loss": 0.0078, "lr": 1.6939987808468125e-05, "epoch": 1.7845783132530122, "percentage": 59.47, "elapsed_time": "0:52:25", "remaining_time": "0:35:43"} +{"current_steps": 927, "total_steps": 1557, "loss": 0.0108, "lr": 1.689567571259701e-05, "epoch": 1.7865060240963855, "percentage": 59.54, "elapsed_time": "0:52:28", "remaining_time": "0:35:40"} +{"current_steps": 928, "total_steps": 1557, "loss": 0.0266, "lr": 1.6851379226275624e-05, "epoch": 1.788433734939759, "percentage": 59.6, "elapsed_time": "0:52:32", "remaining_time": "0:35:36"} +{"current_steps": 929, "total_steps": 1557, "loss": 0.0109, "lr": 1.6807098572241075e-05, "epoch": 1.7903614457831325, "percentage": 59.67, "elapsed_time": "0:52:35", "remaining_time": "0:35:33"} +{"current_steps": 930, "total_steps": 1557, "loss": 0.0113, "lr": 1.6762833973150846e-05, "epoch": 1.792289156626506, "percentage": 59.73, "elapsed_time": "0:52:39", "remaining_time": "0:35:29"} +{"current_steps": 931, "total_steps": 1557, "loss": 0.0196, "lr": 1.671858565158172e-05, "epoch": 1.7942168674698795, "percentage": 59.79, "elapsed_time": "0:52:42", "remaining_time": "0:35:26"} +{"current_steps": 932, "total_steps": 1557, "loss": 0.0089, "lr": 1.6674353830028587e-05, "epoch": 1.796144578313253, "percentage": 59.86, "elapsed_time": "0:52:45", "remaining_time": "0:35:23"} +{"current_steps": 933, "total_steps": 1557, "loss": 0.0074, "lr": 1.663013873090342e-05, "epoch": 1.7980722891566265, "percentage": 59.92, "elapsed_time": "0:52:49", "remaining_time": "0:35:19"} +{"current_steps": 934, "total_steps": 1557, "loss": 0.0063, "lr": 1.6585940576534086e-05, "epoch": 1.8, "percentage": 59.99, "elapsed_time": "0:52:52", "remaining_time": "0:35:16"} +{"current_steps": 935, "total_steps": 1557, "loss": 0.0101, "lr": 1.654175958916323e-05, "epoch": 1.8019277108433736, "percentage": 60.05, "elapsed_time": "0:52:55", "remaining_time": "0:35:12"} +{"current_steps": 936, "total_steps": 1557, "loss": 0.0131, "lr": 1.6497595990947195e-05, "epoch": 1.8038554216867468, "percentage": 60.12, "elapsed_time": "0:52:59", "remaining_time": "0:35:09"} +{"current_steps": 937, "total_steps": 1557, "loss": 0.0068, "lr": 1.645345000395489e-05, "epoch": 1.8057831325301206, "percentage": 60.18, "elapsed_time": "0:53:02", "remaining_time": "0:35:06"} +{"current_steps": 938, "total_steps": 1557, "loss": 0.0094, "lr": 1.6409321850166647e-05, "epoch": 1.8077108433734939, "percentage": 60.24, "elapsed_time": "0:53:05", "remaining_time": "0:35:02"} +{"current_steps": 939, "total_steps": 1557, "loss": 0.0202, "lr": 1.636521175147316e-05, "epoch": 1.8096385542168676, "percentage": 60.31, "elapsed_time": "0:53:09", "remaining_time": "0:34:58"} +{"current_steps": 940, "total_steps": 1557, "loss": 0.0176, "lr": 1.6321119929674297e-05, "epoch": 1.8115662650602409, "percentage": 60.37, "elapsed_time": "0:53:12", "remaining_time": "0:34:55"} +{"current_steps": 941, "total_steps": 1557, "loss": 0.0085, "lr": 1.6277046606478056e-05, "epoch": 1.8134939759036146, "percentage": 60.44, "elapsed_time": "0:53:16", "remaining_time": "0:34:52"} +{"current_steps": 942, "total_steps": 1557, "loss": 0.0474, "lr": 1.6232992003499405e-05, "epoch": 1.815421686746988, "percentage": 60.5, "elapsed_time": "0:53:19", "remaining_time": "0:34:48"} +{"current_steps": 943, "total_steps": 1557, "loss": 0.0078, "lr": 1.6188956342259177e-05, "epoch": 1.8173493975903614, "percentage": 60.57, "elapsed_time": "0:53:22", "remaining_time": "0:34:45"} +{"current_steps": 944, "total_steps": 1557, "loss": 0.0174, "lr": 1.614493984418297e-05, "epoch": 1.819277108433735, "percentage": 60.63, "elapsed_time": "0:53:25", "remaining_time": "0:34:41"} +{"current_steps": 945, "total_steps": 1557, "loss": 0.0054, "lr": 1.6100942730600003e-05, "epoch": 1.8212048192771084, "percentage": 60.69, "elapsed_time": "0:53:28", "remaining_time": "0:34:37"} +{"current_steps": 946, "total_steps": 1557, "loss": 0.0063, "lr": 1.6056965222742055e-05, "epoch": 1.823132530120482, "percentage": 60.76, "elapsed_time": "0:53:32", "remaining_time": "0:34:34"} +{"current_steps": 947, "total_steps": 1557, "loss": 0.0234, "lr": 1.6013007541742303e-05, "epoch": 1.8250602409638554, "percentage": 60.82, "elapsed_time": "0:53:35", "remaining_time": "0:34:31"} +{"current_steps": 948, "total_steps": 1557, "loss": 0.0095, "lr": 1.596906990863422e-05, "epoch": 1.826987951807229, "percentage": 60.89, "elapsed_time": "0:53:39", "remaining_time": "0:34:27"} +{"current_steps": 949, "total_steps": 1557, "loss": 0.0356, "lr": 1.592515254435048e-05, "epoch": 1.8289156626506025, "percentage": 60.95, "elapsed_time": "0:53:42", "remaining_time": "0:34:24"} +{"current_steps": 950, "total_steps": 1557, "loss": 0.008, "lr": 1.5881255669721857e-05, "epoch": 1.830843373493976, "percentage": 61.01, "elapsed_time": "0:53:46", "remaining_time": "0:34:21"} +{"current_steps": 951, "total_steps": 1557, "loss": 0.0108, "lr": 1.5837379505476054e-05, "epoch": 1.8327710843373493, "percentage": 61.08, "elapsed_time": "0:53:49", "remaining_time": "0:34:17"} +{"current_steps": 952, "total_steps": 1557, "loss": 0.006, "lr": 1.5793524272236683e-05, "epoch": 1.834698795180723, "percentage": 61.14, "elapsed_time": "0:53:52", "remaining_time": "0:34:14"} +{"current_steps": 953, "total_steps": 1557, "loss": 0.0065, "lr": 1.5749690190522076e-05, "epoch": 1.8366265060240963, "percentage": 61.21, "elapsed_time": "0:53:56", "remaining_time": "0:34:11"} +{"current_steps": 954, "total_steps": 1557, "loss": 0.0092, "lr": 1.5705877480744214e-05, "epoch": 1.83855421686747, "percentage": 61.27, "elapsed_time": "0:53:59", "remaining_time": "0:34:07"} +{"current_steps": 955, "total_steps": 1557, "loss": 0.012, "lr": 1.5662086363207628e-05, "epoch": 1.8404819277108433, "percentage": 61.34, "elapsed_time": "0:54:03", "remaining_time": "0:34:04"} +{"current_steps": 956, "total_steps": 1557, "loss": 0.0113, "lr": 1.561831705810825e-05, "epoch": 1.842409638554217, "percentage": 61.4, "elapsed_time": "0:54:06", "remaining_time": "0:34:01"} +{"current_steps": 957, "total_steps": 1557, "loss": 0.0168, "lr": 1.557456978553236e-05, "epoch": 1.8443373493975903, "percentage": 61.46, "elapsed_time": "0:54:10", "remaining_time": "0:33:57"} +{"current_steps": 958, "total_steps": 1557, "loss": 0.0042, "lr": 1.553084476545544e-05, "epoch": 1.8462650602409638, "percentage": 61.53, "elapsed_time": "0:54:13", "remaining_time": "0:33:54"} +{"current_steps": 959, "total_steps": 1557, "loss": 0.0145, "lr": 1.5487142217741062e-05, "epoch": 1.8481927710843373, "percentage": 61.59, "elapsed_time": "0:54:16", "remaining_time": "0:33:50"} +{"current_steps": 960, "total_steps": 1557, "loss": 0.0059, "lr": 1.5443462362139834e-05, "epoch": 1.8501204819277108, "percentage": 61.66, "elapsed_time": "0:54:20", "remaining_time": "0:33:47"} +{"current_steps": 961, "total_steps": 1557, "loss": 0.0257, "lr": 1.539980541828823e-05, "epoch": 1.8520481927710843, "percentage": 61.72, "elapsed_time": "0:54:23", "remaining_time": "0:33:43"} +{"current_steps": 962, "total_steps": 1557, "loss": 0.0111, "lr": 1.5356171605707522e-05, "epoch": 1.8539759036144579, "percentage": 61.79, "elapsed_time": "0:54:26", "remaining_time": "0:33:40"} +{"current_steps": 963, "total_steps": 1557, "loss": 0.0049, "lr": 1.5312561143802704e-05, "epoch": 1.8559036144578314, "percentage": 61.85, "elapsed_time": "0:54:30", "remaining_time": "0:33:37"} +{"current_steps": 964, "total_steps": 1557, "loss": 0.0077, "lr": 1.5268974251861298e-05, "epoch": 1.8578313253012049, "percentage": 61.91, "elapsed_time": "0:54:33", "remaining_time": "0:33:33"} +{"current_steps": 965, "total_steps": 1557, "loss": 0.017, "lr": 1.5225411149052356e-05, "epoch": 1.8597590361445784, "percentage": 61.98, "elapsed_time": "0:54:37", "remaining_time": "0:33:30"} +{"current_steps": 966, "total_steps": 1557, "loss": 0.0049, "lr": 1.5181872054425287e-05, "epoch": 1.8616867469879517, "percentage": 62.04, "elapsed_time": "0:54:40", "remaining_time": "0:33:27"} +{"current_steps": 967, "total_steps": 1557, "loss": 0.0317, "lr": 1.5138357186908785e-05, "epoch": 1.8636144578313254, "percentage": 62.11, "elapsed_time": "0:54:44", "remaining_time": "0:33:24"} +{"current_steps": 968, "total_steps": 1557, "loss": 0.0126, "lr": 1.5094866765309728e-05, "epoch": 1.8655421686746987, "percentage": 62.17, "elapsed_time": "0:54:47", "remaining_time": "0:33:20"} +{"current_steps": 969, "total_steps": 1557, "loss": 0.0101, "lr": 1.5051401008312054e-05, "epoch": 1.8674698795180724, "percentage": 62.24, "elapsed_time": "0:54:51", "remaining_time": "0:33:17"} +{"current_steps": 970, "total_steps": 1557, "loss": 0.0155, "lr": 1.5007960134475706e-05, "epoch": 1.8693975903614457, "percentage": 62.3, "elapsed_time": "0:54:54", "remaining_time": "0:33:13"} +{"current_steps": 971, "total_steps": 1557, "loss": 0.0187, "lr": 1.4964544362235487e-05, "epoch": 1.8713253012048194, "percentage": 62.36, "elapsed_time": "0:54:57", "remaining_time": "0:33:09"} +{"current_steps": 972, "total_steps": 1557, "loss": 0.0084, "lr": 1.4921153909899983e-05, "epoch": 1.8732530120481927, "percentage": 62.43, "elapsed_time": "0:55:00", "remaining_time": "0:33:06"} +{"current_steps": 973, "total_steps": 1557, "loss": 0.007, "lr": 1.487778899565047e-05, "epoch": 1.8751807228915662, "percentage": 62.49, "elapsed_time": "0:55:03", "remaining_time": "0:33:02"} +{"current_steps": 974, "total_steps": 1557, "loss": 0.0058, "lr": 1.4834449837539806e-05, "epoch": 1.8771084337349397, "percentage": 62.56, "elapsed_time": "0:55:07", "remaining_time": "0:32:59"} +{"current_steps": 975, "total_steps": 1557, "loss": 0.0323, "lr": 1.4791136653491333e-05, "epoch": 1.8790361445783132, "percentage": 62.62, "elapsed_time": "0:55:10", "remaining_time": "0:32:56"} +{"current_steps": 976, "total_steps": 1557, "loss": 0.0126, "lr": 1.4747849661297808e-05, "epoch": 1.8809638554216868, "percentage": 62.68, "elapsed_time": "0:55:13", "remaining_time": "0:32:52"} +{"current_steps": 977, "total_steps": 1557, "loss": 0.0067, "lr": 1.470458907862026e-05, "epoch": 1.8828915662650603, "percentage": 62.75, "elapsed_time": "0:55:16", "remaining_time": "0:32:48"} +{"current_steps": 978, "total_steps": 1557, "loss": 0.0147, "lr": 1.4661355122986945e-05, "epoch": 1.8848192771084338, "percentage": 62.81, "elapsed_time": "0:55:20", "remaining_time": "0:32:45"} +{"current_steps": 979, "total_steps": 1557, "loss": 0.0038, "lr": 1.4618148011792206e-05, "epoch": 1.886746987951807, "percentage": 62.88, "elapsed_time": "0:55:23", "remaining_time": "0:32:41"} +{"current_steps": 980, "total_steps": 1557, "loss": 0.0139, "lr": 1.4574967962295419e-05, "epoch": 1.8886746987951808, "percentage": 62.94, "elapsed_time": "0:55:25", "remaining_time": "0:32:38"} +{"current_steps": 981, "total_steps": 1557, "loss": 0.0094, "lr": 1.4531815191619903e-05, "epoch": 1.890602409638554, "percentage": 63.01, "elapsed_time": "0:55:29", "remaining_time": "0:32:34"} +{"current_steps": 982, "total_steps": 1557, "loss": 0.0065, "lr": 1.4488689916751762e-05, "epoch": 1.8925301204819278, "percentage": 63.07, "elapsed_time": "0:55:32", "remaining_time": "0:32:31"} +{"current_steps": 983, "total_steps": 1557, "loss": 0.0057, "lr": 1.4445592354538885e-05, "epoch": 1.894457831325301, "percentage": 63.13, "elapsed_time": "0:55:35", "remaining_time": "0:32:27"} +{"current_steps": 984, "total_steps": 1557, "loss": 0.0142, "lr": 1.44025227216898e-05, "epoch": 1.8963855421686748, "percentage": 63.2, "elapsed_time": "0:55:38", "remaining_time": "0:32:24"} +{"current_steps": 985, "total_steps": 1557, "loss": 0.012, "lr": 1.435948123477259e-05, "epoch": 1.8983132530120481, "percentage": 63.26, "elapsed_time": "0:55:41", "remaining_time": "0:32:20"} +{"current_steps": 986, "total_steps": 1557, "loss": 0.0097, "lr": 1.431646811021382e-05, "epoch": 1.9002409638554218, "percentage": 63.33, "elapsed_time": "0:55:45", "remaining_time": "0:32:17"} +{"current_steps": 987, "total_steps": 1557, "loss": 0.0046, "lr": 1.4273483564297425e-05, "epoch": 1.9021686746987951, "percentage": 63.39, "elapsed_time": "0:55:48", "remaining_time": "0:32:13"} +{"current_steps": 988, "total_steps": 1557, "loss": 0.0038, "lr": 1.4230527813163656e-05, "epoch": 1.9040963855421686, "percentage": 63.46, "elapsed_time": "0:55:52", "remaining_time": "0:32:10"} +{"current_steps": 989, "total_steps": 1557, "loss": 0.0123, "lr": 1.4187601072807975e-05, "epoch": 1.9060240963855422, "percentage": 63.52, "elapsed_time": "0:55:55", "remaining_time": "0:32:07"} +{"current_steps": 990, "total_steps": 1557, "loss": 0.0093, "lr": 1.4144703559079948e-05, "epoch": 1.9079518072289157, "percentage": 63.58, "elapsed_time": "0:55:58", "remaining_time": "0:32:03"} +{"current_steps": 991, "total_steps": 1557, "loss": 0.0051, "lr": 1.4101835487682198e-05, "epoch": 1.9098795180722892, "percentage": 63.65, "elapsed_time": "0:56:01", "remaining_time": "0:32:00"} +{"current_steps": 992, "total_steps": 1557, "loss": 0.0083, "lr": 1.4058997074169299e-05, "epoch": 1.9118072289156627, "percentage": 63.71, "elapsed_time": "0:56:05", "remaining_time": "0:31:56"} +{"current_steps": 993, "total_steps": 1557, "loss": 0.0086, "lr": 1.401618853394668e-05, "epoch": 1.9137349397590362, "percentage": 63.78, "elapsed_time": "0:56:08", "remaining_time": "0:31:53"} +{"current_steps": 994, "total_steps": 1557, "loss": 0.015, "lr": 1.3973410082269591e-05, "epoch": 1.9156626506024095, "percentage": 63.84, "elapsed_time": "0:56:12", "remaining_time": "0:31:50"} +{"current_steps": 995, "total_steps": 1557, "loss": 0.0089, "lr": 1.3930661934241947e-05, "epoch": 1.9175903614457832, "percentage": 63.9, "elapsed_time": "0:56:15", "remaining_time": "0:31:46"} +{"current_steps": 996, "total_steps": 1557, "loss": 0.0072, "lr": 1.388794430481532e-05, "epoch": 1.9195180722891565, "percentage": 63.97, "elapsed_time": "0:56:19", "remaining_time": "0:31:43"} +{"current_steps": 997, "total_steps": 1557, "loss": 0.0131, "lr": 1.3845257408787807e-05, "epoch": 1.9214457831325302, "percentage": 64.03, "elapsed_time": "0:56:22", "remaining_time": "0:31:39"} +{"current_steps": 998, "total_steps": 1557, "loss": 0.0198, "lr": 1.3802601460802967e-05, "epoch": 1.9233734939759035, "percentage": 64.1, "elapsed_time": "0:56:26", "remaining_time": "0:31:36"} +{"current_steps": 999, "total_steps": 1557, "loss": 0.014, "lr": 1.3759976675348754e-05, "epoch": 1.9253012048192772, "percentage": 64.16, "elapsed_time": "0:56:29", "remaining_time": "0:31:33"} +{"current_steps": 1000, "total_steps": 1557, "loss": 0.0065, "lr": 1.3717383266756403e-05, "epoch": 1.9272289156626505, "percentage": 64.23, "elapsed_time": "0:56:33", "remaining_time": "0:31:29"} +{"current_steps": 1001, "total_steps": 1557, "loss": 0.0059, "lr": 1.367482144919941e-05, "epoch": 1.929156626506024, "percentage": 64.29, "elapsed_time": "0:56:36", "remaining_time": "0:31:26"} +{"current_steps": 1002, "total_steps": 1557, "loss": 0.0054, "lr": 1.3632291436692397e-05, "epoch": 1.9310843373493976, "percentage": 64.35, "elapsed_time": "0:56:39", "remaining_time": "0:31:23"} +{"current_steps": 1003, "total_steps": 1557, "loss": 0.0097, "lr": 1.3589793443090064e-05, "epoch": 1.933012048192771, "percentage": 64.42, "elapsed_time": "0:56:43", "remaining_time": "0:31:19"} +{"current_steps": 1004, "total_steps": 1557, "loss": 0.0125, "lr": 1.3547327682086114e-05, "epoch": 1.9349397590361446, "percentage": 64.48, "elapsed_time": "0:56:46", "remaining_time": "0:31:16"} +{"current_steps": 1005, "total_steps": 1557, "loss": 0.0131, "lr": 1.3504894367212171e-05, "epoch": 1.936867469879518, "percentage": 64.55, "elapsed_time": "0:56:49", "remaining_time": "0:31:12"} +{"current_steps": 1006, "total_steps": 1557, "loss": 0.0115, "lr": 1.34624937118367e-05, "epoch": 1.9387951807228916, "percentage": 64.61, "elapsed_time": "0:56:52", "remaining_time": "0:31:09"} +{"current_steps": 1007, "total_steps": 1557, "loss": 0.0309, "lr": 1.3420125929163976e-05, "epoch": 1.940722891566265, "percentage": 64.68, "elapsed_time": "0:56:56", "remaining_time": "0:31:05"} +{"current_steps": 1008, "total_steps": 1557, "loss": 0.0078, "lr": 1.3377791232232929e-05, "epoch": 1.9426506024096386, "percentage": 64.74, "elapsed_time": "0:56:59", "remaining_time": "0:31:02"} +{"current_steps": 1009, "total_steps": 1557, "loss": 0.0142, "lr": 1.333548983391617e-05, "epoch": 1.944578313253012, "percentage": 64.8, "elapsed_time": "0:57:02", "remaining_time": "0:30:59"} +{"current_steps": 1010, "total_steps": 1557, "loss": 0.0035, "lr": 1.3293221946918853e-05, "epoch": 1.9465060240963856, "percentage": 64.87, "elapsed_time": "0:57:06", "remaining_time": "0:30:55"} +{"current_steps": 1011, "total_steps": 1557, "loss": 0.0242, "lr": 1.325098778377762e-05, "epoch": 1.948433734939759, "percentage": 64.93, "elapsed_time": "0:57:10", "remaining_time": "0:30:52"} +{"current_steps": 1012, "total_steps": 1557, "loss": 0.0096, "lr": 1.3208787556859543e-05, "epoch": 1.9503614457831326, "percentage": 65.0, "elapsed_time": "0:57:13", "remaining_time": "0:30:48"} +{"current_steps": 1013, "total_steps": 1557, "loss": 0.0103, "lr": 1.3166621478361075e-05, "epoch": 1.952289156626506, "percentage": 65.06, "elapsed_time": "0:57:22", "remaining_time": "0:30:48"} +{"current_steps": 1014, "total_steps": 1557, "loss": 0.0078, "lr": 1.3124489760306917e-05, "epoch": 1.9542168674698797, "percentage": 65.13, "elapsed_time": "0:57:24", "remaining_time": "0:30:44"} +{"current_steps": 1015, "total_steps": 1557, "loss": 0.0077, "lr": 1.3082392614549036e-05, "epoch": 1.956144578313253, "percentage": 65.19, "elapsed_time": "0:57:28", "remaining_time": "0:30:41"} +{"current_steps": 1016, "total_steps": 1557, "loss": 0.0106, "lr": 1.3040330252765526e-05, "epoch": 1.9580722891566265, "percentage": 65.25, "elapsed_time": "0:57:31", "remaining_time": "0:30:38"} +{"current_steps": 1017, "total_steps": 1557, "loss": 0.0082, "lr": 1.2998302886459586e-05, "epoch": 1.96, "percentage": 65.32, "elapsed_time": "0:57:34", "remaining_time": "0:30:34"} +{"current_steps": 1018, "total_steps": 1557, "loss": 0.0068, "lr": 1.2956310726958472e-05, "epoch": 1.9619277108433735, "percentage": 65.38, "elapsed_time": "0:57:37", "remaining_time": "0:30:30"} +{"current_steps": 1019, "total_steps": 1557, "loss": 0.0086, "lr": 1.291435398541236e-05, "epoch": 1.963855421686747, "percentage": 65.45, "elapsed_time": "0:57:41", "remaining_time": "0:30:27"} +{"current_steps": 1020, "total_steps": 1557, "loss": 0.0116, "lr": 1.2872432872793379e-05, "epoch": 1.9657831325301205, "percentage": 65.51, "elapsed_time": "0:57:44", "remaining_time": "0:30:23"} +{"current_steps": 1021, "total_steps": 1557, "loss": 0.0055, "lr": 1.283054759989447e-05, "epoch": 1.967710843373494, "percentage": 65.57, "elapsed_time": "0:57:47", "remaining_time": "0:30:20"} +{"current_steps": 1022, "total_steps": 1557, "loss": 0.0264, "lr": 1.2788698377328385e-05, "epoch": 1.9696385542168675, "percentage": 65.64, "elapsed_time": "0:57:50", "remaining_time": "0:30:16"} +{"current_steps": 1023, "total_steps": 1557, "loss": 0.0046, "lr": 1.2746885415526594e-05, "epoch": 1.971566265060241, "percentage": 65.7, "elapsed_time": "0:57:53", "remaining_time": "0:30:13"} +{"current_steps": 1024, "total_steps": 1557, "loss": 0.0056, "lr": 1.2705108924738223e-05, "epoch": 1.9734939759036143, "percentage": 65.77, "elapsed_time": "0:57:56", "remaining_time": "0:30:09"} +{"current_steps": 1025, "total_steps": 1557, "loss": 0.0056, "lr": 1.2663369115029034e-05, "epoch": 1.975421686746988, "percentage": 65.83, "elapsed_time": "0:57:59", "remaining_time": "0:30:06"} +{"current_steps": 1026, "total_steps": 1557, "loss": 0.0101, "lr": 1.2621666196280333e-05, "epoch": 1.9773493975903613, "percentage": 65.9, "elapsed_time": "0:58:03", "remaining_time": "0:30:02"} +{"current_steps": 1027, "total_steps": 1557, "loss": 0.0059, "lr": 1.258000037818792e-05, "epoch": 1.979277108433735, "percentage": 65.96, "elapsed_time": "0:58:06", "remaining_time": "0:29:59"} +{"current_steps": 1028, "total_steps": 1557, "loss": 0.0115, "lr": 1.2538371870261053e-05, "epoch": 1.9812048192771083, "percentage": 66.02, "elapsed_time": "0:58:10", "remaining_time": "0:29:55"} +{"current_steps": 1029, "total_steps": 1557, "loss": 0.0046, "lr": 1.249678088182137e-05, "epoch": 1.983132530120482, "percentage": 66.09, "elapsed_time": "0:58:13", "remaining_time": "0:29:52"} +{"current_steps": 1030, "total_steps": 1557, "loss": 0.0086, "lr": 1.2455227622001851e-05, "epoch": 1.9850602409638554, "percentage": 66.15, "elapsed_time": "0:58:16", "remaining_time": "0:29:49"} +{"current_steps": 1031, "total_steps": 1557, "loss": 0.0034, "lr": 1.241371229974579e-05, "epoch": 1.9869879518072289, "percentage": 66.22, "elapsed_time": "0:58:19", "remaining_time": "0:29:45"} +{"current_steps": 1032, "total_steps": 1557, "loss": 0.0245, "lr": 1.2372235123805672e-05, "epoch": 1.9889156626506024, "percentage": 66.28, "elapsed_time": "0:58:23", "remaining_time": "0:29:42"} +{"current_steps": 1033, "total_steps": 1557, "loss": 0.0104, "lr": 1.2330796302742211e-05, "epoch": 1.9908433734939759, "percentage": 66.35, "elapsed_time": "0:58:26", "remaining_time": "0:29:38"} +{"current_steps": 1034, "total_steps": 1557, "loss": 0.0176, "lr": 1.2289396044923238e-05, "epoch": 1.9927710843373494, "percentage": 66.41, "elapsed_time": "0:58:29", "remaining_time": "0:29:35"} +{"current_steps": 1035, "total_steps": 1557, "loss": 0.0113, "lr": 1.2248034558522682e-05, "epoch": 1.994698795180723, "percentage": 66.47, "elapsed_time": "0:58:33", "remaining_time": "0:29:31"} +{"current_steps": 1036, "total_steps": 1557, "loss": 0.0036, "lr": 1.2206712051519518e-05, "epoch": 1.9966265060240964, "percentage": 66.54, "elapsed_time": "0:58:36", "remaining_time": "0:29:28"} +{"current_steps": 1037, "total_steps": 1557, "loss": 0.0077, "lr": 1.2165428731696713e-05, "epoch": 1.99855421686747, "percentage": 66.6, "elapsed_time": "0:58:39", "remaining_time": "0:29:25"} +{"current_steps": 1038, "total_steps": 1557, "loss": 0.0114, "lr": 1.2124184806640202e-05, "epoch": 2.0, "percentage": 66.67, "elapsed_time": "0:58:42", "remaining_time": "0:29:21"} +{"current_steps": 1039, "total_steps": 1557, "loss": 0.0043, "lr": 1.208298048373782e-05, "epoch": 2.0019277108433733, "percentage": 66.73, "elapsed_time": "0:58:46", "remaining_time": "0:29:18"} +{"current_steps": 1040, "total_steps": 1557, "loss": 0.0214, "lr": 1.2041815970178268e-05, "epoch": 2.003855421686747, "percentage": 66.8, "elapsed_time": "0:58:49", "remaining_time": "0:29:14"} +{"current_steps": 1041, "total_steps": 1557, "loss": 0.0079, "lr": 1.2000691472950081e-05, "epoch": 2.0057831325301203, "percentage": 66.86, "elapsed_time": "0:58:53", "remaining_time": "0:29:11"} +{"current_steps": 1042, "total_steps": 1557, "loss": 0.0041, "lr": 1.1959607198840568e-05, "epoch": 2.007710843373494, "percentage": 66.92, "elapsed_time": "0:58:55", "remaining_time": "0:29:07"} +{"current_steps": 1043, "total_steps": 1557, "loss": 0.0033, "lr": 1.1918563354434784e-05, "epoch": 2.0096385542168673, "percentage": 66.99, "elapsed_time": "0:58:59", "remaining_time": "0:29:04"} +{"current_steps": 1044, "total_steps": 1557, "loss": 0.0128, "lr": 1.1877560146114515e-05, "epoch": 2.011566265060241, "percentage": 67.05, "elapsed_time": "0:59:02", "remaining_time": "0:29:00"} +{"current_steps": 1045, "total_steps": 1557, "loss": 0.0078, "lr": 1.1836597780057183e-05, "epoch": 2.0134939759036143, "percentage": 67.12, "elapsed_time": "0:59:06", "remaining_time": "0:28:57"} +{"current_steps": 1046, "total_steps": 1557, "loss": 0.0149, "lr": 1.179567646223485e-05, "epoch": 2.015421686746988, "percentage": 67.18, "elapsed_time": "0:59:09", "remaining_time": "0:28:54"} +{"current_steps": 1047, "total_steps": 1557, "loss": 0.0038, "lr": 1.1754796398413196e-05, "epoch": 2.0173493975903614, "percentage": 67.24, "elapsed_time": "0:59:13", "remaining_time": "0:28:50"} +{"current_steps": 1048, "total_steps": 1557, "loss": 0.0041, "lr": 1.1713957794150423e-05, "epoch": 2.019277108433735, "percentage": 67.31, "elapsed_time": "0:59:16", "remaining_time": "0:28:47"} +{"current_steps": 1049, "total_steps": 1557, "loss": 0.0041, "lr": 1.1673160854796307e-05, "epoch": 2.0212048192771084, "percentage": 67.37, "elapsed_time": "0:59:19", "remaining_time": "0:28:43"} +{"current_steps": 1050, "total_steps": 1557, "loss": 0.0043, "lr": 1.1632405785491077e-05, "epoch": 2.023132530120482, "percentage": 67.44, "elapsed_time": "0:59:22", "remaining_time": "0:28:40"} +{"current_steps": 1051, "total_steps": 1557, "loss": 0.0066, "lr": 1.159169279116445e-05, "epoch": 2.0250602409638554, "percentage": 67.5, "elapsed_time": "0:59:26", "remaining_time": "0:28:36"} +{"current_steps": 1052, "total_steps": 1557, "loss": 0.0024, "lr": 1.1551022076534585e-05, "epoch": 2.026987951807229, "percentage": 67.57, "elapsed_time": "0:59:29", "remaining_time": "0:28:33"} +{"current_steps": 1053, "total_steps": 1557, "loss": 0.0051, "lr": 1.1510393846107001e-05, "epoch": 2.0289156626506024, "percentage": 67.63, "elapsed_time": "0:59:33", "remaining_time": "0:28:30"} +{"current_steps": 1054, "total_steps": 1557, "loss": 0.0334, "lr": 1.1469808304173658e-05, "epoch": 2.0308433734939757, "percentage": 67.69, "elapsed_time": "0:59:36", "remaining_time": "0:28:26"} +{"current_steps": 1055, "total_steps": 1557, "loss": 0.0068, "lr": 1.1429265654811803e-05, "epoch": 2.0327710843373494, "percentage": 67.76, "elapsed_time": "0:59:40", "remaining_time": "0:28:23"} +{"current_steps": 1056, "total_steps": 1557, "loss": 0.0087, "lr": 1.1388766101883038e-05, "epoch": 2.0346987951807227, "percentage": 67.82, "elapsed_time": "0:59:43", "remaining_time": "0:28:20"} +{"current_steps": 1057, "total_steps": 1557, "loss": 0.0076, "lr": 1.1348309849032257e-05, "epoch": 2.0366265060240965, "percentage": 67.89, "elapsed_time": "0:59:46", "remaining_time": "0:28:16"} +{"current_steps": 1058, "total_steps": 1557, "loss": 0.0029, "lr": 1.1307897099686627e-05, "epoch": 2.0385542168674697, "percentage": 67.95, "elapsed_time": "0:59:50", "remaining_time": "0:28:13"} +{"current_steps": 1059, "total_steps": 1557, "loss": 0.0062, "lr": 1.1267528057054562e-05, "epoch": 2.0404819277108435, "percentage": 68.02, "elapsed_time": "0:59:53", "remaining_time": "0:28:10"} +{"current_steps": 1060, "total_steps": 1557, "loss": 0.0067, "lr": 1.1227202924124704e-05, "epoch": 2.0424096385542168, "percentage": 68.08, "elapsed_time": "0:59:57", "remaining_time": "0:28:06"} +{"current_steps": 1061, "total_steps": 1557, "loss": 0.0055, "lr": 1.118692190366491e-05, "epoch": 2.0443373493975905, "percentage": 68.14, "elapsed_time": "1:00:00", "remaining_time": "0:28:03"} +{"current_steps": 1062, "total_steps": 1557, "loss": 0.0036, "lr": 1.1146685198221222e-05, "epoch": 2.0462650602409638, "percentage": 68.21, "elapsed_time": "1:00:04", "remaining_time": "0:27:59"} +{"current_steps": 1063, "total_steps": 1557, "loss": 0.0058, "lr": 1.1106493010116842e-05, "epoch": 2.0481927710843375, "percentage": 68.27, "elapsed_time": "1:00:07", "remaining_time": "0:27:56"} +{"current_steps": 1064, "total_steps": 1557, "loss": 0.0059, "lr": 1.1066345541451127e-05, "epoch": 2.050120481927711, "percentage": 68.34, "elapsed_time": "1:00:10", "remaining_time": "0:27:53"} +{"current_steps": 1065, "total_steps": 1557, "loss": 0.0033, "lr": 1.1026242994098597e-05, "epoch": 2.0520481927710845, "percentage": 68.4, "elapsed_time": "1:00:14", "remaining_time": "0:27:49"} +{"current_steps": 1066, "total_steps": 1557, "loss": 0.0038, "lr": 1.0986185569707852e-05, "epoch": 2.053975903614458, "percentage": 68.46, "elapsed_time": "1:00:17", "remaining_time": "0:27:46"} +{"current_steps": 1067, "total_steps": 1557, "loss": 0.0158, "lr": 1.0946173469700625e-05, "epoch": 2.0559036144578315, "percentage": 68.53, "elapsed_time": "1:00:21", "remaining_time": "0:27:43"} +{"current_steps": 1068, "total_steps": 1557, "loss": 0.0085, "lr": 1.0906206895270739e-05, "epoch": 2.057831325301205, "percentage": 68.59, "elapsed_time": "1:00:24", "remaining_time": "0:27:39"} +{"current_steps": 1069, "total_steps": 1557, "loss": 0.0053, "lr": 1.0866286047383094e-05, "epoch": 2.059759036144578, "percentage": 68.66, "elapsed_time": "1:00:28", "remaining_time": "0:27:36"} +{"current_steps": 1070, "total_steps": 1557, "loss": 0.0025, "lr": 1.0826411126772675e-05, "epoch": 2.061686746987952, "percentage": 68.72, "elapsed_time": "1:00:31", "remaining_time": "0:27:33"} +{"current_steps": 1071, "total_steps": 1557, "loss": 0.0017, "lr": 1.0786582333943499e-05, "epoch": 2.063614457831325, "percentage": 68.79, "elapsed_time": "1:00:35", "remaining_time": "0:27:29"} +{"current_steps": 1072, "total_steps": 1557, "loss": 0.0033, "lr": 1.0746799869167679e-05, "epoch": 2.065542168674699, "percentage": 68.85, "elapsed_time": "1:00:38", "remaining_time": "0:27:26"} +{"current_steps": 1073, "total_steps": 1557, "loss": 0.0046, "lr": 1.0707063932484357e-05, "epoch": 2.067469879518072, "percentage": 68.91, "elapsed_time": "1:00:42", "remaining_time": "0:27:23"} +{"current_steps": 1074, "total_steps": 1557, "loss": 0.009, "lr": 1.0667374723698698e-05, "epoch": 2.069397590361446, "percentage": 68.98, "elapsed_time": "1:00:45", "remaining_time": "0:27:19"} +{"current_steps": 1075, "total_steps": 1557, "loss": 0.0034, "lr": 1.0627732442380932e-05, "epoch": 2.071325301204819, "percentage": 69.04, "elapsed_time": "1:00:49", "remaining_time": "0:27:16"} +{"current_steps": 1076, "total_steps": 1557, "loss": 0.0048, "lr": 1.058813728786531e-05, "epoch": 2.073253012048193, "percentage": 69.11, "elapsed_time": "1:00:52", "remaining_time": "0:27:12"} +{"current_steps": 1077, "total_steps": 1557, "loss": 0.0026, "lr": 1.0548589459249112e-05, "epoch": 2.075180722891566, "percentage": 69.17, "elapsed_time": "1:00:56", "remaining_time": "0:27:09"} +{"current_steps": 1078, "total_steps": 1557, "loss": 0.0284, "lr": 1.0509089155391661e-05, "epoch": 2.07710843373494, "percentage": 69.24, "elapsed_time": "1:00:59", "remaining_time": "0:27:06"} +{"current_steps": 1079, "total_steps": 1557, "loss": 0.0088, "lr": 1.0469636574913288e-05, "epoch": 2.079036144578313, "percentage": 69.3, "elapsed_time": "1:01:02", "remaining_time": "0:27:02"} +{"current_steps": 1080, "total_steps": 1557, "loss": 0.0042, "lr": 1.043023191619438e-05, "epoch": 2.080963855421687, "percentage": 69.36, "elapsed_time": "1:01:05", "remaining_time": "0:26:58"} +{"current_steps": 1081, "total_steps": 1557, "loss": 0.0037, "lr": 1.039087537737435e-05, "epoch": 2.0828915662650602, "percentage": 69.43, "elapsed_time": "1:01:08", "remaining_time": "0:26:55"} +{"current_steps": 1082, "total_steps": 1557, "loss": 0.0044, "lr": 1.0351567156350617e-05, "epoch": 2.0848192771084335, "percentage": 69.49, "elapsed_time": "1:01:12", "remaining_time": "0:26:52"} +{"current_steps": 1083, "total_steps": 1557, "loss": 0.0019, "lr": 1.0312307450777706e-05, "epoch": 2.0867469879518072, "percentage": 69.56, "elapsed_time": "1:01:15", "remaining_time": "0:26:48"} +{"current_steps": 1084, "total_steps": 1557, "loss": 0.0065, "lr": 1.027309645806613e-05, "epoch": 2.0886746987951805, "percentage": 69.62, "elapsed_time": "1:01:18", "remaining_time": "0:26:45"} +{"current_steps": 1085, "total_steps": 1557, "loss": 0.0238, "lr": 1.0233934375381489e-05, "epoch": 2.0906024096385543, "percentage": 69.69, "elapsed_time": "1:01:22", "remaining_time": "0:26:41"} +{"current_steps": 1086, "total_steps": 1557, "loss": 0.0092, "lr": 1.019482139964344e-05, "epoch": 2.0925301204819275, "percentage": 69.75, "elapsed_time": "1:01:25", "remaining_time": "0:26:38"} +{"current_steps": 1087, "total_steps": 1557, "loss": 0.0038, "lr": 1.015575772752472e-05, "epoch": 2.0944578313253013, "percentage": 69.81, "elapsed_time": "1:01:28", "remaining_time": "0:26:34"} +{"current_steps": 1088, "total_steps": 1557, "loss": 0.0024, "lr": 1.0116743555450148e-05, "epoch": 2.0963855421686746, "percentage": 69.88, "elapsed_time": "1:01:32", "remaining_time": "0:26:31"} +{"current_steps": 1089, "total_steps": 1557, "loss": 0.0136, "lr": 1.0077779079595631e-05, "epoch": 2.0983132530120483, "percentage": 69.94, "elapsed_time": "1:01:35", "remaining_time": "0:26:28"} +{"current_steps": 1090, "total_steps": 1557, "loss": 0.0493, "lr": 1.003886449588719e-05, "epoch": 2.1002409638554216, "percentage": 70.01, "elapsed_time": "1:01:38", "remaining_time": "0:26:24"} +{"current_steps": 1091, "total_steps": 1557, "loss": 0.0034, "lr": 1.0000000000000006e-05, "epoch": 2.1021686746987953, "percentage": 70.07, "elapsed_time": "1:01:42", "remaining_time": "0:26:21"} +{"current_steps": 1092, "total_steps": 1557, "loss": 0.0129, "lr": 9.961185787357346e-06, "epoch": 2.1040963855421686, "percentage": 70.13, "elapsed_time": "1:01:45", "remaining_time": "0:26:17"} +{"current_steps": 1093, "total_steps": 1557, "loss": 0.0184, "lr": 9.922422053129674e-06, "epoch": 2.1060240963855423, "percentage": 70.2, "elapsed_time": "1:01:48", "remaining_time": "0:26:14"} +{"current_steps": 1094, "total_steps": 1557, "loss": 0.0054, "lr": 9.883708992233626e-06, "epoch": 2.1079518072289156, "percentage": 70.26, "elapsed_time": "1:01:51", "remaining_time": "0:26:10"} +{"current_steps": 1095, "total_steps": 1557, "loss": 0.0037, "lr": 9.845046799331029e-06, "epoch": 2.1098795180722894, "percentage": 70.33, "elapsed_time": "1:01:54", "remaining_time": "0:26:07"} +{"current_steps": 1096, "total_steps": 1557, "loss": 0.006, "lr": 9.806435668827941e-06, "epoch": 2.1118072289156626, "percentage": 70.39, "elapsed_time": "1:01:58", "remaining_time": "0:26:04"} +{"current_steps": 1097, "total_steps": 1557, "loss": 0.0049, "lr": 9.76787579487363e-06, "epoch": 2.113734939759036, "percentage": 70.46, "elapsed_time": "1:02:01", "remaining_time": "0:26:00"} +{"current_steps": 1098, "total_steps": 1557, "loss": 0.0086, "lr": 9.729367371359681e-06, "epoch": 2.1156626506024097, "percentage": 70.52, "elapsed_time": "1:02:04", "remaining_time": "0:25:57"} +{"current_steps": 1099, "total_steps": 1557, "loss": 0.0106, "lr": 9.690910591918936e-06, "epoch": 2.117590361445783, "percentage": 70.58, "elapsed_time": "1:02:08", "remaining_time": "0:25:53"} +{"current_steps": 1100, "total_steps": 1557, "loss": 0.0012, "lr": 9.652505649924547e-06, "epoch": 2.1195180722891567, "percentage": 70.65, "elapsed_time": "1:02:11", "remaining_time": "0:25:50"} +{"current_steps": 1101, "total_steps": 1557, "loss": 0.0048, "lr": 9.614152738489021e-06, "epoch": 2.12144578313253, "percentage": 70.71, "elapsed_time": "1:02:14", "remaining_time": "0:25:46"} +{"current_steps": 1102, "total_steps": 1557, "loss": 0.0089, "lr": 9.575852050463268e-06, "epoch": 2.1233734939759037, "percentage": 70.78, "elapsed_time": "1:02:18", "remaining_time": "0:25:43"} +{"current_steps": 1103, "total_steps": 1557, "loss": 0.0065, "lr": 9.537603778435545e-06, "epoch": 2.125301204819277, "percentage": 70.84, "elapsed_time": "1:02:21", "remaining_time": "0:25:40"} +{"current_steps": 1104, "total_steps": 1557, "loss": 0.016, "lr": 9.499408114730583e-06, "epoch": 2.1272289156626507, "percentage": 70.91, "elapsed_time": "1:02:25", "remaining_time": "0:25:36"} +{"current_steps": 1105, "total_steps": 1557, "loss": 0.0036, "lr": 9.461265251408575e-06, "epoch": 2.129156626506024, "percentage": 70.97, "elapsed_time": "1:02:34", "remaining_time": "0:25:35"} +{"current_steps": 1106, "total_steps": 1557, "loss": 0.0037, "lr": 9.423175380264211e-06, "epoch": 2.1310843373493977, "percentage": 71.03, "elapsed_time": "1:02:37", "remaining_time": "0:25:32"} +{"current_steps": 1107, "total_steps": 1557, "loss": 0.0031, "lr": 9.385138692825729e-06, "epoch": 2.133012048192771, "percentage": 71.1, "elapsed_time": "1:02:41", "remaining_time": "0:25:28"} +{"current_steps": 1108, "total_steps": 1557, "loss": 0.0087, "lr": 9.347155380353912e-06, "epoch": 2.1349397590361447, "percentage": 71.16, "elapsed_time": "1:02:44", "remaining_time": "0:25:25"} +{"current_steps": 1109, "total_steps": 1557, "loss": 0.0045, "lr": 9.30922563384121e-06, "epoch": 2.136867469879518, "percentage": 71.23, "elapsed_time": "1:02:47", "remaining_time": "0:25:22"} +{"current_steps": 1110, "total_steps": 1557, "loss": 0.003, "lr": 9.271349644010672e-06, "epoch": 2.1387951807228918, "percentage": 71.29, "elapsed_time": "1:02:51", "remaining_time": "0:25:18"} +{"current_steps": 1111, "total_steps": 1557, "loss": 0.0042, "lr": 9.233527601315069e-06, "epoch": 2.140722891566265, "percentage": 71.36, "elapsed_time": "1:02:54", "remaining_time": "0:25:15"} +{"current_steps": 1112, "total_steps": 1557, "loss": 0.0173, "lr": 9.195759695935907e-06, "epoch": 2.1426506024096383, "percentage": 71.42, "elapsed_time": "1:02:57", "remaining_time": "0:25:11"} +{"current_steps": 1113, "total_steps": 1557, "loss": 0.0031, "lr": 9.158046117782464e-06, "epoch": 2.144578313253012, "percentage": 71.48, "elapsed_time": "1:03:00", "remaining_time": "0:25:08"} +{"current_steps": 1114, "total_steps": 1557, "loss": 0.0097, "lr": 9.120387056490851e-06, "epoch": 2.1465060240963854, "percentage": 71.55, "elapsed_time": "1:03:04", "remaining_time": "0:25:04"} +{"current_steps": 1115, "total_steps": 1557, "loss": 0.0026, "lr": 9.082782701423047e-06, "epoch": 2.148433734939759, "percentage": 71.61, "elapsed_time": "1:03:07", "remaining_time": "0:25:01"} +{"current_steps": 1116, "total_steps": 1557, "loss": 0.0019, "lr": 9.045233241665947e-06, "epoch": 2.1503614457831324, "percentage": 71.68, "elapsed_time": "1:03:11", "remaining_time": "0:24:58"} +{"current_steps": 1117, "total_steps": 1557, "loss": 0.0039, "lr": 9.007738866030427e-06, "epoch": 2.152289156626506, "percentage": 71.74, "elapsed_time": "1:03:14", "remaining_time": "0:24:54"} +{"current_steps": 1118, "total_steps": 1557, "loss": 0.0033, "lr": 8.970299763050356e-06, "epoch": 2.1542168674698794, "percentage": 71.8, "elapsed_time": "1:03:17", "remaining_time": "0:24:51"} +{"current_steps": 1119, "total_steps": 1557, "loss": 0.0076, "lr": 8.932916120981695e-06, "epoch": 2.156144578313253, "percentage": 71.87, "elapsed_time": "1:03:20", "remaining_time": "0:24:47"} +{"current_steps": 1120, "total_steps": 1557, "loss": 0.0052, "lr": 8.895588127801545e-06, "epoch": 2.1580722891566264, "percentage": 71.93, "elapsed_time": "1:03:24", "remaining_time": "0:24:44"} +{"current_steps": 1121, "total_steps": 1557, "loss": 0.0022, "lr": 8.858315971207146e-06, "epoch": 2.16, "percentage": 72.0, "elapsed_time": "1:03:27", "remaining_time": "0:24:41"} +{"current_steps": 1122, "total_steps": 1557, "loss": 0.0203, "lr": 8.821099838614996e-06, "epoch": 2.1619277108433734, "percentage": 72.06, "elapsed_time": "1:03:31", "remaining_time": "0:24:37"} +{"current_steps": 1123, "total_steps": 1557, "loss": 0.002, "lr": 8.783939917159897e-06, "epoch": 2.163855421686747, "percentage": 72.13, "elapsed_time": "1:03:34", "remaining_time": "0:24:34"} +{"current_steps": 1124, "total_steps": 1557, "loss": 0.0055, "lr": 8.746836393693978e-06, "epoch": 2.1657831325301204, "percentage": 72.19, "elapsed_time": "1:03:38", "remaining_time": "0:24:30"} +{"current_steps": 1125, "total_steps": 1557, "loss": 0.0077, "lr": 8.709789454785809e-06, "epoch": 2.167710843373494, "percentage": 72.25, "elapsed_time": "1:03:41", "remaining_time": "0:24:27"} +{"current_steps": 1126, "total_steps": 1557, "loss": 0.0032, "lr": 8.67279928671939e-06, "epoch": 2.1696385542168675, "percentage": 72.32, "elapsed_time": "1:03:44", "remaining_time": "0:24:24"} +{"current_steps": 1127, "total_steps": 1557, "loss": 0.0028, "lr": 8.635866075493318e-06, "epoch": 2.1715662650602408, "percentage": 72.38, "elapsed_time": "1:03:48", "remaining_time": "0:24:20"} +{"current_steps": 1128, "total_steps": 1557, "loss": 0.0047, "lr": 8.598990006819756e-06, "epoch": 2.1734939759036145, "percentage": 72.45, "elapsed_time": "1:03:51", "remaining_time": "0:24:17"} +{"current_steps": 1129, "total_steps": 1557, "loss": 0.0015, "lr": 8.562171266123528e-06, "epoch": 2.1754216867469878, "percentage": 72.51, "elapsed_time": "1:03:54", "remaining_time": "0:24:13"} +{"current_steps": 1130, "total_steps": 1557, "loss": 0.0094, "lr": 8.525410038541218e-06, "epoch": 2.1773493975903615, "percentage": 72.58, "elapsed_time": "1:03:57", "remaining_time": "0:24:10"} +{"current_steps": 1131, "total_steps": 1557, "loss": 0.0067, "lr": 8.488706508920202e-06, "epoch": 2.179277108433735, "percentage": 72.64, "elapsed_time": "1:04:01", "remaining_time": "0:24:06"} +{"current_steps": 1132, "total_steps": 1557, "loss": 0.0082, "lr": 8.452060861817738e-06, "epoch": 2.1812048192771085, "percentage": 72.7, "elapsed_time": "1:04:04", "remaining_time": "0:24:03"} +{"current_steps": 1133, "total_steps": 1557, "loss": 0.0059, "lr": 8.415473281500037e-06, "epoch": 2.183132530120482, "percentage": 72.77, "elapsed_time": "1:04:08", "remaining_time": "0:24:00"} +{"current_steps": 1134, "total_steps": 1557, "loss": 0.0107, "lr": 8.378943951941301e-06, "epoch": 2.1850602409638555, "percentage": 72.83, "elapsed_time": "1:04:11", "remaining_time": "0:23:56"} +{"current_steps": 1135, "total_steps": 1557, "loss": 0.0025, "lr": 8.342473056822873e-06, "epoch": 2.186987951807229, "percentage": 72.9, "elapsed_time": "1:04:14", "remaining_time": "0:23:53"} +{"current_steps": 1136, "total_steps": 1557, "loss": 0.0059, "lr": 8.306060779532245e-06, "epoch": 2.1889156626506026, "percentage": 72.96, "elapsed_time": "1:04:18", "remaining_time": "0:23:49"} +{"current_steps": 1137, "total_steps": 1557, "loss": 0.0022, "lr": 8.26970730316215e-06, "epoch": 2.190843373493976, "percentage": 73.03, "elapsed_time": "1:04:21", "remaining_time": "0:23:46"} +{"current_steps": 1138, "total_steps": 1557, "loss": 0.0131, "lr": 8.233412810509669e-06, "epoch": 2.1927710843373496, "percentage": 73.09, "elapsed_time": "1:04:24", "remaining_time": "0:23:42"} +{"current_steps": 1139, "total_steps": 1557, "loss": 0.0025, "lr": 8.197177484075284e-06, "epoch": 2.194698795180723, "percentage": 73.15, "elapsed_time": "1:04:27", "remaining_time": "0:23:39"} +{"current_steps": 1140, "total_steps": 1557, "loss": 0.0031, "lr": 8.161001506061979e-06, "epoch": 2.1966265060240966, "percentage": 73.22, "elapsed_time": "1:04:31", "remaining_time": "0:23:35"} +{"current_steps": 1141, "total_steps": 1557, "loss": 0.0034, "lr": 8.124885058374302e-06, "epoch": 2.19855421686747, "percentage": 73.28, "elapsed_time": "1:04:34", "remaining_time": "0:23:32"} +{"current_steps": 1142, "total_steps": 1557, "loss": 0.0044, "lr": 8.088828322617473e-06, "epoch": 2.200481927710843, "percentage": 73.35, "elapsed_time": "1:04:37", "remaining_time": "0:23:29"} +{"current_steps": 1143, "total_steps": 1557, "loss": 0.0168, "lr": 8.052831480096464e-06, "epoch": 2.202409638554217, "percentage": 73.41, "elapsed_time": "1:04:41", "remaining_time": "0:23:25"} +{"current_steps": 1144, "total_steps": 1557, "loss": 0.007, "lr": 8.016894711815067e-06, "epoch": 2.20433734939759, "percentage": 73.47, "elapsed_time": "1:04:44", "remaining_time": "0:23:22"} +{"current_steps": 1145, "total_steps": 1557, "loss": 0.0091, "lr": 7.98101819847501e-06, "epoch": 2.206265060240964, "percentage": 73.54, "elapsed_time": "1:04:48", "remaining_time": "0:23:19"} +{"current_steps": 1146, "total_steps": 1557, "loss": 0.0046, "lr": 7.945202120475063e-06, "epoch": 2.208192771084337, "percentage": 73.6, "elapsed_time": "1:04:51", "remaining_time": "0:23:15"} +{"current_steps": 1147, "total_steps": 1557, "loss": 0.0032, "lr": 7.909446657910072e-06, "epoch": 2.210120481927711, "percentage": 73.67, "elapsed_time": "1:04:55", "remaining_time": "0:23:12"} +{"current_steps": 1148, "total_steps": 1557, "loss": 0.0057, "lr": 7.873751990570104e-06, "epoch": 2.212048192771084, "percentage": 73.73, "elapsed_time": "1:04:58", "remaining_time": "0:23:08"} +{"current_steps": 1149, "total_steps": 1557, "loss": 0.0039, "lr": 7.838118297939529e-06, "epoch": 2.213975903614458, "percentage": 73.8, "elapsed_time": "1:05:01", "remaining_time": "0:23:05"} +{"current_steps": 1150, "total_steps": 1557, "loss": 0.005, "lr": 7.802545759196117e-06, "epoch": 2.2159036144578312, "percentage": 73.86, "elapsed_time": "1:05:05", "remaining_time": "0:23:02"} +{"current_steps": 1151, "total_steps": 1557, "loss": 0.0025, "lr": 7.76703455321014e-06, "epoch": 2.217831325301205, "percentage": 73.92, "elapsed_time": "1:05:08", "remaining_time": "0:22:58"} +{"current_steps": 1152, "total_steps": 1557, "loss": 0.0151, "lr": 7.73158485854344e-06, "epoch": 2.2197590361445783, "percentage": 73.99, "elapsed_time": "1:05:12", "remaining_time": "0:22:55"} +{"current_steps": 1153, "total_steps": 1557, "loss": 0.0027, "lr": 7.696196853448612e-06, "epoch": 2.221686746987952, "percentage": 74.05, "elapsed_time": "1:05:15", "remaining_time": "0:22:52"} +{"current_steps": 1154, "total_steps": 1557, "loss": 0.006, "lr": 7.660870715868018e-06, "epoch": 2.2236144578313253, "percentage": 74.12, "elapsed_time": "1:05:19", "remaining_time": "0:22:48"} +{"current_steps": 1155, "total_steps": 1557, "loss": 0.0041, "lr": 7.625606623432933e-06, "epoch": 2.225542168674699, "percentage": 74.18, "elapsed_time": "1:05:22", "remaining_time": "0:22:45"} +{"current_steps": 1156, "total_steps": 1557, "loss": 0.0125, "lr": 7.590404753462653e-06, "epoch": 2.2274698795180723, "percentage": 74.25, "elapsed_time": "1:05:26", "remaining_time": "0:22:42"} +{"current_steps": 1157, "total_steps": 1557, "loss": 0.0022, "lr": 7.55526528296362e-06, "epoch": 2.2293975903614456, "percentage": 74.31, "elapsed_time": "1:05:29", "remaining_time": "0:22:38"} +{"current_steps": 1158, "total_steps": 1557, "loss": 0.0123, "lr": 7.520188388628473e-06, "epoch": 2.2313253012048193, "percentage": 74.37, "elapsed_time": "1:05:32", "remaining_time": "0:22:34"} +{"current_steps": 1159, "total_steps": 1557, "loss": 0.0039, "lr": 7.485174246835227e-06, "epoch": 2.2332530120481926, "percentage": 74.44, "elapsed_time": "1:05:35", "remaining_time": "0:22:31"} +{"current_steps": 1160, "total_steps": 1557, "loss": 0.003, "lr": 7.4502230336463466e-06, "epoch": 2.2351807228915663, "percentage": 74.5, "elapsed_time": "1:05:39", "remaining_time": "0:22:28"} +{"current_steps": 1161, "total_steps": 1557, "loss": 0.0044, "lr": 7.415334924807869e-06, "epoch": 2.2371084337349396, "percentage": 74.57, "elapsed_time": "1:05:42", "remaining_time": "0:22:24"} +{"current_steps": 1162, "total_steps": 1557, "loss": 0.0071, "lr": 7.380510095748535e-06, "epoch": 2.2390361445783133, "percentage": 74.63, "elapsed_time": "1:05:46", "remaining_time": "0:22:21"} +{"current_steps": 1163, "total_steps": 1557, "loss": 0.0046, "lr": 7.3457487215788605e-06, "epoch": 2.2409638554216866, "percentage": 74.69, "elapsed_time": "1:05:49", "remaining_time": "0:22:17"} +{"current_steps": 1164, "total_steps": 1557, "loss": 0.0079, "lr": 7.311050977090343e-06, "epoch": 2.2428915662650604, "percentage": 74.76, "elapsed_time": "1:05:52", "remaining_time": "0:22:14"} +{"current_steps": 1165, "total_steps": 1557, "loss": 0.0042, "lr": 7.276417036754479e-06, "epoch": 2.2448192771084337, "percentage": 74.82, "elapsed_time": "1:05:55", "remaining_time": "0:22:11"} +{"current_steps": 1166, "total_steps": 1557, "loss": 0.0087, "lr": 7.241847074721964e-06, "epoch": 2.2467469879518074, "percentage": 74.89, "elapsed_time": "1:05:58", "remaining_time": "0:22:07"} +{"current_steps": 1167, "total_steps": 1557, "loss": 0.002, "lr": 7.207341264821783e-06, "epoch": 2.2486746987951807, "percentage": 74.95, "elapsed_time": "1:06:02", "remaining_time": "0:22:04"} +{"current_steps": 1168, "total_steps": 1557, "loss": 0.0069, "lr": 7.172899780560345e-06, "epoch": 2.2506024096385544, "percentage": 75.02, "elapsed_time": "1:06:05", "remaining_time": "0:22:00"} +{"current_steps": 1169, "total_steps": 1557, "loss": 0.0122, "lr": 7.138522795120606e-06, "epoch": 2.2525301204819277, "percentage": 75.08, "elapsed_time": "1:06:09", "remaining_time": "0:21:57"} +{"current_steps": 1170, "total_steps": 1557, "loss": 0.0025, "lr": 7.104210481361204e-06, "epoch": 2.2544578313253014, "percentage": 75.14, "elapsed_time": "1:06:12", "remaining_time": "0:21:54"} +{"current_steps": 1171, "total_steps": 1557, "loss": 0.0039, "lr": 7.069963011815584e-06, "epoch": 2.2563855421686747, "percentage": 75.21, "elapsed_time": "1:06:16", "remaining_time": "0:21:50"} +{"current_steps": 1172, "total_steps": 1557, "loss": 0.0025, "lr": 7.035780558691141e-06, "epoch": 2.258313253012048, "percentage": 75.27, "elapsed_time": "1:06:19", "remaining_time": "0:21:47"} +{"current_steps": 1173, "total_steps": 1557, "loss": 0.0014, "lr": 7.001663293868328e-06, "epoch": 2.2602409638554217, "percentage": 75.34, "elapsed_time": "1:06:22", "remaining_time": "0:21:43"} +{"current_steps": 1174, "total_steps": 1557, "loss": 0.0067, "lr": 6.967611388899826e-06, "epoch": 2.262168674698795, "percentage": 75.4, "elapsed_time": "1:06:26", "remaining_time": "0:21:40"} +{"current_steps": 1175, "total_steps": 1557, "loss": 0.0036, "lr": 6.933625015009666e-06, "epoch": 2.2640963855421687, "percentage": 75.47, "elapsed_time": "1:06:29", "remaining_time": "0:21:37"} +{"current_steps": 1176, "total_steps": 1557, "loss": 0.0014, "lr": 6.899704343092359e-06, "epoch": 2.266024096385542, "percentage": 75.53, "elapsed_time": "1:06:33", "remaining_time": "0:21:33"} +{"current_steps": 1177, "total_steps": 1557, "loss": 0.009, "lr": 6.865849543712058e-06, "epoch": 2.2679518072289158, "percentage": 75.59, "elapsed_time": "1:06:36", "remaining_time": "0:21:30"} +{"current_steps": 1178, "total_steps": 1557, "loss": 0.0117, "lr": 6.832060787101658e-06, "epoch": 2.269879518072289, "percentage": 75.66, "elapsed_time": "1:06:39", "remaining_time": "0:21:26"} +{"current_steps": 1179, "total_steps": 1557, "loss": 0.0024, "lr": 6.798338243162008e-06, "epoch": 2.271807228915663, "percentage": 75.72, "elapsed_time": "1:06:43", "remaining_time": "0:21:23"} +{"current_steps": 1180, "total_steps": 1557, "loss": 0.013, "lr": 6.764682081461002e-06, "epoch": 2.273734939759036, "percentage": 75.79, "elapsed_time": "1:06:46", "remaining_time": "0:21:20"} +{"current_steps": 1181, "total_steps": 1557, "loss": 0.0074, "lr": 6.73109247123273e-06, "epoch": 2.27566265060241, "percentage": 75.85, "elapsed_time": "1:06:49", "remaining_time": "0:21:16"} +{"current_steps": 1182, "total_steps": 1557, "loss": 0.0052, "lr": 6.6975695813766465e-06, "epoch": 2.277590361445783, "percentage": 75.92, "elapsed_time": "1:06:53", "remaining_time": "0:21:13"} +{"current_steps": 1183, "total_steps": 1557, "loss": 0.0265, "lr": 6.664113580456739e-06, "epoch": 2.279518072289157, "percentage": 75.98, "elapsed_time": "1:06:56", "remaining_time": "0:21:09"} +{"current_steps": 1184, "total_steps": 1557, "loss": 0.0026, "lr": 6.630724636700618e-06, "epoch": 2.28144578313253, "percentage": 76.04, "elapsed_time": "1:07:00", "remaining_time": "0:21:06"} +{"current_steps": 1185, "total_steps": 1557, "loss": 0.0046, "lr": 6.59740291799873e-06, "epoch": 2.283373493975904, "percentage": 76.11, "elapsed_time": "1:07:03", "remaining_time": "0:21:03"} +{"current_steps": 1186, "total_steps": 1557, "loss": 0.0063, "lr": 6.564148591903488e-06, "epoch": 2.285301204819277, "percentage": 76.17, "elapsed_time": "1:07:06", "remaining_time": "0:20:59"} +{"current_steps": 1187, "total_steps": 1557, "loss": 0.0012, "lr": 6.530961825628432e-06, "epoch": 2.2872289156626504, "percentage": 76.24, "elapsed_time": "1:07:09", "remaining_time": "0:20:56"} +{"current_steps": 1188, "total_steps": 1557, "loss": 0.0048, "lr": 6.4978427860474015e-06, "epoch": 2.289156626506024, "percentage": 76.3, "elapsed_time": "1:07:13", "remaining_time": "0:20:52"} +{"current_steps": 1189, "total_steps": 1557, "loss": 0.0049, "lr": 6.464791639693648e-06, "epoch": 2.2910843373493974, "percentage": 76.36, "elapsed_time": "1:07:16", "remaining_time": "0:20:49"} +{"current_steps": 1190, "total_steps": 1557, "loss": 0.0019, "lr": 6.431808552759083e-06, "epoch": 2.293012048192771, "percentage": 76.43, "elapsed_time": "1:07:20", "remaining_time": "0:20:45"} +{"current_steps": 1191, "total_steps": 1557, "loss": 0.0033, "lr": 6.398893691093367e-06, "epoch": 2.2949397590361444, "percentage": 76.49, "elapsed_time": "1:07:23", "remaining_time": "0:20:42"} +{"current_steps": 1192, "total_steps": 1557, "loss": 0.0032, "lr": 6.366047220203088e-06, "epoch": 2.296867469879518, "percentage": 76.56, "elapsed_time": "1:07:26", "remaining_time": "0:20:39"} +{"current_steps": 1193, "total_steps": 1557, "loss": 0.0027, "lr": 6.333269305250971e-06, "epoch": 2.2987951807228915, "percentage": 76.62, "elapsed_time": "1:07:30", "remaining_time": "0:20:35"} +{"current_steps": 1194, "total_steps": 1557, "loss": 0.0062, "lr": 6.300560111055006e-06, "epoch": 2.300722891566265, "percentage": 76.69, "elapsed_time": "1:07:32", "remaining_time": "0:20:32"} +{"current_steps": 1195, "total_steps": 1557, "loss": 0.0113, "lr": 6.2679198020876275e-06, "epoch": 2.3026506024096385, "percentage": 76.75, "elapsed_time": "1:07:36", "remaining_time": "0:20:28"} +{"current_steps": 1196, "total_steps": 1557, "loss": 0.0273, "lr": 6.235348542474908e-06, "epoch": 2.304578313253012, "percentage": 76.81, "elapsed_time": "1:07:40", "remaining_time": "0:20:25"} +{"current_steps": 1197, "total_steps": 1557, "loss": 0.0056, "lr": 6.202846495995705e-06, "epoch": 2.3065060240963855, "percentage": 76.88, "elapsed_time": "1:07:48", "remaining_time": "0:20:23"} +{"current_steps": 1198, "total_steps": 1557, "loss": 0.0034, "lr": 6.170413826080856e-06, "epoch": 2.3084337349397592, "percentage": 76.94, "elapsed_time": "1:07:52", "remaining_time": "0:20:20"} +{"current_steps": 1199, "total_steps": 1557, "loss": 0.0042, "lr": 6.138050695812343e-06, "epoch": 2.3103614457831325, "percentage": 77.01, "elapsed_time": "1:07:56", "remaining_time": "0:20:17"} +{"current_steps": 1200, "total_steps": 1557, "loss": 0.0045, "lr": 6.105757267922481e-06, "epoch": 2.3122891566265062, "percentage": 77.07, "elapsed_time": "1:07:59", "remaining_time": "0:20:13"} +{"current_steps": 1201, "total_steps": 1557, "loss": 0.0035, "lr": 6.073533704793122e-06, "epoch": 2.3142168674698795, "percentage": 77.14, "elapsed_time": "1:08:02", "remaining_time": "0:20:10"} +{"current_steps": 1202, "total_steps": 1557, "loss": 0.0088, "lr": 6.04138016845478e-06, "epoch": 2.316144578313253, "percentage": 77.2, "elapsed_time": "1:08:05", "remaining_time": "0:20:06"} +{"current_steps": 1203, "total_steps": 1557, "loss": 0.0059, "lr": 6.009296820585871e-06, "epoch": 2.3180722891566266, "percentage": 77.26, "elapsed_time": "1:08:08", "remaining_time": "0:20:03"} +{"current_steps": 1204, "total_steps": 1557, "loss": 0.0028, "lr": 5.977283822511879e-06, "epoch": 2.32, "percentage": 77.33, "elapsed_time": "1:08:12", "remaining_time": "0:19:59"} +{"current_steps": 1205, "total_steps": 1557, "loss": 0.0044, "lr": 5.945341335204547e-06, "epoch": 2.3219277108433736, "percentage": 77.39, "elapsed_time": "1:08:15", "remaining_time": "0:19:56"} +{"current_steps": 1206, "total_steps": 1557, "loss": 0.0043, "lr": 5.9134695192810695e-06, "epoch": 2.323855421686747, "percentage": 77.46, "elapsed_time": "1:08:18", "remaining_time": "0:19:52"} +{"current_steps": 1207, "total_steps": 1557, "loss": 0.0066, "lr": 5.8816685350032575e-06, "epoch": 2.3257831325301206, "percentage": 77.52, "elapsed_time": "1:08:22", "remaining_time": "0:19:49"} +{"current_steps": 1208, "total_steps": 1557, "loss": 0.0022, "lr": 5.849938542276801e-06, "epoch": 2.327710843373494, "percentage": 77.59, "elapsed_time": "1:08:25", "remaining_time": "0:19:46"} +{"current_steps": 1209, "total_steps": 1557, "loss": 0.0037, "lr": 5.818279700650393e-06, "epoch": 2.3296385542168676, "percentage": 77.65, "elapsed_time": "1:08:28", "remaining_time": "0:19:42"} +{"current_steps": 1210, "total_steps": 1557, "loss": 0.0049, "lr": 5.786692169314954e-06, "epoch": 2.331566265060241, "percentage": 77.71, "elapsed_time": "1:08:31", "remaining_time": "0:19:39"} +{"current_steps": 1211, "total_steps": 1557, "loss": 0.002, "lr": 5.755176107102833e-06, "epoch": 2.3334939759036146, "percentage": 77.78, "elapsed_time": "1:08:35", "remaining_time": "0:19:35"} +{"current_steps": 1212, "total_steps": 1557, "loss": 0.002, "lr": 5.723731672487043e-06, "epoch": 2.335421686746988, "percentage": 77.84, "elapsed_time": "1:08:39", "remaining_time": "0:19:32"} +{"current_steps": 1213, "total_steps": 1557, "loss": 0.0013, "lr": 5.69235902358038e-06, "epoch": 2.337349397590361, "percentage": 77.91, "elapsed_time": "1:08:42", "remaining_time": "0:19:28"} +{"current_steps": 1214, "total_steps": 1557, "loss": 0.0041, "lr": 5.661058318134711e-06, "epoch": 2.339277108433735, "percentage": 77.97, "elapsed_time": "1:08:45", "remaining_time": "0:19:25"} +{"current_steps": 1215, "total_steps": 1557, "loss": 0.0022, "lr": 5.6298297135401355e-06, "epoch": 2.3412048192771087, "percentage": 78.03, "elapsed_time": "1:08:48", "remaining_time": "0:19:22"} +{"current_steps": 1216, "total_steps": 1557, "loss": 0.0036, "lr": 5.598673366824212e-06, "epoch": 2.343132530120482, "percentage": 78.1, "elapsed_time": "1:08:51", "remaining_time": "0:19:18"} +{"current_steps": 1217, "total_steps": 1557, "loss": 0.0151, "lr": 5.567589434651164e-06, "epoch": 2.3450602409638552, "percentage": 78.16, "elapsed_time": "1:08:55", "remaining_time": "0:19:15"} +{"current_steps": 1218, "total_steps": 1557, "loss": 0.006, "lr": 5.536578073321073e-06, "epoch": 2.346987951807229, "percentage": 78.23, "elapsed_time": "1:08:58", "remaining_time": "0:19:11"} +{"current_steps": 1219, "total_steps": 1557, "loss": 0.0052, "lr": 5.505639438769146e-06, "epoch": 2.3489156626506023, "percentage": 78.29, "elapsed_time": "1:09:02", "remaining_time": "0:19:08"} +{"current_steps": 1220, "total_steps": 1557, "loss": 0.0048, "lr": 5.47477368656486e-06, "epoch": 2.350843373493976, "percentage": 78.36, "elapsed_time": "1:09:05", "remaining_time": "0:19:05"} +{"current_steps": 1221, "total_steps": 1557, "loss": 0.0028, "lr": 5.443980971911238e-06, "epoch": 2.3527710843373493, "percentage": 78.42, "elapsed_time": "1:09:08", "remaining_time": "0:19:01"} +{"current_steps": 1222, "total_steps": 1557, "loss": 0.0043, "lr": 5.413261449644039e-06, "epoch": 2.354698795180723, "percentage": 78.48, "elapsed_time": "1:09:11", "remaining_time": "0:18:58"} +{"current_steps": 1223, "total_steps": 1557, "loss": 0.0075, "lr": 5.382615274230987e-06, "epoch": 2.3566265060240963, "percentage": 78.55, "elapsed_time": "1:09:14", "remaining_time": "0:18:54"} +{"current_steps": 1224, "total_steps": 1557, "loss": 0.0061, "lr": 5.352042599770995e-06, "epoch": 2.35855421686747, "percentage": 78.61, "elapsed_time": "1:09:18", "remaining_time": "0:18:51"} +{"current_steps": 1225, "total_steps": 1557, "loss": 0.0015, "lr": 5.321543579993398e-06, "epoch": 2.3604819277108433, "percentage": 78.68, "elapsed_time": "1:09:21", "remaining_time": "0:18:47"} +{"current_steps": 1226, "total_steps": 1557, "loss": 0.0034, "lr": 5.2911183682571446e-06, "epoch": 2.362409638554217, "percentage": 78.74, "elapsed_time": "1:09:25", "remaining_time": "0:18:44"} +{"current_steps": 1227, "total_steps": 1557, "loss": 0.0076, "lr": 5.260767117550094e-06, "epoch": 2.3643373493975903, "percentage": 78.81, "elapsed_time": "1:09:28", "remaining_time": "0:18:41"} +{"current_steps": 1228, "total_steps": 1557, "loss": 0.0148, "lr": 5.230489980488165e-06, "epoch": 2.3662650602409636, "percentage": 78.87, "elapsed_time": "1:09:32", "remaining_time": "0:18:37"} +{"current_steps": 1229, "total_steps": 1557, "loss": 0.0049, "lr": 5.200287109314633e-06, "epoch": 2.3681927710843373, "percentage": 78.93, "elapsed_time": "1:09:35", "remaining_time": "0:18:34"} +{"current_steps": 1230, "total_steps": 1557, "loss": 0.0031, "lr": 5.1701586558993285e-06, "epoch": 2.370120481927711, "percentage": 79.0, "elapsed_time": "1:09:38", "remaining_time": "0:18:30"} +{"current_steps": 1231, "total_steps": 1557, "loss": 0.0058, "lr": 5.140104771737899e-06, "epoch": 2.3720481927710844, "percentage": 79.06, "elapsed_time": "1:09:41", "remaining_time": "0:18:27"} +{"current_steps": 1232, "total_steps": 1557, "loss": 0.0051, "lr": 5.110125607951024e-06, "epoch": 2.3739759036144576, "percentage": 79.13, "elapsed_time": "1:09:45", "remaining_time": "0:18:24"} +{"current_steps": 1233, "total_steps": 1557, "loss": 0.0173, "lr": 5.0802213152836514e-06, "epoch": 2.3759036144578314, "percentage": 79.19, "elapsed_time": "1:09:48", "remaining_time": "0:18:20"} +{"current_steps": 1234, "total_steps": 1557, "loss": 0.0045, "lr": 5.0503920441042845e-06, "epoch": 2.3778313253012047, "percentage": 79.25, "elapsed_time": "1:09:52", "remaining_time": "0:18:17"} +{"current_steps": 1235, "total_steps": 1557, "loss": 0.0024, "lr": 5.0206379444041764e-06, "epoch": 2.3797590361445784, "percentage": 79.32, "elapsed_time": "1:09:55", "remaining_time": "0:18:13"} +{"current_steps": 1236, "total_steps": 1557, "loss": 0.0088, "lr": 4.990959165796585e-06, "epoch": 2.3816867469879517, "percentage": 79.38, "elapsed_time": "1:09:58", "remaining_time": "0:18:10"} +{"current_steps": 1237, "total_steps": 1557, "loss": 0.0094, "lr": 4.961355857516034e-06, "epoch": 2.3836144578313254, "percentage": 79.45, "elapsed_time": "1:10:02", "remaining_time": "0:18:07"} +{"current_steps": 1238, "total_steps": 1557, "loss": 0.0086, "lr": 4.931828168417583e-06, "epoch": 2.3855421686746987, "percentage": 79.51, "elapsed_time": "1:10:05", "remaining_time": "0:18:03"} +{"current_steps": 1239, "total_steps": 1557, "loss": 0.0014, "lr": 4.902376246976015e-06, "epoch": 2.3874698795180724, "percentage": 79.58, "elapsed_time": "1:10:08", "remaining_time": "0:18:00"} +{"current_steps": 1240, "total_steps": 1557, "loss": 0.0043, "lr": 4.873000241285153e-06, "epoch": 2.3893975903614457, "percentage": 79.64, "elapsed_time": "1:10:11", "remaining_time": "0:17:56"} +{"current_steps": 1241, "total_steps": 1557, "loss": 0.0014, "lr": 4.8437002990570835e-06, "epoch": 2.3913253012048195, "percentage": 79.7, "elapsed_time": "1:10:15", "remaining_time": "0:17:53"} +{"current_steps": 1242, "total_steps": 1557, "loss": 0.0525, "lr": 4.8144765676214245e-06, "epoch": 2.3932530120481927, "percentage": 79.77, "elapsed_time": "1:10:18", "remaining_time": "0:17:49"} +{"current_steps": 1243, "total_steps": 1557, "loss": 0.008, "lr": 4.7853291939245814e-06, "epoch": 2.395180722891566, "percentage": 79.83, "elapsed_time": "1:10:22", "remaining_time": "0:17:46"} +{"current_steps": 1244, "total_steps": 1557, "loss": 0.0044, "lr": 4.756258324528995e-06, "epoch": 2.3971084337349398, "percentage": 79.9, "elapsed_time": "1:10:25", "remaining_time": "0:17:43"} +{"current_steps": 1245, "total_steps": 1557, "loss": 0.0186, "lr": 4.727264105612439e-06, "epoch": 2.3990361445783135, "percentage": 79.96, "elapsed_time": "1:10:28", "remaining_time": "0:17:39"} +{"current_steps": 1246, "total_steps": 1557, "loss": 0.0106, "lr": 4.698346682967258e-06, "epoch": 2.4009638554216868, "percentage": 80.03, "elapsed_time": "1:10:31", "remaining_time": "0:17:36"} +{"current_steps": 1247, "total_steps": 1557, "loss": 0.0035, "lr": 4.669506201999625e-06, "epoch": 2.40289156626506, "percentage": 80.09, "elapsed_time": "1:10:34", "remaining_time": "0:17:32"} +{"current_steps": 1248, "total_steps": 1557, "loss": 0.0038, "lr": 4.640742807728837e-06, "epoch": 2.404819277108434, "percentage": 80.15, "elapsed_time": "1:10:38", "remaining_time": "0:17:29"} +{"current_steps": 1249, "total_steps": 1557, "loss": 0.0021, "lr": 4.612056644786575e-06, "epoch": 2.406746987951807, "percentage": 80.22, "elapsed_time": "1:10:41", "remaining_time": "0:17:26"} +{"current_steps": 1250, "total_steps": 1557, "loss": 0.0028, "lr": 4.583447857416175e-06, "epoch": 2.408674698795181, "percentage": 80.28, "elapsed_time": "1:10:45", "remaining_time": "0:17:22"} +{"current_steps": 1251, "total_steps": 1557, "loss": 0.0027, "lr": 4.554916589471898e-06, "epoch": 2.410602409638554, "percentage": 80.35, "elapsed_time": "1:10:48", "remaining_time": "0:17:19"} +{"current_steps": 1252, "total_steps": 1557, "loss": 0.0037, "lr": 4.526462984418221e-06, "epoch": 2.412530120481928, "percentage": 80.41, "elapsed_time": "1:10:51", "remaining_time": "0:17:15"} +{"current_steps": 1253, "total_steps": 1557, "loss": 0.003, "lr": 4.498087185329105e-06, "epoch": 2.414457831325301, "percentage": 80.48, "elapsed_time": "1:10:54", "remaining_time": "0:17:12"} +{"current_steps": 1254, "total_steps": 1557, "loss": 0.009, "lr": 4.469789334887265e-06, "epoch": 2.416385542168675, "percentage": 80.54, "elapsed_time": "1:10:57", "remaining_time": "0:17:08"} +{"current_steps": 1255, "total_steps": 1557, "loss": 0.0033, "lr": 4.441569575383471e-06, "epoch": 2.418313253012048, "percentage": 80.6, "elapsed_time": "1:11:01", "remaining_time": "0:17:05"} +{"current_steps": 1256, "total_steps": 1557, "loss": 0.0021, "lr": 4.413428048715851e-06, "epoch": 2.420240963855422, "percentage": 80.67, "elapsed_time": "1:11:04", "remaining_time": "0:17:01"} +{"current_steps": 1257, "total_steps": 1557, "loss": 0.0041, "lr": 4.38536489638911e-06, "epoch": 2.422168674698795, "percentage": 80.73, "elapsed_time": "1:11:08", "remaining_time": "0:16:58"} +{"current_steps": 1258, "total_steps": 1557, "loss": 0.0039, "lr": 4.3573802595138945e-06, "epoch": 2.4240963855421684, "percentage": 80.8, "elapsed_time": "1:11:11", "remaining_time": "0:16:55"} +{"current_steps": 1259, "total_steps": 1557, "loss": 0.0087, "lr": 4.329474278806034e-06, "epoch": 2.426024096385542, "percentage": 80.86, "elapsed_time": "1:11:14", "remaining_time": "0:16:51"} +{"current_steps": 1260, "total_steps": 1557, "loss": 0.0046, "lr": 4.301647094585855e-06, "epoch": 2.427951807228916, "percentage": 80.92, "elapsed_time": "1:11:18", "remaining_time": "0:16:48"} +{"current_steps": 1261, "total_steps": 1557, "loss": 0.0054, "lr": 4.273898846777473e-06, "epoch": 2.429879518072289, "percentage": 80.99, "elapsed_time": "1:11:21", "remaining_time": "0:16:44"} +{"current_steps": 1262, "total_steps": 1557, "loss": 0.0072, "lr": 4.246229674908067e-06, "epoch": 2.4318072289156625, "percentage": 81.05, "elapsed_time": "1:11:24", "remaining_time": "0:16:41"} +{"current_steps": 1263, "total_steps": 1557, "loss": 0.003, "lr": 4.218639718107225e-06, "epoch": 2.433734939759036, "percentage": 81.12, "elapsed_time": "1:11:27", "remaining_time": "0:16:38"} +{"current_steps": 1264, "total_steps": 1557, "loss": 0.0109, "lr": 4.1911291151062e-06, "epoch": 2.4356626506024095, "percentage": 81.18, "elapsed_time": "1:11:31", "remaining_time": "0:16:34"} +{"current_steps": 1265, "total_steps": 1557, "loss": 0.0027, "lr": 4.163698004237222e-06, "epoch": 2.4375903614457832, "percentage": 81.25, "elapsed_time": "1:11:34", "remaining_time": "0:16:31"} +{"current_steps": 1266, "total_steps": 1557, "loss": 0.0018, "lr": 4.136346523432821e-06, "epoch": 2.4395180722891565, "percentage": 81.31, "elapsed_time": "1:11:38", "remaining_time": "0:16:27"} +{"current_steps": 1267, "total_steps": 1557, "loss": 0.0048, "lr": 4.109074810225118e-06, "epoch": 2.4414457831325302, "percentage": 81.37, "elapsed_time": "1:11:41", "remaining_time": "0:16:24"} +{"current_steps": 1268, "total_steps": 1557, "loss": 0.0021, "lr": 4.08188300174513e-06, "epoch": 2.4433734939759035, "percentage": 81.44, "elapsed_time": "1:11:44", "remaining_time": "0:16:21"} +{"current_steps": 1269, "total_steps": 1557, "loss": 0.0066, "lr": 4.054771234722106e-06, "epoch": 2.4453012048192773, "percentage": 81.5, "elapsed_time": "1:11:48", "remaining_time": "0:16:17"} +{"current_steps": 1270, "total_steps": 1557, "loss": 0.0043, "lr": 4.027739645482784e-06, "epoch": 2.4472289156626506, "percentage": 81.57, "elapsed_time": "1:11:51", "remaining_time": "0:16:14"} +{"current_steps": 1271, "total_steps": 1557, "loss": 0.0236, "lr": 4.0007883699507855e-06, "epoch": 2.4491566265060243, "percentage": 81.63, "elapsed_time": "1:11:54", "remaining_time": "0:16:10"} +{"current_steps": 1272, "total_steps": 1557, "loss": 0.0068, "lr": 3.973917543645867e-06, "epoch": 2.4510843373493976, "percentage": 81.7, "elapsed_time": "1:11:58", "remaining_time": "0:16:07"} +{"current_steps": 1273, "total_steps": 1557, "loss": 0.0194, "lr": 3.947127301683249e-06, "epoch": 2.453012048192771, "percentage": 81.76, "elapsed_time": "1:12:01", "remaining_time": "0:16:04"} +{"current_steps": 1274, "total_steps": 1557, "loss": 0.0042, "lr": 3.920417778772967e-06, "epoch": 2.4549397590361446, "percentage": 81.82, "elapsed_time": "1:12:05", "remaining_time": "0:16:00"} +{"current_steps": 1275, "total_steps": 1557, "loss": 0.0224, "lr": 3.893789109219171e-06, "epoch": 2.4568674698795183, "percentage": 81.89, "elapsed_time": "1:12:08", "remaining_time": "0:15:57"} +{"current_steps": 1276, "total_steps": 1557, "loss": 0.0046, "lr": 3.867241426919446e-06, "epoch": 2.4587951807228916, "percentage": 81.95, "elapsed_time": "1:12:12", "remaining_time": "0:15:54"} +{"current_steps": 1277, "total_steps": 1557, "loss": 0.0019, "lr": 3.840774865364157e-06, "epoch": 2.460722891566265, "percentage": 82.02, "elapsed_time": "1:12:15", "remaining_time": "0:15:50"} +{"current_steps": 1278, "total_steps": 1557, "loss": 0.0063, "lr": 3.8143895576357605e-06, "epoch": 2.4626506024096386, "percentage": 82.08, "elapsed_time": "1:12:18", "remaining_time": "0:15:47"} +{"current_steps": 1279, "total_steps": 1557, "loss": 0.0055, "lr": 3.788085636408143e-06, "epoch": 2.464578313253012, "percentage": 82.15, "elapsed_time": "1:12:21", "remaining_time": "0:15:43"} +{"current_steps": 1280, "total_steps": 1557, "loss": 0.0164, "lr": 3.7618632339459616e-06, "epoch": 2.4665060240963856, "percentage": 82.21, "elapsed_time": "1:12:24", "remaining_time": "0:15:40"} +{"current_steps": 1281, "total_steps": 1557, "loss": 0.0112, "lr": 3.7357224821039497e-06, "epoch": 2.468433734939759, "percentage": 82.27, "elapsed_time": "1:12:27", "remaining_time": "0:15:36"} +{"current_steps": 1282, "total_steps": 1557, "loss": 0.0112, "lr": 3.7096635123263068e-06, "epoch": 2.4703614457831327, "percentage": 82.34, "elapsed_time": "1:12:31", "remaining_time": "0:15:33"} +{"current_steps": 1283, "total_steps": 1557, "loss": 0.0013, "lr": 3.683686455645974e-06, "epoch": 2.472289156626506, "percentage": 82.4, "elapsed_time": "1:12:34", "remaining_time": "0:15:29"} +{"current_steps": 1284, "total_steps": 1557, "loss": 0.0038, "lr": 3.6577914426840266e-06, "epoch": 2.4742168674698797, "percentage": 82.47, "elapsed_time": "1:12:37", "remaining_time": "0:15:26"} +{"current_steps": 1285, "total_steps": 1557, "loss": 0.0023, "lr": 3.631978603648989e-06, "epoch": 2.476144578313253, "percentage": 82.53, "elapsed_time": "1:12:41", "remaining_time": "0:15:23"} +{"current_steps": 1286, "total_steps": 1557, "loss": 0.0025, "lr": 3.6062480683361935e-06, "epoch": 2.4780722891566267, "percentage": 82.59, "elapsed_time": "1:12:44", "remaining_time": "0:15:19"} +{"current_steps": 1287, "total_steps": 1557, "loss": 0.003, "lr": 3.580599966127123e-06, "epoch": 2.48, "percentage": 82.66, "elapsed_time": "1:12:47", "remaining_time": "0:15:16"} +{"current_steps": 1288, "total_steps": 1557, "loss": 0.0023, "lr": 3.5550344259887438e-06, "epoch": 2.4819277108433733, "percentage": 82.72, "elapsed_time": "1:12:50", "remaining_time": "0:15:12"} +{"current_steps": 1289, "total_steps": 1557, "loss": 0.0015, "lr": 3.5295515764729003e-06, "epoch": 2.483855421686747, "percentage": 82.79, "elapsed_time": "1:13:00", "remaining_time": "0:15:10"} +{"current_steps": 1290, "total_steps": 1557, "loss": 0.0041, "lr": 3.5041515457156303e-06, "epoch": 2.4857831325301207, "percentage": 82.85, "elapsed_time": "1:13:03", "remaining_time": "0:15:07"} +{"current_steps": 1291, "total_steps": 1557, "loss": 0.0029, "lr": 3.4788344614365155e-06, "epoch": 2.487710843373494, "percentage": 82.92, "elapsed_time": "1:13:07", "remaining_time": "0:15:03"} +{"current_steps": 1292, "total_steps": 1557, "loss": 0.0072, "lr": 3.453600450938073e-06, "epoch": 2.4896385542168673, "percentage": 82.98, "elapsed_time": "1:13:10", "remaining_time": "0:15:00"} +{"current_steps": 1293, "total_steps": 1557, "loss": 0.0205, "lr": 3.428449641105107e-06, "epoch": 2.491566265060241, "percentage": 83.04, "elapsed_time": "1:13:13", "remaining_time": "0:14:57"} +{"current_steps": 1294, "total_steps": 1557, "loss": 0.0065, "lr": 3.4033821584040383e-06, "epoch": 2.4934939759036143, "percentage": 83.11, "elapsed_time": "1:13:17", "remaining_time": "0:14:53"} +{"current_steps": 1295, "total_steps": 1557, "loss": 0.0025, "lr": 3.378398128882305e-06, "epoch": 2.495421686746988, "percentage": 83.17, "elapsed_time": "1:13:20", "remaining_time": "0:14:50"} +{"current_steps": 1296, "total_steps": 1557, "loss": 0.0071, "lr": 3.3534976781677142e-06, "epoch": 2.4973493975903613, "percentage": 83.24, "elapsed_time": "1:13:23", "remaining_time": "0:14:46"} +{"current_steps": 1297, "total_steps": 1557, "loss": 0.0024, "lr": 3.3286809314678137e-06, "epoch": 2.499277108433735, "percentage": 83.3, "elapsed_time": "1:13:26", "remaining_time": "0:14:43"} +{"current_steps": 1298, "total_steps": 1557, "loss": 0.0013, "lr": 3.30394801356926e-06, "epoch": 2.5012048192771084, "percentage": 83.37, "elapsed_time": "1:13:30", "remaining_time": "0:14:40"} +{"current_steps": 1299, "total_steps": 1557, "loss": 0.0228, "lr": 3.279299048837177e-06, "epoch": 2.503132530120482, "percentage": 83.43, "elapsed_time": "1:13:33", "remaining_time": "0:14:36"} +{"current_steps": 1300, "total_steps": 1557, "loss": 0.0056, "lr": 3.2547341612145654e-06, "epoch": 2.5050602409638554, "percentage": 83.49, "elapsed_time": "1:13:37", "remaining_time": "0:14:33"} +{"current_steps": 1301, "total_steps": 1557, "loss": 0.0081, "lr": 3.2302534742216586e-06, "epoch": 2.506987951807229, "percentage": 83.56, "elapsed_time": "1:13:40", "remaining_time": "0:14:29"} +{"current_steps": 1302, "total_steps": 1557, "loss": 0.0029, "lr": 3.205857110955277e-06, "epoch": 2.5089156626506024, "percentage": 83.62, "elapsed_time": "1:13:43", "remaining_time": "0:14:26"} +{"current_steps": 1303, "total_steps": 1557, "loss": 0.0059, "lr": 3.18154519408826e-06, "epoch": 2.5108433734939757, "percentage": 83.69, "elapsed_time": "1:13:47", "remaining_time": "0:14:23"} +{"current_steps": 1304, "total_steps": 1557, "loss": 0.0042, "lr": 3.1573178458688102e-06, "epoch": 2.5127710843373494, "percentage": 83.75, "elapsed_time": "1:13:49", "remaining_time": "0:14:19"} +{"current_steps": 1305, "total_steps": 1557, "loss": 0.0041, "lr": 3.133175188119899e-06, "epoch": 2.514698795180723, "percentage": 83.82, "elapsed_time": "1:13:53", "remaining_time": "0:14:16"} +{"current_steps": 1306, "total_steps": 1557, "loss": 0.0053, "lr": 3.109117342238639e-06, "epoch": 2.5166265060240964, "percentage": 83.88, "elapsed_time": "1:13:56", "remaining_time": "0:14:12"} +{"current_steps": 1307, "total_steps": 1557, "loss": 0.0084, "lr": 3.085144429195688e-06, "epoch": 2.5185542168674697, "percentage": 83.94, "elapsed_time": "1:14:00", "remaining_time": "0:14:09"} +{"current_steps": 1308, "total_steps": 1557, "loss": 0.0053, "lr": 3.061256569534634e-06, "epoch": 2.5204819277108435, "percentage": 84.01, "elapsed_time": "1:14:03", "remaining_time": "0:14:05"} +{"current_steps": 1309, "total_steps": 1557, "loss": 0.0018, "lr": 3.037453883371375e-06, "epoch": 2.5224096385542167, "percentage": 84.07, "elapsed_time": "1:14:07", "remaining_time": "0:14:02"} +{"current_steps": 1310, "total_steps": 1557, "loss": 0.0037, "lr": 3.0137364903935464e-06, "epoch": 2.5243373493975905, "percentage": 84.14, "elapsed_time": "1:14:10", "remaining_time": "0:13:59"} +{"current_steps": 1311, "total_steps": 1557, "loss": 0.0024, "lr": 2.990104509859897e-06, "epoch": 2.5262650602409638, "percentage": 84.2, "elapsed_time": "1:14:14", "remaining_time": "0:13:55"} +{"current_steps": 1312, "total_steps": 1557, "loss": 0.0063, "lr": 2.966558060599689e-06, "epoch": 2.5281927710843375, "percentage": 84.26, "elapsed_time": "1:14:17", "remaining_time": "0:13:52"} +{"current_steps": 1313, "total_steps": 1557, "loss": 0.0054, "lr": 2.9430972610121087e-06, "epoch": 2.5301204819277108, "percentage": 84.33, "elapsed_time": "1:14:20", "remaining_time": "0:13:48"} +{"current_steps": 1314, "total_steps": 1557, "loss": 0.0095, "lr": 2.9197222290656737e-06, "epoch": 2.532048192771084, "percentage": 84.39, "elapsed_time": "1:14:24", "remaining_time": "0:13:45"} +{"current_steps": 1315, "total_steps": 1557, "loss": 0.006, "lr": 2.8964330822976227e-06, "epoch": 2.533975903614458, "percentage": 84.46, "elapsed_time": "1:14:27", "remaining_time": "0:13:42"} +{"current_steps": 1316, "total_steps": 1557, "loss": 0.0067, "lr": 2.873229937813349e-06, "epoch": 2.5359036144578315, "percentage": 84.52, "elapsed_time": "1:14:31", "remaining_time": "0:13:38"} +{"current_steps": 1317, "total_steps": 1557, "loss": 0.0015, "lr": 2.850112912285783e-06, "epoch": 2.537831325301205, "percentage": 84.59, "elapsed_time": "1:14:34", "remaining_time": "0:13:35"} +{"current_steps": 1318, "total_steps": 1557, "loss": 0.0036, "lr": 2.8270821219548296e-06, "epoch": 2.539759036144578, "percentage": 84.65, "elapsed_time": "1:14:37", "remaining_time": "0:13:32"} +{"current_steps": 1319, "total_steps": 1557, "loss": 0.0068, "lr": 2.8041376826267862e-06, "epoch": 2.541686746987952, "percentage": 84.71, "elapsed_time": "1:14:40", "remaining_time": "0:13:28"} +{"current_steps": 1320, "total_steps": 1557, "loss": 0.0048, "lr": 2.7812797096737253e-06, "epoch": 2.5436144578313256, "percentage": 84.78, "elapsed_time": "1:14:44", "remaining_time": "0:13:25"} +{"current_steps": 1321, "total_steps": 1557, "loss": 0.0017, "lr": 2.7585083180329575e-06, "epoch": 2.545542168674699, "percentage": 84.84, "elapsed_time": "1:14:47", "remaining_time": "0:13:21"} +{"current_steps": 1322, "total_steps": 1557, "loss": 0.003, "lr": 2.7358236222064283e-06, "epoch": 2.547469879518072, "percentage": 84.91, "elapsed_time": "1:14:51", "remaining_time": "0:13:18"} +{"current_steps": 1323, "total_steps": 1557, "loss": 0.005, "lr": 2.7132257362601453e-06, "epoch": 2.549397590361446, "percentage": 84.97, "elapsed_time": "1:14:54", "remaining_time": "0:13:14"} +{"current_steps": 1324, "total_steps": 1557, "loss": 0.0077, "lr": 2.6907147738236193e-06, "epoch": 2.551325301204819, "percentage": 85.04, "elapsed_time": "1:14:57", "remaining_time": "0:13:11"} +{"current_steps": 1325, "total_steps": 1557, "loss": 0.0013, "lr": 2.6682908480892567e-06, "epoch": 2.553253012048193, "percentage": 85.1, "elapsed_time": "1:15:01", "remaining_time": "0:13:08"} +{"current_steps": 1326, "total_steps": 1557, "loss": 0.0092, "lr": 2.645954071811847e-06, "epoch": 2.555180722891566, "percentage": 85.16, "elapsed_time": "1:15:04", "remaining_time": "0:13:04"} +{"current_steps": 1327, "total_steps": 1557, "loss": 0.0031, "lr": 2.623704557307949e-06, "epoch": 2.55710843373494, "percentage": 85.23, "elapsed_time": "1:15:08", "remaining_time": "0:13:01"} +{"current_steps": 1328, "total_steps": 1557, "loss": 0.0104, "lr": 2.6015424164553295e-06, "epoch": 2.559036144578313, "percentage": 85.29, "elapsed_time": "1:15:11", "remaining_time": "0:12:58"} +{"current_steps": 1329, "total_steps": 1557, "loss": 0.004, "lr": 2.579467760692427e-06, "epoch": 2.5609638554216865, "percentage": 85.36, "elapsed_time": "1:15:14", "remaining_time": "0:12:54"} +{"current_steps": 1330, "total_steps": 1557, "loss": 0.0035, "lr": 2.557480701017776e-06, "epoch": 2.56289156626506, "percentage": 85.42, "elapsed_time": "1:15:18", "remaining_time": "0:12:51"} +{"current_steps": 1331, "total_steps": 1557, "loss": 0.0034, "lr": 2.5355813479894464e-06, "epoch": 2.564819277108434, "percentage": 85.48, "elapsed_time": "1:15:21", "remaining_time": "0:12:47"} +{"current_steps": 1332, "total_steps": 1557, "loss": 0.0076, "lr": 2.513769811724487e-06, "epoch": 2.5667469879518072, "percentage": 85.55, "elapsed_time": "1:15:25", "remaining_time": "0:12:44"} +{"current_steps": 1333, "total_steps": 1557, "loss": 0.0046, "lr": 2.4920462018983816e-06, "epoch": 2.5686746987951805, "percentage": 85.61, "elapsed_time": "1:15:28", "remaining_time": "0:12:40"} +{"current_steps": 1334, "total_steps": 1557, "loss": 0.0034, "lr": 2.4704106277444884e-06, "epoch": 2.5706024096385542, "percentage": 85.68, "elapsed_time": "1:15:31", "remaining_time": "0:12:37"} +{"current_steps": 1335, "total_steps": 1557, "loss": 0.0127, "lr": 2.4488631980534995e-06, "epoch": 2.572530120481928, "percentage": 85.74, "elapsed_time": "1:15:34", "remaining_time": "0:12:34"} +{"current_steps": 1336, "total_steps": 1557, "loss": 0.0031, "lr": 2.427404021172868e-06, "epoch": 2.5744578313253013, "percentage": 85.81, "elapsed_time": "1:15:37", "remaining_time": "0:12:30"} +{"current_steps": 1337, "total_steps": 1557, "loss": 0.0039, "lr": 2.406033205006313e-06, "epoch": 2.5763855421686745, "percentage": 85.87, "elapsed_time": "1:15:40", "remaining_time": "0:12:27"} +{"current_steps": 1338, "total_steps": 1557, "loss": 0.0029, "lr": 2.3847508570132226e-06, "epoch": 2.5783132530120483, "percentage": 85.93, "elapsed_time": "1:15:43", "remaining_time": "0:12:23"} +{"current_steps": 1339, "total_steps": 1557, "loss": 0.011, "lr": 2.36355708420815e-06, "epoch": 2.5802409638554216, "percentage": 86.0, "elapsed_time": "1:15:47", "remaining_time": "0:12:20"} +{"current_steps": 1340, "total_steps": 1557, "loss": 0.006, "lr": 2.342451993160262e-06, "epoch": 2.5821686746987953, "percentage": 86.06, "elapsed_time": "1:15:50", "remaining_time": "0:12:16"} +{"current_steps": 1341, "total_steps": 1557, "loss": 0.0051, "lr": 2.3214356899928036e-06, "epoch": 2.5840963855421686, "percentage": 86.13, "elapsed_time": "1:15:53", "remaining_time": "0:12:13"} +{"current_steps": 1342, "total_steps": 1557, "loss": 0.0012, "lr": 2.300508280382572e-06, "epoch": 2.5860240963855423, "percentage": 86.19, "elapsed_time": "1:15:57", "remaining_time": "0:12:10"} +{"current_steps": 1343, "total_steps": 1557, "loss": 0.0024, "lr": 2.279669869559358e-06, "epoch": 2.5879518072289156, "percentage": 86.26, "elapsed_time": "1:16:01", "remaining_time": "0:12:06"} +{"current_steps": 1344, "total_steps": 1557, "loss": 0.0024, "lr": 2.2589205623054646e-06, "epoch": 2.589879518072289, "percentage": 86.32, "elapsed_time": "1:16:04", "remaining_time": "0:12:03"} +{"current_steps": 1345, "total_steps": 1557, "loss": 0.0064, "lr": 2.238260462955142e-06, "epoch": 2.5918072289156626, "percentage": 86.38, "elapsed_time": "1:16:07", "remaining_time": "0:12:00"} +{"current_steps": 1346, "total_steps": 1557, "loss": 0.0012, "lr": 2.2176896753940637e-06, "epoch": 2.5937349397590364, "percentage": 86.45, "elapsed_time": "1:16:11", "remaining_time": "0:11:56"} +{"current_steps": 1347, "total_steps": 1557, "loss": 0.0092, "lr": 2.1972083030588244e-06, "epoch": 2.5956626506024096, "percentage": 86.51, "elapsed_time": "1:16:14", "remaining_time": "0:11:53"} +{"current_steps": 1348, "total_steps": 1557, "loss": 0.0067, "lr": 2.176816448936423e-06, "epoch": 2.597590361445783, "percentage": 86.58, "elapsed_time": "1:16:18", "remaining_time": "0:11:49"} +{"current_steps": 1349, "total_steps": 1557, "loss": 0.0059, "lr": 2.156514215563703e-06, "epoch": 2.5995180722891567, "percentage": 86.64, "elapsed_time": "1:16:21", "remaining_time": "0:11:46"} +{"current_steps": 1350, "total_steps": 1557, "loss": 0.0021, "lr": 2.1363017050268886e-06, "epoch": 2.6014457831325304, "percentage": 86.71, "elapsed_time": "1:16:25", "remaining_time": "0:11:43"} +{"current_steps": 1351, "total_steps": 1557, "loss": 0.0038, "lr": 2.1161790189610377e-06, "epoch": 2.6033734939759037, "percentage": 86.77, "elapsed_time": "1:16:28", "remaining_time": "0:11:39"} +{"current_steps": 1352, "total_steps": 1557, "loss": 0.0114, "lr": 2.0961462585495474e-06, "epoch": 2.605301204819277, "percentage": 86.83, "elapsed_time": "1:16:32", "remaining_time": "0:11:36"} +{"current_steps": 1353, "total_steps": 1557, "loss": 0.0054, "lr": 2.076203524523637e-06, "epoch": 2.6072289156626507, "percentage": 86.9, "elapsed_time": "1:16:35", "remaining_time": "0:11:32"} +{"current_steps": 1354, "total_steps": 1557, "loss": 0.007, "lr": 2.056350917161836e-06, "epoch": 2.609156626506024, "percentage": 86.96, "elapsed_time": "1:16:39", "remaining_time": "0:11:29"} +{"current_steps": 1355, "total_steps": 1557, "loss": 0.0061, "lr": 2.0365885362895053e-06, "epoch": 2.6110843373493977, "percentage": 87.03, "elapsed_time": "1:16:42", "remaining_time": "0:11:26"} +{"current_steps": 1356, "total_steps": 1557, "loss": 0.0114, "lr": 2.016916481278306e-06, "epoch": 2.613012048192771, "percentage": 87.09, "elapsed_time": "1:16:45", "remaining_time": "0:11:22"} +{"current_steps": 1357, "total_steps": 1557, "loss": 0.0057, "lr": 1.997334851045709e-06, "epoch": 2.6149397590361447, "percentage": 87.15, "elapsed_time": "1:16:49", "remaining_time": "0:11:19"} +{"current_steps": 1358, "total_steps": 1557, "loss": 0.0071, "lr": 1.9778437440545085e-06, "epoch": 2.616867469879518, "percentage": 87.22, "elapsed_time": "1:16:52", "remaining_time": "0:11:15"} +{"current_steps": 1359, "total_steps": 1557, "loss": 0.0054, "lr": 1.95844325831231e-06, "epoch": 2.6187951807228913, "percentage": 87.28, "elapsed_time": "1:16:56", "remaining_time": "0:11:12"} +{"current_steps": 1360, "total_steps": 1557, "loss": 0.0028, "lr": 1.9391334913710545e-06, "epoch": 2.620722891566265, "percentage": 87.35, "elapsed_time": "1:16:59", "remaining_time": "0:11:09"} +{"current_steps": 1361, "total_steps": 1557, "loss": 0.0048, "lr": 1.9199145403265175e-06, "epoch": 2.6226506024096388, "percentage": 87.41, "elapsed_time": "1:17:02", "remaining_time": "0:11:05"} +{"current_steps": 1362, "total_steps": 1557, "loss": 0.0072, "lr": 1.9007865018178107e-06, "epoch": 2.624578313253012, "percentage": 87.48, "elapsed_time": "1:17:06", "remaining_time": "0:11:02"} +{"current_steps": 1363, "total_steps": 1557, "loss": 0.0071, "lr": 1.8817494720269302e-06, "epoch": 2.6265060240963853, "percentage": 87.54, "elapsed_time": "1:17:09", "remaining_time": "0:10:58"} +{"current_steps": 1364, "total_steps": 1557, "loss": 0.0038, "lr": 1.8628035466782268e-06, "epoch": 2.628433734939759, "percentage": 87.6, "elapsed_time": "1:17:13", "remaining_time": "0:10:55"} +{"current_steps": 1365, "total_steps": 1557, "loss": 0.0043, "lr": 1.8439488210379687e-06, "epoch": 2.630361445783133, "percentage": 87.67, "elapsed_time": "1:17:16", "remaining_time": "0:10:52"} +{"current_steps": 1366, "total_steps": 1557, "loss": 0.0041, "lr": 1.8251853899138306e-06, "epoch": 2.632289156626506, "percentage": 87.73, "elapsed_time": "1:17:19", "remaining_time": "0:10:48"} +{"current_steps": 1367, "total_steps": 1557, "loss": 0.0034, "lr": 1.8065133476544306e-06, "epoch": 2.6342168674698794, "percentage": 87.8, "elapsed_time": "1:17:23", "remaining_time": "0:10:45"} +{"current_steps": 1368, "total_steps": 1557, "loss": 0.0141, "lr": 1.7879327881488584e-06, "epoch": 2.636144578313253, "percentage": 87.86, "elapsed_time": "1:17:26", "remaining_time": "0:10:41"} +{"current_steps": 1369, "total_steps": 1557, "loss": 0.0047, "lr": 1.769443804826194e-06, "epoch": 2.6380722891566264, "percentage": 87.93, "elapsed_time": "1:17:29", "remaining_time": "0:10:38"} +{"current_steps": 1370, "total_steps": 1557, "loss": 0.0031, "lr": 1.751046490655046e-06, "epoch": 2.64, "percentage": 87.99, "elapsed_time": "1:17:32", "remaining_time": "0:10:35"} +{"current_steps": 1371, "total_steps": 1557, "loss": 0.0019, "lr": 1.7327409381430804e-06, "epoch": 2.6419277108433734, "percentage": 88.05, "elapsed_time": "1:17:36", "remaining_time": "0:10:31"} +{"current_steps": 1372, "total_steps": 1557, "loss": 0.0035, "lr": 1.7145272393365498e-06, "epoch": 2.643855421686747, "percentage": 88.12, "elapsed_time": "1:17:39", "remaining_time": "0:10:28"} +{"current_steps": 1373, "total_steps": 1557, "loss": 0.0086, "lr": 1.6964054858198386e-06, "epoch": 2.6457831325301204, "percentage": 88.18, "elapsed_time": "1:17:43", "remaining_time": "0:10:24"} +{"current_steps": 1374, "total_steps": 1557, "loss": 0.0019, "lr": 1.6783757687150149e-06, "epoch": 2.6477108433734937, "percentage": 88.25, "elapsed_time": "1:17:46", "remaining_time": "0:10:21"} +{"current_steps": 1375, "total_steps": 1557, "loss": 0.0047, "lr": 1.6604381786813383e-06, "epoch": 2.6496385542168674, "percentage": 88.31, "elapsed_time": "1:17:49", "remaining_time": "0:10:18"} +{"current_steps": 1376, "total_steps": 1557, "loss": 0.0027, "lr": 1.6425928059148312e-06, "epoch": 2.651566265060241, "percentage": 88.38, "elapsed_time": "1:17:52", "remaining_time": "0:10:14"} +{"current_steps": 1377, "total_steps": 1557, "loss": 0.0071, "lr": 1.624839740147819e-06, "epoch": 2.6534939759036145, "percentage": 88.44, "elapsed_time": "1:17:56", "remaining_time": "0:10:11"} +{"current_steps": 1378, "total_steps": 1557, "loss": 0.0109, "lr": 1.6071790706484746e-06, "epoch": 2.6554216867469878, "percentage": 88.5, "elapsed_time": "1:17:59", "remaining_time": "0:10:07"} +{"current_steps": 1379, "total_steps": 1557, "loss": 0.0046, "lr": 1.589610886220383e-06, "epoch": 2.6573493975903615, "percentage": 88.57, "elapsed_time": "1:18:02", "remaining_time": "0:10:04"} +{"current_steps": 1380, "total_steps": 1557, "loss": 0.0138, "lr": 1.5721352752020602e-06, "epoch": 2.659277108433735, "percentage": 88.63, "elapsed_time": "1:18:06", "remaining_time": "0:10:01"} +{"current_steps": 1381, "total_steps": 1557, "loss": 0.0066, "lr": 1.5547523254665598e-06, "epoch": 2.6612048192771085, "percentage": 88.7, "elapsed_time": "1:18:15", "remaining_time": "0:09:58"} +{"current_steps": 1382, "total_steps": 1557, "loss": 0.0039, "lr": 1.5374621244209965e-06, "epoch": 2.663132530120482, "percentage": 88.76, "elapsed_time": "1:18:18", "remaining_time": "0:09:54"} +{"current_steps": 1383, "total_steps": 1557, "loss": 0.0029, "lr": 1.5202647590060983e-06, "epoch": 2.6650602409638555, "percentage": 88.82, "elapsed_time": "1:18:22", "remaining_time": "0:09:51"} +{"current_steps": 1384, "total_steps": 1557, "loss": 0.0032, "lr": 1.5031603156958064e-06, "epoch": 2.666987951807229, "percentage": 88.89, "elapsed_time": "1:18:25", "remaining_time": "0:09:48"} +{"current_steps": 1385, "total_steps": 1557, "loss": 0.024, "lr": 1.4861488804968093e-06, "epoch": 2.6689156626506025, "percentage": 88.95, "elapsed_time": "1:18:28", "remaining_time": "0:09:44"} +{"current_steps": 1386, "total_steps": 1557, "loss": 0.0047, "lr": 1.4692305389481232e-06, "epoch": 2.670843373493976, "percentage": 89.02, "elapsed_time": "1:18:31", "remaining_time": "0:09:41"} +{"current_steps": 1387, "total_steps": 1557, "loss": 0.0014, "lr": 1.452405376120658e-06, "epoch": 2.6727710843373496, "percentage": 89.08, "elapsed_time": "1:18:34", "remaining_time": "0:09:37"} +{"current_steps": 1388, "total_steps": 1557, "loss": 0.0035, "lr": 1.4356734766167925e-06, "epoch": 2.674698795180723, "percentage": 89.15, "elapsed_time": "1:18:38", "remaining_time": "0:09:34"} +{"current_steps": 1389, "total_steps": 1557, "loss": 0.0063, "lr": 1.4190349245699443e-06, "epoch": 2.676626506024096, "percentage": 89.21, "elapsed_time": "1:18:41", "remaining_time": "0:09:31"} +{"current_steps": 1390, "total_steps": 1557, "loss": 0.008, "lr": 1.402489803644156e-06, "epoch": 2.67855421686747, "percentage": 89.27, "elapsed_time": "1:18:44", "remaining_time": "0:09:27"} +{"current_steps": 1391, "total_steps": 1557, "loss": 0.0039, "lr": 1.3860381970336544e-06, "epoch": 2.6804819277108436, "percentage": 89.34, "elapsed_time": "1:18:48", "remaining_time": "0:09:24"} +{"current_steps": 1392, "total_steps": 1557, "loss": 0.0028, "lr": 1.3696801874624698e-06, "epoch": 2.682409638554217, "percentage": 89.4, "elapsed_time": "1:18:51", "remaining_time": "0:09:20"} +{"current_steps": 1393, "total_steps": 1557, "loss": 0.0029, "lr": 1.353415857183966e-06, "epoch": 2.68433734939759, "percentage": 89.47, "elapsed_time": "1:18:55", "remaining_time": "0:09:17"} +{"current_steps": 1394, "total_steps": 1557, "loss": 0.0068, "lr": 1.337245287980482e-06, "epoch": 2.686265060240964, "percentage": 89.53, "elapsed_time": "1:18:58", "remaining_time": "0:09:14"} +{"current_steps": 1395, "total_steps": 1557, "loss": 0.1645, "lr": 1.3211685611628844e-06, "epoch": 2.688192771084337, "percentage": 89.6, "elapsed_time": "1:19:02", "remaining_time": "0:09:10"} +{"current_steps": 1396, "total_steps": 1557, "loss": 0.0044, "lr": 1.3051857575701732e-06, "epoch": 2.690120481927711, "percentage": 89.66, "elapsed_time": "1:19:05", "remaining_time": "0:09:07"} +{"current_steps": 1397, "total_steps": 1557, "loss": 0.0035, "lr": 1.2892969575690685e-06, "epoch": 2.692048192771084, "percentage": 89.72, "elapsed_time": "1:19:08", "remaining_time": "0:09:03"} +{"current_steps": 1398, "total_steps": 1557, "loss": 0.0108, "lr": 1.273502241053608e-06, "epoch": 2.693975903614458, "percentage": 89.79, "elapsed_time": "1:19:11", "remaining_time": "0:09:00"} +{"current_steps": 1399, "total_steps": 1557, "loss": 0.0073, "lr": 1.2578016874447596e-06, "epoch": 2.695903614457831, "percentage": 89.85, "elapsed_time": "1:19:14", "remaining_time": "0:08:56"} +{"current_steps": 1400, "total_steps": 1557, "loss": 0.0037, "lr": 1.2421953756899985e-06, "epoch": 2.697831325301205, "percentage": 89.92, "elapsed_time": "1:19:17", "remaining_time": "0:08:53"} +{"current_steps": 1401, "total_steps": 1557, "loss": 0.0041, "lr": 1.226683384262919e-06, "epoch": 2.6997590361445782, "percentage": 89.98, "elapsed_time": "1:19:21", "remaining_time": "0:08:50"} +{"current_steps": 1402, "total_steps": 1557, "loss": 0.0101, "lr": 1.21126579116285e-06, "epoch": 2.701686746987952, "percentage": 90.04, "elapsed_time": "1:19:24", "remaining_time": "0:08:46"} +{"current_steps": 1403, "total_steps": 1557, "loss": 0.0022, "lr": 1.1959426739144497e-06, "epoch": 2.7036144578313253, "percentage": 90.11, "elapsed_time": "1:19:27", "remaining_time": "0:08:43"} +{"current_steps": 1404, "total_steps": 1557, "loss": 0.0013, "lr": 1.1807141095673291e-06, "epoch": 2.7055421686746985, "percentage": 90.17, "elapsed_time": "1:19:31", "remaining_time": "0:08:39"} +{"current_steps": 1405, "total_steps": 1557, "loss": 0.0066, "lr": 1.1655801746956463e-06, "epoch": 2.7074698795180723, "percentage": 90.24, "elapsed_time": "1:19:34", "remaining_time": "0:08:36"} +{"current_steps": 1406, "total_steps": 1557, "loss": 0.0045, "lr": 1.1505409453977334e-06, "epoch": 2.709397590361446, "percentage": 90.3, "elapsed_time": "1:19:38", "remaining_time": "0:08:33"} +{"current_steps": 1407, "total_steps": 1557, "loss": 0.0181, "lr": 1.135596497295719e-06, "epoch": 2.7113253012048193, "percentage": 90.37, "elapsed_time": "1:19:41", "remaining_time": "0:08:29"} +{"current_steps": 1408, "total_steps": 1557, "loss": 0.0042, "lr": 1.1207469055351395e-06, "epoch": 2.7132530120481926, "percentage": 90.43, "elapsed_time": "1:19:44", "remaining_time": "0:08:26"} +{"current_steps": 1409, "total_steps": 1557, "loss": 0.0059, "lr": 1.105992244784555e-06, "epoch": 2.7151807228915663, "percentage": 90.49, "elapsed_time": "1:19:47", "remaining_time": "0:08:22"} +{"current_steps": 1410, "total_steps": 1557, "loss": 0.0023, "lr": 1.0913325892351857e-06, "epoch": 2.7171084337349396, "percentage": 90.56, "elapsed_time": "1:19:51", "remaining_time": "0:08:19"} +{"current_steps": 1411, "total_steps": 1557, "loss": 0.0019, "lr": 1.0767680126005443e-06, "epoch": 2.7190361445783133, "percentage": 90.62, "elapsed_time": "1:19:54", "remaining_time": "0:08:16"} +{"current_steps": 1412, "total_steps": 1557, "loss": 0.0018, "lr": 1.0622985881160396e-06, "epoch": 2.7209638554216866, "percentage": 90.69, "elapsed_time": "1:19:57", "remaining_time": "0:08:12"} +{"current_steps": 1413, "total_steps": 1557, "loss": 0.0023, "lr": 1.0479243885386347e-06, "epoch": 2.7228915662650603, "percentage": 90.75, "elapsed_time": "1:20:01", "remaining_time": "0:08:09"} +{"current_steps": 1414, "total_steps": 1557, "loss": 0.0033, "lr": 1.0336454861464706e-06, "epoch": 2.7248192771084336, "percentage": 90.82, "elapsed_time": "1:20:04", "remaining_time": "0:08:05"} +{"current_steps": 1415, "total_steps": 1557, "loss": 0.0029, "lr": 1.0194619527385007e-06, "epoch": 2.7267469879518074, "percentage": 90.88, "elapsed_time": "1:20:07", "remaining_time": "0:08:02"} +{"current_steps": 1416, "total_steps": 1557, "loss": 0.0026, "lr": 1.0053738596341355e-06, "epoch": 2.7286746987951807, "percentage": 90.94, "elapsed_time": "1:20:11", "remaining_time": "0:07:59"} +{"current_steps": 1417, "total_steps": 1557, "loss": 0.005, "lr": 9.91381277672867e-07, "epoch": 2.7306024096385544, "percentage": 91.01, "elapsed_time": "1:20:14", "remaining_time": "0:07:55"} +{"current_steps": 1418, "total_steps": 1557, "loss": 0.0038, "lr": 9.774842772139537e-07, "epoch": 2.7325301204819277, "percentage": 91.07, "elapsed_time": "1:20:17", "remaining_time": "0:07:52"} +{"current_steps": 1419, "total_steps": 1557, "loss": 0.0034, "lr": 9.636829281360116e-07, "epoch": 2.734457831325301, "percentage": 91.14, "elapsed_time": "1:20:20", "remaining_time": "0:07:48"} +{"current_steps": 1420, "total_steps": 1557, "loss": 0.0038, "lr": 9.499772998367018e-07, "epoch": 2.7363855421686747, "percentage": 91.2, "elapsed_time": "1:20:24", "remaining_time": "0:07:45"} +{"current_steps": 1421, "total_steps": 1557, "loss": 0.002, "lr": 9.36367461232377e-07, "epoch": 2.7383132530120484, "percentage": 91.27, "elapsed_time": "1:20:27", "remaining_time": "0:07:42"} +{"current_steps": 1422, "total_steps": 1557, "loss": 0.0028, "lr": 9.22853480757715e-07, "epoch": 2.7402409638554217, "percentage": 91.33, "elapsed_time": "1:20:30", "remaining_time": "0:07:38"} +{"current_steps": 1423, "total_steps": 1557, "loss": 0.0065, "lr": 9.094354263653971e-07, "epoch": 2.742168674698795, "percentage": 91.39, "elapsed_time": "1:20:33", "remaining_time": "0:07:35"} +{"current_steps": 1424, "total_steps": 1557, "loss": 0.0031, "lr": 8.961133655257548e-07, "epoch": 2.7440963855421687, "percentage": 91.46, "elapsed_time": "1:20:37", "remaining_time": "0:07:31"} +{"current_steps": 1425, "total_steps": 1557, "loss": 0.0043, "lr": 8.828873652264303e-07, "epoch": 2.746024096385542, "percentage": 91.52, "elapsed_time": "1:20:40", "remaining_time": "0:07:28"} +{"current_steps": 1426, "total_steps": 1557, "loss": 0.004, "lr": 8.697574919720497e-07, "epoch": 2.7479518072289157, "percentage": 91.59, "elapsed_time": "1:20:44", "remaining_time": "0:07:25"} +{"current_steps": 1427, "total_steps": 1557, "loss": 0.0035, "lr": 8.567238117838683e-07, "epoch": 2.749879518072289, "percentage": 91.65, "elapsed_time": "1:20:47", "remaining_time": "0:07:21"} +{"current_steps": 1428, "total_steps": 1557, "loss": 0.0022, "lr": 8.437863901994592e-07, "epoch": 2.7518072289156628, "percentage": 91.71, "elapsed_time": "1:20:51", "remaining_time": "0:07:18"} +{"current_steps": 1429, "total_steps": 1557, "loss": 0.0042, "lr": 8.309452922723849e-07, "epoch": 2.753734939759036, "percentage": 91.78, "elapsed_time": "1:20:54", "remaining_time": "0:07:14"} +{"current_steps": 1430, "total_steps": 1557, "loss": 0.0149, "lr": 8.18200582571842e-07, "epoch": 2.75566265060241, "percentage": 91.84, "elapsed_time": "1:20:58", "remaining_time": "0:07:11"} +{"current_steps": 1431, "total_steps": 1557, "loss": 0.0029, "lr": 8.055523251823705e-07, "epoch": 2.757590361445783, "percentage": 91.91, "elapsed_time": "1:21:01", "remaining_time": "0:07:08"} +{"current_steps": 1432, "total_steps": 1557, "loss": 0.0036, "lr": 7.930005837035138e-07, "epoch": 2.759518072289157, "percentage": 91.97, "elapsed_time": "1:21:04", "remaining_time": "0:07:04"} +{"current_steps": 1433, "total_steps": 1557, "loss": 0.0066, "lr": 7.805454212494967e-07, "epoch": 2.76144578313253, "percentage": 92.04, "elapsed_time": "1:21:08", "remaining_time": "0:07:01"} +{"current_steps": 1434, "total_steps": 1557, "loss": 0.0066, "lr": 7.681869004489218e-07, "epoch": 2.7633734939759034, "percentage": 92.1, "elapsed_time": "1:21:11", "remaining_time": "0:06:57"} +{"current_steps": 1435, "total_steps": 1557, "loss": 0.0073, "lr": 7.559250834444332e-07, "epoch": 2.765301204819277, "percentage": 92.16, "elapsed_time": "1:21:15", "remaining_time": "0:06:54"} +{"current_steps": 1436, "total_steps": 1557, "loss": 0.0023, "lr": 7.437600318924332e-07, "epoch": 2.767228915662651, "percentage": 92.23, "elapsed_time": "1:21:18", "remaining_time": "0:06:51"} +{"current_steps": 1437, "total_steps": 1557, "loss": 0.003, "lr": 7.316918069627488e-07, "epoch": 2.769156626506024, "percentage": 92.29, "elapsed_time": "1:21:21", "remaining_time": "0:06:47"} +{"current_steps": 1438, "total_steps": 1557, "loss": 0.0021, "lr": 7.197204693383231e-07, "epoch": 2.7710843373493974, "percentage": 92.36, "elapsed_time": "1:21:24", "remaining_time": "0:06:44"} +{"current_steps": 1439, "total_steps": 1557, "loss": 0.0017, "lr": 7.078460792149311e-07, "epoch": 2.773012048192771, "percentage": 92.42, "elapsed_time": "1:21:28", "remaining_time": "0:06:40"} +{"current_steps": 1440, "total_steps": 1557, "loss": 0.0035, "lr": 6.960686963008556e-07, "epoch": 2.7749397590361444, "percentage": 92.49, "elapsed_time": "1:21:31", "remaining_time": "0:06:37"} +{"current_steps": 1441, "total_steps": 1557, "loss": 0.0027, "lr": 6.843883798166029e-07, "epoch": 2.776867469879518, "percentage": 92.55, "elapsed_time": "1:21:34", "remaining_time": "0:06:34"} +{"current_steps": 1442, "total_steps": 1557, "loss": 0.0029, "lr": 6.728051884945941e-07, "epoch": 2.7787951807228914, "percentage": 92.61, "elapsed_time": "1:21:37", "remaining_time": "0:06:30"} +{"current_steps": 1443, "total_steps": 1557, "loss": 0.0112, "lr": 6.613191805788699e-07, "epoch": 2.780722891566265, "percentage": 92.68, "elapsed_time": "1:21:41", "remaining_time": "0:06:27"} +{"current_steps": 1444, "total_steps": 1557, "loss": 0.0062, "lr": 6.499304138248064e-07, "epoch": 2.7826506024096385, "percentage": 92.74, "elapsed_time": "1:21:44", "remaining_time": "0:06:23"} +{"current_steps": 1445, "total_steps": 1557, "loss": 0.0021, "lr": 6.386389454988195e-07, "epoch": 2.784578313253012, "percentage": 92.81, "elapsed_time": "1:21:47", "remaining_time": "0:06:20"} +{"current_steps": 1446, "total_steps": 1557, "loss": 0.0094, "lr": 6.274448323780724e-07, "epoch": 2.7865060240963855, "percentage": 92.87, "elapsed_time": "1:21:51", "remaining_time": "0:06:16"} +{"current_steps": 1447, "total_steps": 1557, "loss": 0.0026, "lr": 6.163481307501995e-07, "epoch": 2.788433734939759, "percentage": 92.94, "elapsed_time": "1:21:54", "remaining_time": "0:06:13"} +{"current_steps": 1448, "total_steps": 1557, "loss": 0.0075, "lr": 6.053488964130183e-07, "epoch": 2.7903614457831325, "percentage": 93.0, "elapsed_time": "1:21:57", "remaining_time": "0:06:10"} +{"current_steps": 1449, "total_steps": 1557, "loss": 0.0067, "lr": 5.94447184674245e-07, "epoch": 2.792289156626506, "percentage": 93.06, "elapsed_time": "1:22:01", "remaining_time": "0:06:06"} +{"current_steps": 1450, "total_steps": 1557, "loss": 0.0106, "lr": 5.836430503512236e-07, "epoch": 2.7942168674698795, "percentage": 93.13, "elapsed_time": "1:22:04", "remaining_time": "0:06:03"} +{"current_steps": 1451, "total_steps": 1557, "loss": 0.0062, "lr": 5.729365477706505e-07, "epoch": 2.7961445783132532, "percentage": 93.19, "elapsed_time": "1:22:07", "remaining_time": "0:06:00"} +{"current_steps": 1452, "total_steps": 1557, "loss": 0.0045, "lr": 5.623277307682929e-07, "epoch": 2.7980722891566265, "percentage": 93.26, "elapsed_time": "1:22:11", "remaining_time": "0:05:56"} +{"current_steps": 1453, "total_steps": 1557, "loss": 0.0073, "lr": 5.518166526887214e-07, "epoch": 2.8, "percentage": 93.32, "elapsed_time": "1:22:15", "remaining_time": "0:05:53"} +{"current_steps": 1454, "total_steps": 1557, "loss": 0.002, "lr": 5.41403366385047e-07, "epoch": 2.8019277108433736, "percentage": 93.38, "elapsed_time": "1:22:18", "remaining_time": "0:05:49"} +{"current_steps": 1455, "total_steps": 1557, "loss": 0.0021, "lr": 5.310879242186606e-07, "epoch": 2.803855421686747, "percentage": 93.45, "elapsed_time": "1:22:21", "remaining_time": "0:05:46"} +{"current_steps": 1456, "total_steps": 1557, "loss": 0.0019, "lr": 5.208703780589419e-07, "epoch": 2.8057831325301206, "percentage": 93.51, "elapsed_time": "1:22:24", "remaining_time": "0:05:43"} +{"current_steps": 1457, "total_steps": 1557, "loss": 0.0052, "lr": 5.107507792830335e-07, "epoch": 2.807710843373494, "percentage": 93.58, "elapsed_time": "1:22:28", "remaining_time": "0:05:39"} +{"current_steps": 1458, "total_steps": 1557, "loss": 0.0023, "lr": 5.007291787755586e-07, "epoch": 2.8096385542168676, "percentage": 93.64, "elapsed_time": "1:22:31", "remaining_time": "0:05:36"} +{"current_steps": 1459, "total_steps": 1557, "loss": 0.0073, "lr": 4.908056269283789e-07, "epoch": 2.811566265060241, "percentage": 93.71, "elapsed_time": "1:22:34", "remaining_time": "0:05:32"} +{"current_steps": 1460, "total_steps": 1557, "loss": 0.0016, "lr": 4.809801736403308e-07, "epoch": 2.8134939759036146, "percentage": 93.77, "elapsed_time": "1:22:38", "remaining_time": "0:05:29"} +{"current_steps": 1461, "total_steps": 1557, "loss": 0.0035, "lr": 4.7125286831698034e-07, "epoch": 2.815421686746988, "percentage": 93.83, "elapsed_time": "1:22:41", "remaining_time": "0:05:26"} +{"current_steps": 1462, "total_steps": 1557, "loss": 0.004, "lr": 4.6162375987037766e-07, "epoch": 2.8173493975903616, "percentage": 93.9, "elapsed_time": "1:22:45", "remaining_time": "0:05:22"} +{"current_steps": 1463, "total_steps": 1557, "loss": 0.0022, "lr": 4.520928967188054e-07, "epoch": 2.819277108433735, "percentage": 93.96, "elapsed_time": "1:22:48", "remaining_time": "0:05:19"} +{"current_steps": 1464, "total_steps": 1557, "loss": 0.0042, "lr": 4.426603267865326e-07, "epoch": 2.821204819277108, "percentage": 94.03, "elapsed_time": "1:22:51", "remaining_time": "0:05:15"} +{"current_steps": 1465, "total_steps": 1557, "loss": 0.0089, "lr": 4.333260975035769e-07, "epoch": 2.823132530120482, "percentage": 94.09, "elapsed_time": "1:22:55", "remaining_time": "0:05:12"} +{"current_steps": 1466, "total_steps": 1557, "loss": 0.013, "lr": 4.240902558054827e-07, "epoch": 2.8250602409638557, "percentage": 94.16, "elapsed_time": "1:22:58", "remaining_time": "0:05:09"} +{"current_steps": 1467, "total_steps": 1557, "loss": 0.0018, "lr": 4.1495284813305003e-07, "epoch": 2.826987951807229, "percentage": 94.22, "elapsed_time": "1:23:01", "remaining_time": "0:05:05"} +{"current_steps": 1468, "total_steps": 1557, "loss": 0.0144, "lr": 4.0591392043213275e-07, "epoch": 2.8289156626506022, "percentage": 94.28, "elapsed_time": "1:23:05", "remaining_time": "0:05:02"} +{"current_steps": 1469, "total_steps": 1557, "loss": 0.0028, "lr": 3.969735181533918e-07, "epoch": 2.830843373493976, "percentage": 94.35, "elapsed_time": "1:23:08", "remaining_time": "0:04:58"} +{"current_steps": 1470, "total_steps": 1557, "loss": 0.0042, "lr": 3.881316862520712e-07, "epoch": 2.8327710843373493, "percentage": 94.41, "elapsed_time": "1:23:11", "remaining_time": "0:04:55"} +{"current_steps": 1471, "total_steps": 1557, "loss": 0.0047, "lr": 3.7938846918776917e-07, "epoch": 2.834698795180723, "percentage": 94.48, "elapsed_time": "1:23:15", "remaining_time": "0:04:52"} +{"current_steps": 1472, "total_steps": 1557, "loss": 0.0061, "lr": 3.707439109242139e-07, "epoch": 2.8366265060240963, "percentage": 94.54, "elapsed_time": "1:23:18", "remaining_time": "0:04:48"} +{"current_steps": 1473, "total_steps": 1557, "loss": 0.0029, "lr": 3.6219805492905934e-07, "epoch": 2.83855421686747, "percentage": 94.61, "elapsed_time": "1:23:27", "remaining_time": "0:04:45"} +{"current_steps": 1474, "total_steps": 1557, "loss": 0.0044, "lr": 3.53750944173632e-07, "epoch": 2.8404819277108433, "percentage": 94.67, "elapsed_time": "1:23:31", "remaining_time": "0:04:42"} +{"current_steps": 1475, "total_steps": 1557, "loss": 0.0059, "lr": 3.45402621132751e-07, "epoch": 2.842409638554217, "percentage": 94.73, "elapsed_time": "1:23:35", "remaining_time": "0:04:38"} +{"current_steps": 1476, "total_steps": 1557, "loss": 0.005, "lr": 3.3715312778449305e-07, "epoch": 2.8443373493975903, "percentage": 94.8, "elapsed_time": "1:23:38", "remaining_time": "0:04:35"} +{"current_steps": 1477, "total_steps": 1557, "loss": 0.004, "lr": 3.2900250560998546e-07, "epoch": 2.846265060240964, "percentage": 94.86, "elapsed_time": "1:23:41", "remaining_time": "0:04:31"} +{"current_steps": 1478, "total_steps": 1557, "loss": 0.0076, "lr": 3.209507955932001e-07, "epoch": 2.8481927710843373, "percentage": 94.93, "elapsed_time": "1:23:44", "remaining_time": "0:04:28"} +{"current_steps": 1479, "total_steps": 1557, "loss": 0.0092, "lr": 3.129980382207509e-07, "epoch": 2.8501204819277106, "percentage": 94.99, "elapsed_time": "1:23:48", "remaining_time": "0:04:25"} +{"current_steps": 1480, "total_steps": 1557, "loss": 0.0058, "lr": 3.05144273481679e-07, "epoch": 2.8520481927710843, "percentage": 95.05, "elapsed_time": "1:23:51", "remaining_time": "0:04:21"} +{"current_steps": 1481, "total_steps": 1557, "loss": 0.014, "lr": 2.9738954086726334e-07, "epoch": 2.853975903614458, "percentage": 95.12, "elapsed_time": "1:23:54", "remaining_time": "0:04:18"} +{"current_steps": 1482, "total_steps": 1557, "loss": 0.0047, "lr": 2.8973387937081485e-07, "epoch": 2.8559036144578314, "percentage": 95.18, "elapsed_time": "1:23:58", "remaining_time": "0:04:14"} +{"current_steps": 1483, "total_steps": 1557, "loss": 0.0028, "lr": 2.821773274874828e-07, "epoch": 2.8578313253012047, "percentage": 95.25, "elapsed_time": "1:24:01", "remaining_time": "0:04:11"} +{"current_steps": 1484, "total_steps": 1557, "loss": 0.0168, "lr": 2.7471992321406624e-07, "epoch": 2.8597590361445784, "percentage": 95.31, "elapsed_time": "1:24:05", "remaining_time": "0:04:08"} +{"current_steps": 1485, "total_steps": 1557, "loss": 0.0017, "lr": 2.6736170404880744e-07, "epoch": 2.8616867469879517, "percentage": 95.38, "elapsed_time": "1:24:08", "remaining_time": "0:04:04"} +{"current_steps": 1486, "total_steps": 1557, "loss": 0.0045, "lr": 2.6010270699122096e-07, "epoch": 2.8636144578313254, "percentage": 95.44, "elapsed_time": "1:24:11", "remaining_time": "0:04:01"} +{"current_steps": 1487, "total_steps": 1557, "loss": 0.007, "lr": 2.529429685419027e-07, "epoch": 2.8655421686746987, "percentage": 95.5, "elapsed_time": "1:24:14", "remaining_time": "0:03:57"} +{"current_steps": 1488, "total_steps": 1557, "loss": 0.0112, "lr": 2.458825247023389e-07, "epoch": 2.8674698795180724, "percentage": 95.57, "elapsed_time": "1:24:18", "remaining_time": "0:03:54"} +{"current_steps": 1489, "total_steps": 1557, "loss": 0.0103, "lr": 2.3892141097473063e-07, "epoch": 2.8693975903614457, "percentage": 95.63, "elapsed_time": "1:24:21", "remaining_time": "0:03:51"} +{"current_steps": 1490, "total_steps": 1557, "loss": 0.0195, "lr": 2.3205966236181433e-07, "epoch": 2.8713253012048194, "percentage": 95.7, "elapsed_time": "1:24:25", "remaining_time": "0:03:47"} +{"current_steps": 1491, "total_steps": 1557, "loss": 0.0034, "lr": 2.252973133666947e-07, "epoch": 2.8732530120481927, "percentage": 95.76, "elapsed_time": "1:24:28", "remaining_time": "0:03:44"} +{"current_steps": 1492, "total_steps": 1557, "loss": 0.0063, "lr": 2.1863439799265195e-07, "epoch": 2.8751807228915665, "percentage": 95.83, "elapsed_time": "1:24:31", "remaining_time": "0:03:40"} +{"current_steps": 1493, "total_steps": 1557, "loss": 0.0049, "lr": 2.1207094974298847e-07, "epoch": 2.8771084337349397, "percentage": 95.89, "elapsed_time": "1:24:35", "remaining_time": "0:03:37"} +{"current_steps": 1494, "total_steps": 1557, "loss": 0.0021, "lr": 2.056070016208489e-07, "epoch": 2.879036144578313, "percentage": 95.95, "elapsed_time": "1:24:38", "remaining_time": "0:03:34"} +{"current_steps": 1495, "total_steps": 1557, "loss": 0.0052, "lr": 1.9924258612906256e-07, "epoch": 2.8809638554216868, "percentage": 96.02, "elapsed_time": "1:24:41", "remaining_time": "0:03:30"} +{"current_steps": 1496, "total_steps": 1557, "loss": 0.0065, "lr": 1.929777352699791e-07, "epoch": 2.8828915662650605, "percentage": 96.08, "elapsed_time": "1:24:45", "remaining_time": "0:03:27"} +{"current_steps": 1497, "total_steps": 1557, "loss": 0.0334, "lr": 1.8681248054529754e-07, "epoch": 2.8848192771084338, "percentage": 96.15, "elapsed_time": "1:24:48", "remaining_time": "0:03:23"} +{"current_steps": 1498, "total_steps": 1557, "loss": 0.0034, "lr": 1.8074685295591754e-07, "epoch": 2.886746987951807, "percentage": 96.21, "elapsed_time": "1:24:51", "remaining_time": "0:03:20"} +{"current_steps": 1499, "total_steps": 1557, "loss": 0.0038, "lr": 1.7478088300178608e-07, "epoch": 2.888674698795181, "percentage": 96.27, "elapsed_time": "1:24:54", "remaining_time": "0:03:17"} +{"current_steps": 1500, "total_steps": 1557, "loss": 0.0042, "lr": 1.6891460068173548e-07, "epoch": 2.890602409638554, "percentage": 96.34, "elapsed_time": "1:24:57", "remaining_time": "0:03:13"} +{"current_steps": 1501, "total_steps": 1557, "loss": 0.0016, "lr": 1.631480354933346e-07, "epoch": 2.892530120481928, "percentage": 96.4, "elapsed_time": "1:25:01", "remaining_time": "0:03:10"} +{"current_steps": 1502, "total_steps": 1557, "loss": 0.0062, "lr": 1.5748121643274661e-07, "epoch": 2.894457831325301, "percentage": 96.47, "elapsed_time": "1:25:04", "remaining_time": "0:03:06"} +{"current_steps": 1503, "total_steps": 1557, "loss": 0.0025, "lr": 1.519141719945738e-07, "epoch": 2.896385542168675, "percentage": 96.53, "elapsed_time": "1:25:08", "remaining_time": "0:03:03"} +{"current_steps": 1504, "total_steps": 1557, "loss": 0.0045, "lr": 1.4644693017172418e-07, "epoch": 2.898313253012048, "percentage": 96.6, "elapsed_time": "1:25:11", "remaining_time": "0:03:00"} +{"current_steps": 1505, "total_steps": 1557, "loss": 0.0059, "lr": 1.4107951845526267e-07, "epoch": 2.900240963855422, "percentage": 96.66, "elapsed_time": "1:25:14", "remaining_time": "0:02:56"} +{"current_steps": 1506, "total_steps": 1557, "loss": 0.0021, "lr": 1.3581196383427586e-07, "epoch": 2.902168674698795, "percentage": 96.72, "elapsed_time": "1:25:18", "remaining_time": "0:02:53"} +{"current_steps": 1507, "total_steps": 1557, "loss": 0.0036, "lr": 1.3064429279573853e-07, "epoch": 2.904096385542169, "percentage": 96.79, "elapsed_time": "1:25:21", "remaining_time": "0:02:49"} +{"current_steps": 1508, "total_steps": 1557, "loss": 0.001, "lr": 1.255765313243762e-07, "epoch": 2.906024096385542, "percentage": 96.85, "elapsed_time": "1:25:25", "remaining_time": "0:02:46"} +{"current_steps": 1509, "total_steps": 1557, "loss": 0.008, "lr": 1.206087049025384e-07, "epoch": 2.9079518072289154, "percentage": 96.92, "elapsed_time": "1:25:28", "remaining_time": "0:02:43"} +{"current_steps": 1510, "total_steps": 1557, "loss": 0.0086, "lr": 1.1574083851007e-07, "epoch": 2.909879518072289, "percentage": 96.98, "elapsed_time": "1:25:32", "remaining_time": "0:02:39"} +{"current_steps": 1511, "total_steps": 1557, "loss": 0.0023, "lr": 1.1097295662418018e-07, "epoch": 2.911807228915663, "percentage": 97.05, "elapsed_time": "1:25:35", "remaining_time": "0:02:36"} +{"current_steps": 1512, "total_steps": 1557, "loss": 0.0029, "lr": 1.0630508321932687e-07, "epoch": 2.913734939759036, "percentage": 97.11, "elapsed_time": "1:25:38", "remaining_time": "0:02:32"} +{"current_steps": 1513, "total_steps": 1557, "loss": 0.003, "lr": 1.0173724176709254e-07, "epoch": 2.9156626506024095, "percentage": 97.17, "elapsed_time": "1:25:41", "remaining_time": "0:02:29"} +{"current_steps": 1514, "total_steps": 1557, "loss": 0.0013, "lr": 9.726945523606646e-08, "epoch": 2.917590361445783, "percentage": 97.24, "elapsed_time": "1:25:44", "remaining_time": "0:02:26"} +{"current_steps": 1515, "total_steps": 1557, "loss": 0.0204, "lr": 9.290174609172697e-08, "epoch": 2.9195180722891565, "percentage": 97.3, "elapsed_time": "1:25:48", "remaining_time": "0:02:22"} +{"current_steps": 1516, "total_steps": 1557, "loss": 0.0026, "lr": 8.863413629633277e-08, "epoch": 2.9214457831325302, "percentage": 97.37, "elapsed_time": "1:25:51", "remaining_time": "0:02:19"} +{"current_steps": 1517, "total_steps": 1557, "loss": 0.0038, "lr": 8.446664730881182e-08, "epoch": 2.9233734939759035, "percentage": 97.43, "elapsed_time": "1:25:55", "remaining_time": "0:02:15"} +{"current_steps": 1518, "total_steps": 1557, "loss": 0.0094, "lr": 8.039930008465257e-08, "epoch": 2.9253012048192772, "percentage": 97.5, "elapsed_time": "1:25:58", "remaining_time": "0:02:12"} +{"current_steps": 1519, "total_steps": 1557, "loss": 0.0062, "lr": 7.643211507579296e-08, "epoch": 2.9272289156626505, "percentage": 97.56, "elapsed_time": "1:26:02", "remaining_time": "0:02:09"} +{"current_steps": 1520, "total_steps": 1557, "loss": 0.0024, "lr": 7.25651122305293e-08, "epoch": 2.929156626506024, "percentage": 97.62, "elapsed_time": "1:26:05", "remaining_time": "0:02:05"} +{"current_steps": 1521, "total_steps": 1557, "loss": 0.0056, "lr": 6.87983109934054e-08, "epoch": 2.9310843373493976, "percentage": 97.69, "elapsed_time": "1:26:08", "remaining_time": "0:02:02"} +{"current_steps": 1522, "total_steps": 1557, "loss": 0.0047, "lr": 6.51317303051191e-08, "epoch": 2.9330120481927713, "percentage": 97.75, "elapsed_time": "1:26:11", "remaining_time": "0:01:58"} +{"current_steps": 1523, "total_steps": 1557, "loss": 0.0111, "lr": 6.156538860242922e-08, "epoch": 2.9349397590361446, "percentage": 97.82, "elapsed_time": "1:26:15", "remaining_time": "0:01:55"} +{"current_steps": 1524, "total_steps": 1557, "loss": 0.0033, "lr": 5.809930381805773e-08, "epoch": 2.936867469879518, "percentage": 97.88, "elapsed_time": "1:26:18", "remaining_time": "0:01:52"} +{"current_steps": 1525, "total_steps": 1557, "loss": 0.0028, "lr": 5.4733493380603183e-08, "epoch": 2.9387951807228916, "percentage": 97.94, "elapsed_time": "1:26:22", "remaining_time": "0:01:48"} +{"current_steps": 1526, "total_steps": 1557, "loss": 0.0037, "lr": 5.1467974214456374e-08, "epoch": 2.9407228915662653, "percentage": 98.01, "elapsed_time": "1:26:25", "remaining_time": "0:01:45"} +{"current_steps": 1527, "total_steps": 1557, "loss": 0.003, "lr": 4.830276273970258e-08, "epoch": 2.9426506024096386, "percentage": 98.07, "elapsed_time": "1:26:28", "remaining_time": "0:01:41"} +{"current_steps": 1528, "total_steps": 1557, "loss": 0.0032, "lr": 4.5237874872052776e-08, "epoch": 2.944578313253012, "percentage": 98.14, "elapsed_time": "1:26:31", "remaining_time": "0:01:38"} +{"current_steps": 1529, "total_steps": 1557, "loss": 0.0105, "lr": 4.227332602275924e-08, "epoch": 2.9465060240963856, "percentage": 98.2, "elapsed_time": "1:26:35", "remaining_time": "0:01:35"} +{"current_steps": 1530, "total_steps": 1557, "loss": 0.0055, "lr": 3.940913109853561e-08, "epoch": 2.948433734939759, "percentage": 98.27, "elapsed_time": "1:26:38", "remaining_time": "0:01:31"} +{"current_steps": 1531, "total_steps": 1557, "loss": 0.0044, "lr": 3.66453045014814e-08, "epoch": 2.9503614457831326, "percentage": 98.33, "elapsed_time": "1:26:42", "remaining_time": "0:01:28"} +{"current_steps": 1532, "total_steps": 1557, "loss": 0.0042, "lr": 3.398186012901539e-08, "epoch": 2.952289156626506, "percentage": 98.39, "elapsed_time": "1:26:45", "remaining_time": "0:01:24"} +{"current_steps": 1533, "total_steps": 1557, "loss": 0.0073, "lr": 3.141881137379788e-08, "epoch": 2.9542168674698797, "percentage": 98.46, "elapsed_time": "1:26:49", "remaining_time": "0:01:21"} +{"current_steps": 1534, "total_steps": 1557, "loss": 0.0055, "lr": 2.8956171123670774e-08, "epoch": 2.956144578313253, "percentage": 98.52, "elapsed_time": "1:26:52", "remaining_time": "0:01:18"} +{"current_steps": 1535, "total_steps": 1557, "loss": 0.0016, "lr": 2.6593951761588744e-08, "epoch": 2.9580722891566262, "percentage": 98.59, "elapsed_time": "1:26:55", "remaining_time": "0:01:14"} +{"current_steps": 1536, "total_steps": 1557, "loss": 0.0026, "lr": 2.4332165165557032e-08, "epoch": 2.96, "percentage": 98.65, "elapsed_time": "1:26:58", "remaining_time": "0:01:11"} +{"current_steps": 1537, "total_steps": 1557, "loss": 0.0036, "lr": 2.2170822708573736e-08, "epoch": 2.9619277108433737, "percentage": 98.72, "elapsed_time": "1:27:02", "remaining_time": "0:01:07"} +{"current_steps": 1538, "total_steps": 1557, "loss": 0.0063, "lr": 2.0109935258565415e-08, "epoch": 2.963855421686747, "percentage": 98.78, "elapsed_time": "1:27:05", "remaining_time": "0:01:04"} +{"current_steps": 1539, "total_steps": 1557, "loss": 0.0081, "lr": 1.8149513178347122e-08, "epoch": 2.9657831325301203, "percentage": 98.84, "elapsed_time": "1:27:08", "remaining_time": "0:01:01"} +{"current_steps": 1540, "total_steps": 1557, "loss": 0.006, "lr": 1.6289566325555783e-08, "epoch": 2.967710843373494, "percentage": 98.91, "elapsed_time": "1:27:12", "remaining_time": "0:00:57"} +{"current_steps": 1541, "total_steps": 1557, "loss": 0.0021, "lr": 1.4530104052610239e-08, "epoch": 2.9696385542168677, "percentage": 98.97, "elapsed_time": "1:27:15", "remaining_time": "0:00:54"} +{"current_steps": 1542, "total_steps": 1557, "loss": 0.0016, "lr": 1.2871135206651287e-08, "epoch": 2.971566265060241, "percentage": 99.04, "elapsed_time": "1:27:19", "remaining_time": "0:00:50"} +{"current_steps": 1543, "total_steps": 1557, "loss": 0.0023, "lr": 1.1312668129519477e-08, "epoch": 2.9734939759036143, "percentage": 99.1, "elapsed_time": "1:27:22", "remaining_time": "0:00:47"} +{"current_steps": 1544, "total_steps": 1557, "loss": 0.0025, "lr": 9.854710657688504e-09, "epoch": 2.975421686746988, "percentage": 99.17, "elapsed_time": "1:27:25", "remaining_time": "0:00:44"} +{"current_steps": 1545, "total_steps": 1557, "loss": 0.0038, "lr": 8.497270122242996e-09, "epoch": 2.9773493975903613, "percentage": 99.23, "elapsed_time": "1:27:28", "remaining_time": "0:00:40"} +{"current_steps": 1546, "total_steps": 1557, "loss": 0.0027, "lr": 7.240353348834106e-09, "epoch": 2.979277108433735, "percentage": 99.29, "elapsed_time": "1:27:31", "remaining_time": "0:00:37"} +{"current_steps": 1547, "total_steps": 1557, "loss": 0.003, "lr": 6.083966657646212e-09, "epoch": 2.9812048192771083, "percentage": 99.36, "elapsed_time": "1:27:34", "remaining_time": "0:00:33"} +{"current_steps": 1548, "total_steps": 1557, "loss": 0.0021, "lr": 5.028115863370265e-09, "epoch": 2.983132530120482, "percentage": 99.42, "elapsed_time": "1:27:37", "remaining_time": "0:00:30"} +{"current_steps": 1549, "total_steps": 1557, "loss": 0.0039, "lr": 4.072806275163821e-09, "epoch": 2.9850602409638554, "percentage": 99.49, "elapsed_time": "1:27:40", "remaining_time": "0:00:27"} +{"current_steps": 1550, "total_steps": 1557, "loss": 0.0048, "lr": 3.2180426966332833e-09, "epoch": 2.9869879518072286, "percentage": 99.55, "elapsed_time": "1:27:44", "remaining_time": "0:00:23"} +{"current_steps": 1551, "total_steps": 1557, "loss": 0.0032, "lr": 2.4638294258072513e-09, "epoch": 2.9889156626506024, "percentage": 99.61, "elapsed_time": "1:27:47", "remaining_time": "0:00:20"} +{"current_steps": 1552, "total_steps": 1557, "loss": 0.0038, "lr": 1.810170255116539e-09, "epoch": 2.990843373493976, "percentage": 99.68, "elapsed_time": "1:27:50", "remaining_time": "0:00:16"} +{"current_steps": 1553, "total_steps": 1557, "loss": 0.0247, "lr": 1.2570684713719695e-09, "epoch": 2.9927710843373494, "percentage": 99.74, "elapsed_time": "1:27:54", "remaining_time": "0:00:13"} +{"current_steps": 1554, "total_steps": 1557, "loss": 0.0029, "lr": 8.045268557443919e-10, "epoch": 2.9946987951807227, "percentage": 99.81, "elapsed_time": "1:27:56", "remaining_time": "0:00:10"} +{"current_steps": 1555, "total_steps": 1557, "loss": 0.0119, "lr": 4.5254768376468137e-10, "epoch": 2.9966265060240964, "percentage": 99.87, "elapsed_time": "1:28:00", "remaining_time": "0:00:06"} +{"current_steps": 1556, "total_steps": 1557, "loss": 0.0038, "lr": 2.011327252948725e-10, "epoch": 2.99855421686747, "percentage": 99.94, "elapsed_time": "1:28:03", "remaining_time": "0:00:03"} +{"current_steps": 1557, "total_steps": 1557, "loss": 0.0016, "lr": 5.028324453482114e-11, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "1:28:06", "remaining_time": "0:00:00"} +{"current_steps": 1557, "total_steps": 1557, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "1:28:12", "remaining_time": "0:00:00"} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..6751b51 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,10942 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1557, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0019277108433734939, + "grad_norm": 2.8518834114074707, + "learning_rate": 0.0, + "loss": 0.0891, + "step": 1 + }, + { + "epoch": 0.0038554216867469878, + "grad_norm": 1.8441249132156372, + "learning_rate": 2.564102564102564e-07, + "loss": 0.0539, + "step": 2 + }, + { + "epoch": 0.005783132530120482, + "grad_norm": 2.8263237476348877, + "learning_rate": 5.128205128205128e-07, + "loss": 0.099, + "step": 3 + }, + { + "epoch": 0.0077108433734939755, + "grad_norm": 2.5051236152648926, + "learning_rate": 7.692307692307694e-07, + "loss": 0.0789, + "step": 4 + }, + { + "epoch": 0.00963855421686747, + "grad_norm": 2.6903438568115234, + "learning_rate": 1.0256410256410257e-06, + "loss": 0.0881, + "step": 5 + }, + { + "epoch": 0.011566265060240964, + "grad_norm": 2.6205761432647705, + "learning_rate": 1.282051282051282e-06, + "loss": 0.0776, + "step": 6 + }, + { + "epoch": 0.013493975903614458, + "grad_norm": 2.6309337615966797, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.0827, + "step": 7 + }, + { + "epoch": 0.015421686746987951, + "grad_norm": 1.5427855253219604, + "learning_rate": 1.794871794871795e-06, + "loss": 0.0577, + "step": 8 + }, + { + "epoch": 0.017349397590361446, + "grad_norm": 1.0973446369171143, + "learning_rate": 2.0512820512820513e-06, + "loss": 0.04, + "step": 9 + }, + { + "epoch": 0.01927710843373494, + "grad_norm": 1.3253350257873535, + "learning_rate": 2.307692307692308e-06, + "loss": 0.0506, + "step": 10 + }, + { + "epoch": 0.021204819277108433, + "grad_norm": 1.588739037513733, + "learning_rate": 2.564102564102564e-06, + "loss": 0.0874, + "step": 11 + }, + { + "epoch": 0.02313253012048193, + "grad_norm": 1.4987014532089233, + "learning_rate": 2.8205128205128207e-06, + "loss": 0.0597, + "step": 12 + }, + { + "epoch": 0.02506024096385542, + "grad_norm": 1.6571592092514038, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.0559, + "step": 13 + }, + { + "epoch": 0.026987951807228915, + "grad_norm": 1.8860628604888916, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0688, + "step": 14 + }, + { + "epoch": 0.02891566265060241, + "grad_norm": 1.3202295303344727, + "learning_rate": 3.58974358974359e-06, + "loss": 0.0433, + "step": 15 + }, + { + "epoch": 0.030843373493975902, + "grad_norm": 1.5870612859725952, + "learning_rate": 3.846153846153847e-06, + "loss": 0.0695, + "step": 16 + }, + { + "epoch": 0.0327710843373494, + "grad_norm": 0.9192284345626831, + "learning_rate": 4.102564102564103e-06, + "loss": 0.0392, + "step": 17 + }, + { + "epoch": 0.03469879518072289, + "grad_norm": 0.7950155735015869, + "learning_rate": 4.358974358974359e-06, + "loss": 0.0351, + "step": 18 + }, + { + "epoch": 0.03662650602409639, + "grad_norm": 0.8854314684867859, + "learning_rate": 4.615384615384616e-06, + "loss": 0.0356, + "step": 19 + }, + { + "epoch": 0.03855421686746988, + "grad_norm": 0.9546788930892944, + "learning_rate": 4.871794871794872e-06, + "loss": 0.0427, + "step": 20 + }, + { + "epoch": 0.04048192771084337, + "grad_norm": 0.6315903663635254, + "learning_rate": 5.128205128205128e-06, + "loss": 0.0397, + "step": 21 + }, + { + "epoch": 0.042409638554216866, + "grad_norm": 0.9230924844741821, + "learning_rate": 5.384615384615385e-06, + "loss": 0.0481, + "step": 22 + }, + { + "epoch": 0.04433734939759036, + "grad_norm": 0.711546003818512, + "learning_rate": 5.641025641025641e-06, + "loss": 0.0479, + "step": 23 + }, + { + "epoch": 0.04626506024096386, + "grad_norm": 0.5288046598434448, + "learning_rate": 5.897435897435898e-06, + "loss": 0.0182, + "step": 24 + }, + { + "epoch": 0.04819277108433735, + "grad_norm": 0.9420496225357056, + "learning_rate": 6.153846153846155e-06, + "loss": 0.0389, + "step": 25 + }, + { + "epoch": 0.05012048192771084, + "grad_norm": 0.5001983046531677, + "learning_rate": 6.410256410256412e-06, + "loss": 0.0268, + "step": 26 + }, + { + "epoch": 0.052048192771084335, + "grad_norm": 0.8084653615951538, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0367, + "step": 27 + }, + { + "epoch": 0.05397590361445783, + "grad_norm": 0.7195103764533997, + "learning_rate": 6.923076923076923e-06, + "loss": 0.0251, + "step": 28 + }, + { + "epoch": 0.055903614457831326, + "grad_norm": 0.529958963394165, + "learning_rate": 7.17948717948718e-06, + "loss": 0.0289, + "step": 29 + }, + { + "epoch": 0.05783132530120482, + "grad_norm": 0.795376181602478, + "learning_rate": 7.435897435897437e-06, + "loss": 0.043, + "step": 30 + }, + { + "epoch": 0.059759036144578316, + "grad_norm": 0.6366249918937683, + "learning_rate": 7.692307692307694e-06, + "loss": 0.029, + "step": 31 + }, + { + "epoch": 0.061686746987951804, + "grad_norm": 0.5414115190505981, + "learning_rate": 7.948717948717949e-06, + "loss": 0.0365, + "step": 32 + }, + { + "epoch": 0.0636144578313253, + "grad_norm": 0.9350972175598145, + "learning_rate": 8.205128205128205e-06, + "loss": 0.0283, + "step": 33 + }, + { + "epoch": 0.0655421686746988, + "grad_norm": 0.5660741925239563, + "learning_rate": 8.461538461538462e-06, + "loss": 0.0234, + "step": 34 + }, + { + "epoch": 0.06746987951807229, + "grad_norm": 0.5623988509178162, + "learning_rate": 8.717948717948719e-06, + "loss": 0.0307, + "step": 35 + }, + { + "epoch": 0.06939759036144579, + "grad_norm": 0.5260195732116699, + "learning_rate": 8.974358974358976e-06, + "loss": 0.0264, + "step": 36 + }, + { + "epoch": 0.07132530120481928, + "grad_norm": 0.4934785068035126, + "learning_rate": 9.230769230769232e-06, + "loss": 0.0224, + "step": 37 + }, + { + "epoch": 0.07325301204819278, + "grad_norm": 0.4797322154045105, + "learning_rate": 9.487179487179487e-06, + "loss": 0.0163, + "step": 38 + }, + { + "epoch": 0.07518072289156627, + "grad_norm": 0.4739217460155487, + "learning_rate": 9.743589743589744e-06, + "loss": 0.0165, + "step": 39 + }, + { + "epoch": 0.07710843373493977, + "grad_norm": 0.4527677595615387, + "learning_rate": 1e-05, + "loss": 0.0163, + "step": 40 + }, + { + "epoch": 0.07903614457831325, + "grad_norm": 0.6241316795349121, + "learning_rate": 1.0256410256410256e-05, + "loss": 0.0302, + "step": 41 + }, + { + "epoch": 0.08096385542168674, + "grad_norm": 0.639043927192688, + "learning_rate": 1.0512820512820514e-05, + "loss": 0.0312, + "step": 42 + }, + { + "epoch": 0.08289156626506024, + "grad_norm": 0.5121409296989441, + "learning_rate": 1.076923076923077e-05, + "loss": 0.0256, + "step": 43 + }, + { + "epoch": 0.08481927710843373, + "grad_norm": 0.6340477466583252, + "learning_rate": 1.1025641025641028e-05, + "loss": 0.04, + "step": 44 + }, + { + "epoch": 0.08674698795180723, + "grad_norm": 0.5260409712791443, + "learning_rate": 1.1282051282051283e-05, + "loss": 0.0282, + "step": 45 + }, + { + "epoch": 0.08867469879518072, + "grad_norm": 0.6390711069107056, + "learning_rate": 1.1538461538461538e-05, + "loss": 0.0243, + "step": 46 + }, + { + "epoch": 0.09060240963855422, + "grad_norm": 0.46469295024871826, + "learning_rate": 1.1794871794871796e-05, + "loss": 0.0208, + "step": 47 + }, + { + "epoch": 0.09253012048192771, + "grad_norm": 0.8711516857147217, + "learning_rate": 1.2051282051282051e-05, + "loss": 0.0291, + "step": 48 + }, + { + "epoch": 0.09445783132530121, + "grad_norm": 0.9164300560951233, + "learning_rate": 1.230769230769231e-05, + "loss": 0.0342, + "step": 49 + }, + { + "epoch": 0.0963855421686747, + "grad_norm": 0.5401139259338379, + "learning_rate": 1.2564102564102565e-05, + "loss": 0.0185, + "step": 50 + }, + { + "epoch": 0.0983132530120482, + "grad_norm": 0.44393008947372437, + "learning_rate": 1.2820512820512823e-05, + "loss": 0.0228, + "step": 51 + }, + { + "epoch": 0.10024096385542168, + "grad_norm": 0.3855767846107483, + "learning_rate": 1.3076923076923078e-05, + "loss": 0.0176, + "step": 52 + }, + { + "epoch": 0.10216867469879518, + "grad_norm": 0.8561235070228577, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.0433, + "step": 53 + }, + { + "epoch": 0.10409638554216867, + "grad_norm": 0.768002450466156, + "learning_rate": 1.3589743589743592e-05, + "loss": 0.0245, + "step": 54 + }, + { + "epoch": 0.10602409638554217, + "grad_norm": 0.4559759497642517, + "learning_rate": 1.3846153846153847e-05, + "loss": 0.0224, + "step": 55 + }, + { + "epoch": 0.10795180722891566, + "grad_norm": 0.6203847527503967, + "learning_rate": 1.4102564102564105e-05, + "loss": 0.0296, + "step": 56 + }, + { + "epoch": 0.10987951807228916, + "grad_norm": 0.6651368141174316, + "learning_rate": 1.435897435897436e-05, + "loss": 0.0336, + "step": 57 + }, + { + "epoch": 0.11180722891566265, + "grad_norm": 0.377734512090683, + "learning_rate": 1.4615384615384615e-05, + "loss": 0.0196, + "step": 58 + }, + { + "epoch": 0.11373493975903615, + "grad_norm": 0.687568724155426, + "learning_rate": 1.4871794871794874e-05, + "loss": 0.0207, + "step": 59 + }, + { + "epoch": 0.11566265060240964, + "grad_norm": 0.7905604243278503, + "learning_rate": 1.5128205128205129e-05, + "loss": 0.047, + "step": 60 + }, + { + "epoch": 0.11759036144578314, + "grad_norm": 0.7938196063041687, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.0198, + "step": 61 + }, + { + "epoch": 0.11951807228915663, + "grad_norm": 0.41340553760528564, + "learning_rate": 1.5641025641025644e-05, + "loss": 0.0161, + "step": 62 + }, + { + "epoch": 0.12144578313253013, + "grad_norm": 0.5668172240257263, + "learning_rate": 1.5897435897435897e-05, + "loss": 0.0275, + "step": 63 + }, + { + "epoch": 0.12337349397590361, + "grad_norm": 0.48333367705345154, + "learning_rate": 1.6153846153846154e-05, + "loss": 0.0137, + "step": 64 + }, + { + "epoch": 0.12530120481927712, + "grad_norm": 0.6843933463096619, + "learning_rate": 1.641025641025641e-05, + "loss": 0.0294, + "step": 65 + }, + { + "epoch": 0.1272289156626506, + "grad_norm": 0.7789272665977478, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.0401, + "step": 66 + }, + { + "epoch": 0.1291566265060241, + "grad_norm": 0.6203492879867554, + "learning_rate": 1.6923076923076924e-05, + "loss": 0.0292, + "step": 67 + }, + { + "epoch": 0.1310843373493976, + "grad_norm": 0.5940662622451782, + "learning_rate": 1.717948717948718e-05, + "loss": 0.0178, + "step": 68 + }, + { + "epoch": 0.13301204819277107, + "grad_norm": 0.35504868626594543, + "learning_rate": 1.7435897435897438e-05, + "loss": 0.0129, + "step": 69 + }, + { + "epoch": 0.13493975903614458, + "grad_norm": 0.8796699643135071, + "learning_rate": 1.7692307692307694e-05, + "loss": 0.034, + "step": 70 + }, + { + "epoch": 0.13686746987951806, + "grad_norm": 0.967444896697998, + "learning_rate": 1.794871794871795e-05, + "loss": 0.0266, + "step": 71 + }, + { + "epoch": 0.13879518072289157, + "grad_norm": 0.4428526759147644, + "learning_rate": 1.8205128205128208e-05, + "loss": 0.0223, + "step": 72 + }, + { + "epoch": 0.14072289156626505, + "grad_norm": 0.42897751927375793, + "learning_rate": 1.8461538461538465e-05, + "loss": 0.0187, + "step": 73 + }, + { + "epoch": 0.14265060240963856, + "grad_norm": 0.5100914835929871, + "learning_rate": 1.8717948717948718e-05, + "loss": 0.0164, + "step": 74 + }, + { + "epoch": 0.14457831325301204, + "grad_norm": 0.6028861999511719, + "learning_rate": 1.8974358974358975e-05, + "loss": 0.0164, + "step": 75 + }, + { + "epoch": 0.14650602409638555, + "grad_norm": 0.6187024116516113, + "learning_rate": 1.923076923076923e-05, + "loss": 0.0296, + "step": 76 + }, + { + "epoch": 0.14843373493975903, + "grad_norm": 0.4822489619255066, + "learning_rate": 1.9487179487179488e-05, + "loss": 0.0148, + "step": 77 + }, + { + "epoch": 0.15036144578313254, + "grad_norm": 0.7231149673461914, + "learning_rate": 1.9743589743589745e-05, + "loss": 0.0395, + "step": 78 + }, + { + "epoch": 0.15228915662650602, + "grad_norm": 0.8409642577171326, + "learning_rate": 2e-05, + "loss": 0.0446, + "step": 79 + }, + { + "epoch": 0.15421686746987953, + "grad_norm": 0.4883500039577484, + "learning_rate": 2.025641025641026e-05, + "loss": 0.0206, + "step": 80 + }, + { + "epoch": 0.156144578313253, + "grad_norm": 0.6287479400634766, + "learning_rate": 2.0512820512820512e-05, + "loss": 0.0333, + "step": 81 + }, + { + "epoch": 0.1580722891566265, + "grad_norm": 0.5041632652282715, + "learning_rate": 2.0769230769230772e-05, + "loss": 0.0414, + "step": 82 + }, + { + "epoch": 0.16, + "grad_norm": 0.5103405117988586, + "learning_rate": 2.102564102564103e-05, + "loss": 0.045, + "step": 83 + }, + { + "epoch": 0.16192771084337348, + "grad_norm": 0.493161678314209, + "learning_rate": 2.1282051282051285e-05, + "loss": 0.021, + "step": 84 + }, + { + "epoch": 0.163855421686747, + "grad_norm": 0.908843994140625, + "learning_rate": 2.153846153846154e-05, + "loss": 0.0389, + "step": 85 + }, + { + "epoch": 0.16578313253012048, + "grad_norm": 0.5067003965377808, + "learning_rate": 2.1794871794871795e-05, + "loss": 0.0272, + "step": 86 + }, + { + "epoch": 0.16771084337349398, + "grad_norm": 0.5791381597518921, + "learning_rate": 2.2051282051282056e-05, + "loss": 0.0368, + "step": 87 + }, + { + "epoch": 0.16963855421686747, + "grad_norm": 0.7056036591529846, + "learning_rate": 2.230769230769231e-05, + "loss": 0.0284, + "step": 88 + }, + { + "epoch": 0.17156626506024097, + "grad_norm": 0.6563822031021118, + "learning_rate": 2.2564102564102566e-05, + "loss": 0.0646, + "step": 89 + }, + { + "epoch": 0.17349397590361446, + "grad_norm": 0.9483286142349243, + "learning_rate": 2.2820512820512822e-05, + "loss": 0.0439, + "step": 90 + }, + { + "epoch": 0.17542168674698796, + "grad_norm": 0.370664119720459, + "learning_rate": 2.3076923076923076e-05, + "loss": 0.0109, + "step": 91 + }, + { + "epoch": 0.17734939759036145, + "grad_norm": 0.9776477813720703, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.0458, + "step": 92 + }, + { + "epoch": 0.17927710843373493, + "grad_norm": 0.45710092782974243, + "learning_rate": 2.3589743589743593e-05, + "loss": 0.0212, + "step": 93 + }, + { + "epoch": 0.18120481927710844, + "grad_norm": 0.8623896837234497, + "learning_rate": 2.384615384615385e-05, + "loss": 0.0215, + "step": 94 + }, + { + "epoch": 0.18313253012048192, + "grad_norm": 0.55814528465271, + "learning_rate": 2.4102564102564103e-05, + "loss": 0.0218, + "step": 95 + }, + { + "epoch": 0.18506024096385543, + "grad_norm": 0.49882641434669495, + "learning_rate": 2.435897435897436e-05, + "loss": 0.0268, + "step": 96 + }, + { + "epoch": 0.1869879518072289, + "grad_norm": 0.3508654534816742, + "learning_rate": 2.461538461538462e-05, + "loss": 0.0172, + "step": 97 + }, + { + "epoch": 0.18891566265060242, + "grad_norm": 0.601170003414154, + "learning_rate": 2.4871794871794873e-05, + "loss": 0.0208, + "step": 98 + }, + { + "epoch": 0.1908433734939759, + "grad_norm": 1.1748133897781372, + "learning_rate": 2.512820512820513e-05, + "loss": 0.0259, + "step": 99 + }, + { + "epoch": 0.1927710843373494, + "grad_norm": 0.46370384097099304, + "learning_rate": 2.5384615384615386e-05, + "loss": 0.0242, + "step": 100 + }, + { + "epoch": 0.1946987951807229, + "grad_norm": 0.525010883808136, + "learning_rate": 2.5641025641025646e-05, + "loss": 0.0188, + "step": 101 + }, + { + "epoch": 0.1966265060240964, + "grad_norm": 0.766501784324646, + "learning_rate": 2.58974358974359e-05, + "loss": 0.0584, + "step": 102 + }, + { + "epoch": 0.19855421686746988, + "grad_norm": 0.3572964370250702, + "learning_rate": 2.6153846153846157e-05, + "loss": 0.0131, + "step": 103 + }, + { + "epoch": 0.20048192771084336, + "grad_norm": 0.6467130780220032, + "learning_rate": 2.6410256410256413e-05, + "loss": 0.0231, + "step": 104 + }, + { + "epoch": 0.20240963855421687, + "grad_norm": 1.1852102279663086, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.027, + "step": 105 + }, + { + "epoch": 0.20433734939759035, + "grad_norm": 2.3659932613372803, + "learning_rate": 2.6923076923076927e-05, + "loss": 0.0224, + "step": 106 + }, + { + "epoch": 0.20626506024096386, + "grad_norm": 0.5343687534332275, + "learning_rate": 2.7179487179487183e-05, + "loss": 0.0198, + "step": 107 + }, + { + "epoch": 0.20819277108433734, + "grad_norm": 1.852160096168518, + "learning_rate": 2.7435897435897437e-05, + "loss": 0.032, + "step": 108 + }, + { + "epoch": 0.21012048192771085, + "grad_norm": 0.47291702032089233, + "learning_rate": 2.7692307692307694e-05, + "loss": 0.0117, + "step": 109 + }, + { + "epoch": 0.21204819277108433, + "grad_norm": 0.7623187899589539, + "learning_rate": 2.794871794871795e-05, + "loss": 0.0337, + "step": 110 + }, + { + "epoch": 0.21397590361445784, + "grad_norm": 0.5272570848464966, + "learning_rate": 2.820512820512821e-05, + "loss": 0.0131, + "step": 111 + }, + { + "epoch": 0.21590361445783132, + "grad_norm": 0.5568500757217407, + "learning_rate": 2.8461538461538464e-05, + "loss": 0.0233, + "step": 112 + }, + { + "epoch": 0.21783132530120483, + "grad_norm": 0.4008469879627228, + "learning_rate": 2.871794871794872e-05, + "loss": 0.0204, + "step": 113 + }, + { + "epoch": 0.2197590361445783, + "grad_norm": 0.4888612926006317, + "learning_rate": 2.8974358974358977e-05, + "loss": 0.016, + "step": 114 + }, + { + "epoch": 0.2216867469879518, + "grad_norm": 0.44903355836868286, + "learning_rate": 2.923076923076923e-05, + "loss": 0.0135, + "step": 115 + }, + { + "epoch": 0.2236144578313253, + "grad_norm": 0.9266762733459473, + "learning_rate": 2.948717948717949e-05, + "loss": 0.0233, + "step": 116 + }, + { + "epoch": 0.22554216867469878, + "grad_norm": 0.5352638959884644, + "learning_rate": 2.9743589743589747e-05, + "loss": 0.0198, + "step": 117 + }, + { + "epoch": 0.2274698795180723, + "grad_norm": 0.6051343679428101, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0246, + "step": 118 + }, + { + "epoch": 0.22939759036144577, + "grad_norm": 0.9971133470535278, + "learning_rate": 3.0256410256410257e-05, + "loss": 0.025, + "step": 119 + }, + { + "epoch": 0.23132530120481928, + "grad_norm": 0.704236626625061, + "learning_rate": 3.0512820512820514e-05, + "loss": 0.031, + "step": 120 + }, + { + "epoch": 0.23325301204819276, + "grad_norm": 0.6137097477912903, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.0519, + "step": 121 + }, + { + "epoch": 0.23518072289156627, + "grad_norm": 0.7396159768104553, + "learning_rate": 3.102564102564103e-05, + "loss": 0.0325, + "step": 122 + }, + { + "epoch": 0.23710843373493976, + "grad_norm": 1.3282053470611572, + "learning_rate": 3.128205128205129e-05, + "loss": 0.0252, + "step": 123 + }, + { + "epoch": 0.23903614457831326, + "grad_norm": 0.5220731496810913, + "learning_rate": 3.153846153846154e-05, + "loss": 0.0262, + "step": 124 + }, + { + "epoch": 0.24096385542168675, + "grad_norm": 0.5357242822647095, + "learning_rate": 3.1794871794871795e-05, + "loss": 0.0243, + "step": 125 + }, + { + "epoch": 0.24289156626506025, + "grad_norm": 0.48207753896713257, + "learning_rate": 3.205128205128206e-05, + "loss": 0.0178, + "step": 126 + }, + { + "epoch": 0.24481927710843374, + "grad_norm": 0.552988588809967, + "learning_rate": 3.230769230769231e-05, + "loss": 0.023, + "step": 127 + }, + { + "epoch": 0.24674698795180722, + "grad_norm": 1.7962840795516968, + "learning_rate": 3.2564102564102565e-05, + "loss": 0.032, + "step": 128 + }, + { + "epoch": 0.24867469879518073, + "grad_norm": 1.6404600143432617, + "learning_rate": 3.282051282051282e-05, + "loss": 0.0231, + "step": 129 + }, + { + "epoch": 0.25060240963855424, + "grad_norm": 0.39142486453056335, + "learning_rate": 3.307692307692308e-05, + "loss": 0.0147, + "step": 130 + }, + { + "epoch": 0.2525301204819277, + "grad_norm": 1.3272887468338013, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0439, + "step": 131 + }, + { + "epoch": 0.2544578313253012, + "grad_norm": 1.5122811794281006, + "learning_rate": 3.358974358974359e-05, + "loss": 0.0282, + "step": 132 + }, + { + "epoch": 0.2563855421686747, + "grad_norm": 1.8542430400848389, + "learning_rate": 3.384615384615385e-05, + "loss": 0.0515, + "step": 133 + }, + { + "epoch": 0.2583132530120482, + "grad_norm": 4.059277534484863, + "learning_rate": 3.4102564102564105e-05, + "loss": 0.0781, + "step": 134 + }, + { + "epoch": 0.26024096385542167, + "grad_norm": 0.6206214427947998, + "learning_rate": 3.435897435897436e-05, + "loss": 0.0306, + "step": 135 + }, + { + "epoch": 0.2621686746987952, + "grad_norm": 0.4575510323047638, + "learning_rate": 3.461538461538462e-05, + "loss": 0.0154, + "step": 136 + }, + { + "epoch": 0.2640963855421687, + "grad_norm": 1.1556978225708008, + "learning_rate": 3.4871794871794875e-05, + "loss": 0.0235, + "step": 137 + }, + { + "epoch": 0.26602409638554214, + "grad_norm": 0.6975051760673523, + "learning_rate": 3.512820512820513e-05, + "loss": 0.0453, + "step": 138 + }, + { + "epoch": 0.26795180722891565, + "grad_norm": 0.8686623573303223, + "learning_rate": 3.538461538461539e-05, + "loss": 0.0427, + "step": 139 + }, + { + "epoch": 0.26987951807228916, + "grad_norm": 2.0681848526000977, + "learning_rate": 3.5641025641025646e-05, + "loss": 0.04, + "step": 140 + }, + { + "epoch": 0.27180722891566267, + "grad_norm": 0.4397984445095062, + "learning_rate": 3.58974358974359e-05, + "loss": 0.0188, + "step": 141 + }, + { + "epoch": 0.2737349397590361, + "grad_norm": 0.5871334075927734, + "learning_rate": 3.615384615384616e-05, + "loss": 0.0253, + "step": 142 + }, + { + "epoch": 0.27566265060240963, + "grad_norm": 1.1078568696975708, + "learning_rate": 3.6410256410256416e-05, + "loss": 0.0316, + "step": 143 + }, + { + "epoch": 0.27759036144578314, + "grad_norm": 0.5691841840744019, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.0266, + "step": 144 + }, + { + "epoch": 0.27951807228915665, + "grad_norm": 0.7896255254745483, + "learning_rate": 3.692307692307693e-05, + "loss": 0.0281, + "step": 145 + }, + { + "epoch": 0.2814457831325301, + "grad_norm": 0.9988337159156799, + "learning_rate": 3.7179487179487186e-05, + "loss": 0.0295, + "step": 146 + }, + { + "epoch": 0.2833734939759036, + "grad_norm": 0.9811834692955017, + "learning_rate": 3.7435897435897436e-05, + "loss": 0.0322, + "step": 147 + }, + { + "epoch": 0.2853012048192771, + "grad_norm": 0.6503105759620667, + "learning_rate": 3.769230769230769e-05, + "loss": 0.0266, + "step": 148 + }, + { + "epoch": 0.28722891566265063, + "grad_norm": 1.9164355993270874, + "learning_rate": 3.794871794871795e-05, + "loss": 0.0677, + "step": 149 + }, + { + "epoch": 0.2891566265060241, + "grad_norm": 1.1724557876586914, + "learning_rate": 3.820512820512821e-05, + "loss": 0.0324, + "step": 150 + }, + { + "epoch": 0.2910843373493976, + "grad_norm": 0.8482469916343689, + "learning_rate": 3.846153846153846e-05, + "loss": 0.0259, + "step": 151 + }, + { + "epoch": 0.2930120481927711, + "grad_norm": 0.8572830557823181, + "learning_rate": 3.871794871794872e-05, + "loss": 0.0358, + "step": 152 + }, + { + "epoch": 0.29493975903614456, + "grad_norm": 0.6630825400352478, + "learning_rate": 3.8974358974358976e-05, + "loss": 0.0447, + "step": 153 + }, + { + "epoch": 0.29686746987951806, + "grad_norm": 0.9197093844413757, + "learning_rate": 3.923076923076923e-05, + "loss": 0.0409, + "step": 154 + }, + { + "epoch": 0.2987951807228916, + "grad_norm": 0.6976819634437561, + "learning_rate": 3.948717948717949e-05, + "loss": 0.0317, + "step": 155 + }, + { + "epoch": 0.3007228915662651, + "grad_norm": 0.7353514432907104, + "learning_rate": 3.9743589743589747e-05, + "loss": 0.0306, + "step": 156 + }, + { + "epoch": 0.30265060240963854, + "grad_norm": 0.5730232000350952, + "learning_rate": 4e-05, + "loss": 0.0324, + "step": 157 + }, + { + "epoch": 0.30457831325301205, + "grad_norm": 0.7852078676223755, + "learning_rate": 3.999994971675547e-05, + "loss": 0.0354, + "step": 158 + }, + { + "epoch": 0.30650602409638555, + "grad_norm": 0.5924715399742126, + "learning_rate": 3.999979886727471e-05, + "loss": 0.0366, + "step": 159 + }, + { + "epoch": 0.30843373493975906, + "grad_norm": 0.7359845638275146, + "learning_rate": 3.999954745231624e-05, + "loss": 0.0437, + "step": 160 + }, + { + "epoch": 0.3103614457831325, + "grad_norm": 0.7866976857185364, + "learning_rate": 3.999919547314426e-05, + "loss": 0.0363, + "step": 161 + }, + { + "epoch": 0.312289156626506, + "grad_norm": 0.7425745129585266, + "learning_rate": 3.999874293152863e-05, + "loss": 0.0259, + "step": 162 + }, + { + "epoch": 0.31421686746987953, + "grad_norm": 1.8922245502471924, + "learning_rate": 3.9998189829744885e-05, + "loss": 0.0341, + "step": 163 + }, + { + "epoch": 0.316144578313253, + "grad_norm": 0.7908634543418884, + "learning_rate": 3.99975361705742e-05, + "loss": 0.0424, + "step": 164 + }, + { + "epoch": 0.3180722891566265, + "grad_norm": 2.047368049621582, + "learning_rate": 3.999678195730337e-05, + "loss": 0.0535, + "step": 165 + }, + { + "epoch": 0.32, + "grad_norm": 0.5702639222145081, + "learning_rate": 3.999592719372484e-05, + "loss": 0.0284, + "step": 166 + }, + { + "epoch": 0.3219277108433735, + "grad_norm": 0.45015648007392883, + "learning_rate": 3.9994971884136636e-05, + "loss": 0.0313, + "step": 167 + }, + { + "epoch": 0.32385542168674697, + "grad_norm": 4.094679355621338, + "learning_rate": 3.9993916033342355e-05, + "loss": 0.0524, + "step": 168 + }, + { + "epoch": 0.3257831325301205, + "grad_norm": 0.800846517086029, + "learning_rate": 3.999275964665117e-05, + "loss": 0.0282, + "step": 169 + }, + { + "epoch": 0.327710843373494, + "grad_norm": 0.47881078720092773, + "learning_rate": 3.999150272987776e-05, + "loss": 0.0293, + "step": 170 + }, + { + "epoch": 0.3296385542168675, + "grad_norm": 0.5716657638549805, + "learning_rate": 3.999014528934232e-05, + "loss": 0.0221, + "step": 171 + }, + { + "epoch": 0.33156626506024095, + "grad_norm": 0.6333311200141907, + "learning_rate": 3.998868733187048e-05, + "loss": 0.0302, + "step": 172 + }, + { + "epoch": 0.33349397590361446, + "grad_norm": 6.642521858215332, + "learning_rate": 3.998712886479335e-05, + "loss": 0.0364, + "step": 173 + }, + { + "epoch": 0.33542168674698797, + "grad_norm": 0.7515506148338318, + "learning_rate": 3.998546989594739e-05, + "loss": 0.0296, + "step": 174 + }, + { + "epoch": 0.3373493975903614, + "grad_norm": 1.0728015899658203, + "learning_rate": 3.998371043367445e-05, + "loss": 0.0549, + "step": 175 + }, + { + "epoch": 0.33927710843373493, + "grad_norm": 1.3025579452514648, + "learning_rate": 3.998185048682166e-05, + "loss": 0.0577, + "step": 176 + }, + { + "epoch": 0.34120481927710844, + "grad_norm": 1.0962958335876465, + "learning_rate": 3.997989006474144e-05, + "loss": 0.0313, + "step": 177 + }, + { + "epoch": 0.34313253012048195, + "grad_norm": 0.7064313292503357, + "learning_rate": 3.997782917729143e-05, + "loss": 0.0309, + "step": 178 + }, + { + "epoch": 0.3450602409638554, + "grad_norm": 0.43374207615852356, + "learning_rate": 3.997566783483445e-05, + "loss": 0.0166, + "step": 179 + }, + { + "epoch": 0.3469879518072289, + "grad_norm": 0.7236390113830566, + "learning_rate": 3.9973406048238413e-05, + "loss": 0.0254, + "step": 180 + }, + { + "epoch": 0.3489156626506024, + "grad_norm": 0.5041500926017761, + "learning_rate": 3.9971043828876334e-05, + "loss": 0.0239, + "step": 181 + }, + { + "epoch": 0.35084337349397593, + "grad_norm": 1.2744532823562622, + "learning_rate": 3.9968581188626204e-05, + "loss": 0.0404, + "step": 182 + }, + { + "epoch": 0.3527710843373494, + "grad_norm": 0.45845362544059753, + "learning_rate": 3.996601813987098e-05, + "loss": 0.0127, + "step": 183 + }, + { + "epoch": 0.3546987951807229, + "grad_norm": 0.4426881968975067, + "learning_rate": 3.996335469549852e-05, + "loss": 0.0176, + "step": 184 + }, + { + "epoch": 0.3566265060240964, + "grad_norm": 1.0030732154846191, + "learning_rate": 3.9960590868901465e-05, + "loss": 0.0457, + "step": 185 + }, + { + "epoch": 0.35855421686746985, + "grad_norm": 0.6428582668304443, + "learning_rate": 3.995772667397725e-05, + "loss": 0.0271, + "step": 186 + }, + { + "epoch": 0.36048192771084336, + "grad_norm": 0.5335744619369507, + "learning_rate": 3.995476212512795e-05, + "loss": 0.0297, + "step": 187 + }, + { + "epoch": 0.3624096385542169, + "grad_norm": 0.6995761394500732, + "learning_rate": 3.99516972372603e-05, + "loss": 0.0322, + "step": 188 + }, + { + "epoch": 0.3643373493975904, + "grad_norm": 0.765511155128479, + "learning_rate": 3.9948532025785546e-05, + "loss": 0.0253, + "step": 189 + }, + { + "epoch": 0.36626506024096384, + "grad_norm": 0.6165828108787537, + "learning_rate": 3.9945266506619403e-05, + "loss": 0.0355, + "step": 190 + }, + { + "epoch": 0.36819277108433734, + "grad_norm": 0.851970911026001, + "learning_rate": 3.994190069618195e-05, + "loss": 0.056, + "step": 191 + }, + { + "epoch": 0.37012048192771085, + "grad_norm": 0.9850023984909058, + "learning_rate": 3.993843461139757e-05, + "loss": 0.0415, + "step": 192 + }, + { + "epoch": 0.37204819277108436, + "grad_norm": 0.7455295324325562, + "learning_rate": 3.9934868269694886e-05, + "loss": 0.0379, + "step": 193 + }, + { + "epoch": 0.3739759036144578, + "grad_norm": 1.159469723701477, + "learning_rate": 3.9931201689006595e-05, + "loss": 0.0237, + "step": 194 + }, + { + "epoch": 0.3759036144578313, + "grad_norm": 0.5490080118179321, + "learning_rate": 3.992743488776947e-05, + "loss": 0.024, + "step": 195 + }, + { + "epoch": 0.37783132530120483, + "grad_norm": 1.279831886291504, + "learning_rate": 3.992356788492421e-05, + "loss": 0.0273, + "step": 196 + }, + { + "epoch": 0.3797590361445783, + "grad_norm": 0.859104335308075, + "learning_rate": 3.9919600699915355e-05, + "loss": 0.0411, + "step": 197 + }, + { + "epoch": 0.3816867469879518, + "grad_norm": 1.2525300979614258, + "learning_rate": 3.991553335269119e-05, + "loss": 0.0857, + "step": 198 + }, + { + "epoch": 0.3836144578313253, + "grad_norm": 0.4924193024635315, + "learning_rate": 3.991136586370367e-05, + "loss": 0.0294, + "step": 199 + }, + { + "epoch": 0.3855421686746988, + "grad_norm": 1.417190670967102, + "learning_rate": 3.990709825390828e-05, + "loss": 0.0395, + "step": 200 + }, + { + "epoch": 0.38746987951807227, + "grad_norm": 0.6172056198120117, + "learning_rate": 3.9902730544763936e-05, + "loss": 0.0194, + "step": 201 + }, + { + "epoch": 0.3893975903614458, + "grad_norm": 0.7292149662971497, + "learning_rate": 3.989826275823291e-05, + "loss": 0.0381, + "step": 202 + }, + { + "epoch": 0.3913253012048193, + "grad_norm": 0.5949816107749939, + "learning_rate": 3.989369491678067e-05, + "loss": 0.0254, + "step": 203 + }, + { + "epoch": 0.3932530120481928, + "grad_norm": 0.6012582182884216, + "learning_rate": 3.988902704337582e-05, + "loss": 0.048, + "step": 204 + }, + { + "epoch": 0.39518072289156625, + "grad_norm": 0.6273590922355652, + "learning_rate": 3.9884259161489936e-05, + "loss": 0.0268, + "step": 205 + }, + { + "epoch": 0.39710843373493976, + "grad_norm": 0.9615244269371033, + "learning_rate": 3.987939129509746e-05, + "loss": 0.0192, + "step": 206 + }, + { + "epoch": 0.39903614457831327, + "grad_norm": 0.6009241342544556, + "learning_rate": 3.9874423468675624e-05, + "loss": 0.0362, + "step": 207 + }, + { + "epoch": 0.4009638554216867, + "grad_norm": 0.411335289478302, + "learning_rate": 3.9869355707204266e-05, + "loss": 0.017, + "step": 208 + }, + { + "epoch": 0.40289156626506023, + "grad_norm": 0.6151527166366577, + "learning_rate": 3.986418803616573e-05, + "loss": 0.0283, + "step": 209 + }, + { + "epoch": 0.40481927710843374, + "grad_norm": 0.33808204531669617, + "learning_rate": 3.985892048154474e-05, + "loss": 0.0158, + "step": 210 + }, + { + "epoch": 0.40674698795180725, + "grad_norm": 0.5464187860488892, + "learning_rate": 3.9853553069828284e-05, + "loss": 0.0292, + "step": 211 + }, + { + "epoch": 0.4086746987951807, + "grad_norm": 0.6658390760421753, + "learning_rate": 3.984808582800543e-05, + "loss": 0.0281, + "step": 212 + }, + { + "epoch": 0.4106024096385542, + "grad_norm": 0.4253764748573303, + "learning_rate": 3.984251878356726e-05, + "loss": 0.031, + "step": 213 + }, + { + "epoch": 0.4125301204819277, + "grad_norm": 0.32309481501579285, + "learning_rate": 3.983685196450667e-05, + "loss": 0.0166, + "step": 214 + }, + { + "epoch": 0.41445783132530123, + "grad_norm": 0.43756410479545593, + "learning_rate": 3.9831085399318265e-05, + "loss": 0.0326, + "step": 215 + }, + { + "epoch": 0.4163855421686747, + "grad_norm": 0.264046847820282, + "learning_rate": 3.982521911699822e-05, + "loss": 0.0118, + "step": 216 + }, + { + "epoch": 0.4183132530120482, + "grad_norm": 0.8630897402763367, + "learning_rate": 3.9819253147044084e-05, + "loss": 0.0246, + "step": 217 + }, + { + "epoch": 0.4202409638554217, + "grad_norm": 0.6923379898071289, + "learning_rate": 3.98131875194547e-05, + "loss": 0.036, + "step": 218 + }, + { + "epoch": 0.42216867469879515, + "grad_norm": 0.5874778628349304, + "learning_rate": 3.9807022264730024e-05, + "loss": 0.0255, + "step": 219 + }, + { + "epoch": 0.42409638554216866, + "grad_norm": 0.394336074590683, + "learning_rate": 3.980075741387094e-05, + "loss": 0.0187, + "step": 220 + }, + { + "epoch": 0.4260240963855422, + "grad_norm": 0.6300327777862549, + "learning_rate": 3.979439299837915e-05, + "loss": 0.0214, + "step": 221 + }, + { + "epoch": 0.4279518072289157, + "grad_norm": 0.5200467109680176, + "learning_rate": 3.978792905025702e-05, + "loss": 0.0628, + "step": 222 + }, + { + "epoch": 0.42987951807228914, + "grad_norm": 0.5713880062103271, + "learning_rate": 3.978136560200735e-05, + "loss": 0.0302, + "step": 223 + }, + { + "epoch": 0.43180722891566264, + "grad_norm": 0.5345383286476135, + "learning_rate": 3.977470268663331e-05, + "loss": 0.0125, + "step": 224 + }, + { + "epoch": 0.43373493975903615, + "grad_norm": 0.5378350019454956, + "learning_rate": 3.976794033763819e-05, + "loss": 0.0246, + "step": 225 + }, + { + "epoch": 0.43566265060240966, + "grad_norm": 0.5554935336112976, + "learning_rate": 3.9761078589025276e-05, + "loss": 0.0212, + "step": 226 + }, + { + "epoch": 0.4375903614457831, + "grad_norm": 0.2832634747028351, + "learning_rate": 3.9754117475297664e-05, + "loss": 0.0125, + "step": 227 + }, + { + "epoch": 0.4395180722891566, + "grad_norm": 1.2910150289535522, + "learning_rate": 3.97470570314581e-05, + "loss": 0.0364, + "step": 228 + }, + { + "epoch": 0.44144578313253013, + "grad_norm": 0.3731018602848053, + "learning_rate": 3.973989729300878e-05, + "loss": 0.0128, + "step": 229 + }, + { + "epoch": 0.4433734939759036, + "grad_norm": 0.9433871507644653, + "learning_rate": 3.9732638295951195e-05, + "loss": 0.0367, + "step": 230 + }, + { + "epoch": 0.4453012048192771, + "grad_norm": 1.0779197216033936, + "learning_rate": 3.972528007678594e-05, + "loss": 0.0667, + "step": 231 + }, + { + "epoch": 0.4472289156626506, + "grad_norm": 1.7009105682373047, + "learning_rate": 3.9717822672512516e-05, + "loss": 0.0655, + "step": 232 + }, + { + "epoch": 0.4491566265060241, + "grad_norm": 0.5646032094955444, + "learning_rate": 3.971026612062919e-05, + "loss": 0.064, + "step": 233 + }, + { + "epoch": 0.45108433734939757, + "grad_norm": 0.44474121928215027, + "learning_rate": 3.970261045913274e-05, + "loss": 0.0206, + "step": 234 + }, + { + "epoch": 0.4530120481927711, + "grad_norm": 1.3969277143478394, + "learning_rate": 3.969485572651833e-05, + "loss": 0.0486, + "step": 235 + }, + { + "epoch": 0.4549397590361446, + "grad_norm": 0.6401994228363037, + "learning_rate": 3.968700196177925e-05, + "loss": 0.0262, + "step": 236 + }, + { + "epoch": 0.4568674698795181, + "grad_norm": 0.7091913223266602, + "learning_rate": 3.96790492044068e-05, + "loss": 0.014, + "step": 237 + }, + { + "epoch": 0.45879518072289155, + "grad_norm": 0.6561547517776489, + "learning_rate": 3.967099749439002e-05, + "loss": 0.0482, + "step": 238 + }, + { + "epoch": 0.46072289156626506, + "grad_norm": 0.6924155354499817, + "learning_rate": 3.966284687221551e-05, + "loss": 0.0289, + "step": 239 + }, + { + "epoch": 0.46265060240963857, + "grad_norm": 0.5868663787841797, + "learning_rate": 3.9654597378867256e-05, + "loss": 0.0331, + "step": 240 + }, + { + "epoch": 0.464578313253012, + "grad_norm": 0.7930939793586731, + "learning_rate": 3.964624905582637e-05, + "loss": 0.0925, + "step": 241 + }, + { + "epoch": 0.46650602409638553, + "grad_norm": 0.4888836145401001, + "learning_rate": 3.9637801945070944e-05, + "loss": 0.015, + "step": 242 + }, + { + "epoch": 0.46843373493975904, + "grad_norm": 0.7820287346839905, + "learning_rate": 3.962925608907579e-05, + "loss": 0.0382, + "step": 243 + }, + { + "epoch": 0.47036144578313255, + "grad_norm": 0.4914316236972809, + "learning_rate": 3.962061153081224e-05, + "loss": 0.0257, + "step": 244 + }, + { + "epoch": 0.472289156626506, + "grad_norm": 0.5681505799293518, + "learning_rate": 3.961186831374793e-05, + "loss": 0.0551, + "step": 245 + }, + { + "epoch": 0.4742168674698795, + "grad_norm": 0.5049723386764526, + "learning_rate": 3.9603026481846616e-05, + "loss": 0.0186, + "step": 246 + }, + { + "epoch": 0.476144578313253, + "grad_norm": 0.5034119486808777, + "learning_rate": 3.959408607956787e-05, + "loss": 0.024, + "step": 247 + }, + { + "epoch": 0.47807228915662653, + "grad_norm": 0.4543336033821106, + "learning_rate": 3.958504715186695e-05, + "loss": 0.0256, + "step": 248 + }, + { + "epoch": 0.48, + "grad_norm": 0.5595743656158447, + "learning_rate": 3.957590974419452e-05, + "loss": 0.0222, + "step": 249 + }, + { + "epoch": 0.4819277108433735, + "grad_norm": 0.5701581239700317, + "learning_rate": 3.956667390249642e-05, + "loss": 0.0334, + "step": 250 + }, + { + "epoch": 0.483855421686747, + "grad_norm": 0.53755784034729, + "learning_rate": 3.9557339673213474e-05, + "loss": 0.0345, + "step": 251 + }, + { + "epoch": 0.4857831325301205, + "grad_norm": 0.4368877112865448, + "learning_rate": 3.95479071032812e-05, + "loss": 0.0183, + "step": 252 + }, + { + "epoch": 0.48771084337349396, + "grad_norm": 0.7972906827926636, + "learning_rate": 3.953837624012963e-05, + "loss": 0.0337, + "step": 253 + }, + { + "epoch": 0.48963855421686747, + "grad_norm": 0.6148451566696167, + "learning_rate": 3.9528747131683023e-05, + "loss": 0.0524, + "step": 254 + }, + { + "epoch": 0.491566265060241, + "grad_norm": 0.500840961933136, + "learning_rate": 3.9519019826359676e-05, + "loss": 0.0248, + "step": 255 + }, + { + "epoch": 0.49349397590361443, + "grad_norm": 0.5536255240440369, + "learning_rate": 3.9509194373071624e-05, + "loss": 0.0219, + "step": 256 + }, + { + "epoch": 0.49542168674698794, + "grad_norm": 0.6873176097869873, + "learning_rate": 3.9499270821224444e-05, + "loss": 0.0312, + "step": 257 + }, + { + "epoch": 0.49734939759036145, + "grad_norm": 0.37207168340682983, + "learning_rate": 3.9489249220716974e-05, + "loss": 0.0149, + "step": 258 + }, + { + "epoch": 0.49927710843373496, + "grad_norm": 0.4458799660205841, + "learning_rate": 3.947912962194107e-05, + "loss": 0.0214, + "step": 259 + }, + { + "epoch": 0.5012048192771085, + "grad_norm": 0.4272724390029907, + "learning_rate": 3.9468912075781345e-05, + "loss": 0.0263, + "step": 260 + }, + { + "epoch": 0.503132530120482, + "grad_norm": 0.5245792269706726, + "learning_rate": 3.945859663361496e-05, + "loss": 0.0103, + "step": 261 + }, + { + "epoch": 0.5050602409638554, + "grad_norm": 0.8799260854721069, + "learning_rate": 3.9448183347311284e-05, + "loss": 0.0292, + "step": 262 + }, + { + "epoch": 0.5069879518072289, + "grad_norm": 0.5996833443641663, + "learning_rate": 3.943767226923171e-05, + "loss": 0.0306, + "step": 263 + }, + { + "epoch": 0.5089156626506024, + "grad_norm": 0.6044682860374451, + "learning_rate": 3.942706345222935e-05, + "loss": 0.0218, + "step": 264 + }, + { + "epoch": 0.5108433734939759, + "grad_norm": 0.4770200848579407, + "learning_rate": 3.941635694964878e-05, + "loss": 0.0226, + "step": 265 + }, + { + "epoch": 0.5127710843373494, + "grad_norm": 0.5605704188346863, + "learning_rate": 3.940555281532576e-05, + "loss": 0.0354, + "step": 266 + }, + { + "epoch": 0.5146987951807229, + "grad_norm": 0.46532443165779114, + "learning_rate": 3.939465110358699e-05, + "loss": 0.0223, + "step": 267 + }, + { + "epoch": 0.5166265060240964, + "grad_norm": 0.5190595388412476, + "learning_rate": 3.93836518692498e-05, + "loss": 0.0219, + "step": 268 + }, + { + "epoch": 0.5185542168674698, + "grad_norm": 0.5767757892608643, + "learning_rate": 3.937255516762193e-05, + "loss": 0.0294, + "step": 269 + }, + { + "epoch": 0.5204819277108433, + "grad_norm": 0.4543164372444153, + "learning_rate": 3.936136105450119e-05, + "loss": 0.0244, + "step": 270 + }, + { + "epoch": 0.5224096385542168, + "grad_norm": 0.4155154526233673, + "learning_rate": 3.9350069586175195e-05, + "loss": 0.02, + "step": 271 + }, + { + "epoch": 0.5243373493975904, + "grad_norm": 0.5470768213272095, + "learning_rate": 3.933868081942113e-05, + "loss": 0.0187, + "step": 272 + }, + { + "epoch": 0.5262650602409639, + "grad_norm": 0.9491772651672363, + "learning_rate": 3.9327194811505406e-05, + "loss": 0.0337, + "step": 273 + }, + { + "epoch": 0.5281927710843374, + "grad_norm": 0.9313873052597046, + "learning_rate": 3.93156116201834e-05, + "loss": 0.0573, + "step": 274 + }, + { + "epoch": 0.5301204819277109, + "grad_norm": 0.7181005477905273, + "learning_rate": 3.930393130369915e-05, + "loss": 0.0405, + "step": 275 + }, + { + "epoch": 0.5320481927710843, + "grad_norm": 0.34231385588645935, + "learning_rate": 3.9292153920785076e-05, + "loss": 0.0153, + "step": 276 + }, + { + "epoch": 0.5339759036144578, + "grad_norm": 0.6899610161781311, + "learning_rate": 3.928027953066168e-05, + "loss": 0.0338, + "step": 277 + }, + { + "epoch": 0.5359036144578313, + "grad_norm": 0.7509781718254089, + "learning_rate": 3.926830819303726e-05, + "loss": 0.0416, + "step": 278 + }, + { + "epoch": 0.5378313253012048, + "grad_norm": 0.6326774954795837, + "learning_rate": 3.925623996810757e-05, + "loss": 0.0293, + "step": 279 + }, + { + "epoch": 0.5397590361445783, + "grad_norm": 0.5543203353881836, + "learning_rate": 3.924407491655557e-05, + "loss": 0.0263, + "step": 280 + }, + { + "epoch": 0.5416867469879518, + "grad_norm": 0.5367572903633118, + "learning_rate": 3.9231813099551086e-05, + "loss": 0.0276, + "step": 281 + }, + { + "epoch": 0.5436144578313253, + "grad_norm": 0.3143869638442993, + "learning_rate": 3.921945457875051e-05, + "loss": 0.0146, + "step": 282 + }, + { + "epoch": 0.5455421686746988, + "grad_norm": 0.47403043508529663, + "learning_rate": 3.920699941629649e-05, + "loss": 0.0267, + "step": 283 + }, + { + "epoch": 0.5474698795180722, + "grad_norm": 0.5082595348358154, + "learning_rate": 3.919444767481763e-05, + "loss": 0.0183, + "step": 284 + }, + { + "epoch": 0.5493975903614458, + "grad_norm": 0.747949481010437, + "learning_rate": 3.918179941742816e-05, + "loss": 0.0412, + "step": 285 + }, + { + "epoch": 0.5513253012048193, + "grad_norm": 0.6553886532783508, + "learning_rate": 3.916905470772762e-05, + "loss": 0.0505, + "step": 286 + }, + { + "epoch": 0.5532530120481928, + "grad_norm": 0.3838176131248474, + "learning_rate": 3.9156213609800545e-05, + "loss": 0.0156, + "step": 287 + }, + { + "epoch": 0.5551807228915663, + "grad_norm": 0.7427731156349182, + "learning_rate": 3.914327618821614e-05, + "loss": 0.0278, + "step": 288 + }, + { + "epoch": 0.5571084337349398, + "grad_norm": 0.2612821161746979, + "learning_rate": 3.913024250802796e-05, + "loss": 0.0101, + "step": 289 + }, + { + "epoch": 0.5590361445783133, + "grad_norm": 0.3799416124820709, + "learning_rate": 3.911711263477357e-05, + "loss": 0.0168, + "step": 290 + }, + { + "epoch": 0.5609638554216867, + "grad_norm": 0.5053854584693909, + "learning_rate": 3.910388663447425e-05, + "loss": 0.0249, + "step": 291 + }, + { + "epoch": 0.5628915662650602, + "grad_norm": 0.38095012307167053, + "learning_rate": 3.909056457363461e-05, + "loss": 0.0156, + "step": 292 + }, + { + "epoch": 0.5648192771084337, + "grad_norm": 0.4477892220020294, + "learning_rate": 3.907714651924229e-05, + "loss": 0.0309, + "step": 293 + }, + { + "epoch": 0.5667469879518072, + "grad_norm": 0.5875864624977112, + "learning_rate": 3.906363253876763e-05, + "loss": 0.0287, + "step": 294 + }, + { + "epoch": 0.5686746987951807, + "grad_norm": 0.522990882396698, + "learning_rate": 3.90500227001633e-05, + "loss": 0.0318, + "step": 295 + }, + { + "epoch": 0.5706024096385542, + "grad_norm": 0.4153876304626465, + "learning_rate": 3.9036317071863994e-05, + "loss": 0.0192, + "step": 296 + }, + { + "epoch": 0.5725301204819278, + "grad_norm": 0.4675769507884979, + "learning_rate": 3.902251572278605e-05, + "loss": 0.067, + "step": 297 + }, + { + "epoch": 0.5744578313253013, + "grad_norm": 0.35778650641441345, + "learning_rate": 3.900861872232713e-05, + "loss": 0.0197, + "step": 298 + }, + { + "epoch": 0.5763855421686747, + "grad_norm": 0.7382330894470215, + "learning_rate": 3.899462614036587e-05, + "loss": 0.0283, + "step": 299 + }, + { + "epoch": 0.5783132530120482, + "grad_norm": 0.41268599033355713, + "learning_rate": 3.89805380472615e-05, + "loss": 0.0207, + "step": 300 + }, + { + "epoch": 0.5802409638554217, + "grad_norm": 1.2013020515441895, + "learning_rate": 3.8966354513853535e-05, + "loss": 0.0301, + "step": 301 + }, + { + "epoch": 0.5821686746987952, + "grad_norm": 0.424757719039917, + "learning_rate": 3.895207561146137e-05, + "loss": 0.022, + "step": 302 + }, + { + "epoch": 0.5840963855421687, + "grad_norm": 0.4196677505970001, + "learning_rate": 3.893770141188396e-05, + "loss": 0.0424, + "step": 303 + }, + { + "epoch": 0.5860240963855422, + "grad_norm": 0.8644190430641174, + "learning_rate": 3.892323198739946e-05, + "loss": 0.08, + "step": 304 + }, + { + "epoch": 0.5879518072289157, + "grad_norm": 0.5645135045051575, + "learning_rate": 3.890866741076482e-05, + "loss": 0.0152, + "step": 305 + }, + { + "epoch": 0.5898795180722891, + "grad_norm": 0.5218387246131897, + "learning_rate": 3.889400775521545e-05, + "loss": 0.0205, + "step": 306 + }, + { + "epoch": 0.5918072289156626, + "grad_norm": 0.39709413051605225, + "learning_rate": 3.8879253094464865e-05, + "loss": 0.0233, + "step": 307 + }, + { + "epoch": 0.5937349397590361, + "grad_norm": 0.3572910726070404, + "learning_rate": 3.8864403502704285e-05, + "loss": 0.0198, + "step": 308 + }, + { + "epoch": 0.5956626506024096, + "grad_norm": 0.382709264755249, + "learning_rate": 3.8849459054602274e-05, + "loss": 0.0176, + "step": 309 + }, + { + "epoch": 0.5975903614457831, + "grad_norm": 3.4527227878570557, + "learning_rate": 3.883441982530436e-05, + "loss": 0.0239, + "step": 310 + }, + { + "epoch": 0.5995180722891567, + "grad_norm": 0.4467569589614868, + "learning_rate": 3.8819285890432674e-05, + "loss": 0.0284, + "step": 311 + }, + { + "epoch": 0.6014457831325302, + "grad_norm": 0.44513460993766785, + "learning_rate": 3.880405732608555e-05, + "loss": 0.0233, + "step": 312 + }, + { + "epoch": 0.6033734939759036, + "grad_norm": 0.8029689192771912, + "learning_rate": 3.8788734208837155e-05, + "loss": 0.0433, + "step": 313 + }, + { + "epoch": 0.6053012048192771, + "grad_norm": 0.7291454076766968, + "learning_rate": 3.877331661573709e-05, + "loss": 0.043, + "step": 314 + }, + { + "epoch": 0.6072289156626506, + "grad_norm": 0.6050467491149902, + "learning_rate": 3.8757804624310006e-05, + "loss": 0.0377, + "step": 315 + }, + { + "epoch": 0.6091566265060241, + "grad_norm": 0.6714366674423218, + "learning_rate": 3.874219831255524e-05, + "loss": 0.046, + "step": 316 + }, + { + "epoch": 0.6110843373493976, + "grad_norm": 0.336037278175354, + "learning_rate": 3.8726497758946394e-05, + "loss": 0.0149, + "step": 317 + }, + { + "epoch": 0.6130120481927711, + "grad_norm": 0.3057402968406677, + "learning_rate": 3.871070304243094e-05, + "loss": 0.014, + "step": 318 + }, + { + "epoch": 0.6149397590361446, + "grad_norm": 0.4537644684314728, + "learning_rate": 3.8694814242429834e-05, + "loss": 0.0503, + "step": 319 + }, + { + "epoch": 0.6168674698795181, + "grad_norm": 0.45573824644088745, + "learning_rate": 3.8678831438837116e-05, + "loss": 0.021, + "step": 320 + }, + { + "epoch": 0.6187951807228915, + "grad_norm": 0.30729591846466064, + "learning_rate": 3.866275471201952e-05, + "loss": 0.0163, + "step": 321 + }, + { + "epoch": 0.620722891566265, + "grad_norm": 0.7614850401878357, + "learning_rate": 3.8646584142816036e-05, + "loss": 0.0347, + "step": 322 + }, + { + "epoch": 0.6226506024096385, + "grad_norm": 0.5323611497879028, + "learning_rate": 3.863031981253754e-05, + "loss": 0.0201, + "step": 323 + }, + { + "epoch": 0.624578313253012, + "grad_norm": 0.34426453709602356, + "learning_rate": 3.861396180296635e-05, + "loss": 0.0243, + "step": 324 + }, + { + "epoch": 0.6265060240963856, + "grad_norm": 0.621636152267456, + "learning_rate": 3.859751019635585e-05, + "loss": 0.0166, + "step": 325 + }, + { + "epoch": 0.6284337349397591, + "grad_norm": 0.549324095249176, + "learning_rate": 3.858096507543006e-05, + "loss": 0.0274, + "step": 326 + }, + { + "epoch": 0.6303614457831326, + "grad_norm": 0.358426570892334, + "learning_rate": 3.8564326523383214e-05, + "loss": 0.0207, + "step": 327 + }, + { + "epoch": 0.632289156626506, + "grad_norm": 0.3639723062515259, + "learning_rate": 3.8547594623879346e-05, + "loss": 0.0297, + "step": 328 + }, + { + "epoch": 0.6342168674698795, + "grad_norm": 0.3402212858200073, + "learning_rate": 3.853076946105188e-05, + "loss": 0.0258, + "step": 329 + }, + { + "epoch": 0.636144578313253, + "grad_norm": 0.4083027243614197, + "learning_rate": 3.85138511195032e-05, + "loss": 0.0351, + "step": 330 + }, + { + "epoch": 0.6380722891566265, + "grad_norm": 0.43532121181488037, + "learning_rate": 3.84968396843042e-05, + "loss": 0.0388, + "step": 331 + }, + { + "epoch": 0.64, + "grad_norm": 0.35353463888168335, + "learning_rate": 3.8479735240993904e-05, + "loss": 0.0203, + "step": 332 + }, + { + "epoch": 0.6419277108433735, + "grad_norm": 0.350149929523468, + "learning_rate": 3.846253787557901e-05, + "loss": 0.0261, + "step": 333 + }, + { + "epoch": 0.643855421686747, + "grad_norm": 0.7665389180183411, + "learning_rate": 3.844524767453344e-05, + "loss": 0.0108, + "step": 334 + }, + { + "epoch": 0.6457831325301204, + "grad_norm": 0.44621360301971436, + "learning_rate": 3.842786472479795e-05, + "loss": 0.0282, + "step": 335 + }, + { + "epoch": 0.6477108433734939, + "grad_norm": 0.7787201404571533, + "learning_rate": 3.841038911377962e-05, + "loss": 0.0216, + "step": 336 + }, + { + "epoch": 0.6496385542168674, + "grad_norm": 0.48260653018951416, + "learning_rate": 3.839282092935153e-05, + "loss": 0.0234, + "step": 337 + }, + { + "epoch": 0.651566265060241, + "grad_norm": 0.4987852871417999, + "learning_rate": 3.837516025985219e-05, + "loss": 0.0515, + "step": 338 + }, + { + "epoch": 0.6534939759036145, + "grad_norm": 0.9030266404151917, + "learning_rate": 3.835740719408517e-05, + "loss": 0.0508, + "step": 339 + }, + { + "epoch": 0.655421686746988, + "grad_norm": 0.6381701231002808, + "learning_rate": 3.833956182131867e-05, + "loss": 0.0405, + "step": 340 + }, + { + "epoch": 0.6573493975903615, + "grad_norm": 0.42828986048698425, + "learning_rate": 3.832162423128499e-05, + "loss": 0.024, + "step": 341 + }, + { + "epoch": 0.659277108433735, + "grad_norm": 0.38725873827934265, + "learning_rate": 3.8303594514180164e-05, + "loss": 0.0199, + "step": 342 + }, + { + "epoch": 0.6612048192771084, + "grad_norm": 0.23280498385429382, + "learning_rate": 3.828547276066346e-05, + "loss": 0.0101, + "step": 343 + }, + { + "epoch": 0.6631325301204819, + "grad_norm": 0.7298216819763184, + "learning_rate": 3.8267259061856925e-05, + "loss": 0.0455, + "step": 344 + }, + { + "epoch": 0.6650602409638554, + "grad_norm": 0.5975687503814697, + "learning_rate": 3.824895350934496e-05, + "loss": 0.0372, + "step": 345 + }, + { + "epoch": 0.6669879518072289, + "grad_norm": 0.6295403242111206, + "learning_rate": 3.823055619517381e-05, + "loss": 0.0362, + "step": 346 + }, + { + "epoch": 0.6689156626506024, + "grad_norm": 0.5086020827293396, + "learning_rate": 3.821206721185115e-05, + "loss": 0.0368, + "step": 347 + }, + { + "epoch": 0.6708433734939759, + "grad_norm": 0.34506168961524963, + "learning_rate": 3.819348665234557e-05, + "loss": 0.0178, + "step": 348 + }, + { + "epoch": 0.6727710843373494, + "grad_norm": 1.309940218925476, + "learning_rate": 3.817481461008617e-05, + "loss": 0.024, + "step": 349 + }, + { + "epoch": 0.6746987951807228, + "grad_norm": 0.4074770510196686, + "learning_rate": 3.815605117896204e-05, + "loss": 0.0262, + "step": 350 + }, + { + "epoch": 0.6766265060240964, + "grad_norm": 0.48525840044021606, + "learning_rate": 3.8137196453321775e-05, + "loss": 0.0209, + "step": 351 + }, + { + "epoch": 0.6785542168674699, + "grad_norm": 0.7199739217758179, + "learning_rate": 3.811825052797308e-05, + "loss": 0.0396, + "step": 352 + }, + { + "epoch": 0.6804819277108434, + "grad_norm": 0.519540011882782, + "learning_rate": 3.8099213498182196e-05, + "loss": 0.0453, + "step": 353 + }, + { + "epoch": 0.6824096385542169, + "grad_norm": 0.9738391041755676, + "learning_rate": 3.808008545967349e-05, + "loss": 0.0317, + "step": 354 + }, + { + "epoch": 0.6843373493975904, + "grad_norm": 1.888344407081604, + "learning_rate": 3.8060866508628953e-05, + "loss": 0.0452, + "step": 355 + }, + { + "epoch": 0.6862650602409639, + "grad_norm": 0.48989811539649963, + "learning_rate": 3.8041556741687695e-05, + "loss": 0.0315, + "step": 356 + }, + { + "epoch": 0.6881927710843373, + "grad_norm": 0.3764645457267761, + "learning_rate": 3.8022156255945496e-05, + "loss": 0.0269, + "step": 357 + }, + { + "epoch": 0.6901204819277108, + "grad_norm": 0.46409738063812256, + "learning_rate": 3.800266514895429e-05, + "loss": 0.0171, + "step": 358 + }, + { + "epoch": 0.6920481927710843, + "grad_norm": 0.41091030836105347, + "learning_rate": 3.7983083518721695e-05, + "loss": 0.0167, + "step": 359 + }, + { + "epoch": 0.6939759036144578, + "grad_norm": 0.8375523090362549, + "learning_rate": 3.79634114637105e-05, + "loss": 0.0342, + "step": 360 + }, + { + "epoch": 0.6959036144578313, + "grad_norm": 1.7053394317626953, + "learning_rate": 3.794364908283817e-05, + "loss": 0.02, + "step": 361 + }, + { + "epoch": 0.6978313253012048, + "grad_norm": 0.4163115918636322, + "learning_rate": 3.792379647547637e-05, + "loss": 0.0138, + "step": 362 + }, + { + "epoch": 0.6997590361445784, + "grad_norm": 0.388751745223999, + "learning_rate": 3.790385374145046e-05, + "loss": 0.0172, + "step": 363 + }, + { + "epoch": 0.7016867469879519, + "grad_norm": 0.5584064722061157, + "learning_rate": 3.7883820981038966e-05, + "loss": 0.0254, + "step": 364 + }, + { + "epoch": 0.7036144578313253, + "grad_norm": 1.394264817237854, + "learning_rate": 3.7863698294973114e-05, + "loss": 0.037, + "step": 365 + }, + { + "epoch": 0.7055421686746988, + "grad_norm": 0.46280744671821594, + "learning_rate": 3.78434857844363e-05, + "loss": 0.0234, + "step": 366 + }, + { + "epoch": 0.7074698795180723, + "grad_norm": 0.39548924565315247, + "learning_rate": 3.782318355106358e-05, + "loss": 0.0164, + "step": 367 + }, + { + "epoch": 0.7093975903614458, + "grad_norm": 0.7307773232460022, + "learning_rate": 3.780279169694118e-05, + "loss": 0.0192, + "step": 368 + }, + { + "epoch": 0.7113253012048193, + "grad_norm": 0.28035807609558105, + "learning_rate": 3.778231032460594e-05, + "loss": 0.0131, + "step": 369 + }, + { + "epoch": 0.7132530120481928, + "grad_norm": 0.8376953601837158, + "learning_rate": 3.776173953704486e-05, + "loss": 0.0291, + "step": 370 + }, + { + "epoch": 0.7151807228915663, + "grad_norm": 0.7356843948364258, + "learning_rate": 3.774107943769454e-05, + "loss": 0.0214, + "step": 371 + }, + { + "epoch": 0.7171084337349397, + "grad_norm": 0.41503390669822693, + "learning_rate": 3.772033013044064e-05, + "loss": 0.0221, + "step": 372 + }, + { + "epoch": 0.7190361445783132, + "grad_norm": 0.35732385516166687, + "learning_rate": 3.7699491719617436e-05, + "loss": 0.015, + "step": 373 + }, + { + "epoch": 0.7209638554216867, + "grad_norm": 0.283778578042984, + "learning_rate": 3.76785643100072e-05, + "loss": 0.0146, + "step": 374 + }, + { + "epoch": 0.7228915662650602, + "grad_norm": 0.3219413459300995, + "learning_rate": 3.765754800683974e-05, + "loss": 0.015, + "step": 375 + }, + { + "epoch": 0.7248192771084337, + "grad_norm": 0.610431432723999, + "learning_rate": 3.7636442915791856e-05, + "loss": 0.0326, + "step": 376 + }, + { + "epoch": 0.7267469879518073, + "grad_norm": 4.944870948791504, + "learning_rate": 3.7615249142986784e-05, + "loss": 0.0432, + "step": 377 + }, + { + "epoch": 0.7286746987951808, + "grad_norm": 0.4894593060016632, + "learning_rate": 3.7593966794993696e-05, + "loss": 0.0174, + "step": 378 + }, + { + "epoch": 0.7306024096385542, + "grad_norm": 0.4211325943470001, + "learning_rate": 3.757259597882714e-05, + "loss": 0.023, + "step": 379 + }, + { + "epoch": 0.7325301204819277, + "grad_norm": 0.33621737360954285, + "learning_rate": 3.755113680194651e-05, + "loss": 0.0201, + "step": 380 + }, + { + "epoch": 0.7344578313253012, + "grad_norm": 0.5799694657325745, + "learning_rate": 3.7529589372255514e-05, + "loss": 0.0173, + "step": 381 + }, + { + "epoch": 0.7363855421686747, + "grad_norm": 0.5172572731971741, + "learning_rate": 3.750795379810162e-05, + "loss": 0.0284, + "step": 382 + }, + { + "epoch": 0.7383132530120482, + "grad_norm": 0.5715453028678894, + "learning_rate": 3.748623018827552e-05, + "loss": 0.0194, + "step": 383 + }, + { + "epoch": 0.7402409638554217, + "grad_norm": 0.5284178256988525, + "learning_rate": 3.746441865201056e-05, + "loss": 0.0247, + "step": 384 + }, + { + "epoch": 0.7421686746987952, + "grad_norm": 0.37828654050827026, + "learning_rate": 3.744251929898223e-05, + "loss": 0.0097, + "step": 385 + }, + { + "epoch": 0.7440963855421687, + "grad_norm": 0.3252779543399811, + "learning_rate": 3.742053223930758e-05, + "loss": 0.0238, + "step": 386 + }, + { + "epoch": 0.7460240963855421, + "grad_norm": 0.6031543612480164, + "learning_rate": 3.7398457583544674e-05, + "loss": 0.0332, + "step": 387 + }, + { + "epoch": 0.7479518072289156, + "grad_norm": 0.23846614360809326, + "learning_rate": 3.737629544269206e-05, + "loss": 0.0122, + "step": 388 + }, + { + "epoch": 0.7498795180722891, + "grad_norm": 0.5274029970169067, + "learning_rate": 3.7354045928188155e-05, + "loss": 0.0324, + "step": 389 + }, + { + "epoch": 0.7518072289156627, + "grad_norm": 0.4672217071056366, + "learning_rate": 3.733170915191075e-05, + "loss": 0.0196, + "step": 390 + }, + { + "epoch": 0.7537349397590362, + "grad_norm": 0.29819396138191223, + "learning_rate": 3.730928522617639e-05, + "loss": 0.0131, + "step": 391 + }, + { + "epoch": 0.7556626506024097, + "grad_norm": 0.43824997544288635, + "learning_rate": 3.7286774263739855e-05, + "loss": 0.0238, + "step": 392 + }, + { + "epoch": 0.7575903614457832, + "grad_norm": 0.2822072505950928, + "learning_rate": 3.726417637779357e-05, + "loss": 0.0314, + "step": 393 + }, + { + "epoch": 0.7595180722891566, + "grad_norm": 0.43815648555755615, + "learning_rate": 3.7241491681967044e-05, + "loss": 0.0144, + "step": 394 + }, + { + "epoch": 0.7614457831325301, + "grad_norm": 0.37194815278053284, + "learning_rate": 3.721872029032628e-05, + "loss": 0.0286, + "step": 395 + }, + { + "epoch": 0.7633734939759036, + "grad_norm": 0.7319737672805786, + "learning_rate": 3.719586231737322e-05, + "loss": 0.0427, + "step": 396 + }, + { + "epoch": 0.7653012048192771, + "grad_norm": 0.5870066285133362, + "learning_rate": 3.717291787804517e-05, + "loss": 0.0138, + "step": 397 + }, + { + "epoch": 0.7672289156626506, + "grad_norm": 0.6574277281761169, + "learning_rate": 3.7149887087714225e-05, + "loss": 0.061, + "step": 398 + }, + { + "epoch": 0.7691566265060241, + "grad_norm": 0.5467348694801331, + "learning_rate": 3.712677006218666e-05, + "loss": 0.022, + "step": 399 + }, + { + "epoch": 0.7710843373493976, + "grad_norm": 0.3589288890361786, + "learning_rate": 3.710356691770238e-05, + "loss": 0.0161, + "step": 400 + }, + { + "epoch": 0.7730120481927711, + "grad_norm": 0.574630618095398, + "learning_rate": 3.708027777093433e-05, + "loss": 0.0285, + "step": 401 + }, + { + "epoch": 0.7749397590361445, + "grad_norm": 0.39048445224761963, + "learning_rate": 3.70569027389879e-05, + "loss": 0.012, + "step": 402 + }, + { + "epoch": 0.776867469879518, + "grad_norm": 0.34803536534309387, + "learning_rate": 3.703344193940032e-05, + "loss": 0.0155, + "step": 403 + }, + { + "epoch": 0.7787951807228916, + "grad_norm": 1.188948392868042, + "learning_rate": 3.700989549014011e-05, + "loss": 0.0617, + "step": 404 + }, + { + "epoch": 0.7807228915662651, + "grad_norm": 0.473157674074173, + "learning_rate": 3.698626350960646e-05, + "loss": 0.0298, + "step": 405 + }, + { + "epoch": 0.7826506024096386, + "grad_norm": 0.42009076476097107, + "learning_rate": 3.6962546116628634e-05, + "loss": 0.03, + "step": 406 + }, + { + "epoch": 0.7845783132530121, + "grad_norm": 0.6334308981895447, + "learning_rate": 3.693874343046537e-05, + "loss": 0.0107, + "step": 407 + }, + { + "epoch": 0.7865060240963856, + "grad_norm": 0.35594677925109863, + "learning_rate": 3.6914855570804314e-05, + "loss": 0.0174, + "step": 408 + }, + { + "epoch": 0.788433734939759, + "grad_norm": 0.28985708951950073, + "learning_rate": 3.689088265776136e-05, + "loss": 0.0149, + "step": 409 + }, + { + "epoch": 0.7903614457831325, + "grad_norm": 0.3981950581073761, + "learning_rate": 3.686682481188011e-05, + "loss": 0.019, + "step": 410 + }, + { + "epoch": 0.792289156626506, + "grad_norm": 0.48819583654403687, + "learning_rate": 3.6842682154131193e-05, + "loss": 0.0217, + "step": 411 + }, + { + "epoch": 0.7942168674698795, + "grad_norm": 0.42819952964782715, + "learning_rate": 3.681845480591174e-05, + "loss": 0.0198, + "step": 412 + }, + { + "epoch": 0.796144578313253, + "grad_norm": 0.48591694235801697, + "learning_rate": 3.6794142889044727e-05, + "loss": 0.0253, + "step": 413 + }, + { + "epoch": 0.7980722891566265, + "grad_norm": 0.4730607271194458, + "learning_rate": 3.676974652577835e-05, + "loss": 0.0329, + "step": 414 + }, + { + "epoch": 0.8, + "grad_norm": 0.5390865802764893, + "learning_rate": 3.6745265838785434e-05, + "loss": 0.0479, + "step": 415 + }, + { + "epoch": 0.8019277108433734, + "grad_norm": 0.6377891302108765, + "learning_rate": 3.672070095116283e-05, + "loss": 0.019, + "step": 416 + }, + { + "epoch": 0.803855421686747, + "grad_norm": 0.8984615206718445, + "learning_rate": 3.669605198643075e-05, + "loss": 0.0444, + "step": 417 + }, + { + "epoch": 0.8057831325301205, + "grad_norm": 0.4913877546787262, + "learning_rate": 3.667131906853219e-05, + "loss": 0.031, + "step": 418 + }, + { + "epoch": 0.807710843373494, + "grad_norm": 0.37894028425216675, + "learning_rate": 3.664650232183229e-05, + "loss": 0.0195, + "step": 419 + }, + { + "epoch": 0.8096385542168675, + "grad_norm": 0.3644949495792389, + "learning_rate": 3.66216018711177e-05, + "loss": 0.018, + "step": 420 + }, + { + "epoch": 0.811566265060241, + "grad_norm": 0.414440393447876, + "learning_rate": 3.659661784159597e-05, + "loss": 0.0188, + "step": 421 + }, + { + "epoch": 0.8134939759036145, + "grad_norm": 0.49220341444015503, + "learning_rate": 3.65715503588949e-05, + "loss": 0.016, + "step": 422 + }, + { + "epoch": 0.815421686746988, + "grad_norm": 1.0939836502075195, + "learning_rate": 3.654639954906193e-05, + "loss": 0.0758, + "step": 423 + }, + { + "epoch": 0.8173493975903614, + "grad_norm": 0.43222442269325256, + "learning_rate": 3.652116553856349e-05, + "loss": 0.0308, + "step": 424 + }, + { + "epoch": 0.8192771084337349, + "grad_norm": 0.5081896185874939, + "learning_rate": 3.649584845428438e-05, + "loss": 0.0493, + "step": 425 + }, + { + "epoch": 0.8212048192771084, + "grad_norm": 0.9811948537826538, + "learning_rate": 3.64704484235271e-05, + "loss": 0.019, + "step": 426 + }, + { + "epoch": 0.8231325301204819, + "grad_norm": 0.31656572222709656, + "learning_rate": 3.6444965574011255e-05, + "loss": 0.0135, + "step": 427 + }, + { + "epoch": 0.8250602409638554, + "grad_norm": 0.7844433188438416, + "learning_rate": 3.641940003387289e-05, + "loss": 0.0402, + "step": 428 + }, + { + "epoch": 0.826987951807229, + "grad_norm": 0.3353273570537567, + "learning_rate": 3.6393751931663814e-05, + "loss": 0.0132, + "step": 429 + }, + { + "epoch": 0.8289156626506025, + "grad_norm": 0.7253058552742004, + "learning_rate": 3.6368021396351015e-05, + "loss": 0.0296, + "step": 430 + }, + { + "epoch": 0.8308433734939759, + "grad_norm": 0.45300304889678955, + "learning_rate": 3.634220855731598e-05, + "loss": 0.0258, + "step": 431 + }, + { + "epoch": 0.8327710843373494, + "grad_norm": 0.3480473458766937, + "learning_rate": 3.631631354435403e-05, + "loss": 0.0099, + "step": 432 + }, + { + "epoch": 0.8346987951807229, + "grad_norm": 2.1114516258239746, + "learning_rate": 3.62903364876737e-05, + "loss": 0.0457, + "step": 433 + }, + { + "epoch": 0.8366265060240964, + "grad_norm": 0.5649561882019043, + "learning_rate": 3.626427751789606e-05, + "loss": 0.0444, + "step": 434 + }, + { + "epoch": 0.8385542168674699, + "grad_norm": 0.3864995539188385, + "learning_rate": 3.623813676605405e-05, + "loss": 0.0223, + "step": 435 + }, + { + "epoch": 0.8404819277108434, + "grad_norm": 1.2134298086166382, + "learning_rate": 3.621191436359186e-05, + "loss": 0.0353, + "step": 436 + }, + { + "epoch": 0.8424096385542169, + "grad_norm": 0.4403415024280548, + "learning_rate": 3.6185610442364246e-05, + "loss": 0.0216, + "step": 437 + }, + { + "epoch": 0.8443373493975903, + "grad_norm": 0.6050297021865845, + "learning_rate": 3.6159225134635846e-05, + "loss": 0.0433, + "step": 438 + }, + { + "epoch": 0.8462650602409638, + "grad_norm": 0.7951678037643433, + "learning_rate": 3.6132758573080556e-05, + "loss": 0.031, + "step": 439 + }, + { + "epoch": 0.8481927710843373, + "grad_norm": 0.4991949796676636, + "learning_rate": 3.6106210890780834e-05, + "loss": 0.0313, + "step": 440 + }, + { + "epoch": 0.8501204819277108, + "grad_norm": 0.47951385378837585, + "learning_rate": 3.607958222122704e-05, + "loss": 0.0218, + "step": 441 + }, + { + "epoch": 0.8520481927710843, + "grad_norm": 0.7345194220542908, + "learning_rate": 3.6052872698316755e-05, + "loss": 0.0239, + "step": 442 + }, + { + "epoch": 0.8539759036144579, + "grad_norm": 1.4814884662628174, + "learning_rate": 3.602608245635414e-05, + "loss": 0.0127, + "step": 443 + }, + { + "epoch": 0.8559036144578314, + "grad_norm": 2.4240877628326416, + "learning_rate": 3.599921163004922e-05, + "loss": 0.0618, + "step": 444 + }, + { + "epoch": 0.8578313253012049, + "grad_norm": 0.41523510217666626, + "learning_rate": 3.5972260354517216e-05, + "loss": 0.0283, + "step": 445 + }, + { + "epoch": 0.8597590361445783, + "grad_norm": 0.5577677488327026, + "learning_rate": 3.594522876527791e-05, + "loss": 0.0271, + "step": 446 + }, + { + "epoch": 0.8616867469879518, + "grad_norm": 0.5829064846038818, + "learning_rate": 3.591811699825487e-05, + "loss": 0.0169, + "step": 447 + }, + { + "epoch": 0.8636144578313253, + "grad_norm": 0.4478822350502014, + "learning_rate": 3.5890925189774886e-05, + "loss": 0.0239, + "step": 448 + }, + { + "epoch": 0.8655421686746988, + "grad_norm": 0.3498048782348633, + "learning_rate": 3.586365347656718e-05, + "loss": 0.0137, + "step": 449 + }, + { + "epoch": 0.8674698795180723, + "grad_norm": 0.6571130156517029, + "learning_rate": 3.583630199576278e-05, + "loss": 0.027, + "step": 450 + }, + { + "epoch": 0.8693975903614458, + "grad_norm": 0.344970166683197, + "learning_rate": 3.58088708848938e-05, + "loss": 0.0167, + "step": 451 + }, + { + "epoch": 0.8713253012048193, + "grad_norm": 0.34611570835113525, + "learning_rate": 3.5781360281892775e-05, + "loss": 0.0468, + "step": 452 + }, + { + "epoch": 0.8732530120481927, + "grad_norm": 0.66157066822052, + "learning_rate": 3.575377032509194e-05, + "loss": 0.0344, + "step": 453 + }, + { + "epoch": 0.8751807228915662, + "grad_norm": 0.3676326870918274, + "learning_rate": 3.5726101153222534e-05, + "loss": 0.0366, + "step": 454 + }, + { + "epoch": 0.8771084337349397, + "grad_norm": 0.5958423018455505, + "learning_rate": 3.569835290541414e-05, + "loss": 0.0382, + "step": 455 + }, + { + "epoch": 0.8790361445783132, + "grad_norm": 0.36787471175193787, + "learning_rate": 3.567052572119397e-05, + "loss": 0.018, + "step": 456 + }, + { + "epoch": 0.8809638554216868, + "grad_norm": 0.9478234052658081, + "learning_rate": 3.564261974048611e-05, + "loss": 0.0179, + "step": 457 + }, + { + "epoch": 0.8828915662650603, + "grad_norm": 0.3337579369544983, + "learning_rate": 3.56146351036109e-05, + "loss": 0.0147, + "step": 458 + }, + { + "epoch": 0.8848192771084338, + "grad_norm": 0.4984932243824005, + "learning_rate": 3.558657195128416e-05, + "loss": 0.0224, + "step": 459 + }, + { + "epoch": 0.8867469879518072, + "grad_norm": 0.36718735098838806, + "learning_rate": 3.555843042461653e-05, + "loss": 0.0202, + "step": 460 + }, + { + "epoch": 0.8886746987951807, + "grad_norm": 0.4081745445728302, + "learning_rate": 3.553021066511274e-05, + "loss": 0.0288, + "step": 461 + }, + { + "epoch": 0.8906024096385542, + "grad_norm": 0.3233242332935333, + "learning_rate": 3.55019128146709e-05, + "loss": 0.0362, + "step": 462 + }, + { + "epoch": 0.8925301204819277, + "grad_norm": 0.6560158729553223, + "learning_rate": 3.547353701558178e-05, + "loss": 0.038, + "step": 463 + }, + { + "epoch": 0.8944578313253012, + "grad_norm": 0.47668641805648804, + "learning_rate": 3.544508341052811e-05, + "loss": 0.0399, + "step": 464 + }, + { + "epoch": 0.8963855421686747, + "grad_norm": 0.45512664318084717, + "learning_rate": 3.541655214258383e-05, + "loss": 0.022, + "step": 465 + }, + { + "epoch": 0.8983132530120482, + "grad_norm": 0.8410730361938477, + "learning_rate": 3.538794335521343e-05, + "loss": 0.0315, + "step": 466 + }, + { + "epoch": 0.9002409638554217, + "grad_norm": 0.4872909486293793, + "learning_rate": 3.535925719227117e-05, + "loss": 0.0152, + "step": 467 + }, + { + "epoch": 0.9021686746987951, + "grad_norm": 0.45623311400413513, + "learning_rate": 3.533049379800038e-05, + "loss": 0.0305, + "step": 468 + }, + { + "epoch": 0.9040963855421686, + "grad_norm": 0.43087029457092285, + "learning_rate": 3.530165331703275e-05, + "loss": 0.0131, + "step": 469 + }, + { + "epoch": 0.9060240963855422, + "grad_norm": 0.4610525369644165, + "learning_rate": 3.527273589438756e-05, + "loss": 0.0187, + "step": 470 + }, + { + "epoch": 0.9079518072289157, + "grad_norm": 0.3356114327907562, + "learning_rate": 3.5243741675471006e-05, + "loss": 0.0185, + "step": 471 + }, + { + "epoch": 0.9098795180722892, + "grad_norm": 0.9065960049629211, + "learning_rate": 3.5214670806075426e-05, + "loss": 0.0433, + "step": 472 + }, + { + "epoch": 0.9118072289156627, + "grad_norm": 0.3652578294277191, + "learning_rate": 3.518552343237858e-05, + "loss": 0.02, + "step": 473 + }, + { + "epoch": 0.9137349397590362, + "grad_norm": 0.32377883791923523, + "learning_rate": 3.5156299700942916e-05, + "loss": 0.0165, + "step": 474 + }, + { + "epoch": 0.9156626506024096, + "grad_norm": 0.2431817352771759, + "learning_rate": 3.512699975871485e-05, + "loss": 0.0172, + "step": 475 + }, + { + "epoch": 0.9175903614457831, + "grad_norm": 0.6390707492828369, + "learning_rate": 3.509762375302399e-05, + "loss": 0.0356, + "step": 476 + }, + { + "epoch": 0.9195180722891566, + "grad_norm": 0.2283092886209488, + "learning_rate": 3.506817183158243e-05, + "loss": 0.0088, + "step": 477 + }, + { + "epoch": 0.9214457831325301, + "grad_norm": 0.5053914189338684, + "learning_rate": 3.5038644142483966e-05, + "loss": 0.0389, + "step": 478 + }, + { + "epoch": 0.9233734939759036, + "grad_norm": 0.2567576467990875, + "learning_rate": 3.500904083420342e-05, + "loss": 0.0155, + "step": 479 + }, + { + "epoch": 0.9253012048192771, + "grad_norm": 0.6852384209632874, + "learning_rate": 3.497936205559583e-05, + "loss": 0.0247, + "step": 480 + }, + { + "epoch": 0.9272289156626506, + "grad_norm": 0.36403414607048035, + "learning_rate": 3.494960795589572e-05, + "loss": 0.023, + "step": 481 + }, + { + "epoch": 0.929156626506024, + "grad_norm": 0.506554901599884, + "learning_rate": 3.491977868471635e-05, + "loss": 0.0273, + "step": 482 + }, + { + "epoch": 0.9310843373493976, + "grad_norm": 0.38329923152923584, + "learning_rate": 3.4889874392048985e-05, + "loss": 0.0169, + "step": 483 + }, + { + "epoch": 0.9330120481927711, + "grad_norm": 0.2805836498737335, + "learning_rate": 3.48598952282621e-05, + "loss": 0.0105, + "step": 484 + }, + { + "epoch": 0.9349397590361446, + "grad_norm": 0.6315302848815918, + "learning_rate": 3.482984134410067e-05, + "loss": 0.0289, + "step": 485 + }, + { + "epoch": 0.9368674698795181, + "grad_norm": 0.6431388854980469, + "learning_rate": 3.479971289068537e-05, + "loss": 0.0311, + "step": 486 + }, + { + "epoch": 0.9387951807228916, + "grad_norm": 0.9794723391532898, + "learning_rate": 3.476951001951184e-05, + "loss": 0.0452, + "step": 487 + }, + { + "epoch": 0.9407228915662651, + "grad_norm": 0.7984824180603027, + "learning_rate": 3.473923288244991e-05, + "loss": 0.0689, + "step": 488 + }, + { + "epoch": 0.9426506024096386, + "grad_norm": 0.46362006664276123, + "learning_rate": 3.470888163174286e-05, + "loss": 0.0241, + "step": 489 + }, + { + "epoch": 0.944578313253012, + "grad_norm": 0.5051195025444031, + "learning_rate": 3.467845642000661e-05, + "loss": 0.0228, + "step": 490 + }, + { + "epoch": 0.9465060240963855, + "grad_norm": 0.3082812428474426, + "learning_rate": 3.4647957400229004e-05, + "loss": 0.0144, + "step": 491 + }, + { + "epoch": 0.948433734939759, + "grad_norm": 0.2691391110420227, + "learning_rate": 3.461738472576902e-05, + "loss": 0.0167, + "step": 492 + }, + { + "epoch": 0.9503614457831325, + "grad_norm": 0.5627671480178833, + "learning_rate": 3.458673855035597e-05, + "loss": 0.031, + "step": 493 + }, + { + "epoch": 0.952289156626506, + "grad_norm": 0.4571435749530792, + "learning_rate": 3.455601902808876e-05, + "loss": 0.0191, + "step": 494 + }, + { + "epoch": 0.9542168674698795, + "grad_norm": 1.0117709636688232, + "learning_rate": 3.452522631343515e-05, + "loss": 0.0192, + "step": 495 + }, + { + "epoch": 0.9561445783132531, + "grad_norm": 0.28375712037086487, + "learning_rate": 3.449436056123086e-05, + "loss": 0.0159, + "step": 496 + }, + { + "epoch": 0.9580722891566265, + "grad_norm": 0.26381856203079224, + "learning_rate": 3.446342192667893e-05, + "loss": 0.0113, + "step": 497 + }, + { + "epoch": 0.96, + "grad_norm": 0.49317577481269836, + "learning_rate": 3.443241056534884e-05, + "loss": 0.0332, + "step": 498 + }, + { + "epoch": 0.9619277108433735, + "grad_norm": 0.28884485363960266, + "learning_rate": 3.440132663317579e-05, + "loss": 0.0117, + "step": 499 + }, + { + "epoch": 0.963855421686747, + "grad_norm": 0.36255285143852234, + "learning_rate": 3.4370170286459864e-05, + "loss": 0.0169, + "step": 500 + }, + { + "epoch": 0.9657831325301205, + "grad_norm": 0.4265049993991852, + "learning_rate": 3.433894168186529e-05, + "loss": 0.0217, + "step": 501 + }, + { + "epoch": 0.967710843373494, + "grad_norm": 0.8169426321983337, + "learning_rate": 3.430764097641962e-05, + "loss": 0.0207, + "step": 502 + }, + { + "epoch": 0.9696385542168675, + "grad_norm": 1.866077184677124, + "learning_rate": 3.427626832751296e-05, + "loss": 0.0381, + "step": 503 + }, + { + "epoch": 0.971566265060241, + "grad_norm": 0.33124980330467224, + "learning_rate": 3.424482389289716e-05, + "loss": 0.0245, + "step": 504 + }, + { + "epoch": 0.9734939759036144, + "grad_norm": 0.37479540705680847, + "learning_rate": 3.4213307830685055e-05, + "loss": 0.0164, + "step": 505 + }, + { + "epoch": 0.9754216867469879, + "grad_norm": 0.39738863706588745, + "learning_rate": 3.4181720299349615e-05, + "loss": 0.0297, + "step": 506 + }, + { + "epoch": 0.9773493975903614, + "grad_norm": 0.2567287087440491, + "learning_rate": 3.4150061457723205e-05, + "loss": 0.0102, + "step": 507 + }, + { + "epoch": 0.9792771084337349, + "grad_norm": 0.6230517029762268, + "learning_rate": 3.411833146499675e-05, + "loss": 0.0243, + "step": 508 + }, + { + "epoch": 0.9812048192771085, + "grad_norm": 0.44843971729278564, + "learning_rate": 3.408653048071894e-05, + "loss": 0.0357, + "step": 509 + }, + { + "epoch": 0.983132530120482, + "grad_norm": 1.0569655895233154, + "learning_rate": 3.405465866479546e-05, + "loss": 0.037, + "step": 510 + }, + { + "epoch": 0.9850602409638555, + "grad_norm": 0.29000964760780334, + "learning_rate": 3.402271617748812e-05, + "loss": 0.0129, + "step": 511 + }, + { + "epoch": 0.9869879518072289, + "grad_norm": 2.1627447605133057, + "learning_rate": 3.399070317941413e-05, + "loss": 0.0442, + "step": 512 + }, + { + "epoch": 0.9889156626506024, + "grad_norm": 0.27371272444725037, + "learning_rate": 3.395861983154522e-05, + "loss": 0.0119, + "step": 513 + }, + { + "epoch": 0.9908433734939759, + "grad_norm": 0.4117226302623749, + "learning_rate": 3.392646629520688e-05, + "loss": 0.0455, + "step": 514 + }, + { + "epoch": 0.9927710843373494, + "grad_norm": 0.5098996758460999, + "learning_rate": 3.389424273207752e-05, + "loss": 0.0203, + "step": 515 + }, + { + "epoch": 0.9946987951807229, + "grad_norm": 0.5192157626152039, + "learning_rate": 3.386194930418767e-05, + "loss": 0.0329, + "step": 516 + }, + { + "epoch": 0.9966265060240964, + "grad_norm": 0.18757697939872742, + "learning_rate": 3.382958617391915e-05, + "loss": 0.0065, + "step": 517 + }, + { + "epoch": 0.9985542168674699, + "grad_norm": 0.3334413170814514, + "learning_rate": 3.3797153504004296e-05, + "loss": 0.0266, + "step": 518 + }, + { + "epoch": 1.0, + "grad_norm": 0.4152225852012634, + "learning_rate": 3.3764651457525095e-05, + "loss": 0.0169, + "step": 519 + }, + { + "epoch": 1.0019277108433735, + "grad_norm": 0.43535247445106506, + "learning_rate": 3.373208019791237e-05, + "loss": 0.0221, + "step": 520 + }, + { + "epoch": 1.003855421686747, + "grad_norm": 0.39292722940444946, + "learning_rate": 3.3699439888945e-05, + "loss": 0.0211, + "step": 521 + }, + { + "epoch": 1.0057831325301205, + "grad_norm": 0.19566713273525238, + "learning_rate": 3.366673069474904e-05, + "loss": 0.0069, + "step": 522 + }, + { + "epoch": 1.007710843373494, + "grad_norm": 0.5101853609085083, + "learning_rate": 3.3633952779796914e-05, + "loss": 0.0191, + "step": 523 + }, + { + "epoch": 1.0096385542168675, + "grad_norm": 0.999434769153595, + "learning_rate": 3.360110630890664e-05, + "loss": 0.0196, + "step": 524 + }, + { + "epoch": 1.011566265060241, + "grad_norm": 0.4646223783493042, + "learning_rate": 3.356819144724092e-05, + "loss": 0.0328, + "step": 525 + }, + { + "epoch": 1.0134939759036146, + "grad_norm": 0.3132480978965759, + "learning_rate": 3.3535208360306354e-05, + "loss": 0.0203, + "step": 526 + }, + { + "epoch": 1.0154216867469879, + "grad_norm": 0.3038032352924347, + "learning_rate": 3.350215721395261e-05, + "loss": 0.0122, + "step": 527 + }, + { + "epoch": 1.0173493975903614, + "grad_norm": 0.45082882046699524, + "learning_rate": 3.346903817437157e-05, + "loss": 0.0437, + "step": 528 + }, + { + "epoch": 1.0192771084337349, + "grad_norm": 0.26917046308517456, + "learning_rate": 3.343585140809651e-05, + "loss": 0.013, + "step": 529 + }, + { + "epoch": 1.0212048192771084, + "grad_norm": 0.23869264125823975, + "learning_rate": 3.3402597082001276e-05, + "loss": 0.008, + "step": 530 + }, + { + "epoch": 1.0231325301204819, + "grad_norm": 0.31315353512763977, + "learning_rate": 3.3369275363299394e-05, + "loss": 0.0078, + "step": 531 + }, + { + "epoch": 1.0250602409638554, + "grad_norm": 0.4780346751213074, + "learning_rate": 3.333588641954327e-05, + "loss": 0.0225, + "step": 532 + }, + { + "epoch": 1.026987951807229, + "grad_norm": 0.2920368015766144, + "learning_rate": 3.330243041862336e-05, + "loss": 0.0118, + "step": 533 + }, + { + "epoch": 1.0289156626506024, + "grad_norm": 0.543669581413269, + "learning_rate": 3.326890752876728e-05, + "loss": 0.0338, + "step": 534 + }, + { + "epoch": 1.030843373493976, + "grad_norm": 0.4288000464439392, + "learning_rate": 3.323531791853901e-05, + "loss": 0.0341, + "step": 535 + }, + { + "epoch": 1.0327710843373494, + "grad_norm": 0.26600322127342224, + "learning_rate": 3.3201661756838e-05, + "loss": 0.0184, + "step": 536 + }, + { + "epoch": 1.034698795180723, + "grad_norm": 0.290937602519989, + "learning_rate": 3.316793921289835e-05, + "loss": 0.0152, + "step": 537 + }, + { + "epoch": 1.0366265060240965, + "grad_norm": 0.7621443271636963, + "learning_rate": 3.313415045628795e-05, + "loss": 0.0326, + "step": 538 + }, + { + "epoch": 1.03855421686747, + "grad_norm": 0.5581283569335938, + "learning_rate": 3.3100295656907646e-05, + "loss": 0.0164, + "step": 539 + }, + { + "epoch": 1.0404819277108435, + "grad_norm": 0.20930901169776917, + "learning_rate": 3.306637498499034e-05, + "loss": 0.0091, + "step": 540 + }, + { + "epoch": 1.0424096385542168, + "grad_norm": 0.46212059259414673, + "learning_rate": 3.303238861110018e-05, + "loss": 0.0118, + "step": 541 + }, + { + "epoch": 1.0443373493975903, + "grad_norm": 0.38259151577949524, + "learning_rate": 3.299833670613168e-05, + "loss": 0.0081, + "step": 542 + }, + { + "epoch": 1.0462650602409638, + "grad_norm": 0.4888618290424347, + "learning_rate": 3.2964219441308865e-05, + "loss": 0.0138, + "step": 543 + }, + { + "epoch": 1.0481927710843373, + "grad_norm": 0.32103127241134644, + "learning_rate": 3.2930036988184425e-05, + "loss": 0.0171, + "step": 544 + }, + { + "epoch": 1.0501204819277108, + "grad_norm": 0.27787327766418457, + "learning_rate": 3.28957895186388e-05, + "loss": 0.0106, + "step": 545 + }, + { + "epoch": 1.0520481927710843, + "grad_norm": 0.35597777366638184, + "learning_rate": 3.2861477204879395e-05, + "loss": 0.0123, + "step": 546 + }, + { + "epoch": 1.0539759036144578, + "grad_norm": 0.3619804084300995, + "learning_rate": 3.2827100219439656e-05, + "loss": 0.0088, + "step": 547 + }, + { + "epoch": 1.0559036144578313, + "grad_norm": 0.2525513470172882, + "learning_rate": 3.279265873517822e-05, + "loss": 0.0179, + "step": 548 + }, + { + "epoch": 1.0578313253012048, + "grad_norm": 0.3910020887851715, + "learning_rate": 3.275815292527804e-05, + "loss": 0.0142, + "step": 549 + }, + { + "epoch": 1.0597590361445783, + "grad_norm": 0.30515050888061523, + "learning_rate": 3.2723582963245526e-05, + "loss": 0.0123, + "step": 550 + }, + { + "epoch": 1.0616867469879518, + "grad_norm": 0.21708644926548004, + "learning_rate": 3.2688949022909665e-05, + "loss": 0.0098, + "step": 551 + }, + { + "epoch": 1.0636144578313254, + "grad_norm": 0.23307719826698303, + "learning_rate": 3.265425127842114e-05, + "loss": 0.0097, + "step": 552 + }, + { + "epoch": 1.0655421686746989, + "grad_norm": 0.676654577255249, + "learning_rate": 3.261948990425147e-05, + "loss": 0.0227, + "step": 553 + }, + { + "epoch": 1.0674698795180724, + "grad_norm": 0.4593975841999054, + "learning_rate": 3.258466507519213e-05, + "loss": 0.047, + "step": 554 + }, + { + "epoch": 1.0693975903614459, + "grad_norm": 0.19405829906463623, + "learning_rate": 3.254977696635366e-05, + "loss": 0.0314, + "step": 555 + }, + { + "epoch": 1.0713253012048192, + "grad_norm": 0.14563389122486115, + "learning_rate": 3.2514825753164774e-05, + "loss": 0.0046, + "step": 556 + }, + { + "epoch": 1.0732530120481927, + "grad_norm": 0.2642340064048767, + "learning_rate": 3.247981161137153e-05, + "loss": 0.022, + "step": 557 + }, + { + "epoch": 1.0751807228915662, + "grad_norm": 0.17274761199951172, + "learning_rate": 3.2444734717036386e-05, + "loss": 0.0134, + "step": 558 + }, + { + "epoch": 1.0771084337349397, + "grad_norm": 0.44354626536369324, + "learning_rate": 3.240959524653735e-05, + "loss": 0.0211, + "step": 559 + }, + { + "epoch": 1.0790361445783132, + "grad_norm": 0.2806888818740845, + "learning_rate": 3.237439337656708e-05, + "loss": 0.0141, + "step": 560 + }, + { + "epoch": 1.0809638554216867, + "grad_norm": 0.21679501235485077, + "learning_rate": 3.2339129284131994e-05, + "loss": 0.019, + "step": 561 + }, + { + "epoch": 1.0828915662650602, + "grad_norm": 0.3040260076522827, + "learning_rate": 3.2303803146551386e-05, + "loss": 0.0249, + "step": 562 + }, + { + "epoch": 1.0848192771084337, + "grad_norm": 0.2793775200843811, + "learning_rate": 3.226841514145656e-05, + "loss": 0.0088, + "step": 563 + }, + { + "epoch": 1.0867469879518072, + "grad_norm": 0.149955615401268, + "learning_rate": 3.223296544678987e-05, + "loss": 0.0054, + "step": 564 + }, + { + "epoch": 1.0886746987951808, + "grad_norm": 0.22166767716407776, + "learning_rate": 3.219745424080389e-05, + "loss": 0.0109, + "step": 565 + }, + { + "epoch": 1.0906024096385543, + "grad_norm": 0.22399431467056274, + "learning_rate": 3.2161881702060476e-05, + "loss": 0.0106, + "step": 566 + }, + { + "epoch": 1.0925301204819278, + "grad_norm": 0.18537986278533936, + "learning_rate": 3.2126248009429905e-05, + "loss": 0.0077, + "step": 567 + }, + { + "epoch": 1.0944578313253013, + "grad_norm": 0.24511495232582092, + "learning_rate": 3.2090553342089935e-05, + "loss": 0.0093, + "step": 568 + }, + { + "epoch": 1.0963855421686748, + "grad_norm": 0.4766045808792114, + "learning_rate": 3.205479787952494e-05, + "loss": 0.036, + "step": 569 + }, + { + "epoch": 1.0983132530120483, + "grad_norm": 0.1425715535879135, + "learning_rate": 3.201898180152499e-05, + "loss": 0.0085, + "step": 570 + }, + { + "epoch": 1.1002409638554216, + "grad_norm": 0.1909666359424591, + "learning_rate": 3.1983105288184945e-05, + "loss": 0.0081, + "step": 571 + }, + { + "epoch": 1.102168674698795, + "grad_norm": 0.44077104330062866, + "learning_rate": 3.194716851990355e-05, + "loss": 0.017, + "step": 572 + }, + { + "epoch": 1.1040963855421686, + "grad_norm": 0.5757400989532471, + "learning_rate": 3.191117167738253e-05, + "loss": 0.021, + "step": 573 + }, + { + "epoch": 1.106024096385542, + "grad_norm": 0.1977701038122177, + "learning_rate": 3.1875114941625705e-05, + "loss": 0.0096, + "step": 574 + }, + { + "epoch": 1.1079518072289156, + "grad_norm": 0.3524581491947174, + "learning_rate": 3.1838998493938026e-05, + "loss": 0.0118, + "step": 575 + }, + { + "epoch": 1.1098795180722891, + "grad_norm": 0.3301331698894501, + "learning_rate": 3.180282251592472e-05, + "loss": 0.0094, + "step": 576 + }, + { + "epoch": 1.1118072289156626, + "grad_norm": 0.2774488925933838, + "learning_rate": 3.1766587189490336e-05, + "loss": 0.0131, + "step": 577 + }, + { + "epoch": 1.1137349397590361, + "grad_norm": 1.732595443725586, + "learning_rate": 3.173029269683785e-05, + "loss": 0.0445, + "step": 578 + }, + { + "epoch": 1.1156626506024097, + "grad_norm": 0.28746843338012695, + "learning_rate": 3.169393922046776e-05, + "loss": 0.0116, + "step": 579 + }, + { + "epoch": 1.1175903614457832, + "grad_norm": 0.2952995002269745, + "learning_rate": 3.165752694317713e-05, + "loss": 0.0116, + "step": 580 + }, + { + "epoch": 1.1195180722891567, + "grad_norm": 0.2938575744628906, + "learning_rate": 3.16210560480587e-05, + "loss": 0.013, + "step": 581 + }, + { + "epoch": 1.1214457831325302, + "grad_norm": 0.22283495962619781, + "learning_rate": 3.158452671849998e-05, + "loss": 0.0052, + "step": 582 + }, + { + "epoch": 1.1233734939759037, + "grad_norm": 0.6272858381271362, + "learning_rate": 3.154793913818226e-05, + "loss": 0.0182, + "step": 583 + }, + { + "epoch": 1.1253012048192772, + "grad_norm": 0.479753702878952, + "learning_rate": 3.1511293491079804e-05, + "loss": 0.0146, + "step": 584 + }, + { + "epoch": 1.1272289156626507, + "grad_norm": 0.31104400753974915, + "learning_rate": 3.1474589961458786e-05, + "loss": 0.0139, + "step": 585 + }, + { + "epoch": 1.129156626506024, + "grad_norm": 0.4932832419872284, + "learning_rate": 3.1437828733876477e-05, + "loss": 0.0236, + "step": 586 + }, + { + "epoch": 1.1310843373493975, + "grad_norm": 0.222808837890625, + "learning_rate": 3.140100999318025e-05, + "loss": 0.0084, + "step": 587 + }, + { + "epoch": 1.133012048192771, + "grad_norm": 0.4515356719493866, + "learning_rate": 3.136413392450668e-05, + "loss": 0.0215, + "step": 588 + }, + { + "epoch": 1.1349397590361445, + "grad_norm": 0.39302268624305725, + "learning_rate": 3.132720071328061e-05, + "loss": 0.0154, + "step": 589 + }, + { + "epoch": 1.136867469879518, + "grad_norm": 0.43382835388183594, + "learning_rate": 3.1290210545214205e-05, + "loss": 0.0088, + "step": 590 + }, + { + "epoch": 1.1387951807228915, + "grad_norm": 0.18707136809825897, + "learning_rate": 3.125316360630602e-05, + "loss": 0.0126, + "step": 591 + }, + { + "epoch": 1.140722891566265, + "grad_norm": 0.5688219666481018, + "learning_rate": 3.121606008284011e-05, + "loss": 0.0147, + "step": 592 + }, + { + "epoch": 1.1426506024096386, + "grad_norm": 0.3321833312511444, + "learning_rate": 3.1178900161385005e-05, + "loss": 0.0119, + "step": 593 + }, + { + "epoch": 1.144578313253012, + "grad_norm": 0.3738424777984619, + "learning_rate": 3.114168402879286e-05, + "loss": 0.0158, + "step": 594 + }, + { + "epoch": 1.1465060240963856, + "grad_norm": 0.2386978417634964, + "learning_rate": 3.110441187219846e-05, + "loss": 0.0107, + "step": 595 + }, + { + "epoch": 1.148433734939759, + "grad_norm": 0.2165699452161789, + "learning_rate": 3.10670838790183e-05, + "loss": 0.0079, + "step": 596 + }, + { + "epoch": 1.1503614457831326, + "grad_norm": 0.25952696800231934, + "learning_rate": 3.102970023694965e-05, + "loss": 0.0147, + "step": 597 + }, + { + "epoch": 1.152289156626506, + "grad_norm": 0.21448305249214172, + "learning_rate": 3.099226113396959e-05, + "loss": 0.0099, + "step": 598 + }, + { + "epoch": 1.1542168674698796, + "grad_norm": 0.37226060032844543, + "learning_rate": 3.095476675833405e-05, + "loss": 0.0214, + "step": 599 + }, + { + "epoch": 1.1561445783132531, + "grad_norm": 0.29637983441352844, + "learning_rate": 3.0917217298576955e-05, + "loss": 0.0118, + "step": 600 + }, + { + "epoch": 1.1580722891566264, + "grad_norm": 0.18535609543323517, + "learning_rate": 3.0879612943509154e-05, + "loss": 0.0086, + "step": 601 + }, + { + "epoch": 1.16, + "grad_norm": 0.25874125957489014, + "learning_rate": 3.0841953882217536e-05, + "loss": 0.0088, + "step": 602 + }, + { + "epoch": 1.1619277108433734, + "grad_norm": 0.46092745661735535, + "learning_rate": 3.08042403040641e-05, + "loss": 0.0241, + "step": 603 + }, + { + "epoch": 1.163855421686747, + "grad_norm": 0.27023249864578247, + "learning_rate": 3.076647239868494e-05, + "loss": 0.0154, + "step": 604 + }, + { + "epoch": 1.1657831325301204, + "grad_norm": 0.445157527923584, + "learning_rate": 3.072865035598933e-05, + "loss": 0.0197, + "step": 605 + }, + { + "epoch": 1.167710843373494, + "grad_norm": 0.18097272515296936, + "learning_rate": 3.06907743661588e-05, + "loss": 0.0093, + "step": 606 + }, + { + "epoch": 1.1696385542168675, + "grad_norm": 0.22469942271709442, + "learning_rate": 3.065284461964609e-05, + "loss": 0.0171, + "step": 607 + }, + { + "epoch": 1.171566265060241, + "grad_norm": 0.20190906524658203, + "learning_rate": 3.061486130717428e-05, + "loss": 0.008, + "step": 608 + }, + { + "epoch": 1.1734939759036145, + "grad_norm": 0.18294145166873932, + "learning_rate": 3.057682461973579e-05, + "loss": 0.0155, + "step": 609 + }, + { + "epoch": 1.175421686746988, + "grad_norm": 0.34203943610191345, + "learning_rate": 3.053873474859143e-05, + "loss": 0.0212, + "step": 610 + }, + { + "epoch": 1.1773493975903615, + "grad_norm": 0.49073582887649536, + "learning_rate": 3.050059188526942e-05, + "loss": 0.019, + "step": 611 + }, + { + "epoch": 1.179277108433735, + "grad_norm": 0.3537680506706238, + "learning_rate": 3.046239622156446e-05, + "loss": 0.0147, + "step": 612 + }, + { + "epoch": 1.1812048192771085, + "grad_norm": 0.2584632635116577, + "learning_rate": 3.042414794953674e-05, + "loss": 0.0088, + "step": 613 + }, + { + "epoch": 1.1831325301204818, + "grad_norm": 0.3529360890388489, + "learning_rate": 3.0385847261510975e-05, + "loss": 0.0187, + "step": 614 + }, + { + "epoch": 1.1850602409638555, + "grad_norm": 0.3331570327281952, + "learning_rate": 3.0347494350075465e-05, + "loss": 0.0124, + "step": 615 + }, + { + "epoch": 1.1869879518072288, + "grad_norm": 0.2223527580499649, + "learning_rate": 3.0309089408081074e-05, + "loss": 0.01, + "step": 616 + }, + { + "epoch": 1.1889156626506023, + "grad_norm": 0.21985746920108795, + "learning_rate": 3.027063262864032e-05, + "loss": 0.0087, + "step": 617 + }, + { + "epoch": 1.1908433734939758, + "grad_norm": 0.2989653944969177, + "learning_rate": 3.023212420512637e-05, + "loss": 0.0137, + "step": 618 + }, + { + "epoch": 1.1927710843373494, + "grad_norm": 0.17423275113105774, + "learning_rate": 3.0193564331172074e-05, + "loss": 0.0056, + "step": 619 + }, + { + "epoch": 1.1946987951807229, + "grad_norm": 1.0992127656936646, + "learning_rate": 3.0154953200668976e-05, + "loss": 0.0274, + "step": 620 + }, + { + "epoch": 1.1966265060240964, + "grad_norm": 0.21641989052295685, + "learning_rate": 3.011629100776638e-05, + "loss": 0.0151, + "step": 621 + }, + { + "epoch": 1.1985542168674699, + "grad_norm": 0.4558199644088745, + "learning_rate": 3.007757794687033e-05, + "loss": 0.0424, + "step": 622 + }, + { + "epoch": 1.2004819277108434, + "grad_norm": 0.42380189895629883, + "learning_rate": 3.003881421264266e-05, + "loss": 0.0079, + "step": 623 + }, + { + "epoch": 1.202409638554217, + "grad_norm": 0.28791171312332153, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.0142, + "step": 624 + }, + { + "epoch": 1.2043373493975904, + "grad_norm": 0.3906581997871399, + "learning_rate": 2.996113550411281e-05, + "loss": 0.0251, + "step": 625 + }, + { + "epoch": 1.206265060240964, + "grad_norm": 0.47848746180534363, + "learning_rate": 2.9922220920404375e-05, + "loss": 0.0137, + "step": 626 + }, + { + "epoch": 1.2081927710843374, + "grad_norm": 0.22666941583156586, + "learning_rate": 2.9883256444549862e-05, + "loss": 0.0105, + "step": 627 + }, + { + "epoch": 1.210120481927711, + "grad_norm": 0.18968136608600616, + "learning_rate": 2.984424227247529e-05, + "loss": 0.0089, + "step": 628 + }, + { + "epoch": 1.2120481927710842, + "grad_norm": 0.28732606768608093, + "learning_rate": 2.980517860035656e-05, + "loss": 0.0253, + "step": 629 + }, + { + "epoch": 1.213975903614458, + "grad_norm": 0.21131543815135956, + "learning_rate": 2.9766065624618518e-05, + "loss": 0.0134, + "step": 630 + }, + { + "epoch": 1.2159036144578312, + "grad_norm": 0.7594877481460571, + "learning_rate": 2.972690354193388e-05, + "loss": 0.0157, + "step": 631 + }, + { + "epoch": 1.2178313253012047, + "grad_norm": 0.730291485786438, + "learning_rate": 2.96876925492223e-05, + "loss": 0.0204, + "step": 632 + }, + { + "epoch": 1.2197590361445783, + "grad_norm": 0.20333674550056458, + "learning_rate": 2.9648432843649382e-05, + "loss": 0.0114, + "step": 633 + }, + { + "epoch": 1.2216867469879518, + "grad_norm": 0.5680793523788452, + "learning_rate": 2.960912462262566e-05, + "loss": 0.0146, + "step": 634 + }, + { + "epoch": 1.2236144578313253, + "grad_norm": 0.4591079354286194, + "learning_rate": 2.9569768083805618e-05, + "loss": 0.0112, + "step": 635 + }, + { + "epoch": 1.2255421686746988, + "grad_norm": 0.3793511390686035, + "learning_rate": 2.953036342508671e-05, + "loss": 0.0377, + "step": 636 + }, + { + "epoch": 1.2274698795180723, + "grad_norm": 1.118723750114441, + "learning_rate": 2.9490910844608346e-05, + "loss": 0.0432, + "step": 637 + }, + { + "epoch": 1.2293975903614458, + "grad_norm": 0.36990776658058167, + "learning_rate": 2.9451410540750887e-05, + "loss": 0.0203, + "step": 638 + }, + { + "epoch": 1.2313253012048193, + "grad_norm": 0.930397629737854, + "learning_rate": 2.94118627121347e-05, + "loss": 0.0311, + "step": 639 + }, + { + "epoch": 1.2332530120481928, + "grad_norm": 0.2347625195980072, + "learning_rate": 2.9372267557619075e-05, + "loss": 0.0168, + "step": 640 + }, + { + "epoch": 1.2351807228915663, + "grad_norm": 0.3720332384109497, + "learning_rate": 2.933262527630131e-05, + "loss": 0.0136, + "step": 641 + }, + { + "epoch": 1.2371084337349398, + "grad_norm": 0.4871984124183655, + "learning_rate": 2.929293606751565e-05, + "loss": 0.0339, + "step": 642 + }, + { + "epoch": 1.2390361445783133, + "grad_norm": 0.35853689908981323, + "learning_rate": 2.9253200130832322e-05, + "loss": 0.0095, + "step": 643 + }, + { + "epoch": 1.2409638554216866, + "grad_norm": 0.42003703117370605, + "learning_rate": 2.92134176660565e-05, + "loss": 0.0142, + "step": 644 + }, + { + "epoch": 1.2428915662650604, + "grad_norm": 0.3854500651359558, + "learning_rate": 2.9173588873227338e-05, + "loss": 0.0209, + "step": 645 + }, + { + "epoch": 1.2448192771084337, + "grad_norm": 0.24665917456150055, + "learning_rate": 2.913371395261691e-05, + "loss": 0.0087, + "step": 646 + }, + { + "epoch": 1.2467469879518072, + "grad_norm": 0.41571593284606934, + "learning_rate": 2.9093793104729268e-05, + "loss": 0.0164, + "step": 647 + }, + { + "epoch": 1.2486746987951807, + "grad_norm": 0.4597891569137573, + "learning_rate": 2.9053826530299377e-05, + "loss": 0.0138, + "step": 648 + }, + { + "epoch": 1.2506024096385542, + "grad_norm": 0.43345385789871216, + "learning_rate": 2.901381443029215e-05, + "loss": 0.0353, + "step": 649 + }, + { + "epoch": 1.2525301204819277, + "grad_norm": 0.3706768751144409, + "learning_rate": 2.897375700590141e-05, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 1.2544578313253012, + "grad_norm": 0.30305296182632446, + "learning_rate": 2.8933654458548873e-05, + "loss": 0.0123, + "step": 651 + }, + { + "epoch": 1.2563855421686747, + "grad_norm": 0.2042127549648285, + "learning_rate": 2.8893506989883167e-05, + "loss": 0.0099, + "step": 652 + }, + { + "epoch": 1.2583132530120482, + "grad_norm": 0.20524422824382782, + "learning_rate": 2.8853314801778784e-05, + "loss": 0.0097, + "step": 653 + }, + { + "epoch": 1.2602409638554217, + "grad_norm": 0.2351921945810318, + "learning_rate": 2.8813078096335093e-05, + "loss": 0.0091, + "step": 654 + }, + { + "epoch": 1.2621686746987952, + "grad_norm": 0.34547340869903564, + "learning_rate": 2.87727970758753e-05, + "loss": 0.0088, + "step": 655 + }, + { + "epoch": 1.2640963855421687, + "grad_norm": 0.35163217782974243, + "learning_rate": 2.8732471942945443e-05, + "loss": 0.0145, + "step": 656 + }, + { + "epoch": 1.266024096385542, + "grad_norm": 1.715137243270874, + "learning_rate": 2.8692102900313378e-05, + "loss": 0.0198, + "step": 657 + }, + { + "epoch": 1.2679518072289158, + "grad_norm": 0.2860178053379059, + "learning_rate": 2.8651690150967748e-05, + "loss": 0.0085, + "step": 658 + }, + { + "epoch": 1.269879518072289, + "grad_norm": 0.21175967156887054, + "learning_rate": 2.8611233898116967e-05, + "loss": 0.0071, + "step": 659 + }, + { + "epoch": 1.2718072289156628, + "grad_norm": 0.33726972341537476, + "learning_rate": 2.85707343451882e-05, + "loss": 0.012, + "step": 660 + }, + { + "epoch": 1.273734939759036, + "grad_norm": 0.2138456553220749, + "learning_rate": 2.853019169582635e-05, + "loss": 0.0092, + "step": 661 + }, + { + "epoch": 1.2756626506024096, + "grad_norm": 0.2304934412240982, + "learning_rate": 2.8489606153892997e-05, + "loss": 0.0144, + "step": 662 + }, + { + "epoch": 1.277590361445783, + "grad_norm": 0.2691061794757843, + "learning_rate": 2.8448977923465425e-05, + "loss": 0.0121, + "step": 663 + }, + { + "epoch": 1.2795180722891566, + "grad_norm": 0.35254305601119995, + "learning_rate": 2.840830720883555e-05, + "loss": 0.0125, + "step": 664 + }, + { + "epoch": 1.28144578313253, + "grad_norm": 0.36552608013153076, + "learning_rate": 2.836759421450893e-05, + "loss": 0.021, + "step": 665 + }, + { + "epoch": 1.2833734939759036, + "grad_norm": 0.37177154421806335, + "learning_rate": 2.83268391452037e-05, + "loss": 0.0216, + "step": 666 + }, + { + "epoch": 1.2853012048192771, + "grad_norm": 0.20932547748088837, + "learning_rate": 2.828604220584958e-05, + "loss": 0.0077, + "step": 667 + }, + { + "epoch": 1.2872289156626506, + "grad_norm": 0.5158557295799255, + "learning_rate": 2.824520360158681e-05, + "loss": 0.0394, + "step": 668 + }, + { + "epoch": 1.2891566265060241, + "grad_norm": 0.22623969614505768, + "learning_rate": 2.820432353776515e-05, + "loss": 0.0087, + "step": 669 + }, + { + "epoch": 1.2910843373493976, + "grad_norm": 0.2996046245098114, + "learning_rate": 2.8163402219942822e-05, + "loss": 0.01, + "step": 670 + }, + { + "epoch": 1.2930120481927712, + "grad_norm": 0.24957989156246185, + "learning_rate": 2.8122439853885488e-05, + "loss": 0.0127, + "step": 671 + }, + { + "epoch": 1.2949397590361444, + "grad_norm": 0.2636559307575226, + "learning_rate": 2.8081436645565216e-05, + "loss": 0.0128, + "step": 672 + }, + { + "epoch": 1.2968674698795182, + "grad_norm": 0.3531591296195984, + "learning_rate": 2.804039280115944e-05, + "loss": 0.0199, + "step": 673 + }, + { + "epoch": 1.2987951807228915, + "grad_norm": 0.3682299852371216, + "learning_rate": 2.7999308527049927e-05, + "loss": 0.0088, + "step": 674 + }, + { + "epoch": 1.3007228915662652, + "grad_norm": 0.19555217027664185, + "learning_rate": 2.795818402982174e-05, + "loss": 0.0084, + "step": 675 + }, + { + "epoch": 1.3026506024096385, + "grad_norm": 0.2864912450313568, + "learning_rate": 2.7917019516262186e-05, + "loss": 0.0154, + "step": 676 + }, + { + "epoch": 1.304578313253012, + "grad_norm": 0.2211237996816635, + "learning_rate": 2.78758151933598e-05, + "loss": 0.0078, + "step": 677 + }, + { + "epoch": 1.3065060240963855, + "grad_norm": 0.13646945357322693, + "learning_rate": 2.7834571268303294e-05, + "loss": 0.0058, + "step": 678 + }, + { + "epoch": 1.308433734939759, + "grad_norm": 0.16530285775661469, + "learning_rate": 2.779328794848049e-05, + "loss": 0.007, + "step": 679 + }, + { + "epoch": 1.3103614457831325, + "grad_norm": 0.2145693302154541, + "learning_rate": 2.7751965441477325e-05, + "loss": 0.0203, + "step": 680 + }, + { + "epoch": 1.312289156626506, + "grad_norm": 0.24273739755153656, + "learning_rate": 2.771060395507677e-05, + "loss": 0.0106, + "step": 681 + }, + { + "epoch": 1.3142168674698795, + "grad_norm": 0.20430618524551392, + "learning_rate": 2.7669203697257794e-05, + "loss": 0.0122, + "step": 682 + }, + { + "epoch": 1.316144578313253, + "grad_norm": 0.2502615749835968, + "learning_rate": 2.7627764876194335e-05, + "loss": 0.0101, + "step": 683 + }, + { + "epoch": 1.3180722891566266, + "grad_norm": 0.287239670753479, + "learning_rate": 2.7586287700254214e-05, + "loss": 0.0203, + "step": 684 + }, + { + "epoch": 1.32, + "grad_norm": 0.16239754855632782, + "learning_rate": 2.7544772377998147e-05, + "loss": 0.0084, + "step": 685 + }, + { + "epoch": 1.3219277108433736, + "grad_norm": 0.27174142003059387, + "learning_rate": 2.7503219118178636e-05, + "loss": 0.008, + "step": 686 + }, + { + "epoch": 1.3238554216867469, + "grad_norm": 0.12878240644931793, + "learning_rate": 2.7461628129738954e-05, + "loss": 0.0053, + "step": 687 + }, + { + "epoch": 1.3257831325301206, + "grad_norm": 0.16112515330314636, + "learning_rate": 2.7419999621812086e-05, + "loss": 0.0059, + "step": 688 + }, + { + "epoch": 1.3277108433734939, + "grad_norm": 0.2398834228515625, + "learning_rate": 2.7378333803719672e-05, + "loss": 0.0095, + "step": 689 + }, + { + "epoch": 1.3296385542168676, + "grad_norm": 0.18516193330287933, + "learning_rate": 2.733663088497097e-05, + "loss": 0.0071, + "step": 690 + }, + { + "epoch": 1.331566265060241, + "grad_norm": 0.2974924147129059, + "learning_rate": 2.7294891075261785e-05, + "loss": 0.0227, + "step": 691 + }, + { + "epoch": 1.3334939759036144, + "grad_norm": 0.12931054830551147, + "learning_rate": 2.7253114584473418e-05, + "loss": 0.0039, + "step": 692 + }, + { + "epoch": 1.335421686746988, + "grad_norm": 0.16319474577903748, + "learning_rate": 2.7211301622671623e-05, + "loss": 0.008, + "step": 693 + }, + { + "epoch": 1.3373493975903614, + "grad_norm": 0.27622169256210327, + "learning_rate": 2.7169452400105533e-05, + "loss": 0.0238, + "step": 694 + }, + { + "epoch": 1.339277108433735, + "grad_norm": 0.45309779047966003, + "learning_rate": 2.712756712720663e-05, + "loss": 0.0439, + "step": 695 + }, + { + "epoch": 1.3412048192771084, + "grad_norm": 0.2469855099916458, + "learning_rate": 2.708564601458765e-05, + "loss": 0.0085, + "step": 696 + }, + { + "epoch": 1.343132530120482, + "grad_norm": 0.4245856702327728, + "learning_rate": 2.7043689273041535e-05, + "loss": 0.0097, + "step": 697 + }, + { + "epoch": 1.3450602409638555, + "grad_norm": 0.26796087622642517, + "learning_rate": 2.7001697113540414e-05, + "loss": 0.0119, + "step": 698 + }, + { + "epoch": 1.346987951807229, + "grad_norm": 0.3569283187389374, + "learning_rate": 2.6959669747234482e-05, + "loss": 0.0096, + "step": 699 + }, + { + "epoch": 1.3489156626506025, + "grad_norm": 0.7038524150848389, + "learning_rate": 2.6917607385450973e-05, + "loss": 0.0317, + "step": 700 + }, + { + "epoch": 1.350843373493976, + "grad_norm": 0.23568563163280487, + "learning_rate": 2.687551023969308e-05, + "loss": 0.0112, + "step": 701 + }, + { + "epoch": 1.3527710843373493, + "grad_norm": 0.20338499546051025, + "learning_rate": 2.6833378521638935e-05, + "loss": 0.0092, + "step": 702 + }, + { + "epoch": 1.354698795180723, + "grad_norm": 4.22187614440918, + "learning_rate": 2.679121244314046e-05, + "loss": 0.0314, + "step": 703 + }, + { + "epoch": 1.3566265060240963, + "grad_norm": 0.2542206048965454, + "learning_rate": 2.674901221622239e-05, + "loss": 0.0158, + "step": 704 + }, + { + "epoch": 1.3585542168674698, + "grad_norm": 0.49705010652542114, + "learning_rate": 2.670677805308116e-05, + "loss": 0.0162, + "step": 705 + }, + { + "epoch": 1.3604819277108433, + "grad_norm": 0.17502115666866302, + "learning_rate": 2.666451016608383e-05, + "loss": 0.0074, + "step": 706 + }, + { + "epoch": 1.3624096385542168, + "grad_norm": 0.21738742291927338, + "learning_rate": 2.6622208767767075e-05, + "loss": 0.0135, + "step": 707 + }, + { + "epoch": 1.3643373493975903, + "grad_norm": 0.3309847414493561, + "learning_rate": 2.6579874070836032e-05, + "loss": 0.0107, + "step": 708 + }, + { + "epoch": 1.3662650602409638, + "grad_norm": 0.10706827789545059, + "learning_rate": 2.6537506288163303e-05, + "loss": 0.0043, + "step": 709 + }, + { + "epoch": 1.3681927710843373, + "grad_norm": 0.173640176653862, + "learning_rate": 2.6495105632787835e-05, + "loss": 0.0092, + "step": 710 + }, + { + "epoch": 1.3701204819277109, + "grad_norm": 0.2636397182941437, + "learning_rate": 2.6452672317913893e-05, + "loss": 0.0097, + "step": 711 + }, + { + "epoch": 1.3720481927710844, + "grad_norm": 0.28485360741615295, + "learning_rate": 2.6410206556909943e-05, + "loss": 0.0193, + "step": 712 + }, + { + "epoch": 1.3739759036144579, + "grad_norm": 0.23210027813911438, + "learning_rate": 2.636770856330761e-05, + "loss": 0.0229, + "step": 713 + }, + { + "epoch": 1.3759036144578314, + "grad_norm": 0.13388316333293915, + "learning_rate": 2.6325178550800596e-05, + "loss": 0.004, + "step": 714 + }, + { + "epoch": 1.377831325301205, + "grad_norm": 0.5131422877311707, + "learning_rate": 2.6282616733243603e-05, + "loss": 0.0137, + "step": 715 + }, + { + "epoch": 1.3797590361445784, + "grad_norm": 0.3243267834186554, + "learning_rate": 2.6240023324651258e-05, + "loss": 0.0153, + "step": 716 + }, + { + "epoch": 1.3816867469879517, + "grad_norm": 0.1440611034631729, + "learning_rate": 2.619739853919704e-05, + "loss": 0.0031, + "step": 717 + }, + { + "epoch": 1.3836144578313254, + "grad_norm": 0.30346596240997314, + "learning_rate": 2.6154742591212196e-05, + "loss": 0.0109, + "step": 718 + }, + { + "epoch": 1.3855421686746987, + "grad_norm": 0.19109240174293518, + "learning_rate": 2.611205569518468e-05, + "loss": 0.0094, + "step": 719 + }, + { + "epoch": 1.3874698795180722, + "grad_norm": 0.28636518120765686, + "learning_rate": 2.6069338065758056e-05, + "loss": 0.0123, + "step": 720 + }, + { + "epoch": 1.3893975903614457, + "grad_norm": 0.28083911538124084, + "learning_rate": 2.6026589917730416e-05, + "loss": 0.0104, + "step": 721 + }, + { + "epoch": 1.3913253012048192, + "grad_norm": 0.36553966999053955, + "learning_rate": 2.5983811466053327e-05, + "loss": 0.0143, + "step": 722 + }, + { + "epoch": 1.3932530120481927, + "grad_norm": 0.23317205905914307, + "learning_rate": 2.5941002925830708e-05, + "loss": 0.011, + "step": 723 + }, + { + "epoch": 1.3951807228915662, + "grad_norm": 0.3825171887874603, + "learning_rate": 2.589816451231781e-05, + "loss": 0.0098, + "step": 724 + }, + { + "epoch": 1.3971084337349398, + "grad_norm": 0.19916608929634094, + "learning_rate": 2.585529644092006e-05, + "loss": 0.0094, + "step": 725 + }, + { + "epoch": 1.3990361445783133, + "grad_norm": 0.19990523159503937, + "learning_rate": 2.5812398927192027e-05, + "loss": 0.0128, + "step": 726 + }, + { + "epoch": 1.4009638554216868, + "grad_norm": 0.34662899374961853, + "learning_rate": 2.5769472186836347e-05, + "loss": 0.0091, + "step": 727 + }, + { + "epoch": 1.4028915662650603, + "grad_norm": 0.23481112718582153, + "learning_rate": 2.5726516435702583e-05, + "loss": 0.0154, + "step": 728 + }, + { + "epoch": 1.4048192771084338, + "grad_norm": 0.1846667379140854, + "learning_rate": 2.5683531889786194e-05, + "loss": 0.0088, + "step": 729 + }, + { + "epoch": 1.4067469879518073, + "grad_norm": 0.16717663407325745, + "learning_rate": 2.564051876522742e-05, + "loss": 0.0083, + "step": 730 + }, + { + "epoch": 1.4086746987951808, + "grad_norm": 0.4116475284099579, + "learning_rate": 2.5597477278310202e-05, + "loss": 0.0179, + "step": 731 + }, + { + "epoch": 1.410602409638554, + "grad_norm": 0.171807661652565, + "learning_rate": 2.5554407645461115e-05, + "loss": 0.0063, + "step": 732 + }, + { + "epoch": 1.4125301204819278, + "grad_norm": 0.1954439878463745, + "learning_rate": 2.5511310083248243e-05, + "loss": 0.017, + "step": 733 + }, + { + "epoch": 1.4144578313253011, + "grad_norm": 0.37158989906311035, + "learning_rate": 2.5468184808380104e-05, + "loss": 0.0173, + "step": 734 + }, + { + "epoch": 1.4163855421686746, + "grad_norm": 0.2001633644104004, + "learning_rate": 2.542503203770458e-05, + "loss": 0.0165, + "step": 735 + }, + { + "epoch": 1.4183132530120481, + "grad_norm": 0.45673373341560364, + "learning_rate": 2.53818519882078e-05, + "loss": 0.0185, + "step": 736 + }, + { + "epoch": 1.4202409638554216, + "grad_norm": 0.3838701546192169, + "learning_rate": 2.5338644877013067e-05, + "loss": 0.0134, + "step": 737 + }, + { + "epoch": 1.4221686746987952, + "grad_norm": 0.32032477855682373, + "learning_rate": 2.5295410921379745e-05, + "loss": 0.0143, + "step": 738 + }, + { + "epoch": 1.4240963855421687, + "grad_norm": 0.4594039022922516, + "learning_rate": 2.52521503387022e-05, + "loss": 0.0193, + "step": 739 + }, + { + "epoch": 1.4260240963855422, + "grad_norm": 0.3889620900154114, + "learning_rate": 2.5208863346508667e-05, + "loss": 0.0114, + "step": 740 + }, + { + "epoch": 1.4279518072289157, + "grad_norm": 0.33153319358825684, + "learning_rate": 2.5165550162460203e-05, + "loss": 0.0102, + "step": 741 + }, + { + "epoch": 1.4298795180722892, + "grad_norm": 0.7269518375396729, + "learning_rate": 2.5122211004349536e-05, + "loss": 0.0215, + "step": 742 + }, + { + "epoch": 1.4318072289156627, + "grad_norm": 0.31653261184692383, + "learning_rate": 2.5078846090100023e-05, + "loss": 0.0115, + "step": 743 + }, + { + "epoch": 1.4337349397590362, + "grad_norm": 0.20620353519916534, + "learning_rate": 2.5035455637764518e-05, + "loss": 0.0153, + "step": 744 + }, + { + "epoch": 1.4356626506024097, + "grad_norm": 0.17266008257865906, + "learning_rate": 2.4992039865524297e-05, + "loss": 0.0069, + "step": 745 + }, + { + "epoch": 1.4375903614457832, + "grad_norm": 0.24760811030864716, + "learning_rate": 2.494859899168795e-05, + "loss": 0.0108, + "step": 746 + }, + { + "epoch": 1.4395180722891565, + "grad_norm": 0.2584865391254425, + "learning_rate": 2.4905133234690282e-05, + "loss": 0.0095, + "step": 747 + }, + { + "epoch": 1.4414457831325302, + "grad_norm": 0.48847514390945435, + "learning_rate": 2.486164281309122e-05, + "loss": 0.0181, + "step": 748 + }, + { + "epoch": 1.4433734939759035, + "grad_norm": 0.42942047119140625, + "learning_rate": 2.4818127945574717e-05, + "loss": 0.025, + "step": 749 + }, + { + "epoch": 1.445301204819277, + "grad_norm": 0.23713800311088562, + "learning_rate": 2.4774588850947648e-05, + "loss": 0.0085, + "step": 750 + }, + { + "epoch": 1.4472289156626506, + "grad_norm": 0.8797569870948792, + "learning_rate": 2.473102574813871e-05, + "loss": 0.0097, + "step": 751 + }, + { + "epoch": 1.449156626506024, + "grad_norm": 0.2744862735271454, + "learning_rate": 2.4687438856197302e-05, + "loss": 0.0122, + "step": 752 + }, + { + "epoch": 1.4510843373493976, + "grad_norm": 0.12747010588645935, + "learning_rate": 2.4643828394292478e-05, + "loss": 0.0056, + "step": 753 + }, + { + "epoch": 1.453012048192771, + "grad_norm": 0.37376829981803894, + "learning_rate": 2.4600194581711775e-05, + "loss": 0.0052, + "step": 754 + }, + { + "epoch": 1.4549397590361446, + "grad_norm": 0.2536911368370056, + "learning_rate": 2.4556537637860176e-05, + "loss": 0.0113, + "step": 755 + }, + { + "epoch": 1.456867469879518, + "grad_norm": 0.25950780510902405, + "learning_rate": 2.451285778225894e-05, + "loss": 0.0099, + "step": 756 + }, + { + "epoch": 1.4587951807228916, + "grad_norm": 0.19535955786705017, + "learning_rate": 2.4469155234544565e-05, + "loss": 0.0069, + "step": 757 + }, + { + "epoch": 1.4607228915662651, + "grad_norm": 0.22816115617752075, + "learning_rate": 2.442543021446764e-05, + "loss": 0.0088, + "step": 758 + }, + { + "epoch": 1.4626506024096386, + "grad_norm": 0.3363986313343048, + "learning_rate": 2.4381682941891755e-05, + "loss": 0.0182, + "step": 759 + }, + { + "epoch": 1.464578313253012, + "grad_norm": 0.21492891013622284, + "learning_rate": 2.4337913636792382e-05, + "loss": 0.0069, + "step": 760 + }, + { + "epoch": 1.4665060240963856, + "grad_norm": 0.6070862412452698, + "learning_rate": 2.429412251925579e-05, + "loss": 0.0406, + "step": 761 + }, + { + "epoch": 1.468433734939759, + "grad_norm": 2.6469690799713135, + "learning_rate": 2.425030980947793e-05, + "loss": 0.0205, + "step": 762 + }, + { + "epoch": 1.4703614457831327, + "grad_norm": 0.30909740924835205, + "learning_rate": 2.420647572776332e-05, + "loss": 0.0136, + "step": 763 + }, + { + "epoch": 1.472289156626506, + "grad_norm": 0.6639553904533386, + "learning_rate": 2.416262049452395e-05, + "loss": 0.011, + "step": 764 + }, + { + "epoch": 1.4742168674698795, + "grad_norm": 0.2919616997241974, + "learning_rate": 2.4118744330278147e-05, + "loss": 0.0131, + "step": 765 + }, + { + "epoch": 1.476144578313253, + "grad_norm": 0.5232429504394531, + "learning_rate": 2.4074847455649523e-05, + "loss": 0.0138, + "step": 766 + }, + { + "epoch": 1.4780722891566265, + "grad_norm": 5.630630970001221, + "learning_rate": 2.403093009136579e-05, + "loss": 0.0264, + "step": 767 + }, + { + "epoch": 1.48, + "grad_norm": 0.33234721422195435, + "learning_rate": 2.3986992458257707e-05, + "loss": 0.0111, + "step": 768 + }, + { + "epoch": 1.4819277108433735, + "grad_norm": 0.28444772958755493, + "learning_rate": 2.3943034777257945e-05, + "loss": 0.0144, + "step": 769 + }, + { + "epoch": 1.483855421686747, + "grad_norm": 0.16229979693889618, + "learning_rate": 2.38990572694e-05, + "loss": 0.0062, + "step": 770 + }, + { + "epoch": 1.4857831325301205, + "grad_norm": 0.27474716305732727, + "learning_rate": 2.385506015581704e-05, + "loss": 0.0172, + "step": 771 + }, + { + "epoch": 1.487710843373494, + "grad_norm": 0.246526300907135, + "learning_rate": 2.381104365774083e-05, + "loss": 0.012, + "step": 772 + }, + { + "epoch": 1.4896385542168675, + "grad_norm": 0.282047837972641, + "learning_rate": 2.37670079965006e-05, + "loss": 0.0116, + "step": 773 + }, + { + "epoch": 1.491566265060241, + "grad_norm": 0.2878139317035675, + "learning_rate": 2.3722953393521944e-05, + "loss": 0.0147, + "step": 774 + }, + { + "epoch": 1.4934939759036143, + "grad_norm": 0.5586277842521667, + "learning_rate": 2.367888007032571e-05, + "loss": 0.0111, + "step": 775 + }, + { + "epoch": 1.495421686746988, + "grad_norm": 0.562160313129425, + "learning_rate": 2.3634788248526846e-05, + "loss": 0.0061, + "step": 776 + }, + { + "epoch": 1.4973493975903613, + "grad_norm": 0.3452005982398987, + "learning_rate": 2.3590678149833356e-05, + "loss": 0.0205, + "step": 777 + }, + { + "epoch": 1.499277108433735, + "grad_norm": 0.7757686376571655, + "learning_rate": 2.3546549996045114e-05, + "loss": 0.0273, + "step": 778 + }, + { + "epoch": 1.5012048192771084, + "grad_norm": 0.19530551135540009, + "learning_rate": 2.3502404009052812e-05, + "loss": 0.0083, + "step": 779 + }, + { + "epoch": 1.503132530120482, + "grad_norm": 0.2586531639099121, + "learning_rate": 2.3458240410836775e-05, + "loss": 0.0122, + "step": 780 + }, + { + "epoch": 1.5050602409638554, + "grad_norm": 0.30063286423683167, + "learning_rate": 2.3414059423465924e-05, + "loss": 0.0083, + "step": 781 + }, + { + "epoch": 1.5069879518072289, + "grad_norm": 0.18663185834884644, + "learning_rate": 2.3369861269096575e-05, + "loss": 0.0104, + "step": 782 + }, + { + "epoch": 1.5089156626506024, + "grad_norm": 0.4405941069126129, + "learning_rate": 2.3325646169971416e-05, + "loss": 0.0264, + "step": 783 + }, + { + "epoch": 1.510843373493976, + "grad_norm": 0.2947913110256195, + "learning_rate": 2.3281414348418294e-05, + "loss": 0.0107, + "step": 784 + }, + { + "epoch": 1.5127710843373494, + "grad_norm": 0.23813778162002563, + "learning_rate": 2.3237166026849158e-05, + "loss": 0.0084, + "step": 785 + }, + { + "epoch": 1.514698795180723, + "grad_norm": 0.33380329608917236, + "learning_rate": 2.3192901427758932e-05, + "loss": 0.0111, + "step": 786 + }, + { + "epoch": 1.5166265060240964, + "grad_norm": 0.3736988306045532, + "learning_rate": 2.314862077372438e-05, + "loss": 0.0135, + "step": 787 + }, + { + "epoch": 1.5185542168674697, + "grad_norm": 0.3785395920276642, + "learning_rate": 2.3104324287402996e-05, + "loss": 0.0265, + "step": 788 + }, + { + "epoch": 1.5204819277108435, + "grad_norm": 0.3359154462814331, + "learning_rate": 2.3060012191531885e-05, + "loss": 0.0127, + "step": 789 + }, + { + "epoch": 1.5224096385542167, + "grad_norm": 0.720753014087677, + "learning_rate": 2.301568470892664e-05, + "loss": 0.0134, + "step": 790 + }, + { + "epoch": 1.5243373493975905, + "grad_norm": 0.36473193764686584, + "learning_rate": 2.297134206248024e-05, + "loss": 0.0318, + "step": 791 + }, + { + "epoch": 1.5262650602409638, + "grad_norm": 0.29987087845802307, + "learning_rate": 2.2926984475161884e-05, + "loss": 0.008, + "step": 792 + }, + { + "epoch": 1.5281927710843375, + "grad_norm": 0.2883112132549286, + "learning_rate": 2.2882612170015914e-05, + "loss": 0.0125, + "step": 793 + }, + { + "epoch": 1.5301204819277108, + "grad_norm": 0.28983229398727417, + "learning_rate": 2.2838225370160682e-05, + "loss": 0.0155, + "step": 794 + }, + { + "epoch": 1.5320481927710843, + "grad_norm": 0.47236886620521545, + "learning_rate": 2.2793824298787414e-05, + "loss": 0.0132, + "step": 795 + }, + { + "epoch": 1.5339759036144578, + "grad_norm": 0.8328865170478821, + "learning_rate": 2.2749409179159104e-05, + "loss": 0.026, + "step": 796 + }, + { + "epoch": 1.5359036144578313, + "grad_norm": 0.3129172623157501, + "learning_rate": 2.2704980234609396e-05, + "loss": 0.0099, + "step": 797 + }, + { + "epoch": 1.5378313253012048, + "grad_norm": 0.22284500300884247, + "learning_rate": 2.2660537688541416e-05, + "loss": 0.009, + "step": 798 + }, + { + "epoch": 1.5397590361445783, + "grad_norm": 0.3346405625343323, + "learning_rate": 2.2616081764426726e-05, + "loss": 0.0077, + "step": 799 + }, + { + "epoch": 1.5416867469879518, + "grad_norm": 0.2923565208911896, + "learning_rate": 2.2571612685804124e-05, + "loss": 0.0119, + "step": 800 + }, + { + "epoch": 1.5436144578313253, + "grad_norm": 0.1921311914920807, + "learning_rate": 2.252713067627857e-05, + "loss": 0.0083, + "step": 801 + }, + { + "epoch": 1.5455421686746988, + "grad_norm": 0.23221106827259064, + "learning_rate": 2.2482635959520044e-05, + "loss": 0.0049, + "step": 802 + }, + { + "epoch": 1.5474698795180721, + "grad_norm": 0.6340724229812622, + "learning_rate": 2.243812875926241e-05, + "loss": 0.0273, + "step": 803 + }, + { + "epoch": 1.5493975903614459, + "grad_norm": 0.2699439823627472, + "learning_rate": 2.2393609299302314e-05, + "loss": 0.0108, + "step": 804 + }, + { + "epoch": 1.5513253012048192, + "grad_norm": 0.2005189210176468, + "learning_rate": 2.2349077803498052e-05, + "loss": 0.0076, + "step": 805 + }, + { + "epoch": 1.5532530120481929, + "grad_norm": 0.39668548107147217, + "learning_rate": 2.230453449576842e-05, + "loss": 0.0135, + "step": 806 + }, + { + "epoch": 1.5551807228915662, + "grad_norm": 0.2406950294971466, + "learning_rate": 2.2259979600091635e-05, + "loss": 0.0094, + "step": 807 + }, + { + "epoch": 1.55710843373494, + "grad_norm": 0.30363157391548157, + "learning_rate": 2.2215413340504158e-05, + "loss": 0.0178, + "step": 808 + }, + { + "epoch": 1.5590361445783132, + "grad_norm": 0.19508181512355804, + "learning_rate": 2.2170835941099605e-05, + "loss": 0.0069, + "step": 809 + }, + { + "epoch": 1.5609638554216867, + "grad_norm": 0.734106719493866, + "learning_rate": 2.2126247626027615e-05, + "loss": 0.0319, + "step": 810 + }, + { + "epoch": 1.5628915662650602, + "grad_norm": 0.2591583728790283, + "learning_rate": 2.208164861949268e-05, + "loss": 0.0168, + "step": 811 + }, + { + "epoch": 1.5648192771084337, + "grad_norm": 0.2386734038591385, + "learning_rate": 2.20370391457531e-05, + "loss": 0.0041, + "step": 812 + }, + { + "epoch": 1.5667469879518072, + "grad_norm": 0.1675218939781189, + "learning_rate": 2.1992419429119764e-05, + "loss": 0.0078, + "step": 813 + }, + { + "epoch": 1.5686746987951807, + "grad_norm": 0.45591506361961365, + "learning_rate": 2.1947789693955097e-05, + "loss": 0.0166, + "step": 814 + }, + { + "epoch": 1.5706024096385542, + "grad_norm": 0.46940621733665466, + "learning_rate": 2.190315016467188e-05, + "loss": 0.0176, + "step": 815 + }, + { + "epoch": 1.5725301204819278, + "grad_norm": 0.2294205278158188, + "learning_rate": 2.1858501065732146e-05, + "loss": 0.0102, + "step": 816 + }, + { + "epoch": 1.5744578313253013, + "grad_norm": 0.28922322392463684, + "learning_rate": 2.181384262164606e-05, + "loss": 0.0111, + "step": 817 + }, + { + "epoch": 1.5763855421686745, + "grad_norm": 0.19650064408779144, + "learning_rate": 2.1769175056970765e-05, + "loss": 0.0076, + "step": 818 + }, + { + "epoch": 1.5783132530120483, + "grad_norm": 0.19538825750350952, + "learning_rate": 2.172449859630927e-05, + "loss": 0.0118, + "step": 819 + }, + { + "epoch": 1.5802409638554216, + "grad_norm": 0.1900389939546585, + "learning_rate": 2.167981346430931e-05, + "loss": 0.0066, + "step": 820 + }, + { + "epoch": 1.5821686746987953, + "grad_norm": 0.21593710780143738, + "learning_rate": 2.1635119885662235e-05, + "loss": 0.0101, + "step": 821 + }, + { + "epoch": 1.5840963855421686, + "grad_norm": 0.2699289321899414, + "learning_rate": 2.159041808510185e-05, + "loss": 0.0118, + "step": 822 + }, + { + "epoch": 1.5860240963855423, + "grad_norm": 0.31867673993110657, + "learning_rate": 2.1545708287403322e-05, + "loss": 0.0122, + "step": 823 + }, + { + "epoch": 1.5879518072289156, + "grad_norm": 0.2862400412559509, + "learning_rate": 2.1500990717382004e-05, + "loss": 0.0216, + "step": 824 + }, + { + "epoch": 1.589879518072289, + "grad_norm": 0.28482481837272644, + "learning_rate": 2.145626559989237e-05, + "loss": 0.0136, + "step": 825 + }, + { + "epoch": 1.5918072289156626, + "grad_norm": 0.2866958975791931, + "learning_rate": 2.1411533159826803e-05, + "loss": 0.0298, + "step": 826 + }, + { + "epoch": 1.5937349397590361, + "grad_norm": 0.39092838764190674, + "learning_rate": 2.1366793622114533e-05, + "loss": 0.0382, + "step": 827 + }, + { + "epoch": 1.5956626506024096, + "grad_norm": 0.16381537914276123, + "learning_rate": 2.1322047211720468e-05, + "loss": 0.0074, + "step": 828 + }, + { + "epoch": 1.5975903614457831, + "grad_norm": 0.22146940231323242, + "learning_rate": 2.1277294153644083e-05, + "loss": 0.0103, + "step": 829 + }, + { + "epoch": 1.5995180722891567, + "grad_norm": 0.2155209183692932, + "learning_rate": 2.123253467291827e-05, + "loss": 0.0095, + "step": 830 + }, + { + "epoch": 1.6014457831325302, + "grad_norm": 0.41510409116744995, + "learning_rate": 2.118776899460822e-05, + "loss": 0.0457, + "step": 831 + }, + { + "epoch": 1.6033734939759037, + "grad_norm": 0.19718150794506073, + "learning_rate": 2.1142997343810293e-05, + "loss": 0.0192, + "step": 832 + }, + { + "epoch": 1.605301204819277, + "grad_norm": 0.40924403071403503, + "learning_rate": 2.1098219945650865e-05, + "loss": 0.0278, + "step": 833 + }, + { + "epoch": 1.6072289156626507, + "grad_norm": 0.18657824397087097, + "learning_rate": 2.105343702528524e-05, + "loss": 0.0076, + "step": 834 + }, + { + "epoch": 1.609156626506024, + "grad_norm": 0.1727641075849533, + "learning_rate": 2.100864880789645e-05, + "loss": 0.0076, + "step": 835 + }, + { + "epoch": 1.6110843373493977, + "grad_norm": 0.18138745427131653, + "learning_rate": 2.0963855518694203e-05, + "loss": 0.005, + "step": 836 + }, + { + "epoch": 1.613012048192771, + "grad_norm": 0.19173955917358398, + "learning_rate": 2.0919057382913675e-05, + "loss": 0.0084, + "step": 837 + }, + { + "epoch": 1.6149397590361447, + "grad_norm": 0.3812403380870819, + "learning_rate": 2.0874254625814435e-05, + "loss": 0.009, + "step": 838 + }, + { + "epoch": 1.616867469879518, + "grad_norm": 0.2009759545326233, + "learning_rate": 2.0829447472679285e-05, + "loss": 0.0098, + "step": 839 + }, + { + "epoch": 1.6187951807228915, + "grad_norm": 0.48703446984291077, + "learning_rate": 2.0784636148813124e-05, + "loss": 0.0099, + "step": 840 + }, + { + "epoch": 1.620722891566265, + "grad_norm": 0.28995075821876526, + "learning_rate": 2.0739820879541827e-05, + "loss": 0.0075, + "step": 841 + }, + { + "epoch": 1.6226506024096385, + "grad_norm": 0.2130059450864792, + "learning_rate": 2.069500189021111e-05, + "loss": 0.007, + "step": 842 + }, + { + "epoch": 1.624578313253012, + "grad_norm": 0.252524733543396, + "learning_rate": 2.0650179406185397e-05, + "loss": 0.0249, + "step": 843 + }, + { + "epoch": 1.6265060240963856, + "grad_norm": 0.23069098591804504, + "learning_rate": 2.060535365284668e-05, + "loss": 0.0084, + "step": 844 + }, + { + "epoch": 1.628433734939759, + "grad_norm": 0.25051403045654297, + "learning_rate": 2.056052485559338e-05, + "loss": 0.0071, + "step": 845 + }, + { + "epoch": 1.6303614457831326, + "grad_norm": 0.27664798498153687, + "learning_rate": 2.051569323983924e-05, + "loss": 0.0198, + "step": 846 + }, + { + "epoch": 1.632289156626506, + "grad_norm": 0.2954922318458557, + "learning_rate": 2.047085903101218e-05, + "loss": 0.006, + "step": 847 + }, + { + "epoch": 1.6342168674698794, + "grad_norm": 0.28477591276168823, + "learning_rate": 2.0426022454553137e-05, + "loss": 0.0147, + "step": 848 + }, + { + "epoch": 1.636144578313253, + "grad_norm": 0.2785305678844452, + "learning_rate": 2.0381183735914968e-05, + "loss": 0.0117, + "step": 849 + }, + { + "epoch": 1.6380722891566264, + "grad_norm": 0.2500309348106384, + "learning_rate": 2.0336343100561295e-05, + "loss": 0.008, + "step": 850 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.18932047486305237, + "learning_rate": 2.0291500773965392e-05, + "loss": 0.0256, + "step": 851 + }, + { + "epoch": 1.6419277108433734, + "grad_norm": 0.6396257877349854, + "learning_rate": 2.0246656981609013e-05, + "loss": 0.0141, + "step": 852 + }, + { + "epoch": 1.6438554216867471, + "grad_norm": 0.5072891116142273, + "learning_rate": 2.02018119489813e-05, + "loss": 0.008, + "step": 853 + }, + { + "epoch": 1.6457831325301204, + "grad_norm": 0.2920839488506317, + "learning_rate": 2.0156965901577635e-05, + "loss": 0.0085, + "step": 854 + }, + { + "epoch": 1.647710843373494, + "grad_norm": 0.1391262263059616, + "learning_rate": 2.011211906489848e-05, + "loss": 0.0078, + "step": 855 + }, + { + "epoch": 1.6496385542168674, + "grad_norm": 0.29620468616485596, + "learning_rate": 2.00672716644483e-05, + "loss": 0.0109, + "step": 856 + }, + { + "epoch": 1.651566265060241, + "grad_norm": 0.13946573436260223, + "learning_rate": 2.002242392573436e-05, + "loss": 0.0076, + "step": 857 + }, + { + "epoch": 1.6534939759036145, + "grad_norm": 0.9766128659248352, + "learning_rate": 1.997757607426565e-05, + "loss": 0.0309, + "step": 858 + }, + { + "epoch": 1.655421686746988, + "grad_norm": 0.18002203106880188, + "learning_rate": 1.9932728335551702e-05, + "loss": 0.0072, + "step": 859 + }, + { + "epoch": 1.6573493975903615, + "grad_norm": 0.28073111176490784, + "learning_rate": 1.988788093510152e-05, + "loss": 0.0246, + "step": 860 + }, + { + "epoch": 1.659277108433735, + "grad_norm": 0.1919957399368286, + "learning_rate": 1.9843034098422375e-05, + "loss": 0.0087, + "step": 861 + }, + { + "epoch": 1.6612048192771085, + "grad_norm": 0.1825258433818817, + "learning_rate": 1.9798188051018705e-05, + "loss": 0.0092, + "step": 862 + }, + { + "epoch": 1.6631325301204818, + "grad_norm": 0.32412952184677124, + "learning_rate": 1.9753343018390997e-05, + "loss": 0.0118, + "step": 863 + }, + { + "epoch": 1.6650602409638555, + "grad_norm": 0.12828563153743744, + "learning_rate": 1.9708499226034618e-05, + "loss": 0.0056, + "step": 864 + }, + { + "epoch": 1.6669879518072288, + "grad_norm": 0.18647560477256775, + "learning_rate": 1.966365689943871e-05, + "loss": 0.0094, + "step": 865 + }, + { + "epoch": 1.6689156626506025, + "grad_norm": 0.19835828244686127, + "learning_rate": 1.9618816264085042e-05, + "loss": 0.0097, + "step": 866 + }, + { + "epoch": 1.6708433734939758, + "grad_norm": 0.22364282608032227, + "learning_rate": 1.957397754544687e-05, + "loss": 0.0062, + "step": 867 + }, + { + "epoch": 1.6727710843373496, + "grad_norm": 0.29420018196105957, + "learning_rate": 1.952914096898783e-05, + "loss": 0.0182, + "step": 868 + }, + { + "epoch": 1.6746987951807228, + "grad_norm": 0.2149929702281952, + "learning_rate": 1.9484306760160766e-05, + "loss": 0.0125, + "step": 869 + }, + { + "epoch": 1.6766265060240964, + "grad_norm": 0.16844330728054047, + "learning_rate": 1.9439475144406623e-05, + "loss": 0.0074, + "step": 870 + }, + { + "epoch": 1.6785542168674699, + "grad_norm": 0.5010282397270203, + "learning_rate": 1.9394646347153334e-05, + "loss": 0.0213, + "step": 871 + }, + { + "epoch": 1.6804819277108434, + "grad_norm": 0.29847195744514465, + "learning_rate": 1.9349820593814606e-05, + "loss": 0.0173, + "step": 872 + }, + { + "epoch": 1.6824096385542169, + "grad_norm": 0.23835812509059906, + "learning_rate": 1.930499810978889e-05, + "loss": 0.011, + "step": 873 + }, + { + "epoch": 1.6843373493975904, + "grad_norm": 0.3269020617008209, + "learning_rate": 1.9260179120458177e-05, + "loss": 0.0285, + "step": 874 + }, + { + "epoch": 1.686265060240964, + "grad_norm": 0.2142144739627838, + "learning_rate": 1.9215363851186883e-05, + "loss": 0.0146, + "step": 875 + }, + { + "epoch": 1.6881927710843372, + "grad_norm": 0.3098377585411072, + "learning_rate": 1.9170552527320725e-05, + "loss": 0.0104, + "step": 876 + }, + { + "epoch": 1.690120481927711, + "grad_norm": 0.22504115104675293, + "learning_rate": 1.9125745374185568e-05, + "loss": 0.0091, + "step": 877 + }, + { + "epoch": 1.6920481927710842, + "grad_norm": 0.20633333921432495, + "learning_rate": 1.908094261708633e-05, + "loss": 0.0097, + "step": 878 + }, + { + "epoch": 1.693975903614458, + "grad_norm": 1.179566502571106, + "learning_rate": 1.9036144481305807e-05, + "loss": 0.0143, + "step": 879 + }, + { + "epoch": 1.6959036144578312, + "grad_norm": 0.15525613725185394, + "learning_rate": 1.8991351192103554e-05, + "loss": 0.0062, + "step": 880 + }, + { + "epoch": 1.697831325301205, + "grad_norm": 0.15966367721557617, + "learning_rate": 1.8946562974714763e-05, + "loss": 0.0048, + "step": 881 + }, + { + "epoch": 1.6997590361445782, + "grad_norm": 0.18902607262134552, + "learning_rate": 1.890178005434914e-05, + "loss": 0.0124, + "step": 882 + }, + { + "epoch": 1.701686746987952, + "grad_norm": 0.21692413091659546, + "learning_rate": 1.885700265618971e-05, + "loss": 0.0135, + "step": 883 + }, + { + "epoch": 1.7036144578313253, + "grad_norm": 0.38948455452919006, + "learning_rate": 1.8812231005391786e-05, + "loss": 0.0365, + "step": 884 + }, + { + "epoch": 1.7055421686746988, + "grad_norm": 0.2483491599559784, + "learning_rate": 1.8767465327081736e-05, + "loss": 0.0202, + "step": 885 + }, + { + "epoch": 1.7074698795180723, + "grad_norm": 0.15305832028388977, + "learning_rate": 1.872270584635592e-05, + "loss": 0.0035, + "step": 886 + }, + { + "epoch": 1.7093975903614458, + "grad_norm": 0.17794466018676758, + "learning_rate": 1.867795278827954e-05, + "loss": 0.0157, + "step": 887 + }, + { + "epoch": 1.7113253012048193, + "grad_norm": 0.1938813328742981, + "learning_rate": 1.863320637788547e-05, + "loss": 0.0071, + "step": 888 + }, + { + "epoch": 1.7132530120481928, + "grad_norm": 0.27061617374420166, + "learning_rate": 1.8588466840173207e-05, + "loss": 0.0347, + "step": 889 + }, + { + "epoch": 1.7151807228915663, + "grad_norm": 0.1541014313697815, + "learning_rate": 1.8543734400107637e-05, + "loss": 0.006, + "step": 890 + }, + { + "epoch": 1.7171084337349396, + "grad_norm": 0.1436876654624939, + "learning_rate": 1.8499009282617996e-05, + "loss": 0.0059, + "step": 891 + }, + { + "epoch": 1.7190361445783133, + "grad_norm": 1.0573723316192627, + "learning_rate": 1.8454291712596688e-05, + "loss": 0.008, + "step": 892 + }, + { + "epoch": 1.7209638554216866, + "grad_norm": 0.15406259894371033, + "learning_rate": 1.8409581914898157e-05, + "loss": 0.0061, + "step": 893 + }, + { + "epoch": 1.7228915662650603, + "grad_norm": 0.24822913110256195, + "learning_rate": 1.836488011433777e-05, + "loss": 0.0085, + "step": 894 + }, + { + "epoch": 1.7248192771084336, + "grad_norm": 0.21049316227436066, + "learning_rate": 1.83201865356907e-05, + "loss": 0.0075, + "step": 895 + }, + { + "epoch": 1.7267469879518074, + "grad_norm": 0.24159866571426392, + "learning_rate": 1.8275501403690733e-05, + "loss": 0.0156, + "step": 896 + }, + { + "epoch": 1.7286746987951807, + "grad_norm": 0.3191063106060028, + "learning_rate": 1.823082494302924e-05, + "loss": 0.0218, + "step": 897 + }, + { + "epoch": 1.7306024096385542, + "grad_norm": 0.20296362042427063, + "learning_rate": 1.8186157378353945e-05, + "loss": 0.0126, + "step": 898 + }, + { + "epoch": 1.7325301204819277, + "grad_norm": 0.1905524581670761, + "learning_rate": 1.8141498934267858e-05, + "loss": 0.0131, + "step": 899 + }, + { + "epoch": 1.7344578313253012, + "grad_norm": 0.5350520610809326, + "learning_rate": 1.809684983532813e-05, + "loss": 0.0115, + "step": 900 + }, + { + "epoch": 1.7363855421686747, + "grad_norm": 0.17144092917442322, + "learning_rate": 1.8052210306044907e-05, + "loss": 0.0113, + "step": 901 + }, + { + "epoch": 1.7383132530120482, + "grad_norm": 0.11777982115745544, + "learning_rate": 1.8007580570880236e-05, + "loss": 0.0058, + "step": 902 + }, + { + "epoch": 1.7402409638554217, + "grad_norm": 0.2078275978565216, + "learning_rate": 1.7962960854246908e-05, + "loss": 0.0106, + "step": 903 + }, + { + "epoch": 1.7421686746987952, + "grad_norm": 0.2550877630710602, + "learning_rate": 1.791835138050732e-05, + "loss": 0.0076, + "step": 904 + }, + { + "epoch": 1.7440963855421687, + "grad_norm": 0.11553912609815598, + "learning_rate": 1.7873752373972395e-05, + "loss": 0.0038, + "step": 905 + }, + { + "epoch": 1.746024096385542, + "grad_norm": 0.10724586248397827, + "learning_rate": 1.7829164058900398e-05, + "loss": 0.0043, + "step": 906 + }, + { + "epoch": 1.7479518072289157, + "grad_norm": 0.30152231454849243, + "learning_rate": 1.7784586659495845e-05, + "loss": 0.0099, + "step": 907 + }, + { + "epoch": 1.749879518072289, + "grad_norm": 0.18372933566570282, + "learning_rate": 1.7740020399908372e-05, + "loss": 0.0074, + "step": 908 + }, + { + "epoch": 1.7518072289156628, + "grad_norm": 0.35184428095817566, + "learning_rate": 1.7695465504231586e-05, + "loss": 0.0184, + "step": 909 + }, + { + "epoch": 1.753734939759036, + "grad_norm": 0.15083615481853485, + "learning_rate": 1.765092219650196e-05, + "loss": 0.0061, + "step": 910 + }, + { + "epoch": 1.7556626506024098, + "grad_norm": 0.2599961459636688, + "learning_rate": 1.7606390700697693e-05, + "loss": 0.0101, + "step": 911 + }, + { + "epoch": 1.757590361445783, + "grad_norm": 0.10829206556081772, + "learning_rate": 1.7561871240737595e-05, + "loss": 0.0034, + "step": 912 + }, + { + "epoch": 1.7595180722891566, + "grad_norm": 0.38098782300949097, + "learning_rate": 1.7517364040479966e-05, + "loss": 0.0384, + "step": 913 + }, + { + "epoch": 1.76144578313253, + "grad_norm": 0.14975085854530334, + "learning_rate": 1.7472869323721432e-05, + "loss": 0.0055, + "step": 914 + }, + { + "epoch": 1.7633734939759036, + "grad_norm": 0.4151444733142853, + "learning_rate": 1.742838731419588e-05, + "loss": 0.0307, + "step": 915 + }, + { + "epoch": 1.765301204819277, + "grad_norm": 0.22238481044769287, + "learning_rate": 1.738391823557328e-05, + "loss": 0.0059, + "step": 916 + }, + { + "epoch": 1.7672289156626506, + "grad_norm": 0.23386356234550476, + "learning_rate": 1.7339462311458587e-05, + "loss": 0.0113, + "step": 917 + }, + { + "epoch": 1.7691566265060241, + "grad_norm": 0.21911191940307617, + "learning_rate": 1.7295019765390618e-05, + "loss": 0.0071, + "step": 918 + }, + { + "epoch": 1.7710843373493976, + "grad_norm": 0.343159943819046, + "learning_rate": 1.7250590820840903e-05, + "loss": 0.0144, + "step": 919 + }, + { + "epoch": 1.7730120481927711, + "grad_norm": 0.32204556465148926, + "learning_rate": 1.720617570121259e-05, + "loss": 0.0131, + "step": 920 + }, + { + "epoch": 1.7749397590361444, + "grad_norm": 0.4105585515499115, + "learning_rate": 1.7161774629839328e-05, + "loss": 0.0148, + "step": 921 + }, + { + "epoch": 1.7768674698795182, + "grad_norm": 0.16380974650382996, + "learning_rate": 1.7117387829984093e-05, + "loss": 0.0066, + "step": 922 + }, + { + "epoch": 1.7787951807228914, + "grad_norm": 0.22920913994312286, + "learning_rate": 1.707301552483813e-05, + "loss": 0.0105, + "step": 923 + }, + { + "epoch": 1.7807228915662652, + "grad_norm": 0.2075149267911911, + "learning_rate": 1.7028657937519767e-05, + "loss": 0.0104, + "step": 924 + }, + { + "epoch": 1.7826506024096385, + "grad_norm": 0.44439977407455444, + "learning_rate": 1.6984315291073355e-05, + "loss": 0.0134, + "step": 925 + }, + { + "epoch": 1.7845783132530122, + "grad_norm": 0.24068203568458557, + "learning_rate": 1.6939987808468125e-05, + "loss": 0.0078, + "step": 926 + }, + { + "epoch": 1.7865060240963855, + "grad_norm": 0.34044349193573, + "learning_rate": 1.689567571259701e-05, + "loss": 0.0108, + "step": 927 + }, + { + "epoch": 1.788433734939759, + "grad_norm": 0.34082743525505066, + "learning_rate": 1.6851379226275624e-05, + "loss": 0.0266, + "step": 928 + }, + { + "epoch": 1.7903614457831325, + "grad_norm": 0.19490115344524384, + "learning_rate": 1.6807098572241075e-05, + "loss": 0.0109, + "step": 929 + }, + { + "epoch": 1.792289156626506, + "grad_norm": 0.16208237409591675, + "learning_rate": 1.6762833973150846e-05, + "loss": 0.0113, + "step": 930 + }, + { + "epoch": 1.7942168674698795, + "grad_norm": 0.35555699467658997, + "learning_rate": 1.671858565158172e-05, + "loss": 0.0196, + "step": 931 + }, + { + "epoch": 1.796144578313253, + "grad_norm": 0.1600857824087143, + "learning_rate": 1.6674353830028587e-05, + "loss": 0.0089, + "step": 932 + }, + { + "epoch": 1.7980722891566265, + "grad_norm": 0.1699574887752533, + "learning_rate": 1.663013873090342e-05, + "loss": 0.0074, + "step": 933 + }, + { + "epoch": 1.8, + "grad_norm": 0.2472933828830719, + "learning_rate": 1.6585940576534086e-05, + "loss": 0.0063, + "step": 934 + }, + { + "epoch": 1.8019277108433736, + "grad_norm": 0.23491555452346802, + "learning_rate": 1.654175958916323e-05, + "loss": 0.0101, + "step": 935 + }, + { + "epoch": 1.8038554216867468, + "grad_norm": 0.28635191917419434, + "learning_rate": 1.6497595990947195e-05, + "loss": 0.0131, + "step": 936 + }, + { + "epoch": 1.8057831325301206, + "grad_norm": 0.15400712192058563, + "learning_rate": 1.645345000395489e-05, + "loss": 0.0068, + "step": 937 + }, + { + "epoch": 1.8077108433734939, + "grad_norm": 0.18223172426223755, + "learning_rate": 1.6409321850166647e-05, + "loss": 0.0094, + "step": 938 + }, + { + "epoch": 1.8096385542168676, + "grad_norm": 0.2789457142353058, + "learning_rate": 1.636521175147316e-05, + "loss": 0.0202, + "step": 939 + }, + { + "epoch": 1.8115662650602409, + "grad_norm": 0.4267627000808716, + "learning_rate": 1.6321119929674297e-05, + "loss": 0.0176, + "step": 940 + }, + { + "epoch": 1.8134939759036146, + "grad_norm": 0.3021615445613861, + "learning_rate": 1.6277046606478056e-05, + "loss": 0.0085, + "step": 941 + }, + { + "epoch": 1.815421686746988, + "grad_norm": 0.3724934756755829, + "learning_rate": 1.6232992003499405e-05, + "loss": 0.0474, + "step": 942 + }, + { + "epoch": 1.8173493975903614, + "grad_norm": 0.20904326438903809, + "learning_rate": 1.6188956342259177e-05, + "loss": 0.0078, + "step": 943 + }, + { + "epoch": 1.819277108433735, + "grad_norm": 0.31168171763420105, + "learning_rate": 1.614493984418297e-05, + "loss": 0.0174, + "step": 944 + }, + { + "epoch": 1.8212048192771084, + "grad_norm": 0.21273556351661682, + "learning_rate": 1.6100942730600003e-05, + "loss": 0.0054, + "step": 945 + }, + { + "epoch": 1.823132530120482, + "grad_norm": 0.16991695761680603, + "learning_rate": 1.6056965222742055e-05, + "loss": 0.0063, + "step": 946 + }, + { + "epoch": 1.8250602409638554, + "grad_norm": 0.22762684524059296, + "learning_rate": 1.6013007541742303e-05, + "loss": 0.0234, + "step": 947 + }, + { + "epoch": 1.826987951807229, + "grad_norm": 0.20128795504570007, + "learning_rate": 1.596906990863422e-05, + "loss": 0.0095, + "step": 948 + }, + { + "epoch": 1.8289156626506025, + "grad_norm": 0.30772027373313904, + "learning_rate": 1.592515254435048e-05, + "loss": 0.0356, + "step": 949 + }, + { + "epoch": 1.830843373493976, + "grad_norm": 0.12954631447792053, + "learning_rate": 1.5881255669721857e-05, + "loss": 0.008, + "step": 950 + }, + { + "epoch": 1.8327710843373493, + "grad_norm": 0.7787145972251892, + "learning_rate": 1.5837379505476054e-05, + "loss": 0.0108, + "step": 951 + }, + { + "epoch": 1.834698795180723, + "grad_norm": 0.1683879941701889, + "learning_rate": 1.5793524272236683e-05, + "loss": 0.006, + "step": 952 + }, + { + "epoch": 1.8366265060240963, + "grad_norm": 0.16475361585617065, + "learning_rate": 1.5749690190522076e-05, + "loss": 0.0065, + "step": 953 + }, + { + "epoch": 1.83855421686747, + "grad_norm": 0.211905375123024, + "learning_rate": 1.5705877480744214e-05, + "loss": 0.0092, + "step": 954 + }, + { + "epoch": 1.8404819277108433, + "grad_norm": 0.23850117623806, + "learning_rate": 1.5662086363207628e-05, + "loss": 0.012, + "step": 955 + }, + { + "epoch": 1.842409638554217, + "grad_norm": 0.19100065529346466, + "learning_rate": 1.561831705810825e-05, + "loss": 0.0113, + "step": 956 + }, + { + "epoch": 1.8443373493975903, + "grad_norm": 0.3635985255241394, + "learning_rate": 1.557456978553236e-05, + "loss": 0.0168, + "step": 957 + }, + { + "epoch": 1.8462650602409638, + "grad_norm": 0.16449116170406342, + "learning_rate": 1.553084476545544e-05, + "loss": 0.0042, + "step": 958 + }, + { + "epoch": 1.8481927710843373, + "grad_norm": 0.566093385219574, + "learning_rate": 1.5487142217741062e-05, + "loss": 0.0145, + "step": 959 + }, + { + "epoch": 1.8501204819277108, + "grad_norm": 0.15960252285003662, + "learning_rate": 1.5443462362139834e-05, + "loss": 0.0059, + "step": 960 + }, + { + "epoch": 1.8520481927710843, + "grad_norm": 0.40773797035217285, + "learning_rate": 1.539980541828823e-05, + "loss": 0.0257, + "step": 961 + }, + { + "epoch": 1.8539759036144579, + "grad_norm": 0.4802496135234833, + "learning_rate": 1.5356171605707522e-05, + "loss": 0.0111, + "step": 962 + }, + { + "epoch": 1.8559036144578314, + "grad_norm": 0.15745794773101807, + "learning_rate": 1.5312561143802704e-05, + "loss": 0.0049, + "step": 963 + }, + { + "epoch": 1.8578313253012049, + "grad_norm": 0.15139251947402954, + "learning_rate": 1.5268974251861298e-05, + "loss": 0.0077, + "step": 964 + }, + { + "epoch": 1.8597590361445784, + "grad_norm": 0.2188841849565506, + "learning_rate": 1.5225411149052356e-05, + "loss": 0.017, + "step": 965 + }, + { + "epoch": 1.8616867469879517, + "grad_norm": 0.10853131115436554, + "learning_rate": 1.5181872054425287e-05, + "loss": 0.0049, + "step": 966 + }, + { + "epoch": 1.8636144578313254, + "grad_norm": 0.8254880905151367, + "learning_rate": 1.5138357186908785e-05, + "loss": 0.0317, + "step": 967 + }, + { + "epoch": 1.8655421686746987, + "grad_norm": 0.2989620566368103, + "learning_rate": 1.5094866765309728e-05, + "loss": 0.0126, + "step": 968 + }, + { + "epoch": 1.8674698795180724, + "grad_norm": 0.16411150991916656, + "learning_rate": 1.5051401008312054e-05, + "loss": 0.0101, + "step": 969 + }, + { + "epoch": 1.8693975903614457, + "grad_norm": 0.2861763834953308, + "learning_rate": 1.5007960134475706e-05, + "loss": 0.0155, + "step": 970 + }, + { + "epoch": 1.8713253012048194, + "grad_norm": 0.24879588186740875, + "learning_rate": 1.4964544362235487e-05, + "loss": 0.0187, + "step": 971 + }, + { + "epoch": 1.8732530120481927, + "grad_norm": 0.2433672398328781, + "learning_rate": 1.4921153909899983e-05, + "loss": 0.0084, + "step": 972 + }, + { + "epoch": 1.8751807228915662, + "grad_norm": 0.15097154676914215, + "learning_rate": 1.487778899565047e-05, + "loss": 0.007, + "step": 973 + }, + { + "epoch": 1.8771084337349397, + "grad_norm": 0.1629047691822052, + "learning_rate": 1.4834449837539806e-05, + "loss": 0.0058, + "step": 974 + }, + { + "epoch": 1.8790361445783132, + "grad_norm": 0.9937071204185486, + "learning_rate": 1.4791136653491333e-05, + "loss": 0.0323, + "step": 975 + }, + { + "epoch": 1.8809638554216868, + "grad_norm": 0.19555562734603882, + "learning_rate": 1.4747849661297808e-05, + "loss": 0.0126, + "step": 976 + }, + { + "epoch": 1.8828915662650603, + "grad_norm": 0.16147711873054504, + "learning_rate": 1.470458907862026e-05, + "loss": 0.0067, + "step": 977 + }, + { + "epoch": 1.8848192771084338, + "grad_norm": 0.2730027735233307, + "learning_rate": 1.4661355122986945e-05, + "loss": 0.0147, + "step": 978 + }, + { + "epoch": 1.886746987951807, + "grad_norm": 0.13759832084178925, + "learning_rate": 1.4618148011792206e-05, + "loss": 0.0038, + "step": 979 + }, + { + "epoch": 1.8886746987951808, + "grad_norm": 0.33516690135002136, + "learning_rate": 1.4574967962295419e-05, + "loss": 0.0139, + "step": 980 + }, + { + "epoch": 1.890602409638554, + "grad_norm": 0.2345741093158722, + "learning_rate": 1.4531815191619903e-05, + "loss": 0.0094, + "step": 981 + }, + { + "epoch": 1.8925301204819278, + "grad_norm": 0.14681044220924377, + "learning_rate": 1.4488689916751762e-05, + "loss": 0.0065, + "step": 982 + }, + { + "epoch": 1.894457831325301, + "grad_norm": 0.21143914759159088, + "learning_rate": 1.4445592354538885e-05, + "loss": 0.0057, + "step": 983 + }, + { + "epoch": 1.8963855421686748, + "grad_norm": 0.3109160363674164, + "learning_rate": 1.44025227216898e-05, + "loss": 0.0142, + "step": 984 + }, + { + "epoch": 1.8983132530120481, + "grad_norm": 0.24301907420158386, + "learning_rate": 1.435948123477259e-05, + "loss": 0.012, + "step": 985 + }, + { + "epoch": 1.9002409638554218, + "grad_norm": 0.19817675650119781, + "learning_rate": 1.431646811021382e-05, + "loss": 0.0097, + "step": 986 + }, + { + "epoch": 1.9021686746987951, + "grad_norm": 0.13464932143688202, + "learning_rate": 1.4273483564297425e-05, + "loss": 0.0046, + "step": 987 + }, + { + "epoch": 1.9040963855421686, + "grad_norm": 0.1698642522096634, + "learning_rate": 1.4230527813163656e-05, + "loss": 0.0038, + "step": 988 + }, + { + "epoch": 1.9060240963855422, + "grad_norm": 0.19395388662815094, + "learning_rate": 1.4187601072807975e-05, + "loss": 0.0123, + "step": 989 + }, + { + "epoch": 1.9079518072289157, + "grad_norm": 0.2093188613653183, + "learning_rate": 1.4144703559079948e-05, + "loss": 0.0093, + "step": 990 + }, + { + "epoch": 1.9098795180722892, + "grad_norm": 0.1529311090707779, + "learning_rate": 1.4101835487682198e-05, + "loss": 0.0051, + "step": 991 + }, + { + "epoch": 1.9118072289156627, + "grad_norm": 0.18725350499153137, + "learning_rate": 1.4058997074169299e-05, + "loss": 0.0083, + "step": 992 + }, + { + "epoch": 1.9137349397590362, + "grad_norm": 0.15601560473442078, + "learning_rate": 1.401618853394668e-05, + "loss": 0.0086, + "step": 993 + }, + { + "epoch": 1.9156626506024095, + "grad_norm": 0.23890644311904907, + "learning_rate": 1.3973410082269591e-05, + "loss": 0.015, + "step": 994 + }, + { + "epoch": 1.9175903614457832, + "grad_norm": 0.2442619949579239, + "learning_rate": 1.3930661934241947e-05, + "loss": 0.0089, + "step": 995 + }, + { + "epoch": 1.9195180722891565, + "grad_norm": 0.1540212482213974, + "learning_rate": 1.388794430481532e-05, + "loss": 0.0072, + "step": 996 + }, + { + "epoch": 1.9214457831325302, + "grad_norm": 0.1359291970729828, + "learning_rate": 1.3845257408787807e-05, + "loss": 0.0131, + "step": 997 + }, + { + "epoch": 1.9233734939759035, + "grad_norm": 0.25486138463020325, + "learning_rate": 1.3802601460802967e-05, + "loss": 0.0198, + "step": 998 + }, + { + "epoch": 1.9253012048192772, + "grad_norm": 0.28815609216690063, + "learning_rate": 1.3759976675348754e-05, + "loss": 0.014, + "step": 999 + }, + { + "epoch": 1.9272289156626505, + "grad_norm": 0.15648497641086578, + "learning_rate": 1.3717383266756403e-05, + "loss": 0.0065, + "step": 1000 + }, + { + "epoch": 1.929156626506024, + "grad_norm": 0.16912540793418884, + "learning_rate": 1.367482144919941e-05, + "loss": 0.0059, + "step": 1001 + }, + { + "epoch": 1.9310843373493976, + "grad_norm": 0.16896723210811615, + "learning_rate": 1.3632291436692397e-05, + "loss": 0.0054, + "step": 1002 + }, + { + "epoch": 1.933012048192771, + "grad_norm": 0.20287497341632843, + "learning_rate": 1.3589793443090064e-05, + "loss": 0.0097, + "step": 1003 + }, + { + "epoch": 1.9349397590361446, + "grad_norm": 0.14804276823997498, + "learning_rate": 1.3547327682086114e-05, + "loss": 0.0125, + "step": 1004 + }, + { + "epoch": 1.936867469879518, + "grad_norm": 0.23820064961910248, + "learning_rate": 1.3504894367212171e-05, + "loss": 0.0131, + "step": 1005 + }, + { + "epoch": 1.9387951807228916, + "grad_norm": 0.25607362389564514, + "learning_rate": 1.34624937118367e-05, + "loss": 0.0115, + "step": 1006 + }, + { + "epoch": 1.940722891566265, + "grad_norm": 0.37233737111091614, + "learning_rate": 1.3420125929163976e-05, + "loss": 0.0309, + "step": 1007 + }, + { + "epoch": 1.9426506024096386, + "grad_norm": 0.19426730275154114, + "learning_rate": 1.3377791232232929e-05, + "loss": 0.0078, + "step": 1008 + }, + { + "epoch": 1.944578313253012, + "grad_norm": 0.2784160077571869, + "learning_rate": 1.333548983391617e-05, + "loss": 0.0142, + "step": 1009 + }, + { + "epoch": 1.9465060240963856, + "grad_norm": 0.11407195776700974, + "learning_rate": 1.3293221946918853e-05, + "loss": 0.0035, + "step": 1010 + }, + { + "epoch": 1.948433734939759, + "grad_norm": 0.3965436816215515, + "learning_rate": 1.325098778377762e-05, + "loss": 0.0242, + "step": 1011 + }, + { + "epoch": 1.9503614457831326, + "grad_norm": 0.18520519137382507, + "learning_rate": 1.3208787556859543e-05, + "loss": 0.0096, + "step": 1012 + }, + { + "epoch": 1.952289156626506, + "grad_norm": 0.2783315181732178, + "learning_rate": 1.3166621478361075e-05, + "loss": 0.0103, + "step": 1013 + }, + { + "epoch": 1.9542168674698797, + "grad_norm": 0.22714459896087646, + "learning_rate": 1.3124489760306917e-05, + "loss": 0.0078, + "step": 1014 + }, + { + "epoch": 1.956144578313253, + "grad_norm": 0.1257915049791336, + "learning_rate": 1.3082392614549036e-05, + "loss": 0.0077, + "step": 1015 + }, + { + "epoch": 1.9580722891566265, + "grad_norm": 0.15592887997627258, + "learning_rate": 1.3040330252765526e-05, + "loss": 0.0106, + "step": 1016 + }, + { + "epoch": 1.96, + "grad_norm": 0.19295449554920197, + "learning_rate": 1.2998302886459586e-05, + "loss": 0.0082, + "step": 1017 + }, + { + "epoch": 1.9619277108433735, + "grad_norm": 0.15544794499874115, + "learning_rate": 1.2956310726958472e-05, + "loss": 0.0068, + "step": 1018 + }, + { + "epoch": 1.963855421686747, + "grad_norm": 0.25899502635002136, + "learning_rate": 1.291435398541236e-05, + "loss": 0.0086, + "step": 1019 + }, + { + "epoch": 1.9657831325301205, + "grad_norm": 0.34639033675193787, + "learning_rate": 1.2872432872793379e-05, + "loss": 0.0116, + "step": 1020 + }, + { + "epoch": 1.967710843373494, + "grad_norm": 0.1628410518169403, + "learning_rate": 1.283054759989447e-05, + "loss": 0.0055, + "step": 1021 + }, + { + "epoch": 1.9696385542168675, + "grad_norm": 0.9273788928985596, + "learning_rate": 1.2788698377328385e-05, + "loss": 0.0264, + "step": 1022 + }, + { + "epoch": 1.971566265060241, + "grad_norm": 0.163126140832901, + "learning_rate": 1.2746885415526594e-05, + "loss": 0.0046, + "step": 1023 + }, + { + "epoch": 1.9734939759036143, + "grad_norm": 0.1475439816713333, + "learning_rate": 1.2705108924738223e-05, + "loss": 0.0056, + "step": 1024 + }, + { + "epoch": 1.975421686746988, + "grad_norm": 0.1654318869113922, + "learning_rate": 1.2663369115029034e-05, + "loss": 0.0056, + "step": 1025 + }, + { + "epoch": 1.9773493975903613, + "grad_norm": 0.20536045730113983, + "learning_rate": 1.2621666196280333e-05, + "loss": 0.0101, + "step": 1026 + }, + { + "epoch": 1.979277108433735, + "grad_norm": 0.19256474077701569, + "learning_rate": 1.258000037818792e-05, + "loss": 0.0059, + "step": 1027 + }, + { + "epoch": 1.9812048192771083, + "grad_norm": 0.2605120539665222, + "learning_rate": 1.2538371870261053e-05, + "loss": 0.0115, + "step": 1028 + }, + { + "epoch": 1.983132530120482, + "grad_norm": 0.14840295910835266, + "learning_rate": 1.249678088182137e-05, + "loss": 0.0046, + "step": 1029 + }, + { + "epoch": 1.9850602409638554, + "grad_norm": 0.17585207521915436, + "learning_rate": 1.2455227622001851e-05, + "loss": 0.0086, + "step": 1030 + }, + { + "epoch": 1.9869879518072289, + "grad_norm": 0.11044781655073166, + "learning_rate": 1.241371229974579e-05, + "loss": 0.0034, + "step": 1031 + }, + { + "epoch": 1.9889156626506024, + "grad_norm": 0.25584840774536133, + "learning_rate": 1.2372235123805672e-05, + "loss": 0.0245, + "step": 1032 + }, + { + "epoch": 1.9908433734939759, + "grad_norm": 0.25962474942207336, + "learning_rate": 1.2330796302742211e-05, + "loss": 0.0104, + "step": 1033 + }, + { + "epoch": 1.9927710843373494, + "grad_norm": 0.33408522605895996, + "learning_rate": 1.2289396044923238e-05, + "loss": 0.0176, + "step": 1034 + }, + { + "epoch": 1.994698795180723, + "grad_norm": 0.479950487613678, + "learning_rate": 1.2248034558522682e-05, + "loss": 0.0113, + "step": 1035 + }, + { + "epoch": 1.9966265060240964, + "grad_norm": 0.16567294299602509, + "learning_rate": 1.2206712051519518e-05, + "loss": 0.0036, + "step": 1036 + }, + { + "epoch": 1.99855421686747, + "grad_norm": 0.19343771040439606, + "learning_rate": 1.2165428731696713e-05, + "loss": 0.0077, + "step": 1037 + }, + { + "epoch": 2.0, + "grad_norm": 0.22895601391792297, + "learning_rate": 1.2124184806640202e-05, + "loss": 0.0114, + "step": 1038 + }, + { + "epoch": 2.0019277108433733, + "grad_norm": 0.15838384628295898, + "learning_rate": 1.208298048373782e-05, + "loss": 0.0043, + "step": 1039 + }, + { + "epoch": 2.003855421686747, + "grad_norm": 0.681065559387207, + "learning_rate": 1.2041815970178268e-05, + "loss": 0.0214, + "step": 1040 + }, + { + "epoch": 2.0057831325301203, + "grad_norm": 0.3357350528240204, + "learning_rate": 1.2000691472950081e-05, + "loss": 0.0079, + "step": 1041 + }, + { + "epoch": 2.007710843373494, + "grad_norm": 0.15238308906555176, + "learning_rate": 1.1959607198840568e-05, + "loss": 0.0041, + "step": 1042 + }, + { + "epoch": 2.0096385542168673, + "grad_norm": 0.11763229966163635, + "learning_rate": 1.1918563354434784e-05, + "loss": 0.0033, + "step": 1043 + }, + { + "epoch": 2.011566265060241, + "grad_norm": 0.3759301006793976, + "learning_rate": 1.1877560146114515e-05, + "loss": 0.0128, + "step": 1044 + }, + { + "epoch": 2.0134939759036143, + "grad_norm": 0.1143188625574112, + "learning_rate": 1.1836597780057183e-05, + "loss": 0.0078, + "step": 1045 + }, + { + "epoch": 2.015421686746988, + "grad_norm": 0.20059260725975037, + "learning_rate": 1.179567646223485e-05, + "loss": 0.0149, + "step": 1046 + }, + { + "epoch": 2.0173493975903614, + "grad_norm": 0.15569567680358887, + "learning_rate": 1.1754796398413196e-05, + "loss": 0.0038, + "step": 1047 + }, + { + "epoch": 2.019277108433735, + "grad_norm": 0.1153278723359108, + "learning_rate": 1.1713957794150423e-05, + "loss": 0.0041, + "step": 1048 + }, + { + "epoch": 2.0212048192771084, + "grad_norm": 0.1838717758655548, + "learning_rate": 1.1673160854796307e-05, + "loss": 0.0041, + "step": 1049 + }, + { + "epoch": 2.023132530120482, + "grad_norm": 0.12264502793550491, + "learning_rate": 1.1632405785491077e-05, + "loss": 0.0043, + "step": 1050 + }, + { + "epoch": 2.0250602409638554, + "grad_norm": 0.14363229274749756, + "learning_rate": 1.159169279116445e-05, + "loss": 0.0066, + "step": 1051 + }, + { + "epoch": 2.026987951807229, + "grad_norm": 0.1316995471715927, + "learning_rate": 1.1551022076534585e-05, + "loss": 0.0024, + "step": 1052 + }, + { + "epoch": 2.0289156626506024, + "grad_norm": 0.13392619788646698, + "learning_rate": 1.1510393846107001e-05, + "loss": 0.0051, + "step": 1053 + }, + { + "epoch": 2.0308433734939757, + "grad_norm": 3.0086817741394043, + "learning_rate": 1.1469808304173658e-05, + "loss": 0.0334, + "step": 1054 + }, + { + "epoch": 2.0327710843373494, + "grad_norm": 0.17756076157093048, + "learning_rate": 1.1429265654811803e-05, + "loss": 0.0068, + "step": 1055 + }, + { + "epoch": 2.0346987951807227, + "grad_norm": 0.13250532746315002, + "learning_rate": 1.1388766101883038e-05, + "loss": 0.0087, + "step": 1056 + }, + { + "epoch": 2.0366265060240965, + "grad_norm": 0.3534089922904968, + "learning_rate": 1.1348309849032257e-05, + "loss": 0.0076, + "step": 1057 + }, + { + "epoch": 2.0385542168674697, + "grad_norm": 0.11939049512147903, + "learning_rate": 1.1307897099686627e-05, + "loss": 0.0029, + "step": 1058 + }, + { + "epoch": 2.0404819277108435, + "grad_norm": 0.11862517893314362, + "learning_rate": 1.1267528057054562e-05, + "loss": 0.0062, + "step": 1059 + }, + { + "epoch": 2.0424096385542168, + "grad_norm": 0.1539212018251419, + "learning_rate": 1.1227202924124704e-05, + "loss": 0.0067, + "step": 1060 + }, + { + "epoch": 2.0443373493975905, + "grad_norm": 0.17163440585136414, + "learning_rate": 1.118692190366491e-05, + "loss": 0.0055, + "step": 1061 + }, + { + "epoch": 2.0462650602409638, + "grad_norm": 0.12304897606372833, + "learning_rate": 1.1146685198221222e-05, + "loss": 0.0036, + "step": 1062 + }, + { + "epoch": 2.0481927710843375, + "grad_norm": 0.17319051921367645, + "learning_rate": 1.1106493010116842e-05, + "loss": 0.0058, + "step": 1063 + }, + { + "epoch": 2.050120481927711, + "grad_norm": 0.2242443859577179, + "learning_rate": 1.1066345541451127e-05, + "loss": 0.0059, + "step": 1064 + }, + { + "epoch": 2.0520481927710845, + "grad_norm": 0.09533938020467758, + "learning_rate": 1.1026242994098597e-05, + "loss": 0.0033, + "step": 1065 + }, + { + "epoch": 2.053975903614458, + "grad_norm": 0.11697929352521896, + "learning_rate": 1.0986185569707852e-05, + "loss": 0.0038, + "step": 1066 + }, + { + "epoch": 2.0559036144578315, + "grad_norm": 0.2563149333000183, + "learning_rate": 1.0946173469700625e-05, + "loss": 0.0158, + "step": 1067 + }, + { + "epoch": 2.057831325301205, + "grad_norm": 0.21836932003498077, + "learning_rate": 1.0906206895270739e-05, + "loss": 0.0085, + "step": 1068 + }, + { + "epoch": 2.059759036144578, + "grad_norm": 0.1798071414232254, + "learning_rate": 1.0866286047383094e-05, + "loss": 0.0053, + "step": 1069 + }, + { + "epoch": 2.061686746987952, + "grad_norm": 0.08937730640172958, + "learning_rate": 1.0826411126772675e-05, + "loss": 0.0025, + "step": 1070 + }, + { + "epoch": 2.063614457831325, + "grad_norm": 0.0942138060927391, + "learning_rate": 1.0786582333943499e-05, + "loss": 0.0017, + "step": 1071 + }, + { + "epoch": 2.065542168674699, + "grad_norm": 0.13076582551002502, + "learning_rate": 1.0746799869167679e-05, + "loss": 0.0033, + "step": 1072 + }, + { + "epoch": 2.067469879518072, + "grad_norm": 0.0993233174085617, + "learning_rate": 1.0707063932484357e-05, + "loss": 0.0046, + "step": 1073 + }, + { + "epoch": 2.069397590361446, + "grad_norm": 0.3046741485595703, + "learning_rate": 1.0667374723698698e-05, + "loss": 0.009, + "step": 1074 + }, + { + "epoch": 2.071325301204819, + "grad_norm": 0.12197669595479965, + "learning_rate": 1.0627732442380932e-05, + "loss": 0.0034, + "step": 1075 + }, + { + "epoch": 2.073253012048193, + "grad_norm": 0.12721140682697296, + "learning_rate": 1.058813728786531e-05, + "loss": 0.0048, + "step": 1076 + }, + { + "epoch": 2.075180722891566, + "grad_norm": 0.10011966526508331, + "learning_rate": 1.0548589459249112e-05, + "loss": 0.0026, + "step": 1077 + }, + { + "epoch": 2.07710843373494, + "grad_norm": 0.3314201831817627, + "learning_rate": 1.0509089155391661e-05, + "loss": 0.0284, + "step": 1078 + }, + { + "epoch": 2.079036144578313, + "grad_norm": 0.32739701867103577, + "learning_rate": 1.0469636574913288e-05, + "loss": 0.0088, + "step": 1079 + }, + { + "epoch": 2.080963855421687, + "grad_norm": 0.13805675506591797, + "learning_rate": 1.043023191619438e-05, + "loss": 0.0042, + "step": 1080 + }, + { + "epoch": 2.0828915662650602, + "grad_norm": 0.14789745211601257, + "learning_rate": 1.039087537737435e-05, + "loss": 0.0037, + "step": 1081 + }, + { + "epoch": 2.0848192771084335, + "grad_norm": 0.15518991649150848, + "learning_rate": 1.0351567156350617e-05, + "loss": 0.0044, + "step": 1082 + }, + { + "epoch": 2.0867469879518072, + "grad_norm": 0.08380113542079926, + "learning_rate": 1.0312307450777706e-05, + "loss": 0.0019, + "step": 1083 + }, + { + "epoch": 2.0886746987951805, + "grad_norm": 0.17892400920391083, + "learning_rate": 1.027309645806613e-05, + "loss": 0.0065, + "step": 1084 + }, + { + "epoch": 2.0906024096385543, + "grad_norm": 0.5497608780860901, + "learning_rate": 1.0233934375381489e-05, + "loss": 0.0238, + "step": 1085 + }, + { + "epoch": 2.0925301204819275, + "grad_norm": 1.0189186334609985, + "learning_rate": 1.019482139964344e-05, + "loss": 0.0092, + "step": 1086 + }, + { + "epoch": 2.0944578313253013, + "grad_norm": 0.12144117057323456, + "learning_rate": 1.015575772752472e-05, + "loss": 0.0038, + "step": 1087 + }, + { + "epoch": 2.0963855421686746, + "grad_norm": 0.1115315854549408, + "learning_rate": 1.0116743555450148e-05, + "loss": 0.0024, + "step": 1088 + }, + { + "epoch": 2.0983132530120483, + "grad_norm": 0.22671759128570557, + "learning_rate": 1.0077779079595631e-05, + "loss": 0.0136, + "step": 1089 + }, + { + "epoch": 2.1002409638554216, + "grad_norm": 2.0009827613830566, + "learning_rate": 1.003886449588719e-05, + "loss": 0.0493, + "step": 1090 + }, + { + "epoch": 2.1021686746987953, + "grad_norm": 0.11907301843166351, + "learning_rate": 1.0000000000000006e-05, + "loss": 0.0034, + "step": 1091 + }, + { + "epoch": 2.1040963855421686, + "grad_norm": 0.31257638335227966, + "learning_rate": 9.961185787357346e-06, + "loss": 0.0129, + "step": 1092 + }, + { + "epoch": 2.1060240963855423, + "grad_norm": 0.11033743619918823, + "learning_rate": 9.922422053129674e-06, + "loss": 0.0184, + "step": 1093 + }, + { + "epoch": 2.1079518072289156, + "grad_norm": 0.2575698494911194, + "learning_rate": 9.883708992233626e-06, + "loss": 0.0054, + "step": 1094 + }, + { + "epoch": 2.1098795180722894, + "grad_norm": 0.12921132147312164, + "learning_rate": 9.845046799331029e-06, + "loss": 0.0037, + "step": 1095 + }, + { + "epoch": 2.1118072289156626, + "grad_norm": 0.21405921876430511, + "learning_rate": 9.806435668827941e-06, + "loss": 0.006, + "step": 1096 + }, + { + "epoch": 2.113734939759036, + "grad_norm": 0.12929430603981018, + "learning_rate": 9.76787579487363e-06, + "loss": 0.0049, + "step": 1097 + }, + { + "epoch": 2.1156626506024097, + "grad_norm": 0.1793181151151657, + "learning_rate": 9.729367371359681e-06, + "loss": 0.0086, + "step": 1098 + }, + { + "epoch": 2.117590361445783, + "grad_norm": 0.2182074338197708, + "learning_rate": 9.690910591918936e-06, + "loss": 0.0106, + "step": 1099 + }, + { + "epoch": 2.1195180722891567, + "grad_norm": 0.0705680400133133, + "learning_rate": 9.652505649924547e-06, + "loss": 0.0012, + "step": 1100 + }, + { + "epoch": 2.12144578313253, + "grad_norm": 0.10509738326072693, + "learning_rate": 9.614152738489021e-06, + "loss": 0.0048, + "step": 1101 + }, + { + "epoch": 2.1233734939759037, + "grad_norm": 0.13775436580181122, + "learning_rate": 9.575852050463268e-06, + "loss": 0.0089, + "step": 1102 + }, + { + "epoch": 2.125301204819277, + "grad_norm": 0.15230101346969604, + "learning_rate": 9.537603778435545e-06, + "loss": 0.0065, + "step": 1103 + }, + { + "epoch": 2.1272289156626507, + "grad_norm": 0.24702346324920654, + "learning_rate": 9.499408114730583e-06, + "loss": 0.016, + "step": 1104 + }, + { + "epoch": 2.129156626506024, + "grad_norm": 0.1082577034831047, + "learning_rate": 9.461265251408575e-06, + "loss": 0.0036, + "step": 1105 + }, + { + "epoch": 2.1310843373493977, + "grad_norm": 0.1063847690820694, + "learning_rate": 9.423175380264211e-06, + "loss": 0.0037, + "step": 1106 + }, + { + "epoch": 2.133012048192771, + "grad_norm": 0.07686953246593475, + "learning_rate": 9.385138692825729e-06, + "loss": 0.0031, + "step": 1107 + }, + { + "epoch": 2.1349397590361447, + "grad_norm": 0.2046380341053009, + "learning_rate": 9.347155380353912e-06, + "loss": 0.0087, + "step": 1108 + }, + { + "epoch": 2.136867469879518, + "grad_norm": 0.1341692954301834, + "learning_rate": 9.30922563384121e-06, + "loss": 0.0045, + "step": 1109 + }, + { + "epoch": 2.1387951807228918, + "grad_norm": 0.09870535880327225, + "learning_rate": 9.271349644010672e-06, + "loss": 0.003, + "step": 1110 + }, + { + "epoch": 2.140722891566265, + "grad_norm": 0.18708615005016327, + "learning_rate": 9.233527601315069e-06, + "loss": 0.0042, + "step": 1111 + }, + { + "epoch": 2.1426506024096383, + "grad_norm": 0.5175634026527405, + "learning_rate": 9.195759695935907e-06, + "loss": 0.0173, + "step": 1112 + }, + { + "epoch": 2.144578313253012, + "grad_norm": 0.14939036965370178, + "learning_rate": 9.158046117782464e-06, + "loss": 0.0031, + "step": 1113 + }, + { + "epoch": 2.1465060240963854, + "grad_norm": 0.2837410569190979, + "learning_rate": 9.120387056490851e-06, + "loss": 0.0097, + "step": 1114 + }, + { + "epoch": 2.148433734939759, + "grad_norm": 0.11088677495718002, + "learning_rate": 9.082782701423047e-06, + "loss": 0.0026, + "step": 1115 + }, + { + "epoch": 2.1503614457831324, + "grad_norm": 0.07785166054964066, + "learning_rate": 9.045233241665947e-06, + "loss": 0.0019, + "step": 1116 + }, + { + "epoch": 2.152289156626506, + "grad_norm": 0.17568141222000122, + "learning_rate": 9.007738866030427e-06, + "loss": 0.0039, + "step": 1117 + }, + { + "epoch": 2.1542168674698794, + "grad_norm": 0.12652266025543213, + "learning_rate": 8.970299763050356e-06, + "loss": 0.0033, + "step": 1118 + }, + { + "epoch": 2.156144578313253, + "grad_norm": 0.16801467537879944, + "learning_rate": 8.932916120981695e-06, + "loss": 0.0076, + "step": 1119 + }, + { + "epoch": 2.1580722891566264, + "grad_norm": 0.18313169479370117, + "learning_rate": 8.895588127801545e-06, + "loss": 0.0052, + "step": 1120 + }, + { + "epoch": 2.16, + "grad_norm": 0.07546049356460571, + "learning_rate": 8.858315971207146e-06, + "loss": 0.0022, + "step": 1121 + }, + { + "epoch": 2.1619277108433734, + "grad_norm": 0.4039839208126068, + "learning_rate": 8.821099838614996e-06, + "loss": 0.0203, + "step": 1122 + }, + { + "epoch": 2.163855421686747, + "grad_norm": 0.09244243055582047, + "learning_rate": 8.783939917159897e-06, + "loss": 0.002, + "step": 1123 + }, + { + "epoch": 2.1657831325301204, + "grad_norm": 0.18327835202217102, + "learning_rate": 8.746836393693978e-06, + "loss": 0.0055, + "step": 1124 + }, + { + "epoch": 2.167710843373494, + "grad_norm": 0.22010307013988495, + "learning_rate": 8.709789454785809e-06, + "loss": 0.0077, + "step": 1125 + }, + { + "epoch": 2.1696385542168675, + "grad_norm": 0.09438297897577286, + "learning_rate": 8.67279928671939e-06, + "loss": 0.0032, + "step": 1126 + }, + { + "epoch": 2.1715662650602408, + "grad_norm": 0.20782770216464996, + "learning_rate": 8.635866075493318e-06, + "loss": 0.0028, + "step": 1127 + }, + { + "epoch": 2.1734939759036145, + "grad_norm": 0.1958685964345932, + "learning_rate": 8.598990006819756e-06, + "loss": 0.0047, + "step": 1128 + }, + { + "epoch": 2.1754216867469878, + "grad_norm": 0.06459935009479523, + "learning_rate": 8.562171266123528e-06, + "loss": 0.0015, + "step": 1129 + }, + { + "epoch": 2.1773493975903615, + "grad_norm": 0.33486708998680115, + "learning_rate": 8.525410038541218e-06, + "loss": 0.0094, + "step": 1130 + }, + { + "epoch": 2.179277108433735, + "grad_norm": 0.5755940079689026, + "learning_rate": 8.488706508920202e-06, + "loss": 0.0067, + "step": 1131 + }, + { + "epoch": 2.1812048192771085, + "grad_norm": 0.10840924829244614, + "learning_rate": 8.452060861817738e-06, + "loss": 0.0082, + "step": 1132 + }, + { + "epoch": 2.183132530120482, + "grad_norm": 0.18611350655555725, + "learning_rate": 8.415473281500037e-06, + "loss": 0.0059, + "step": 1133 + }, + { + "epoch": 2.1850602409638555, + "grad_norm": 0.11245249956846237, + "learning_rate": 8.378943951941301e-06, + "loss": 0.0107, + "step": 1134 + }, + { + "epoch": 2.186987951807229, + "grad_norm": 0.12284426391124725, + "learning_rate": 8.342473056822873e-06, + "loss": 0.0025, + "step": 1135 + }, + { + "epoch": 2.1889156626506026, + "grad_norm": 0.12542888522148132, + "learning_rate": 8.306060779532245e-06, + "loss": 0.0059, + "step": 1136 + }, + { + "epoch": 2.190843373493976, + "grad_norm": 0.1287655532360077, + "learning_rate": 8.26970730316215e-06, + "loss": 0.0022, + "step": 1137 + }, + { + "epoch": 2.1927710843373496, + "grad_norm": 0.1818632185459137, + "learning_rate": 8.233412810509669e-06, + "loss": 0.0131, + "step": 1138 + }, + { + "epoch": 2.194698795180723, + "grad_norm": 0.09687745571136475, + "learning_rate": 8.197177484075284e-06, + "loss": 0.0025, + "step": 1139 + }, + { + "epoch": 2.1966265060240966, + "grad_norm": 0.16103452444076538, + "learning_rate": 8.161001506061979e-06, + "loss": 0.0031, + "step": 1140 + }, + { + "epoch": 2.19855421686747, + "grad_norm": 0.2711680233478546, + "learning_rate": 8.124885058374302e-06, + "loss": 0.0034, + "step": 1141 + }, + { + "epoch": 2.200481927710843, + "grad_norm": 0.17613105475902557, + "learning_rate": 8.088828322617473e-06, + "loss": 0.0044, + "step": 1142 + }, + { + "epoch": 2.202409638554217, + "grad_norm": 0.2298487424850464, + "learning_rate": 8.052831480096464e-06, + "loss": 0.0168, + "step": 1143 + }, + { + "epoch": 2.20433734939759, + "grad_norm": 0.17042206227779388, + "learning_rate": 8.016894711815067e-06, + "loss": 0.007, + "step": 1144 + }, + { + "epoch": 2.206265060240964, + "grad_norm": 0.2830466628074646, + "learning_rate": 7.98101819847501e-06, + "loss": 0.0091, + "step": 1145 + }, + { + "epoch": 2.208192771084337, + "grad_norm": 0.22089065611362457, + "learning_rate": 7.945202120475063e-06, + "loss": 0.0046, + "step": 1146 + }, + { + "epoch": 2.210120481927711, + "grad_norm": 0.1716073900461197, + "learning_rate": 7.909446657910072e-06, + "loss": 0.0032, + "step": 1147 + }, + { + "epoch": 2.212048192771084, + "grad_norm": 0.16140373051166534, + "learning_rate": 7.873751990570104e-06, + "loss": 0.0057, + "step": 1148 + }, + { + "epoch": 2.213975903614458, + "grad_norm": 0.1671605408191681, + "learning_rate": 7.838118297939529e-06, + "loss": 0.0039, + "step": 1149 + }, + { + "epoch": 2.2159036144578312, + "grad_norm": 0.10933005809783936, + "learning_rate": 7.802545759196117e-06, + "loss": 0.005, + "step": 1150 + }, + { + "epoch": 2.217831325301205, + "grad_norm": 0.07819998264312744, + "learning_rate": 7.76703455321014e-06, + "loss": 0.0025, + "step": 1151 + }, + { + "epoch": 2.2197590361445783, + "grad_norm": 0.36211854219436646, + "learning_rate": 7.73158485854344e-06, + "loss": 0.0151, + "step": 1152 + }, + { + "epoch": 2.221686746987952, + "grad_norm": 0.09098304808139801, + "learning_rate": 7.696196853448612e-06, + "loss": 0.0027, + "step": 1153 + }, + { + "epoch": 2.2236144578313253, + "grad_norm": 0.17442144453525543, + "learning_rate": 7.660870715868018e-06, + "loss": 0.006, + "step": 1154 + }, + { + "epoch": 2.225542168674699, + "grad_norm": 0.09785338491201401, + "learning_rate": 7.625606623432933e-06, + "loss": 0.0041, + "step": 1155 + }, + { + "epoch": 2.2274698795180723, + "grad_norm": 0.19399888813495636, + "learning_rate": 7.590404753462653e-06, + "loss": 0.0125, + "step": 1156 + }, + { + "epoch": 2.2293975903614456, + "grad_norm": 0.11080623418092728, + "learning_rate": 7.55526528296362e-06, + "loss": 0.0022, + "step": 1157 + }, + { + "epoch": 2.2313253012048193, + "grad_norm": 0.14067359268665314, + "learning_rate": 7.520188388628473e-06, + "loss": 0.0123, + "step": 1158 + }, + { + "epoch": 2.2332530120481926, + "grad_norm": 0.14533625543117523, + "learning_rate": 7.485174246835227e-06, + "loss": 0.0039, + "step": 1159 + }, + { + "epoch": 2.2351807228915663, + "grad_norm": 0.1253812462091446, + "learning_rate": 7.4502230336463466e-06, + "loss": 0.003, + "step": 1160 + }, + { + "epoch": 2.2371084337349396, + "grad_norm": 0.12766572833061218, + "learning_rate": 7.415334924807869e-06, + "loss": 0.0044, + "step": 1161 + }, + { + "epoch": 2.2390361445783133, + "grad_norm": 0.11985791474580765, + "learning_rate": 7.380510095748535e-06, + "loss": 0.0071, + "step": 1162 + }, + { + "epoch": 2.2409638554216866, + "grad_norm": 0.15505346655845642, + "learning_rate": 7.3457487215788605e-06, + "loss": 0.0046, + "step": 1163 + }, + { + "epoch": 2.2428915662650604, + "grad_norm": 0.18983210623264313, + "learning_rate": 7.311050977090343e-06, + "loss": 0.0079, + "step": 1164 + }, + { + "epoch": 2.2448192771084337, + "grad_norm": 0.19279207289218903, + "learning_rate": 7.276417036754479e-06, + "loss": 0.0042, + "step": 1165 + }, + { + "epoch": 2.2467469879518074, + "grad_norm": 0.21539707481861115, + "learning_rate": 7.241847074721964e-06, + "loss": 0.0087, + "step": 1166 + }, + { + "epoch": 2.2486746987951807, + "grad_norm": 0.07004354894161224, + "learning_rate": 7.207341264821783e-06, + "loss": 0.002, + "step": 1167 + }, + { + "epoch": 2.2506024096385544, + "grad_norm": 0.2203039526939392, + "learning_rate": 7.172899780560345e-06, + "loss": 0.0069, + "step": 1168 + }, + { + "epoch": 2.2525301204819277, + "grad_norm": 0.12474718689918518, + "learning_rate": 7.138522795120606e-06, + "loss": 0.0122, + "step": 1169 + }, + { + "epoch": 2.2544578313253014, + "grad_norm": 0.09078995883464813, + "learning_rate": 7.104210481361204e-06, + "loss": 0.0025, + "step": 1170 + }, + { + "epoch": 2.2563855421686747, + "grad_norm": 0.141757071018219, + "learning_rate": 7.069963011815584e-06, + "loss": 0.0039, + "step": 1171 + }, + { + "epoch": 2.258313253012048, + "grad_norm": 0.14944659173488617, + "learning_rate": 7.035780558691141e-06, + "loss": 0.0025, + "step": 1172 + }, + { + "epoch": 2.2602409638554217, + "grad_norm": 0.06723666191101074, + "learning_rate": 7.001663293868328e-06, + "loss": 0.0014, + "step": 1173 + }, + { + "epoch": 2.262168674698795, + "grad_norm": 0.11966485530138016, + "learning_rate": 6.967611388899826e-06, + "loss": 0.0067, + "step": 1174 + }, + { + "epoch": 2.2640963855421687, + "grad_norm": 0.08943185210227966, + "learning_rate": 6.933625015009666e-06, + "loss": 0.0036, + "step": 1175 + }, + { + "epoch": 2.266024096385542, + "grad_norm": 0.04511453956365585, + "learning_rate": 6.899704343092359e-06, + "loss": 0.0014, + "step": 1176 + }, + { + "epoch": 2.2679518072289158, + "grad_norm": 0.1867951601743698, + "learning_rate": 6.865849543712058e-06, + "loss": 0.009, + "step": 1177 + }, + { + "epoch": 2.269879518072289, + "grad_norm": 0.23791250586509705, + "learning_rate": 6.832060787101658e-06, + "loss": 0.0117, + "step": 1178 + }, + { + "epoch": 2.271807228915663, + "grad_norm": 0.13210316002368927, + "learning_rate": 6.798338243162008e-06, + "loss": 0.0024, + "step": 1179 + }, + { + "epoch": 2.273734939759036, + "grad_norm": 0.1601375937461853, + "learning_rate": 6.764682081461002e-06, + "loss": 0.013, + "step": 1180 + }, + { + "epoch": 2.27566265060241, + "grad_norm": 0.21996766328811646, + "learning_rate": 6.73109247123273e-06, + "loss": 0.0074, + "step": 1181 + }, + { + "epoch": 2.277590361445783, + "grad_norm": 0.15780030190944672, + "learning_rate": 6.6975695813766465e-06, + "loss": 0.0052, + "step": 1182 + }, + { + "epoch": 2.279518072289157, + "grad_norm": 0.18146437406539917, + "learning_rate": 6.664113580456739e-06, + "loss": 0.0265, + "step": 1183 + }, + { + "epoch": 2.28144578313253, + "grad_norm": 0.12033495306968689, + "learning_rate": 6.630724636700618e-06, + "loss": 0.0026, + "step": 1184 + }, + { + "epoch": 2.283373493975904, + "grad_norm": 0.25268155336380005, + "learning_rate": 6.59740291799873e-06, + "loss": 0.0046, + "step": 1185 + }, + { + "epoch": 2.285301204819277, + "grad_norm": 0.19043004512786865, + "learning_rate": 6.564148591903488e-06, + "loss": 0.0063, + "step": 1186 + }, + { + "epoch": 2.2872289156626504, + "grad_norm": 0.06894923001527786, + "learning_rate": 6.530961825628432e-06, + "loss": 0.0012, + "step": 1187 + }, + { + "epoch": 2.289156626506024, + "grad_norm": 0.16378818452358246, + "learning_rate": 6.4978427860474015e-06, + "loss": 0.0048, + "step": 1188 + }, + { + "epoch": 2.2910843373493974, + "grad_norm": 0.11130444705486298, + "learning_rate": 6.464791639693648e-06, + "loss": 0.0049, + "step": 1189 + }, + { + "epoch": 2.293012048192771, + "grad_norm": 0.10573417693376541, + "learning_rate": 6.431808552759083e-06, + "loss": 0.0019, + "step": 1190 + }, + { + "epoch": 2.2949397590361444, + "grad_norm": 0.13344882428646088, + "learning_rate": 6.398893691093367e-06, + "loss": 0.0033, + "step": 1191 + }, + { + "epoch": 2.296867469879518, + "grad_norm": 0.12659135460853577, + "learning_rate": 6.366047220203088e-06, + "loss": 0.0032, + "step": 1192 + }, + { + "epoch": 2.2987951807228915, + "grad_norm": 0.10152821987867355, + "learning_rate": 6.333269305250971e-06, + "loss": 0.0027, + "step": 1193 + }, + { + "epoch": 2.300722891566265, + "grad_norm": 0.1889944225549698, + "learning_rate": 6.300560111055006e-06, + "loss": 0.0062, + "step": 1194 + }, + { + "epoch": 2.3026506024096385, + "grad_norm": 2.3101227283477783, + "learning_rate": 6.2679198020876275e-06, + "loss": 0.0113, + "step": 1195 + }, + { + "epoch": 2.304578313253012, + "grad_norm": 0.6224933862686157, + "learning_rate": 6.235348542474908e-06, + "loss": 0.0273, + "step": 1196 + }, + { + "epoch": 2.3065060240963855, + "grad_norm": 0.1908419281244278, + "learning_rate": 6.202846495995705e-06, + "loss": 0.0056, + "step": 1197 + }, + { + "epoch": 2.3084337349397592, + "grad_norm": 0.10968491435050964, + "learning_rate": 6.170413826080856e-06, + "loss": 0.0034, + "step": 1198 + }, + { + "epoch": 2.3103614457831325, + "grad_norm": 0.23200668394565582, + "learning_rate": 6.138050695812343e-06, + "loss": 0.0042, + "step": 1199 + }, + { + "epoch": 2.3122891566265062, + "grad_norm": 0.12442032992839813, + "learning_rate": 6.105757267922481e-06, + "loss": 0.0045, + "step": 1200 + }, + { + "epoch": 2.3142168674698795, + "grad_norm": 0.14563624560832977, + "learning_rate": 6.073533704793122e-06, + "loss": 0.0035, + "step": 1201 + }, + { + "epoch": 2.316144578313253, + "grad_norm": 0.11523722857236862, + "learning_rate": 6.04138016845478e-06, + "loss": 0.0088, + "step": 1202 + }, + { + "epoch": 2.3180722891566266, + "grad_norm": 0.2000943422317505, + "learning_rate": 6.009296820585871e-06, + "loss": 0.0059, + "step": 1203 + }, + { + "epoch": 2.32, + "grad_norm": 0.10698592662811279, + "learning_rate": 5.977283822511879e-06, + "loss": 0.0028, + "step": 1204 + }, + { + "epoch": 2.3219277108433736, + "grad_norm": 0.1533137410879135, + "learning_rate": 5.945341335204547e-06, + "loss": 0.0044, + "step": 1205 + }, + { + "epoch": 2.323855421686747, + "grad_norm": 0.1235835999250412, + "learning_rate": 5.9134695192810695e-06, + "loss": 0.0043, + "step": 1206 + }, + { + "epoch": 2.3257831325301206, + "grad_norm": 0.1916925013065338, + "learning_rate": 5.8816685350032575e-06, + "loss": 0.0066, + "step": 1207 + }, + { + "epoch": 2.327710843373494, + "grad_norm": 0.08812380582094193, + "learning_rate": 5.849938542276801e-06, + "loss": 0.0022, + "step": 1208 + }, + { + "epoch": 2.3296385542168676, + "grad_norm": 0.13387660682201385, + "learning_rate": 5.818279700650393e-06, + "loss": 0.0037, + "step": 1209 + }, + { + "epoch": 2.331566265060241, + "grad_norm": 0.2309022694826126, + "learning_rate": 5.786692169314954e-06, + "loss": 0.0049, + "step": 1210 + }, + { + "epoch": 2.3334939759036146, + "grad_norm": 0.09956549853086472, + "learning_rate": 5.755176107102833e-06, + "loss": 0.002, + "step": 1211 + }, + { + "epoch": 2.335421686746988, + "grad_norm": 0.06035687029361725, + "learning_rate": 5.723731672487043e-06, + "loss": 0.002, + "step": 1212 + }, + { + "epoch": 2.337349397590361, + "grad_norm": 0.06850237399339676, + "learning_rate": 5.69235902358038e-06, + "loss": 0.0013, + "step": 1213 + }, + { + "epoch": 2.339277108433735, + "grad_norm": 0.12068171054124832, + "learning_rate": 5.661058318134711e-06, + "loss": 0.0041, + "step": 1214 + }, + { + "epoch": 2.3412048192771087, + "grad_norm": 0.13146616518497467, + "learning_rate": 5.6298297135401355e-06, + "loss": 0.0022, + "step": 1215 + }, + { + "epoch": 2.343132530120482, + "grad_norm": 0.15160737931728363, + "learning_rate": 5.598673366824212e-06, + "loss": 0.0036, + "step": 1216 + }, + { + "epoch": 2.3450602409638552, + "grad_norm": 0.26196014881134033, + "learning_rate": 5.567589434651164e-06, + "loss": 0.0151, + "step": 1217 + }, + { + "epoch": 2.346987951807229, + "grad_norm": 0.12898831069469452, + "learning_rate": 5.536578073321073e-06, + "loss": 0.006, + "step": 1218 + }, + { + "epoch": 2.3489156626506023, + "grad_norm": 0.11385104805231094, + "learning_rate": 5.505639438769146e-06, + "loss": 0.0052, + "step": 1219 + }, + { + "epoch": 2.350843373493976, + "grad_norm": 0.14569509029388428, + "learning_rate": 5.47477368656486e-06, + "loss": 0.0048, + "step": 1220 + }, + { + "epoch": 2.3527710843373493, + "grad_norm": 0.12406075745820999, + "learning_rate": 5.443980971911238e-06, + "loss": 0.0028, + "step": 1221 + }, + { + "epoch": 2.354698795180723, + "grad_norm": 0.3730498254299164, + "learning_rate": 5.413261449644039e-06, + "loss": 0.0043, + "step": 1222 + }, + { + "epoch": 2.3566265060240963, + "grad_norm": 0.1449914574623108, + "learning_rate": 5.382615274230987e-06, + "loss": 0.0075, + "step": 1223 + }, + { + "epoch": 2.35855421686747, + "grad_norm": 0.20739100873470306, + "learning_rate": 5.352042599770995e-06, + "loss": 0.0061, + "step": 1224 + }, + { + "epoch": 2.3604819277108433, + "grad_norm": 0.05786775052547455, + "learning_rate": 5.321543579993398e-06, + "loss": 0.0015, + "step": 1225 + }, + { + "epoch": 2.362409638554217, + "grad_norm": 0.09043122828006744, + "learning_rate": 5.2911183682571446e-06, + "loss": 0.0034, + "step": 1226 + }, + { + "epoch": 2.3643373493975903, + "grad_norm": 0.2685496211051941, + "learning_rate": 5.260767117550094e-06, + "loss": 0.0076, + "step": 1227 + }, + { + "epoch": 2.3662650602409636, + "grad_norm": 0.17694126069545746, + "learning_rate": 5.230489980488165e-06, + "loss": 0.0148, + "step": 1228 + }, + { + "epoch": 2.3681927710843373, + "grad_norm": 0.11609307676553726, + "learning_rate": 5.200287109314633e-06, + "loss": 0.0049, + "step": 1229 + }, + { + "epoch": 2.370120481927711, + "grad_norm": 0.1257704645395279, + "learning_rate": 5.1701586558993285e-06, + "loss": 0.0031, + "step": 1230 + }, + { + "epoch": 2.3720481927710844, + "grad_norm": 0.27177703380584717, + "learning_rate": 5.140104771737899e-06, + "loss": 0.0058, + "step": 1231 + }, + { + "epoch": 2.3739759036144576, + "grad_norm": 0.13928169012069702, + "learning_rate": 5.110125607951024e-06, + "loss": 0.0051, + "step": 1232 + }, + { + "epoch": 2.3759036144578314, + "grad_norm": 0.679577648639679, + "learning_rate": 5.0802213152836514e-06, + "loss": 0.0173, + "step": 1233 + }, + { + "epoch": 2.3778313253012047, + "grad_norm": 0.16769403219223022, + "learning_rate": 5.0503920441042845e-06, + "loss": 0.0045, + "step": 1234 + }, + { + "epoch": 2.3797590361445784, + "grad_norm": 0.09427493065595627, + "learning_rate": 5.0206379444041764e-06, + "loss": 0.0024, + "step": 1235 + }, + { + "epoch": 2.3816867469879517, + "grad_norm": 0.33908671140670776, + "learning_rate": 4.990959165796585e-06, + "loss": 0.0088, + "step": 1236 + }, + { + "epoch": 2.3836144578313254, + "grad_norm": 0.18106943368911743, + "learning_rate": 4.961355857516034e-06, + "loss": 0.0094, + "step": 1237 + }, + { + "epoch": 2.3855421686746987, + "grad_norm": 0.5833203196525574, + "learning_rate": 4.931828168417583e-06, + "loss": 0.0086, + "step": 1238 + }, + { + "epoch": 2.3874698795180724, + "grad_norm": 0.09108569473028183, + "learning_rate": 4.902376246976015e-06, + "loss": 0.0014, + "step": 1239 + }, + { + "epoch": 2.3893975903614457, + "grad_norm": 0.10596407204866409, + "learning_rate": 4.873000241285153e-06, + "loss": 0.0043, + "step": 1240 + }, + { + "epoch": 2.3913253012048195, + "grad_norm": 0.10775511711835861, + "learning_rate": 4.8437002990570835e-06, + "loss": 0.0014, + "step": 1241 + }, + { + "epoch": 2.3932530120481927, + "grad_norm": 0.9646345973014832, + "learning_rate": 4.8144765676214245e-06, + "loss": 0.0525, + "step": 1242 + }, + { + "epoch": 2.395180722891566, + "grad_norm": 0.20530278980731964, + "learning_rate": 4.7853291939245814e-06, + "loss": 0.008, + "step": 1243 + }, + { + "epoch": 2.3971084337349398, + "grad_norm": 0.1682119369506836, + "learning_rate": 4.756258324528995e-06, + "loss": 0.0044, + "step": 1244 + }, + { + "epoch": 2.3990361445783135, + "grad_norm": 0.45536917448043823, + "learning_rate": 4.727264105612439e-06, + "loss": 0.0186, + "step": 1245 + }, + { + "epoch": 2.4009638554216868, + "grad_norm": 0.3017471730709076, + "learning_rate": 4.698346682967258e-06, + "loss": 0.0106, + "step": 1246 + }, + { + "epoch": 2.40289156626506, + "grad_norm": 0.1226554661989212, + "learning_rate": 4.669506201999625e-06, + "loss": 0.0035, + "step": 1247 + }, + { + "epoch": 2.404819277108434, + "grad_norm": 0.13750068843364716, + "learning_rate": 4.640742807728837e-06, + "loss": 0.0038, + "step": 1248 + }, + { + "epoch": 2.406746987951807, + "grad_norm": 0.11531024426221848, + "learning_rate": 4.612056644786575e-06, + "loss": 0.0021, + "step": 1249 + }, + { + "epoch": 2.408674698795181, + "grad_norm": 0.1143675372004509, + "learning_rate": 4.583447857416175e-06, + "loss": 0.0028, + "step": 1250 + }, + { + "epoch": 2.410602409638554, + "grad_norm": 0.0914216861128807, + "learning_rate": 4.554916589471898e-06, + "loss": 0.0027, + "step": 1251 + }, + { + "epoch": 2.412530120481928, + "grad_norm": 0.18339012563228607, + "learning_rate": 4.526462984418221e-06, + "loss": 0.0037, + "step": 1252 + }, + { + "epoch": 2.414457831325301, + "grad_norm": 0.11073138564825058, + "learning_rate": 4.498087185329105e-06, + "loss": 0.003, + "step": 1253 + }, + { + "epoch": 2.416385542168675, + "grad_norm": 0.20792435109615326, + "learning_rate": 4.469789334887265e-06, + "loss": 0.009, + "step": 1254 + }, + { + "epoch": 2.418313253012048, + "grad_norm": 0.09485629945993423, + "learning_rate": 4.441569575383471e-06, + "loss": 0.0033, + "step": 1255 + }, + { + "epoch": 2.420240963855422, + "grad_norm": 0.11831793934106827, + "learning_rate": 4.413428048715851e-06, + "loss": 0.0021, + "step": 1256 + }, + { + "epoch": 2.422168674698795, + "grad_norm": 0.11818034201860428, + "learning_rate": 4.38536489638911e-06, + "loss": 0.0041, + "step": 1257 + }, + { + "epoch": 2.4240963855421684, + "grad_norm": 0.2583082616329193, + "learning_rate": 4.3573802595138945e-06, + "loss": 0.0039, + "step": 1258 + }, + { + "epoch": 2.426024096385542, + "grad_norm": 0.3120201826095581, + "learning_rate": 4.329474278806034e-06, + "loss": 0.0087, + "step": 1259 + }, + { + "epoch": 2.427951807228916, + "grad_norm": 0.1258879452943802, + "learning_rate": 4.301647094585855e-06, + "loss": 0.0046, + "step": 1260 + }, + { + "epoch": 2.429879518072289, + "grad_norm": 0.15144586563110352, + "learning_rate": 4.273898846777473e-06, + "loss": 0.0054, + "step": 1261 + }, + { + "epoch": 2.4318072289156625, + "grad_norm": 0.15615184605121613, + "learning_rate": 4.246229674908067e-06, + "loss": 0.0072, + "step": 1262 + }, + { + "epoch": 2.433734939759036, + "grad_norm": 0.09690173715353012, + "learning_rate": 4.218639718107225e-06, + "loss": 0.003, + "step": 1263 + }, + { + "epoch": 2.4356626506024095, + "grad_norm": 0.23884955048561096, + "learning_rate": 4.1911291151062e-06, + "loss": 0.0109, + "step": 1264 + }, + { + "epoch": 2.4375903614457832, + "grad_norm": 0.0905768945813179, + "learning_rate": 4.163698004237222e-06, + "loss": 0.0027, + "step": 1265 + }, + { + "epoch": 2.4395180722891565, + "grad_norm": 0.09168912470340729, + "learning_rate": 4.136346523432821e-06, + "loss": 0.0018, + "step": 1266 + }, + { + "epoch": 2.4414457831325302, + "grad_norm": 0.17878012359142303, + "learning_rate": 4.109074810225118e-06, + "loss": 0.0048, + "step": 1267 + }, + { + "epoch": 2.4433734939759035, + "grad_norm": 0.09913790971040726, + "learning_rate": 4.08188300174513e-06, + "loss": 0.0021, + "step": 1268 + }, + { + "epoch": 2.4453012048192773, + "grad_norm": 0.16615812480449677, + "learning_rate": 4.054771234722106e-06, + "loss": 0.0066, + "step": 1269 + }, + { + "epoch": 2.4472289156626506, + "grad_norm": 0.09618276357650757, + "learning_rate": 4.027739645482784e-06, + "loss": 0.0043, + "step": 1270 + }, + { + "epoch": 2.4491566265060243, + "grad_norm": 0.33473479747772217, + "learning_rate": 4.0007883699507855e-06, + "loss": 0.0236, + "step": 1271 + }, + { + "epoch": 2.4510843373493976, + "grad_norm": 0.15051880478858948, + "learning_rate": 3.973917543645867e-06, + "loss": 0.0068, + "step": 1272 + }, + { + "epoch": 2.453012048192771, + "grad_norm": 0.24134816229343414, + "learning_rate": 3.947127301683249e-06, + "loss": 0.0194, + "step": 1273 + }, + { + "epoch": 2.4549397590361446, + "grad_norm": 0.10495353490114212, + "learning_rate": 3.920417778772967e-06, + "loss": 0.0042, + "step": 1274 + }, + { + "epoch": 2.4568674698795183, + "grad_norm": 0.2294938713312149, + "learning_rate": 3.893789109219171e-06, + "loss": 0.0224, + "step": 1275 + }, + { + "epoch": 2.4587951807228916, + "grad_norm": 0.13710513710975647, + "learning_rate": 3.867241426919446e-06, + "loss": 0.0046, + "step": 1276 + }, + { + "epoch": 2.460722891566265, + "grad_norm": 0.06754808127880096, + "learning_rate": 3.840774865364157e-06, + "loss": 0.0019, + "step": 1277 + }, + { + "epoch": 2.4626506024096386, + "grad_norm": 0.24797780811786652, + "learning_rate": 3.8143895576357605e-06, + "loss": 0.0063, + "step": 1278 + }, + { + "epoch": 2.464578313253012, + "grad_norm": 0.1476449817419052, + "learning_rate": 3.788085636408143e-06, + "loss": 0.0055, + "step": 1279 + }, + { + "epoch": 2.4665060240963856, + "grad_norm": 0.22397096455097198, + "learning_rate": 3.7618632339459616e-06, + "loss": 0.0164, + "step": 1280 + }, + { + "epoch": 2.468433734939759, + "grad_norm": 0.21596969664096832, + "learning_rate": 3.7357224821039497e-06, + "loss": 0.0112, + "step": 1281 + }, + { + "epoch": 2.4703614457831327, + "grad_norm": 0.2775099575519562, + "learning_rate": 3.7096635123263068e-06, + "loss": 0.0112, + "step": 1282 + }, + { + "epoch": 2.472289156626506, + "grad_norm": 0.07963326573371887, + "learning_rate": 3.683686455645974e-06, + "loss": 0.0013, + "step": 1283 + }, + { + "epoch": 2.4742168674698797, + "grad_norm": 0.1253802627325058, + "learning_rate": 3.6577914426840266e-06, + "loss": 0.0038, + "step": 1284 + }, + { + "epoch": 2.476144578313253, + "grad_norm": 0.10258597880601883, + "learning_rate": 3.631978603648989e-06, + "loss": 0.0023, + "step": 1285 + }, + { + "epoch": 2.4780722891566267, + "grad_norm": 0.17102380096912384, + "learning_rate": 3.6062480683361935e-06, + "loss": 0.0025, + "step": 1286 + }, + { + "epoch": 2.48, + "grad_norm": 0.09547360241413116, + "learning_rate": 3.580599966127123e-06, + "loss": 0.003, + "step": 1287 + }, + { + "epoch": 2.4819277108433733, + "grad_norm": 0.08008653670549393, + "learning_rate": 3.5550344259887438e-06, + "loss": 0.0023, + "step": 1288 + }, + { + "epoch": 2.483855421686747, + "grad_norm": 0.07712296396493912, + "learning_rate": 3.5295515764729003e-06, + "loss": 0.0015, + "step": 1289 + }, + { + "epoch": 2.4857831325301207, + "grad_norm": 0.21118703484535217, + "learning_rate": 3.5041515457156303e-06, + "loss": 0.0041, + "step": 1290 + }, + { + "epoch": 2.487710843373494, + "grad_norm": 0.10772393643856049, + "learning_rate": 3.4788344614365155e-06, + "loss": 0.0029, + "step": 1291 + }, + { + "epoch": 2.4896385542168673, + "grad_norm": 0.2353268563747406, + "learning_rate": 3.453600450938073e-06, + "loss": 0.0072, + "step": 1292 + }, + { + "epoch": 2.491566265060241, + "grad_norm": 0.2897944152355194, + "learning_rate": 3.428449641105107e-06, + "loss": 0.0205, + "step": 1293 + }, + { + "epoch": 2.4934939759036143, + "grad_norm": 0.19756680727005005, + "learning_rate": 3.4033821584040383e-06, + "loss": 0.0065, + "step": 1294 + }, + { + "epoch": 2.495421686746988, + "grad_norm": 0.13538534939289093, + "learning_rate": 3.378398128882305e-06, + "loss": 0.0025, + "step": 1295 + }, + { + "epoch": 2.4973493975903613, + "grad_norm": 0.2301637977361679, + "learning_rate": 3.3534976781677142e-06, + "loss": 0.0071, + "step": 1296 + }, + { + "epoch": 2.499277108433735, + "grad_norm": 0.0965796634554863, + "learning_rate": 3.3286809314678137e-06, + "loss": 0.0024, + "step": 1297 + }, + { + "epoch": 2.5012048192771084, + "grad_norm": 0.0777980163693428, + "learning_rate": 3.30394801356926e-06, + "loss": 0.0013, + "step": 1298 + }, + { + "epoch": 2.503132530120482, + "grad_norm": 0.3157603442668915, + "learning_rate": 3.279299048837177e-06, + "loss": 0.0228, + "step": 1299 + }, + { + "epoch": 2.5050602409638554, + "grad_norm": 0.15660233795642853, + "learning_rate": 3.2547341612145654e-06, + "loss": 0.0056, + "step": 1300 + }, + { + "epoch": 2.506987951807229, + "grad_norm": 0.21655581891536713, + "learning_rate": 3.2302534742216586e-06, + "loss": 0.0081, + "step": 1301 + }, + { + "epoch": 2.5089156626506024, + "grad_norm": 0.09475889801979065, + "learning_rate": 3.205857110955277e-06, + "loss": 0.0029, + "step": 1302 + }, + { + "epoch": 2.5108433734939757, + "grad_norm": 0.13174696266651154, + "learning_rate": 3.18154519408826e-06, + "loss": 0.0059, + "step": 1303 + }, + { + "epoch": 2.5127710843373494, + "grad_norm": 0.10386355221271515, + "learning_rate": 3.1573178458688102e-06, + "loss": 0.0042, + "step": 1304 + }, + { + "epoch": 2.514698795180723, + "grad_norm": 0.12700854241847992, + "learning_rate": 3.133175188119899e-06, + "loss": 0.0041, + "step": 1305 + }, + { + "epoch": 2.5166265060240964, + "grad_norm": 0.1617022454738617, + "learning_rate": 3.109117342238639e-06, + "loss": 0.0053, + "step": 1306 + }, + { + "epoch": 2.5185542168674697, + "grad_norm": 0.8668884038925171, + "learning_rate": 3.085144429195688e-06, + "loss": 0.0084, + "step": 1307 + }, + { + "epoch": 2.5204819277108435, + "grad_norm": 0.22429344058036804, + "learning_rate": 3.061256569534634e-06, + "loss": 0.0053, + "step": 1308 + }, + { + "epoch": 2.5224096385542167, + "grad_norm": 0.08967582136392593, + "learning_rate": 3.037453883371375e-06, + "loss": 0.0018, + "step": 1309 + }, + { + "epoch": 2.5243373493975905, + "grad_norm": 0.1251695454120636, + "learning_rate": 3.0137364903935464e-06, + "loss": 0.0037, + "step": 1310 + }, + { + "epoch": 2.5262650602409638, + "grad_norm": 0.09026174992322922, + "learning_rate": 2.990104509859897e-06, + "loss": 0.0024, + "step": 1311 + }, + { + "epoch": 2.5281927710843375, + "grad_norm": 0.34319114685058594, + "learning_rate": 2.966558060599689e-06, + "loss": 0.0063, + "step": 1312 + }, + { + "epoch": 2.5301204819277108, + "grad_norm": 0.20300136506557465, + "learning_rate": 2.9430972610121087e-06, + "loss": 0.0054, + "step": 1313 + }, + { + "epoch": 2.532048192771084, + "grad_norm": 0.19160760939121246, + "learning_rate": 2.9197222290656737e-06, + "loss": 0.0095, + "step": 1314 + }, + { + "epoch": 2.533975903614458, + "grad_norm": 0.18991442024707794, + "learning_rate": 2.8964330822976227e-06, + "loss": 0.006, + "step": 1315 + }, + { + "epoch": 2.5359036144578315, + "grad_norm": 0.1801903396844864, + "learning_rate": 2.873229937813349e-06, + "loss": 0.0067, + "step": 1316 + }, + { + "epoch": 2.537831325301205, + "grad_norm": 0.07068303227424622, + "learning_rate": 2.850112912285783e-06, + "loss": 0.0015, + "step": 1317 + }, + { + "epoch": 2.539759036144578, + "grad_norm": 0.1404612809419632, + "learning_rate": 2.8270821219548296e-06, + "loss": 0.0036, + "step": 1318 + }, + { + "epoch": 2.541686746987952, + "grad_norm": 0.12199504673480988, + "learning_rate": 2.8041376826267862e-06, + "loss": 0.0068, + "step": 1319 + }, + { + "epoch": 2.5436144578313256, + "grad_norm": 0.2167249619960785, + "learning_rate": 2.7812797096737253e-06, + "loss": 0.0048, + "step": 1320 + }, + { + "epoch": 2.545542168674699, + "grad_norm": 0.07466506212949753, + "learning_rate": 2.7585083180329575e-06, + "loss": 0.0017, + "step": 1321 + }, + { + "epoch": 2.547469879518072, + "grad_norm": 0.11736353486776352, + "learning_rate": 2.7358236222064283e-06, + "loss": 0.003, + "step": 1322 + }, + { + "epoch": 2.549397590361446, + "grad_norm": 0.16602204740047455, + "learning_rate": 2.7132257362601453e-06, + "loss": 0.005, + "step": 1323 + }, + { + "epoch": 2.551325301204819, + "grad_norm": 0.15473629534244537, + "learning_rate": 2.6907147738236193e-06, + "loss": 0.0077, + "step": 1324 + }, + { + "epoch": 2.553253012048193, + "grad_norm": 0.07868973910808563, + "learning_rate": 2.6682908480892567e-06, + "loss": 0.0013, + "step": 1325 + }, + { + "epoch": 2.555180722891566, + "grad_norm": 0.2137845754623413, + "learning_rate": 2.645954071811847e-06, + "loss": 0.0092, + "step": 1326 + }, + { + "epoch": 2.55710843373494, + "grad_norm": 0.11191053688526154, + "learning_rate": 2.623704557307949e-06, + "loss": 0.0031, + "step": 1327 + }, + { + "epoch": 2.559036144578313, + "grad_norm": 0.3080642521381378, + "learning_rate": 2.6015424164553295e-06, + "loss": 0.0104, + "step": 1328 + }, + { + "epoch": 2.5609638554216865, + "grad_norm": 0.08816439658403397, + "learning_rate": 2.579467760692427e-06, + "loss": 0.004, + "step": 1329 + }, + { + "epoch": 2.56289156626506, + "grad_norm": 0.17154981195926666, + "learning_rate": 2.557480701017776e-06, + "loss": 0.0035, + "step": 1330 + }, + { + "epoch": 2.564819277108434, + "grad_norm": 0.09479143470525742, + "learning_rate": 2.5355813479894464e-06, + "loss": 0.0034, + "step": 1331 + }, + { + "epoch": 2.5667469879518072, + "grad_norm": 0.26139333844184875, + "learning_rate": 2.513769811724487e-06, + "loss": 0.0076, + "step": 1332 + }, + { + "epoch": 2.5686746987951805, + "grad_norm": 0.16864238679409027, + "learning_rate": 2.4920462018983816e-06, + "loss": 0.0046, + "step": 1333 + }, + { + "epoch": 2.5706024096385542, + "grad_norm": 0.1133158802986145, + "learning_rate": 2.4704106277444884e-06, + "loss": 0.0034, + "step": 1334 + }, + { + "epoch": 2.572530120481928, + "grad_norm": 0.27522334456443787, + "learning_rate": 2.4488631980534995e-06, + "loss": 0.0127, + "step": 1335 + }, + { + "epoch": 2.5744578313253013, + "grad_norm": 0.13547387719154358, + "learning_rate": 2.427404021172868e-06, + "loss": 0.0031, + "step": 1336 + }, + { + "epoch": 2.5763855421686745, + "grad_norm": 0.13478629291057587, + "learning_rate": 2.406033205006313e-06, + "loss": 0.0039, + "step": 1337 + }, + { + "epoch": 2.5783132530120483, + "grad_norm": 0.11515481770038605, + "learning_rate": 2.3847508570132226e-06, + "loss": 0.0029, + "step": 1338 + }, + { + "epoch": 2.5802409638554216, + "grad_norm": 0.21657171845436096, + "learning_rate": 2.36355708420815e-06, + "loss": 0.011, + "step": 1339 + }, + { + "epoch": 2.5821686746987953, + "grad_norm": 0.11441601067781448, + "learning_rate": 2.342451993160262e-06, + "loss": 0.006, + "step": 1340 + }, + { + "epoch": 2.5840963855421686, + "grad_norm": 0.13475841283798218, + "learning_rate": 2.3214356899928036e-06, + "loss": 0.0051, + "step": 1341 + }, + { + "epoch": 2.5860240963855423, + "grad_norm": 0.053035832941532135, + "learning_rate": 2.300508280382572e-06, + "loss": 0.0012, + "step": 1342 + }, + { + "epoch": 2.5879518072289156, + "grad_norm": 0.12467508763074875, + "learning_rate": 2.279669869559358e-06, + "loss": 0.0024, + "step": 1343 + }, + { + "epoch": 2.589879518072289, + "grad_norm": 0.10572273284196854, + "learning_rate": 2.2589205623054646e-06, + "loss": 0.0024, + "step": 1344 + }, + { + "epoch": 2.5918072289156626, + "grad_norm": 0.17056365311145782, + "learning_rate": 2.238260462955142e-06, + "loss": 0.0064, + "step": 1345 + }, + { + "epoch": 2.5937349397590364, + "grad_norm": 0.07940494269132614, + "learning_rate": 2.2176896753940637e-06, + "loss": 0.0012, + "step": 1346 + }, + { + "epoch": 2.5956626506024096, + "grad_norm": 0.10416694730520248, + "learning_rate": 2.1972083030588244e-06, + "loss": 0.0092, + "step": 1347 + }, + { + "epoch": 2.597590361445783, + "grad_norm": 0.2384328842163086, + "learning_rate": 2.176816448936423e-06, + "loss": 0.0067, + "step": 1348 + }, + { + "epoch": 2.5995180722891567, + "grad_norm": 0.14279082417488098, + "learning_rate": 2.156514215563703e-06, + "loss": 0.0059, + "step": 1349 + }, + { + "epoch": 2.6014457831325304, + "grad_norm": 0.08462683111429214, + "learning_rate": 2.1363017050268886e-06, + "loss": 0.0021, + "step": 1350 + }, + { + "epoch": 2.6033734939759037, + "grad_norm": 0.09768491238355637, + "learning_rate": 2.1161790189610377e-06, + "loss": 0.0038, + "step": 1351 + }, + { + "epoch": 2.605301204819277, + "grad_norm": 0.25498896837234497, + "learning_rate": 2.0961462585495474e-06, + "loss": 0.0114, + "step": 1352 + }, + { + "epoch": 2.6072289156626507, + "grad_norm": 0.15635675191879272, + "learning_rate": 2.076203524523637e-06, + "loss": 0.0054, + "step": 1353 + }, + { + "epoch": 2.609156626506024, + "grad_norm": 0.11619213968515396, + "learning_rate": 2.056350917161836e-06, + "loss": 0.007, + "step": 1354 + }, + { + "epoch": 2.6110843373493977, + "grad_norm": 0.18085338175296783, + "learning_rate": 2.0365885362895053e-06, + "loss": 0.0061, + "step": 1355 + }, + { + "epoch": 2.613012048192771, + "grad_norm": 0.14492927491664886, + "learning_rate": 2.016916481278306e-06, + "loss": 0.0114, + "step": 1356 + }, + { + "epoch": 2.6149397590361447, + "grad_norm": 0.21257621049880981, + "learning_rate": 1.997334851045709e-06, + "loss": 0.0057, + "step": 1357 + }, + { + "epoch": 2.616867469879518, + "grad_norm": 0.11539656668901443, + "learning_rate": 1.9778437440545085e-06, + "loss": 0.0071, + "step": 1358 + }, + { + "epoch": 2.6187951807228913, + "grad_norm": 0.1642933189868927, + "learning_rate": 1.95844325831231e-06, + "loss": 0.0054, + "step": 1359 + }, + { + "epoch": 2.620722891566265, + "grad_norm": 0.10779479146003723, + "learning_rate": 1.9391334913710545e-06, + "loss": 0.0028, + "step": 1360 + }, + { + "epoch": 2.6226506024096388, + "grad_norm": 0.14295366406440735, + "learning_rate": 1.9199145403265175e-06, + "loss": 0.0048, + "step": 1361 + }, + { + "epoch": 2.624578313253012, + "grad_norm": 0.13454844057559967, + "learning_rate": 1.9007865018178107e-06, + "loss": 0.0072, + "step": 1362 + }, + { + "epoch": 2.6265060240963853, + "grad_norm": 0.778252363204956, + "learning_rate": 1.8817494720269302e-06, + "loss": 0.0071, + "step": 1363 + }, + { + "epoch": 2.628433734939759, + "grad_norm": 0.11488679051399231, + "learning_rate": 1.8628035466782268e-06, + "loss": 0.0038, + "step": 1364 + }, + { + "epoch": 2.630361445783133, + "grad_norm": 0.15560875833034515, + "learning_rate": 1.8439488210379687e-06, + "loss": 0.0043, + "step": 1365 + }, + { + "epoch": 2.632289156626506, + "grad_norm": 0.10538071393966675, + "learning_rate": 1.8251853899138306e-06, + "loss": 0.0041, + "step": 1366 + }, + { + "epoch": 2.6342168674698794, + "grad_norm": 0.12866193056106567, + "learning_rate": 1.8065133476544306e-06, + "loss": 0.0034, + "step": 1367 + }, + { + "epoch": 2.636144578313253, + "grad_norm": 0.2045469433069229, + "learning_rate": 1.7879327881488584e-06, + "loss": 0.0141, + "step": 1368 + }, + { + "epoch": 2.6380722891566264, + "grad_norm": 0.12423976510763168, + "learning_rate": 1.769443804826194e-06, + "loss": 0.0047, + "step": 1369 + }, + { + "epoch": 2.64, + "grad_norm": 0.1007109209895134, + "learning_rate": 1.751046490655046e-06, + "loss": 0.0031, + "step": 1370 + }, + { + "epoch": 2.6419277108433734, + "grad_norm": 0.0681275874376297, + "learning_rate": 1.7327409381430804e-06, + "loss": 0.0019, + "step": 1371 + }, + { + "epoch": 2.643855421686747, + "grad_norm": 0.1645517498254776, + "learning_rate": 1.7145272393365498e-06, + "loss": 0.0035, + "step": 1372 + }, + { + "epoch": 2.6457831325301204, + "grad_norm": 0.13689427077770233, + "learning_rate": 1.6964054858198386e-06, + "loss": 0.0086, + "step": 1373 + }, + { + "epoch": 2.6477108433734937, + "grad_norm": 0.10440093278884888, + "learning_rate": 1.6783757687150149e-06, + "loss": 0.0019, + "step": 1374 + }, + { + "epoch": 2.6496385542168674, + "grad_norm": 0.1142532229423523, + "learning_rate": 1.6604381786813383e-06, + "loss": 0.0047, + "step": 1375 + }, + { + "epoch": 2.651566265060241, + "grad_norm": 0.10430166125297546, + "learning_rate": 1.6425928059148312e-06, + "loss": 0.0027, + "step": 1376 + }, + { + "epoch": 2.6534939759036145, + "grad_norm": 0.2315254956483841, + "learning_rate": 1.624839740147819e-06, + "loss": 0.0071, + "step": 1377 + }, + { + "epoch": 2.6554216867469878, + "grad_norm": 0.15356265008449554, + "learning_rate": 1.6071790706484746e-06, + "loss": 0.0109, + "step": 1378 + }, + { + "epoch": 2.6573493975903615, + "grad_norm": 0.1332363784313202, + "learning_rate": 1.589610886220383e-06, + "loss": 0.0046, + "step": 1379 + }, + { + "epoch": 2.659277108433735, + "grad_norm": 0.18892519176006317, + "learning_rate": 1.5721352752020602e-06, + "loss": 0.0138, + "step": 1380 + }, + { + "epoch": 2.6612048192771085, + "grad_norm": 0.10537895560264587, + "learning_rate": 1.5547523254665598e-06, + "loss": 0.0066, + "step": 1381 + }, + { + "epoch": 2.663132530120482, + "grad_norm": 0.1308947205543518, + "learning_rate": 1.5374621244209965e-06, + "loss": 0.0039, + "step": 1382 + }, + { + "epoch": 2.6650602409638555, + "grad_norm": 0.11358808726072311, + "learning_rate": 1.5202647590060983e-06, + "loss": 0.0029, + "step": 1383 + }, + { + "epoch": 2.666987951807229, + "grad_norm": 0.12029009312391281, + "learning_rate": 1.5031603156958064e-06, + "loss": 0.0032, + "step": 1384 + }, + { + "epoch": 2.6689156626506025, + "grad_norm": 0.36994072794914246, + "learning_rate": 1.4861488804968093e-06, + "loss": 0.024, + "step": 1385 + }, + { + "epoch": 2.670843373493976, + "grad_norm": 0.1263083666563034, + "learning_rate": 1.4692305389481232e-06, + "loss": 0.0047, + "step": 1386 + }, + { + "epoch": 2.6727710843373496, + "grad_norm": 0.15056709945201874, + "learning_rate": 1.452405376120658e-06, + "loss": 0.0014, + "step": 1387 + }, + { + "epoch": 2.674698795180723, + "grad_norm": 0.10418888181447983, + "learning_rate": 1.4356734766167925e-06, + "loss": 0.0035, + "step": 1388 + }, + { + "epoch": 2.676626506024096, + "grad_norm": 0.12220565974712372, + "learning_rate": 1.4190349245699443e-06, + "loss": 0.0063, + "step": 1389 + }, + { + "epoch": 2.67855421686747, + "grad_norm": 0.14774753153324127, + "learning_rate": 1.402489803644156e-06, + "loss": 0.008, + "step": 1390 + }, + { + "epoch": 2.6804819277108436, + "grad_norm": 0.14384198188781738, + "learning_rate": 1.3860381970336544e-06, + "loss": 0.0039, + "step": 1391 + }, + { + "epoch": 2.682409638554217, + "grad_norm": 0.10995055735111237, + "learning_rate": 1.3696801874624698e-06, + "loss": 0.0028, + "step": 1392 + }, + { + "epoch": 2.68433734939759, + "grad_norm": 0.12208505719900131, + "learning_rate": 1.353415857183966e-06, + "loss": 0.0029, + "step": 1393 + }, + { + "epoch": 2.686265060240964, + "grad_norm": 0.16018439829349518, + "learning_rate": 1.337245287980482e-06, + "loss": 0.0068, + "step": 1394 + }, + { + "epoch": 2.688192771084337, + "grad_norm": 5.2112274169921875, + "learning_rate": 1.3211685611628844e-06, + "loss": 0.1645, + "step": 1395 + }, + { + "epoch": 2.690120481927711, + "grad_norm": 0.12426120787858963, + "learning_rate": 1.3051857575701732e-06, + "loss": 0.0044, + "step": 1396 + }, + { + "epoch": 2.692048192771084, + "grad_norm": 0.13931375741958618, + "learning_rate": 1.2892969575690685e-06, + "loss": 0.0035, + "step": 1397 + }, + { + "epoch": 2.693975903614458, + "grad_norm": 0.1804540753364563, + "learning_rate": 1.273502241053608e-06, + "loss": 0.0108, + "step": 1398 + }, + { + "epoch": 2.695903614457831, + "grad_norm": 0.12313607335090637, + "learning_rate": 1.2578016874447596e-06, + "loss": 0.0073, + "step": 1399 + }, + { + "epoch": 2.697831325301205, + "grad_norm": 0.1301470398902893, + "learning_rate": 1.2421953756899985e-06, + "loss": 0.0037, + "step": 1400 + }, + { + "epoch": 2.6997590361445782, + "grad_norm": 0.12769126892089844, + "learning_rate": 1.226683384262919e-06, + "loss": 0.0041, + "step": 1401 + }, + { + "epoch": 2.701686746987952, + "grad_norm": 0.20923997461795807, + "learning_rate": 1.21126579116285e-06, + "loss": 0.0101, + "step": 1402 + }, + { + "epoch": 2.7036144578313253, + "grad_norm": 0.09334482997655869, + "learning_rate": 1.1959426739144497e-06, + "loss": 0.0022, + "step": 1403 + }, + { + "epoch": 2.7055421686746985, + "grad_norm": 0.06848987936973572, + "learning_rate": 1.1807141095673291e-06, + "loss": 0.0013, + "step": 1404 + }, + { + "epoch": 2.7074698795180723, + "grad_norm": 0.14552196860313416, + "learning_rate": 1.1655801746956463e-06, + "loss": 0.0066, + "step": 1405 + }, + { + "epoch": 2.709397590361446, + "grad_norm": 0.11259587109088898, + "learning_rate": 1.1505409453977334e-06, + "loss": 0.0045, + "step": 1406 + }, + { + "epoch": 2.7113253012048193, + "grad_norm": 0.23408068716526031, + "learning_rate": 1.135596497295719e-06, + "loss": 0.0181, + "step": 1407 + }, + { + "epoch": 2.7132530120481926, + "grad_norm": 0.1483619660139084, + "learning_rate": 1.1207469055351395e-06, + "loss": 0.0042, + "step": 1408 + }, + { + "epoch": 2.7151807228915663, + "grad_norm": 0.1170588880777359, + "learning_rate": 1.105992244784555e-06, + "loss": 0.0059, + "step": 1409 + }, + { + "epoch": 2.7171084337349396, + "grad_norm": 0.15649215877056122, + "learning_rate": 1.0913325892351857e-06, + "loss": 0.0023, + "step": 1410 + }, + { + "epoch": 2.7190361445783133, + "grad_norm": 0.0980108231306076, + "learning_rate": 1.0767680126005443e-06, + "loss": 0.0019, + "step": 1411 + }, + { + "epoch": 2.7209638554216866, + "grad_norm": 0.14913050830364227, + "learning_rate": 1.0622985881160396e-06, + "loss": 0.0018, + "step": 1412 + }, + { + "epoch": 2.7228915662650603, + "grad_norm": 0.0827481672167778, + "learning_rate": 1.0479243885386347e-06, + "loss": 0.0023, + "step": 1413 + }, + { + "epoch": 2.7248192771084336, + "grad_norm": 0.15648555755615234, + "learning_rate": 1.0336454861464706e-06, + "loss": 0.0033, + "step": 1414 + }, + { + "epoch": 2.7267469879518074, + "grad_norm": 0.10614357888698578, + "learning_rate": 1.0194619527385007e-06, + "loss": 0.0029, + "step": 1415 + }, + { + "epoch": 2.7286746987951807, + "grad_norm": 0.07111652940511703, + "learning_rate": 1.0053738596341355e-06, + "loss": 0.0026, + "step": 1416 + }, + { + "epoch": 2.7306024096385544, + "grad_norm": 0.11736573278903961, + "learning_rate": 9.91381277672867e-07, + "loss": 0.005, + "step": 1417 + }, + { + "epoch": 2.7325301204819277, + "grad_norm": 0.18440629541873932, + "learning_rate": 9.774842772139537e-07, + "loss": 0.0038, + "step": 1418 + }, + { + "epoch": 2.734457831325301, + "grad_norm": 0.11000041663646698, + "learning_rate": 9.636829281360116e-07, + "loss": 0.0034, + "step": 1419 + }, + { + "epoch": 2.7363855421686747, + "grad_norm": 0.15212605893611908, + "learning_rate": 9.499772998367018e-07, + "loss": 0.0038, + "step": 1420 + }, + { + "epoch": 2.7383132530120484, + "grad_norm": 0.07784705609083176, + "learning_rate": 9.36367461232377e-07, + "loss": 0.002, + "step": 1421 + }, + { + "epoch": 2.7402409638554217, + "grad_norm": 0.1096726506948471, + "learning_rate": 9.22853480757715e-07, + "loss": 0.0028, + "step": 1422 + }, + { + "epoch": 2.742168674698795, + "grad_norm": 0.17528535425662994, + "learning_rate": 9.094354263653971e-07, + "loss": 0.0065, + "step": 1423 + }, + { + "epoch": 2.7440963855421687, + "grad_norm": 0.09263470768928528, + "learning_rate": 8.961133655257548e-07, + "loss": 0.0031, + "step": 1424 + }, + { + "epoch": 2.746024096385542, + "grad_norm": 0.14822180569171906, + "learning_rate": 8.828873652264303e-07, + "loss": 0.0043, + "step": 1425 + }, + { + "epoch": 2.7479518072289157, + "grad_norm": 0.11577019095420837, + "learning_rate": 8.697574919720497e-07, + "loss": 0.004, + "step": 1426 + }, + { + "epoch": 2.749879518072289, + "grad_norm": 0.11681873351335526, + "learning_rate": 8.567238117838683e-07, + "loss": 0.0035, + "step": 1427 + }, + { + "epoch": 2.7518072289156628, + "grad_norm": 0.1191524937748909, + "learning_rate": 8.437863901994592e-07, + "loss": 0.0022, + "step": 1428 + }, + { + "epoch": 2.753734939759036, + "grad_norm": 0.1528361737728119, + "learning_rate": 8.309452922723849e-07, + "loss": 0.0042, + "step": 1429 + }, + { + "epoch": 2.75566265060241, + "grad_norm": 0.42052382230758667, + "learning_rate": 8.18200582571842e-07, + "loss": 0.0149, + "step": 1430 + }, + { + "epoch": 2.757590361445783, + "grad_norm": 0.13524137437343597, + "learning_rate": 8.055523251823705e-07, + "loss": 0.0029, + "step": 1431 + }, + { + "epoch": 2.759518072289157, + "grad_norm": 0.0980493426322937, + "learning_rate": 7.930005837035138e-07, + "loss": 0.0036, + "step": 1432 + }, + { + "epoch": 2.76144578313253, + "grad_norm": 0.17335453629493713, + "learning_rate": 7.805454212494967e-07, + "loss": 0.0066, + "step": 1433 + }, + { + "epoch": 2.7633734939759034, + "grad_norm": 0.13746409118175507, + "learning_rate": 7.681869004489218e-07, + "loss": 0.0066, + "step": 1434 + }, + { + "epoch": 2.765301204819277, + "grad_norm": 0.18556399643421173, + "learning_rate": 7.559250834444332e-07, + "loss": 0.0073, + "step": 1435 + }, + { + "epoch": 2.767228915662651, + "grad_norm": 0.09743557125329971, + "learning_rate": 7.437600318924332e-07, + "loss": 0.0023, + "step": 1436 + }, + { + "epoch": 2.769156626506024, + "grad_norm": 0.10671001672744751, + "learning_rate": 7.316918069627488e-07, + "loss": 0.003, + "step": 1437 + }, + { + "epoch": 2.7710843373493974, + "grad_norm": 0.10671380162239075, + "learning_rate": 7.197204693383231e-07, + "loss": 0.0021, + "step": 1438 + }, + { + "epoch": 2.773012048192771, + "grad_norm": 0.06824454665184021, + "learning_rate": 7.078460792149311e-07, + "loss": 0.0017, + "step": 1439 + }, + { + "epoch": 2.7749397590361444, + "grad_norm": 0.12668560445308685, + "learning_rate": 6.960686963008556e-07, + "loss": 0.0035, + "step": 1440 + }, + { + "epoch": 2.776867469879518, + "grad_norm": 0.10260980576276779, + "learning_rate": 6.843883798166029e-07, + "loss": 0.0027, + "step": 1441 + }, + { + "epoch": 2.7787951807228914, + "grad_norm": 0.09880302101373672, + "learning_rate": 6.728051884945941e-07, + "loss": 0.0029, + "step": 1442 + }, + { + "epoch": 2.780722891566265, + "grad_norm": 0.305993914604187, + "learning_rate": 6.613191805788699e-07, + "loss": 0.0112, + "step": 1443 + }, + { + "epoch": 2.7826506024096385, + "grad_norm": 0.10707511752843857, + "learning_rate": 6.499304138248064e-07, + "loss": 0.0062, + "step": 1444 + }, + { + "epoch": 2.784578313253012, + "grad_norm": 0.0986943170428276, + "learning_rate": 6.386389454988195e-07, + "loss": 0.0021, + "step": 1445 + }, + { + "epoch": 2.7865060240963855, + "grad_norm": 0.1458776742219925, + "learning_rate": 6.274448323780724e-07, + "loss": 0.0094, + "step": 1446 + }, + { + "epoch": 2.788433734939759, + "grad_norm": 0.09657061100006104, + "learning_rate": 6.163481307501995e-07, + "loss": 0.0026, + "step": 1447 + }, + { + "epoch": 2.7903614457831325, + "grad_norm": 0.1462988704442978, + "learning_rate": 6.053488964130183e-07, + "loss": 0.0075, + "step": 1448 + }, + { + "epoch": 2.792289156626506, + "grad_norm": 0.15330864489078522, + "learning_rate": 5.94447184674245e-07, + "loss": 0.0067, + "step": 1449 + }, + { + "epoch": 2.7942168674698795, + "grad_norm": 0.1513473242521286, + "learning_rate": 5.836430503512236e-07, + "loss": 0.0106, + "step": 1450 + }, + { + "epoch": 2.7961445783132532, + "grad_norm": 0.2151842713356018, + "learning_rate": 5.729365477706505e-07, + "loss": 0.0062, + "step": 1451 + }, + { + "epoch": 2.7980722891566265, + "grad_norm": 0.13624203205108643, + "learning_rate": 5.623277307682929e-07, + "loss": 0.0045, + "step": 1452 + }, + { + "epoch": 2.8, + "grad_norm": 0.12075261026620865, + "learning_rate": 5.518166526887214e-07, + "loss": 0.0073, + "step": 1453 + }, + { + "epoch": 2.8019277108433736, + "grad_norm": 0.11320624500513077, + "learning_rate": 5.41403366385047e-07, + "loss": 0.002, + "step": 1454 + }, + { + "epoch": 2.803855421686747, + "grad_norm": 0.08470363914966583, + "learning_rate": 5.310879242186606e-07, + "loss": 0.0021, + "step": 1455 + }, + { + "epoch": 2.8057831325301206, + "grad_norm": 0.15221907198429108, + "learning_rate": 5.208703780589419e-07, + "loss": 0.0019, + "step": 1456 + }, + { + "epoch": 2.807710843373494, + "grad_norm": 0.12709103524684906, + "learning_rate": 5.107507792830335e-07, + "loss": 0.0052, + "step": 1457 + }, + { + "epoch": 2.8096385542168676, + "grad_norm": 0.10888515412807465, + "learning_rate": 5.007291787755586e-07, + "loss": 0.0023, + "step": 1458 + }, + { + "epoch": 2.811566265060241, + "grad_norm": 0.25710970163345337, + "learning_rate": 4.908056269283789e-07, + "loss": 0.0073, + "step": 1459 + }, + { + "epoch": 2.8134939759036146, + "grad_norm": 0.08488702774047852, + "learning_rate": 4.809801736403308e-07, + "loss": 0.0016, + "step": 1460 + }, + { + "epoch": 2.815421686746988, + "grad_norm": 0.1282006949186325, + "learning_rate": 4.7125286831698034e-07, + "loss": 0.0035, + "step": 1461 + }, + { + "epoch": 2.8173493975903616, + "grad_norm": 0.08955442905426025, + "learning_rate": 4.6162375987037766e-07, + "loss": 0.004, + "step": 1462 + }, + { + "epoch": 2.819277108433735, + "grad_norm": 0.11310838907957077, + "learning_rate": 4.520928967188054e-07, + "loss": 0.0022, + "step": 1463 + }, + { + "epoch": 2.821204819277108, + "grad_norm": 0.15055686235427856, + "learning_rate": 4.426603267865326e-07, + "loss": 0.0042, + "step": 1464 + }, + { + "epoch": 2.823132530120482, + "grad_norm": 0.14379452168941498, + "learning_rate": 4.333260975035769e-07, + "loss": 0.0089, + "step": 1465 + }, + { + "epoch": 2.8250602409638557, + "grad_norm": 0.1795361489057541, + "learning_rate": 4.240902558054827e-07, + "loss": 0.013, + "step": 1466 + }, + { + "epoch": 2.826987951807229, + "grad_norm": 0.06829468160867691, + "learning_rate": 4.1495284813305003e-07, + "loss": 0.0018, + "step": 1467 + }, + { + "epoch": 2.8289156626506022, + "grad_norm": 0.35213515162467957, + "learning_rate": 4.0591392043213275e-07, + "loss": 0.0144, + "step": 1468 + }, + { + "epoch": 2.830843373493976, + "grad_norm": 0.11828093230724335, + "learning_rate": 3.969735181533918e-07, + "loss": 0.0028, + "step": 1469 + }, + { + "epoch": 2.8327710843373493, + "grad_norm": 0.13286921381950378, + "learning_rate": 3.881316862520712e-07, + "loss": 0.0042, + "step": 1470 + }, + { + "epoch": 2.834698795180723, + "grad_norm": 0.10271132737398148, + "learning_rate": 3.7938846918776917e-07, + "loss": 0.0047, + "step": 1471 + }, + { + "epoch": 2.8366265060240963, + "grad_norm": 0.09422904253005981, + "learning_rate": 3.707439109242139e-07, + "loss": 0.0061, + "step": 1472 + }, + { + "epoch": 2.83855421686747, + "grad_norm": 0.10817123204469681, + "learning_rate": 3.6219805492905934e-07, + "loss": 0.0029, + "step": 1473 + }, + { + "epoch": 2.8404819277108433, + "grad_norm": 0.10254565626382828, + "learning_rate": 3.53750944173632e-07, + "loss": 0.0044, + "step": 1474 + }, + { + "epoch": 2.842409638554217, + "grad_norm": 0.11423154920339584, + "learning_rate": 3.45402621132751e-07, + "loss": 0.0059, + "step": 1475 + }, + { + "epoch": 2.8443373493975903, + "grad_norm": 0.15620556473731995, + "learning_rate": 3.3715312778449305e-07, + "loss": 0.005, + "step": 1476 + }, + { + "epoch": 2.846265060240964, + "grad_norm": 0.1081036925315857, + "learning_rate": 3.2900250560998546e-07, + "loss": 0.004, + "step": 1477 + }, + { + "epoch": 2.8481927710843373, + "grad_norm": 0.38650745153427124, + "learning_rate": 3.209507955932001e-07, + "loss": 0.0076, + "step": 1478 + }, + { + "epoch": 2.8501204819277106, + "grad_norm": 0.1864783614873886, + "learning_rate": 3.129980382207509e-07, + "loss": 0.0092, + "step": 1479 + }, + { + "epoch": 2.8520481927710843, + "grad_norm": 0.1458069533109665, + "learning_rate": 3.05144273481679e-07, + "loss": 0.0058, + "step": 1480 + }, + { + "epoch": 2.853975903614458, + "grad_norm": 0.14836257696151733, + "learning_rate": 2.9738954086726334e-07, + "loss": 0.014, + "step": 1481 + }, + { + "epoch": 2.8559036144578314, + "grad_norm": 0.10147511214017868, + "learning_rate": 2.8973387937081485e-07, + "loss": 0.0047, + "step": 1482 + }, + { + "epoch": 2.8578313253012047, + "grad_norm": 0.13740235567092896, + "learning_rate": 2.821773274874828e-07, + "loss": 0.0028, + "step": 1483 + }, + { + "epoch": 2.8597590361445784, + "grad_norm": 0.16089461743831635, + "learning_rate": 2.7471992321406624e-07, + "loss": 0.0168, + "step": 1484 + }, + { + "epoch": 2.8616867469879517, + "grad_norm": 0.0599152147769928, + "learning_rate": 2.6736170404880744e-07, + "loss": 0.0017, + "step": 1485 + }, + { + "epoch": 2.8636144578313254, + "grad_norm": 0.148875430226326, + "learning_rate": 2.6010270699122096e-07, + "loss": 0.0045, + "step": 1486 + }, + { + "epoch": 2.8655421686746987, + "grad_norm": 0.26763641834259033, + "learning_rate": 2.529429685419027e-07, + "loss": 0.007, + "step": 1487 + }, + { + "epoch": 2.8674698795180724, + "grad_norm": 0.1743084192276001, + "learning_rate": 2.458825247023389e-07, + "loss": 0.0112, + "step": 1488 + }, + { + "epoch": 2.8693975903614457, + "grad_norm": 0.21380828320980072, + "learning_rate": 2.3892141097473063e-07, + "loss": 0.0103, + "step": 1489 + }, + { + "epoch": 2.8713253012048194, + "grad_norm": 2.185253620147705, + "learning_rate": 2.3205966236181433e-07, + "loss": 0.0195, + "step": 1490 + }, + { + "epoch": 2.8732530120481927, + "grad_norm": 0.11854024976491928, + "learning_rate": 2.252973133666947e-07, + "loss": 0.0034, + "step": 1491 + }, + { + "epoch": 2.8751807228915665, + "grad_norm": 0.36487653851509094, + "learning_rate": 2.1863439799265195e-07, + "loss": 0.0063, + "step": 1492 + }, + { + "epoch": 2.8771084337349397, + "grad_norm": 0.1029730811715126, + "learning_rate": 2.1207094974298847e-07, + "loss": 0.0049, + "step": 1493 + }, + { + "epoch": 2.879036144578313, + "grad_norm": 0.10066278278827667, + "learning_rate": 2.056070016208489e-07, + "loss": 0.0021, + "step": 1494 + }, + { + "epoch": 2.8809638554216868, + "grad_norm": 0.21477262675762177, + "learning_rate": 1.9924258612906256e-07, + "loss": 0.0052, + "step": 1495 + }, + { + "epoch": 2.8828915662650605, + "grad_norm": 0.29007601737976074, + "learning_rate": 1.929777352699791e-07, + "loss": 0.0065, + "step": 1496 + }, + { + "epoch": 2.8848192771084338, + "grad_norm": 0.32320499420166016, + "learning_rate": 1.8681248054529754e-07, + "loss": 0.0334, + "step": 1497 + }, + { + "epoch": 2.886746987951807, + "grad_norm": 0.12790757417678833, + "learning_rate": 1.8074685295591754e-07, + "loss": 0.0034, + "step": 1498 + }, + { + "epoch": 2.888674698795181, + "grad_norm": 0.12194570153951645, + "learning_rate": 1.7478088300178608e-07, + "loss": 0.0038, + "step": 1499 + }, + { + "epoch": 2.890602409638554, + "grad_norm": 0.13514107465744019, + "learning_rate": 1.6891460068173548e-07, + "loss": 0.0042, + "step": 1500 + }, + { + "epoch": 2.892530120481928, + "grad_norm": 0.09762352705001831, + "learning_rate": 1.631480354933346e-07, + "loss": 0.0016, + "step": 1501 + }, + { + "epoch": 2.894457831325301, + "grad_norm": 0.10607658326625824, + "learning_rate": 1.5748121643274661e-07, + "loss": 0.0062, + "step": 1502 + }, + { + "epoch": 2.896385542168675, + "grad_norm": 0.0920143872499466, + "learning_rate": 1.519141719945738e-07, + "loss": 0.0025, + "step": 1503 + }, + { + "epoch": 2.898313253012048, + "grad_norm": 0.17520834505558014, + "learning_rate": 1.4644693017172418e-07, + "loss": 0.0045, + "step": 1504 + }, + { + "epoch": 2.900240963855422, + "grad_norm": 0.49769192934036255, + "learning_rate": 1.4107951845526267e-07, + "loss": 0.0059, + "step": 1505 + }, + { + "epoch": 2.902168674698795, + "grad_norm": 0.06354644149541855, + "learning_rate": 1.3581196383427586e-07, + "loss": 0.0021, + "step": 1506 + }, + { + "epoch": 2.904096385542169, + "grad_norm": 0.09340358525514603, + "learning_rate": 1.3064429279573853e-07, + "loss": 0.0036, + "step": 1507 + }, + { + "epoch": 2.906024096385542, + "grad_norm": 0.06073952466249466, + "learning_rate": 1.255765313243762e-07, + "loss": 0.001, + "step": 1508 + }, + { + "epoch": 2.9079518072289154, + "grad_norm": 0.1323407143354416, + "learning_rate": 1.206087049025384e-07, + "loss": 0.008, + "step": 1509 + }, + { + "epoch": 2.909879518072289, + "grad_norm": 0.18533159792423248, + "learning_rate": 1.1574083851007e-07, + "loss": 0.0086, + "step": 1510 + }, + { + "epoch": 2.911807228915663, + "grad_norm": 0.09885486960411072, + "learning_rate": 1.1097295662418018e-07, + "loss": 0.0023, + "step": 1511 + }, + { + "epoch": 2.913734939759036, + "grad_norm": 0.08286528289318085, + "learning_rate": 1.0630508321932687e-07, + "loss": 0.0029, + "step": 1512 + }, + { + "epoch": 2.9156626506024095, + "grad_norm": 0.1265413761138916, + "learning_rate": 1.0173724176709254e-07, + "loss": 0.003, + "step": 1513 + }, + { + "epoch": 2.917590361445783, + "grad_norm": 0.0776480957865715, + "learning_rate": 9.726945523606646e-08, + "loss": 0.0013, + "step": 1514 + }, + { + "epoch": 2.9195180722891565, + "grad_norm": 0.14106431603431702, + "learning_rate": 9.290174609172697e-08, + "loss": 0.0204, + "step": 1515 + }, + { + "epoch": 2.9214457831325302, + "grad_norm": 0.10813348740339279, + "learning_rate": 8.863413629633277e-08, + "loss": 0.0026, + "step": 1516 + }, + { + "epoch": 2.9233734939759035, + "grad_norm": 0.11505429446697235, + "learning_rate": 8.446664730881182e-08, + "loss": 0.0038, + "step": 1517 + }, + { + "epoch": 2.9253012048192772, + "grad_norm": 0.18488599359989166, + "learning_rate": 8.039930008465257e-08, + "loss": 0.0094, + "step": 1518 + }, + { + "epoch": 2.9272289156626505, + "grad_norm": 0.19229602813720703, + "learning_rate": 7.643211507579296e-08, + "loss": 0.0062, + "step": 1519 + }, + { + "epoch": 2.929156626506024, + "grad_norm": 0.0876188799738884, + "learning_rate": 7.25651122305293e-08, + "loss": 0.0024, + "step": 1520 + }, + { + "epoch": 2.9310843373493976, + "grad_norm": 0.15103434026241302, + "learning_rate": 6.87983109934054e-08, + "loss": 0.0056, + "step": 1521 + }, + { + "epoch": 2.9330120481927713, + "grad_norm": 0.1714266538619995, + "learning_rate": 6.51317303051191e-08, + "loss": 0.0047, + "step": 1522 + }, + { + "epoch": 2.9349397590361446, + "grad_norm": 0.30670225620269775, + "learning_rate": 6.156538860242922e-08, + "loss": 0.0111, + "step": 1523 + }, + { + "epoch": 2.936867469879518, + "grad_norm": 0.13250356912612915, + "learning_rate": 5.809930381805773e-08, + "loss": 0.0033, + "step": 1524 + }, + { + "epoch": 2.9387951807228916, + "grad_norm": 0.10350223630666733, + "learning_rate": 5.4733493380603183e-08, + "loss": 0.0028, + "step": 1525 + }, + { + "epoch": 2.9407228915662653, + "grad_norm": 0.1638195812702179, + "learning_rate": 5.1467974214456374e-08, + "loss": 0.0037, + "step": 1526 + }, + { + "epoch": 2.9426506024096386, + "grad_norm": 0.11159276962280273, + "learning_rate": 4.830276273970258e-08, + "loss": 0.003, + "step": 1527 + }, + { + "epoch": 2.944578313253012, + "grad_norm": 0.09866586327552795, + "learning_rate": 4.5237874872052776e-08, + "loss": 0.0032, + "step": 1528 + }, + { + "epoch": 2.9465060240963856, + "grad_norm": 0.17825454473495483, + "learning_rate": 4.227332602275924e-08, + "loss": 0.0105, + "step": 1529 + }, + { + "epoch": 2.948433734939759, + "grad_norm": 0.10379356890916824, + "learning_rate": 3.940913109853561e-08, + "loss": 0.0055, + "step": 1530 + }, + { + "epoch": 2.9503614457831326, + "grad_norm": 0.23834416270256042, + "learning_rate": 3.66453045014814e-08, + "loss": 0.0044, + "step": 1531 + }, + { + "epoch": 2.952289156626506, + "grad_norm": 0.11515571922063828, + "learning_rate": 3.398186012901539e-08, + "loss": 0.0042, + "step": 1532 + }, + { + "epoch": 2.9542168674698797, + "grad_norm": 0.14170049130916595, + "learning_rate": 3.141881137379788e-08, + "loss": 0.0073, + "step": 1533 + }, + { + "epoch": 2.956144578313253, + "grad_norm": 0.237248957157135, + "learning_rate": 2.8956171123670774e-08, + "loss": 0.0055, + "step": 1534 + }, + { + "epoch": 2.9580722891566262, + "grad_norm": 0.07076071947813034, + "learning_rate": 2.6593951761588744e-08, + "loss": 0.0016, + "step": 1535 + }, + { + "epoch": 2.96, + "grad_norm": 0.1100577786564827, + "learning_rate": 2.4332165165557032e-08, + "loss": 0.0026, + "step": 1536 + }, + { + "epoch": 2.9619277108433737, + "grad_norm": 0.11576279252767563, + "learning_rate": 2.2170822708573736e-08, + "loss": 0.0036, + "step": 1537 + }, + { + "epoch": 2.963855421686747, + "grad_norm": 0.2067718207836151, + "learning_rate": 2.0109935258565415e-08, + "loss": 0.0063, + "step": 1538 + }, + { + "epoch": 2.9657831325301203, + "grad_norm": 0.15040244162082672, + "learning_rate": 1.8149513178347122e-08, + "loss": 0.0081, + "step": 1539 + }, + { + "epoch": 2.967710843373494, + "grad_norm": 0.14071759581565857, + "learning_rate": 1.6289566325555783e-08, + "loss": 0.006, + "step": 1540 + }, + { + "epoch": 2.9696385542168677, + "grad_norm": 0.32527413964271545, + "learning_rate": 1.4530104052610239e-08, + "loss": 0.0021, + "step": 1541 + }, + { + "epoch": 2.971566265060241, + "grad_norm": 0.06794515997171402, + "learning_rate": 1.2871135206651287e-08, + "loss": 0.0016, + "step": 1542 + }, + { + "epoch": 2.9734939759036143, + "grad_norm": 0.08525913208723068, + "learning_rate": 1.1312668129519477e-08, + "loss": 0.0023, + "step": 1543 + }, + { + "epoch": 2.975421686746988, + "grad_norm": 0.14025282859802246, + "learning_rate": 9.854710657688504e-09, + "loss": 0.0025, + "step": 1544 + }, + { + "epoch": 2.9773493975903613, + "grad_norm": 0.15709802508354187, + "learning_rate": 8.497270122242996e-09, + "loss": 0.0038, + "step": 1545 + }, + { + "epoch": 2.979277108433735, + "grad_norm": 0.1520087569952011, + "learning_rate": 7.240353348834106e-09, + "loss": 0.0027, + "step": 1546 + }, + { + "epoch": 2.9812048192771083, + "grad_norm": 0.13271088898181915, + "learning_rate": 6.083966657646212e-09, + "loss": 0.003, + "step": 1547 + }, + { + "epoch": 2.983132530120482, + "grad_norm": 0.0962211862206459, + "learning_rate": 5.028115863370265e-09, + "loss": 0.0021, + "step": 1548 + }, + { + "epoch": 2.9850602409638554, + "grad_norm": 0.11485985666513443, + "learning_rate": 4.072806275163821e-09, + "loss": 0.0039, + "step": 1549 + }, + { + "epoch": 2.9869879518072286, + "grad_norm": 0.15437521040439606, + "learning_rate": 3.2180426966332833e-09, + "loss": 0.0048, + "step": 1550 + }, + { + "epoch": 2.9889156626506024, + "grad_norm": 0.09884651750326157, + "learning_rate": 2.4638294258072513e-09, + "loss": 0.0032, + "step": 1551 + }, + { + "epoch": 2.990843373493976, + "grad_norm": 0.30931419134140015, + "learning_rate": 1.810170255116539e-09, + "loss": 0.0038, + "step": 1552 + }, + { + "epoch": 2.9927710843373494, + "grad_norm": 0.3311678469181061, + "learning_rate": 1.2570684713719695e-09, + "loss": 0.0247, + "step": 1553 + }, + { + "epoch": 2.9946987951807227, + "grad_norm": 0.13150249421596527, + "learning_rate": 8.045268557443919e-10, + "loss": 0.0029, + "step": 1554 + }, + { + "epoch": 2.9966265060240964, + "grad_norm": 0.10827342420816422, + "learning_rate": 4.5254768376468137e-10, + "loss": 0.0119, + "step": 1555 + }, + { + "epoch": 2.99855421686747, + "grad_norm": 0.10358250141143799, + "learning_rate": 2.011327252948725e-10, + "loss": 0.0038, + "step": 1556 + }, + { + "epoch": 3.0, + "grad_norm": 0.09550733864307404, + "learning_rate": 5.028324453482114e-11, + "loss": 0.0016, + "step": 1557 + }, + { + "epoch": 3.0, + "step": 1557, + "total_flos": 2.043435500286509e+18, + "train_loss": 0.016654981696585226, + "train_runtime": 5294.7714, + "train_samples_per_second": 9.403, + "train_steps_per_second": 0.294 + } + ], + "logging_steps": 1, + "max_steps": 1557, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 92, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.043435500286509e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..ecc7b6b --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342dfb3c86216e436950100c79812c54066d5572c4e9975b0f133c067f061bcf +size 7825 diff --git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000..c297e45 Binary files /dev/null and b/training_loss.png differ