commit 4360715cbd1fea3f933edb0e0a6caa6f739ab8a3 Author: ModelHub XC Date: Thu Jun 4 10:56:16 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: ferrazzipietro/unsup-Llama-3.2-1B-Instruct-datav2-3ep Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..bc5f30d --- /dev/null +++ b/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..9802341 --- /dev/null +++ b/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..60d1426 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,10 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "transformers_version": "4.51.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..0d7d3dd --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0c82d0e1f62537eed47b0ccfb8e52394d0f7b6dd568465ed04deb3e7a976b75 +size 2996982344 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..b43be96 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..7739bcb --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2065 @@ +{ + "add_eos_token": false, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 16384, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..ee40ba1 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,33566 @@ +{ + "best_global_step": 12000, + "best_metric": 0.3009350597858429, + "best_model_checkpoint": null, + "epoch": 2.99971659791542, + "eval_steps": 1000, + "global_step": 23817, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00012595648203545676, + "grad_norm": 107.34553527832031, + "learning_rate": 0.0, + "loss": 4.9992, + "step": 1 + }, + { + "epoch": 0.0006297824101772837, + "grad_norm": 132.82723999023438, + "learning_rate": 5.037783375314861e-07, + "loss": 5.2462, + "step": 5 + }, + { + "epoch": 0.0012595648203545674, + "grad_norm": 61.73713684082031, + "learning_rate": 1.1335012594458437e-06, + "loss": 4.9216, + "step": 10 + }, + { + "epoch": 0.0018893472305318512, + "grad_norm": 29.1369571685791, + "learning_rate": 1.7632241813602012e-06, + "loss": 4.3103, + "step": 15 + }, + { + "epoch": 0.0025191296407091348, + "grad_norm": 13.688713073730469, + "learning_rate": 2.392947103274559e-06, + "loss": 3.7045, + "step": 20 + }, + { + "epoch": 0.0031489120508864186, + "grad_norm": 7.063476085662842, + "learning_rate": 3.0226700251889166e-06, + "loss": 3.255, + "step": 25 + }, + { + "epoch": 0.0037786944610637024, + "grad_norm": 4.568771839141846, + "learning_rate": 3.6523929471032744e-06, + "loss": 2.9188, + "step": 30 + }, + { + "epoch": 0.004408476871240986, + "grad_norm": 4.0639753341674805, + "learning_rate": 4.282115869017632e-06, + "loss": 2.6194, + "step": 35 + }, + { + "epoch": 0.0050382592814182696, + "grad_norm": 3.363410234451294, + "learning_rate": 4.911838790931989e-06, + "loss": 2.3438, + "step": 40 + }, + { + "epoch": 0.005668041691595554, + "grad_norm": 3.086527109146118, + "learning_rate": 5.541561712846347e-06, + "loss": 2.21, + "step": 45 + }, + { + "epoch": 0.006297824101772837, + "grad_norm": 2.987258195877075, + "learning_rate": 6.171284634760705e-06, + "loss": 2.063, + "step": 50 + }, + { + "epoch": 0.006927606511950121, + "grad_norm": 2.956822156906128, + "learning_rate": 6.801007556675062e-06, + "loss": 1.9592, + "step": 55 + }, + { + "epoch": 0.007557388922127405, + "grad_norm": 2.7860183715820312, + "learning_rate": 7.43073047858942e-06, + "loss": 1.8058, + "step": 60 + }, + { + "epoch": 0.008187171332304689, + "grad_norm": 2.930774211883545, + "learning_rate": 8.060453400503778e-06, + "loss": 1.7843, + "step": 65 + }, + { + "epoch": 0.008816953742481972, + "grad_norm": 2.797520637512207, + "learning_rate": 8.690176322418136e-06, + "loss": 1.6922, + "step": 70 + }, + { + "epoch": 0.009446736152659256, + "grad_norm": 2.945425271987915, + "learning_rate": 9.319899244332492e-06, + "loss": 1.6863, + "step": 75 + }, + { + "epoch": 0.010076518562836539, + "grad_norm": 3.775211811065674, + "learning_rate": 9.94962216624685e-06, + "loss": 1.6918, + "step": 80 + }, + { + "epoch": 0.010706300973013824, + "grad_norm": 4.795872688293457, + "learning_rate": 1.0579345088161209e-05, + "loss": 1.5709, + "step": 85 + }, + { + "epoch": 0.011336083383191108, + "grad_norm": 6.182225227355957, + "learning_rate": 1.1209068010075565e-05, + "loss": 1.582, + "step": 90 + }, + { + "epoch": 0.011965865793368391, + "grad_norm": 2.548555612564087, + "learning_rate": 1.1838790931989923e-05, + "loss": 1.4157, + "step": 95 + }, + { + "epoch": 0.012595648203545674, + "grad_norm": 2.5125293731689453, + "learning_rate": 1.246851385390428e-05, + "loss": 1.4848, + "step": 100 + }, + { + "epoch": 0.01322543061372296, + "grad_norm": 2.3581464290618896, + "learning_rate": 1.309823677581864e-05, + "loss": 1.4169, + "step": 105 + }, + { + "epoch": 0.013855213023900243, + "grad_norm": 2.4205543994903564, + "learning_rate": 1.3727959697732996e-05, + "loss": 1.3803, + "step": 110 + }, + { + "epoch": 0.014484995434077526, + "grad_norm": 2.251526355743408, + "learning_rate": 1.4357682619647355e-05, + "loss": 1.3928, + "step": 115 + }, + { + "epoch": 0.01511477784425481, + "grad_norm": 2.3966078758239746, + "learning_rate": 1.4987405541561711e-05, + "loss": 1.3994, + "step": 120 + }, + { + "epoch": 0.015744560254432095, + "grad_norm": 2.2904422283172607, + "learning_rate": 1.561712846347607e-05, + "loss": 1.3884, + "step": 125 + }, + { + "epoch": 0.016374342664609378, + "grad_norm": 2.238481283187866, + "learning_rate": 1.6246851385390428e-05, + "loss": 1.3419, + "step": 130 + }, + { + "epoch": 0.01700412507478666, + "grad_norm": 2.3208041191101074, + "learning_rate": 1.6876574307304786e-05, + "loss": 1.3282, + "step": 135 + }, + { + "epoch": 0.017633907484963945, + "grad_norm": 2.34198260307312, + "learning_rate": 1.750629722921914e-05, + "loss": 1.3398, + "step": 140 + }, + { + "epoch": 0.018263689895141228, + "grad_norm": 2.316502332687378, + "learning_rate": 1.81360201511335e-05, + "loss": 1.3171, + "step": 145 + }, + { + "epoch": 0.01889347230531851, + "grad_norm": 2.1474759578704834, + "learning_rate": 1.8765743073047857e-05, + "loss": 1.2896, + "step": 150 + }, + { + "epoch": 0.019523254715495795, + "grad_norm": 2.2437920570373535, + "learning_rate": 1.9395465994962215e-05, + "loss": 1.3504, + "step": 155 + }, + { + "epoch": 0.020153037125673078, + "grad_norm": 2.3530609607696533, + "learning_rate": 2.002518891687657e-05, + "loss": 1.2937, + "step": 160 + }, + { + "epoch": 0.020782819535850365, + "grad_norm": 3.199941635131836, + "learning_rate": 2.065491183879093e-05, + "loss": 1.2841, + "step": 165 + }, + { + "epoch": 0.02141260194602765, + "grad_norm": 2.3354921340942383, + "learning_rate": 2.128463476070529e-05, + "loss": 1.2573, + "step": 170 + }, + { + "epoch": 0.022042384356204932, + "grad_norm": 1.9546146392822266, + "learning_rate": 2.1914357682619645e-05, + "loss": 1.2179, + "step": 175 + }, + { + "epoch": 0.022672166766382215, + "grad_norm": 2.329948902130127, + "learning_rate": 2.2544080604534003e-05, + "loss": 1.2417, + "step": 180 + }, + { + "epoch": 0.0233019491765595, + "grad_norm": 2.110067367553711, + "learning_rate": 2.3173803526448358e-05, + "loss": 1.289, + "step": 185 + }, + { + "epoch": 0.023931731586736782, + "grad_norm": 1.8875372409820557, + "learning_rate": 2.380352644836272e-05, + "loss": 1.2183, + "step": 190 + }, + { + "epoch": 0.024561513996914065, + "grad_norm": 2.3789267539978027, + "learning_rate": 2.4433249370277077e-05, + "loss": 1.3136, + "step": 195 + }, + { + "epoch": 0.02519129640709135, + "grad_norm": 2.4072821140289307, + "learning_rate": 2.5062972292191432e-05, + "loss": 1.2241, + "step": 200 + }, + { + "epoch": 0.025821078817268632, + "grad_norm": 1.936230182647705, + "learning_rate": 2.569269521410579e-05, + "loss": 1.2715, + "step": 205 + }, + { + "epoch": 0.02645086122744592, + "grad_norm": 1.8356448411941528, + "learning_rate": 2.632241813602015e-05, + "loss": 1.1728, + "step": 210 + }, + { + "epoch": 0.027080643637623202, + "grad_norm": 1.7642450332641602, + "learning_rate": 2.6952141057934507e-05, + "loss": 1.2022, + "step": 215 + }, + { + "epoch": 0.027710426047800486, + "grad_norm": 1.8820688724517822, + "learning_rate": 2.7581863979848865e-05, + "loss": 1.1563, + "step": 220 + }, + { + "epoch": 0.02834020845797777, + "grad_norm": 1.8875645399093628, + "learning_rate": 2.821158690176322e-05, + "loss": 1.1473, + "step": 225 + }, + { + "epoch": 0.028969990868155052, + "grad_norm": 1.950432538986206, + "learning_rate": 2.884130982367758e-05, + "loss": 1.1478, + "step": 230 + }, + { + "epoch": 0.029599773278332336, + "grad_norm": 1.8610332012176514, + "learning_rate": 2.9471032745591936e-05, + "loss": 1.2026, + "step": 235 + }, + { + "epoch": 0.03022955568850962, + "grad_norm": 1.865122675895691, + "learning_rate": 3.0100755667506295e-05, + "loss": 1.1748, + "step": 240 + }, + { + "epoch": 0.030859338098686902, + "grad_norm": 1.7126635313034058, + "learning_rate": 3.0730478589420656e-05, + "loss": 1.0982, + "step": 245 + }, + { + "epoch": 0.03148912050886419, + "grad_norm": 1.809869408607483, + "learning_rate": 3.136020151133501e-05, + "loss": 1.1032, + "step": 250 + }, + { + "epoch": 0.03211890291904147, + "grad_norm": 1.9480305910110474, + "learning_rate": 3.1989924433249366e-05, + "loss": 1.1195, + "step": 255 + }, + { + "epoch": 0.032748685329218756, + "grad_norm": 1.762191653251648, + "learning_rate": 3.2619647355163724e-05, + "loss": 1.1413, + "step": 260 + }, + { + "epoch": 0.03337846773939604, + "grad_norm": 1.5951004028320312, + "learning_rate": 3.324937027707808e-05, + "loss": 1.1514, + "step": 265 + }, + { + "epoch": 0.03400825014957332, + "grad_norm": 1.9166375398635864, + "learning_rate": 3.387909319899244e-05, + "loss": 1.1129, + "step": 270 + }, + { + "epoch": 0.034638032559750606, + "grad_norm": 1.8051133155822754, + "learning_rate": 3.45088161209068e-05, + "loss": 1.1827, + "step": 275 + }, + { + "epoch": 0.03526781496992789, + "grad_norm": 1.8178284168243408, + "learning_rate": 3.513853904282116e-05, + "loss": 1.1567, + "step": 280 + }, + { + "epoch": 0.03589759738010517, + "grad_norm": 1.5286771059036255, + "learning_rate": 3.5768261964735515e-05, + "loss": 1.011, + "step": 285 + }, + { + "epoch": 0.036527379790282456, + "grad_norm": 1.6900701522827148, + "learning_rate": 3.639798488664987e-05, + "loss": 1.059, + "step": 290 + }, + { + "epoch": 0.03715716220045974, + "grad_norm": 1.5855501890182495, + "learning_rate": 3.702770780856423e-05, + "loss": 1.0943, + "step": 295 + }, + { + "epoch": 0.03778694461063702, + "grad_norm": 1.650087833404541, + "learning_rate": 3.765743073047858e-05, + "loss": 1.0843, + "step": 300 + }, + { + "epoch": 0.038416727020814306, + "grad_norm": 1.772127389907837, + "learning_rate": 3.828715365239294e-05, + "loss": 1.1161, + "step": 305 + }, + { + "epoch": 0.03904650943099159, + "grad_norm": 1.8052891492843628, + "learning_rate": 3.8916876574307306e-05, + "loss": 1.1493, + "step": 310 + }, + { + "epoch": 0.03967629184116887, + "grad_norm": 1.816603183746338, + "learning_rate": 3.954659949622166e-05, + "loss": 1.1113, + "step": 315 + }, + { + "epoch": 0.040306074251346156, + "grad_norm": 1.7114448547363281, + "learning_rate": 4.0176322418136016e-05, + "loss": 1.1124, + "step": 320 + }, + { + "epoch": 0.04093585666152345, + "grad_norm": 1.693428874015808, + "learning_rate": 4.080604534005038e-05, + "loss": 1.0994, + "step": 325 + }, + { + "epoch": 0.04156563907170073, + "grad_norm": 1.6820402145385742, + "learning_rate": 4.143576826196473e-05, + "loss": 1.0989, + "step": 330 + }, + { + "epoch": 0.042195421481878014, + "grad_norm": 1.5841305255889893, + "learning_rate": 4.206549118387909e-05, + "loss": 1.123, + "step": 335 + }, + { + "epoch": 0.0428252038920553, + "grad_norm": 1.714936375617981, + "learning_rate": 4.269521410579344e-05, + "loss": 1.0054, + "step": 340 + }, + { + "epoch": 0.04345498630223258, + "grad_norm": 1.6568523645401, + "learning_rate": 4.332493702770781e-05, + "loss": 1.0397, + "step": 345 + }, + { + "epoch": 0.044084768712409864, + "grad_norm": 1.9612834453582764, + "learning_rate": 4.3954659949622165e-05, + "loss": 1.0481, + "step": 350 + }, + { + "epoch": 0.04471455112258715, + "grad_norm": 1.5859555006027222, + "learning_rate": 4.4584382871536516e-05, + "loss": 1.0754, + "step": 355 + }, + { + "epoch": 0.04534433353276443, + "grad_norm": 1.6280044317245483, + "learning_rate": 4.521410579345088e-05, + "loss": 1.0696, + "step": 360 + }, + { + "epoch": 0.045974115942941714, + "grad_norm": 1.5000571012496948, + "learning_rate": 4.584382871536523e-05, + "loss": 1.0508, + "step": 365 + }, + { + "epoch": 0.046603898353119, + "grad_norm": 1.4447442293167114, + "learning_rate": 4.647355163727959e-05, + "loss": 1.0924, + "step": 370 + }, + { + "epoch": 0.04723368076329628, + "grad_norm": 1.6207387447357178, + "learning_rate": 4.7103274559193956e-05, + "loss": 1.0477, + "step": 375 + }, + { + "epoch": 0.047863463173473564, + "grad_norm": 1.5458993911743164, + "learning_rate": 4.773299748110831e-05, + "loss": 1.0905, + "step": 380 + }, + { + "epoch": 0.04849324558365085, + "grad_norm": 1.5125916004180908, + "learning_rate": 4.8362720403022666e-05, + "loss": 1.0858, + "step": 385 + }, + { + "epoch": 0.04912302799382813, + "grad_norm": 1.5438019037246704, + "learning_rate": 4.899244332493702e-05, + "loss": 1.1133, + "step": 390 + }, + { + "epoch": 0.049752810404005414, + "grad_norm": 1.5938135385513306, + "learning_rate": 4.962216624685138e-05, + "loss": 1.0814, + "step": 395 + }, + { + "epoch": 0.0503825928141827, + "grad_norm": 1.4425631761550903, + "learning_rate": 5.025188916876574e-05, + "loss": 1.0668, + "step": 400 + }, + { + "epoch": 0.05101237522435998, + "grad_norm": 1.505650520324707, + "learning_rate": 5.088161209068009e-05, + "loss": 1.1201, + "step": 405 + }, + { + "epoch": 0.051642157634537264, + "grad_norm": 1.421877145767212, + "learning_rate": 5.151133501259446e-05, + "loss": 1.0199, + "step": 410 + }, + { + "epoch": 0.052271940044714554, + "grad_norm": 1.6243743896484375, + "learning_rate": 5.2141057934508815e-05, + "loss": 1.0791, + "step": 415 + }, + { + "epoch": 0.05290172245489184, + "grad_norm": 1.5055480003356934, + "learning_rate": 5.2770780856423166e-05, + "loss": 1.0553, + "step": 420 + }, + { + "epoch": 0.05353150486506912, + "grad_norm": 1.5624281167984009, + "learning_rate": 5.340050377833753e-05, + "loss": 1.0778, + "step": 425 + }, + { + "epoch": 0.054161287275246404, + "grad_norm": 1.4252930879592896, + "learning_rate": 5.403022670025188e-05, + "loss": 1.0354, + "step": 430 + }, + { + "epoch": 0.05479106968542369, + "grad_norm": 1.4582849740982056, + "learning_rate": 5.465994962216624e-05, + "loss": 0.9825, + "step": 435 + }, + { + "epoch": 0.05542085209560097, + "grad_norm": 1.4336227178573608, + "learning_rate": 5.52896725440806e-05, + "loss": 1.0753, + "step": 440 + }, + { + "epoch": 0.056050634505778255, + "grad_norm": 1.4182472229003906, + "learning_rate": 5.591939546599496e-05, + "loss": 1.0005, + "step": 445 + }, + { + "epoch": 0.05668041691595554, + "grad_norm": 1.4060423374176025, + "learning_rate": 5.6549118387909316e-05, + "loss": 1.0311, + "step": 450 + }, + { + "epoch": 0.05731019932613282, + "grad_norm": 1.5005807876586914, + "learning_rate": 5.717884130982367e-05, + "loss": 1.0815, + "step": 455 + }, + { + "epoch": 0.057939981736310105, + "grad_norm": 1.3563963174819946, + "learning_rate": 5.780856423173803e-05, + "loss": 1.0099, + "step": 460 + }, + { + "epoch": 0.05856976414648739, + "grad_norm": 1.4664981365203857, + "learning_rate": 5.843828715365239e-05, + "loss": 1.0221, + "step": 465 + }, + { + "epoch": 0.05919954655666467, + "grad_norm": 1.3635119199752808, + "learning_rate": 5.906801007556674e-05, + "loss": 1.0864, + "step": 470 + }, + { + "epoch": 0.059829328966841955, + "grad_norm": 1.3736735582351685, + "learning_rate": 5.9697732997481107e-05, + "loss": 1.0949, + "step": 475 + }, + { + "epoch": 0.06045911137701924, + "grad_norm": 1.3973110914230347, + "learning_rate": 6.0327455919395465e-05, + "loss": 1.0936, + "step": 480 + }, + { + "epoch": 0.06108889378719652, + "grad_norm": 1.4158260822296143, + "learning_rate": 6.0957178841309816e-05, + "loss": 1.0389, + "step": 485 + }, + { + "epoch": 0.061718676197373805, + "grad_norm": 1.3810491561889648, + "learning_rate": 6.158690176322417e-05, + "loss": 1.0027, + "step": 490 + }, + { + "epoch": 0.06234845860755109, + "grad_norm": 1.3191838264465332, + "learning_rate": 6.221662468513854e-05, + "loss": 1.0626, + "step": 495 + }, + { + "epoch": 0.06297824101772838, + "grad_norm": 1.2959917783737183, + "learning_rate": 6.28463476070529e-05, + "loss": 1.064, + "step": 500 + }, + { + "epoch": 0.06360802342790566, + "grad_norm": 1.321399450302124, + "learning_rate": 6.347607052896724e-05, + "loss": 1.0407, + "step": 505 + }, + { + "epoch": 0.06423780583808295, + "grad_norm": 1.464593768119812, + "learning_rate": 6.410579345088161e-05, + "loss": 1.0961, + "step": 510 + }, + { + "epoch": 0.06486758824826022, + "grad_norm": 1.1570991277694702, + "learning_rate": 6.473551637279596e-05, + "loss": 1.0373, + "step": 515 + }, + { + "epoch": 0.06549737065843751, + "grad_norm": 1.2346997261047363, + "learning_rate": 6.536523929471032e-05, + "loss": 1.0016, + "step": 520 + }, + { + "epoch": 0.06612715306861479, + "grad_norm": 1.2645131349563599, + "learning_rate": 6.599496221662469e-05, + "loss": 1.0175, + "step": 525 + }, + { + "epoch": 0.06675693547879208, + "grad_norm": 1.7016206979751587, + "learning_rate": 6.662468513853903e-05, + "loss": 1.0517, + "step": 530 + }, + { + "epoch": 0.06738671788896936, + "grad_norm": 1.2666237354278564, + "learning_rate": 6.725440806045339e-05, + "loss": 0.9876, + "step": 535 + }, + { + "epoch": 0.06801650029914665, + "grad_norm": 1.365417242050171, + "learning_rate": 6.788413098236775e-05, + "loss": 1.0262, + "step": 540 + }, + { + "epoch": 0.06864628270932392, + "grad_norm": 1.1083537340164185, + "learning_rate": 6.851385390428211e-05, + "loss": 0.979, + "step": 545 + }, + { + "epoch": 0.06927606511950121, + "grad_norm": 1.2279447317123413, + "learning_rate": 6.914357682619647e-05, + "loss": 1.0566, + "step": 550 + }, + { + "epoch": 0.0699058475296785, + "grad_norm": 1.2323870658874512, + "learning_rate": 6.977329974811082e-05, + "loss": 1.0047, + "step": 555 + }, + { + "epoch": 0.07053562993985578, + "grad_norm": 1.2166273593902588, + "learning_rate": 7.040302267002518e-05, + "loss": 0.989, + "step": 560 + }, + { + "epoch": 0.07116541235003307, + "grad_norm": 1.3858731985092163, + "learning_rate": 7.103274559193954e-05, + "loss": 1.0389, + "step": 565 + }, + { + "epoch": 0.07179519476021035, + "grad_norm": 1.1922657489776611, + "learning_rate": 7.16624685138539e-05, + "loss": 1.0214, + "step": 570 + }, + { + "epoch": 0.07242497717038764, + "grad_norm": 1.2922885417938232, + "learning_rate": 7.229219143576826e-05, + "loss": 1.0647, + "step": 575 + }, + { + "epoch": 0.07305475958056491, + "grad_norm": 1.174737811088562, + "learning_rate": 7.292191435768262e-05, + "loss": 1.039, + "step": 580 + }, + { + "epoch": 0.0736845419907422, + "grad_norm": 1.4469586610794067, + "learning_rate": 7.355163727959697e-05, + "loss": 0.9553, + "step": 585 + }, + { + "epoch": 0.07431432440091948, + "grad_norm": 1.2497671842575073, + "learning_rate": 7.418136020151133e-05, + "loss": 1.0676, + "step": 590 + }, + { + "epoch": 0.07494410681109677, + "grad_norm": 1.0431822538375854, + "learning_rate": 7.481108312342569e-05, + "loss": 0.9664, + "step": 595 + }, + { + "epoch": 0.07557388922127405, + "grad_norm": 1.1643774509429932, + "learning_rate": 7.544080604534005e-05, + "loss": 1.0354, + "step": 600 + }, + { + "epoch": 0.07620367163145134, + "grad_norm": 1.277813196182251, + "learning_rate": 7.60705289672544e-05, + "loss": 1.0078, + "step": 605 + }, + { + "epoch": 0.07683345404162861, + "grad_norm": 1.17416250705719, + "learning_rate": 7.670025188916876e-05, + "loss": 0.9802, + "step": 610 + }, + { + "epoch": 0.0774632364518059, + "grad_norm": 1.3622347116470337, + "learning_rate": 7.732997481108312e-05, + "loss": 0.9323, + "step": 615 + }, + { + "epoch": 0.07809301886198318, + "grad_norm": 1.1814721822738647, + "learning_rate": 7.795969773299747e-05, + "loss": 0.989, + "step": 620 + }, + { + "epoch": 0.07872280127216047, + "grad_norm": 1.1661953926086426, + "learning_rate": 7.858942065491183e-05, + "loss": 0.9873, + "step": 625 + }, + { + "epoch": 0.07935258368233775, + "grad_norm": 1.1950370073318481, + "learning_rate": 7.921914357682618e-05, + "loss": 0.9997, + "step": 630 + }, + { + "epoch": 0.07998236609251504, + "grad_norm": 1.2635341882705688, + "learning_rate": 7.984886649874056e-05, + "loss": 0.9939, + "step": 635 + }, + { + "epoch": 0.08061214850269231, + "grad_norm": 1.094709038734436, + "learning_rate": 8.047858942065491e-05, + "loss": 1.035, + "step": 640 + }, + { + "epoch": 0.0812419309128696, + "grad_norm": 1.0304499864578247, + "learning_rate": 8.110831234256926e-05, + "loss": 1.0385, + "step": 645 + }, + { + "epoch": 0.0818717133230469, + "grad_norm": 1.203204870223999, + "learning_rate": 8.173803526448362e-05, + "loss": 1.0159, + "step": 650 + }, + { + "epoch": 0.08250149573322417, + "grad_norm": 1.317717432975769, + "learning_rate": 8.236775818639797e-05, + "loss": 0.9972, + "step": 655 + }, + { + "epoch": 0.08313127814340146, + "grad_norm": 1.1352335214614868, + "learning_rate": 8.299748110831233e-05, + "loss": 1.0027, + "step": 660 + }, + { + "epoch": 0.08376106055357874, + "grad_norm": 1.1126285791397095, + "learning_rate": 8.362720403022669e-05, + "loss": 0.9824, + "step": 665 + }, + { + "epoch": 0.08439084296375603, + "grad_norm": 1.1858975887298584, + "learning_rate": 8.425692695214106e-05, + "loss": 1.0084, + "step": 670 + }, + { + "epoch": 0.0850206253739333, + "grad_norm": 1.1154075860977173, + "learning_rate": 8.488664987405541e-05, + "loss": 0.968, + "step": 675 + }, + { + "epoch": 0.0856504077841106, + "grad_norm": 1.1235395669937134, + "learning_rate": 8.551637279596977e-05, + "loss": 0.9828, + "step": 680 + }, + { + "epoch": 0.08628019019428787, + "grad_norm": 1.251595139503479, + "learning_rate": 8.614609571788412e-05, + "loss": 0.9945, + "step": 685 + }, + { + "epoch": 0.08690997260446516, + "grad_norm": 1.2071354389190674, + "learning_rate": 8.677581863979848e-05, + "loss": 1.0263, + "step": 690 + }, + { + "epoch": 0.08753975501464244, + "grad_norm": 1.1954138278961182, + "learning_rate": 8.740554156171283e-05, + "loss": 0.9987, + "step": 695 + }, + { + "epoch": 0.08816953742481973, + "grad_norm": 1.088759422302246, + "learning_rate": 8.803526448362719e-05, + "loss": 1.0323, + "step": 700 + }, + { + "epoch": 0.088799319834997, + "grad_norm": 1.0866519212722778, + "learning_rate": 8.866498740554156e-05, + "loss": 1.0009, + "step": 705 + }, + { + "epoch": 0.0894291022451743, + "grad_norm": 1.1450897455215454, + "learning_rate": 8.929471032745592e-05, + "loss": 1.0493, + "step": 710 + }, + { + "epoch": 0.09005888465535157, + "grad_norm": 1.0619937181472778, + "learning_rate": 8.992443324937027e-05, + "loss": 1.0007, + "step": 715 + }, + { + "epoch": 0.09068866706552886, + "grad_norm": 1.1548367738723755, + "learning_rate": 9.055415617128463e-05, + "loss": 1.006, + "step": 720 + }, + { + "epoch": 0.09131844947570614, + "grad_norm": 1.0847381353378296, + "learning_rate": 9.118387909319898e-05, + "loss": 0.9703, + "step": 725 + }, + { + "epoch": 0.09194823188588343, + "grad_norm": 1.1227048635482788, + "learning_rate": 9.181360201511333e-05, + "loss": 0.957, + "step": 730 + }, + { + "epoch": 0.09257801429606072, + "grad_norm": 1.0912779569625854, + "learning_rate": 9.24433249370277e-05, + "loss": 0.9866, + "step": 735 + }, + { + "epoch": 0.093207796706238, + "grad_norm": 1.0490583181381226, + "learning_rate": 9.307304785894206e-05, + "loss": 1.0605, + "step": 740 + }, + { + "epoch": 0.09383757911641528, + "grad_norm": 1.0930982828140259, + "learning_rate": 9.370277078085642e-05, + "loss": 1.0061, + "step": 745 + }, + { + "epoch": 0.09446736152659256, + "grad_norm": 1.1814302206039429, + "learning_rate": 9.433249370277077e-05, + "loss": 1.0187, + "step": 750 + }, + { + "epoch": 0.09509714393676985, + "grad_norm": 1.113162636756897, + "learning_rate": 9.496221662468513e-05, + "loss": 0.9929, + "step": 755 + }, + { + "epoch": 0.09572692634694713, + "grad_norm": 1.0789388418197632, + "learning_rate": 9.559193954659948e-05, + "loss": 0.9699, + "step": 760 + }, + { + "epoch": 0.09635670875712442, + "grad_norm": 1.0445059537887573, + "learning_rate": 9.622166246851384e-05, + "loss": 0.929, + "step": 765 + }, + { + "epoch": 0.0969864911673017, + "grad_norm": 1.0267964601516724, + "learning_rate": 9.685138539042821e-05, + "loss": 0.9581, + "step": 770 + }, + { + "epoch": 0.09761627357747898, + "grad_norm": 1.0283193588256836, + "learning_rate": 9.748110831234256e-05, + "loss": 1.0331, + "step": 775 + }, + { + "epoch": 0.09824605598765626, + "grad_norm": 1.0210477113723755, + "learning_rate": 9.811083123425692e-05, + "loss": 0.96, + "step": 780 + }, + { + "epoch": 0.09887583839783355, + "grad_norm": 1.0163402557373047, + "learning_rate": 9.874055415617127e-05, + "loss": 0.9923, + "step": 785 + }, + { + "epoch": 0.09950562080801083, + "grad_norm": 1.0012452602386475, + "learning_rate": 9.937027707808563e-05, + "loss": 0.9722, + "step": 790 + }, + { + "epoch": 0.10013540321818812, + "grad_norm": 0.9810453653335571, + "learning_rate": 9.999999999999999e-05, + "loss": 0.991, + "step": 795 + }, + { + "epoch": 0.1007651856283654, + "grad_norm": 1.1151692867279053, + "learning_rate": 0.00010062972292191434, + "loss": 0.9783, + "step": 800 + }, + { + "epoch": 0.10139496803854268, + "grad_norm": 1.1338117122650146, + "learning_rate": 0.00010125944584382871, + "loss": 0.9979, + "step": 805 + }, + { + "epoch": 0.10202475044871996, + "grad_norm": 0.9878106117248535, + "learning_rate": 0.00010188916876574307, + "loss": 0.9068, + "step": 810 + }, + { + "epoch": 0.10265453285889725, + "grad_norm": 1.0334627628326416, + "learning_rate": 0.00010251889168765742, + "loss": 0.9389, + "step": 815 + }, + { + "epoch": 0.10328431526907453, + "grad_norm": 0.9542704224586487, + "learning_rate": 0.00010314861460957178, + "loss": 0.9699, + "step": 820 + }, + { + "epoch": 0.10391409767925182, + "grad_norm": 1.003753423690796, + "learning_rate": 0.00010377833753148613, + "loss": 0.9309, + "step": 825 + }, + { + "epoch": 0.10454388008942911, + "grad_norm": 0.9803423285484314, + "learning_rate": 0.00010440806045340049, + "loss": 0.9711, + "step": 830 + }, + { + "epoch": 0.10517366249960639, + "grad_norm": 0.9765311479568481, + "learning_rate": 0.00010503778337531486, + "loss": 1.0237, + "step": 835 + }, + { + "epoch": 0.10580344490978368, + "grad_norm": 1.035510540008545, + "learning_rate": 0.00010566750629722922, + "loss": 0.9737, + "step": 840 + }, + { + "epoch": 0.10643322731996095, + "grad_norm": 3.4597954750061035, + "learning_rate": 0.00010629722921914357, + "loss": 1.038, + "step": 845 + }, + { + "epoch": 0.10706300973013824, + "grad_norm": 1.0254745483398438, + "learning_rate": 0.00010692695214105793, + "loss": 1.0044, + "step": 850 + }, + { + "epoch": 0.10769279214031552, + "grad_norm": 4.8941521644592285, + "learning_rate": 0.00010755667506297228, + "loss": 1.1038, + "step": 855 + }, + { + "epoch": 0.10832257455049281, + "grad_norm": 1.6676890850067139, + "learning_rate": 0.00010818639798488663, + "loss": 1.0043, + "step": 860 + }, + { + "epoch": 0.10895235696067009, + "grad_norm": 5.738070964813232, + "learning_rate": 0.00010881612090680099, + "loss": 1.0502, + "step": 865 + }, + { + "epoch": 0.10958213937084738, + "grad_norm": 1.1913108825683594, + "learning_rate": 0.00010944584382871536, + "loss": 1.0071, + "step": 870 + }, + { + "epoch": 0.11021192178102465, + "grad_norm": 1.0302019119262695, + "learning_rate": 0.00011007556675062972, + "loss": 0.9732, + "step": 875 + }, + { + "epoch": 0.11084170419120194, + "grad_norm": 0.92161625623703, + "learning_rate": 0.00011070528967254407, + "loss": 0.9414, + "step": 880 + }, + { + "epoch": 0.11147148660137922, + "grad_norm": 0.954598605632782, + "learning_rate": 0.00011133501259445843, + "loss": 0.9772, + "step": 885 + }, + { + "epoch": 0.11210126901155651, + "grad_norm": 0.9241647720336914, + "learning_rate": 0.00011196473551637278, + "loss": 0.9498, + "step": 890 + }, + { + "epoch": 0.11273105142173379, + "grad_norm": 0.9744060039520264, + "learning_rate": 0.00011259445843828714, + "loss": 0.9501, + "step": 895 + }, + { + "epoch": 0.11336083383191108, + "grad_norm": 1.0800458192825317, + "learning_rate": 0.0001132241813602015, + "loss": 0.9823, + "step": 900 + }, + { + "epoch": 0.11399061624208835, + "grad_norm": 1.0275344848632812, + "learning_rate": 0.00011385390428211587, + "loss": 1.0426, + "step": 905 + }, + { + "epoch": 0.11462039865226564, + "grad_norm": 1.0069867372512817, + "learning_rate": 0.00011448362720403022, + "loss": 0.9933, + "step": 910 + }, + { + "epoch": 0.11525018106244292, + "grad_norm": 1.0309741497039795, + "learning_rate": 0.00011511335012594457, + "loss": 0.9792, + "step": 915 + }, + { + "epoch": 0.11587996347262021, + "grad_norm": 0.9738866090774536, + "learning_rate": 0.00011574307304785893, + "loss": 1.0193, + "step": 920 + }, + { + "epoch": 0.1165097458827975, + "grad_norm": 0.9231003522872925, + "learning_rate": 0.00011637279596977329, + "loss": 0.9741, + "step": 925 + }, + { + "epoch": 0.11713952829297478, + "grad_norm": 1.1318124532699585, + "learning_rate": 0.00011700251889168764, + "loss": 0.9644, + "step": 930 + }, + { + "epoch": 0.11776931070315207, + "grad_norm": 1.033288598060608, + "learning_rate": 0.00011763224181360201, + "loss": 0.9216, + "step": 935 + }, + { + "epoch": 0.11839909311332934, + "grad_norm": 1.003190517425537, + "learning_rate": 0.00011826196473551637, + "loss": 0.9521, + "step": 940 + }, + { + "epoch": 0.11902887552350663, + "grad_norm": 1.0145738124847412, + "learning_rate": 0.00011889168765743072, + "loss": 0.9676, + "step": 945 + }, + { + "epoch": 0.11965865793368391, + "grad_norm": 1.1370879411697388, + "learning_rate": 0.00011952141057934508, + "loss": 0.9987, + "step": 950 + }, + { + "epoch": 0.1202884403438612, + "grad_norm": 0.9657129645347595, + "learning_rate": 0.00012015113350125943, + "loss": 0.9622, + "step": 955 + }, + { + "epoch": 0.12091822275403848, + "grad_norm": 0.9489335417747498, + "learning_rate": 0.00012078085642317378, + "loss": 0.9402, + "step": 960 + }, + { + "epoch": 0.12154800516421577, + "grad_norm": 1.0598636865615845, + "learning_rate": 0.00012141057934508814, + "loss": 1.0047, + "step": 965 + }, + { + "epoch": 0.12217778757439304, + "grad_norm": 0.9747732281684875, + "learning_rate": 0.00012204030226700251, + "loss": 1.009, + "step": 970 + }, + { + "epoch": 0.12280756998457033, + "grad_norm": 0.9424954056739807, + "learning_rate": 0.00012267002518891686, + "loss": 0.9603, + "step": 975 + }, + { + "epoch": 0.12343735239474761, + "grad_norm": 1.0061867237091064, + "learning_rate": 0.00012329974811083123, + "loss": 0.9494, + "step": 980 + }, + { + "epoch": 0.1240671348049249, + "grad_norm": 0.924182116985321, + "learning_rate": 0.00012392947103274558, + "loss": 0.9804, + "step": 985 + }, + { + "epoch": 0.12469691721510218, + "grad_norm": 0.983267605304718, + "learning_rate": 0.00012455919395465995, + "loss": 0.9814, + "step": 990 + }, + { + "epoch": 0.12532669962527945, + "grad_norm": 0.896524965763092, + "learning_rate": 0.0001251889168765743, + "loss": 0.9533, + "step": 995 + }, + { + "epoch": 0.12595648203545676, + "grad_norm": 0.8669747710227966, + "learning_rate": 0.00012581863979848864, + "loss": 0.9544, + "step": 1000 + }, + { + "epoch": 0.12595648203545676, + "eval_loss": 0.383962482213974, + "eval_runtime": 6.2938, + "eval_samples_per_second": 158.887, + "eval_steps_per_second": 10.01, + "step": 1000 + }, + { + "epoch": 0.12658626444563403, + "grad_norm": 0.9055171012878418, + "learning_rate": 0.000126448362720403, + "loss": 0.9353, + "step": 1005 + }, + { + "epoch": 0.1272160468558113, + "grad_norm": 0.9889428019523621, + "learning_rate": 0.00012707808564231738, + "loss": 0.9508, + "step": 1010 + }, + { + "epoch": 0.12784582926598861, + "grad_norm": 0.8966602683067322, + "learning_rate": 0.00012770780856423173, + "loss": 0.995, + "step": 1015 + }, + { + "epoch": 0.1284756116761659, + "grad_norm": 0.9995138645172119, + "learning_rate": 0.0001283375314861461, + "loss": 0.9624, + "step": 1020 + }, + { + "epoch": 0.12910539408634317, + "grad_norm": 0.8536145687103271, + "learning_rate": 0.00012896725440806044, + "loss": 0.9549, + "step": 1025 + }, + { + "epoch": 0.12973517649652044, + "grad_norm": 0.8860256671905518, + "learning_rate": 0.00012959697732997479, + "loss": 1.0021, + "step": 1030 + }, + { + "epoch": 0.13036495890669775, + "grad_norm": 0.8574298620223999, + "learning_rate": 0.00013022670025188916, + "loss": 0.9798, + "step": 1035 + }, + { + "epoch": 0.13099474131687502, + "grad_norm": 1.1180200576782227, + "learning_rate": 0.00013085642317380353, + "loss": 0.9225, + "step": 1040 + }, + { + "epoch": 0.1316245237270523, + "grad_norm": 0.9391751289367676, + "learning_rate": 0.00013148614609571787, + "loss": 0.9467, + "step": 1045 + }, + { + "epoch": 0.13225430613722958, + "grad_norm": 0.8861620426177979, + "learning_rate": 0.00013211586901763222, + "loss": 0.9413, + "step": 1050 + }, + { + "epoch": 0.13288408854740688, + "grad_norm": 0.8499036431312561, + "learning_rate": 0.0001327455919395466, + "loss": 0.9644, + "step": 1055 + }, + { + "epoch": 0.13351387095758416, + "grad_norm": 0.9816482067108154, + "learning_rate": 0.00013337531486146094, + "loss": 0.9552, + "step": 1060 + }, + { + "epoch": 0.13414365336776143, + "grad_norm": 0.9725036025047302, + "learning_rate": 0.0001340050377833753, + "loss": 0.9461, + "step": 1065 + }, + { + "epoch": 0.1347734357779387, + "grad_norm": 0.9366094470024109, + "learning_rate": 0.00013463476070528968, + "loss": 0.9305, + "step": 1070 + }, + { + "epoch": 0.13540321818811601, + "grad_norm": 0.9212390780448914, + "learning_rate": 0.00013526448362720402, + "loss": 0.9551, + "step": 1075 + }, + { + "epoch": 0.1360330005982933, + "grad_norm": 0.8980582356452942, + "learning_rate": 0.00013589420654911837, + "loss": 0.9491, + "step": 1080 + }, + { + "epoch": 0.13666278300847057, + "grad_norm": 0.9107893109321594, + "learning_rate": 0.00013652392947103274, + "loss": 0.9366, + "step": 1085 + }, + { + "epoch": 0.13729256541864784, + "grad_norm": 0.8583124876022339, + "learning_rate": 0.00013715365239294708, + "loss": 0.9628, + "step": 1090 + }, + { + "epoch": 0.13792234782882515, + "grad_norm": 0.877052903175354, + "learning_rate": 0.00013778337531486146, + "loss": 0.9675, + "step": 1095 + }, + { + "epoch": 0.13855213023900242, + "grad_norm": 0.8020456433296204, + "learning_rate": 0.0001384130982367758, + "loss": 0.9015, + "step": 1100 + }, + { + "epoch": 0.1391819126491797, + "grad_norm": 0.8703967928886414, + "learning_rate": 0.00013904282115869017, + "loss": 0.9658, + "step": 1105 + }, + { + "epoch": 0.139811695059357, + "grad_norm": 0.7955961227416992, + "learning_rate": 0.00013967254408060452, + "loss": 0.9084, + "step": 1110 + }, + { + "epoch": 0.14044147746953428, + "grad_norm": 0.893059492111206, + "learning_rate": 0.0001403022670025189, + "loss": 0.9591, + "step": 1115 + }, + { + "epoch": 0.14107125987971156, + "grad_norm": 0.8481057286262512, + "learning_rate": 0.00014093198992443323, + "loss": 0.9588, + "step": 1120 + }, + { + "epoch": 0.14170104228988883, + "grad_norm": 0.8342163562774658, + "learning_rate": 0.00014156171284634758, + "loss": 0.947, + "step": 1125 + }, + { + "epoch": 0.14233082470006614, + "grad_norm": 0.790868878364563, + "learning_rate": 0.00014219143576826195, + "loss": 0.9366, + "step": 1130 + }, + { + "epoch": 0.14296060711024342, + "grad_norm": 0.8430061340332031, + "learning_rate": 0.0001428211586901763, + "loss": 0.9014, + "step": 1135 + }, + { + "epoch": 0.1435903895204207, + "grad_norm": 0.9150258302688599, + "learning_rate": 0.00014345088161209067, + "loss": 0.9546, + "step": 1140 + }, + { + "epoch": 0.14422017193059797, + "grad_norm": 0.8204888105392456, + "learning_rate": 0.00014408060453400504, + "loss": 0.9159, + "step": 1145 + }, + { + "epoch": 0.14484995434077527, + "grad_norm": 0.7595349550247192, + "learning_rate": 0.00014471032745591938, + "loss": 0.9179, + "step": 1150 + }, + { + "epoch": 0.14547973675095255, + "grad_norm": 0.8642888069152832, + "learning_rate": 0.00014534005037783373, + "loss": 1.0338, + "step": 1155 + }, + { + "epoch": 0.14610951916112982, + "grad_norm": 0.9633650183677673, + "learning_rate": 0.0001459697732997481, + "loss": 0.9638, + "step": 1160 + }, + { + "epoch": 0.1467393015713071, + "grad_norm": 0.8363626599311829, + "learning_rate": 0.00014659949622166244, + "loss": 0.8828, + "step": 1165 + }, + { + "epoch": 0.1473690839814844, + "grad_norm": 0.8199290633201599, + "learning_rate": 0.00014722921914357682, + "loss": 0.9577, + "step": 1170 + }, + { + "epoch": 0.14799886639166168, + "grad_norm": 0.7671203017234802, + "learning_rate": 0.0001478589420654912, + "loss": 0.9381, + "step": 1175 + }, + { + "epoch": 0.14862864880183896, + "grad_norm": 0.8354636430740356, + "learning_rate": 0.00014848866498740553, + "loss": 1.0019, + "step": 1180 + }, + { + "epoch": 0.14925843121201623, + "grad_norm": 0.911165714263916, + "learning_rate": 0.00014911838790931988, + "loss": 0.8985, + "step": 1185 + }, + { + "epoch": 0.14988821362219354, + "grad_norm": 0.8125472664833069, + "learning_rate": 0.00014974811083123425, + "loss": 0.9628, + "step": 1190 + }, + { + "epoch": 0.15051799603237082, + "grad_norm": 0.8937430381774902, + "learning_rate": 0.00015037783375314862, + "loss": 0.9843, + "step": 1195 + }, + { + "epoch": 0.1511477784425481, + "grad_norm": 0.9609346985816956, + "learning_rate": 0.00015100755667506297, + "loss": 0.9552, + "step": 1200 + }, + { + "epoch": 0.1517775608527254, + "grad_norm": 0.7975132465362549, + "learning_rate": 0.0001516372795969773, + "loss": 0.9799, + "step": 1205 + }, + { + "epoch": 0.15240734326290267, + "grad_norm": 0.8690225481987, + "learning_rate": 0.00015226700251889168, + "loss": 0.9604, + "step": 1210 + }, + { + "epoch": 0.15303712567307995, + "grad_norm": 0.7486653923988342, + "learning_rate": 0.00015289672544080603, + "loss": 0.9022, + "step": 1215 + }, + { + "epoch": 0.15366690808325723, + "grad_norm": 0.8420302271842957, + "learning_rate": 0.0001535264483627204, + "loss": 0.8791, + "step": 1220 + }, + { + "epoch": 0.15429669049343453, + "grad_norm": 0.8187466263771057, + "learning_rate": 0.00015415617128463474, + "loss": 0.9332, + "step": 1225 + }, + { + "epoch": 0.1549264729036118, + "grad_norm": 0.8711130619049072, + "learning_rate": 0.0001547858942065491, + "loss": 0.8924, + "step": 1230 + }, + { + "epoch": 0.15555625531378908, + "grad_norm": 0.8086002469062805, + "learning_rate": 0.00015541561712846346, + "loss": 0.9491, + "step": 1235 + }, + { + "epoch": 0.15618603772396636, + "grad_norm": 0.8274957537651062, + "learning_rate": 0.0001560453400503778, + "loss": 0.9392, + "step": 1240 + }, + { + "epoch": 0.15681582013414366, + "grad_norm": 0.831676721572876, + "learning_rate": 0.00015667506297229218, + "loss": 1.0327, + "step": 1245 + }, + { + "epoch": 0.15744560254432094, + "grad_norm": 0.8806201219558716, + "learning_rate": 0.00015730478589420652, + "loss": 0.8607, + "step": 1250 + }, + { + "epoch": 0.15807538495449822, + "grad_norm": 0.905436635017395, + "learning_rate": 0.00015793450881612092, + "loss": 0.9301, + "step": 1255 + }, + { + "epoch": 0.1587051673646755, + "grad_norm": 0.8631262183189392, + "learning_rate": 0.00015856423173803526, + "loss": 0.9443, + "step": 1260 + }, + { + "epoch": 0.1593349497748528, + "grad_norm": 0.7483521699905396, + "learning_rate": 0.0001591939546599496, + "loss": 0.901, + "step": 1265 + }, + { + "epoch": 0.15996473218503007, + "grad_norm": 0.8273198008537292, + "learning_rate": 0.00015982367758186398, + "loss": 0.9608, + "step": 1270 + }, + { + "epoch": 0.16059451459520735, + "grad_norm": 0.7562909722328186, + "learning_rate": 0.00016045340050377832, + "loss": 0.9219, + "step": 1275 + }, + { + "epoch": 0.16122429700538463, + "grad_norm": 0.8585835099220276, + "learning_rate": 0.0001610831234256927, + "loss": 0.942, + "step": 1280 + }, + { + "epoch": 0.16185407941556193, + "grad_norm": 0.8192921876907349, + "learning_rate": 0.00016171284634760704, + "loss": 0.9531, + "step": 1285 + }, + { + "epoch": 0.1624838618257392, + "grad_norm": 0.8301946520805359, + "learning_rate": 0.00016234256926952139, + "loss": 0.8972, + "step": 1290 + }, + { + "epoch": 0.16311364423591648, + "grad_norm": 0.8291681408882141, + "learning_rate": 0.00016297229219143576, + "loss": 0.9653, + "step": 1295 + }, + { + "epoch": 0.1637434266460938, + "grad_norm": 0.8672564625740051, + "learning_rate": 0.0001636020151133501, + "loss": 0.9498, + "step": 1300 + }, + { + "epoch": 0.16437320905627106, + "grad_norm": 0.7432397603988647, + "learning_rate": 0.00016423173803526445, + "loss": 0.8782, + "step": 1305 + }, + { + "epoch": 0.16500299146644834, + "grad_norm": 0.7710584402084351, + "learning_rate": 0.00016486146095717882, + "loss": 0.8872, + "step": 1310 + }, + { + "epoch": 0.16563277387662562, + "grad_norm": 0.7810630798339844, + "learning_rate": 0.00016549118387909316, + "loss": 0.9357, + "step": 1315 + }, + { + "epoch": 0.16626255628680292, + "grad_norm": 0.7368482947349548, + "learning_rate": 0.00016612090680100756, + "loss": 0.8935, + "step": 1320 + }, + { + "epoch": 0.1668923386969802, + "grad_norm": 0.7725487947463989, + "learning_rate": 0.0001667506297229219, + "loss": 0.9241, + "step": 1325 + }, + { + "epoch": 0.16752212110715747, + "grad_norm": 0.7551338076591492, + "learning_rate": 0.00016738035264483628, + "loss": 0.8878, + "step": 1330 + }, + { + "epoch": 0.16815190351733475, + "grad_norm": 0.8027164340019226, + "learning_rate": 0.00016801007556675062, + "loss": 0.9149, + "step": 1335 + }, + { + "epoch": 0.16878168592751205, + "grad_norm": 0.7476945519447327, + "learning_rate": 0.00016863979848866497, + "loss": 0.9241, + "step": 1340 + }, + { + "epoch": 0.16941146833768933, + "grad_norm": 0.7967312335968018, + "learning_rate": 0.00016926952141057934, + "loss": 0.932, + "step": 1345 + }, + { + "epoch": 0.1700412507478666, + "grad_norm": 0.809727668762207, + "learning_rate": 0.00016989924433249368, + "loss": 0.922, + "step": 1350 + }, + { + "epoch": 0.17067103315804388, + "grad_norm": 0.7631811499595642, + "learning_rate": 0.00017052896725440806, + "loss": 0.94, + "step": 1355 + }, + { + "epoch": 0.1713008155682212, + "grad_norm": 0.6545524001121521, + "learning_rate": 0.0001711586901763224, + "loss": 0.8898, + "step": 1360 + }, + { + "epoch": 0.17193059797839846, + "grad_norm": 0.8232229351997375, + "learning_rate": 0.00017178841309823675, + "loss": 0.9235, + "step": 1365 + }, + { + "epoch": 0.17256038038857574, + "grad_norm": 0.8617391586303711, + "learning_rate": 0.00017241813602015112, + "loss": 0.9491, + "step": 1370 + }, + { + "epoch": 0.17319016279875304, + "grad_norm": 0.7971004247665405, + "learning_rate": 0.00017304785894206546, + "loss": 0.8749, + "step": 1375 + }, + { + "epoch": 0.17381994520893032, + "grad_norm": 0.7876558899879456, + "learning_rate": 0.0001736775818639798, + "loss": 0.9954, + "step": 1380 + }, + { + "epoch": 0.1744497276191076, + "grad_norm": 0.8051108121871948, + "learning_rate": 0.00017430730478589418, + "loss": 0.897, + "step": 1385 + }, + { + "epoch": 0.17507951002928487, + "grad_norm": 0.8449770212173462, + "learning_rate": 0.00017493702770780855, + "loss": 0.8881, + "step": 1390 + }, + { + "epoch": 0.17570929243946218, + "grad_norm": 0.8217072486877441, + "learning_rate": 0.00017556675062972292, + "loss": 0.9027, + "step": 1395 + }, + { + "epoch": 0.17633907484963945, + "grad_norm": 0.806914210319519, + "learning_rate": 0.00017619647355163727, + "loss": 0.9222, + "step": 1400 + }, + { + "epoch": 0.17696885725981673, + "grad_norm": 0.8344951868057251, + "learning_rate": 0.00017682619647355164, + "loss": 0.9462, + "step": 1405 + }, + { + "epoch": 0.177598639669994, + "grad_norm": 0.7249205112457275, + "learning_rate": 0.00017745591939546598, + "loss": 0.917, + "step": 1410 + }, + { + "epoch": 0.1782284220801713, + "grad_norm": 0.8052341341972351, + "learning_rate": 0.00017808564231738033, + "loss": 0.9168, + "step": 1415 + }, + { + "epoch": 0.1788582044903486, + "grad_norm": 0.7675748467445374, + "learning_rate": 0.0001787153652392947, + "loss": 0.9186, + "step": 1420 + }, + { + "epoch": 0.17948798690052586, + "grad_norm": 0.7672801613807678, + "learning_rate": 0.00017934508816120904, + "loss": 0.8637, + "step": 1425 + }, + { + "epoch": 0.18011776931070314, + "grad_norm": 0.7517289519309998, + "learning_rate": 0.00017997481108312342, + "loss": 0.9053, + "step": 1430 + }, + { + "epoch": 0.18074755172088044, + "grad_norm": 0.7253280878067017, + "learning_rate": 0.00018060453400503776, + "loss": 0.9047, + "step": 1435 + }, + { + "epoch": 0.18137733413105772, + "grad_norm": 0.7113356590270996, + "learning_rate": 0.0001812342569269521, + "loss": 0.9288, + "step": 1440 + }, + { + "epoch": 0.182007116541235, + "grad_norm": 0.6626010537147522, + "learning_rate": 0.00018186397984886648, + "loss": 0.8942, + "step": 1445 + }, + { + "epoch": 0.18263689895141227, + "grad_norm": 0.7033849358558655, + "learning_rate": 0.00018249370277078082, + "loss": 0.9086, + "step": 1450 + }, + { + "epoch": 0.18326668136158958, + "grad_norm": 0.701263427734375, + "learning_rate": 0.00018312342569269522, + "loss": 0.9429, + "step": 1455 + }, + { + "epoch": 0.18389646377176685, + "grad_norm": 0.7362795472145081, + "learning_rate": 0.00018375314861460957, + "loss": 0.8955, + "step": 1460 + }, + { + "epoch": 0.18452624618194413, + "grad_norm": 0.7902641296386719, + "learning_rate": 0.00018438287153652394, + "loss": 0.8535, + "step": 1465 + }, + { + "epoch": 0.18515602859212144, + "grad_norm": 0.6855788230895996, + "learning_rate": 0.00018501259445843828, + "loss": 0.9166, + "step": 1470 + }, + { + "epoch": 0.1857858110022987, + "grad_norm": 0.6782147884368896, + "learning_rate": 0.00018564231738035263, + "loss": 0.8755, + "step": 1475 + }, + { + "epoch": 0.186415593412476, + "grad_norm": 0.6875694990158081, + "learning_rate": 0.000186272040302267, + "loss": 0.8712, + "step": 1480 + }, + { + "epoch": 0.18704537582265326, + "grad_norm": 0.7253673672676086, + "learning_rate": 0.00018690176322418134, + "loss": 0.8933, + "step": 1485 + }, + { + "epoch": 0.18767515823283057, + "grad_norm": 0.8096954822540283, + "learning_rate": 0.0001875314861460957, + "loss": 0.9359, + "step": 1490 + }, + { + "epoch": 0.18830494064300785, + "grad_norm": 0.7597787380218506, + "learning_rate": 0.00018816120906801006, + "loss": 0.9341, + "step": 1495 + }, + { + "epoch": 0.18893472305318512, + "grad_norm": 0.7736676931381226, + "learning_rate": 0.0001887909319899244, + "loss": 0.9281, + "step": 1500 + }, + { + "epoch": 0.1895645054633624, + "grad_norm": 0.6343753337860107, + "learning_rate": 0.00018942065491183878, + "loss": 0.9177, + "step": 1505 + }, + { + "epoch": 0.1901942878735397, + "grad_norm": 0.728712260723114, + "learning_rate": 0.00019005037783375312, + "loss": 0.9371, + "step": 1510 + }, + { + "epoch": 0.19082407028371698, + "grad_norm": 0.7092194557189941, + "learning_rate": 0.00019068010075566746, + "loss": 0.8902, + "step": 1515 + }, + { + "epoch": 0.19145385269389426, + "grad_norm": 0.7485836744308472, + "learning_rate": 0.00019130982367758186, + "loss": 0.8931, + "step": 1520 + }, + { + "epoch": 0.19208363510407153, + "grad_norm": 0.7485086917877197, + "learning_rate": 0.0001919395465994962, + "loss": 0.9368, + "step": 1525 + }, + { + "epoch": 0.19271341751424884, + "grad_norm": 0.7100546360015869, + "learning_rate": 0.00019256926952141058, + "loss": 0.8803, + "step": 1530 + }, + { + "epoch": 0.1933431999244261, + "grad_norm": 0.7371817827224731, + "learning_rate": 0.00019319899244332492, + "loss": 0.8963, + "step": 1535 + }, + { + "epoch": 0.1939729823346034, + "grad_norm": 0.6849647164344788, + "learning_rate": 0.0001938287153652393, + "loss": 0.9137, + "step": 1540 + }, + { + "epoch": 0.19460276474478067, + "grad_norm": 0.7108625173568726, + "learning_rate": 0.00019445843828715364, + "loss": 0.9078, + "step": 1545 + }, + { + "epoch": 0.19523254715495797, + "grad_norm": 0.7581806182861328, + "learning_rate": 0.00019508816120906799, + "loss": 0.9002, + "step": 1550 + }, + { + "epoch": 0.19586232956513525, + "grad_norm": 0.7299503087997437, + "learning_rate": 0.00019571788413098236, + "loss": 0.8897, + "step": 1555 + }, + { + "epoch": 0.19649211197531252, + "grad_norm": 0.7815247774124146, + "learning_rate": 0.0001963476070528967, + "loss": 0.8454, + "step": 1560 + }, + { + "epoch": 0.19712189438548983, + "grad_norm": 0.7475869655609131, + "learning_rate": 0.00019697732997481105, + "loss": 0.9482, + "step": 1565 + }, + { + "epoch": 0.1977516767956671, + "grad_norm": 0.7469599843025208, + "learning_rate": 0.00019760705289672542, + "loss": 0.9048, + "step": 1570 + }, + { + "epoch": 0.19838145920584438, + "grad_norm": 0.6186767220497131, + "learning_rate": 0.00019823677581863976, + "loss": 0.8698, + "step": 1575 + }, + { + "epoch": 0.19901124161602166, + "grad_norm": 0.843999445438385, + "learning_rate": 0.00019886649874055413, + "loss": 0.9567, + "step": 1580 + }, + { + "epoch": 0.19964102402619896, + "grad_norm": 0.749344527721405, + "learning_rate": 0.00019949622166246848, + "loss": 0.9234, + "step": 1585 + }, + { + "epoch": 0.20027080643637624, + "grad_norm": 0.6822441220283508, + "learning_rate": 0.00020012594458438288, + "loss": 0.8915, + "step": 1590 + }, + { + "epoch": 0.2009005888465535, + "grad_norm": 0.7193272113800049, + "learning_rate": 0.00020075566750629722, + "loss": 0.8922, + "step": 1595 + }, + { + "epoch": 0.2015303712567308, + "grad_norm": 0.7202250361442566, + "learning_rate": 0.00020138539042821157, + "loss": 0.9026, + "step": 1600 + }, + { + "epoch": 0.2021601536669081, + "grad_norm": 0.6946163773536682, + "learning_rate": 0.00020201511335012594, + "loss": 0.9181, + "step": 1605 + }, + { + "epoch": 0.20278993607708537, + "grad_norm": 0.7185525894165039, + "learning_rate": 0.00020264483627204028, + "loss": 0.8809, + "step": 1610 + }, + { + "epoch": 0.20341971848726265, + "grad_norm": 0.6290002465248108, + "learning_rate": 0.00020327455919395466, + "loss": 0.9033, + "step": 1615 + }, + { + "epoch": 0.20404950089743992, + "grad_norm": 0.6773431897163391, + "learning_rate": 0.000203904282115869, + "loss": 0.838, + "step": 1620 + }, + { + "epoch": 0.20467928330761723, + "grad_norm": 0.7076095342636108, + "learning_rate": 0.00020453400503778335, + "loss": 0.9158, + "step": 1625 + }, + { + "epoch": 0.2053090657177945, + "grad_norm": 0.7354462146759033, + "learning_rate": 0.00020516372795969772, + "loss": 0.8336, + "step": 1630 + }, + { + "epoch": 0.20593884812797178, + "grad_norm": 0.6885705590248108, + "learning_rate": 0.00020579345088161206, + "loss": 0.8971, + "step": 1635 + }, + { + "epoch": 0.20656863053814906, + "grad_norm": 0.697887659072876, + "learning_rate": 0.00020642317380352643, + "loss": 0.851, + "step": 1640 + }, + { + "epoch": 0.20719841294832636, + "grad_norm": 0.7369652986526489, + "learning_rate": 0.00020705289672544078, + "loss": 0.8567, + "step": 1645 + }, + { + "epoch": 0.20782819535850364, + "grad_norm": 0.7226613759994507, + "learning_rate": 0.00020768261964735512, + "loss": 0.9038, + "step": 1650 + }, + { + "epoch": 0.2084579777686809, + "grad_norm": 0.6973157525062561, + "learning_rate": 0.00020831234256926952, + "loss": 0.8443, + "step": 1655 + }, + { + "epoch": 0.20908776017885822, + "grad_norm": 0.7276191115379333, + "learning_rate": 0.00020894206549118387, + "loss": 0.8985, + "step": 1660 + }, + { + "epoch": 0.2097175425890355, + "grad_norm": 0.694542646408081, + "learning_rate": 0.00020957178841309824, + "loss": 0.8914, + "step": 1665 + }, + { + "epoch": 0.21034732499921277, + "grad_norm": 0.8255221843719482, + "learning_rate": 0.00021020151133501258, + "loss": 0.9072, + "step": 1670 + }, + { + "epoch": 0.21097710740939005, + "grad_norm": 0.637487530708313, + "learning_rate": 0.00021083123425692693, + "loss": 0.8637, + "step": 1675 + }, + { + "epoch": 0.21160688981956735, + "grad_norm": 0.6839597821235657, + "learning_rate": 0.0002114609571788413, + "loss": 0.8736, + "step": 1680 + }, + { + "epoch": 0.21223667222974463, + "grad_norm": 0.6435440182685852, + "learning_rate": 0.00021209068010075564, + "loss": 0.8725, + "step": 1685 + }, + { + "epoch": 0.2128664546399219, + "grad_norm": 0.7100492715835571, + "learning_rate": 0.00021272040302267002, + "loss": 0.9169, + "step": 1690 + }, + { + "epoch": 0.21349623705009918, + "grad_norm": 0.6926056742668152, + "learning_rate": 0.00021335012594458436, + "loss": 0.8549, + "step": 1695 + }, + { + "epoch": 0.21412601946027648, + "grad_norm": 0.8507684469223022, + "learning_rate": 0.0002139798488664987, + "loss": 0.9011, + "step": 1700 + }, + { + "epoch": 0.21475580187045376, + "grad_norm": 0.7276325821876526, + "learning_rate": 0.00021460957178841308, + "loss": 0.8607, + "step": 1705 + }, + { + "epoch": 0.21538558428063104, + "grad_norm": 0.6535823941230774, + "learning_rate": 0.00021523929471032742, + "loss": 0.8558, + "step": 1710 + }, + { + "epoch": 0.2160153666908083, + "grad_norm": 0.6517070531845093, + "learning_rate": 0.0002158690176322418, + "loss": 0.8703, + "step": 1715 + }, + { + "epoch": 0.21664514910098562, + "grad_norm": 0.7442309260368347, + "learning_rate": 0.00021649874055415614, + "loss": 0.8961, + "step": 1720 + }, + { + "epoch": 0.2172749315111629, + "grad_norm": 0.7261196374893188, + "learning_rate": 0.00021712846347607054, + "loss": 0.8902, + "step": 1725 + }, + { + "epoch": 0.21790471392134017, + "grad_norm": 0.7019686698913574, + "learning_rate": 0.00021775818639798488, + "loss": 0.8929, + "step": 1730 + }, + { + "epoch": 0.21853449633151745, + "grad_norm": 0.7852956056594849, + "learning_rate": 0.00021838790931989923, + "loss": 0.8766, + "step": 1735 + }, + { + "epoch": 0.21916427874169475, + "grad_norm": 0.7370544672012329, + "learning_rate": 0.0002190176322418136, + "loss": 0.855, + "step": 1740 + }, + { + "epoch": 0.21979406115187203, + "grad_norm": 0.6246267557144165, + "learning_rate": 0.00021964735516372794, + "loss": 0.9127, + "step": 1745 + }, + { + "epoch": 0.2204238435620493, + "grad_norm": 0.6939797401428223, + "learning_rate": 0.0002202770780856423, + "loss": 0.8878, + "step": 1750 + }, + { + "epoch": 0.2210536259722266, + "grad_norm": 0.6594600081443787, + "learning_rate": 0.00022090680100755666, + "loss": 0.9105, + "step": 1755 + }, + { + "epoch": 0.22168340838240388, + "grad_norm": 0.6578107476234436, + "learning_rate": 0.000221536523929471, + "loss": 0.9016, + "step": 1760 + }, + { + "epoch": 0.22231319079258116, + "grad_norm": 0.6889748573303223, + "learning_rate": 0.00022216624685138538, + "loss": 0.9091, + "step": 1765 + }, + { + "epoch": 0.22294297320275844, + "grad_norm": 0.6207224130630493, + "learning_rate": 0.00022279596977329972, + "loss": 0.9058, + "step": 1770 + }, + { + "epoch": 0.22357275561293574, + "grad_norm": 0.6724773645401001, + "learning_rate": 0.00022342569269521406, + "loss": 0.9144, + "step": 1775 + }, + { + "epoch": 0.22420253802311302, + "grad_norm": 0.702472448348999, + "learning_rate": 0.00022405541561712844, + "loss": 0.9, + "step": 1780 + }, + { + "epoch": 0.2248323204332903, + "grad_norm": 0.6482950448989868, + "learning_rate": 0.00022468513853904278, + "loss": 0.88, + "step": 1785 + }, + { + "epoch": 0.22546210284346757, + "grad_norm": 0.7253268957138062, + "learning_rate": 0.00022531486146095718, + "loss": 0.9147, + "step": 1790 + }, + { + "epoch": 0.22609188525364488, + "grad_norm": 0.7196680307388306, + "learning_rate": 0.00022594458438287152, + "loss": 0.8687, + "step": 1795 + }, + { + "epoch": 0.22672166766382215, + "grad_norm": 0.6720924973487854, + "learning_rate": 0.0002265743073047859, + "loss": 0.9173, + "step": 1800 + }, + { + "epoch": 0.22735145007399943, + "grad_norm": 0.6656882762908936, + "learning_rate": 0.00022720403022670024, + "loss": 0.8237, + "step": 1805 + }, + { + "epoch": 0.2279812324841767, + "grad_norm": 0.6303510665893555, + "learning_rate": 0.00022783375314861459, + "loss": 0.891, + "step": 1810 + }, + { + "epoch": 0.228611014894354, + "grad_norm": 0.6595205068588257, + "learning_rate": 0.00022846347607052896, + "loss": 0.8745, + "step": 1815 + }, + { + "epoch": 0.22924079730453129, + "grad_norm": 0.6373685002326965, + "learning_rate": 0.0002290931989924433, + "loss": 0.895, + "step": 1820 + }, + { + "epoch": 0.22987057971470856, + "grad_norm": 0.6187670230865479, + "learning_rate": 0.00022972292191435767, + "loss": 0.8954, + "step": 1825 + }, + { + "epoch": 0.23050036212488584, + "grad_norm": 0.6348496079444885, + "learning_rate": 0.00023035264483627202, + "loss": 0.8462, + "step": 1830 + }, + { + "epoch": 0.23113014453506314, + "grad_norm": 0.6880120038986206, + "learning_rate": 0.00023098236775818636, + "loss": 0.883, + "step": 1835 + }, + { + "epoch": 0.23175992694524042, + "grad_norm": 0.7668615579605103, + "learning_rate": 0.00023161209068010073, + "loss": 0.9134, + "step": 1840 + }, + { + "epoch": 0.2323897093554177, + "grad_norm": 0.6664952635765076, + "learning_rate": 0.00023224181360201508, + "loss": 0.9276, + "step": 1845 + }, + { + "epoch": 0.233019491765595, + "grad_norm": 0.754509449005127, + "learning_rate": 0.00023287153652392942, + "loss": 0.858, + "step": 1850 + }, + { + "epoch": 0.23364927417577228, + "grad_norm": 0.6345789432525635, + "learning_rate": 0.00023350125944584382, + "loss": 0.9048, + "step": 1855 + }, + { + "epoch": 0.23427905658594955, + "grad_norm": 0.6877152323722839, + "learning_rate": 0.00023413098236775817, + "loss": 0.9023, + "step": 1860 + }, + { + "epoch": 0.23490883899612683, + "grad_norm": 0.6173678636550903, + "learning_rate": 0.00023476070528967254, + "loss": 0.8951, + "step": 1865 + }, + { + "epoch": 0.23553862140630413, + "grad_norm": 0.6912857294082642, + "learning_rate": 0.00023539042821158688, + "loss": 0.8412, + "step": 1870 + }, + { + "epoch": 0.2361684038164814, + "grad_norm": 0.6385686993598938, + "learning_rate": 0.00023602015113350126, + "loss": 0.8954, + "step": 1875 + }, + { + "epoch": 0.23679818622665869, + "grad_norm": 0.6755088567733765, + "learning_rate": 0.0002366498740554156, + "loss": 0.8964, + "step": 1880 + }, + { + "epoch": 0.23742796863683596, + "grad_norm": 0.6391545534133911, + "learning_rate": 0.00023727959697732995, + "loss": 0.9294, + "step": 1885 + }, + { + "epoch": 0.23805775104701327, + "grad_norm": 0.7155817747116089, + "learning_rate": 0.00023790931989924432, + "loss": 0.8967, + "step": 1890 + }, + { + "epoch": 0.23868753345719054, + "grad_norm": 0.681224524974823, + "learning_rate": 0.00023853904282115866, + "loss": 0.8997, + "step": 1895 + }, + { + "epoch": 0.23931731586736782, + "grad_norm": 0.6473144888877869, + "learning_rate": 0.00023916876574307303, + "loss": 0.9172, + "step": 1900 + }, + { + "epoch": 0.2399470982775451, + "grad_norm": 0.6562004685401917, + "learning_rate": 0.00023979848866498738, + "loss": 0.8488, + "step": 1905 + }, + { + "epoch": 0.2405768806877224, + "grad_norm": 0.6842007637023926, + "learning_rate": 0.00024042821158690172, + "loss": 0.9183, + "step": 1910 + }, + { + "epoch": 0.24120666309789968, + "grad_norm": 0.5957079529762268, + "learning_rate": 0.0002410579345088161, + "loss": 0.8293, + "step": 1915 + }, + { + "epoch": 0.24183644550807695, + "grad_norm": 0.6745590567588806, + "learning_rate": 0.00024168765743073044, + "loss": 0.8219, + "step": 1920 + }, + { + "epoch": 0.24246622791825426, + "grad_norm": 0.6895525455474854, + "learning_rate": 0.00024231738035264484, + "loss": 0.9034, + "step": 1925 + }, + { + "epoch": 0.24309601032843153, + "grad_norm": 0.7394620776176453, + "learning_rate": 0.00024294710327455918, + "loss": 0.8702, + "step": 1930 + }, + { + "epoch": 0.2437257927386088, + "grad_norm": 0.7846884727478027, + "learning_rate": 0.00024357682619647353, + "loss": 0.9143, + "step": 1935 + }, + { + "epoch": 0.24435557514878609, + "grad_norm": 0.594127893447876, + "learning_rate": 0.00024420654911838787, + "loss": 0.8838, + "step": 1940 + }, + { + "epoch": 0.2449853575589634, + "grad_norm": 0.6737518906593323, + "learning_rate": 0.00024483627204030224, + "loss": 0.8657, + "step": 1945 + }, + { + "epoch": 0.24561513996914067, + "grad_norm": 0.6851866245269775, + "learning_rate": 0.0002454659949622166, + "loss": 0.9133, + "step": 1950 + }, + { + "epoch": 0.24624492237931794, + "grad_norm": 0.6238758563995361, + "learning_rate": 0.000246095717884131, + "loss": 0.8816, + "step": 1955 + }, + { + "epoch": 0.24687470478949522, + "grad_norm": 0.6002854704856873, + "learning_rate": 0.0002467254408060453, + "loss": 0.8888, + "step": 1960 + }, + { + "epoch": 0.24750448719967252, + "grad_norm": 0.6201847791671753, + "learning_rate": 0.0002473551637279597, + "loss": 0.8299, + "step": 1965 + }, + { + "epoch": 0.2481342696098498, + "grad_norm": 0.6619172692298889, + "learning_rate": 0.00024798488664987405, + "loss": 0.9297, + "step": 1970 + }, + { + "epoch": 0.24876405202002708, + "grad_norm": 0.6359203457832336, + "learning_rate": 0.00024861460957178837, + "loss": 0.8811, + "step": 1975 + }, + { + "epoch": 0.24939383443020435, + "grad_norm": 0.6441104412078857, + "learning_rate": 0.00024924433249370274, + "loss": 0.8704, + "step": 1980 + }, + { + "epoch": 0.25002361684038166, + "grad_norm": 0.7083386778831482, + "learning_rate": 0.0002498740554156171, + "loss": 0.8877, + "step": 1985 + }, + { + "epoch": 0.2506533992505589, + "grad_norm": 0.642206072807312, + "learning_rate": 0.0002505037783375315, + "loss": 0.8661, + "step": 1990 + }, + { + "epoch": 0.2512831816607362, + "grad_norm": 0.6782190203666687, + "learning_rate": 0.00025113350125944585, + "loss": 0.901, + "step": 1995 + }, + { + "epoch": 0.2519129640709135, + "grad_norm": 0.6277428269386292, + "learning_rate": 0.00025176322418136017, + "loss": 0.8212, + "step": 2000 + }, + { + "epoch": 0.2519129640709135, + "eval_loss": 0.35735848546028137, + "eval_runtime": 6.2326, + "eval_samples_per_second": 160.447, + "eval_steps_per_second": 10.108, + "step": 2000 + }, + { + "epoch": 0.25254274648109076, + "grad_norm": 0.5980456471443176, + "learning_rate": 0.00025239294710327454, + "loss": 0.8808, + "step": 2005 + }, + { + "epoch": 0.25317252889126807, + "grad_norm": 0.6398759484291077, + "learning_rate": 0.0002530226700251889, + "loss": 0.8817, + "step": 2010 + }, + { + "epoch": 0.25380231130144537, + "grad_norm": 0.5681187510490417, + "learning_rate": 0.00025365239294710323, + "loss": 0.8672, + "step": 2015 + }, + { + "epoch": 0.2544320937116226, + "grad_norm": 0.6202912926673889, + "learning_rate": 0.0002542821158690176, + "loss": 0.8627, + "step": 2020 + }, + { + "epoch": 0.2550618761217999, + "grad_norm": 0.5921783447265625, + "learning_rate": 0.000254911838790932, + "loss": 0.8214, + "step": 2025 + }, + { + "epoch": 0.25569165853197723, + "grad_norm": 0.629782497882843, + "learning_rate": 0.00025554156171284635, + "loss": 0.8995, + "step": 2030 + }, + { + "epoch": 0.2563214409421545, + "grad_norm": 0.6545585989952087, + "learning_rate": 0.00025617128463476066, + "loss": 0.8422, + "step": 2035 + }, + { + "epoch": 0.2569512233523318, + "grad_norm": 0.6024030447006226, + "learning_rate": 0.00025680100755667504, + "loss": 0.8341, + "step": 2040 + }, + { + "epoch": 0.25758100576250903, + "grad_norm": 0.6795976161956787, + "learning_rate": 0.0002574307304785894, + "loss": 0.852, + "step": 2045 + }, + { + "epoch": 0.25821078817268633, + "grad_norm": 0.6465495228767395, + "learning_rate": 0.0002580604534005037, + "loss": 0.8514, + "step": 2050 + }, + { + "epoch": 0.25884057058286364, + "grad_norm": 0.6498434543609619, + "learning_rate": 0.0002586901763224181, + "loss": 0.8906, + "step": 2055 + }, + { + "epoch": 0.2594703529930409, + "grad_norm": 0.7072421908378601, + "learning_rate": 0.00025931989924433247, + "loss": 0.9061, + "step": 2060 + }, + { + "epoch": 0.2601001354032182, + "grad_norm": 0.5902896523475647, + "learning_rate": 0.00025994962216624684, + "loss": 0.8327, + "step": 2065 + }, + { + "epoch": 0.2607299178133955, + "grad_norm": 0.6410335302352905, + "learning_rate": 0.0002605793450881612, + "loss": 0.9002, + "step": 2070 + }, + { + "epoch": 0.26135970022357274, + "grad_norm": 1.628951072692871, + "learning_rate": 0.00026120906801007553, + "loss": 0.8944, + "step": 2075 + }, + { + "epoch": 0.26198948263375005, + "grad_norm": 0.6544843316078186, + "learning_rate": 0.0002618387909319899, + "loss": 0.8656, + "step": 2080 + }, + { + "epoch": 0.2626192650439273, + "grad_norm": 15.444189071655273, + "learning_rate": 0.0002624685138539043, + "loss": 0.9639, + "step": 2085 + }, + { + "epoch": 0.2632490474541046, + "grad_norm": 8.399425506591797, + "learning_rate": 0.0002630982367758186, + "loss": 1.1367, + "step": 2090 + }, + { + "epoch": 0.2638788298642819, + "grad_norm": 24.009044647216797, + "learning_rate": 0.00026372795969773296, + "loss": 1.1429, + "step": 2095 + }, + { + "epoch": 0.26450861227445915, + "grad_norm": 4.226770877838135, + "learning_rate": 0.00026435768261964733, + "loss": 0.974, + "step": 2100 + }, + { + "epoch": 0.26513839468463646, + "grad_norm": 1.0910799503326416, + "learning_rate": 0.0002649874055415617, + "loss": 1.0182, + "step": 2105 + }, + { + "epoch": 0.26576817709481376, + "grad_norm": 2.8835411071777344, + "learning_rate": 0.000265617128463476, + "loss": 1.0283, + "step": 2110 + }, + { + "epoch": 0.266397959504991, + "grad_norm": 2.8626575469970703, + "learning_rate": 0.0002662468513853904, + "loss": 0.9273, + "step": 2115 + }, + { + "epoch": 0.2670277419151683, + "grad_norm": 1.4587650299072266, + "learning_rate": 0.00026687657430730477, + "loss": 0.9578, + "step": 2120 + }, + { + "epoch": 0.2676575243253456, + "grad_norm": 0.7692992091178894, + "learning_rate": 0.00026750629722921914, + "loss": 0.8701, + "step": 2125 + }, + { + "epoch": 0.26828730673552287, + "grad_norm": 0.8609071373939514, + "learning_rate": 0.0002681360201511335, + "loss": 0.8718, + "step": 2130 + }, + { + "epoch": 0.26891708914570017, + "grad_norm": 0.7419576048851013, + "learning_rate": 0.00026876574307304783, + "loss": 0.8732, + "step": 2135 + }, + { + "epoch": 0.2695468715558774, + "grad_norm": 0.8134281635284424, + "learning_rate": 0.0002693954659949622, + "loss": 0.9112, + "step": 2140 + }, + { + "epoch": 0.2701766539660547, + "grad_norm": 0.7559547424316406, + "learning_rate": 0.00027002518891687657, + "loss": 0.8804, + "step": 2145 + }, + { + "epoch": 0.27080643637623203, + "grad_norm": 0.7497460842132568, + "learning_rate": 0.0002706549118387909, + "loss": 0.8439, + "step": 2150 + }, + { + "epoch": 0.2714362187864093, + "grad_norm": 0.775444746017456, + "learning_rate": 0.00027128463476070526, + "loss": 0.888, + "step": 2155 + }, + { + "epoch": 0.2720660011965866, + "grad_norm": 0.7074035406112671, + "learning_rate": 0.00027191435768261963, + "loss": 0.8628, + "step": 2160 + }, + { + "epoch": 0.2726957836067639, + "grad_norm": 0.730311393737793, + "learning_rate": 0.00027254408060453395, + "loss": 0.8908, + "step": 2165 + }, + { + "epoch": 0.27332556601694113, + "grad_norm": 0.7610625624656677, + "learning_rate": 0.0002731738035264483, + "loss": 0.8954, + "step": 2170 + }, + { + "epoch": 0.27395534842711844, + "grad_norm": 0.6473423838615417, + "learning_rate": 0.0002738035264483627, + "loss": 0.8488, + "step": 2175 + }, + { + "epoch": 0.2745851308372957, + "grad_norm": 0.7084975838661194, + "learning_rate": 0.00027443324937027707, + "loss": 0.8631, + "step": 2180 + }, + { + "epoch": 0.275214913247473, + "grad_norm": 0.6844817996025085, + "learning_rate": 0.0002750629722921914, + "loss": 0.9021, + "step": 2185 + }, + { + "epoch": 0.2758446956576503, + "grad_norm": 0.641327440738678, + "learning_rate": 0.0002756926952141058, + "loss": 0.9002, + "step": 2190 + }, + { + "epoch": 0.27647447806782754, + "grad_norm": 0.7175489664077759, + "learning_rate": 0.0002763224181360201, + "loss": 0.8794, + "step": 2195 + }, + { + "epoch": 0.27710426047800485, + "grad_norm": 0.6306767463684082, + "learning_rate": 0.0002769521410579345, + "loss": 0.8732, + "step": 2200 + }, + { + "epoch": 0.27773404288818215, + "grad_norm": 0.6501113176345825, + "learning_rate": 0.00027758186397984887, + "loss": 0.8725, + "step": 2205 + }, + { + "epoch": 0.2783638252983594, + "grad_norm": 0.5996410250663757, + "learning_rate": 0.0002782115869017632, + "loss": 0.8828, + "step": 2210 + }, + { + "epoch": 0.2789936077085367, + "grad_norm": 0.6551349759101868, + "learning_rate": 0.00027884130982367756, + "loss": 0.8725, + "step": 2215 + }, + { + "epoch": 0.279623390118714, + "grad_norm": 0.6475560069084167, + "learning_rate": 0.00027947103274559193, + "loss": 0.9333, + "step": 2220 + }, + { + "epoch": 0.28025317252889126, + "grad_norm": 0.6957899928092957, + "learning_rate": 0.00028010075566750625, + "loss": 0.8933, + "step": 2225 + }, + { + "epoch": 0.28088295493906856, + "grad_norm": 0.6194736361503601, + "learning_rate": 0.0002807304785894206, + "loss": 0.9268, + "step": 2230 + }, + { + "epoch": 0.2815127373492458, + "grad_norm": 0.6293075084686279, + "learning_rate": 0.000281360201511335, + "loss": 0.8985, + "step": 2235 + }, + { + "epoch": 0.2821425197594231, + "grad_norm": 0.6805360317230225, + "learning_rate": 0.0002819899244332493, + "loss": 0.854, + "step": 2240 + }, + { + "epoch": 0.2827723021696004, + "grad_norm": 0.6671084761619568, + "learning_rate": 0.0002826196473551637, + "loss": 0.8774, + "step": 2245 + }, + { + "epoch": 0.28340208457977767, + "grad_norm": 0.5680047273635864, + "learning_rate": 0.00028324937027707805, + "loss": 0.8273, + "step": 2250 + }, + { + "epoch": 0.284031866989955, + "grad_norm": 0.5691477060317993, + "learning_rate": 0.0002838790931989924, + "loss": 0.8633, + "step": 2255 + }, + { + "epoch": 0.2846616494001323, + "grad_norm": 0.6509323120117188, + "learning_rate": 0.0002845088161209068, + "loss": 0.8991, + "step": 2260 + }, + { + "epoch": 0.2852914318103095, + "grad_norm": 0.714750349521637, + "learning_rate": 0.00028513853904282117, + "loss": 0.8863, + "step": 2265 + }, + { + "epoch": 0.28592121422048683, + "grad_norm": 0.6934742331504822, + "learning_rate": 0.0002857682619647355, + "loss": 0.8699, + "step": 2270 + }, + { + "epoch": 0.2865509966306641, + "grad_norm": 0.6048073172569275, + "learning_rate": 0.00028639798488664986, + "loss": 0.8983, + "step": 2275 + }, + { + "epoch": 0.2871807790408414, + "grad_norm": 0.6630669236183167, + "learning_rate": 0.00028702770780856423, + "loss": 0.9142, + "step": 2280 + }, + { + "epoch": 0.2878105614510187, + "grad_norm": 0.6518734693527222, + "learning_rate": 0.00028765743073047855, + "loss": 0.8734, + "step": 2285 + }, + { + "epoch": 0.28844034386119594, + "grad_norm": 0.5939868688583374, + "learning_rate": 0.0002882871536523929, + "loss": 0.8873, + "step": 2290 + }, + { + "epoch": 0.28907012627137324, + "grad_norm": 0.6081305742263794, + "learning_rate": 0.0002889168765743073, + "loss": 0.8735, + "step": 2295 + }, + { + "epoch": 0.28969990868155054, + "grad_norm": 0.5869495272636414, + "learning_rate": 0.0002895465994962216, + "loss": 0.8694, + "step": 2300 + }, + { + "epoch": 0.2903296910917278, + "grad_norm": 0.6381964683532715, + "learning_rate": 0.000290176322418136, + "loss": 0.8638, + "step": 2305 + }, + { + "epoch": 0.2909594735019051, + "grad_norm": 0.5546308755874634, + "learning_rate": 0.00029080604534005035, + "loss": 0.8748, + "step": 2310 + }, + { + "epoch": 0.2915892559120824, + "grad_norm": 0.7318828701972961, + "learning_rate": 0.0002914357682619647, + "loss": 0.8594, + "step": 2315 + }, + { + "epoch": 0.29221903832225965, + "grad_norm": 0.5685531497001648, + "learning_rate": 0.00029206549118387904, + "loss": 0.8815, + "step": 2320 + }, + { + "epoch": 0.29284882073243695, + "grad_norm": 0.6351069808006287, + "learning_rate": 0.00029269521410579347, + "loss": 0.8351, + "step": 2325 + }, + { + "epoch": 0.2934786031426142, + "grad_norm": 0.5828582048416138, + "learning_rate": 0.0002933249370277078, + "loss": 0.8678, + "step": 2330 + }, + { + "epoch": 0.2941083855527915, + "grad_norm": 0.5991604924201965, + "learning_rate": 0.00029395465994962216, + "loss": 0.8939, + "step": 2335 + }, + { + "epoch": 0.2947381679629688, + "grad_norm": 0.5732405781745911, + "learning_rate": 0.00029458438287153653, + "loss": 0.8594, + "step": 2340 + }, + { + "epoch": 0.29536795037314606, + "grad_norm": 0.5813714265823364, + "learning_rate": 0.00029521410579345085, + "loss": 0.8412, + "step": 2345 + }, + { + "epoch": 0.29599773278332336, + "grad_norm": 0.5281296968460083, + "learning_rate": 0.0002958438287153652, + "loss": 0.9049, + "step": 2350 + }, + { + "epoch": 0.29662751519350067, + "grad_norm": 0.6491068005561829, + "learning_rate": 0.0002964735516372796, + "loss": 0.8955, + "step": 2355 + }, + { + "epoch": 0.2972572976036779, + "grad_norm": 0.6236696839332581, + "learning_rate": 0.0002971032745591939, + "loss": 0.8792, + "step": 2360 + }, + { + "epoch": 0.2978870800138552, + "grad_norm": 0.605625331401825, + "learning_rate": 0.0002977329974811083, + "loss": 0.8448, + "step": 2365 + }, + { + "epoch": 0.29851686242403247, + "grad_norm": 0.6011054515838623, + "learning_rate": 0.00029836272040302265, + "loss": 0.911, + "step": 2370 + }, + { + "epoch": 0.2991466448342098, + "grad_norm": 0.5662422180175781, + "learning_rate": 0.00029899244332493697, + "loss": 0.872, + "step": 2375 + }, + { + "epoch": 0.2997764272443871, + "grad_norm": 0.8375005125999451, + "learning_rate": 0.00029962216624685134, + "loss": 0.7924, + "step": 2380 + }, + { + "epoch": 0.3004062096545643, + "grad_norm": 0.5916186571121216, + "learning_rate": 0.0002999999935557256, + "loss": 0.9007, + "step": 2385 + }, + { + "epoch": 0.30103599206474163, + "grad_norm": 0.6436251997947693, + "learning_rate": 0.00029999992105764553, + "loss": 0.8247, + "step": 2390 + }, + { + "epoch": 0.30166577447491894, + "grad_norm": 0.6368377208709717, + "learning_rate": 0.0002999997680061815, + "loss": 0.891, + "step": 2395 + }, + { + "epoch": 0.3022955568850962, + "grad_norm": 0.5848705172538757, + "learning_rate": 0.0002999995344014156, + "loss": 0.8335, + "step": 2400 + }, + { + "epoch": 0.3029253392952735, + "grad_norm": 0.5829634070396423, + "learning_rate": 0.0002999992202434735, + "loss": 0.8705, + "step": 2405 + }, + { + "epoch": 0.3035551217054508, + "grad_norm": 0.6242154240608215, + "learning_rate": 0.0002999988255325237, + "loss": 0.8819, + "step": 2410 + }, + { + "epoch": 0.30418490411562804, + "grad_norm": 0.5757481455802917, + "learning_rate": 0.0002999983502687783, + "loss": 0.8748, + "step": 2415 + }, + { + "epoch": 0.30481468652580535, + "grad_norm": 0.5024969577789307, + "learning_rate": 0.00029999779445249243, + "loss": 0.8534, + "step": 2420 + }, + { + "epoch": 0.3054444689359826, + "grad_norm": 0.5515364408493042, + "learning_rate": 0.00029999715808396463, + "loss": 0.8535, + "step": 2425 + }, + { + "epoch": 0.3060742513461599, + "grad_norm": 0.5151112079620361, + "learning_rate": 0.00029999644116353666, + "loss": 0.8686, + "step": 2430 + }, + { + "epoch": 0.3067040337563372, + "grad_norm": 0.5231375098228455, + "learning_rate": 0.0002999956436915935, + "loss": 0.8465, + "step": 2435 + }, + { + "epoch": 0.30733381616651445, + "grad_norm": 0.5415048003196716, + "learning_rate": 0.0002999947656685634, + "loss": 0.853, + "step": 2440 + }, + { + "epoch": 0.30796359857669175, + "grad_norm": 0.5642004609107971, + "learning_rate": 0.00029999380709491794, + "loss": 0.8827, + "step": 2445 + }, + { + "epoch": 0.30859338098686906, + "grad_norm": 0.6197057366371155, + "learning_rate": 0.0002999927679711718, + "loss": 0.9072, + "step": 2450 + }, + { + "epoch": 0.3092231633970463, + "grad_norm": 0.5865146517753601, + "learning_rate": 0.0002999916482978831, + "loss": 0.837, + "step": 2455 + }, + { + "epoch": 0.3098529458072236, + "grad_norm": 0.5961802005767822, + "learning_rate": 0.0002999904480756531, + "loss": 0.8657, + "step": 2460 + }, + { + "epoch": 0.31048272821740086, + "grad_norm": 0.5736685395240784, + "learning_rate": 0.0002999891673051263, + "loss": 0.872, + "step": 2465 + }, + { + "epoch": 0.31111251062757816, + "grad_norm": 0.5412915945053101, + "learning_rate": 0.0002999878059869905, + "loss": 0.8327, + "step": 2470 + }, + { + "epoch": 0.31174229303775547, + "grad_norm": 0.5011366605758667, + "learning_rate": 0.0002999863641219769, + "loss": 0.8418, + "step": 2475 + }, + { + "epoch": 0.3123720754479327, + "grad_norm": 0.5566514134407043, + "learning_rate": 0.0002999848417108597, + "loss": 0.8768, + "step": 2480 + }, + { + "epoch": 0.31300185785811, + "grad_norm": 0.5639830231666565, + "learning_rate": 0.0002999832387544564, + "loss": 0.8178, + "step": 2485 + }, + { + "epoch": 0.3136316402682873, + "grad_norm": 0.5784679055213928, + "learning_rate": 0.000299981555253628, + "loss": 0.8698, + "step": 2490 + }, + { + "epoch": 0.3142614226784646, + "grad_norm": 0.5428637266159058, + "learning_rate": 0.00029997979120927846, + "loss": 0.8671, + "step": 2495 + }, + { + "epoch": 0.3148912050886419, + "grad_norm": 0.5629287362098694, + "learning_rate": 0.00029997794662235515, + "loss": 0.873, + "step": 2500 + }, + { + "epoch": 0.3155209874988192, + "grad_norm": 0.5561172366142273, + "learning_rate": 0.00029997602149384856, + "loss": 0.8664, + "step": 2505 + }, + { + "epoch": 0.31615076990899643, + "grad_norm": 0.5451831221580505, + "learning_rate": 0.0002999740158247927, + "loss": 0.8349, + "step": 2510 + }, + { + "epoch": 0.31678055231917374, + "grad_norm": 0.5645403861999512, + "learning_rate": 0.00029997192961626456, + "loss": 0.8924, + "step": 2515 + }, + { + "epoch": 0.317410334729351, + "grad_norm": 0.5120379328727722, + "learning_rate": 0.00029996976286938444, + "loss": 0.8606, + "step": 2520 + }, + { + "epoch": 0.3180401171395283, + "grad_norm": 0.45988166332244873, + "learning_rate": 0.0002999675155853161, + "loss": 0.8285, + "step": 2525 + }, + { + "epoch": 0.3186698995497056, + "grad_norm": 0.5446504950523376, + "learning_rate": 0.00029996518776526614, + "loss": 0.8913, + "step": 2530 + }, + { + "epoch": 0.31929968195988284, + "grad_norm": 0.648369550704956, + "learning_rate": 0.00029996277941048485, + "loss": 0.8753, + "step": 2535 + }, + { + "epoch": 0.31992946437006015, + "grad_norm": 0.6404165029525757, + "learning_rate": 0.0002999602905222655, + "loss": 0.8747, + "step": 2540 + }, + { + "epoch": 0.32055924678023745, + "grad_norm": 0.46791502833366394, + "learning_rate": 0.0002999577211019447, + "loss": 0.8132, + "step": 2545 + }, + { + "epoch": 0.3211890291904147, + "grad_norm": 0.5365081429481506, + "learning_rate": 0.00029995507115090225, + "loss": 0.8363, + "step": 2550 + }, + { + "epoch": 0.321818811600592, + "grad_norm": 0.5029319524765015, + "learning_rate": 0.00029995234067056124, + "loss": 0.8297, + "step": 2555 + }, + { + "epoch": 0.32244859401076925, + "grad_norm": 0.509843647480011, + "learning_rate": 0.00029994952966238804, + "loss": 0.828, + "step": 2560 + }, + { + "epoch": 0.32307837642094656, + "grad_norm": 0.479045569896698, + "learning_rate": 0.0002999466381278922, + "loss": 0.8689, + "step": 2565 + }, + { + "epoch": 0.32370815883112386, + "grad_norm": 0.5639600157737732, + "learning_rate": 0.0002999436660686265, + "loss": 0.8521, + "step": 2570 + }, + { + "epoch": 0.3243379412413011, + "grad_norm": 0.5077898502349854, + "learning_rate": 0.00029994061348618715, + "loss": 0.835, + "step": 2575 + }, + { + "epoch": 0.3249677236514784, + "grad_norm": 0.45198580622673035, + "learning_rate": 0.00029993748038221324, + "loss": 0.8394, + "step": 2580 + }, + { + "epoch": 0.3255975060616557, + "grad_norm": 0.5617688894271851, + "learning_rate": 0.0002999342667583875, + "loss": 0.8285, + "step": 2585 + }, + { + "epoch": 0.32622728847183297, + "grad_norm": 0.5159285664558411, + "learning_rate": 0.0002999309726164356, + "loss": 0.7654, + "step": 2590 + }, + { + "epoch": 0.32685707088201027, + "grad_norm": 0.526965320110321, + "learning_rate": 0.00029992759795812666, + "loss": 0.8392, + "step": 2595 + }, + { + "epoch": 0.3274868532921876, + "grad_norm": 0.4861494302749634, + "learning_rate": 0.0002999241427852729, + "loss": 0.8177, + "step": 2600 + }, + { + "epoch": 0.3281166357023648, + "grad_norm": 0.5498744249343872, + "learning_rate": 0.0002999206070997298, + "loss": 0.8006, + "step": 2605 + }, + { + "epoch": 0.3287464181125421, + "grad_norm": 0.526978075504303, + "learning_rate": 0.0002999169909033962, + "loss": 0.8261, + "step": 2610 + }, + { + "epoch": 0.3293762005227194, + "grad_norm": 0.5078813433647156, + "learning_rate": 0.0002999132941982139, + "loss": 0.8396, + "step": 2615 + }, + { + "epoch": 0.3300059829328967, + "grad_norm": 0.5390729308128357, + "learning_rate": 0.00029990951698616834, + "loss": 0.8695, + "step": 2620 + }, + { + "epoch": 0.330635765343074, + "grad_norm": 0.520889401435852, + "learning_rate": 0.00029990565926928787, + "loss": 0.8489, + "step": 2625 + }, + { + "epoch": 0.33126554775325123, + "grad_norm": 0.6547030210494995, + "learning_rate": 0.00029990172104964413, + "loss": 0.8821, + "step": 2630 + }, + { + "epoch": 0.33189533016342854, + "grad_norm": 0.5034601092338562, + "learning_rate": 0.00029989770232935204, + "loss": 0.8202, + "step": 2635 + }, + { + "epoch": 0.33252511257360584, + "grad_norm": 0.5204071402549744, + "learning_rate": 0.0002998936031105698, + "loss": 0.852, + "step": 2640 + }, + { + "epoch": 0.3331548949837831, + "grad_norm": 0.499221533536911, + "learning_rate": 0.0002998894233954988, + "loss": 0.8338, + "step": 2645 + }, + { + "epoch": 0.3337846773939604, + "grad_norm": 0.5096358060836792, + "learning_rate": 0.0002998851631863835, + "loss": 0.8149, + "step": 2650 + }, + { + "epoch": 0.3344144598041377, + "grad_norm": 0.4654362201690674, + "learning_rate": 0.0002998808224855119, + "loss": 0.8461, + "step": 2655 + }, + { + "epoch": 0.33504424221431495, + "grad_norm": 0.7029035091400146, + "learning_rate": 0.00029987640129521497, + "loss": 0.8137, + "step": 2660 + }, + { + "epoch": 0.33567402462449225, + "grad_norm": 0.5634217262268066, + "learning_rate": 0.000299871899617867, + "loss": 0.8434, + "step": 2665 + }, + { + "epoch": 0.3363038070346695, + "grad_norm": 0.5168646574020386, + "learning_rate": 0.0002998673174558855, + "loss": 0.8554, + "step": 2670 + }, + { + "epoch": 0.3369335894448468, + "grad_norm": 0.4693644344806671, + "learning_rate": 0.00029986265481173123, + "loss": 0.8246, + "step": 2675 + }, + { + "epoch": 0.3375633718550241, + "grad_norm": 0.44928330183029175, + "learning_rate": 0.00029985791168790805, + "loss": 0.8554, + "step": 2680 + }, + { + "epoch": 0.33819315426520136, + "grad_norm": 0.5288846492767334, + "learning_rate": 0.0002998530880869632, + "loss": 0.8319, + "step": 2685 + }, + { + "epoch": 0.33882293667537866, + "grad_norm": 0.4755760431289673, + "learning_rate": 0.00029984818401148706, + "loss": 0.874, + "step": 2690 + }, + { + "epoch": 0.33945271908555597, + "grad_norm": 0.541684091091156, + "learning_rate": 0.0002998431994641132, + "loss": 0.8526, + "step": 2695 + }, + { + "epoch": 0.3400825014957332, + "grad_norm": 0.5160995125770569, + "learning_rate": 0.0002998381344475184, + "loss": 0.8749, + "step": 2700 + }, + { + "epoch": 0.3407122839059105, + "grad_norm": 0.5409444570541382, + "learning_rate": 0.00029983298896442276, + "loss": 0.8118, + "step": 2705 + }, + { + "epoch": 0.34134206631608777, + "grad_norm": 0.5148081183433533, + "learning_rate": 0.00029982776301758956, + "loss": 0.8685, + "step": 2710 + }, + { + "epoch": 0.34197184872626507, + "grad_norm": 0.5689860582351685, + "learning_rate": 0.0002998224566098251, + "loss": 0.8476, + "step": 2715 + }, + { + "epoch": 0.3426016311364424, + "grad_norm": 0.520268440246582, + "learning_rate": 0.00029981706974397917, + "loss": 0.8128, + "step": 2720 + }, + { + "epoch": 0.3432314135466196, + "grad_norm": 0.49906817078590393, + "learning_rate": 0.00029981160242294457, + "loss": 0.836, + "step": 2725 + }, + { + "epoch": 0.34386119595679693, + "grad_norm": 0.47317516803741455, + "learning_rate": 0.0002998060546496575, + "loss": 0.8251, + "step": 2730 + }, + { + "epoch": 0.34449097836697423, + "grad_norm": 0.49573519825935364, + "learning_rate": 0.0002998004264270971, + "loss": 0.832, + "step": 2735 + }, + { + "epoch": 0.3451207607771515, + "grad_norm": 0.43803608417510986, + "learning_rate": 0.0002997947177582859, + "loss": 0.7875, + "step": 2740 + }, + { + "epoch": 0.3457505431873288, + "grad_norm": 0.5324883460998535, + "learning_rate": 0.0002997889286462896, + "loss": 0.824, + "step": 2745 + }, + { + "epoch": 0.3463803255975061, + "grad_norm": 0.5902321934700012, + "learning_rate": 0.00029978305909421707, + "loss": 0.8265, + "step": 2750 + }, + { + "epoch": 0.34701010800768334, + "grad_norm": 0.5052042007446289, + "learning_rate": 0.0002997771091052204, + "loss": 0.7715, + "step": 2755 + }, + { + "epoch": 0.34763989041786064, + "grad_norm": 0.4439961314201355, + "learning_rate": 0.0002997710786824949, + "loss": 0.8387, + "step": 2760 + }, + { + "epoch": 0.3482696728280379, + "grad_norm": 0.5099385976791382, + "learning_rate": 0.0002997649678292789, + "loss": 0.8424, + "step": 2765 + }, + { + "epoch": 0.3488994552382152, + "grad_norm": 0.4415825605392456, + "learning_rate": 0.00029975877654885426, + "loss": 0.8066, + "step": 2770 + }, + { + "epoch": 0.3495292376483925, + "grad_norm": 0.5088052153587341, + "learning_rate": 0.0002997525048445458, + "loss": 0.8172, + "step": 2775 + }, + { + "epoch": 0.35015902005856975, + "grad_norm": 0.5503986477851868, + "learning_rate": 0.00029974615271972146, + "loss": 0.873, + "step": 2780 + }, + { + "epoch": 0.35078880246874705, + "grad_norm": 0.4609704613685608, + "learning_rate": 0.0002997397201777926, + "loss": 0.8041, + "step": 2785 + }, + { + "epoch": 0.35141858487892436, + "grad_norm": 0.49494004249572754, + "learning_rate": 0.00029973320722221356, + "loss": 0.916, + "step": 2790 + }, + { + "epoch": 0.3520483672891016, + "grad_norm": 0.4820273518562317, + "learning_rate": 0.00029972661385648197, + "loss": 0.8597, + "step": 2795 + }, + { + "epoch": 0.3526781496992789, + "grad_norm": 0.467641681432724, + "learning_rate": 0.0002997199400841386, + "loss": 0.7944, + "step": 2800 + }, + { + "epoch": 0.35330793210945616, + "grad_norm": 0.49666082859039307, + "learning_rate": 0.00029971318590876745, + "loss": 0.8204, + "step": 2805 + }, + { + "epoch": 0.35393771451963346, + "grad_norm": 0.4578961133956909, + "learning_rate": 0.00029970635133399565, + "loss": 0.8426, + "step": 2810 + }, + { + "epoch": 0.35456749692981077, + "grad_norm": 0.48815369606018066, + "learning_rate": 0.00029969943636349363, + "loss": 0.8277, + "step": 2815 + }, + { + "epoch": 0.355197279339988, + "grad_norm": 0.4686887562274933, + "learning_rate": 0.0002996924410009747, + "loss": 0.7936, + "step": 2820 + }, + { + "epoch": 0.3558270617501653, + "grad_norm": 0.4245474636554718, + "learning_rate": 0.0002996853652501956, + "loss": 0.8085, + "step": 2825 + }, + { + "epoch": 0.3564568441603426, + "grad_norm": 0.5085129737854004, + "learning_rate": 0.0002996782091149562, + "loss": 0.8584, + "step": 2830 + }, + { + "epoch": 0.35708662657051987, + "grad_norm": 0.4415908455848694, + "learning_rate": 0.0002996709725990995, + "loss": 0.8234, + "step": 2835 + }, + { + "epoch": 0.3577164089806972, + "grad_norm": 0.44018128514289856, + "learning_rate": 0.00029966365570651164, + "loss": 0.8566, + "step": 2840 + }, + { + "epoch": 0.3583461913908745, + "grad_norm": 0.4675704836845398, + "learning_rate": 0.000299656258441122, + "loss": 0.8164, + "step": 2845 + }, + { + "epoch": 0.35897597380105173, + "grad_norm": 0.47553756833076477, + "learning_rate": 0.0002996487808069031, + "loss": 0.8177, + "step": 2850 + }, + { + "epoch": 0.35960575621122903, + "grad_norm": 0.5298905372619629, + "learning_rate": 0.00029964122280787053, + "loss": 0.8537, + "step": 2855 + }, + { + "epoch": 0.3602355386214063, + "grad_norm": 0.484838604927063, + "learning_rate": 0.0002996335844480832, + "loss": 0.8495, + "step": 2860 + }, + { + "epoch": 0.3608653210315836, + "grad_norm": 0.4366026818752289, + "learning_rate": 0.000299625865731643, + "loss": 0.8073, + "step": 2865 + }, + { + "epoch": 0.3614951034417609, + "grad_norm": 0.4988342225551605, + "learning_rate": 0.00029961806666269503, + "loss": 0.8127, + "step": 2870 + }, + { + "epoch": 0.36212488585193814, + "grad_norm": 0.5028805732727051, + "learning_rate": 0.00029961018724542767, + "loss": 0.8711, + "step": 2875 + }, + { + "epoch": 0.36275466826211544, + "grad_norm": 0.5009424686431885, + "learning_rate": 0.00029960222748407226, + "loss": 0.8015, + "step": 2880 + }, + { + "epoch": 0.36338445067229275, + "grad_norm": 0.4522798955440521, + "learning_rate": 0.00029959418738290344, + "loss": 0.8261, + "step": 2885 + }, + { + "epoch": 0.36401423308247, + "grad_norm": 0.49349725246429443, + "learning_rate": 0.00029958606694623893, + "loss": 0.8006, + "step": 2890 + }, + { + "epoch": 0.3646440154926473, + "grad_norm": 0.46870625019073486, + "learning_rate": 0.00029957786617843956, + "loss": 0.8285, + "step": 2895 + }, + { + "epoch": 0.36527379790282455, + "grad_norm": 0.5234463810920715, + "learning_rate": 0.0002995695850839093, + "loss": 0.8497, + "step": 2900 + }, + { + "epoch": 0.36590358031300185, + "grad_norm": 0.487884521484375, + "learning_rate": 0.0002995612236670953, + "loss": 0.8033, + "step": 2905 + }, + { + "epoch": 0.36653336272317916, + "grad_norm": 0.4760074317455292, + "learning_rate": 0.0002995527819324879, + "loss": 0.8412, + "step": 2910 + }, + { + "epoch": 0.3671631451333564, + "grad_norm": 0.4630395472049713, + "learning_rate": 0.0002995442598846205, + "loss": 0.8244, + "step": 2915 + }, + { + "epoch": 0.3677929275435337, + "grad_norm": 0.4981043040752411, + "learning_rate": 0.0002995356575280695, + "loss": 0.798, + "step": 2920 + }, + { + "epoch": 0.368422709953711, + "grad_norm": 0.4630597233772278, + "learning_rate": 0.00029952697486745466, + "loss": 0.8032, + "step": 2925 + }, + { + "epoch": 0.36905249236388826, + "grad_norm": 0.4962010979652405, + "learning_rate": 0.00029951821190743884, + "loss": 0.8183, + "step": 2930 + }, + { + "epoch": 0.36968227477406557, + "grad_norm": 0.47193852066993713, + "learning_rate": 0.00029950936865272775, + "loss": 0.841, + "step": 2935 + }, + { + "epoch": 0.37031205718424287, + "grad_norm": 0.4802277982234955, + "learning_rate": 0.0002995004451080706, + "loss": 0.7433, + "step": 2940 + }, + { + "epoch": 0.3709418395944201, + "grad_norm": 0.43486830592155457, + "learning_rate": 0.00029949144127825947, + "loss": 0.8051, + "step": 2945 + }, + { + "epoch": 0.3715716220045974, + "grad_norm": 0.5078021287918091, + "learning_rate": 0.0002994823571681296, + "loss": 0.8662, + "step": 2950 + }, + { + "epoch": 0.3722014044147747, + "grad_norm": 0.44738146662712097, + "learning_rate": 0.0002994731927825594, + "loss": 0.7997, + "step": 2955 + }, + { + "epoch": 0.372831186824952, + "grad_norm": 0.44323739409446716, + "learning_rate": 0.0002994639481264704, + "loss": 0.8481, + "step": 2960 + }, + { + "epoch": 0.3734609692351293, + "grad_norm": 0.525050938129425, + "learning_rate": 0.0002994546232048271, + "loss": 0.8375, + "step": 2965 + }, + { + "epoch": 0.37409075164530653, + "grad_norm": 0.4817000925540924, + "learning_rate": 0.00029944521802263723, + "loss": 0.8, + "step": 2970 + }, + { + "epoch": 0.37472053405548383, + "grad_norm": 0.5709639191627502, + "learning_rate": 0.00029943573258495165, + "loss": 0.8104, + "step": 2975 + }, + { + "epoch": 0.37535031646566114, + "grad_norm": 0.518618643283844, + "learning_rate": 0.00029942616689686416, + "loss": 0.7948, + "step": 2980 + }, + { + "epoch": 0.3759800988758384, + "grad_norm": 0.4226647615432739, + "learning_rate": 0.00029941652096351174, + "loss": 0.7599, + "step": 2985 + }, + { + "epoch": 0.3766098812860157, + "grad_norm": 0.4751405119895935, + "learning_rate": 0.0002994067947900746, + "loss": 0.8015, + "step": 2990 + }, + { + "epoch": 0.37723966369619294, + "grad_norm": 0.4653600752353668, + "learning_rate": 0.0002993969883817758, + "loss": 0.7758, + "step": 2995 + }, + { + "epoch": 0.37786944610637024, + "grad_norm": 0.512941837310791, + "learning_rate": 0.00029938710174388163, + "loss": 0.8188, + "step": 3000 + }, + { + "epoch": 0.37786944610637024, + "eval_loss": 0.3438470661640167, + "eval_runtime": 6.225, + "eval_samples_per_second": 160.642, + "eval_steps_per_second": 10.12, + "step": 3000 + }, + { + "epoch": 0.37849922851654755, + "grad_norm": 0.43524855375289917, + "learning_rate": 0.0002993771348817015, + "loss": 0.803, + "step": 3005 + }, + { + "epoch": 0.3791290109267248, + "grad_norm": 0.4569668173789978, + "learning_rate": 0.0002993670878005878, + "loss": 0.8777, + "step": 3010 + }, + { + "epoch": 0.3797587933369021, + "grad_norm": 0.4643417000770569, + "learning_rate": 0.00029935696050593604, + "loss": 0.7621, + "step": 3015 + }, + { + "epoch": 0.3803885757470794, + "grad_norm": 0.4604712128639221, + "learning_rate": 0.00029934675300318485, + "loss": 0.8216, + "step": 3020 + }, + { + "epoch": 0.38101835815725665, + "grad_norm": 0.4307630956172943, + "learning_rate": 0.0002993364652978158, + "loss": 0.8163, + "step": 3025 + }, + { + "epoch": 0.38164814056743396, + "grad_norm": 0.44455698132514954, + "learning_rate": 0.00029932609739535365, + "loss": 0.818, + "step": 3030 + }, + { + "epoch": 0.38227792297761126, + "grad_norm": 0.43203669786453247, + "learning_rate": 0.0002993156493013663, + "loss": 0.8168, + "step": 3035 + }, + { + "epoch": 0.3829077053877885, + "grad_norm": 0.42328670620918274, + "learning_rate": 0.00029930512102146453, + "loss": 0.8025, + "step": 3040 + }, + { + "epoch": 0.3835374877979658, + "grad_norm": 0.43900108337402344, + "learning_rate": 0.0002992945125613023, + "loss": 0.7595, + "step": 3045 + }, + { + "epoch": 0.38416727020814306, + "grad_norm": 0.46638986468315125, + "learning_rate": 0.00029928382392657656, + "loss": 0.8208, + "step": 3050 + }, + { + "epoch": 0.38479705261832037, + "grad_norm": 0.4279174208641052, + "learning_rate": 0.00029927305512302736, + "loss": 0.8151, + "step": 3055 + }, + { + "epoch": 0.38542683502849767, + "grad_norm": 0.4648323357105255, + "learning_rate": 0.0002992622061564378, + "loss": 0.7666, + "step": 3060 + }, + { + "epoch": 0.3860566174386749, + "grad_norm": 0.45052894949913025, + "learning_rate": 0.000299251277032634, + "loss": 0.7995, + "step": 3065 + }, + { + "epoch": 0.3866863998488522, + "grad_norm": 0.46262305974960327, + "learning_rate": 0.0002992402677574852, + "loss": 0.8175, + "step": 3070 + }, + { + "epoch": 0.38731618225902953, + "grad_norm": 0.4934038519859314, + "learning_rate": 0.00029922917833690365, + "loss": 0.821, + "step": 3075 + }, + { + "epoch": 0.3879459646692068, + "grad_norm": 0.46494096517562866, + "learning_rate": 0.0002992180087768445, + "loss": 0.8081, + "step": 3080 + }, + { + "epoch": 0.3885757470793841, + "grad_norm": 0.9760459661483765, + "learning_rate": 0.0002992067590833062, + "loss": 0.7673, + "step": 3085 + }, + { + "epoch": 0.38920552948956133, + "grad_norm": 0.7070348262786865, + "learning_rate": 0.00029919542926233, + "loss": 0.8017, + "step": 3090 + }, + { + "epoch": 0.38983531189973863, + "grad_norm": 0.6773821711540222, + "learning_rate": 0.00029918401932000027, + "loss": 0.7946, + "step": 3095 + }, + { + "epoch": 0.39046509430991594, + "grad_norm": 0.4955935478210449, + "learning_rate": 0.0002991725292624445, + "loss": 0.8431, + "step": 3100 + }, + { + "epoch": 0.3910948767200932, + "grad_norm": 0.7728490829467773, + "learning_rate": 0.000299160959095833, + "loss": 0.8024, + "step": 3105 + }, + { + "epoch": 0.3917246591302705, + "grad_norm": 0.6880044341087341, + "learning_rate": 0.00029914930882637926, + "loss": 0.788, + "step": 3110 + }, + { + "epoch": 0.3923544415404478, + "grad_norm": 0.4916500747203827, + "learning_rate": 0.0002991375784603398, + "loss": 0.7878, + "step": 3115 + }, + { + "epoch": 0.39298422395062504, + "grad_norm": 0.5188093781471252, + "learning_rate": 0.00029912576800401403, + "loss": 0.8404, + "step": 3120 + }, + { + "epoch": 0.39361400636080235, + "grad_norm": 0.5084378123283386, + "learning_rate": 0.0002991138774637444, + "loss": 0.8277, + "step": 3125 + }, + { + "epoch": 0.39424378877097965, + "grad_norm": 0.4139776825904846, + "learning_rate": 0.0002991019068459165, + "loss": 0.7672, + "step": 3130 + }, + { + "epoch": 0.3948735711811569, + "grad_norm": 0.4756976366043091, + "learning_rate": 0.0002990898561569588, + "loss": 0.7936, + "step": 3135 + }, + { + "epoch": 0.3955033535913342, + "grad_norm": 0.46053630113601685, + "learning_rate": 0.0002990777254033427, + "loss": 0.8102, + "step": 3140 + }, + { + "epoch": 0.39613313600151145, + "grad_norm": 0.48546189069747925, + "learning_rate": 0.00029906551459158283, + "loss": 0.8184, + "step": 3145 + }, + { + "epoch": 0.39676291841168876, + "grad_norm": 0.477192223072052, + "learning_rate": 0.0002990532237282366, + "loss": 0.828, + "step": 3150 + }, + { + "epoch": 0.39739270082186606, + "grad_norm": 0.48900333046913147, + "learning_rate": 0.00029904085281990447, + "loss": 0.8183, + "step": 3155 + }, + { + "epoch": 0.3980224832320433, + "grad_norm": 0.5019087791442871, + "learning_rate": 0.0002990284018732299, + "loss": 0.8002, + "step": 3160 + }, + { + "epoch": 0.3986522656422206, + "grad_norm": 0.5127068758010864, + "learning_rate": 0.0002990158708948994, + "loss": 0.8088, + "step": 3165 + }, + { + "epoch": 0.3992820480523979, + "grad_norm": 0.44172775745391846, + "learning_rate": 0.00029900325989164233, + "loss": 0.8013, + "step": 3170 + }, + { + "epoch": 0.39991183046257517, + "grad_norm": 0.5318475961685181, + "learning_rate": 0.0002989905688702311, + "loss": 0.8239, + "step": 3175 + }, + { + "epoch": 0.4005416128727525, + "grad_norm": 0.4257467985153198, + "learning_rate": 0.0002989777978374811, + "loss": 0.7714, + "step": 3180 + }, + { + "epoch": 0.4011713952829297, + "grad_norm": 0.42196664214134216, + "learning_rate": 0.0002989649468002506, + "loss": 0.7987, + "step": 3185 + }, + { + "epoch": 0.401801177693107, + "grad_norm": 0.47977736592292786, + "learning_rate": 0.000298952015765441, + "loss": 0.7699, + "step": 3190 + }, + { + "epoch": 0.40243096010328433, + "grad_norm": 0.4841693639755249, + "learning_rate": 0.0002989390047399965, + "loss": 0.7916, + "step": 3195 + }, + { + "epoch": 0.4030607425134616, + "grad_norm": 0.5104061961174011, + "learning_rate": 0.0002989259137309043, + "loss": 0.8244, + "step": 3200 + }, + { + "epoch": 0.4036905249236389, + "grad_norm": 0.46594473719596863, + "learning_rate": 0.00029891274274519464, + "loss": 0.786, + "step": 3205 + }, + { + "epoch": 0.4043203073338162, + "grad_norm": 0.4309998154640198, + "learning_rate": 0.0002988994917899405, + "loss": 0.8266, + "step": 3210 + }, + { + "epoch": 0.40495008974399344, + "grad_norm": 0.4976588785648346, + "learning_rate": 0.00029888616087225817, + "loss": 0.7911, + "step": 3215 + }, + { + "epoch": 0.40557987215417074, + "grad_norm": 0.47657066583633423, + "learning_rate": 0.00029887274999930647, + "loss": 0.7926, + "step": 3220 + }, + { + "epoch": 0.40620965456434804, + "grad_norm": 0.42497026920318604, + "learning_rate": 0.0002988592591782874, + "loss": 0.7838, + "step": 3225 + }, + { + "epoch": 0.4068394369745253, + "grad_norm": 0.4974801540374756, + "learning_rate": 0.00029884568841644587, + "loss": 0.7854, + "step": 3230 + }, + { + "epoch": 0.4074692193847026, + "grad_norm": 0.43505486845970154, + "learning_rate": 0.00029883203772106966, + "loss": 0.8689, + "step": 3235 + }, + { + "epoch": 0.40809900179487985, + "grad_norm": 0.5216085314750671, + "learning_rate": 0.0002988183070994895, + "loss": 0.8445, + "step": 3240 + }, + { + "epoch": 0.40872878420505715, + "grad_norm": 0.5993830561637878, + "learning_rate": 0.0002988044965590791, + "loss": 0.7944, + "step": 3245 + }, + { + "epoch": 0.40935856661523445, + "grad_norm": 0.7245651483535767, + "learning_rate": 0.00029879060610725494, + "loss": 0.8175, + "step": 3250 + }, + { + "epoch": 0.4099883490254117, + "grad_norm": 0.4758714735507965, + "learning_rate": 0.00029877663575147653, + "loss": 0.7862, + "step": 3255 + }, + { + "epoch": 0.410618131435589, + "grad_norm": 0.5264742970466614, + "learning_rate": 0.0002987625854992464, + "loss": 0.7625, + "step": 3260 + }, + { + "epoch": 0.4112479138457663, + "grad_norm": 0.46857550740242004, + "learning_rate": 0.0002987484553581097, + "loss": 0.7878, + "step": 3265 + }, + { + "epoch": 0.41187769625594356, + "grad_norm": 0.4588899314403534, + "learning_rate": 0.0002987342453356547, + "loss": 0.8435, + "step": 3270 + }, + { + "epoch": 0.41250747866612086, + "grad_norm": 0.47005462646484375, + "learning_rate": 0.0002987199554395125, + "loss": 0.8343, + "step": 3275 + }, + { + "epoch": 0.4131372610762981, + "grad_norm": 0.4855548143386841, + "learning_rate": 0.00029870558567735716, + "loss": 0.7944, + "step": 3280 + }, + { + "epoch": 0.4137670434864754, + "grad_norm": 0.4832567572593689, + "learning_rate": 0.00029869113605690545, + "loss": 0.7999, + "step": 3285 + }, + { + "epoch": 0.4143968258966527, + "grad_norm": 0.4483296573162079, + "learning_rate": 0.00029867660658591724, + "loss": 0.8074, + "step": 3290 + }, + { + "epoch": 0.41502660830682997, + "grad_norm": 0.5084306001663208, + "learning_rate": 0.00029866199727219514, + "loss": 0.8173, + "step": 3295 + }, + { + "epoch": 0.4156563907170073, + "grad_norm": 0.43247321248054504, + "learning_rate": 0.00029864730812358473, + "loss": 0.7904, + "step": 3300 + }, + { + "epoch": 0.4162861731271846, + "grad_norm": 0.4278540313243866, + "learning_rate": 0.0002986325391479744, + "loss": 0.79, + "step": 3305 + }, + { + "epoch": 0.4169159555373618, + "grad_norm": 0.4396720230579376, + "learning_rate": 0.00029861769035329546, + "loss": 0.7737, + "step": 3310 + }, + { + "epoch": 0.41754573794753913, + "grad_norm": 0.4305702745914459, + "learning_rate": 0.0002986027617475219, + "loss": 0.8133, + "step": 3315 + }, + { + "epoch": 0.41817552035771643, + "grad_norm": 0.4455117881298065, + "learning_rate": 0.0002985877533386709, + "loss": 0.7932, + "step": 3320 + }, + { + "epoch": 0.4188053027678937, + "grad_norm": 0.45051881670951843, + "learning_rate": 0.00029857266513480226, + "loss": 0.8162, + "step": 3325 + }, + { + "epoch": 0.419435085178071, + "grad_norm": 0.47537773847579956, + "learning_rate": 0.0002985574971440187, + "loss": 0.7931, + "step": 3330 + }, + { + "epoch": 0.42006486758824824, + "grad_norm": 0.46828627586364746, + "learning_rate": 0.0002985422493744657, + "loss": 0.8399, + "step": 3335 + }, + { + "epoch": 0.42069464999842554, + "grad_norm": 0.4528372585773468, + "learning_rate": 0.00029852692183433176, + "loss": 0.7821, + "step": 3340 + }, + { + "epoch": 0.42132443240860284, + "grad_norm": 0.4476306736469269, + "learning_rate": 0.00029851151453184807, + "loss": 0.7986, + "step": 3345 + }, + { + "epoch": 0.4219542148187801, + "grad_norm": 0.4092450439929962, + "learning_rate": 0.00029849602747528874, + "loss": 0.7827, + "step": 3350 + }, + { + "epoch": 0.4225839972289574, + "grad_norm": 0.4776279330253601, + "learning_rate": 0.00029848046067297064, + "loss": 0.8269, + "step": 3355 + }, + { + "epoch": 0.4232137796391347, + "grad_norm": 0.45867466926574707, + "learning_rate": 0.00029846481413325346, + "loss": 0.8094, + "step": 3360 + }, + { + "epoch": 0.42384356204931195, + "grad_norm": 0.4665123522281647, + "learning_rate": 0.00029844908786453986, + "loss": 0.7288, + "step": 3365 + }, + { + "epoch": 0.42447334445948925, + "grad_norm": 0.4820057451725006, + "learning_rate": 0.0002984332818752751, + "loss": 0.8012, + "step": 3370 + }, + { + "epoch": 0.4251031268696665, + "grad_norm": 0.46213796734809875, + "learning_rate": 0.00029841739617394737, + "loss": 0.7693, + "step": 3375 + }, + { + "epoch": 0.4257329092798438, + "grad_norm": 0.4329429864883423, + "learning_rate": 0.0002984014307690878, + "loss": 0.7215, + "step": 3380 + }, + { + "epoch": 0.4263626916900211, + "grad_norm": 0.437621146440506, + "learning_rate": 0.00029838538566926993, + "loss": 0.7839, + "step": 3385 + }, + { + "epoch": 0.42699247410019836, + "grad_norm": 0.4661789536476135, + "learning_rate": 0.0002983692608831105, + "loss": 0.7827, + "step": 3390 + }, + { + "epoch": 0.42762225651037566, + "grad_norm": 0.4203425645828247, + "learning_rate": 0.0002983530564192689, + "loss": 0.8096, + "step": 3395 + }, + { + "epoch": 0.42825203892055297, + "grad_norm": 0.4614803194999695, + "learning_rate": 0.00029833677228644726, + "loss": 0.8189, + "step": 3400 + }, + { + "epoch": 0.4288818213307302, + "grad_norm": 0.4247860908508301, + "learning_rate": 0.0002983204084933905, + "loss": 0.8123, + "step": 3405 + }, + { + "epoch": 0.4295116037409075, + "grad_norm": 0.4418291449546814, + "learning_rate": 0.0002983039650488864, + "loss": 0.8036, + "step": 3410 + }, + { + "epoch": 0.4301413861510848, + "grad_norm": 0.46780282258987427, + "learning_rate": 0.00029828744196176547, + "loss": 0.8122, + "step": 3415 + }, + { + "epoch": 0.4307711685612621, + "grad_norm": 0.44973024725914, + "learning_rate": 0.0002982708392409009, + "loss": 0.7813, + "step": 3420 + }, + { + "epoch": 0.4314009509714394, + "grad_norm": 0.3922254741191864, + "learning_rate": 0.00029825415689520887, + "loss": 0.7809, + "step": 3425 + }, + { + "epoch": 0.4320307333816166, + "grad_norm": 0.391084223985672, + "learning_rate": 0.00029823739493364804, + "loss": 0.7757, + "step": 3430 + }, + { + "epoch": 0.43266051579179393, + "grad_norm": 0.4502919316291809, + "learning_rate": 0.00029822055336522005, + "loss": 0.7688, + "step": 3435 + }, + { + "epoch": 0.43329029820197124, + "grad_norm": 0.475436270236969, + "learning_rate": 0.0002982036321989692, + "loss": 0.7667, + "step": 3440 + }, + { + "epoch": 0.4339200806121485, + "grad_norm": 0.42362409830093384, + "learning_rate": 0.00029818663144398253, + "loss": 0.8098, + "step": 3445 + }, + { + "epoch": 0.4345498630223258, + "grad_norm": 0.45329517126083374, + "learning_rate": 0.0002981695511093898, + "loss": 0.7706, + "step": 3450 + }, + { + "epoch": 0.4351796454325031, + "grad_norm": 0.4297536313533783, + "learning_rate": 0.00029815239120436365, + "loss": 0.808, + "step": 3455 + }, + { + "epoch": 0.43580942784268034, + "grad_norm": 0.4800092577934265, + "learning_rate": 0.0002981351517381192, + "loss": 0.7973, + "step": 3460 + }, + { + "epoch": 0.43643921025285765, + "grad_norm": 0.5014777779579163, + "learning_rate": 0.00029811783271991454, + "loss": 0.8098, + "step": 3465 + }, + { + "epoch": 0.4370689926630349, + "grad_norm": 0.4412321448326111, + "learning_rate": 0.00029810043415905027, + "loss": 0.7669, + "step": 3470 + }, + { + "epoch": 0.4376987750732122, + "grad_norm": 0.4491146206855774, + "learning_rate": 0.00029808295606486993, + "loss": 0.7599, + "step": 3475 + }, + { + "epoch": 0.4383285574833895, + "grad_norm": 0.42482897639274597, + "learning_rate": 0.0002980653984467596, + "loss": 0.7501, + "step": 3480 + }, + { + "epoch": 0.43895833989356675, + "grad_norm": 0.4166581332683563, + "learning_rate": 0.0002980477613141482, + "loss": 0.7807, + "step": 3485 + }, + { + "epoch": 0.43958812230374406, + "grad_norm": 0.48076120018959045, + "learning_rate": 0.0002980300446765071, + "loss": 0.821, + "step": 3490 + }, + { + "epoch": 0.44021790471392136, + "grad_norm": 0.4148639142513275, + "learning_rate": 0.00029801224854335073, + "loss": 0.781, + "step": 3495 + }, + { + "epoch": 0.4408476871240986, + "grad_norm": 0.41731131076812744, + "learning_rate": 0.00029799437292423586, + "loss": 0.7784, + "step": 3500 + }, + { + "epoch": 0.4414774695342759, + "grad_norm": 0.4514264762401581, + "learning_rate": 0.00029797641782876224, + "loss": 0.8066, + "step": 3505 + }, + { + "epoch": 0.4421072519444532, + "grad_norm": 0.44717252254486084, + "learning_rate": 0.00029795838326657204, + "loss": 0.7761, + "step": 3510 + }, + { + "epoch": 0.44273703435463047, + "grad_norm": 0.42850586771965027, + "learning_rate": 0.00029794026924735034, + "loss": 0.783, + "step": 3515 + }, + { + "epoch": 0.44336681676480777, + "grad_norm": 0.7937319278717041, + "learning_rate": 0.00029792207578082476, + "loss": 0.7894, + "step": 3520 + }, + { + "epoch": 0.443996599174985, + "grad_norm": 0.4401470124721527, + "learning_rate": 0.0002979038028767656, + "loss": 0.8046, + "step": 3525 + }, + { + "epoch": 0.4446263815851623, + "grad_norm": 0.45515474677085876, + "learning_rate": 0.00029788545054498577, + "loss": 0.8095, + "step": 3530 + }, + { + "epoch": 0.4452561639953396, + "grad_norm": 0.4676735997200012, + "learning_rate": 0.00029786701879534093, + "loss": 0.7969, + "step": 3535 + }, + { + "epoch": 0.4458859464055169, + "grad_norm": 0.42322975397109985, + "learning_rate": 0.0002978485076377294, + "loss": 0.8336, + "step": 3540 + }, + { + "epoch": 0.4465157288156942, + "grad_norm": 0.4256497025489807, + "learning_rate": 0.000297829917082092, + "loss": 0.7773, + "step": 3545 + }, + { + "epoch": 0.4471455112258715, + "grad_norm": 0.40527772903442383, + "learning_rate": 0.00029781124713841237, + "loss": 0.8058, + "step": 3550 + }, + { + "epoch": 0.44777529363604873, + "grad_norm": 0.4047418534755707, + "learning_rate": 0.0002977924978167166, + "loss": 0.7769, + "step": 3555 + }, + { + "epoch": 0.44840507604622604, + "grad_norm": 0.4016299545764923, + "learning_rate": 0.00029777366912707366, + "loss": 0.7531, + "step": 3560 + }, + { + "epoch": 0.4490348584564033, + "grad_norm": 0.4176371395587921, + "learning_rate": 0.00029775476107959486, + "loss": 0.7865, + "step": 3565 + }, + { + "epoch": 0.4496646408665806, + "grad_norm": 0.44107553362846375, + "learning_rate": 0.00029773577368443426, + "loss": 0.7735, + "step": 3570 + }, + { + "epoch": 0.4502944232767579, + "grad_norm": 0.43897122144699097, + "learning_rate": 0.00029771670695178857, + "loss": 0.7715, + "step": 3575 + }, + { + "epoch": 0.45092420568693514, + "grad_norm": 0.4024025499820709, + "learning_rate": 0.000297697560891897, + "loss": 0.7442, + "step": 3580 + }, + { + "epoch": 0.45155398809711245, + "grad_norm": 0.4896734356880188, + "learning_rate": 0.0002976783355150415, + "loss": 0.829, + "step": 3585 + }, + { + "epoch": 0.45218377050728975, + "grad_norm": 0.47621142864227295, + "learning_rate": 0.0002976590308315465, + "loss": 0.7915, + "step": 3590 + }, + { + "epoch": 0.452813552917467, + "grad_norm": 0.3869519829750061, + "learning_rate": 0.00029763964685177905, + "loss": 0.7696, + "step": 3595 + }, + { + "epoch": 0.4534433353276443, + "grad_norm": 0.4245865046977997, + "learning_rate": 0.0002976201835861488, + "loss": 0.7281, + "step": 3600 + }, + { + "epoch": 0.4540731177378216, + "grad_norm": 0.40193241834640503, + "learning_rate": 0.0002976006410451079, + "loss": 0.7435, + "step": 3605 + }, + { + "epoch": 0.45470290014799886, + "grad_norm": 0.4142551124095917, + "learning_rate": 0.00029758101923915123, + "loss": 0.7627, + "step": 3610 + }, + { + "epoch": 0.45533268255817616, + "grad_norm": 0.4214671552181244, + "learning_rate": 0.0002975613181788162, + "loss": 0.8084, + "step": 3615 + }, + { + "epoch": 0.4559624649683534, + "grad_norm": 0.46768540143966675, + "learning_rate": 0.0002975415378746826, + "loss": 0.7502, + "step": 3620 + }, + { + "epoch": 0.4565922473785307, + "grad_norm": 0.43812718987464905, + "learning_rate": 0.00029752167833737295, + "loss": 0.7555, + "step": 3625 + }, + { + "epoch": 0.457222029788708, + "grad_norm": 0.44065701961517334, + "learning_rate": 0.00029750173957755223, + "loss": 0.7824, + "step": 3630 + }, + { + "epoch": 0.45785181219888527, + "grad_norm": 0.46632614731788635, + "learning_rate": 0.00029748172160592816, + "loss": 0.7787, + "step": 3635 + }, + { + "epoch": 0.45848159460906257, + "grad_norm": 0.3616549074649811, + "learning_rate": 0.00029746162443325066, + "loss": 0.766, + "step": 3640 + }, + { + "epoch": 0.4591113770192399, + "grad_norm": 0.39852583408355713, + "learning_rate": 0.00029744144807031253, + "loss": 0.7318, + "step": 3645 + }, + { + "epoch": 0.4597411594294171, + "grad_norm": 0.4082608222961426, + "learning_rate": 0.0002974211925279488, + "loss": 0.7726, + "step": 3650 + }, + { + "epoch": 0.4603709418395944, + "grad_norm": 0.4750503599643707, + "learning_rate": 0.00029740085781703726, + "loss": 0.7953, + "step": 3655 + }, + { + "epoch": 0.4610007242497717, + "grad_norm": 0.439531534910202, + "learning_rate": 0.0002973804439484981, + "loss": 0.788, + "step": 3660 + }, + { + "epoch": 0.461630506659949, + "grad_norm": 0.41563692688941956, + "learning_rate": 0.000297359950933294, + "loss": 0.7935, + "step": 3665 + }, + { + "epoch": 0.4622602890701263, + "grad_norm": 0.4645535945892334, + "learning_rate": 0.00029733937878243015, + "loss": 0.7716, + "step": 3670 + }, + { + "epoch": 0.46289007148030353, + "grad_norm": 0.4334595501422882, + "learning_rate": 0.0002973187275069544, + "loss": 0.7455, + "step": 3675 + }, + { + "epoch": 0.46351985389048084, + "grad_norm": 0.4452027678489685, + "learning_rate": 0.0002972979971179568, + "loss": 0.7533, + "step": 3680 + }, + { + "epoch": 0.46414963630065814, + "grad_norm": 0.4289001226425171, + "learning_rate": 0.0002972771876265701, + "loss": 0.8066, + "step": 3685 + }, + { + "epoch": 0.4647794187108354, + "grad_norm": 0.44446882605552673, + "learning_rate": 0.0002972562990439694, + "loss": 0.8017, + "step": 3690 + }, + { + "epoch": 0.4654092011210127, + "grad_norm": 0.4466266930103302, + "learning_rate": 0.00029723533138137256, + "loss": 0.7686, + "step": 3695 + }, + { + "epoch": 0.46603898353119, + "grad_norm": 0.44262874126434326, + "learning_rate": 0.0002972142846500395, + "loss": 0.7835, + "step": 3700 + }, + { + "epoch": 0.46666876594136725, + "grad_norm": 0.40932217240333557, + "learning_rate": 0.0002971931588612729, + "loss": 0.7844, + "step": 3705 + }, + { + "epoch": 0.46729854835154455, + "grad_norm": 0.38220685720443726, + "learning_rate": 0.0002971719540264177, + "loss": 0.7682, + "step": 3710 + }, + { + "epoch": 0.4679283307617218, + "grad_norm": 0.4890231788158417, + "learning_rate": 0.0002971506701568614, + "loss": 0.7883, + "step": 3715 + }, + { + "epoch": 0.4685581131718991, + "grad_norm": 0.44211700558662415, + "learning_rate": 0.00029712930726403397, + "loss": 0.7287, + "step": 3720 + }, + { + "epoch": 0.4691878955820764, + "grad_norm": 0.4585074782371521, + "learning_rate": 0.0002971078653594078, + "loss": 0.7452, + "step": 3725 + }, + { + "epoch": 0.46981767799225366, + "grad_norm": 0.3818283975124359, + "learning_rate": 0.00029708634445449754, + "loss": 0.751, + "step": 3730 + }, + { + "epoch": 0.47044746040243096, + "grad_norm": 0.4351007640361786, + "learning_rate": 0.00029706474456086054, + "loss": 0.7665, + "step": 3735 + }, + { + "epoch": 0.47107724281260827, + "grad_norm": 0.4167363941669464, + "learning_rate": 0.0002970430656900964, + "loss": 0.7421, + "step": 3740 + }, + { + "epoch": 0.4717070252227855, + "grad_norm": 0.40461474657058716, + "learning_rate": 0.0002970213078538472, + "loss": 0.7496, + "step": 3745 + }, + { + "epoch": 0.4723368076329628, + "grad_norm": 0.38994473218917847, + "learning_rate": 0.00029699947106379734, + "loss": 0.773, + "step": 3750 + }, + { + "epoch": 0.4729665900431401, + "grad_norm": 0.42335331439971924, + "learning_rate": 0.0002969775553316737, + "loss": 0.7496, + "step": 3755 + }, + { + "epoch": 0.47359637245331737, + "grad_norm": 0.39755743741989136, + "learning_rate": 0.0002969555606692455, + "loss": 0.7794, + "step": 3760 + }, + { + "epoch": 0.4742261548634947, + "grad_norm": 0.4671246409416199, + "learning_rate": 0.0002969334870883244, + "loss": 0.8289, + "step": 3765 + }, + { + "epoch": 0.4748559372736719, + "grad_norm": 0.4498395621776581, + "learning_rate": 0.00029691133460076443, + "loss": 0.7856, + "step": 3770 + }, + { + "epoch": 0.47548571968384923, + "grad_norm": 0.4068240225315094, + "learning_rate": 0.00029688910321846193, + "loss": 0.7572, + "step": 3775 + }, + { + "epoch": 0.47611550209402653, + "grad_norm": 0.4673171043395996, + "learning_rate": 0.0002968667929533557, + "loss": 0.7972, + "step": 3780 + }, + { + "epoch": 0.4767452845042038, + "grad_norm": 0.4210684597492218, + "learning_rate": 0.00029684440381742697, + "loss": 0.7566, + "step": 3785 + }, + { + "epoch": 0.4773750669143811, + "grad_norm": 0.4167214632034302, + "learning_rate": 0.000296821935822699, + "loss": 0.7383, + "step": 3790 + }, + { + "epoch": 0.4780048493245584, + "grad_norm": 0.3826481103897095, + "learning_rate": 0.0002967993889812378, + "loss": 0.7749, + "step": 3795 + }, + { + "epoch": 0.47863463173473564, + "grad_norm": 0.416202187538147, + "learning_rate": 0.0002967767633051514, + "loss": 0.7755, + "step": 3800 + }, + { + "epoch": 0.47926441414491294, + "grad_norm": 0.38089507818222046, + "learning_rate": 0.0002967540588065904, + "loss": 0.7813, + "step": 3805 + }, + { + "epoch": 0.4798941965550902, + "grad_norm": 0.5257447957992554, + "learning_rate": 0.0002967312754977476, + "loss": 0.7408, + "step": 3810 + }, + { + "epoch": 0.4805239789652675, + "grad_norm": 0.4472402334213257, + "learning_rate": 0.00029670841339085813, + "loss": 0.7946, + "step": 3815 + }, + { + "epoch": 0.4811537613754448, + "grad_norm": 0.39956340193748474, + "learning_rate": 0.00029668547249819957, + "loss": 0.7469, + "step": 3820 + }, + { + "epoch": 0.48178354378562205, + "grad_norm": 0.41508588194847107, + "learning_rate": 0.00029666245283209154, + "loss": 0.7328, + "step": 3825 + }, + { + "epoch": 0.48241332619579935, + "grad_norm": 0.3888874053955078, + "learning_rate": 0.00029663935440489624, + "loss": 0.7529, + "step": 3830 + }, + { + "epoch": 0.48304310860597666, + "grad_norm": 0.39619430899620056, + "learning_rate": 0.00029661617722901806, + "loss": 0.7406, + "step": 3835 + }, + { + "epoch": 0.4836728910161539, + "grad_norm": 0.38242536783218384, + "learning_rate": 0.0002965929213169036, + "loss": 0.7355, + "step": 3840 + }, + { + "epoch": 0.4843026734263312, + "grad_norm": 0.37065988779067993, + "learning_rate": 0.0002965695866810419, + "loss": 0.7087, + "step": 3845 + }, + { + "epoch": 0.4849324558365085, + "grad_norm": 0.4015233814716339, + "learning_rate": 0.0002965461733339641, + "loss": 0.7531, + "step": 3850 + }, + { + "epoch": 0.48556223824668576, + "grad_norm": 0.4394242465496063, + "learning_rate": 0.0002965226812882438, + "loss": 0.7619, + "step": 3855 + }, + { + "epoch": 0.48619202065686307, + "grad_norm": 0.39809074997901917, + "learning_rate": 0.00029649911055649666, + "loss": 0.7702, + "step": 3860 + }, + { + "epoch": 0.4868218030670403, + "grad_norm": 0.5184118747711182, + "learning_rate": 0.0002964754611513808, + "loss": 0.7931, + "step": 3865 + }, + { + "epoch": 0.4874515854772176, + "grad_norm": 0.5002544522285461, + "learning_rate": 0.00029645173308559644, + "loss": 0.7989, + "step": 3870 + }, + { + "epoch": 0.4880813678873949, + "grad_norm": 0.551458477973938, + "learning_rate": 0.0002964279263718861, + "loss": 0.7345, + "step": 3875 + }, + { + "epoch": 0.48871115029757217, + "grad_norm": 0.499612420797348, + "learning_rate": 0.0002964040410230345, + "loss": 0.7885, + "step": 3880 + }, + { + "epoch": 0.4893409327077495, + "grad_norm": 0.5279458165168762, + "learning_rate": 0.0002963800770518687, + "loss": 0.795, + "step": 3885 + }, + { + "epoch": 0.4899707151179268, + "grad_norm": 0.47077476978302, + "learning_rate": 0.0002963560344712578, + "loss": 0.7716, + "step": 3890 + }, + { + "epoch": 0.49060049752810403, + "grad_norm": 0.8377729058265686, + "learning_rate": 0.0002963319132941133, + "loss": 0.7625, + "step": 3895 + }, + { + "epoch": 0.49123027993828133, + "grad_norm": 0.43871793150901794, + "learning_rate": 0.0002963077135333888, + "loss": 0.734, + "step": 3900 + }, + { + "epoch": 0.4918600623484586, + "grad_norm": 0.44589656591415405, + "learning_rate": 0.00029628343520208004, + "loss": 0.7735, + "step": 3905 + }, + { + "epoch": 0.4924898447586359, + "grad_norm": 0.7113938927650452, + "learning_rate": 0.00029625907831322515, + "loss": 0.7611, + "step": 3910 + }, + { + "epoch": 0.4931196271688132, + "grad_norm": 0.3830680847167969, + "learning_rate": 0.0002962346428799043, + "loss": 0.7399, + "step": 3915 + }, + { + "epoch": 0.49374940957899044, + "grad_norm": 0.4787169396877289, + "learning_rate": 0.00029621012891523985, + "loss": 0.7572, + "step": 3920 + }, + { + "epoch": 0.49437919198916774, + "grad_norm": 0.428469717502594, + "learning_rate": 0.0002961855364323964, + "loss": 0.7548, + "step": 3925 + }, + { + "epoch": 0.49500897439934505, + "grad_norm": 0.3982272148132324, + "learning_rate": 0.00029616086544458065, + "loss": 0.7846, + "step": 3930 + }, + { + "epoch": 0.4956387568095223, + "grad_norm": 0.45961281657218933, + "learning_rate": 0.00029613611596504146, + "loss": 0.8041, + "step": 3935 + }, + { + "epoch": 0.4962685392196996, + "grad_norm": 0.400146484375, + "learning_rate": 0.00029611128800706996, + "loss": 0.7395, + "step": 3940 + }, + { + "epoch": 0.4968983216298769, + "grad_norm": 0.3984740674495697, + "learning_rate": 0.00029608638158399925, + "loss": 0.7569, + "step": 3945 + }, + { + "epoch": 0.49752810404005415, + "grad_norm": 0.4343632161617279, + "learning_rate": 0.0002960613967092046, + "loss": 0.7958, + "step": 3950 + }, + { + "epoch": 0.49815788645023146, + "grad_norm": 0.4569413363933563, + "learning_rate": 0.0002960363333961036, + "loss": 0.7673, + "step": 3955 + }, + { + "epoch": 0.4987876688604087, + "grad_norm": 0.4818612039089203, + "learning_rate": 0.0002960111916581557, + "loss": 0.7621, + "step": 3960 + }, + { + "epoch": 0.499417451270586, + "grad_norm": 0.4524887800216675, + "learning_rate": 0.0002959859715088626, + "loss": 0.7793, + "step": 3965 + }, + { + "epoch": 0.5000472336807633, + "grad_norm": 0.4414517283439636, + "learning_rate": 0.0002959606729617682, + "loss": 0.7557, + "step": 3970 + }, + { + "epoch": 0.5006770160909406, + "grad_norm": 0.3883852958679199, + "learning_rate": 0.0002959352960304583, + "loss": 0.7146, + "step": 3975 + }, + { + "epoch": 0.5013067985011178, + "grad_norm": 0.34819212555885315, + "learning_rate": 0.00029590984072856084, + "loss": 0.7271, + "step": 3980 + }, + { + "epoch": 0.5019365809112951, + "grad_norm": 0.3943585753440857, + "learning_rate": 0.0002958843070697461, + "loss": 0.817, + "step": 3985 + }, + { + "epoch": 0.5025663633214724, + "grad_norm": 0.3881372809410095, + "learning_rate": 0.000295858695067726, + "loss": 0.6997, + "step": 3990 + }, + { + "epoch": 0.5031961457316497, + "grad_norm": 0.4077765941619873, + "learning_rate": 0.00029583300473625497, + "loss": 0.789, + "step": 3995 + }, + { + "epoch": 0.503825928141827, + "grad_norm": 0.4519467353820801, + "learning_rate": 0.0002958072360891292, + "loss": 0.7081, + "step": 4000 + }, + { + "epoch": 0.503825928141827, + "eval_loss": 0.3237769305706024, + "eval_runtime": 6.2328, + "eval_samples_per_second": 160.442, + "eval_steps_per_second": 10.108, + "step": 4000 + }, + { + "epoch": 0.5044557105520043, + "grad_norm": 0.4161011278629303, + "learning_rate": 0.00029578138914018704, + "loss": 0.7426, + "step": 4005 + }, + { + "epoch": 0.5050854929621815, + "grad_norm": 0.4170863926410675, + "learning_rate": 0.0002957554639033089, + "loss": 0.7614, + "step": 4010 + }, + { + "epoch": 0.5057152753723588, + "grad_norm": 0.41827666759490967, + "learning_rate": 0.0002957294603924172, + "loss": 0.7339, + "step": 4015 + }, + { + "epoch": 0.5063450577825361, + "grad_norm": 0.4575699269771576, + "learning_rate": 0.0002957033786214766, + "loss": 0.7506, + "step": 4020 + }, + { + "epoch": 0.5069748401927134, + "grad_norm": 0.39499175548553467, + "learning_rate": 0.00029567721860449333, + "loss": 0.7227, + "step": 4025 + }, + { + "epoch": 0.5076046226028907, + "grad_norm": 0.4190625548362732, + "learning_rate": 0.00029565098035551606, + "loss": 0.7375, + "step": 4030 + }, + { + "epoch": 0.5082344050130679, + "grad_norm": 0.41470181941986084, + "learning_rate": 0.00029562466388863534, + "loss": 0.7953, + "step": 4035 + }, + { + "epoch": 0.5088641874232452, + "grad_norm": 0.5376180410385132, + "learning_rate": 0.00029559826921798373, + "loss": 0.7927, + "step": 4040 + }, + { + "epoch": 0.5094939698334225, + "grad_norm": 0.41073331236839294, + "learning_rate": 0.0002955717963577357, + "loss": 0.7175, + "step": 4045 + }, + { + "epoch": 0.5101237522435998, + "grad_norm": 0.3894195854663849, + "learning_rate": 0.0002955452453221078, + "loss": 0.743, + "step": 4050 + }, + { + "epoch": 0.5107535346537772, + "grad_norm": 0.37404918670654297, + "learning_rate": 0.00029551861612535856, + "loss": 0.6833, + "step": 4055 + }, + { + "epoch": 0.5113833170639545, + "grad_norm": 0.4107655882835388, + "learning_rate": 0.0002954919087817885, + "loss": 0.7588, + "step": 4060 + }, + { + "epoch": 0.5120130994741316, + "grad_norm": 0.4086790382862091, + "learning_rate": 0.00029546512330574004, + "loss": 0.7328, + "step": 4065 + }, + { + "epoch": 0.512642881884309, + "grad_norm": 0.4246830344200134, + "learning_rate": 0.0002954382597115976, + "loss": 0.7171, + "step": 4070 + }, + { + "epoch": 0.5132726642944863, + "grad_norm": 0.4021676182746887, + "learning_rate": 0.00029541131801378743, + "loss": 0.8009, + "step": 4075 + }, + { + "epoch": 0.5139024467046636, + "grad_norm": 0.42611315846443176, + "learning_rate": 0.00029538429822677806, + "loss": 0.7338, + "step": 4080 + }, + { + "epoch": 0.5145322291148409, + "grad_norm": 0.40971845388412476, + "learning_rate": 0.0002953572003650795, + "loss": 0.7883, + "step": 4085 + }, + { + "epoch": 0.5151620115250181, + "grad_norm": 0.4226076304912567, + "learning_rate": 0.0002953300244432441, + "loss": 0.7696, + "step": 4090 + }, + { + "epoch": 0.5157917939351954, + "grad_norm": 0.4504645764827728, + "learning_rate": 0.0002953027704758659, + "loss": 0.7123, + "step": 4095 + }, + { + "epoch": 0.5164215763453727, + "grad_norm": 0.4032662510871887, + "learning_rate": 0.00029527543847758086, + "loss": 0.6786, + "step": 4100 + }, + { + "epoch": 0.51705135875555, + "grad_norm": 0.4030795097351074, + "learning_rate": 0.00029524802846306694, + "loss": 0.7335, + "step": 4105 + }, + { + "epoch": 0.5176811411657273, + "grad_norm": 0.4887290596961975, + "learning_rate": 0.0002952205404470439, + "loss": 0.7238, + "step": 4110 + }, + { + "epoch": 0.5183109235759045, + "grad_norm": 0.4061615467071533, + "learning_rate": 0.00029519297444427343, + "loss": 0.7733, + "step": 4115 + }, + { + "epoch": 0.5189407059860818, + "grad_norm": 0.4060840308666229, + "learning_rate": 0.00029516533046955917, + "loss": 0.7268, + "step": 4120 + }, + { + "epoch": 0.5195704883962591, + "grad_norm": 0.4293743371963501, + "learning_rate": 0.0002951376085377465, + "loss": 0.7234, + "step": 4125 + }, + { + "epoch": 0.5202002708064364, + "grad_norm": 0.410264790058136, + "learning_rate": 0.00029510980866372273, + "loss": 0.774, + "step": 4130 + }, + { + "epoch": 0.5208300532166137, + "grad_norm": 0.3944232761859894, + "learning_rate": 0.0002950819308624171, + "loss": 0.7517, + "step": 4135 + }, + { + "epoch": 0.521459835626791, + "grad_norm": 0.4245687425136566, + "learning_rate": 0.0002950539751488005, + "loss": 0.7612, + "step": 4140 + }, + { + "epoch": 0.5220896180369682, + "grad_norm": 0.3795585632324219, + "learning_rate": 0.00029502594153788593, + "loss": 0.7778, + "step": 4145 + }, + { + "epoch": 0.5227194004471455, + "grad_norm": 0.42911338806152344, + "learning_rate": 0.000294997830044728, + "loss": 0.8033, + "step": 4150 + }, + { + "epoch": 0.5233491828573228, + "grad_norm": 0.4150259494781494, + "learning_rate": 0.0002949696406844232, + "loss": 0.7326, + "step": 4155 + }, + { + "epoch": 0.5239789652675001, + "grad_norm": 0.3846616744995117, + "learning_rate": 0.0002949413734721099, + "loss": 0.7085, + "step": 4160 + }, + { + "epoch": 0.5246087476776774, + "grad_norm": 0.3434165418148041, + "learning_rate": 0.00029491302842296824, + "loss": 0.711, + "step": 4165 + }, + { + "epoch": 0.5252385300878546, + "grad_norm": 0.33985382318496704, + "learning_rate": 0.0002948846055522202, + "loss": 0.7493, + "step": 4170 + }, + { + "epoch": 0.5258683124980319, + "grad_norm": 0.3809979259967804, + "learning_rate": 0.0002948561048751294, + "loss": 0.7224, + "step": 4175 + }, + { + "epoch": 0.5264980949082092, + "grad_norm": 0.45042338967323303, + "learning_rate": 0.00029482752640700143, + "loss": 0.7554, + "step": 4180 + }, + { + "epoch": 0.5271278773183865, + "grad_norm": 0.4068913757801056, + "learning_rate": 0.00029479887016318357, + "loss": 0.7267, + "step": 4185 + }, + { + "epoch": 0.5277576597285638, + "grad_norm": 0.41964098811149597, + "learning_rate": 0.0002947701361590649, + "loss": 0.7255, + "step": 4190 + }, + { + "epoch": 0.5283874421387411, + "grad_norm": 0.3956906795501709, + "learning_rate": 0.0002947413244100762, + "loss": 0.7272, + "step": 4195 + }, + { + "epoch": 0.5290172245489183, + "grad_norm": 0.41254857182502747, + "learning_rate": 0.0002947124349316901, + "loss": 0.7155, + "step": 4200 + }, + { + "epoch": 0.5296470069590956, + "grad_norm": 0.4162386655807495, + "learning_rate": 0.0002946834677394208, + "loss": 0.7729, + "step": 4205 + }, + { + "epoch": 0.5302767893692729, + "grad_norm": 0.4521070420742035, + "learning_rate": 0.00029465442284882436, + "loss": 0.7328, + "step": 4210 + }, + { + "epoch": 0.5309065717794502, + "grad_norm": 0.3702057898044586, + "learning_rate": 0.00029462530027549866, + "loss": 0.7592, + "step": 4215 + }, + { + "epoch": 0.5315363541896275, + "grad_norm": 0.4132764935493469, + "learning_rate": 0.00029459610003508313, + "loss": 0.7238, + "step": 4220 + }, + { + "epoch": 0.5321661365998047, + "grad_norm": 0.3817763328552246, + "learning_rate": 0.0002945668221432589, + "loss": 0.7524, + "step": 4225 + }, + { + "epoch": 0.532795919009982, + "grad_norm": 0.41137659549713135, + "learning_rate": 0.000294537466615749, + "loss": 0.7405, + "step": 4230 + }, + { + "epoch": 0.5334257014201593, + "grad_norm": 0.446150541305542, + "learning_rate": 0.00029450803346831787, + "loss": 0.7481, + "step": 4235 + }, + { + "epoch": 0.5340554838303366, + "grad_norm": 0.37535202503204346, + "learning_rate": 0.0002944785227167719, + "loss": 0.7505, + "step": 4240 + }, + { + "epoch": 0.5346852662405139, + "grad_norm": 0.4109747111797333, + "learning_rate": 0.000294448934376959, + "loss": 0.7406, + "step": 4245 + }, + { + "epoch": 0.5353150486506912, + "grad_norm": 0.4233269989490509, + "learning_rate": 0.00029441926846476873, + "loss": 0.7823, + "step": 4250 + }, + { + "epoch": 0.5359448310608684, + "grad_norm": 0.40127456188201904, + "learning_rate": 0.00029438952499613244, + "loss": 0.7486, + "step": 4255 + }, + { + "epoch": 0.5365746134710457, + "grad_norm": 0.40279653668403625, + "learning_rate": 0.000294359703987023, + "loss": 0.7157, + "step": 4260 + }, + { + "epoch": 0.537204395881223, + "grad_norm": 0.34208250045776367, + "learning_rate": 0.000294329805453455, + "loss": 0.7158, + "step": 4265 + }, + { + "epoch": 0.5378341782914003, + "grad_norm": 0.41574689745903015, + "learning_rate": 0.0002942998294114846, + "loss": 0.7668, + "step": 4270 + }, + { + "epoch": 0.5384639607015776, + "grad_norm": 0.401426762342453, + "learning_rate": 0.0002942697758772097, + "loss": 0.734, + "step": 4275 + }, + { + "epoch": 0.5390937431117548, + "grad_norm": 0.4085477292537689, + "learning_rate": 0.00029423964486676964, + "loss": 0.7448, + "step": 4280 + }, + { + "epoch": 0.5397235255219321, + "grad_norm": 0.43037959933280945, + "learning_rate": 0.0002942094363963456, + "loss": 0.7618, + "step": 4285 + }, + { + "epoch": 0.5403533079321095, + "grad_norm": 0.34685570001602173, + "learning_rate": 0.00029417915048216003, + "loss": 0.7314, + "step": 4290 + }, + { + "epoch": 0.5409830903422868, + "grad_norm": 0.3967381417751312, + "learning_rate": 0.00029414878714047725, + "loss": 0.7465, + "step": 4295 + }, + { + "epoch": 0.5416128727524641, + "grad_norm": 0.36378154158592224, + "learning_rate": 0.0002941183463876031, + "loss": 0.7372, + "step": 4300 + }, + { + "epoch": 0.5422426551626414, + "grad_norm": 0.3804253339767456, + "learning_rate": 0.00029408782823988494, + "loss": 0.7488, + "step": 4305 + }, + { + "epoch": 0.5428724375728186, + "grad_norm": 0.3679543137550354, + "learning_rate": 0.00029405723271371166, + "loss": 0.7253, + "step": 4310 + }, + { + "epoch": 0.5435022199829959, + "grad_norm": 0.35688257217407227, + "learning_rate": 0.0002940265598255138, + "loss": 0.7523, + "step": 4315 + }, + { + "epoch": 0.5441320023931732, + "grad_norm": 0.40890881419181824, + "learning_rate": 0.00029399580959176344, + "loss": 0.756, + "step": 4320 + }, + { + "epoch": 0.5447617848033505, + "grad_norm": 0.478547602891922, + "learning_rate": 0.00029396498202897406, + "loss": 0.7249, + "step": 4325 + }, + { + "epoch": 0.5453915672135278, + "grad_norm": 0.40117356181144714, + "learning_rate": 0.0002939340771537009, + "loss": 0.7466, + "step": 4330 + }, + { + "epoch": 0.546021349623705, + "grad_norm": 0.42868953943252563, + "learning_rate": 0.0002939030949825404, + "loss": 0.7894, + "step": 4335 + }, + { + "epoch": 0.5466511320338823, + "grad_norm": 0.41796940565109253, + "learning_rate": 0.0002938720355321309, + "loss": 0.7446, + "step": 4340 + }, + { + "epoch": 0.5472809144440596, + "grad_norm": 0.427336186170578, + "learning_rate": 0.0002938408988191519, + "loss": 0.7824, + "step": 4345 + }, + { + "epoch": 0.5479106968542369, + "grad_norm": 0.38179048895835876, + "learning_rate": 0.00029380968486032456, + "loss": 0.7427, + "step": 4350 + }, + { + "epoch": 0.5485404792644142, + "grad_norm": 0.39974477887153625, + "learning_rate": 0.0002937783936724115, + "loss": 0.7347, + "step": 4355 + }, + { + "epoch": 0.5491702616745914, + "grad_norm": 0.3805896043777466, + "learning_rate": 0.00029374702527221674, + "loss": 0.7547, + "step": 4360 + }, + { + "epoch": 0.5498000440847687, + "grad_norm": 0.43362486362457275, + "learning_rate": 0.0002937155796765859, + "loss": 0.7651, + "step": 4365 + }, + { + "epoch": 0.550429826494946, + "grad_norm": 0.38877996802330017, + "learning_rate": 0.000293684056902406, + "loss": 0.7054, + "step": 4370 + }, + { + "epoch": 0.5510596089051233, + "grad_norm": 0.393184095621109, + "learning_rate": 0.00029365245696660544, + "loss": 0.7453, + "step": 4375 + }, + { + "epoch": 0.5516893913153006, + "grad_norm": 0.3892836570739746, + "learning_rate": 0.0002936207798861541, + "loss": 0.7036, + "step": 4380 + }, + { + "epoch": 0.5523191737254779, + "grad_norm": 0.3737259805202484, + "learning_rate": 0.0002935890256780633, + "loss": 0.7403, + "step": 4385 + }, + { + "epoch": 0.5529489561356551, + "grad_norm": 0.36731937527656555, + "learning_rate": 0.00029355719435938585, + "loss": 0.7098, + "step": 4390 + }, + { + "epoch": 0.5535787385458324, + "grad_norm": 0.40238016843795776, + "learning_rate": 0.00029352528594721577, + "loss": 0.7625, + "step": 4395 + }, + { + "epoch": 0.5542085209560097, + "grad_norm": 0.3878697454929352, + "learning_rate": 0.0002934933004586887, + "loss": 0.7486, + "step": 4400 + }, + { + "epoch": 0.554838303366187, + "grad_norm": 0.36463412642478943, + "learning_rate": 0.00029346123791098157, + "loss": 0.7489, + "step": 4405 + }, + { + "epoch": 0.5554680857763643, + "grad_norm": 0.3860667049884796, + "learning_rate": 0.0002934290983213126, + "loss": 0.7503, + "step": 4410 + }, + { + "epoch": 0.5560978681865415, + "grad_norm": 0.40702390670776367, + "learning_rate": 0.0002933968817069417, + "loss": 0.6892, + "step": 4415 + }, + { + "epoch": 0.5567276505967188, + "grad_norm": 0.4769366979598999, + "learning_rate": 0.0002933645880851697, + "loss": 0.7285, + "step": 4420 + }, + { + "epoch": 0.5573574330068961, + "grad_norm": 0.37400034070014954, + "learning_rate": 0.00029333221747333913, + "loss": 0.7055, + "step": 4425 + }, + { + "epoch": 0.5579872154170734, + "grad_norm": 0.4280668795108795, + "learning_rate": 0.00029329976988883374, + "loss": 0.7629, + "step": 4430 + }, + { + "epoch": 0.5586169978272507, + "grad_norm": 0.3710954189300537, + "learning_rate": 0.00029326724534907856, + "loss": 0.696, + "step": 4435 + }, + { + "epoch": 0.559246780237428, + "grad_norm": 0.4311872720718384, + "learning_rate": 0.0002932346438715401, + "loss": 0.726, + "step": 4440 + }, + { + "epoch": 0.5598765626476052, + "grad_norm": 0.3708207309246063, + "learning_rate": 0.000293201965473726, + "loss": 0.7308, + "step": 4445 + }, + { + "epoch": 0.5605063450577825, + "grad_norm": 0.36177051067352295, + "learning_rate": 0.00029316921017318536, + "loss": 0.7403, + "step": 4450 + }, + { + "epoch": 0.5611361274679598, + "grad_norm": 0.4313011169433594, + "learning_rate": 0.0002931363779875086, + "loss": 0.7053, + "step": 4455 + }, + { + "epoch": 0.5617659098781371, + "grad_norm": 0.36055561900138855, + "learning_rate": 0.0002931034689343272, + "loss": 0.7544, + "step": 4460 + }, + { + "epoch": 0.5623956922883144, + "grad_norm": 0.37126588821411133, + "learning_rate": 0.0002930704830313142, + "loss": 0.7444, + "step": 4465 + }, + { + "epoch": 0.5630254746984916, + "grad_norm": 0.35056549310684204, + "learning_rate": 0.00029303742029618377, + "loss": 0.7251, + "step": 4470 + }, + { + "epoch": 0.5636552571086689, + "grad_norm": 0.3944834768772125, + "learning_rate": 0.0002930042807466913, + "loss": 0.771, + "step": 4475 + }, + { + "epoch": 0.5642850395188462, + "grad_norm": 0.39250391721725464, + "learning_rate": 0.0002929710644006334, + "loss": 0.7177, + "step": 4480 + }, + { + "epoch": 0.5649148219290235, + "grad_norm": 0.41848230361938477, + "learning_rate": 0.00029293777127584826, + "loss": 0.7362, + "step": 4485 + }, + { + "epoch": 0.5655446043392008, + "grad_norm": 0.3222586214542389, + "learning_rate": 0.00029290440139021477, + "loss": 0.6746, + "step": 4490 + }, + { + "epoch": 0.5661743867493781, + "grad_norm": 0.4275425672531128, + "learning_rate": 0.00029287095476165356, + "loss": 0.7641, + "step": 4495 + }, + { + "epoch": 0.5668041691595553, + "grad_norm": 0.37914222478866577, + "learning_rate": 0.0002928374314081261, + "loss": 0.7367, + "step": 4500 + }, + { + "epoch": 0.5674339515697326, + "grad_norm": 0.36903491616249084, + "learning_rate": 0.00029280383134763516, + "loss": 0.726, + "step": 4505 + }, + { + "epoch": 0.56806373397991, + "grad_norm": 0.45876601338386536, + "learning_rate": 0.0002927701545982249, + "loss": 0.7285, + "step": 4510 + }, + { + "epoch": 0.5686935163900873, + "grad_norm": 0.3885752856731415, + "learning_rate": 0.0002927364011779803, + "loss": 0.7111, + "step": 4515 + }, + { + "epoch": 0.5693232988002646, + "grad_norm": 0.3738529086112976, + "learning_rate": 0.00029270257110502784, + "loss": 0.7381, + "step": 4520 + }, + { + "epoch": 0.5699530812104417, + "grad_norm": 0.38678133487701416, + "learning_rate": 0.0002926686643975351, + "loss": 0.7069, + "step": 4525 + }, + { + "epoch": 0.570582863620619, + "grad_norm": 0.38699817657470703, + "learning_rate": 0.0002926346810737106, + "loss": 0.7456, + "step": 4530 + }, + { + "epoch": 0.5712126460307964, + "grad_norm": 0.39948272705078125, + "learning_rate": 0.0002926006211518043, + "loss": 0.7018, + "step": 4535 + }, + { + "epoch": 0.5718424284409737, + "grad_norm": 0.36441704630851746, + "learning_rate": 0.00029256648465010706, + "loss": 0.7155, + "step": 4540 + }, + { + "epoch": 0.572472210851151, + "grad_norm": 0.38412773609161377, + "learning_rate": 0.00029253227158695103, + "loss": 0.7131, + "step": 4545 + }, + { + "epoch": 0.5731019932613282, + "grad_norm": 0.3713320791721344, + "learning_rate": 0.0002924979819807094, + "loss": 0.7109, + "step": 4550 + }, + { + "epoch": 0.5737317756715055, + "grad_norm": 0.41460588574409485, + "learning_rate": 0.00029246361584979637, + "loss": 0.7218, + "step": 4555 + }, + { + "epoch": 0.5743615580816828, + "grad_norm": 0.37706735730171204, + "learning_rate": 0.0002924291732126675, + "loss": 0.7364, + "step": 4560 + }, + { + "epoch": 0.5749913404918601, + "grad_norm": 0.3931211829185486, + "learning_rate": 0.00029239465408781914, + "loss": 0.793, + "step": 4565 + }, + { + "epoch": 0.5756211229020374, + "grad_norm": 0.4280949831008911, + "learning_rate": 0.0002923600584937889, + "loss": 0.7577, + "step": 4570 + }, + { + "epoch": 0.5762509053122147, + "grad_norm": 0.408357173204422, + "learning_rate": 0.0002923253864491554, + "loss": 0.6866, + "step": 4575 + }, + { + "epoch": 0.5768806877223919, + "grad_norm": 0.3654685318470001, + "learning_rate": 0.0002922906379725383, + "loss": 0.7409, + "step": 4580 + }, + { + "epoch": 0.5775104701325692, + "grad_norm": 0.3723433017730713, + "learning_rate": 0.0002922558130825984, + "loss": 0.7106, + "step": 4585 + }, + { + "epoch": 0.5781402525427465, + "grad_norm": 0.40489017963409424, + "learning_rate": 0.00029222091179803735, + "loss": 0.7311, + "step": 4590 + }, + { + "epoch": 0.5787700349529238, + "grad_norm": 0.40270909667015076, + "learning_rate": 0.000292185934137598, + "loss": 0.7393, + "step": 4595 + }, + { + "epoch": 0.5793998173631011, + "grad_norm": 0.4228857159614563, + "learning_rate": 0.0002921508801200642, + "loss": 0.7253, + "step": 4600 + }, + { + "epoch": 0.5800295997732783, + "grad_norm": 0.39830881357192993, + "learning_rate": 0.0002921157497642607, + "loss": 0.7413, + "step": 4605 + }, + { + "epoch": 0.5806593821834556, + "grad_norm": 0.40520498156547546, + "learning_rate": 0.00029208054308905323, + "loss": 0.6902, + "step": 4610 + }, + { + "epoch": 0.5812891645936329, + "grad_norm": 0.3546881377696991, + "learning_rate": 0.0002920452601133487, + "loss": 0.7104, + "step": 4615 + }, + { + "epoch": 0.5819189470038102, + "grad_norm": 0.40294864773750305, + "learning_rate": 0.0002920099008560949, + "loss": 0.7258, + "step": 4620 + }, + { + "epoch": 0.5825487294139875, + "grad_norm": 0.36979302763938904, + "learning_rate": 0.0002919744653362804, + "loss": 0.708, + "step": 4625 + }, + { + "epoch": 0.5831785118241648, + "grad_norm": 0.42616382241249084, + "learning_rate": 0.000291938953572935, + "loss": 0.7044, + "step": 4630 + }, + { + "epoch": 0.583808294234342, + "grad_norm": 0.3644506335258484, + "learning_rate": 0.0002919033655851293, + "loss": 0.7277, + "step": 4635 + }, + { + "epoch": 0.5844380766445193, + "grad_norm": 0.34578534960746765, + "learning_rate": 0.0002918677013919749, + "loss": 0.7233, + "step": 4640 + }, + { + "epoch": 0.5850678590546966, + "grad_norm": 0.3914281725883484, + "learning_rate": 0.00029183196101262423, + "loss": 0.6829, + "step": 4645 + }, + { + "epoch": 0.5856976414648739, + "grad_norm": 0.35399550199508667, + "learning_rate": 0.0002917961444662707, + "loss": 0.7371, + "step": 4650 + }, + { + "epoch": 0.5863274238750512, + "grad_norm": 0.3999468684196472, + "learning_rate": 0.0002917602517721486, + "loss": 0.7228, + "step": 4655 + }, + { + "epoch": 0.5869572062852284, + "grad_norm": 0.4196580946445465, + "learning_rate": 0.0002917242829495332, + "loss": 0.7013, + "step": 4660 + }, + { + "epoch": 0.5875869886954057, + "grad_norm": 0.38301941752433777, + "learning_rate": 0.0002916882380177405, + "loss": 0.7409, + "step": 4665 + }, + { + "epoch": 0.588216771105583, + "grad_norm": 0.3997241258621216, + "learning_rate": 0.0002916521169961275, + "loss": 0.7216, + "step": 4670 + }, + { + "epoch": 0.5888465535157603, + "grad_norm": 0.3389524817466736, + "learning_rate": 0.00029161591990409203, + "loss": 0.7109, + "step": 4675 + }, + { + "epoch": 0.5894763359259376, + "grad_norm": 0.38282495737075806, + "learning_rate": 0.0002915796467610727, + "loss": 0.7608, + "step": 4680 + }, + { + "epoch": 0.5901061183361149, + "grad_norm": 0.40406349301338196, + "learning_rate": 0.000291543297586549, + "loss": 0.7062, + "step": 4685 + }, + { + "epoch": 0.5907359007462921, + "grad_norm": 0.37658101320266724, + "learning_rate": 0.0002915068724000413, + "loss": 0.7305, + "step": 4690 + }, + { + "epoch": 0.5913656831564694, + "grad_norm": 0.397401362657547, + "learning_rate": 0.0002914703712211108, + "loss": 0.7276, + "step": 4695 + }, + { + "epoch": 0.5919954655666467, + "grad_norm": 0.4348791539669037, + "learning_rate": 0.0002914337940693594, + "loss": 0.7572, + "step": 4700 + }, + { + "epoch": 0.592625247976824, + "grad_norm": 0.372659295797348, + "learning_rate": 0.0002913971409644299, + "loss": 0.7436, + "step": 4705 + }, + { + "epoch": 0.5932550303870013, + "grad_norm": 0.3933033049106598, + "learning_rate": 0.0002913604119260059, + "loss": 0.7229, + "step": 4710 + }, + { + "epoch": 0.5938848127971785, + "grad_norm": 0.35579994320869446, + "learning_rate": 0.0002913236069738116, + "loss": 0.7055, + "step": 4715 + }, + { + "epoch": 0.5945145952073558, + "grad_norm": 0.40102267265319824, + "learning_rate": 0.0002912867261276122, + "loss": 0.7167, + "step": 4720 + }, + { + "epoch": 0.5951443776175331, + "grad_norm": 0.3881862163543701, + "learning_rate": 0.0002912497694072136, + "loss": 0.7395, + "step": 4725 + }, + { + "epoch": 0.5957741600277104, + "grad_norm": 0.43878352642059326, + "learning_rate": 0.00029121273683246234, + "loss": 0.7251, + "step": 4730 + }, + { + "epoch": 0.5964039424378877, + "grad_norm": 0.3500851094722748, + "learning_rate": 0.0002911756284232457, + "loss": 0.6989, + "step": 4735 + }, + { + "epoch": 0.5970337248480649, + "grad_norm": 0.3887772262096405, + "learning_rate": 0.00029113844419949184, + "loss": 0.7324, + "step": 4740 + }, + { + "epoch": 0.5976635072582422, + "grad_norm": 0.376321405172348, + "learning_rate": 0.0002911011841811695, + "loss": 0.7239, + "step": 4745 + }, + { + "epoch": 0.5982932896684195, + "grad_norm": 0.3770900070667267, + "learning_rate": 0.00029106384838828816, + "loss": 0.6973, + "step": 4750 + }, + { + "epoch": 0.5989230720785969, + "grad_norm": 0.366205096244812, + "learning_rate": 0.000291026436840898, + "loss": 0.749, + "step": 4755 + }, + { + "epoch": 0.5995528544887742, + "grad_norm": 0.40675458312034607, + "learning_rate": 0.00029098894955908983, + "loss": 0.7155, + "step": 4760 + }, + { + "epoch": 0.6001826368989515, + "grad_norm": 0.4125664532184601, + "learning_rate": 0.0002909513865629953, + "loss": 0.7455, + "step": 4765 + }, + { + "epoch": 0.6008124193091287, + "grad_norm": 0.39872869849205017, + "learning_rate": 0.0002909137478727864, + "loss": 0.7194, + "step": 4770 + }, + { + "epoch": 0.601442201719306, + "grad_norm": 0.3824906051158905, + "learning_rate": 0.00029087603350867616, + "loss": 0.742, + "step": 4775 + }, + { + "epoch": 0.6020719841294833, + "grad_norm": 0.37914398312568665, + "learning_rate": 0.00029083824349091794, + "loss": 0.692, + "step": 4780 + }, + { + "epoch": 0.6027017665396606, + "grad_norm": 0.35589495301246643, + "learning_rate": 0.0002908003778398059, + "loss": 0.6706, + "step": 4785 + }, + { + "epoch": 0.6033315489498379, + "grad_norm": 0.34000104665756226, + "learning_rate": 0.0002907624365756748, + "loss": 0.7506, + "step": 4790 + }, + { + "epoch": 0.6039613313600151, + "grad_norm": 0.34795689582824707, + "learning_rate": 0.0002907244197188998, + "loss": 0.7097, + "step": 4795 + }, + { + "epoch": 0.6045911137701924, + "grad_norm": 0.38767385482788086, + "learning_rate": 0.00029068632728989697, + "loss": 0.6986, + "step": 4800 + }, + { + "epoch": 0.6052208961803697, + "grad_norm": 0.40651988983154297, + "learning_rate": 0.00029064815930912276, + "loss": 0.7159, + "step": 4805 + }, + { + "epoch": 0.605850678590547, + "grad_norm": 0.37715932726860046, + "learning_rate": 0.00029060991579707424, + "loss": 0.7189, + "step": 4810 + }, + { + "epoch": 0.6064804610007243, + "grad_norm": 0.3925745487213135, + "learning_rate": 0.0002905715967742891, + "loss": 0.6956, + "step": 4815 + }, + { + "epoch": 0.6071102434109016, + "grad_norm": 0.33669450879096985, + "learning_rate": 0.0002905332022613455, + "loss": 0.6806, + "step": 4820 + }, + { + "epoch": 0.6077400258210788, + "grad_norm": 0.38812607526779175, + "learning_rate": 0.00029049473227886214, + "loss": 0.6997, + "step": 4825 + }, + { + "epoch": 0.6083698082312561, + "grad_norm": 0.3890033960342407, + "learning_rate": 0.00029045618684749833, + "loss": 0.7306, + "step": 4830 + }, + { + "epoch": 0.6089995906414334, + "grad_norm": 0.4020345211029053, + "learning_rate": 0.00029041756598795383, + "loss": 0.7357, + "step": 4835 + }, + { + "epoch": 0.6096293730516107, + "grad_norm": 0.39244237542152405, + "learning_rate": 0.0002903788697209689, + "loss": 0.6956, + "step": 4840 + }, + { + "epoch": 0.610259155461788, + "grad_norm": 0.35866880416870117, + "learning_rate": 0.0002903400980673243, + "loss": 0.7219, + "step": 4845 + }, + { + "epoch": 0.6108889378719652, + "grad_norm": 0.3912501931190491, + "learning_rate": 0.0002903012510478414, + "loss": 0.7194, + "step": 4850 + }, + { + "epoch": 0.6115187202821425, + "grad_norm": 0.3933585584163666, + "learning_rate": 0.00029026232868338184, + "loss": 0.7136, + "step": 4855 + }, + { + "epoch": 0.6121485026923198, + "grad_norm": 0.37603482604026794, + "learning_rate": 0.0002902233309948479, + "loss": 0.7208, + "step": 4860 + }, + { + "epoch": 0.6127782851024971, + "grad_norm": 0.36919742822647095, + "learning_rate": 0.00029018425800318205, + "loss": 0.7499, + "step": 4865 + }, + { + "epoch": 0.6134080675126744, + "grad_norm": 0.38067975640296936, + "learning_rate": 0.0002901451097293676, + "loss": 0.7468, + "step": 4870 + }, + { + "epoch": 0.6140378499228517, + "grad_norm": 0.33954986929893494, + "learning_rate": 0.00029010588619442793, + "loss": 0.6894, + "step": 4875 + }, + { + "epoch": 0.6146676323330289, + "grad_norm": 0.38103437423706055, + "learning_rate": 0.000290066587419427, + "loss": 0.6661, + "step": 4880 + }, + { + "epoch": 0.6152974147432062, + "grad_norm": 0.3855966031551361, + "learning_rate": 0.00029002721342546924, + "loss": 0.7138, + "step": 4885 + }, + { + "epoch": 0.6159271971533835, + "grad_norm": 0.5084123611450195, + "learning_rate": 0.00028998776423369923, + "loss": 0.7005, + "step": 4890 + }, + { + "epoch": 0.6165569795635608, + "grad_norm": 0.36192139983177185, + "learning_rate": 0.0002899482398653022, + "loss": 0.7386, + "step": 4895 + }, + { + "epoch": 0.6171867619737381, + "grad_norm": 0.37423619627952576, + "learning_rate": 0.0002899086403415037, + "loss": 0.7172, + "step": 4900 + }, + { + "epoch": 0.6178165443839153, + "grad_norm": 0.3741579055786133, + "learning_rate": 0.00028986896568356933, + "loss": 0.7519, + "step": 4905 + }, + { + "epoch": 0.6184463267940926, + "grad_norm": 0.4323353171348572, + "learning_rate": 0.0002898292159128055, + "loss": 0.7325, + "step": 4910 + }, + { + "epoch": 0.6190761092042699, + "grad_norm": 0.3273026645183563, + "learning_rate": 0.00028978939105055873, + "loss": 0.7211, + "step": 4915 + }, + { + "epoch": 0.6197058916144472, + "grad_norm": 0.38831016421318054, + "learning_rate": 0.0002897494911182158, + "loss": 0.6435, + "step": 4920 + }, + { + "epoch": 0.6203356740246245, + "grad_norm": 0.36923748254776, + "learning_rate": 0.00028970951613720397, + "loss": 0.7184, + "step": 4925 + }, + { + "epoch": 0.6209654564348017, + "grad_norm": 0.3658188283443451, + "learning_rate": 0.0002896694661289906, + "loss": 0.7171, + "step": 4930 + }, + { + "epoch": 0.621595238844979, + "grad_norm": 0.3589092493057251, + "learning_rate": 0.00028962934111508357, + "loss": 0.7173, + "step": 4935 + }, + { + "epoch": 0.6222250212551563, + "grad_norm": 0.41886886954307556, + "learning_rate": 0.00028958914111703086, + "loss": 0.7412, + "step": 4940 + }, + { + "epoch": 0.6228548036653336, + "grad_norm": 0.34496763348579407, + "learning_rate": 0.0002895488661564208, + "loss": 0.6608, + "step": 4945 + }, + { + "epoch": 0.6234845860755109, + "grad_norm": 0.3527592122554779, + "learning_rate": 0.000289508516254882, + "loss": 0.7179, + "step": 4950 + }, + { + "epoch": 0.6241143684856882, + "grad_norm": 0.3406129479408264, + "learning_rate": 0.0002894680914340833, + "loss": 0.6862, + "step": 4955 + }, + { + "epoch": 0.6247441508958654, + "grad_norm": 0.33078086376190186, + "learning_rate": 0.00028942759171573374, + "loss": 0.6804, + "step": 4960 + }, + { + "epoch": 0.6253739333060427, + "grad_norm": 0.3582599461078644, + "learning_rate": 0.00028938701712158247, + "loss": 0.6681, + "step": 4965 + }, + { + "epoch": 0.62600371571622, + "grad_norm": 0.3656567633152008, + "learning_rate": 0.0002893463676734191, + "loss": 0.6714, + "step": 4970 + }, + { + "epoch": 0.6266334981263973, + "grad_norm": 0.35537272691726685, + "learning_rate": 0.00028930564339307337, + "loss": 0.6917, + "step": 4975 + }, + { + "epoch": 0.6272632805365747, + "grad_norm": 0.35100945830345154, + "learning_rate": 0.0002892648443024149, + "loss": 0.7217, + "step": 4980 + }, + { + "epoch": 0.6278930629467518, + "grad_norm": 0.34070494771003723, + "learning_rate": 0.000289223970423354, + "loss": 0.7237, + "step": 4985 + }, + { + "epoch": 0.6285228453569291, + "grad_norm": 0.3810268044471741, + "learning_rate": 0.00028918302177784075, + "loss": 0.7513, + "step": 4990 + }, + { + "epoch": 0.6291526277671065, + "grad_norm": 0.3511486053466797, + "learning_rate": 0.0002891419983878655, + "loss": 0.7112, + "step": 4995 + }, + { + "epoch": 0.6297824101772838, + "grad_norm": 0.30101874470710754, + "learning_rate": 0.0002891009002754588, + "loss": 0.6666, + "step": 5000 + }, + { + "epoch": 0.6297824101772838, + "eval_loss": 0.31327521800994873, + "eval_runtime": 6.2403, + "eval_samples_per_second": 160.248, + "eval_steps_per_second": 10.096, + "step": 5000 + }, + { + "epoch": 0.6304121925874611, + "grad_norm": 0.3446876108646393, + "learning_rate": 0.00028905972746269125, + "loss": 0.6651, + "step": 5005 + }, + { + "epoch": 0.6310419749976384, + "grad_norm": 0.3606228232383728, + "learning_rate": 0.0002890184799716736, + "loss": 0.7387, + "step": 5010 + }, + { + "epoch": 0.6316717574078156, + "grad_norm": 0.37057119607925415, + "learning_rate": 0.0002889771578245567, + "loss": 0.7044, + "step": 5015 + }, + { + "epoch": 0.6323015398179929, + "grad_norm": 0.36304429173469543, + "learning_rate": 0.0002889357610435314, + "loss": 0.7391, + "step": 5020 + }, + { + "epoch": 0.6329313222281702, + "grad_norm": 0.38329148292541504, + "learning_rate": 0.00028889428965082886, + "loss": 0.7045, + "step": 5025 + }, + { + "epoch": 0.6335611046383475, + "grad_norm": 0.3362608850002289, + "learning_rate": 0.00028885274366872006, + "loss": 0.6865, + "step": 5030 + }, + { + "epoch": 0.6341908870485248, + "grad_norm": 0.4079527258872986, + "learning_rate": 0.00028881112311951625, + "loss": 0.6892, + "step": 5035 + }, + { + "epoch": 0.634820669458702, + "grad_norm": 0.35261860489845276, + "learning_rate": 0.00028876942802556847, + "loss": 0.7189, + "step": 5040 + }, + { + "epoch": 0.6354504518688793, + "grad_norm": 0.40486040711402893, + "learning_rate": 0.00028872765840926804, + "loss": 0.7385, + "step": 5045 + }, + { + "epoch": 0.6360802342790566, + "grad_norm": 0.32852765917778015, + "learning_rate": 0.0002886858142930462, + "loss": 0.6267, + "step": 5050 + }, + { + "epoch": 0.6367100166892339, + "grad_norm": 0.31455445289611816, + "learning_rate": 0.0002886438956993741, + "loss": 0.6813, + "step": 5055 + }, + { + "epoch": 0.6373397990994112, + "grad_norm": 0.3047012686729431, + "learning_rate": 0.00028860190265076304, + "loss": 0.6862, + "step": 5060 + }, + { + "epoch": 0.6379695815095885, + "grad_norm": 0.34203359484672546, + "learning_rate": 0.0002885598351697643, + "loss": 0.6996, + "step": 5065 + }, + { + "epoch": 0.6385993639197657, + "grad_norm": 0.4077922999858856, + "learning_rate": 0.0002885176932789691, + "loss": 0.7018, + "step": 5070 + }, + { + "epoch": 0.639229146329943, + "grad_norm": 0.3590135872364044, + "learning_rate": 0.00028847547700100836, + "loss": 0.6741, + "step": 5075 + }, + { + "epoch": 0.6398589287401203, + "grad_norm": 0.33030763268470764, + "learning_rate": 0.0002884331863585535, + "loss": 0.6775, + "step": 5080 + }, + { + "epoch": 0.6404887111502976, + "grad_norm": 0.3921838104724884, + "learning_rate": 0.0002883908213743153, + "loss": 0.7359, + "step": 5085 + }, + { + "epoch": 0.6411184935604749, + "grad_norm": 0.35765379667282104, + "learning_rate": 0.0002883483820710449, + "loss": 0.6953, + "step": 5090 + }, + { + "epoch": 0.6417482759706521, + "grad_norm": 0.3486902415752411, + "learning_rate": 0.0002883058684715331, + "loss": 0.6848, + "step": 5095 + }, + { + "epoch": 0.6423780583808294, + "grad_norm": 0.35446256399154663, + "learning_rate": 0.0002882632805986108, + "loss": 0.7031, + "step": 5100 + }, + { + "epoch": 0.6430078407910067, + "grad_norm": 0.3666916489601135, + "learning_rate": 0.00028822061847514843, + "loss": 0.7135, + "step": 5105 + }, + { + "epoch": 0.643637623201184, + "grad_norm": 0.38766369223594666, + "learning_rate": 0.00028817788212405666, + "loss": 0.6623, + "step": 5110 + }, + { + "epoch": 0.6442674056113613, + "grad_norm": 0.3532891273498535, + "learning_rate": 0.0002881350715682859, + "loss": 0.699, + "step": 5115 + }, + { + "epoch": 0.6448971880215385, + "grad_norm": 0.36512479186058044, + "learning_rate": 0.0002880921868308263, + "loss": 0.6859, + "step": 5120 + }, + { + "epoch": 0.6455269704317158, + "grad_norm": 0.34285515546798706, + "learning_rate": 0.0002880492279347081, + "loss": 0.7254, + "step": 5125 + }, + { + "epoch": 0.6461567528418931, + "grad_norm": 0.3731713891029358, + "learning_rate": 0.00028800619490300107, + "loss": 0.6995, + "step": 5130 + }, + { + "epoch": 0.6467865352520704, + "grad_norm": 0.37182632088661194, + "learning_rate": 0.000287963087758815, + "loss": 0.7262, + "step": 5135 + }, + { + "epoch": 0.6474163176622477, + "grad_norm": 0.371231347322464, + "learning_rate": 0.0002879199065252994, + "loss": 0.7051, + "step": 5140 + }, + { + "epoch": 0.648046100072425, + "grad_norm": 0.35507723689079285, + "learning_rate": 0.00028787665122564357, + "loss": 0.6799, + "step": 5145 + }, + { + "epoch": 0.6486758824826022, + "grad_norm": 0.4001401662826538, + "learning_rate": 0.0002878333218830766, + "loss": 0.7718, + "step": 5150 + }, + { + "epoch": 0.6493056648927795, + "grad_norm": 0.36585733294487, + "learning_rate": 0.0002877899185208673, + "loss": 0.6652, + "step": 5155 + }, + { + "epoch": 0.6499354473029568, + "grad_norm": 0.3719576895236969, + "learning_rate": 0.00028774644116232436, + "loss": 0.7232, + "step": 5160 + }, + { + "epoch": 0.6505652297131341, + "grad_norm": 0.40236014127731323, + "learning_rate": 0.000287702889830796, + "loss": 0.6697, + "step": 5165 + }, + { + "epoch": 0.6511950121233114, + "grad_norm": 0.4343264400959015, + "learning_rate": 0.00028765926454967037, + "loss": 0.6877, + "step": 5170 + }, + { + "epoch": 0.6518247945334886, + "grad_norm": 0.3576568067073822, + "learning_rate": 0.00028761556534237514, + "loss": 0.7239, + "step": 5175 + }, + { + "epoch": 0.6524545769436659, + "grad_norm": 0.33383145928382874, + "learning_rate": 0.00028757179223237793, + "loss": 0.6822, + "step": 5180 + }, + { + "epoch": 0.6530843593538432, + "grad_norm": 0.353253573179245, + "learning_rate": 0.0002875279452431858, + "loss": 0.6925, + "step": 5185 + }, + { + "epoch": 0.6537141417640205, + "grad_norm": 0.3755667209625244, + "learning_rate": 0.0002874840243983455, + "loss": 0.6872, + "step": 5190 + }, + { + "epoch": 0.6543439241741978, + "grad_norm": 0.3973848521709442, + "learning_rate": 0.00028744002972144376, + "loss": 0.7251, + "step": 5195 + }, + { + "epoch": 0.6549737065843751, + "grad_norm": 0.3476422131061554, + "learning_rate": 0.0002873959612361066, + "loss": 0.6964, + "step": 5200 + }, + { + "epoch": 0.6556034889945523, + "grad_norm": 0.42737796902656555, + "learning_rate": 0.0002873518189659997, + "loss": 0.7106, + "step": 5205 + }, + { + "epoch": 0.6562332714047296, + "grad_norm": 0.3009507358074188, + "learning_rate": 0.00028730760293482863, + "loss": 0.6614, + "step": 5210 + }, + { + "epoch": 0.656863053814907, + "grad_norm": 0.38053247332572937, + "learning_rate": 0.00028726331316633835, + "loss": 0.6963, + "step": 5215 + }, + { + "epoch": 0.6574928362250843, + "grad_norm": 0.4153291583061218, + "learning_rate": 0.00028721894968431345, + "loss": 0.7471, + "step": 5220 + }, + { + "epoch": 0.6581226186352616, + "grad_norm": 0.36470016837120056, + "learning_rate": 0.0002871745125125782, + "loss": 0.6558, + "step": 5225 + }, + { + "epoch": 0.6587524010454388, + "grad_norm": 0.3935704827308655, + "learning_rate": 0.00028713000167499627, + "loss": 0.7025, + "step": 5230 + }, + { + "epoch": 0.659382183455616, + "grad_norm": 0.36777618527412415, + "learning_rate": 0.0002870854171954711, + "loss": 0.7386, + "step": 5235 + }, + { + "epoch": 0.6600119658657934, + "grad_norm": 0.36549127101898193, + "learning_rate": 0.0002870407590979455, + "loss": 0.703, + "step": 5240 + }, + { + "epoch": 0.6606417482759707, + "grad_norm": 0.37523144483566284, + "learning_rate": 0.00028699602740640194, + "loss": 0.6708, + "step": 5245 + }, + { + "epoch": 0.661271530686148, + "grad_norm": 0.3451475203037262, + "learning_rate": 0.00028695122214486237, + "loss": 0.6776, + "step": 5250 + }, + { + "epoch": 0.6619013130963253, + "grad_norm": 0.35215169191360474, + "learning_rate": 0.00028690634333738816, + "loss": 0.6983, + "step": 5255 + }, + { + "epoch": 0.6625310955065025, + "grad_norm": 0.37627631425857544, + "learning_rate": 0.00028686139100808037, + "loss": 0.6844, + "step": 5260 + }, + { + "epoch": 0.6631608779166798, + "grad_norm": 0.34171178936958313, + "learning_rate": 0.0002868163651810793, + "loss": 0.7068, + "step": 5265 + }, + { + "epoch": 0.6637906603268571, + "grad_norm": 0.3566179573535919, + "learning_rate": 0.0002867712658805649, + "loss": 0.6618, + "step": 5270 + }, + { + "epoch": 0.6644204427370344, + "grad_norm": 0.3453030586242676, + "learning_rate": 0.00028672609313075664, + "loss": 0.7046, + "step": 5275 + }, + { + "epoch": 0.6650502251472117, + "grad_norm": 0.40633949637413025, + "learning_rate": 0.00028668084695591316, + "loss": 0.6931, + "step": 5280 + }, + { + "epoch": 0.6656800075573889, + "grad_norm": 0.3927484154701233, + "learning_rate": 0.00028663552738033275, + "loss": 0.7051, + "step": 5285 + }, + { + "epoch": 0.6663097899675662, + "grad_norm": 0.35829389095306396, + "learning_rate": 0.000286590134428353, + "loss": 0.7051, + "step": 5290 + }, + { + "epoch": 0.6669395723777435, + "grad_norm": 0.4202066957950592, + "learning_rate": 0.00028654466812435105, + "loss": 0.7179, + "step": 5295 + }, + { + "epoch": 0.6675693547879208, + "grad_norm": 0.37852293252944946, + "learning_rate": 0.0002864991284927433, + "loss": 0.7107, + "step": 5300 + }, + { + "epoch": 0.6681991371980981, + "grad_norm": 0.3831678330898285, + "learning_rate": 0.0002864535155579856, + "loss": 0.659, + "step": 5305 + }, + { + "epoch": 0.6688289196082754, + "grad_norm": 0.3563750684261322, + "learning_rate": 0.0002864078293445731, + "loss": 0.7111, + "step": 5310 + }, + { + "epoch": 0.6694587020184526, + "grad_norm": 0.3460354804992676, + "learning_rate": 0.0002863620698770403, + "loss": 0.6822, + "step": 5315 + }, + { + "epoch": 0.6700884844286299, + "grad_norm": 0.36469632387161255, + "learning_rate": 0.0002863162371799612, + "loss": 0.6298, + "step": 5320 + }, + { + "epoch": 0.6707182668388072, + "grad_norm": 0.3730217218399048, + "learning_rate": 0.00028627033127794896, + "loss": 0.7137, + "step": 5325 + }, + { + "epoch": 0.6713480492489845, + "grad_norm": 0.347002774477005, + "learning_rate": 0.00028622435219565606, + "loss": 0.6873, + "step": 5330 + }, + { + "epoch": 0.6719778316591618, + "grad_norm": 0.35723358392715454, + "learning_rate": 0.00028617829995777433, + "loss": 0.7055, + "step": 5335 + }, + { + "epoch": 0.672607614069339, + "grad_norm": 0.3175225257873535, + "learning_rate": 0.0002861321745890349, + "loss": 0.6702, + "step": 5340 + }, + { + "epoch": 0.6732373964795163, + "grad_norm": 0.3599521517753601, + "learning_rate": 0.00028608597611420807, + "loss": 0.6646, + "step": 5345 + }, + { + "epoch": 0.6738671788896936, + "grad_norm": 0.4381812810897827, + "learning_rate": 0.00028603970455810357, + "loss": 0.7122, + "step": 5350 + }, + { + "epoch": 0.6744969612998709, + "grad_norm": 0.3400894105434418, + "learning_rate": 0.00028599335994557027, + "loss": 0.705, + "step": 5355 + }, + { + "epoch": 0.6751267437100482, + "grad_norm": 0.3332962989807129, + "learning_rate": 0.00028594694230149625, + "loss": 0.6497, + "step": 5360 + }, + { + "epoch": 0.6757565261202254, + "grad_norm": 0.386343389749527, + "learning_rate": 0.00028590045165080883, + "loss": 0.6344, + "step": 5365 + }, + { + "epoch": 0.6763863085304027, + "grad_norm": 0.4404468834400177, + "learning_rate": 0.0002858538880184746, + "loss": 0.7115, + "step": 5370 + }, + { + "epoch": 0.67701609094058, + "grad_norm": 0.35227730870246887, + "learning_rate": 0.00028580725142949925, + "loss": 0.702, + "step": 5375 + }, + { + "epoch": 0.6776458733507573, + "grad_norm": 0.38216719031333923, + "learning_rate": 0.00028576054190892775, + "loss": 0.6845, + "step": 5380 + }, + { + "epoch": 0.6782756557609346, + "grad_norm": 0.3602873682975769, + "learning_rate": 0.0002857137594818441, + "loss": 0.7156, + "step": 5385 + }, + { + "epoch": 0.6789054381711119, + "grad_norm": 0.38896870613098145, + "learning_rate": 0.00028566690417337166, + "loss": 0.7029, + "step": 5390 + }, + { + "epoch": 0.6795352205812891, + "grad_norm": 0.3434313237667084, + "learning_rate": 0.0002856199760086726, + "loss": 0.687, + "step": 5395 + }, + { + "epoch": 0.6801650029914664, + "grad_norm": 0.381331205368042, + "learning_rate": 0.0002855729750129487, + "loss": 0.6597, + "step": 5400 + }, + { + "epoch": 0.6807947854016437, + "grad_norm": 0.35004013776779175, + "learning_rate": 0.0002855259012114403, + "loss": 0.6604, + "step": 5405 + }, + { + "epoch": 0.681424567811821, + "grad_norm": 0.3601452112197876, + "learning_rate": 0.0002854787546294272, + "loss": 0.6949, + "step": 5410 + }, + { + "epoch": 0.6820543502219983, + "grad_norm": 0.3827126920223236, + "learning_rate": 0.0002854315352922282, + "loss": 0.7121, + "step": 5415 + }, + { + "epoch": 0.6826841326321755, + "grad_norm": 0.35859569907188416, + "learning_rate": 0.0002853842432252012, + "loss": 0.6662, + "step": 5420 + }, + { + "epoch": 0.6833139150423528, + "grad_norm": 0.36607855558395386, + "learning_rate": 0.00028533687845374304, + "loss": 0.6716, + "step": 5425 + }, + { + "epoch": 0.6839436974525301, + "grad_norm": 0.3658086061477661, + "learning_rate": 0.00028528944100328975, + "loss": 0.6718, + "step": 5430 + }, + { + "epoch": 0.6845734798627074, + "grad_norm": 0.3442821800708771, + "learning_rate": 0.00028524193089931633, + "loss": 0.6474, + "step": 5435 + }, + { + "epoch": 0.6852032622728847, + "grad_norm": 0.38460132479667664, + "learning_rate": 0.0002851943481673367, + "loss": 0.6973, + "step": 5440 + }, + { + "epoch": 0.685833044683062, + "grad_norm": 0.3717944622039795, + "learning_rate": 0.000285146692832904, + "loss": 0.6962, + "step": 5445 + }, + { + "epoch": 0.6864628270932392, + "grad_norm": 0.42136862874031067, + "learning_rate": 0.00028509896492161013, + "loss": 0.6783, + "step": 5450 + }, + { + "epoch": 0.6870926095034166, + "grad_norm": 0.37208443880081177, + "learning_rate": 0.0002850511644590862, + "loss": 0.6915, + "step": 5455 + }, + { + "epoch": 0.6877223919135939, + "grad_norm": 0.3807058036327362, + "learning_rate": 0.000285003291471002, + "loss": 0.7269, + "step": 5460 + }, + { + "epoch": 0.6883521743237712, + "grad_norm": 0.38431763648986816, + "learning_rate": 0.00028495534598306645, + "loss": 0.6589, + "step": 5465 + }, + { + "epoch": 0.6889819567339485, + "grad_norm": 0.372773140668869, + "learning_rate": 0.0002849073280210274, + "loss": 0.6922, + "step": 5470 + }, + { + "epoch": 0.6896117391441257, + "grad_norm": 0.3280029892921448, + "learning_rate": 0.00028485923761067164, + "loss": 0.6887, + "step": 5475 + }, + { + "epoch": 0.690241521554303, + "grad_norm": 0.3463418483734131, + "learning_rate": 0.0002848110747778247, + "loss": 0.6565, + "step": 5480 + }, + { + "epoch": 0.6908713039644803, + "grad_norm": 0.3423214256763458, + "learning_rate": 0.00028476283954835123, + "loss": 0.6412, + "step": 5485 + }, + { + "epoch": 0.6915010863746576, + "grad_norm": 0.3461606204509735, + "learning_rate": 0.0002847145319481546, + "loss": 0.6803, + "step": 5490 + }, + { + "epoch": 0.6921308687848349, + "grad_norm": 0.38746729493141174, + "learning_rate": 0.0002846661520031772, + "loss": 0.6424, + "step": 5495 + }, + { + "epoch": 0.6927606511950122, + "grad_norm": 0.32353097200393677, + "learning_rate": 0.00028461769973939997, + "loss": 0.6761, + "step": 5500 + }, + { + "epoch": 0.6933904336051894, + "grad_norm": 0.3790241777896881, + "learning_rate": 0.00028456917518284304, + "loss": 0.6683, + "step": 5505 + }, + { + "epoch": 0.6940202160153667, + "grad_norm": 0.3713475465774536, + "learning_rate": 0.0002845205783595651, + "loss": 0.6663, + "step": 5510 + }, + { + "epoch": 0.694649998425544, + "grad_norm": 0.3859196901321411, + "learning_rate": 0.00028447190929566384, + "loss": 0.6717, + "step": 5515 + }, + { + "epoch": 0.6952797808357213, + "grad_norm": 0.34451383352279663, + "learning_rate": 0.0002844231680172756, + "loss": 0.6368, + "step": 5520 + }, + { + "epoch": 0.6959095632458986, + "grad_norm": 0.3519328534603119, + "learning_rate": 0.00028437435455057564, + "loss": 0.6882, + "step": 5525 + }, + { + "epoch": 0.6965393456560758, + "grad_norm": 0.382755309343338, + "learning_rate": 0.0002843254689217778, + "loss": 0.6415, + "step": 5530 + }, + { + "epoch": 0.6971691280662531, + "grad_norm": 0.35310298204421997, + "learning_rate": 0.0002842765111571349, + "loss": 0.6744, + "step": 5535 + }, + { + "epoch": 0.6977989104764304, + "grad_norm": 0.3392702341079712, + "learning_rate": 0.0002842274812829382, + "loss": 0.6705, + "step": 5540 + }, + { + "epoch": 0.6984286928866077, + "grad_norm": 0.36502036452293396, + "learning_rate": 0.00028417837932551805, + "loss": 0.6777, + "step": 5545 + }, + { + "epoch": 0.699058475296785, + "grad_norm": 0.36270782351493835, + "learning_rate": 0.0002841292053112432, + "loss": 0.6988, + "step": 5550 + }, + { + "epoch": 0.6996882577069622, + "grad_norm": 0.3752531111240387, + "learning_rate": 0.0002840799592665213, + "loss": 0.6745, + "step": 5555 + }, + { + "epoch": 0.7003180401171395, + "grad_norm": 0.32373905181884766, + "learning_rate": 0.00028403064121779853, + "loss": 0.664, + "step": 5560 + }, + { + "epoch": 0.7009478225273168, + "grad_norm": 0.4017639756202698, + "learning_rate": 0.0002839812511915599, + "loss": 0.6793, + "step": 5565 + }, + { + "epoch": 0.7015776049374941, + "grad_norm": 0.33867186307907104, + "learning_rate": 0.00028393178921432883, + "loss": 0.6811, + "step": 5570 + }, + { + "epoch": 0.7022073873476714, + "grad_norm": 0.3769174814224243, + "learning_rate": 0.0002838822553126677, + "loss": 0.7118, + "step": 5575 + }, + { + "epoch": 0.7028371697578487, + "grad_norm": 0.36820533871650696, + "learning_rate": 0.00028383264951317727, + "loss": 0.6581, + "step": 5580 + }, + { + "epoch": 0.7034669521680259, + "grad_norm": 0.37128061056137085, + "learning_rate": 0.00028378297184249694, + "loss": 0.6722, + "step": 5585 + }, + { + "epoch": 0.7040967345782032, + "grad_norm": 0.39225873351097107, + "learning_rate": 0.00028373322232730483, + "loss": 0.6846, + "step": 5590 + }, + { + "epoch": 0.7047265169883805, + "grad_norm": 0.3394504189491272, + "learning_rate": 0.0002836834009943175, + "loss": 0.6815, + "step": 5595 + }, + { + "epoch": 0.7053562993985578, + "grad_norm": 0.37265124917030334, + "learning_rate": 0.0002836335078702903, + "loss": 0.6614, + "step": 5600 + }, + { + "epoch": 0.7059860818087351, + "grad_norm": 0.33066150546073914, + "learning_rate": 0.00028358354298201673, + "loss": 0.6701, + "step": 5605 + }, + { + "epoch": 0.7066158642189123, + "grad_norm": 0.35536128282546997, + "learning_rate": 0.0002835335063563293, + "loss": 0.6149, + "step": 5610 + }, + { + "epoch": 0.7072456466290896, + "grad_norm": 0.35491225123405457, + "learning_rate": 0.0002834833980200987, + "loss": 0.6773, + "step": 5615 + }, + { + "epoch": 0.7078754290392669, + "grad_norm": 0.37837696075439453, + "learning_rate": 0.0002834332180002343, + "loss": 0.6899, + "step": 5620 + }, + { + "epoch": 0.7085052114494442, + "grad_norm": 0.3391937017440796, + "learning_rate": 0.0002833829663236838, + "loss": 0.7041, + "step": 5625 + }, + { + "epoch": 0.7091349938596215, + "grad_norm": 0.3482423424720764, + "learning_rate": 0.00028333264301743375, + "loss": 0.6597, + "step": 5630 + }, + { + "epoch": 0.7097647762697988, + "grad_norm": 0.4188586175441742, + "learning_rate": 0.00028328224810850866, + "loss": 0.6916, + "step": 5635 + }, + { + "epoch": 0.710394558679976, + "grad_norm": 0.32832324504852295, + "learning_rate": 0.0002832317816239718, + "loss": 0.6791, + "step": 5640 + }, + { + "epoch": 0.7110243410901533, + "grad_norm": 0.343058705329895, + "learning_rate": 0.00028318124359092496, + "loss": 0.6423, + "step": 5645 + }, + { + "epoch": 0.7116541235003306, + "grad_norm": 0.37011584639549255, + "learning_rate": 0.0002831306340365081, + "loss": 0.6783, + "step": 5650 + }, + { + "epoch": 0.7122839059105079, + "grad_norm": 0.38297170400619507, + "learning_rate": 0.00028307995298789974, + "loss": 0.6751, + "step": 5655 + }, + { + "epoch": 0.7129136883206852, + "grad_norm": 0.38705122470855713, + "learning_rate": 0.00028302920047231677, + "loss": 0.6844, + "step": 5660 + }, + { + "epoch": 0.7135434707308624, + "grad_norm": 0.3647492527961731, + "learning_rate": 0.0002829783765170144, + "loss": 0.6811, + "step": 5665 + }, + { + "epoch": 0.7141732531410397, + "grad_norm": 0.3796983063220978, + "learning_rate": 0.0002829274811492863, + "loss": 0.6766, + "step": 5670 + }, + { + "epoch": 0.714803035551217, + "grad_norm": 0.36972787976264954, + "learning_rate": 0.00028287651439646444, + "loss": 0.6701, + "step": 5675 + }, + { + "epoch": 0.7154328179613944, + "grad_norm": 0.37298983335494995, + "learning_rate": 0.0002828254762859192, + "loss": 0.6439, + "step": 5680 + }, + { + "epoch": 0.7160626003715717, + "grad_norm": 0.3464621603488922, + "learning_rate": 0.0002827743668450591, + "loss": 0.6626, + "step": 5685 + }, + { + "epoch": 0.716692382781749, + "grad_norm": 0.34213629364967346, + "learning_rate": 0.00028272318610133104, + "loss": 0.6987, + "step": 5690 + }, + { + "epoch": 0.7173221651919262, + "grad_norm": 0.38596463203430176, + "learning_rate": 0.0002826719340822204, + "loss": 0.6846, + "step": 5695 + }, + { + "epoch": 0.7179519476021035, + "grad_norm": 0.3410765826702118, + "learning_rate": 0.0002826206108152506, + "loss": 0.6769, + "step": 5700 + }, + { + "epoch": 0.7185817300122808, + "grad_norm": 0.3370499610900879, + "learning_rate": 0.0002825692163279834, + "loss": 0.6563, + "step": 5705 + }, + { + "epoch": 0.7192115124224581, + "grad_norm": 0.3973693549633026, + "learning_rate": 0.0002825177506480189, + "loss": 0.6587, + "step": 5710 + }, + { + "epoch": 0.7198412948326354, + "grad_norm": 0.3341182470321655, + "learning_rate": 0.0002824662138029952, + "loss": 0.6489, + "step": 5715 + }, + { + "epoch": 0.7204710772428126, + "grad_norm": 0.3598056733608246, + "learning_rate": 0.00028241460582058883, + "loss": 0.6623, + "step": 5720 + }, + { + "epoch": 0.7211008596529899, + "grad_norm": 0.34275728464126587, + "learning_rate": 0.00028236292672851443, + "loss": 0.6987, + "step": 5725 + }, + { + "epoch": 0.7217306420631672, + "grad_norm": 0.3606712222099304, + "learning_rate": 0.000282311176554525, + "loss": 0.6947, + "step": 5730 + }, + { + "epoch": 0.7223604244733445, + "grad_norm": 0.32409214973449707, + "learning_rate": 0.0002822593553264114, + "loss": 0.6468, + "step": 5735 + }, + { + "epoch": 0.7229902068835218, + "grad_norm": 0.3465891182422638, + "learning_rate": 0.00028220746307200287, + "loss": 0.647, + "step": 5740 + }, + { + "epoch": 0.723619989293699, + "grad_norm": 0.3540678918361664, + "learning_rate": 0.0002821554998191667, + "loss": 0.6964, + "step": 5745 + }, + { + "epoch": 0.7242497717038763, + "grad_norm": 0.35845157504081726, + "learning_rate": 0.0002821034655958084, + "loss": 0.6599, + "step": 5750 + }, + { + "epoch": 0.7248795541140536, + "grad_norm": 0.3469247817993164, + "learning_rate": 0.00028205136042987156, + "loss": 0.6518, + "step": 5755 + }, + { + "epoch": 0.7255093365242309, + "grad_norm": 0.3693814277648926, + "learning_rate": 0.0002819991843493377, + "loss": 0.6339, + "step": 5760 + }, + { + "epoch": 0.7261391189344082, + "grad_norm": 0.35166436433792114, + "learning_rate": 0.0002819469373822268, + "loss": 0.6593, + "step": 5765 + }, + { + "epoch": 0.7267689013445855, + "grad_norm": 0.376717746257782, + "learning_rate": 0.00028189461955659644, + "loss": 0.6583, + "step": 5770 + }, + { + "epoch": 0.7273986837547627, + "grad_norm": 0.36365002393722534, + "learning_rate": 0.0002818422309005426, + "loss": 0.707, + "step": 5775 + }, + { + "epoch": 0.72802846616494, + "grad_norm": 0.3356451392173767, + "learning_rate": 0.00028178977144219914, + "loss": 0.6439, + "step": 5780 + }, + { + "epoch": 0.7286582485751173, + "grad_norm": 0.33520832657814026, + "learning_rate": 0.00028173724120973806, + "loss": 0.6276, + "step": 5785 + }, + { + "epoch": 0.7292880309852946, + "grad_norm": 0.3459213376045227, + "learning_rate": 0.00028168464023136926, + "loss": 0.648, + "step": 5790 + }, + { + "epoch": 0.7299178133954719, + "grad_norm": 0.3563973903656006, + "learning_rate": 0.0002816319685353406, + "loss": 0.6579, + "step": 5795 + }, + { + "epoch": 0.7305475958056491, + "grad_norm": 0.3637474775314331, + "learning_rate": 0.0002815792261499381, + "loss": 0.6828, + "step": 5800 + }, + { + "epoch": 0.7311773782158264, + "grad_norm": 0.38304394483566284, + "learning_rate": 0.00028152641310348554, + "loss": 0.6348, + "step": 5805 + }, + { + "epoch": 0.7318071606260037, + "grad_norm": 0.33336034417152405, + "learning_rate": 0.0002814735294243448, + "loss": 0.6337, + "step": 5810 + }, + { + "epoch": 0.732436943036181, + "grad_norm": 0.34154805541038513, + "learning_rate": 0.0002814205751409156, + "loss": 0.6885, + "step": 5815 + }, + { + "epoch": 0.7330667254463583, + "grad_norm": 0.3780697286128998, + "learning_rate": 0.00028136755028163556, + "loss": 0.6558, + "step": 5820 + }, + { + "epoch": 0.7336965078565356, + "grad_norm": 0.3496229946613312, + "learning_rate": 0.0002813144548749802, + "loss": 0.7058, + "step": 5825 + }, + { + "epoch": 0.7343262902667128, + "grad_norm": 0.36560389399528503, + "learning_rate": 0.0002812612889494631, + "loss": 0.6991, + "step": 5830 + }, + { + "epoch": 0.7349560726768901, + "grad_norm": 0.3215349316596985, + "learning_rate": 0.00028120805253363545, + "loss": 0.612, + "step": 5835 + }, + { + "epoch": 0.7355858550870674, + "grad_norm": 0.36016130447387695, + "learning_rate": 0.00028115474565608656, + "loss": 0.6905, + "step": 5840 + }, + { + "epoch": 0.7362156374972447, + "grad_norm": 0.3493592441082001, + "learning_rate": 0.00028110136834544336, + "loss": 0.6922, + "step": 5845 + }, + { + "epoch": 0.736845419907422, + "grad_norm": 0.34350746870040894, + "learning_rate": 0.00028104792063037064, + "loss": 0.6238, + "step": 5850 + }, + { + "epoch": 0.7374752023175992, + "grad_norm": 0.3633589446544647, + "learning_rate": 0.0002809944025395711, + "loss": 0.6775, + "step": 5855 + }, + { + "epoch": 0.7381049847277765, + "grad_norm": 0.3892457187175751, + "learning_rate": 0.00028094081410178515, + "loss": 0.6756, + "step": 5860 + }, + { + "epoch": 0.7387347671379538, + "grad_norm": 0.33569657802581787, + "learning_rate": 0.00028088715534579104, + "loss": 0.63, + "step": 5865 + }, + { + "epoch": 0.7393645495481311, + "grad_norm": 0.36327067017555237, + "learning_rate": 0.0002808334263004047, + "loss": 0.6653, + "step": 5870 + }, + { + "epoch": 0.7399943319583084, + "grad_norm": 0.32698652148246765, + "learning_rate": 0.00028077962699448, + "loss": 0.655, + "step": 5875 + }, + { + "epoch": 0.7406241143684857, + "grad_norm": 0.35473042726516724, + "learning_rate": 0.0002807257574569082, + "loss": 0.6341, + "step": 5880 + }, + { + "epoch": 0.7412538967786629, + "grad_norm": 0.33008939027786255, + "learning_rate": 0.0002806718177166185, + "loss": 0.6614, + "step": 5885 + }, + { + "epoch": 0.7418836791888402, + "grad_norm": 0.3434574007987976, + "learning_rate": 0.0002806178078025779, + "loss": 0.6313, + "step": 5890 + }, + { + "epoch": 0.7425134615990175, + "grad_norm": 0.30766573548316956, + "learning_rate": 0.00028056372774379085, + "loss": 0.6296, + "step": 5895 + }, + { + "epoch": 0.7431432440091948, + "grad_norm": 0.3676775097846985, + "learning_rate": 0.00028050957756929965, + "loss": 0.628, + "step": 5900 + }, + { + "epoch": 0.7437730264193722, + "grad_norm": 0.3424786925315857, + "learning_rate": 0.0002804553573081841, + "loss": 0.6141, + "step": 5905 + }, + { + "epoch": 0.7444028088295493, + "grad_norm": 0.391250878572464, + "learning_rate": 0.0002804010669895618, + "loss": 0.6615, + "step": 5910 + }, + { + "epoch": 0.7450325912397266, + "grad_norm": 0.34186193346977234, + "learning_rate": 0.0002803467066425878, + "loss": 0.6389, + "step": 5915 + }, + { + "epoch": 0.745662373649904, + "grad_norm": 0.37509649991989136, + "learning_rate": 0.0002802922762964549, + "loss": 0.6397, + "step": 5920 + }, + { + "epoch": 0.7462921560600813, + "grad_norm": 0.3327299654483795, + "learning_rate": 0.00028023777598039346, + "loss": 0.6241, + "step": 5925 + }, + { + "epoch": 0.7469219384702586, + "grad_norm": 0.37098389863967896, + "learning_rate": 0.0002801832057236714, + "loss": 0.7004, + "step": 5930 + }, + { + "epoch": 0.7475517208804358, + "grad_norm": 0.36630627512931824, + "learning_rate": 0.00028012856555559415, + "loss": 0.6201, + "step": 5935 + }, + { + "epoch": 0.7481815032906131, + "grad_norm": 0.3580261170864105, + "learning_rate": 0.00028007385550550475, + "loss": 0.6969, + "step": 5940 + }, + { + "epoch": 0.7488112857007904, + "grad_norm": 0.3491668105125427, + "learning_rate": 0.0002800190756027837, + "loss": 0.6457, + "step": 5945 + }, + { + "epoch": 0.7494410681109677, + "grad_norm": 0.2999480664730072, + "learning_rate": 0.0002799642258768491, + "loss": 0.6398, + "step": 5950 + }, + { + "epoch": 0.750070850521145, + "grad_norm": 0.33795973658561707, + "learning_rate": 0.00027990930635715655, + "loss": 0.6672, + "step": 5955 + }, + { + "epoch": 0.7507006329313223, + "grad_norm": 0.39881202578544617, + "learning_rate": 0.00027985431707319903, + "loss": 0.6796, + "step": 5960 + }, + { + "epoch": 0.7513304153414995, + "grad_norm": 0.4092641770839691, + "learning_rate": 0.0002797992580545071, + "loss": 0.6488, + "step": 5965 + }, + { + "epoch": 0.7519601977516768, + "grad_norm": 0.33037346601486206, + "learning_rate": 0.0002797441293306486, + "loss": 0.667, + "step": 5970 + }, + { + "epoch": 0.7525899801618541, + "grad_norm": 0.35514095425605774, + "learning_rate": 0.00027968893093122896, + "loss": 0.6984, + "step": 5975 + }, + { + "epoch": 0.7532197625720314, + "grad_norm": 0.4268254339694977, + "learning_rate": 0.0002796336628858911, + "loss": 0.6762, + "step": 5980 + }, + { + "epoch": 0.7538495449822087, + "grad_norm": 0.33386656641960144, + "learning_rate": 0.00027957832522431503, + "loss": 0.6438, + "step": 5985 + }, + { + "epoch": 0.7544793273923859, + "grad_norm": 0.374845415353775, + "learning_rate": 0.00027952291797621846, + "loss": 0.6422, + "step": 5990 + }, + { + "epoch": 0.7551091098025632, + "grad_norm": 0.32742077112197876, + "learning_rate": 0.0002794674411713563, + "loss": 0.6685, + "step": 5995 + }, + { + "epoch": 0.7557388922127405, + "grad_norm": 0.3118845820426941, + "learning_rate": 0.0002794118948395209, + "loss": 0.6273, + "step": 6000 + }, + { + "epoch": 0.7557388922127405, + "eval_loss": 0.3097546696662903, + "eval_runtime": 6.2567, + "eval_samples_per_second": 159.828, + "eval_steps_per_second": 10.069, + "step": 6000 + }, + { + "epoch": 0.7563686746229178, + "grad_norm": 0.3407754898071289, + "learning_rate": 0.00027935627901054197, + "loss": 0.6712, + "step": 6005 + }, + { + "epoch": 0.7569984570330951, + "grad_norm": 0.34817007184028625, + "learning_rate": 0.0002793005937142863, + "loss": 0.6492, + "step": 6010 + }, + { + "epoch": 0.7576282394432724, + "grad_norm": 0.36492645740509033, + "learning_rate": 0.00027924483898065833, + "loss": 0.6467, + "step": 6015 + }, + { + "epoch": 0.7582580218534496, + "grad_norm": 0.33556580543518066, + "learning_rate": 0.0002791890148395995, + "loss": 0.6486, + "step": 6020 + }, + { + "epoch": 0.7588878042636269, + "grad_norm": 0.36699965596199036, + "learning_rate": 0.00027913312132108874, + "loss": 0.6909, + "step": 6025 + }, + { + "epoch": 0.7595175866738042, + "grad_norm": 0.32526010274887085, + "learning_rate": 0.0002790771584551421, + "loss": 0.6234, + "step": 6030 + }, + { + "epoch": 0.7601473690839815, + "grad_norm": 0.38366591930389404, + "learning_rate": 0.00027902112627181295, + "loss": 0.6195, + "step": 6035 + }, + { + "epoch": 0.7607771514941588, + "grad_norm": 0.33587443828582764, + "learning_rate": 0.0002789650248011918, + "loss": 0.6546, + "step": 6040 + }, + { + "epoch": 0.761406933904336, + "grad_norm": 0.36170026659965515, + "learning_rate": 0.00027890885407340653, + "loss": 0.6294, + "step": 6045 + }, + { + "epoch": 0.7620367163145133, + "grad_norm": 0.34692490100860596, + "learning_rate": 0.000278852614118622, + "loss": 0.6468, + "step": 6050 + }, + { + "epoch": 0.7626664987246906, + "grad_norm": 0.346608966588974, + "learning_rate": 0.0002787963049670404, + "loss": 0.6714, + "step": 6055 + }, + { + "epoch": 0.7632962811348679, + "grad_norm": 0.3632940948009491, + "learning_rate": 0.00027873992664890097, + "loss": 0.6772, + "step": 6060 + }, + { + "epoch": 0.7639260635450452, + "grad_norm": 0.38135001063346863, + "learning_rate": 0.00027868347919448027, + "loss": 0.658, + "step": 6065 + }, + { + "epoch": 0.7645558459552225, + "grad_norm": 0.3518752455711365, + "learning_rate": 0.00027862696263409177, + "loss": 0.6445, + "step": 6070 + }, + { + "epoch": 0.7651856283653997, + "grad_norm": 0.33004361391067505, + "learning_rate": 0.00027857037699808613, + "loss": 0.6553, + "step": 6075 + }, + { + "epoch": 0.765815410775577, + "grad_norm": 0.36370858550071716, + "learning_rate": 0.0002785137223168512, + "loss": 0.6632, + "step": 6080 + }, + { + "epoch": 0.7664451931857543, + "grad_norm": 0.3472859561443329, + "learning_rate": 0.0002784569986208119, + "loss": 0.626, + "step": 6085 + }, + { + "epoch": 0.7670749755959316, + "grad_norm": 0.3560635447502136, + "learning_rate": 0.00027840020594043, + "loss": 0.6628, + "step": 6090 + }, + { + "epoch": 0.7677047580061089, + "grad_norm": 0.3515082895755768, + "learning_rate": 0.00027834334430620455, + "loss": 0.7061, + "step": 6095 + }, + { + "epoch": 0.7683345404162861, + "grad_norm": 0.3222733736038208, + "learning_rate": 0.00027828641374867154, + "loss": 0.617, + "step": 6100 + }, + { + "epoch": 0.7689643228264634, + "grad_norm": 0.3362828493118286, + "learning_rate": 0.00027822941429840397, + "loss": 0.6825, + "step": 6105 + }, + { + "epoch": 0.7695941052366407, + "grad_norm": 0.34228187799453735, + "learning_rate": 0.0002781723459860119, + "loss": 0.6306, + "step": 6110 + }, + { + "epoch": 0.770223887646818, + "grad_norm": 0.3672444820404053, + "learning_rate": 0.0002781152088421422, + "loss": 0.6601, + "step": 6115 + }, + { + "epoch": 0.7708536700569953, + "grad_norm": 0.3703080415725708, + "learning_rate": 0.00027805800289747894, + "loss": 0.6385, + "step": 6120 + }, + { + "epoch": 0.7714834524671725, + "grad_norm": 0.34456151723861694, + "learning_rate": 0.0002780007281827429, + "loss": 0.6635, + "step": 6125 + }, + { + "epoch": 0.7721132348773498, + "grad_norm": 0.3449029326438904, + "learning_rate": 0.00027794338472869205, + "loss": 0.6258, + "step": 6130 + }, + { + "epoch": 0.7727430172875271, + "grad_norm": 0.3441922068595886, + "learning_rate": 0.0002778859725661211, + "loss": 0.627, + "step": 6135 + }, + { + "epoch": 0.7733727996977044, + "grad_norm": 0.3855600357055664, + "learning_rate": 0.00027782849172586156, + "loss": 0.6205, + "step": 6140 + }, + { + "epoch": 0.7740025821078818, + "grad_norm": 0.3838488757610321, + "learning_rate": 0.0002777709422387821, + "loss": 0.6463, + "step": 6145 + }, + { + "epoch": 0.7746323645180591, + "grad_norm": 0.3128564953804016, + "learning_rate": 0.00027771332413578805, + "loss": 0.6639, + "step": 6150 + }, + { + "epoch": 0.7752621469282363, + "grad_norm": 0.32142025232315063, + "learning_rate": 0.00027765563744782166, + "loss": 0.6187, + "step": 6155 + }, + { + "epoch": 0.7758919293384136, + "grad_norm": 0.34378373622894287, + "learning_rate": 0.000277597882205862, + "loss": 0.659, + "step": 6160 + }, + { + "epoch": 0.7765217117485909, + "grad_norm": 0.35872867703437805, + "learning_rate": 0.0002775400584409249, + "loss": 0.6245, + "step": 6165 + }, + { + "epoch": 0.7771514941587682, + "grad_norm": 0.32217180728912354, + "learning_rate": 0.00027748216618406316, + "loss": 0.6216, + "step": 6170 + }, + { + "epoch": 0.7777812765689455, + "grad_norm": 0.3139524757862091, + "learning_rate": 0.00027742420546636616, + "loss": 0.6831, + "step": 6175 + }, + { + "epoch": 0.7784110589791227, + "grad_norm": 0.3159128427505493, + "learning_rate": 0.00027736617631896017, + "loss": 0.6417, + "step": 6180 + }, + { + "epoch": 0.7790408413893, + "grad_norm": 0.36738142371177673, + "learning_rate": 0.0002773080787730081, + "loss": 0.6592, + "step": 6185 + }, + { + "epoch": 0.7796706237994773, + "grad_norm": 0.31971079111099243, + "learning_rate": 0.0002772499128597097, + "loss": 0.6296, + "step": 6190 + }, + { + "epoch": 0.7803004062096546, + "grad_norm": 0.3699764609336853, + "learning_rate": 0.00027719167861030145, + "loss": 0.6161, + "step": 6195 + }, + { + "epoch": 0.7809301886198319, + "grad_norm": 0.3316752016544342, + "learning_rate": 0.0002771333760560564, + "loss": 0.6698, + "step": 6200 + }, + { + "epoch": 0.7815599710300092, + "grad_norm": 0.34318891167640686, + "learning_rate": 0.00027707500522828433, + "loss": 0.6312, + "step": 6205 + }, + { + "epoch": 0.7821897534401864, + "grad_norm": 0.3325194716453552, + "learning_rate": 0.00027701656615833185, + "loss": 0.6515, + "step": 6210 + }, + { + "epoch": 0.7828195358503637, + "grad_norm": 0.3374411463737488, + "learning_rate": 0.0002769580588775819, + "loss": 0.6811, + "step": 6215 + }, + { + "epoch": 0.783449318260541, + "grad_norm": 0.3507198989391327, + "learning_rate": 0.00027689948341745433, + "loss": 0.6177, + "step": 6220 + }, + { + "epoch": 0.7840791006707183, + "grad_norm": 0.3619876205921173, + "learning_rate": 0.00027684083980940543, + "loss": 0.6812, + "step": 6225 + }, + { + "epoch": 0.7847088830808956, + "grad_norm": 0.3660729229450226, + "learning_rate": 0.00027678212808492824, + "loss": 0.6888, + "step": 6230 + }, + { + "epoch": 0.7853386654910728, + "grad_norm": 0.37557917833328247, + "learning_rate": 0.00027672334827555226, + "loss": 0.6516, + "step": 6235 + }, + { + "epoch": 0.7859684479012501, + "grad_norm": 0.37117084860801697, + "learning_rate": 0.00027666450041284363, + "loss": 0.6503, + "step": 6240 + }, + { + "epoch": 0.7865982303114274, + "grad_norm": 0.3434617519378662, + "learning_rate": 0.00027660558452840487, + "loss": 0.6582, + "step": 6245 + }, + { + "epoch": 0.7872280127216047, + "grad_norm": 0.3878399431705475, + "learning_rate": 0.0002765466006538753, + "loss": 0.6309, + "step": 6250 + }, + { + "epoch": 0.787857795131782, + "grad_norm": 0.3379189968109131, + "learning_rate": 0.0002764875488209305, + "loss": 0.6802, + "step": 6255 + }, + { + "epoch": 0.7884875775419593, + "grad_norm": 0.3534158170223236, + "learning_rate": 0.0002764284290612827, + "loss": 0.6248, + "step": 6260 + }, + { + "epoch": 0.7891173599521365, + "grad_norm": 0.3273150324821472, + "learning_rate": 0.0002763692414066806, + "loss": 0.617, + "step": 6265 + }, + { + "epoch": 0.7897471423623138, + "grad_norm": 0.4256115257740021, + "learning_rate": 0.0002763099858889093, + "loss": 0.6452, + "step": 6270 + }, + { + "epoch": 0.7903769247724911, + "grad_norm": 0.34881314635276794, + "learning_rate": 0.0002762506625397903, + "loss": 0.6545, + "step": 6275 + }, + { + "epoch": 0.7910067071826684, + "grad_norm": 0.3283347487449646, + "learning_rate": 0.0002761912713911817, + "loss": 0.6819, + "step": 6280 + }, + { + "epoch": 0.7916364895928457, + "grad_norm": 0.33939605951309204, + "learning_rate": 0.0002761318124749778, + "loss": 0.6188, + "step": 6285 + }, + { + "epoch": 0.7922662720030229, + "grad_norm": 0.3786788582801819, + "learning_rate": 0.00027607228582310947, + "loss": 0.6583, + "step": 6290 + }, + { + "epoch": 0.7928960544132002, + "grad_norm": 0.34528714418411255, + "learning_rate": 0.0002760126914675439, + "loss": 0.6594, + "step": 6295 + }, + { + "epoch": 0.7935258368233775, + "grad_norm": 0.3494967818260193, + "learning_rate": 0.00027595302944028447, + "loss": 0.6241, + "step": 6300 + }, + { + "epoch": 0.7941556192335548, + "grad_norm": 0.350005179643631, + "learning_rate": 0.00027589329977337126, + "loss": 0.6724, + "step": 6305 + }, + { + "epoch": 0.7947854016437321, + "grad_norm": 0.3381168246269226, + "learning_rate": 0.0002758335024988803, + "loss": 0.6062, + "step": 6310 + }, + { + "epoch": 0.7954151840539094, + "grad_norm": 0.32583653926849365, + "learning_rate": 0.0002757736376489242, + "loss": 0.6602, + "step": 6315 + }, + { + "epoch": 0.7960449664640866, + "grad_norm": 0.33687326312065125, + "learning_rate": 0.0002757137052556517, + "loss": 0.6391, + "step": 6320 + }, + { + "epoch": 0.7966747488742639, + "grad_norm": 0.35395026206970215, + "learning_rate": 0.00027565370535124784, + "loss": 0.6445, + "step": 6325 + }, + { + "epoch": 0.7973045312844412, + "grad_norm": 0.3484829068183899, + "learning_rate": 0.000275593637967934, + "loss": 0.6242, + "step": 6330 + }, + { + "epoch": 0.7979343136946185, + "grad_norm": 0.32783517241477966, + "learning_rate": 0.0002755335031379677, + "loss": 0.6481, + "step": 6335 + }, + { + "epoch": 0.7985640961047958, + "grad_norm": 0.3683319389820099, + "learning_rate": 0.0002754733008936427, + "loss": 0.6506, + "step": 6340 + }, + { + "epoch": 0.799193878514973, + "grad_norm": 0.360219269990921, + "learning_rate": 0.00027541303126728907, + "loss": 0.6377, + "step": 6345 + }, + { + "epoch": 0.7998236609251503, + "grad_norm": 0.3323548436164856, + "learning_rate": 0.00027535269429127283, + "loss": 0.6278, + "step": 6350 + }, + { + "epoch": 0.8004534433353276, + "grad_norm": 0.33823835849761963, + "learning_rate": 0.0002752922899979965, + "loss": 0.5999, + "step": 6355 + }, + { + "epoch": 0.801083225745505, + "grad_norm": 0.35394924879074097, + "learning_rate": 0.0002752318184198984, + "loss": 0.6873, + "step": 6360 + }, + { + "epoch": 0.8017130081556822, + "grad_norm": 0.35529881715774536, + "learning_rate": 0.00027517127958945315, + "loss": 0.6183, + "step": 6365 + }, + { + "epoch": 0.8023427905658594, + "grad_norm": 0.35854044556617737, + "learning_rate": 0.00027511067353917166, + "loss": 0.6394, + "step": 6370 + }, + { + "epoch": 0.8029725729760367, + "grad_norm": 0.32757097482681274, + "learning_rate": 0.0002750500003016006, + "loss": 0.6383, + "step": 6375 + }, + { + "epoch": 0.803602355386214, + "grad_norm": 0.3267909586429596, + "learning_rate": 0.0002749892599093229, + "loss": 0.5951, + "step": 6380 + }, + { + "epoch": 0.8042321377963914, + "grad_norm": 0.31262004375457764, + "learning_rate": 0.0002749284523949576, + "loss": 0.6497, + "step": 6385 + }, + { + "epoch": 0.8048619202065687, + "grad_norm": 0.34036824107170105, + "learning_rate": 0.00027486757779115973, + "loss": 0.6295, + "step": 6390 + }, + { + "epoch": 0.805491702616746, + "grad_norm": 0.3461470901966095, + "learning_rate": 0.0002748066361306203, + "loss": 0.6537, + "step": 6395 + }, + { + "epoch": 0.8061214850269232, + "grad_norm": 0.35146886110305786, + "learning_rate": 0.00027474562744606636, + "loss": 0.6217, + "step": 6400 + }, + { + "epoch": 0.8067512674371005, + "grad_norm": 0.37654054164886475, + "learning_rate": 0.000274684551770261, + "loss": 0.6417, + "step": 6405 + }, + { + "epoch": 0.8073810498472778, + "grad_norm": 0.36115625500679016, + "learning_rate": 0.0002746234091360032, + "loss": 0.6638, + "step": 6410 + }, + { + "epoch": 0.8080108322574551, + "grad_norm": 0.3503740727901459, + "learning_rate": 0.00027456219957612804, + "loss": 0.6652, + "step": 6415 + }, + { + "epoch": 0.8086406146676324, + "grad_norm": 0.3303118646144867, + "learning_rate": 0.0002745009231235064, + "loss": 0.614, + "step": 6420 + }, + { + "epoch": 0.8092703970778096, + "grad_norm": 0.35880813002586365, + "learning_rate": 0.00027443957981104517, + "loss": 0.6449, + "step": 6425 + }, + { + "epoch": 0.8099001794879869, + "grad_norm": 0.3664454221725464, + "learning_rate": 0.000274378169671687, + "loss": 0.6448, + "step": 6430 + }, + { + "epoch": 0.8105299618981642, + "grad_norm": 0.38473254442214966, + "learning_rate": 0.00027431669273841067, + "loss": 0.6576, + "step": 6435 + }, + { + "epoch": 0.8111597443083415, + "grad_norm": 0.3694675862789154, + "learning_rate": 0.0002742551490442307, + "loss": 0.6365, + "step": 6440 + }, + { + "epoch": 0.8117895267185188, + "grad_norm": 0.32066047191619873, + "learning_rate": 0.0002741935386221973, + "loss": 0.6563, + "step": 6445 + }, + { + "epoch": 0.8124193091286961, + "grad_norm": 0.3764455020427704, + "learning_rate": 0.0002741318615053968, + "loss": 0.61, + "step": 6450 + }, + { + "epoch": 0.8130490915388733, + "grad_norm": 0.3913812041282654, + "learning_rate": 0.00027407011772695124, + "loss": 0.6606, + "step": 6455 + }, + { + "epoch": 0.8136788739490506, + "grad_norm": 0.2876626253128052, + "learning_rate": 0.0002740083073200184, + "loss": 0.6123, + "step": 6460 + }, + { + "epoch": 0.8143086563592279, + "grad_norm": 0.37668120861053467, + "learning_rate": 0.0002739464303177919, + "loss": 0.6323, + "step": 6465 + }, + { + "epoch": 0.8149384387694052, + "grad_norm": 0.3343159854412079, + "learning_rate": 0.000273884486753501, + "loss": 0.6051, + "step": 6470 + }, + { + "epoch": 0.8155682211795825, + "grad_norm": 0.3852281868457794, + "learning_rate": 0.00027382247666041097, + "loss": 0.6614, + "step": 6475 + }, + { + "epoch": 0.8161980035897597, + "grad_norm": 0.36491283774375916, + "learning_rate": 0.0002737604000718225, + "loss": 0.6383, + "step": 6480 + }, + { + "epoch": 0.816827785999937, + "grad_norm": 0.32019633054733276, + "learning_rate": 0.00027369825702107224, + "loss": 0.623, + "step": 6485 + }, + { + "epoch": 0.8174575684101143, + "grad_norm": 0.3173837661743164, + "learning_rate": 0.0002736360475415324, + "loss": 0.599, + "step": 6490 + }, + { + "epoch": 0.8180873508202916, + "grad_norm": 0.31505605578422546, + "learning_rate": 0.00027357377166661086, + "loss": 0.6341, + "step": 6495 + }, + { + "epoch": 0.8187171332304689, + "grad_norm": 0.3370759189128876, + "learning_rate": 0.00027351142942975124, + "loss": 0.6296, + "step": 6500 + }, + { + "epoch": 0.8193469156406462, + "grad_norm": 0.3554564416408539, + "learning_rate": 0.0002734490208644327, + "loss": 0.6587, + "step": 6505 + }, + { + "epoch": 0.8199766980508234, + "grad_norm": 0.3487757444381714, + "learning_rate": 0.0002733865460041701, + "loss": 0.6292, + "step": 6510 + }, + { + "epoch": 0.8206064804610007, + "grad_norm": 0.3280607759952545, + "learning_rate": 0.0002733240048825138, + "loss": 0.5964, + "step": 6515 + }, + { + "epoch": 0.821236262871178, + "grad_norm": 0.35416868329048157, + "learning_rate": 0.0002732613975330499, + "loss": 0.6089, + "step": 6520 + }, + { + "epoch": 0.8218660452813553, + "grad_norm": 0.3558996915817261, + "learning_rate": 0.00027319872398939995, + "loss": 0.5791, + "step": 6525 + }, + { + "epoch": 0.8224958276915326, + "grad_norm": 0.35394206643104553, + "learning_rate": 0.000273135984285221, + "loss": 0.6183, + "step": 6530 + }, + { + "epoch": 0.8231256101017098, + "grad_norm": 0.33172932267189026, + "learning_rate": 0.0002730731784542058, + "loss": 0.605, + "step": 6535 + }, + { + "epoch": 0.8237553925118871, + "grad_norm": 0.3498142957687378, + "learning_rate": 0.00027301030653008253, + "loss": 0.6199, + "step": 6540 + }, + { + "epoch": 0.8243851749220644, + "grad_norm": 0.3364173471927643, + "learning_rate": 0.0002729473685466148, + "loss": 0.6352, + "step": 6545 + }, + { + "epoch": 0.8250149573322417, + "grad_norm": 0.38148370385169983, + "learning_rate": 0.00027288436453760164, + "loss": 0.6216, + "step": 6550 + }, + { + "epoch": 0.825644739742419, + "grad_norm": 0.33975306153297424, + "learning_rate": 0.0002728212945368778, + "loss": 0.6155, + "step": 6555 + }, + { + "epoch": 0.8262745221525962, + "grad_norm": 0.3361944854259491, + "learning_rate": 0.0002727581585783133, + "loss": 0.6084, + "step": 6560 + }, + { + "epoch": 0.8269043045627735, + "grad_norm": 0.3503773808479309, + "learning_rate": 0.00027269495669581353, + "loss": 0.6355, + "step": 6565 + }, + { + "epoch": 0.8275340869729508, + "grad_norm": 0.35406753420829773, + "learning_rate": 0.00027263168892331934, + "loss": 0.624, + "step": 6570 + }, + { + "epoch": 0.8281638693831281, + "grad_norm": 0.3337428569793701, + "learning_rate": 0.00027256835529480697, + "loss": 0.6451, + "step": 6575 + }, + { + "epoch": 0.8287936517933054, + "grad_norm": 0.3431616425514221, + "learning_rate": 0.00027250495584428807, + "loss": 0.5969, + "step": 6580 + }, + { + "epoch": 0.8294234342034827, + "grad_norm": 0.4032285511493683, + "learning_rate": 0.0002724414906058096, + "loss": 0.5954, + "step": 6585 + }, + { + "epoch": 0.8300532166136599, + "grad_norm": 0.3352124094963074, + "learning_rate": 0.00027237795961345383, + "loss": 0.6077, + "step": 6590 + }, + { + "epoch": 0.8306829990238372, + "grad_norm": 0.3181077837944031, + "learning_rate": 0.0002723143629013383, + "loss": 0.6107, + "step": 6595 + }, + { + "epoch": 0.8313127814340145, + "grad_norm": 0.32390958070755005, + "learning_rate": 0.000272250700503616, + "loss": 0.6345, + "step": 6600 + }, + { + "epoch": 0.8319425638441919, + "grad_norm": 0.3199234902858734, + "learning_rate": 0.0002721869724544749, + "loss": 0.6268, + "step": 6605 + }, + { + "epoch": 0.8325723462543692, + "grad_norm": 0.38811957836151123, + "learning_rate": 0.00027212317878813863, + "loss": 0.643, + "step": 6610 + }, + { + "epoch": 0.8332021286645463, + "grad_norm": 0.38010820746421814, + "learning_rate": 0.00027205931953886575, + "loss": 0.6055, + "step": 6615 + }, + { + "epoch": 0.8338319110747237, + "grad_norm": 0.3288145065307617, + "learning_rate": 0.00027199539474095013, + "loss": 0.6311, + "step": 6620 + }, + { + "epoch": 0.834461693484901, + "grad_norm": 0.33361807465553284, + "learning_rate": 0.0002719314044287209, + "loss": 0.6083, + "step": 6625 + }, + { + "epoch": 0.8350914758950783, + "grad_norm": 0.350864440202713, + "learning_rate": 0.0002718673486365423, + "loss": 0.5969, + "step": 6630 + }, + { + "epoch": 0.8357212583052556, + "grad_norm": 0.35795754194259644, + "learning_rate": 0.0002718032273988137, + "loss": 0.6623, + "step": 6635 + }, + { + "epoch": 0.8363510407154329, + "grad_norm": 0.3748815357685089, + "learning_rate": 0.0002717390407499697, + "loss": 0.6301, + "step": 6640 + }, + { + "epoch": 0.8369808231256101, + "grad_norm": 0.3146851360797882, + "learning_rate": 0.00027167478872448, + "loss": 0.62, + "step": 6645 + }, + { + "epoch": 0.8376106055357874, + "grad_norm": 0.3758367598056793, + "learning_rate": 0.0002716104713568495, + "loss": 0.6202, + "step": 6650 + }, + { + "epoch": 0.8382403879459647, + "grad_norm": 0.4035817086696625, + "learning_rate": 0.0002715460886816179, + "loss": 0.606, + "step": 6655 + }, + { + "epoch": 0.838870170356142, + "grad_norm": 0.3586306869983673, + "learning_rate": 0.00027148164073336026, + "loss": 0.6523, + "step": 6660 + }, + { + "epoch": 0.8394999527663193, + "grad_norm": 0.33375057578086853, + "learning_rate": 0.0002714171275466866, + "loss": 0.6193, + "step": 6665 + }, + { + "epoch": 0.8401297351764965, + "grad_norm": 0.30258163809776306, + "learning_rate": 0.0002713525491562421, + "loss": 0.6225, + "step": 6670 + }, + { + "epoch": 0.8407595175866738, + "grad_norm": 0.33032524585723877, + "learning_rate": 0.00027128790559670667, + "loss": 0.628, + "step": 6675 + }, + { + "epoch": 0.8413892999968511, + "grad_norm": 0.36689457297325134, + "learning_rate": 0.00027122319690279535, + "loss": 0.6341, + "step": 6680 + }, + { + "epoch": 0.8420190824070284, + "grad_norm": 0.35744035243988037, + "learning_rate": 0.00027115842310925837, + "loss": 0.5945, + "step": 6685 + }, + { + "epoch": 0.8426488648172057, + "grad_norm": 0.3377218246459961, + "learning_rate": 0.0002710935842508806, + "loss": 0.6216, + "step": 6690 + }, + { + "epoch": 0.843278647227383, + "grad_norm": 0.3244309723377228, + "learning_rate": 0.000271028680362482, + "loss": 0.6045, + "step": 6695 + }, + { + "epoch": 0.8439084296375602, + "grad_norm": 0.34593185782432556, + "learning_rate": 0.00027096371147891744, + "loss": 0.6277, + "step": 6700 + }, + { + "epoch": 0.8445382120477375, + "grad_norm": 0.3151993751525879, + "learning_rate": 0.0002708986776350767, + "loss": 0.5929, + "step": 6705 + }, + { + "epoch": 0.8451679944579148, + "grad_norm": 0.38307860493659973, + "learning_rate": 0.0002708335788658845, + "loss": 0.5934, + "step": 6710 + }, + { + "epoch": 0.8457977768680921, + "grad_norm": 0.3155449330806732, + "learning_rate": 0.0002707684152063003, + "loss": 0.5838, + "step": 6715 + }, + { + "epoch": 0.8464275592782694, + "grad_norm": 0.3827744424343109, + "learning_rate": 0.00027070318669131845, + "loss": 0.5976, + "step": 6720 + }, + { + "epoch": 0.8470573416884466, + "grad_norm": 0.35382625460624695, + "learning_rate": 0.00027063789335596825, + "loss": 0.5997, + "step": 6725 + }, + { + "epoch": 0.8476871240986239, + "grad_norm": 0.36884164810180664, + "learning_rate": 0.00027057253523531365, + "loss": 0.6373, + "step": 6730 + }, + { + "epoch": 0.8483169065088012, + "grad_norm": 0.35557276010513306, + "learning_rate": 0.0002705071123644534, + "loss": 0.6717, + "step": 6735 + }, + { + "epoch": 0.8489466889189785, + "grad_norm": 0.3088480234146118, + "learning_rate": 0.00027044162477852124, + "loss": 0.6011, + "step": 6740 + }, + { + "epoch": 0.8495764713291558, + "grad_norm": 0.33665964007377625, + "learning_rate": 0.0002703760725126853, + "loss": 0.6039, + "step": 6745 + }, + { + "epoch": 0.850206253739333, + "grad_norm": 0.3297666311264038, + "learning_rate": 0.0002703104556021488, + "loss": 0.6226, + "step": 6750 + }, + { + "epoch": 0.8508360361495103, + "grad_norm": 0.33897268772125244, + "learning_rate": 0.00027024477408214945, + "loss": 0.5564, + "step": 6755 + }, + { + "epoch": 0.8514658185596876, + "grad_norm": 0.3549680709838867, + "learning_rate": 0.0002701790279879597, + "loss": 0.5989, + "step": 6760 + }, + { + "epoch": 0.8520956009698649, + "grad_norm": 0.31162139773368835, + "learning_rate": 0.0002701132173548868, + "loss": 0.6363, + "step": 6765 + }, + { + "epoch": 0.8527253833800422, + "grad_norm": 0.35543885827064514, + "learning_rate": 0.0002700473422182724, + "loss": 0.6228, + "step": 6770 + }, + { + "epoch": 0.8533551657902195, + "grad_norm": 0.3361263871192932, + "learning_rate": 0.0002699814026134932, + "loss": 0.5957, + "step": 6775 + }, + { + "epoch": 0.8539849482003967, + "grad_norm": 0.2764013707637787, + "learning_rate": 0.00026991539857596, + "loss": 0.5982, + "step": 6780 + }, + { + "epoch": 0.854614730610574, + "grad_norm": 0.3229328691959381, + "learning_rate": 0.0002698493301411187, + "loss": 0.6562, + "step": 6785 + }, + { + "epoch": 0.8552445130207513, + "grad_norm": 0.3163946270942688, + "learning_rate": 0.00026978319734444943, + "loss": 0.6125, + "step": 6790 + }, + { + "epoch": 0.8558742954309286, + "grad_norm": 0.38809090852737427, + "learning_rate": 0.0002697170002214671, + "loss": 0.6308, + "step": 6795 + }, + { + "epoch": 0.8565040778411059, + "grad_norm": 0.3416973650455475, + "learning_rate": 0.0002696507388077209, + "loss": 0.6565, + "step": 6800 + }, + { + "epoch": 0.8571338602512831, + "grad_norm": 0.3220008909702301, + "learning_rate": 0.00026958441313879494, + "loss": 0.6211, + "step": 6805 + }, + { + "epoch": 0.8577636426614604, + "grad_norm": 0.34507647156715393, + "learning_rate": 0.00026951802325030755, + "loss": 0.6384, + "step": 6810 + }, + { + "epoch": 0.8583934250716377, + "grad_norm": 0.3345770239830017, + "learning_rate": 0.00026945156917791154, + "loss": 0.6566, + "step": 6815 + }, + { + "epoch": 0.859023207481815, + "grad_norm": 0.32488980889320374, + "learning_rate": 0.0002693850509572943, + "loss": 0.626, + "step": 6820 + }, + { + "epoch": 0.8596529898919923, + "grad_norm": 0.3537434935569763, + "learning_rate": 0.00026931846862417766, + "loss": 0.6539, + "step": 6825 + }, + { + "epoch": 0.8602827723021697, + "grad_norm": 0.3165736794471741, + "learning_rate": 0.0002692518222143179, + "loss": 0.6468, + "step": 6830 + }, + { + "epoch": 0.8609125547123468, + "grad_norm": 0.34746891260147095, + "learning_rate": 0.0002691851117635056, + "loss": 0.6498, + "step": 6835 + }, + { + "epoch": 0.8615423371225241, + "grad_norm": 0.3370078206062317, + "learning_rate": 0.00026911833730756577, + "loss": 0.5951, + "step": 6840 + }, + { + "epoch": 0.8621721195327015, + "grad_norm": 0.3180099427700043, + "learning_rate": 0.00026905149888235787, + "loss": 0.609, + "step": 6845 + }, + { + "epoch": 0.8628019019428788, + "grad_norm": 0.34123897552490234, + "learning_rate": 0.0002689845965237757, + "loss": 0.6228, + "step": 6850 + }, + { + "epoch": 0.8634316843530561, + "grad_norm": 0.3529733717441559, + "learning_rate": 0.00026891763026774725, + "loss": 0.6101, + "step": 6855 + }, + { + "epoch": 0.8640614667632333, + "grad_norm": 0.3116464614868164, + "learning_rate": 0.00026885060015023496, + "loss": 0.5734, + "step": 6860 + }, + { + "epoch": 0.8646912491734106, + "grad_norm": 0.3331621587276459, + "learning_rate": 0.00026878350620723556, + "loss": 0.6004, + "step": 6865 + }, + { + "epoch": 0.8653210315835879, + "grad_norm": 0.3215835690498352, + "learning_rate": 0.00026871634847478007, + "loss": 0.6105, + "step": 6870 + }, + { + "epoch": 0.8659508139937652, + "grad_norm": 0.3454177677631378, + "learning_rate": 0.0002686491269889336, + "loss": 0.6203, + "step": 6875 + }, + { + "epoch": 0.8665805964039425, + "grad_norm": 0.3336181640625, + "learning_rate": 0.0002685818417857958, + "loss": 0.6179, + "step": 6880 + }, + { + "epoch": 0.8672103788141198, + "grad_norm": 0.3452587127685547, + "learning_rate": 0.00026851449290150024, + "loss": 0.5918, + "step": 6885 + }, + { + "epoch": 0.867840161224297, + "grad_norm": 0.37552833557128906, + "learning_rate": 0.0002684470803722148, + "loss": 0.6284, + "step": 6890 + }, + { + "epoch": 0.8684699436344743, + "grad_norm": 0.33525559306144714, + "learning_rate": 0.0002683796042341416, + "loss": 0.6465, + "step": 6895 + }, + { + "epoch": 0.8690997260446516, + "grad_norm": 0.3272569477558136, + "learning_rate": 0.00026831206452351683, + "loss": 0.636, + "step": 6900 + }, + { + "epoch": 0.8697295084548289, + "grad_norm": 0.35215091705322266, + "learning_rate": 0.0002682444612766109, + "loss": 0.6415, + "step": 6905 + }, + { + "epoch": 0.8703592908650062, + "grad_norm": 0.33025211095809937, + "learning_rate": 0.0002681767945297282, + "loss": 0.6677, + "step": 6910 + }, + { + "epoch": 0.8709890732751834, + "grad_norm": 0.34073176980018616, + "learning_rate": 0.0002681090643192075, + "loss": 0.6386, + "step": 6915 + }, + { + "epoch": 0.8716188556853607, + "grad_norm": 0.4070134162902832, + "learning_rate": 0.0002680412706814213, + "loss": 0.6365, + "step": 6920 + }, + { + "epoch": 0.872248638095538, + "grad_norm": 0.33693283796310425, + "learning_rate": 0.00026797341365277644, + "loss": 0.6465, + "step": 6925 + }, + { + "epoch": 0.8728784205057153, + "grad_norm": 0.3678983747959137, + "learning_rate": 0.0002679054932697136, + "loss": 0.594, + "step": 6930 + }, + { + "epoch": 0.8735082029158926, + "grad_norm": 0.31632333993911743, + "learning_rate": 0.00026783750956870764, + "loss": 0.6128, + "step": 6935 + }, + { + "epoch": 0.8741379853260698, + "grad_norm": 0.3184865713119507, + "learning_rate": 0.0002677694625862674, + "loss": 0.5955, + "step": 6940 + }, + { + "epoch": 0.8747677677362471, + "grad_norm": 0.33729860186576843, + "learning_rate": 0.00026770135235893556, + "loss": 0.609, + "step": 6945 + }, + { + "epoch": 0.8753975501464244, + "grad_norm": 0.3195466995239258, + "learning_rate": 0.0002676331789232889, + "loss": 0.6399, + "step": 6950 + }, + { + "epoch": 0.8760273325566017, + "grad_norm": 0.35504212975502014, + "learning_rate": 0.0002675649423159382, + "loss": 0.6162, + "step": 6955 + }, + { + "epoch": 0.876657114966779, + "grad_norm": 0.3598940372467041, + "learning_rate": 0.000267496642573528, + "loss": 0.6117, + "step": 6960 + }, + { + "epoch": 0.8772868973769563, + "grad_norm": 0.32016637921333313, + "learning_rate": 0.0002674282797327368, + "loss": 0.6129, + "step": 6965 + }, + { + "epoch": 0.8779166797871335, + "grad_norm": 0.36968451738357544, + "learning_rate": 0.00026735985383027704, + "loss": 0.619, + "step": 6970 + }, + { + "epoch": 0.8785464621973108, + "grad_norm": 0.3299955427646637, + "learning_rate": 0.000267291364902895, + "loss": 0.5894, + "step": 6975 + }, + { + "epoch": 0.8791762446074881, + "grad_norm": 0.34892305731773376, + "learning_rate": 0.0002672228129873708, + "loss": 0.6152, + "step": 6980 + }, + { + "epoch": 0.8798060270176654, + "grad_norm": 0.379016637802124, + "learning_rate": 0.00026715419812051833, + "loss": 0.6633, + "step": 6985 + }, + { + "epoch": 0.8804358094278427, + "grad_norm": 0.3378797173500061, + "learning_rate": 0.00026708552033918544, + "loss": 0.5911, + "step": 6990 + }, + { + "epoch": 0.8810655918380199, + "grad_norm": 0.3348138928413391, + "learning_rate": 0.0002670167796802536, + "loss": 0.5841, + "step": 6995 + }, + { + "epoch": 0.8816953742481972, + "grad_norm": 0.36374861001968384, + "learning_rate": 0.0002669479761806381, + "loss": 0.5973, + "step": 7000 + }, + { + "epoch": 0.8816953742481972, + "eval_loss": 0.3066178560256958, + "eval_runtime": 6.2494, + "eval_samples_per_second": 160.014, + "eval_steps_per_second": 10.081, + "step": 7000 + }, + { + "epoch": 0.8823251566583745, + "grad_norm": 0.31616318225860596, + "learning_rate": 0.000266879109877288, + "loss": 0.6302, + "step": 7005 + }, + { + "epoch": 0.8829549390685518, + "grad_norm": 0.37413114309310913, + "learning_rate": 0.00026681018080718615, + "loss": 0.6141, + "step": 7010 + }, + { + "epoch": 0.8835847214787291, + "grad_norm": 0.3616124987602234, + "learning_rate": 0.0002667411890073489, + "loss": 0.6081, + "step": 7015 + }, + { + "epoch": 0.8842145038889064, + "grad_norm": 0.3536156713962555, + "learning_rate": 0.00026667213451482655, + "loss": 0.6101, + "step": 7020 + }, + { + "epoch": 0.8848442862990836, + "grad_norm": 0.2826579809188843, + "learning_rate": 0.00026660301736670293, + "loss": 0.5803, + "step": 7025 + }, + { + "epoch": 0.8854740687092609, + "grad_norm": 0.3352709710597992, + "learning_rate": 0.00026653383760009546, + "loss": 0.5994, + "step": 7030 + }, + { + "epoch": 0.8861038511194382, + "grad_norm": 0.320122092962265, + "learning_rate": 0.00026646459525215524, + "loss": 0.6159, + "step": 7035 + }, + { + "epoch": 0.8867336335296155, + "grad_norm": 0.3512963652610779, + "learning_rate": 0.0002663952903600671, + "loss": 0.6034, + "step": 7040 + }, + { + "epoch": 0.8873634159397928, + "grad_norm": 0.358071506023407, + "learning_rate": 0.00026632592296104926, + "loss": 0.6155, + "step": 7045 + }, + { + "epoch": 0.88799319834997, + "grad_norm": 0.342318058013916, + "learning_rate": 0.0002662564930923536, + "loss": 0.5997, + "step": 7050 + }, + { + "epoch": 0.8886229807601473, + "grad_norm": 0.291960746049881, + "learning_rate": 0.0002661870007912656, + "loss": 0.5721, + "step": 7055 + }, + { + "epoch": 0.8892527631703246, + "grad_norm": 0.3608805239200592, + "learning_rate": 0.0002661174460951042, + "loss": 0.6248, + "step": 7060 + }, + { + "epoch": 0.889882545580502, + "grad_norm": 0.329289972782135, + "learning_rate": 0.0002660478290412218, + "loss": 0.6163, + "step": 7065 + }, + { + "epoch": 0.8905123279906793, + "grad_norm": 0.352383553981781, + "learning_rate": 0.0002659781496670044, + "loss": 0.6252, + "step": 7070 + }, + { + "epoch": 0.8911421104008566, + "grad_norm": 0.3424574136734009, + "learning_rate": 0.0002659084080098714, + "loss": 0.5562, + "step": 7075 + }, + { + "epoch": 0.8917718928110338, + "grad_norm": 0.32095563411712646, + "learning_rate": 0.0002658386041072757, + "loss": 0.6232, + "step": 7080 + }, + { + "epoch": 0.892401675221211, + "grad_norm": 0.3307218849658966, + "learning_rate": 0.00026576873799670356, + "loss": 0.5958, + "step": 7085 + }, + { + "epoch": 0.8930314576313884, + "grad_norm": 0.31858259439468384, + "learning_rate": 0.00026569880971567464, + "loss": 0.6128, + "step": 7090 + }, + { + "epoch": 0.8936612400415657, + "grad_norm": 0.3014832139015198, + "learning_rate": 0.00026562881930174213, + "loss": 0.5886, + "step": 7095 + }, + { + "epoch": 0.894291022451743, + "grad_norm": 0.35925576090812683, + "learning_rate": 0.00026555876679249234, + "loss": 0.6032, + "step": 7100 + }, + { + "epoch": 0.8949208048619202, + "grad_norm": 0.337100625038147, + "learning_rate": 0.0002654886522255452, + "loss": 0.6217, + "step": 7105 + }, + { + "epoch": 0.8955505872720975, + "grad_norm": 0.34906861186027527, + "learning_rate": 0.00026541847563855373, + "loss": 0.5999, + "step": 7110 + }, + { + "epoch": 0.8961803696822748, + "grad_norm": 0.2829444110393524, + "learning_rate": 0.00026534823706920443, + "loss": 0.5747, + "step": 7115 + }, + { + "epoch": 0.8968101520924521, + "grad_norm": 0.3298097550868988, + "learning_rate": 0.00026527793655521697, + "loss": 0.5959, + "step": 7120 + }, + { + "epoch": 0.8974399345026294, + "grad_norm": 0.3762158453464508, + "learning_rate": 0.0002652075741343444, + "loss": 0.6325, + "step": 7125 + }, + { + "epoch": 0.8980697169128066, + "grad_norm": 0.3318065106868744, + "learning_rate": 0.00026513714984437284, + "loss": 0.6015, + "step": 7130 + }, + { + "epoch": 0.8986994993229839, + "grad_norm": 0.3132246434688568, + "learning_rate": 0.0002650666637231218, + "loss": 0.6317, + "step": 7135 + }, + { + "epoch": 0.8993292817331612, + "grad_norm": 0.3308473527431488, + "learning_rate": 0.00026499611580844403, + "loss": 0.6364, + "step": 7140 + }, + { + "epoch": 0.8999590641433385, + "grad_norm": 0.31450155377388, + "learning_rate": 0.0002649255061382252, + "loss": 0.6186, + "step": 7145 + }, + { + "epoch": 0.9005888465535158, + "grad_norm": 0.3408615291118622, + "learning_rate": 0.00026485483475038445, + "loss": 0.5954, + "step": 7150 + }, + { + "epoch": 0.9012186289636931, + "grad_norm": 0.34355321526527405, + "learning_rate": 0.0002647841016828738, + "loss": 0.6143, + "step": 7155 + }, + { + "epoch": 0.9018484113738703, + "grad_norm": 0.35341107845306396, + "learning_rate": 0.00026471330697367865, + "loss": 0.5887, + "step": 7160 + }, + { + "epoch": 0.9024781937840476, + "grad_norm": 0.3439336121082306, + "learning_rate": 0.0002646424506608173, + "loss": 0.6152, + "step": 7165 + }, + { + "epoch": 0.9031079761942249, + "grad_norm": 0.32301509380340576, + "learning_rate": 0.00026457153278234126, + "loss": 0.6191, + "step": 7170 + }, + { + "epoch": 0.9037377586044022, + "grad_norm": 0.3085480034351349, + "learning_rate": 0.000264500553376335, + "loss": 0.5993, + "step": 7175 + }, + { + "epoch": 0.9043675410145795, + "grad_norm": 0.3285475969314575, + "learning_rate": 0.0002644295124809161, + "loss": 0.5832, + "step": 7180 + }, + { + "epoch": 0.9049973234247567, + "grad_norm": 0.3160327076911926, + "learning_rate": 0.0002643584101342352, + "loss": 0.6258, + "step": 7185 + }, + { + "epoch": 0.905627105834934, + "grad_norm": 0.30449238419532776, + "learning_rate": 0.0002642872463744759, + "loss": 0.62, + "step": 7190 + }, + { + "epoch": 0.9062568882451113, + "grad_norm": 0.31154754757881165, + "learning_rate": 0.00026421602123985455, + "loss": 0.5888, + "step": 7195 + }, + { + "epoch": 0.9068866706552886, + "grad_norm": 0.32224607467651367, + "learning_rate": 0.0002641447347686209, + "loss": 0.5971, + "step": 7200 + }, + { + "epoch": 0.9075164530654659, + "grad_norm": 0.33809399604797363, + "learning_rate": 0.0002640733869990573, + "loss": 0.5942, + "step": 7205 + }, + { + "epoch": 0.9081462354756432, + "grad_norm": 0.337990403175354, + "learning_rate": 0.0002640019779694792, + "loss": 0.5996, + "step": 7210 + }, + { + "epoch": 0.9087760178858204, + "grad_norm": 0.33843520283699036, + "learning_rate": 0.0002639305077182348, + "loss": 0.6009, + "step": 7215 + }, + { + "epoch": 0.9094058002959977, + "grad_norm": 0.31854307651519775, + "learning_rate": 0.00026385897628370536, + "loss": 0.5929, + "step": 7220 + }, + { + "epoch": 0.910035582706175, + "grad_norm": 0.31263160705566406, + "learning_rate": 0.0002637873837043049, + "loss": 0.5861, + "step": 7225 + }, + { + "epoch": 0.9106653651163523, + "grad_norm": 0.3141006827354431, + "learning_rate": 0.00026371573001848005, + "loss": 0.6204, + "step": 7230 + }, + { + "epoch": 0.9112951475265296, + "grad_norm": 0.3565130829811096, + "learning_rate": 0.00026364401526471077, + "loss": 0.6051, + "step": 7235 + }, + { + "epoch": 0.9119249299367068, + "grad_norm": 0.3886755108833313, + "learning_rate": 0.0002635722394815094, + "loss": 0.6162, + "step": 7240 + }, + { + "epoch": 0.9125547123468841, + "grad_norm": 0.32173478603363037, + "learning_rate": 0.0002635004027074211, + "loss": 0.5908, + "step": 7245 + }, + { + "epoch": 0.9131844947570614, + "grad_norm": 0.3483346998691559, + "learning_rate": 0.0002634285049810239, + "loss": 0.5934, + "step": 7250 + }, + { + "epoch": 0.9138142771672387, + "grad_norm": 0.31829094886779785, + "learning_rate": 0.00026335654634092857, + "loss": 0.6205, + "step": 7255 + }, + { + "epoch": 0.914444059577416, + "grad_norm": 0.2864934206008911, + "learning_rate": 0.0002632845268257785, + "loss": 0.5486, + "step": 7260 + }, + { + "epoch": 0.9150738419875933, + "grad_norm": 0.34583529829978943, + "learning_rate": 0.0002632124464742499, + "loss": 0.5994, + "step": 7265 + }, + { + "epoch": 0.9157036243977705, + "grad_norm": 0.3405662775039673, + "learning_rate": 0.00026314030532505146, + "loss": 0.5941, + "step": 7270 + }, + { + "epoch": 0.9163334068079478, + "grad_norm": 0.319985568523407, + "learning_rate": 0.00026306810341692464, + "loss": 0.5949, + "step": 7275 + }, + { + "epoch": 0.9169631892181251, + "grad_norm": 0.3206420838832855, + "learning_rate": 0.00026299584078864354, + "loss": 0.5895, + "step": 7280 + }, + { + "epoch": 0.9175929716283024, + "grad_norm": 0.33022215962409973, + "learning_rate": 0.00026292351747901486, + "loss": 0.6018, + "step": 7285 + }, + { + "epoch": 0.9182227540384797, + "grad_norm": 0.3440692722797394, + "learning_rate": 0.00026285113352687785, + "loss": 0.5818, + "step": 7290 + }, + { + "epoch": 0.9188525364486569, + "grad_norm": 0.3580811619758606, + "learning_rate": 0.0002627786889711043, + "loss": 0.6024, + "step": 7295 + }, + { + "epoch": 0.9194823188588342, + "grad_norm": 0.3101358413696289, + "learning_rate": 0.0002627061838505987, + "loss": 0.6241, + "step": 7300 + }, + { + "epoch": 0.9201121012690116, + "grad_norm": 0.3681425452232361, + "learning_rate": 0.00026263361820429783, + "loss": 0.5759, + "step": 7305 + }, + { + "epoch": 0.9207418836791889, + "grad_norm": 0.3331769108772278, + "learning_rate": 0.0002625609920711712, + "loss": 0.5696, + "step": 7310 + }, + { + "epoch": 0.9213716660893662, + "grad_norm": 0.34252071380615234, + "learning_rate": 0.00026248830549022064, + "loss": 0.6171, + "step": 7315 + }, + { + "epoch": 0.9220014484995434, + "grad_norm": 0.31009170413017273, + "learning_rate": 0.00026241555850048056, + "loss": 0.5758, + "step": 7320 + }, + { + "epoch": 0.9226312309097207, + "grad_norm": 0.33126717805862427, + "learning_rate": 0.00026234275114101765, + "loss": 0.557, + "step": 7325 + }, + { + "epoch": 0.923261013319898, + "grad_norm": 0.35423141717910767, + "learning_rate": 0.00026226988345093126, + "loss": 0.6239, + "step": 7330 + }, + { + "epoch": 0.9238907957300753, + "grad_norm": 0.31321558356285095, + "learning_rate": 0.0002621969554693529, + "loss": 0.5796, + "step": 7335 + }, + { + "epoch": 0.9245205781402526, + "grad_norm": 0.38709312677383423, + "learning_rate": 0.00026212396723544664, + "loss": 0.5831, + "step": 7340 + }, + { + "epoch": 0.9251503605504299, + "grad_norm": 0.3205506205558777, + "learning_rate": 0.0002620509187884088, + "loss": 0.5577, + "step": 7345 + }, + { + "epoch": 0.9257801429606071, + "grad_norm": 0.3263196647167206, + "learning_rate": 0.00026197781016746804, + "loss": 0.5729, + "step": 7350 + }, + { + "epoch": 0.9264099253707844, + "grad_norm": 0.3553536534309387, + "learning_rate": 0.0002619046414118854, + "loss": 0.5968, + "step": 7355 + }, + { + "epoch": 0.9270397077809617, + "grad_norm": 0.4170524477958679, + "learning_rate": 0.0002618314125609541, + "loss": 0.5731, + "step": 7360 + }, + { + "epoch": 0.927669490191139, + "grad_norm": 0.3739701807498932, + "learning_rate": 0.00026175812365399976, + "loss": 0.5785, + "step": 7365 + }, + { + "epoch": 0.9282992726013163, + "grad_norm": 0.32139813899993896, + "learning_rate": 0.0002616847747303802, + "loss": 0.5909, + "step": 7370 + }, + { + "epoch": 0.9289290550114935, + "grad_norm": 0.3099890947341919, + "learning_rate": 0.00026161136582948544, + "loss": 0.5579, + "step": 7375 + }, + { + "epoch": 0.9295588374216708, + "grad_norm": 0.349729984998703, + "learning_rate": 0.0002615378969907378, + "loss": 0.5762, + "step": 7380 + }, + { + "epoch": 0.9301886198318481, + "grad_norm": 0.3257734775543213, + "learning_rate": 0.00026146436825359167, + "loss": 0.6216, + "step": 7385 + }, + { + "epoch": 0.9308184022420254, + "grad_norm": 0.3399578332901001, + "learning_rate": 0.0002613907796575337, + "loss": 0.5694, + "step": 7390 + }, + { + "epoch": 0.9314481846522027, + "grad_norm": 0.3863985240459442, + "learning_rate": 0.0002613171312420826, + "loss": 0.6416, + "step": 7395 + }, + { + "epoch": 0.93207796706238, + "grad_norm": 0.3288150429725647, + "learning_rate": 0.0002612434230467892, + "loss": 0.5839, + "step": 7400 + }, + { + "epoch": 0.9327077494725572, + "grad_norm": 0.37783902883529663, + "learning_rate": 0.00026116965511123664, + "loss": 0.5919, + "step": 7405 + }, + { + "epoch": 0.9333375318827345, + "grad_norm": 0.36346110701560974, + "learning_rate": 0.00026109582747503986, + "loss": 0.5796, + "step": 7410 + }, + { + "epoch": 0.9339673142929118, + "grad_norm": 0.3194875419139862, + "learning_rate": 0.00026102194017784606, + "loss": 0.5808, + "step": 7415 + }, + { + "epoch": 0.9345970967030891, + "grad_norm": 0.286823570728302, + "learning_rate": 0.00026094799325933435, + "loss": 0.5605, + "step": 7420 + }, + { + "epoch": 0.9352268791132664, + "grad_norm": 0.3147251307964325, + "learning_rate": 0.0002608739867592159, + "loss": 0.572, + "step": 7425 + }, + { + "epoch": 0.9358566615234436, + "grad_norm": 0.34172821044921875, + "learning_rate": 0.000260799920717234, + "loss": 0.5763, + "step": 7430 + }, + { + "epoch": 0.9364864439336209, + "grad_norm": 0.32804232835769653, + "learning_rate": 0.0002607257951731637, + "loss": 0.5925, + "step": 7435 + }, + { + "epoch": 0.9371162263437982, + "grad_norm": 0.2969893515110016, + "learning_rate": 0.0002606516101668122, + "loss": 0.5754, + "step": 7440 + }, + { + "epoch": 0.9377460087539755, + "grad_norm": 0.3364142179489136, + "learning_rate": 0.00026057736573801844, + "loss": 0.6248, + "step": 7445 + }, + { + "epoch": 0.9383757911641528, + "grad_norm": 0.3493711054325104, + "learning_rate": 0.0002605030619266534, + "loss": 0.5828, + "step": 7450 + }, + { + "epoch": 0.9390055735743301, + "grad_norm": 0.3338306248188019, + "learning_rate": 0.00026042869877262, + "loss": 0.5947, + "step": 7455 + }, + { + "epoch": 0.9396353559845073, + "grad_norm": 0.30441364645957947, + "learning_rate": 0.0002603542763158529, + "loss": 0.5743, + "step": 7460 + }, + { + "epoch": 0.9402651383946846, + "grad_norm": 0.31838342547416687, + "learning_rate": 0.0002602797945963186, + "loss": 0.5493, + "step": 7465 + }, + { + "epoch": 0.9408949208048619, + "grad_norm": 0.3308780789375305, + "learning_rate": 0.0002602052536540156, + "loss": 0.5984, + "step": 7470 + }, + { + "epoch": 0.9415247032150392, + "grad_norm": 0.30487555265426636, + "learning_rate": 0.00026013065352897407, + "loss": 0.5687, + "step": 7475 + }, + { + "epoch": 0.9421544856252165, + "grad_norm": 0.33297523856163025, + "learning_rate": 0.0002600559942612559, + "loss": 0.5728, + "step": 7480 + }, + { + "epoch": 0.9427842680353937, + "grad_norm": 0.3194848299026489, + "learning_rate": 0.00025998127589095483, + "loss": 0.5939, + "step": 7485 + }, + { + "epoch": 0.943414050445571, + "grad_norm": 0.3401489555835724, + "learning_rate": 0.0002599064984581964, + "loss": 0.5282, + "step": 7490 + }, + { + "epoch": 0.9440438328557483, + "grad_norm": 0.3722991943359375, + "learning_rate": 0.0002598316620031378, + "loss": 0.6044, + "step": 7495 + }, + { + "epoch": 0.9446736152659256, + "grad_norm": 0.3582395613193512, + "learning_rate": 0.0002597567665659678, + "loss": 0.574, + "step": 7500 + }, + { + "epoch": 0.9453033976761029, + "grad_norm": 0.30922654271125793, + "learning_rate": 0.0002596818121869071, + "loss": 0.6086, + "step": 7505 + }, + { + "epoch": 0.9459331800862802, + "grad_norm": 0.34381213784217834, + "learning_rate": 0.00025960679890620785, + "loss": 0.6032, + "step": 7510 + }, + { + "epoch": 0.9465629624964574, + "grad_norm": 0.3153468072414398, + "learning_rate": 0.0002595317267641539, + "loss": 0.5758, + "step": 7515 + }, + { + "epoch": 0.9471927449066347, + "grad_norm": 0.30763527750968933, + "learning_rate": 0.0002594565958010607, + "loss": 0.6036, + "step": 7520 + }, + { + "epoch": 0.947822527316812, + "grad_norm": 0.33897343277931213, + "learning_rate": 0.00025938140605727536, + "loss": 0.5879, + "step": 7525 + }, + { + "epoch": 0.9484523097269894, + "grad_norm": 0.2996034324169159, + "learning_rate": 0.00025930615757317635, + "loss": 0.6095, + "step": 7530 + }, + { + "epoch": 0.9490820921371667, + "grad_norm": 0.37265533208847046, + "learning_rate": 0.00025923085038917395, + "loss": 0.5718, + "step": 7535 + }, + { + "epoch": 0.9497118745473438, + "grad_norm": 0.32904815673828125, + "learning_rate": 0.00025915548454570977, + "loss": 0.5689, + "step": 7540 + }, + { + "epoch": 0.9503416569575212, + "grad_norm": 0.3493824005126953, + "learning_rate": 0.000259080060083257, + "loss": 0.594, + "step": 7545 + }, + { + "epoch": 0.9509714393676985, + "grad_norm": 0.33561789989471436, + "learning_rate": 0.0002590045770423204, + "loss": 0.5604, + "step": 7550 + }, + { + "epoch": 0.9516012217778758, + "grad_norm": 0.3272433876991272, + "learning_rate": 0.00025892903546343587, + "loss": 0.5819, + "step": 7555 + }, + { + "epoch": 0.9522310041880531, + "grad_norm": 0.34539222717285156, + "learning_rate": 0.00025885343538717116, + "loss": 0.591, + "step": 7560 + }, + { + "epoch": 0.9528607865982303, + "grad_norm": 0.3331897258758545, + "learning_rate": 0.0002587777768541252, + "loss": 0.5885, + "step": 7565 + }, + { + "epoch": 0.9534905690084076, + "grad_norm": 0.3285147547721863, + "learning_rate": 0.00025870205990492827, + "loss": 0.5561, + "step": 7570 + }, + { + "epoch": 0.9541203514185849, + "grad_norm": 0.3221907317638397, + "learning_rate": 0.0002586262845802422, + "loss": 0.5837, + "step": 7575 + }, + { + "epoch": 0.9547501338287622, + "grad_norm": 0.4986007511615753, + "learning_rate": 0.00025855045092076, + "loss": 0.5645, + "step": 7580 + }, + { + "epoch": 0.9553799162389395, + "grad_norm": 0.33891043066978455, + "learning_rate": 0.00025847455896720615, + "loss": 0.5801, + "step": 7585 + }, + { + "epoch": 0.9560096986491168, + "grad_norm": 0.345480740070343, + "learning_rate": 0.00025839860876033626, + "loss": 0.5876, + "step": 7590 + }, + { + "epoch": 0.956639481059294, + "grad_norm": 0.39212220907211304, + "learning_rate": 0.0002583226003409374, + "loss": 0.5949, + "step": 7595 + }, + { + "epoch": 0.9572692634694713, + "grad_norm": 0.3195202648639679, + "learning_rate": 0.00025824653374982776, + "loss": 0.592, + "step": 7600 + }, + { + "epoch": 0.9578990458796486, + "grad_norm": 0.31688785552978516, + "learning_rate": 0.00025817040902785694, + "loss": 0.5432, + "step": 7605 + }, + { + "epoch": 0.9585288282898259, + "grad_norm": 0.3165288269519806, + "learning_rate": 0.00025809422621590554, + "loss": 0.552, + "step": 7610 + }, + { + "epoch": 0.9591586107000032, + "grad_norm": 0.33528926968574524, + "learning_rate": 0.0002580179853548856, + "loss": 0.5745, + "step": 7615 + }, + { + "epoch": 0.9597883931101804, + "grad_norm": 0.34123846888542175, + "learning_rate": 0.0002579416864857401, + "loss": 0.6019, + "step": 7620 + }, + { + "epoch": 0.9604181755203577, + "grad_norm": 0.3223724663257599, + "learning_rate": 0.0002578653296494433, + "loss": 0.5725, + "step": 7625 + }, + { + "epoch": 0.961047957930535, + "grad_norm": 0.349751740694046, + "learning_rate": 0.0002577889148870006, + "loss": 0.5739, + "step": 7630 + }, + { + "epoch": 0.9616777403407123, + "grad_norm": 0.3111324608325958, + "learning_rate": 0.0002577124422394484, + "loss": 0.5555, + "step": 7635 + }, + { + "epoch": 0.9623075227508896, + "grad_norm": 0.364615797996521, + "learning_rate": 0.00025763591174785433, + "loss": 0.5789, + "step": 7640 + }, + { + "epoch": 0.9629373051610669, + "grad_norm": 0.31817707419395447, + "learning_rate": 0.000257559323453317, + "loss": 0.5799, + "step": 7645 + }, + { + "epoch": 0.9635670875712441, + "grad_norm": 0.33710840344429016, + "learning_rate": 0.000257482677396966, + "loss": 0.6, + "step": 7650 + }, + { + "epoch": 0.9641968699814214, + "grad_norm": 0.3512105345726013, + "learning_rate": 0.00025740597361996215, + "loss": 0.5772, + "step": 7655 + }, + { + "epoch": 0.9648266523915987, + "grad_norm": 0.32505640387535095, + "learning_rate": 0.00025732921216349705, + "loss": 0.5872, + "step": 7660 + }, + { + "epoch": 0.965456434801776, + "grad_norm": 0.32156363129615784, + "learning_rate": 0.0002572523930687933, + "loss": 0.5842, + "step": 7665 + }, + { + "epoch": 0.9660862172119533, + "grad_norm": 0.313147634267807, + "learning_rate": 0.0002571755163771046, + "loss": 0.5697, + "step": 7670 + }, + { + "epoch": 0.9667159996221305, + "grad_norm": 0.3494894504547119, + "learning_rate": 0.00025709858212971545, + "loss": 0.5651, + "step": 7675 + }, + { + "epoch": 0.9673457820323078, + "grad_norm": 0.317107230424881, + "learning_rate": 0.00025702159036794135, + "loss": 0.5563, + "step": 7680 + }, + { + "epoch": 0.9679755644424851, + "grad_norm": 0.3228907585144043, + "learning_rate": 0.00025694454113312854, + "loss": 0.5642, + "step": 7685 + }, + { + "epoch": 0.9686053468526624, + "grad_norm": 0.33400991559028625, + "learning_rate": 0.00025686743446665426, + "loss": 0.5738, + "step": 7690 + }, + { + "epoch": 0.9692351292628397, + "grad_norm": 0.35151737928390503, + "learning_rate": 0.0002567902704099266, + "loss": 0.562, + "step": 7695 + }, + { + "epoch": 0.969864911673017, + "grad_norm": 0.33582988381385803, + "learning_rate": 0.00025671304900438437, + "loss": 0.5724, + "step": 7700 + }, + { + "epoch": 0.9704946940831942, + "grad_norm": 0.4050043523311615, + "learning_rate": 0.00025663577029149727, + "loss": 0.6038, + "step": 7705 + }, + { + "epoch": 0.9711244764933715, + "grad_norm": 0.3320407271385193, + "learning_rate": 0.00025655843431276565, + "loss": 0.5725, + "step": 7710 + }, + { + "epoch": 0.9717542589035488, + "grad_norm": 0.33253729343414307, + "learning_rate": 0.00025648104110972074, + "loss": 0.559, + "step": 7715 + }, + { + "epoch": 0.9723840413137261, + "grad_norm": 0.3316608667373657, + "learning_rate": 0.0002564035907239245, + "loss": 0.5813, + "step": 7720 + }, + { + "epoch": 0.9730138237239034, + "grad_norm": 0.35272932052612305, + "learning_rate": 0.0002563260831969695, + "loss": 0.5544, + "step": 7725 + }, + { + "epoch": 0.9736436061340806, + "grad_norm": 0.2942962348461151, + "learning_rate": 0.00025624851857047914, + "loss": 0.5741, + "step": 7730 + }, + { + "epoch": 0.9742733885442579, + "grad_norm": 0.30799049139022827, + "learning_rate": 0.0002561708968861073, + "loss": 0.5604, + "step": 7735 + }, + { + "epoch": 0.9749031709544352, + "grad_norm": 0.2929095923900604, + "learning_rate": 0.00025609321818553864, + "loss": 0.5399, + "step": 7740 + }, + { + "epoch": 0.9755329533646125, + "grad_norm": 0.3074556291103363, + "learning_rate": 0.00025601548251048833, + "loss": 0.5714, + "step": 7745 + }, + { + "epoch": 0.9761627357747898, + "grad_norm": 0.3233494162559509, + "learning_rate": 0.0002559376899027024, + "loss": 0.5559, + "step": 7750 + }, + { + "epoch": 0.976792518184967, + "grad_norm": 0.3106531500816345, + "learning_rate": 0.000255859840403957, + "loss": 0.5462, + "step": 7755 + }, + { + "epoch": 0.9774223005951443, + "grad_norm": 0.35069772601127625, + "learning_rate": 0.00025578193405605923, + "loss": 0.5635, + "step": 7760 + }, + { + "epoch": 0.9780520830053216, + "grad_norm": 0.310811311006546, + "learning_rate": 0.00025570397090084656, + "loss": 0.5658, + "step": 7765 + }, + { + "epoch": 0.978681865415499, + "grad_norm": 0.36216944456100464, + "learning_rate": 0.000255625950980187, + "loss": 0.5785, + "step": 7770 + }, + { + "epoch": 0.9793116478256763, + "grad_norm": 0.30353617668151855, + "learning_rate": 0.000255547874335979, + "loss": 0.5347, + "step": 7775 + }, + { + "epoch": 0.9799414302358536, + "grad_norm": 0.3112618029117584, + "learning_rate": 0.00025546974101015154, + "loss": 0.5559, + "step": 7780 + }, + { + "epoch": 0.9805712126460308, + "grad_norm": 0.3782903552055359, + "learning_rate": 0.00025539155104466394, + "loss": 0.5717, + "step": 7785 + }, + { + "epoch": 0.9812009950562081, + "grad_norm": 0.3308548331260681, + "learning_rate": 0.000255313304481506, + "loss": 0.5511, + "step": 7790 + }, + { + "epoch": 0.9818307774663854, + "grad_norm": 0.2971625328063965, + "learning_rate": 0.000255235001362698, + "loss": 0.5411, + "step": 7795 + }, + { + "epoch": 0.9824605598765627, + "grad_norm": 0.3594948351383209, + "learning_rate": 0.0002551566417302904, + "loss": 0.5817, + "step": 7800 + }, + { + "epoch": 0.98309034228674, + "grad_norm": 0.3537582755088806, + "learning_rate": 0.0002550782256263642, + "loss": 0.5631, + "step": 7805 + }, + { + "epoch": 0.9837201246969172, + "grad_norm": 0.3132795989513397, + "learning_rate": 0.0002549997530930306, + "loss": 0.5725, + "step": 7810 + }, + { + "epoch": 0.9843499071070945, + "grad_norm": 0.3250652551651001, + "learning_rate": 0.00025492122417243113, + "loss": 0.5786, + "step": 7815 + }, + { + "epoch": 0.9849796895172718, + "grad_norm": 0.3318973183631897, + "learning_rate": 0.0002548426389067376, + "loss": 0.5399, + "step": 7820 + }, + { + "epoch": 0.9856094719274491, + "grad_norm": 0.3335192799568176, + "learning_rate": 0.00025476399733815214, + "loss": 0.5693, + "step": 7825 + }, + { + "epoch": 0.9862392543376264, + "grad_norm": 0.31399449706077576, + "learning_rate": 0.00025468529950890703, + "loss": 0.5821, + "step": 7830 + }, + { + "epoch": 0.9868690367478037, + "grad_norm": 0.33886855840682983, + "learning_rate": 0.00025460654546126485, + "loss": 0.556, + "step": 7835 + }, + { + "epoch": 0.9874988191579809, + "grad_norm": 0.3620472848415375, + "learning_rate": 0.0002545277352375183, + "loss": 0.6104, + "step": 7840 + }, + { + "epoch": 0.9881286015681582, + "grad_norm": 0.31123921275138855, + "learning_rate": 0.0002544488688799902, + "loss": 0.5802, + "step": 7845 + }, + { + "epoch": 0.9887583839783355, + "grad_norm": 0.33104339241981506, + "learning_rate": 0.0002543699464310337, + "loss": 0.5882, + "step": 7850 + }, + { + "epoch": 0.9893881663885128, + "grad_norm": 0.3223660886287689, + "learning_rate": 0.00025429096793303186, + "loss": 0.5649, + "step": 7855 + }, + { + "epoch": 0.9900179487986901, + "grad_norm": 0.3436056077480316, + "learning_rate": 0.000254211933428398, + "loss": 0.5546, + "step": 7860 + }, + { + "epoch": 0.9906477312088673, + "grad_norm": 0.29697200655937195, + "learning_rate": 0.00025413284295957547, + "loss": 0.5434, + "step": 7865 + }, + { + "epoch": 0.9912775136190446, + "grad_norm": 0.32985180616378784, + "learning_rate": 0.0002540536965690376, + "loss": 0.5737, + "step": 7870 + }, + { + "epoch": 0.9919072960292219, + "grad_norm": 0.31599709391593933, + "learning_rate": 0.0002539744942992878, + "loss": 0.5452, + "step": 7875 + }, + { + "epoch": 0.9925370784393992, + "grad_norm": 0.30331170558929443, + "learning_rate": 0.00025389523619285956, + "loss": 0.5593, + "step": 7880 + }, + { + "epoch": 0.9931668608495765, + "grad_norm": 0.3150465786457062, + "learning_rate": 0.0002538159222923163, + "loss": 0.5518, + "step": 7885 + }, + { + "epoch": 0.9937966432597538, + "grad_norm": 0.3179359436035156, + "learning_rate": 0.00025373655264025134, + "loss": 0.5546, + "step": 7890 + }, + { + "epoch": 0.994426425669931, + "grad_norm": 0.3226470947265625, + "learning_rate": 0.000253657127279288, + "loss": 0.58, + "step": 7895 + }, + { + "epoch": 0.9950562080801083, + "grad_norm": 0.3453287184238434, + "learning_rate": 0.0002535776462520795, + "loss": 0.5681, + "step": 7900 + }, + { + "epoch": 0.9956859904902856, + "grad_norm": 0.3329002261161804, + "learning_rate": 0.0002534981096013091, + "loss": 0.548, + "step": 7905 + }, + { + "epoch": 0.9963157729004629, + "grad_norm": 0.32592061161994934, + "learning_rate": 0.00025341851736968956, + "loss": 0.5244, + "step": 7910 + }, + { + "epoch": 0.9969455553106402, + "grad_norm": 0.32833319902420044, + "learning_rate": 0.00025333886959996396, + "loss": 0.5558, + "step": 7915 + }, + { + "epoch": 0.9975753377208174, + "grad_norm": 0.3146878182888031, + "learning_rate": 0.00025325916633490487, + "loss": 0.595, + "step": 7920 + }, + { + "epoch": 0.9982051201309947, + "grad_norm": 0.3828830122947693, + "learning_rate": 0.00025317940761731476, + "loss": 0.5675, + "step": 7925 + }, + { + "epoch": 0.998834902541172, + "grad_norm": 0.3208398222923279, + "learning_rate": 0.0002530995934900259, + "loss": 0.5439, + "step": 7930 + }, + { + "epoch": 0.9994646849513493, + "grad_norm": 0.3446502983570099, + "learning_rate": 0.00025301972399590023, + "loss": 0.5276, + "step": 7935 + }, + { + "epoch": 1.0, + "grad_norm": 0.31275373697280884, + "learning_rate": 0.0002529397991778297, + "loss": 0.543, + "step": 7940 + }, + { + "epoch": 1.0006297824101773, + "grad_norm": 0.3219754099845886, + "learning_rate": 0.0002528598190787355, + "loss": 0.4901, + "step": 7945 + }, + { + "epoch": 1.0012595648203546, + "grad_norm": 0.33292412757873535, + "learning_rate": 0.0002527797837415689, + "loss": 0.4794, + "step": 7950 + }, + { + "epoch": 1.001889347230532, + "grad_norm": 0.36561062932014465, + "learning_rate": 0.00025269969320931065, + "loss": 0.4948, + "step": 7955 + }, + { + "epoch": 1.0025191296407092, + "grad_norm": 0.2977091372013092, + "learning_rate": 0.0002526195475249713, + "loss": 0.5172, + "step": 7960 + }, + { + "epoch": 1.0031489120508865, + "grad_norm": 0.3075500428676605, + "learning_rate": 0.00025253934673159084, + "loss": 0.4755, + "step": 7965 + }, + { + "epoch": 1.0037786944610636, + "grad_norm": 0.30956047773361206, + "learning_rate": 0.00025245909087223895, + "loss": 0.4783, + "step": 7970 + }, + { + "epoch": 1.004408476871241, + "grad_norm": 0.34965232014656067, + "learning_rate": 0.00025237877999001484, + "loss": 0.4876, + "step": 7975 + }, + { + "epoch": 1.0050382592814182, + "grad_norm": 0.3290039896965027, + "learning_rate": 0.00025229841412804726, + "loss": 0.501, + "step": 7980 + }, + { + "epoch": 1.0056680416915955, + "grad_norm": 0.3144761323928833, + "learning_rate": 0.00025221799332949456, + "loss": 0.4923, + "step": 7985 + }, + { + "epoch": 1.0062978241017728, + "grad_norm": 0.3586188554763794, + "learning_rate": 0.0002521375176375446, + "loss": 0.487, + "step": 7990 + }, + { + "epoch": 1.0069276065119501, + "grad_norm": 0.3210572302341461, + "learning_rate": 0.0002520569870954146, + "loss": 0.4916, + "step": 7995 + }, + { + "epoch": 1.0075573889221274, + "grad_norm": 0.3171830177307129, + "learning_rate": 0.0002519764017463512, + "loss": 0.4834, + "step": 8000 + }, + { + "epoch": 1.0075573889221274, + "eval_loss": 0.30723655223846436, + "eval_runtime": 6.2539, + "eval_samples_per_second": 159.899, + "eval_steps_per_second": 10.074, + "step": 8000 + }, + { + "epoch": 1.0081871713323047, + "grad_norm": 0.3511858880519867, + "learning_rate": 0.00025189576163363076, + "loss": 0.4937, + "step": 8005 + }, + { + "epoch": 1.008816953742482, + "grad_norm": 0.3305964171886444, + "learning_rate": 0.00025181506680055875, + "loss": 0.4665, + "step": 8010 + }, + { + "epoch": 1.0094467361526593, + "grad_norm": 0.3735099732875824, + "learning_rate": 0.00025173431729047014, + "loss": 0.5116, + "step": 8015 + }, + { + "epoch": 1.0100765185628366, + "grad_norm": 0.34169599413871765, + "learning_rate": 0.0002516535131467293, + "loss": 0.475, + "step": 8020 + }, + { + "epoch": 1.0107063009730137, + "grad_norm": 0.3473950922489166, + "learning_rate": 0.00025157265441272993, + "loss": 0.4812, + "step": 8025 + }, + { + "epoch": 1.011336083383191, + "grad_norm": 0.31877681612968445, + "learning_rate": 0.00025149174113189496, + "loss": 0.4906, + "step": 8030 + }, + { + "epoch": 1.0119658657933683, + "grad_norm": 0.364511638879776, + "learning_rate": 0.0002514107733476766, + "loss": 0.4926, + "step": 8035 + }, + { + "epoch": 1.0125956482035456, + "grad_norm": 0.3073696792125702, + "learning_rate": 0.00025132975110355664, + "loss": 0.4994, + "step": 8040 + }, + { + "epoch": 1.013225430613723, + "grad_norm": 0.3270637094974518, + "learning_rate": 0.0002512486744430456, + "loss": 0.468, + "step": 8045 + }, + { + "epoch": 1.0138552130239002, + "grad_norm": 0.3626968264579773, + "learning_rate": 0.0002511675434096837, + "loss": 0.5139, + "step": 8050 + }, + { + "epoch": 1.0144849954340776, + "grad_norm": 0.30527931451797485, + "learning_rate": 0.00025108635804704, + "loss": 0.4922, + "step": 8055 + }, + { + "epoch": 1.0151147778442549, + "grad_norm": 0.3518252968788147, + "learning_rate": 0.000251005118398713, + "loss": 0.5297, + "step": 8060 + }, + { + "epoch": 1.0157445602544322, + "grad_norm": 0.3298850655555725, + "learning_rate": 0.0002509238245083302, + "loss": 0.5292, + "step": 8065 + }, + { + "epoch": 1.0163743426646095, + "grad_norm": 0.3175168037414551, + "learning_rate": 0.0002508424764195484, + "loss": 0.4907, + "step": 8070 + }, + { + "epoch": 1.0170041250747868, + "grad_norm": 0.33489352464675903, + "learning_rate": 0.0002507610741760531, + "loss": 0.4869, + "step": 8075 + }, + { + "epoch": 1.0176339074849639, + "grad_norm": 0.2922315299510956, + "learning_rate": 0.0002506796178215595, + "loss": 0.474, + "step": 8080 + }, + { + "epoch": 1.0182636898951412, + "grad_norm": 0.32073619961738586, + "learning_rate": 0.00025059810739981125, + "loss": 0.4951, + "step": 8085 + }, + { + "epoch": 1.0188934723053185, + "grad_norm": 0.2875652611255646, + "learning_rate": 0.0002505165429545815, + "loss": 0.5104, + "step": 8090 + }, + { + "epoch": 1.0195232547154958, + "grad_norm": 0.33247148990631104, + "learning_rate": 0.0002504349245296721, + "loss": 0.489, + "step": 8095 + }, + { + "epoch": 1.020153037125673, + "grad_norm": 0.29777953028678894, + "learning_rate": 0.0002503532521689141, + "loss": 0.5172, + "step": 8100 + }, + { + "epoch": 1.0207828195358504, + "grad_norm": 0.3418375253677368, + "learning_rate": 0.0002502715259161673, + "loss": 0.4464, + "step": 8105 + }, + { + "epoch": 1.0214126019460277, + "grad_norm": 0.39162155985832214, + "learning_rate": 0.0002501897458153207, + "loss": 0.4953, + "step": 8110 + }, + { + "epoch": 1.022042384356205, + "grad_norm": 0.32206737995147705, + "learning_rate": 0.000250107911910292, + "loss": 0.4732, + "step": 8115 + }, + { + "epoch": 1.0226721667663823, + "grad_norm": 0.37178757786750793, + "learning_rate": 0.0002500260242450279, + "loss": 0.504, + "step": 8120 + }, + { + "epoch": 1.0233019491765596, + "grad_norm": 0.33448055386543274, + "learning_rate": 0.0002499440828635039, + "loss": 0.4774, + "step": 8125 + }, + { + "epoch": 1.0239317315867367, + "grad_norm": 0.344594806432724, + "learning_rate": 0.00024986208780972455, + "loss": 0.4948, + "step": 8130 + }, + { + "epoch": 1.024561513996914, + "grad_norm": 0.3440978527069092, + "learning_rate": 0.00024978003912772283, + "loss": 0.4979, + "step": 8135 + }, + { + "epoch": 1.0251912964070913, + "grad_norm": 0.2915257513523102, + "learning_rate": 0.000249697936861561, + "loss": 0.4875, + "step": 8140 + }, + { + "epoch": 1.0258210788172686, + "grad_norm": 0.271371990442276, + "learning_rate": 0.0002496157810553296, + "loss": 0.4929, + "step": 8145 + }, + { + "epoch": 1.0264508612274459, + "grad_norm": 0.3228522539138794, + "learning_rate": 0.0002495335717531484, + "loss": 0.4706, + "step": 8150 + }, + { + "epoch": 1.0270806436376232, + "grad_norm": 0.3222556412220001, + "learning_rate": 0.00024945130899916554, + "loss": 0.487, + "step": 8155 + }, + { + "epoch": 1.0277104260478005, + "grad_norm": 0.32311001420021057, + "learning_rate": 0.00024936899283755807, + "loss": 0.5144, + "step": 8160 + }, + { + "epoch": 1.0283402084579778, + "grad_norm": 0.2946212589740753, + "learning_rate": 0.0002492866233125316, + "loss": 0.4867, + "step": 8165 + }, + { + "epoch": 1.028969990868155, + "grad_norm": 0.32464465498924255, + "learning_rate": 0.0002492042004683205, + "loss": 0.4729, + "step": 8170 + }, + { + "epoch": 1.0295997732783324, + "grad_norm": 0.3378526270389557, + "learning_rate": 0.0002491217243491876, + "loss": 0.4843, + "step": 8175 + }, + { + "epoch": 1.0302295556885097, + "grad_norm": 0.35685908794403076, + "learning_rate": 0.0002490391949994246, + "loss": 0.4941, + "step": 8180 + }, + { + "epoch": 1.0308593380986868, + "grad_norm": 0.30618053674697876, + "learning_rate": 0.0002489566124633516, + "loss": 0.4985, + "step": 8185 + }, + { + "epoch": 1.031489120508864, + "grad_norm": 0.34786808490753174, + "learning_rate": 0.0002488739767853173, + "loss": 0.4914, + "step": 8190 + }, + { + "epoch": 1.0321189029190414, + "grad_norm": 0.35167476534843445, + "learning_rate": 0.00024879128800969893, + "loss": 0.493, + "step": 8195 + }, + { + "epoch": 1.0327486853292187, + "grad_norm": 0.3278263509273529, + "learning_rate": 0.00024870854618090225, + "loss": 0.4676, + "step": 8200 + }, + { + "epoch": 1.033378467739396, + "grad_norm": 0.36896881461143494, + "learning_rate": 0.00024862575134336154, + "loss": 0.4995, + "step": 8205 + }, + { + "epoch": 1.0340082501495733, + "grad_norm": 0.3700760304927826, + "learning_rate": 0.00024854290354153953, + "loss": 0.5189, + "step": 8210 + }, + { + "epoch": 1.0346380325597506, + "grad_norm": 0.3370974063873291, + "learning_rate": 0.00024846000281992733, + "loss": 0.5044, + "step": 8215 + }, + { + "epoch": 1.035267814969928, + "grad_norm": 0.3200768232345581, + "learning_rate": 0.00024837704922304457, + "loss": 0.4779, + "step": 8220 + }, + { + "epoch": 1.0358975973801052, + "grad_norm": 0.2786978781223297, + "learning_rate": 0.0002482940427954392, + "loss": 0.4677, + "step": 8225 + }, + { + "epoch": 1.0365273797902825, + "grad_norm": 0.3220120668411255, + "learning_rate": 0.00024821098358168757, + "loss": 0.4503, + "step": 8230 + }, + { + "epoch": 1.0371571622004598, + "grad_norm": 0.3315715491771698, + "learning_rate": 0.00024812787162639444, + "loss": 0.4715, + "step": 8235 + }, + { + "epoch": 1.037786944610637, + "grad_norm": 0.3595867455005646, + "learning_rate": 0.00024804470697419273, + "loss": 0.4712, + "step": 8240 + }, + { + "epoch": 1.0384167270208142, + "grad_norm": 0.29993361234664917, + "learning_rate": 0.00024796148966974376, + "loss": 0.47, + "step": 8245 + }, + { + "epoch": 1.0390465094309915, + "grad_norm": 0.39950379729270935, + "learning_rate": 0.00024787821975773717, + "loss": 0.5233, + "step": 8250 + }, + { + "epoch": 1.0396762918411688, + "grad_norm": 0.312003493309021, + "learning_rate": 0.0002477948972828908, + "loss": 0.4836, + "step": 8255 + }, + { + "epoch": 1.0403060742513461, + "grad_norm": 0.29678481817245483, + "learning_rate": 0.0002477115222899507, + "loss": 0.4928, + "step": 8260 + }, + { + "epoch": 1.0409358566615234, + "grad_norm": 0.35694456100463867, + "learning_rate": 0.0002476280948236912, + "loss": 0.4925, + "step": 8265 + }, + { + "epoch": 1.0415656390717007, + "grad_norm": 0.3164297640323639, + "learning_rate": 0.00024754461492891474, + "loss": 0.4828, + "step": 8270 + }, + { + "epoch": 1.042195421481878, + "grad_norm": 0.37906938791275024, + "learning_rate": 0.00024746108265045184, + "loss": 0.4989, + "step": 8275 + }, + { + "epoch": 1.0428252038920554, + "grad_norm": 0.3458475172519684, + "learning_rate": 0.0002473774980331614, + "loss": 0.5072, + "step": 8280 + }, + { + "epoch": 1.0434549863022327, + "grad_norm": 0.36052700877189636, + "learning_rate": 0.0002472938611219301, + "loss": 0.4872, + "step": 8285 + }, + { + "epoch": 1.04408476871241, + "grad_norm": 0.4497036337852478, + "learning_rate": 0.00024721017196167297, + "loss": 0.4921, + "step": 8290 + }, + { + "epoch": 1.044714551122587, + "grad_norm": 0.357461154460907, + "learning_rate": 0.000247126430597333, + "loss": 0.5035, + "step": 8295 + }, + { + "epoch": 1.0453443335327643, + "grad_norm": 0.3499346375465393, + "learning_rate": 0.00024704263707388117, + "loss": 0.5242, + "step": 8300 + }, + { + "epoch": 1.0459741159429417, + "grad_norm": 0.2994784414768219, + "learning_rate": 0.0002469587914363166, + "loss": 0.4575, + "step": 8305 + }, + { + "epoch": 1.046603898353119, + "grad_norm": 0.3699876666069031, + "learning_rate": 0.0002468748937296662, + "loss": 0.4804, + "step": 8310 + }, + { + "epoch": 1.0472336807632963, + "grad_norm": 0.37695133686065674, + "learning_rate": 0.000246790943998985, + "loss": 0.4914, + "step": 8315 + }, + { + "epoch": 1.0478634631734736, + "grad_norm": 0.30732589960098267, + "learning_rate": 0.0002467069422893559, + "loss": 0.458, + "step": 8320 + }, + { + "epoch": 1.0484932455836509, + "grad_norm": 0.3094361424446106, + "learning_rate": 0.0002466228886458899, + "loss": 0.4584, + "step": 8325 + }, + { + "epoch": 1.0491230279938282, + "grad_norm": 0.3499257564544678, + "learning_rate": 0.0002465387831137255, + "loss": 0.4717, + "step": 8330 + }, + { + "epoch": 1.0497528104040055, + "grad_norm": 0.32755059003829956, + "learning_rate": 0.0002464546257380294, + "loss": 0.49, + "step": 8335 + }, + { + "epoch": 1.0503825928141828, + "grad_norm": 0.3201046884059906, + "learning_rate": 0.000246370416563996, + "loss": 0.4833, + "step": 8340 + }, + { + "epoch": 1.05101237522436, + "grad_norm": 0.2581581771373749, + "learning_rate": 0.0002462861556368476, + "loss": 0.465, + "step": 8345 + }, + { + "epoch": 1.0516421576345372, + "grad_norm": 0.3480297923088074, + "learning_rate": 0.00024620184300183423, + "loss": 0.4756, + "step": 8350 + }, + { + "epoch": 1.0522719400447145, + "grad_norm": 0.36630478501319885, + "learning_rate": 0.00024611747870423366, + "loss": 0.5051, + "step": 8355 + }, + { + "epoch": 1.0529017224548918, + "grad_norm": 0.3450157940387726, + "learning_rate": 0.0002460330627893515, + "loss": 0.4996, + "step": 8360 + }, + { + "epoch": 1.053531504865069, + "grad_norm": 0.30790945887565613, + "learning_rate": 0.000245948595302521, + "loss": 0.4826, + "step": 8365 + }, + { + "epoch": 1.0541612872752464, + "grad_norm": 0.39590683579444885, + "learning_rate": 0.00024586407628910306, + "loss": 0.4963, + "step": 8370 + }, + { + "epoch": 1.0547910696854237, + "grad_norm": 0.3294634521007538, + "learning_rate": 0.00024577950579448643, + "loss": 0.4868, + "step": 8375 + }, + { + "epoch": 1.055420852095601, + "grad_norm": 0.33493947982788086, + "learning_rate": 0.00024569488386408736, + "loss": 0.4773, + "step": 8380 + }, + { + "epoch": 1.0560506345057783, + "grad_norm": 0.32626229524612427, + "learning_rate": 0.00024561021054334974, + "loss": 0.4898, + "step": 8385 + }, + { + "epoch": 1.0566804169159556, + "grad_norm": 0.3181340992450714, + "learning_rate": 0.00024552548587774507, + "loss": 0.4757, + "step": 8390 + }, + { + "epoch": 1.057310199326133, + "grad_norm": 0.3592873215675354, + "learning_rate": 0.0002454407099127725, + "loss": 0.5034, + "step": 8395 + }, + { + "epoch": 1.0579399817363102, + "grad_norm": 0.3184007406234741, + "learning_rate": 0.00024535588269395856, + "loss": 0.4929, + "step": 8400 + }, + { + "epoch": 1.0585697641464873, + "grad_norm": 0.3555738627910614, + "learning_rate": 0.00024527100426685746, + "loss": 0.503, + "step": 8405 + }, + { + "epoch": 1.0591995465566646, + "grad_norm": 0.29637908935546875, + "learning_rate": 0.0002451860746770509, + "loss": 0.4716, + "step": 8410 + }, + { + "epoch": 1.059829328966842, + "grad_norm": 0.3031441569328308, + "learning_rate": 0.0002451010939701479, + "loss": 0.4757, + "step": 8415 + }, + { + "epoch": 1.0604591113770192, + "grad_norm": 0.28256094455718994, + "learning_rate": 0.0002450160621917851, + "loss": 0.4558, + "step": 8420 + }, + { + "epoch": 1.0610888937871965, + "grad_norm": 0.3192931115627289, + "learning_rate": 0.0002449309793876266, + "loss": 0.499, + "step": 8425 + }, + { + "epoch": 1.0617186761973738, + "grad_norm": 0.2788430154323578, + "learning_rate": 0.00024484584560336363, + "loss": 0.4616, + "step": 8430 + }, + { + "epoch": 1.0623484586075511, + "grad_norm": 0.35733649134635925, + "learning_rate": 0.00024476066088471507, + "loss": 0.4926, + "step": 8435 + }, + { + "epoch": 1.0629782410177284, + "grad_norm": 0.3398718535900116, + "learning_rate": 0.00024467542527742707, + "loss": 0.4944, + "step": 8440 + }, + { + "epoch": 1.0636080234279057, + "grad_norm": 0.3338175117969513, + "learning_rate": 0.000244590138827273, + "loss": 0.5181, + "step": 8445 + }, + { + "epoch": 1.064237805838083, + "grad_norm": 0.33433952927589417, + "learning_rate": 0.00024450480158005384, + "loss": 0.4837, + "step": 8450 + }, + { + "epoch": 1.0648675882482603, + "grad_norm": 0.3656097948551178, + "learning_rate": 0.0002444194135815974, + "loss": 0.4639, + "step": 8455 + }, + { + "epoch": 1.0654973706584374, + "grad_norm": 0.31470635533332825, + "learning_rate": 0.0002443339748777592, + "loss": 0.4718, + "step": 8460 + }, + { + "epoch": 1.0661271530686147, + "grad_norm": 0.29020166397094727, + "learning_rate": 0.00024424848551442166, + "loss": 0.4712, + "step": 8465 + }, + { + "epoch": 1.066756935478792, + "grad_norm": 0.34259042143821716, + "learning_rate": 0.00024416294553749446, + "loss": 0.5252, + "step": 8470 + }, + { + "epoch": 1.0673867178889693, + "grad_norm": 0.33828607201576233, + "learning_rate": 0.0002440773549929146, + "loss": 0.4663, + "step": 8475 + }, + { + "epoch": 1.0680165002991466, + "grad_norm": 0.35722973942756653, + "learning_rate": 0.00024399171392664622, + "loss": 0.4868, + "step": 8480 + }, + { + "epoch": 1.068646282709324, + "grad_norm": 0.3226557970046997, + "learning_rate": 0.00024390602238468043, + "loss": 0.4785, + "step": 8485 + }, + { + "epoch": 1.0692760651195012, + "grad_norm": 0.3097434639930725, + "learning_rate": 0.0002438202804130356, + "loss": 0.4677, + "step": 8490 + }, + { + "epoch": 1.0699058475296785, + "grad_norm": 0.3146856725215912, + "learning_rate": 0.00024373448805775709, + "loss": 0.4802, + "step": 8495 + }, + { + "epoch": 1.0705356299398558, + "grad_norm": 0.3576582372188568, + "learning_rate": 0.00024364864536491739, + "loss": 0.5113, + "step": 8500 + }, + { + "epoch": 1.0711654123500332, + "grad_norm": 0.33004313707351685, + "learning_rate": 0.0002435627523806159, + "loss": 0.4625, + "step": 8505 + }, + { + "epoch": 1.0717951947602105, + "grad_norm": 0.3689037263393402, + "learning_rate": 0.00024347680915097928, + "loss": 0.4923, + "step": 8510 + }, + { + "epoch": 1.0724249771703875, + "grad_norm": 0.28334125876426697, + "learning_rate": 0.00024339081572216084, + "loss": 0.4818, + "step": 8515 + }, + { + "epoch": 1.0730547595805648, + "grad_norm": 0.3461993336677551, + "learning_rate": 0.00024330477214034113, + "loss": 0.4807, + "step": 8520 + }, + { + "epoch": 1.0736845419907421, + "grad_norm": 0.32148951292037964, + "learning_rate": 0.00024321867845172743, + "loss": 0.4829, + "step": 8525 + }, + { + "epoch": 1.0743143244009195, + "grad_norm": 0.31461793184280396, + "learning_rate": 0.0002431325347025541, + "loss": 0.5045, + "step": 8530 + }, + { + "epoch": 1.0749441068110968, + "grad_norm": 0.30194273591041565, + "learning_rate": 0.00024304634093908224, + "loss": 0.4747, + "step": 8535 + }, + { + "epoch": 1.075573889221274, + "grad_norm": 0.27379968762397766, + "learning_rate": 0.0002429600972075999, + "loss": 0.4382, + "step": 8540 + }, + { + "epoch": 1.0762036716314514, + "grad_norm": 0.3732368052005768, + "learning_rate": 0.0002428738035544219, + "loss": 0.4704, + "step": 8545 + }, + { + "epoch": 1.0768334540416287, + "grad_norm": 0.3252260088920593, + "learning_rate": 0.00024278746002588997, + "loss": 0.4929, + "step": 8550 + }, + { + "epoch": 1.077463236451806, + "grad_norm": 0.31606802344322205, + "learning_rate": 0.00024270106666837246, + "loss": 0.4698, + "step": 8555 + }, + { + "epoch": 1.0780930188619833, + "grad_norm": 0.320529967546463, + "learning_rate": 0.00024261462352826468, + "loss": 0.4531, + "step": 8560 + }, + { + "epoch": 1.0787228012721606, + "grad_norm": 0.36827871203422546, + "learning_rate": 0.00024252813065198852, + "loss": 0.4948, + "step": 8565 + }, + { + "epoch": 1.0793525836823377, + "grad_norm": 0.3132867217063904, + "learning_rate": 0.00024244158808599264, + "loss": 0.4836, + "step": 8570 + }, + { + "epoch": 1.079982366092515, + "grad_norm": 0.32383888959884644, + "learning_rate": 0.00024235499587675236, + "loss": 0.4749, + "step": 8575 + }, + { + "epoch": 1.0806121485026923, + "grad_norm": 0.32294297218322754, + "learning_rate": 0.0002422683540707697, + "loss": 0.4616, + "step": 8580 + }, + { + "epoch": 1.0812419309128696, + "grad_norm": 0.3049245774745941, + "learning_rate": 0.00024218166271457322, + "loss": 0.4871, + "step": 8585 + }, + { + "epoch": 1.0818717133230469, + "grad_norm": 0.3330252170562744, + "learning_rate": 0.00024209492185471826, + "loss": 0.4908, + "step": 8590 + }, + { + "epoch": 1.0825014957332242, + "grad_norm": 0.35933157801628113, + "learning_rate": 0.00024200813153778654, + "loss": 0.4936, + "step": 8595 + }, + { + "epoch": 1.0831312781434015, + "grad_norm": 0.345434308052063, + "learning_rate": 0.00024192129181038654, + "loss": 0.4637, + "step": 8600 + }, + { + "epoch": 1.0837610605535788, + "grad_norm": 0.3012515604496002, + "learning_rate": 0.0002418344027191531, + "loss": 0.4719, + "step": 8605 + }, + { + "epoch": 1.084390842963756, + "grad_norm": 0.3081362247467041, + "learning_rate": 0.0002417474643107477, + "loss": 0.4852, + "step": 8610 + }, + { + "epoch": 1.0850206253739334, + "grad_norm": 0.367389053106308, + "learning_rate": 0.00024166047663185826, + "loss": 0.5046, + "step": 8615 + }, + { + "epoch": 1.0856504077841107, + "grad_norm": 0.3392958641052246, + "learning_rate": 0.0002415734397291991, + "loss": 0.5087, + "step": 8620 + }, + { + "epoch": 1.0862801901942878, + "grad_norm": 0.2843685746192932, + "learning_rate": 0.000241486353649511, + "loss": 0.4722, + "step": 8625 + }, + { + "epoch": 1.086909972604465, + "grad_norm": 0.29619672894477844, + "learning_rate": 0.00024139921843956128, + "loss": 0.4645, + "step": 8630 + }, + { + "epoch": 1.0875397550146424, + "grad_norm": 0.30029621720314026, + "learning_rate": 0.00024131203414614347, + "loss": 0.4434, + "step": 8635 + }, + { + "epoch": 1.0881695374248197, + "grad_norm": 0.3630850911140442, + "learning_rate": 0.00024122480081607755, + "loss": 0.4772, + "step": 8640 + }, + { + "epoch": 1.088799319834997, + "grad_norm": 0.32482001185417175, + "learning_rate": 0.00024113751849620974, + "loss": 0.4441, + "step": 8645 + }, + { + "epoch": 1.0894291022451743, + "grad_norm": 0.3149590492248535, + "learning_rate": 0.00024105018723341275, + "loss": 0.468, + "step": 8650 + }, + { + "epoch": 1.0900588846553516, + "grad_norm": 0.34652113914489746, + "learning_rate": 0.0002409628070745854, + "loss": 0.4706, + "step": 8655 + }, + { + "epoch": 1.090688667065529, + "grad_norm": 0.31633374094963074, + "learning_rate": 0.00024087537806665279, + "loss": 0.4693, + "step": 8660 + }, + { + "epoch": 1.0913184494757062, + "grad_norm": 0.31668806076049805, + "learning_rate": 0.00024078790025656638, + "loss": 0.4619, + "step": 8665 + }, + { + "epoch": 1.0919482318858835, + "grad_norm": 0.3093356490135193, + "learning_rate": 0.00024070037369130375, + "loss": 0.485, + "step": 8670 + }, + { + "epoch": 1.0925780142960608, + "grad_norm": 0.31765609979629517, + "learning_rate": 0.0002406127984178686, + "loss": 0.4696, + "step": 8675 + }, + { + "epoch": 1.093207796706238, + "grad_norm": 0.35910454392433167, + "learning_rate": 0.00024052517448329086, + "loss": 0.4781, + "step": 8680 + }, + { + "epoch": 1.0938375791164152, + "grad_norm": 0.37290528416633606, + "learning_rate": 0.00024043750193462665, + "loss": 0.4824, + "step": 8685 + }, + { + "epoch": 1.0944673615265925, + "grad_norm": 0.3106020390987396, + "learning_rate": 0.00024034978081895807, + "loss": 0.4607, + "step": 8690 + }, + { + "epoch": 1.0950971439367698, + "grad_norm": 0.3306252062320709, + "learning_rate": 0.0002402620111833934, + "loss": 0.4725, + "step": 8695 + }, + { + "epoch": 1.0957269263469471, + "grad_norm": 0.2956124544143677, + "learning_rate": 0.00024017419307506687, + "loss": 0.4784, + "step": 8700 + }, + { + "epoch": 1.0963567087571244, + "grad_norm": 0.3285719156265259, + "learning_rate": 0.00024008632654113894, + "loss": 0.4856, + "step": 8705 + }, + { + "epoch": 1.0969864911673017, + "grad_norm": 0.3430241644382477, + "learning_rate": 0.00023999841162879583, + "loss": 0.5017, + "step": 8710 + }, + { + "epoch": 1.097616273577479, + "grad_norm": 0.33543142676353455, + "learning_rate": 0.00023991044838524985, + "loss": 0.516, + "step": 8715 + }, + { + "epoch": 1.0982460559876563, + "grad_norm": 0.28755661845207214, + "learning_rate": 0.0002398224368577394, + "loss": 0.4645, + "step": 8720 + }, + { + "epoch": 1.0988758383978336, + "grad_norm": 0.34112608432769775, + "learning_rate": 0.00023973437709352851, + "loss": 0.5134, + "step": 8725 + }, + { + "epoch": 1.0995056208080107, + "grad_norm": 0.3198321759700775, + "learning_rate": 0.00023964626913990743, + "loss": 0.4939, + "step": 8730 + }, + { + "epoch": 1.100135403218188, + "grad_norm": 0.2985571026802063, + "learning_rate": 0.00023955811304419205, + "loss": 0.4817, + "step": 8735 + }, + { + "epoch": 1.1007651856283653, + "grad_norm": 0.32038047909736633, + "learning_rate": 0.0002394699088537243, + "loss": 0.524, + "step": 8740 + }, + { + "epoch": 1.1013949680385426, + "grad_norm": 0.3562256395816803, + "learning_rate": 0.00023938165661587175, + "loss": 0.4779, + "step": 8745 + }, + { + "epoch": 1.10202475044872, + "grad_norm": 0.3481481969356537, + "learning_rate": 0.00023929335637802788, + "loss": 0.4861, + "step": 8750 + }, + { + "epoch": 1.1026545328588973, + "grad_norm": 0.3087615966796875, + "learning_rate": 0.00023920500818761198, + "loss": 0.473, + "step": 8755 + }, + { + "epoch": 1.1032843152690746, + "grad_norm": 0.34575629234313965, + "learning_rate": 0.00023911661209206903, + "loss": 0.4709, + "step": 8760 + }, + { + "epoch": 1.1039140976792519, + "grad_norm": 0.3505946099758148, + "learning_rate": 0.0002390281681388697, + "loss": 0.4766, + "step": 8765 + }, + { + "epoch": 1.1045438800894292, + "grad_norm": 0.40102317929267883, + "learning_rate": 0.0002389396763755105, + "loss": 0.5048, + "step": 8770 + }, + { + "epoch": 1.1051736624996065, + "grad_norm": 0.3319726884365082, + "learning_rate": 0.0002388511368495135, + "loss": 0.4768, + "step": 8775 + }, + { + "epoch": 1.1058034449097838, + "grad_norm": 0.3191297948360443, + "learning_rate": 0.00023876254960842645, + "loss": 0.5009, + "step": 8780 + }, + { + "epoch": 1.1064332273199609, + "grad_norm": 0.3122735619544983, + "learning_rate": 0.00023867391469982268, + "loss": 0.4777, + "step": 8785 + }, + { + "epoch": 1.1070630097301382, + "grad_norm": 0.33340710401535034, + "learning_rate": 0.0002385852321713012, + "loss": 0.459, + "step": 8790 + }, + { + "epoch": 1.1076927921403155, + "grad_norm": 0.32803764939308167, + "learning_rate": 0.00023849650207048655, + "loss": 0.4784, + "step": 8795 + }, + { + "epoch": 1.1083225745504928, + "grad_norm": 0.35463786125183105, + "learning_rate": 0.00023840772444502878, + "loss": 0.4739, + "step": 8800 + }, + { + "epoch": 1.10895235696067, + "grad_norm": 0.3237099349498749, + "learning_rate": 0.00023831889934260357, + "loss": 0.4652, + "step": 8805 + }, + { + "epoch": 1.1095821393708474, + "grad_norm": 0.34681713581085205, + "learning_rate": 0.000238230026810912, + "loss": 0.4872, + "step": 8810 + }, + { + "epoch": 1.1102119217810247, + "grad_norm": 0.3360891342163086, + "learning_rate": 0.00023814110689768066, + "loss": 0.496, + "step": 8815 + }, + { + "epoch": 1.110841704191202, + "grad_norm": 0.32971322536468506, + "learning_rate": 0.0002380521396506615, + "loss": 0.4468, + "step": 8820 + }, + { + "epoch": 1.1114714866013793, + "grad_norm": 0.3112764060497284, + "learning_rate": 0.00023796312511763205, + "loss": 0.4985, + "step": 8825 + }, + { + "epoch": 1.1121012690115566, + "grad_norm": 0.30539095401763916, + "learning_rate": 0.0002378740633463951, + "loss": 0.4835, + "step": 8830 + }, + { + "epoch": 1.1127310514217337, + "grad_norm": 0.274139940738678, + "learning_rate": 0.00023778495438477894, + "loss": 0.5014, + "step": 8835 + }, + { + "epoch": 1.113360833831911, + "grad_norm": 0.2877870500087738, + "learning_rate": 0.000237695798280637, + "loss": 0.4842, + "step": 8840 + }, + { + "epoch": 1.1139906162420883, + "grad_norm": 0.262893944978714, + "learning_rate": 0.00023760659508184823, + "loss": 0.4754, + "step": 8845 + }, + { + "epoch": 1.1146203986522656, + "grad_norm": 0.3255792260169983, + "learning_rate": 0.00023751734483631672, + "loss": 0.489, + "step": 8850 + }, + { + "epoch": 1.115250181062443, + "grad_norm": 0.3453415632247925, + "learning_rate": 0.00023742804759197195, + "loss": 0.4624, + "step": 8855 + }, + { + "epoch": 1.1158799634726202, + "grad_norm": 0.3276025354862213, + "learning_rate": 0.00023733870339676856, + "loss": 0.4629, + "step": 8860 + }, + { + "epoch": 1.1165097458827975, + "grad_norm": 0.32096150517463684, + "learning_rate": 0.0002372493122986864, + "loss": 0.4482, + "step": 8865 + }, + { + "epoch": 1.1171395282929748, + "grad_norm": 0.33016180992126465, + "learning_rate": 0.00023715987434573055, + "loss": 0.493, + "step": 8870 + }, + { + "epoch": 1.117769310703152, + "grad_norm": 0.2946653366088867, + "learning_rate": 0.00023707038958593126, + "loss": 0.4365, + "step": 8875 + }, + { + "epoch": 1.1183990931133294, + "grad_norm": 0.37148308753967285, + "learning_rate": 0.00023698085806734385, + "loss": 0.4974, + "step": 8880 + }, + { + "epoch": 1.1190288755235067, + "grad_norm": 0.3068748116493225, + "learning_rate": 0.00023689127983804882, + "loss": 0.4886, + "step": 8885 + }, + { + "epoch": 1.1196586579336838, + "grad_norm": 0.3096564710140228, + "learning_rate": 0.00023680165494615167, + "loss": 0.4592, + "step": 8890 + }, + { + "epoch": 1.120288440343861, + "grad_norm": 0.3341507613658905, + "learning_rate": 0.00023671198343978308, + "loss": 0.4258, + "step": 8895 + }, + { + "epoch": 1.1209182227540384, + "grad_norm": 0.30653128027915955, + "learning_rate": 0.00023662226536709868, + "loss": 0.486, + "step": 8900 + }, + { + "epoch": 1.1215480051642157, + "grad_norm": 0.30991849303245544, + "learning_rate": 0.00023653250077627908, + "loss": 0.4879, + "step": 8905 + }, + { + "epoch": 1.122177787574393, + "grad_norm": 0.3082162141799927, + "learning_rate": 0.00023644268971552998, + "loss": 0.4538, + "step": 8910 + }, + { + "epoch": 1.1228075699845703, + "grad_norm": 0.30248114466667175, + "learning_rate": 0.00023635283223308193, + "loss": 0.4501, + "step": 8915 + }, + { + "epoch": 1.1234373523947476, + "grad_norm": 0.34090158343315125, + "learning_rate": 0.00023626292837719047, + "loss": 0.4825, + "step": 8920 + }, + { + "epoch": 1.124067134804925, + "grad_norm": 0.28670960664749146, + "learning_rate": 0.00023617297819613598, + "loss": 0.4422, + "step": 8925 + }, + { + "epoch": 1.1246969172151022, + "grad_norm": 0.37079116702079773, + "learning_rate": 0.0002360829817382239, + "loss": 0.4725, + "step": 8930 + }, + { + "epoch": 1.1253266996252795, + "grad_norm": 0.35876086354255676, + "learning_rate": 0.00023599293905178417, + "loss": 0.4672, + "step": 8935 + }, + { + "epoch": 1.1259564820354568, + "grad_norm": 0.28581666946411133, + "learning_rate": 0.00023590285018517196, + "loss": 0.4597, + "step": 8940 + }, + { + "epoch": 1.126586264445634, + "grad_norm": 0.34076693654060364, + "learning_rate": 0.00023581271518676694, + "loss": 0.4894, + "step": 8945 + }, + { + "epoch": 1.1272160468558112, + "grad_norm": 0.29919254779815674, + "learning_rate": 0.0002357225341049737, + "loss": 0.4538, + "step": 8950 + }, + { + "epoch": 1.1278458292659885, + "grad_norm": 0.2799806594848633, + "learning_rate": 0.00023563230698822154, + "loss": 0.4814, + "step": 8955 + }, + { + "epoch": 1.1284756116761658, + "grad_norm": 0.3249780833721161, + "learning_rate": 0.00023554203388496446, + "loss": 0.4825, + "step": 8960 + }, + { + "epoch": 1.1291053940863431, + "grad_norm": 0.3509981036186218, + "learning_rate": 0.0002354517148436812, + "loss": 0.4468, + "step": 8965 + }, + { + "epoch": 1.1297351764965204, + "grad_norm": 0.33016157150268555, + "learning_rate": 0.0002353613499128752, + "loss": 0.449, + "step": 8970 + }, + { + "epoch": 1.1303649589066977, + "grad_norm": 0.2889571487903595, + "learning_rate": 0.00023527093914107436, + "loss": 0.4584, + "step": 8975 + }, + { + "epoch": 1.130994741316875, + "grad_norm": 0.31957536935806274, + "learning_rate": 0.00023518048257683145, + "loss": 0.4807, + "step": 8980 + }, + { + "epoch": 1.1316245237270524, + "grad_norm": 0.31418105959892273, + "learning_rate": 0.00023508998026872365, + "loss": 0.4755, + "step": 8985 + }, + { + "epoch": 1.1322543061372297, + "grad_norm": 0.3458874523639679, + "learning_rate": 0.00023499943226535278, + "loss": 0.4906, + "step": 8990 + }, + { + "epoch": 1.132884088547407, + "grad_norm": 0.3091862201690674, + "learning_rate": 0.0002349088386153452, + "loss": 0.4786, + "step": 8995 + }, + { + "epoch": 1.133513870957584, + "grad_norm": 0.2758231461048126, + "learning_rate": 0.00023481819936735178, + "loss": 0.4189, + "step": 9000 + }, + { + "epoch": 1.133513870957584, + "eval_loss": 0.3038506805896759, + "eval_runtime": 6.258, + "eval_samples_per_second": 159.795, + "eval_steps_per_second": 10.067, + "step": 9000 + }, + { + "epoch": 1.1341436533677614, + "grad_norm": 0.3153883218765259, + "learning_rate": 0.00023472751457004782, + "loss": 0.4802, + "step": 9005 + }, + { + "epoch": 1.1347734357779387, + "grad_norm": 0.3110881745815277, + "learning_rate": 0.00023463678427213317, + "loss": 0.4488, + "step": 9010 + }, + { + "epoch": 1.135403218188116, + "grad_norm": 0.30957111716270447, + "learning_rate": 0.00023454600852233206, + "loss": 0.476, + "step": 9015 + }, + { + "epoch": 1.1360330005982933, + "grad_norm": 0.3130200207233429, + "learning_rate": 0.00023445518736939312, + "loss": 0.4396, + "step": 9020 + }, + { + "epoch": 1.1366627830084706, + "grad_norm": 0.31500178575515747, + "learning_rate": 0.0002343643208620894, + "loss": 0.4644, + "step": 9025 + }, + { + "epoch": 1.1372925654186479, + "grad_norm": 0.3096972703933716, + "learning_rate": 0.00023427340904921834, + "loss": 0.4775, + "step": 9030 + }, + { + "epoch": 1.1379223478288252, + "grad_norm": 0.3503490388393402, + "learning_rate": 0.00023418245197960155, + "loss": 0.4617, + "step": 9035 + }, + { + "epoch": 1.1385521302390025, + "grad_norm": 0.31281721591949463, + "learning_rate": 0.00023409144970208516, + "loss": 0.4703, + "step": 9040 + }, + { + "epoch": 1.1391819126491798, + "grad_norm": 0.3011356592178345, + "learning_rate": 0.0002340004022655394, + "loss": 0.4472, + "step": 9045 + }, + { + "epoch": 1.139811695059357, + "grad_norm": 0.3240005075931549, + "learning_rate": 0.00023390930971885888, + "loss": 0.4726, + "step": 9050 + }, + { + "epoch": 1.1404414774695342, + "grad_norm": 0.35690784454345703, + "learning_rate": 0.0002338181721109623, + "loss": 0.4601, + "step": 9055 + }, + { + "epoch": 1.1410712598797115, + "grad_norm": 0.30888888239860535, + "learning_rate": 0.0002337269894907927, + "loss": 0.45, + "step": 9060 + }, + { + "epoch": 1.1417010422898888, + "grad_norm": 0.3118223249912262, + "learning_rate": 0.00023363576190731726, + "loss": 0.4456, + "step": 9065 + }, + { + "epoch": 1.142330824700066, + "grad_norm": 0.3156544864177704, + "learning_rate": 0.0002335444894095272, + "loss": 0.4744, + "step": 9070 + }, + { + "epoch": 1.1429606071102434, + "grad_norm": 0.33679795265197754, + "learning_rate": 0.00023345317204643797, + "loss": 0.4662, + "step": 9075 + }, + { + "epoch": 1.1435903895204207, + "grad_norm": 0.32647955417633057, + "learning_rate": 0.00023336180986708904, + "loss": 0.4573, + "step": 9080 + }, + { + "epoch": 1.144220171930598, + "grad_norm": 0.3759111762046814, + "learning_rate": 0.00023327040292054412, + "loss": 0.4439, + "step": 9085 + }, + { + "epoch": 1.1448499543407753, + "grad_norm": 0.31271886825561523, + "learning_rate": 0.00023317895125589066, + "loss": 0.4778, + "step": 9090 + }, + { + "epoch": 1.1454797367509526, + "grad_norm": 0.2915593385696411, + "learning_rate": 0.0002330874549222404, + "loss": 0.4646, + "step": 9095 + }, + { + "epoch": 1.14610951916113, + "grad_norm": 0.3337639570236206, + "learning_rate": 0.00023299591396872893, + "loss": 0.4597, + "step": 9100 + }, + { + "epoch": 1.1467393015713072, + "grad_norm": 0.3345816433429718, + "learning_rate": 0.0002329043284445158, + "loss": 0.519, + "step": 9105 + }, + { + "epoch": 1.1473690839814843, + "grad_norm": 0.31568819284439087, + "learning_rate": 0.0002328126983987846, + "loss": 0.455, + "step": 9110 + }, + { + "epoch": 1.1479988663916616, + "grad_norm": 0.3630363643169403, + "learning_rate": 0.00023272102388074265, + "loss": 0.4544, + "step": 9115 + }, + { + "epoch": 1.148628648801839, + "grad_norm": 0.30382248759269714, + "learning_rate": 0.00023262930493962142, + "loss": 0.485, + "step": 9120 + }, + { + "epoch": 1.1492584312120162, + "grad_norm": 0.30339518189430237, + "learning_rate": 0.0002325375416246759, + "loss": 0.474, + "step": 9125 + }, + { + "epoch": 1.1498882136221935, + "grad_norm": 0.33041009306907654, + "learning_rate": 0.00023244573398518523, + "loss": 0.447, + "step": 9130 + }, + { + "epoch": 1.1505179960323708, + "grad_norm": 0.35708925127983093, + "learning_rate": 0.00023235388207045214, + "loss": 0.4801, + "step": 9135 + }, + { + "epoch": 1.1511477784425481, + "grad_norm": 0.3497597575187683, + "learning_rate": 0.00023226198592980318, + "loss": 0.4753, + "step": 9140 + }, + { + "epoch": 1.1517775608527254, + "grad_norm": 0.31747546792030334, + "learning_rate": 0.00023217004561258876, + "loss": 0.4642, + "step": 9145 + }, + { + "epoch": 1.1524073432629027, + "grad_norm": 0.31225451827049255, + "learning_rate": 0.00023207806116818283, + "loss": 0.501, + "step": 9150 + }, + { + "epoch": 1.15303712567308, + "grad_norm": 0.31150931119918823, + "learning_rate": 0.00023198603264598327, + "loss": 0.447, + "step": 9155 + }, + { + "epoch": 1.1536669080832573, + "grad_norm": 0.29207199811935425, + "learning_rate": 0.00023189396009541135, + "loss": 0.448, + "step": 9160 + }, + { + "epoch": 1.1542966904934344, + "grad_norm": 0.33640962839126587, + "learning_rate": 0.00023180184356591223, + "loss": 0.4725, + "step": 9165 + }, + { + "epoch": 1.1549264729036117, + "grad_norm": 0.292582631111145, + "learning_rate": 0.00023170968310695457, + "loss": 0.4603, + "step": 9170 + }, + { + "epoch": 1.155556255313789, + "grad_norm": 0.3217863142490387, + "learning_rate": 0.00023161747876803066, + "loss": 0.4386, + "step": 9175 + }, + { + "epoch": 1.1561860377239663, + "grad_norm": 0.32607826590538025, + "learning_rate": 0.00023152523059865622, + "loss": 0.4747, + "step": 9180 + }, + { + "epoch": 1.1568158201341436, + "grad_norm": 0.35956209897994995, + "learning_rate": 0.00023143293864837078, + "loss": 0.4563, + "step": 9185 + }, + { + "epoch": 1.157445602544321, + "grad_norm": 0.3542852997779846, + "learning_rate": 0.00023134060296673716, + "loss": 0.4907, + "step": 9190 + }, + { + "epoch": 1.1580753849544982, + "grad_norm": 0.3324996829032898, + "learning_rate": 0.0002312482236033417, + "loss": 0.4539, + "step": 9195 + }, + { + "epoch": 1.1587051673646755, + "grad_norm": 0.3436378836631775, + "learning_rate": 0.00023115580060779429, + "loss": 0.5107, + "step": 9200 + }, + { + "epoch": 1.1593349497748529, + "grad_norm": 0.2886941730976105, + "learning_rate": 0.00023106333402972813, + "loss": 0.4547, + "step": 9205 + }, + { + "epoch": 1.1599647321850302, + "grad_norm": 0.30411913990974426, + "learning_rate": 0.00023097082391879993, + "loss": 0.4517, + "step": 9210 + }, + { + "epoch": 1.1605945145952075, + "grad_norm": 0.3265014886856079, + "learning_rate": 0.00023087827032468975, + "loss": 0.4589, + "step": 9215 + }, + { + "epoch": 1.1612242970053845, + "grad_norm": 0.2876526713371277, + "learning_rate": 0.00023078567329710091, + "loss": 0.466, + "step": 9220 + }, + { + "epoch": 1.1618540794155618, + "grad_norm": 0.29947248101234436, + "learning_rate": 0.0002306930328857602, + "loss": 0.4459, + "step": 9225 + }, + { + "epoch": 1.1624838618257392, + "grad_norm": 0.33246028423309326, + "learning_rate": 0.00023060034914041753, + "loss": 0.4826, + "step": 9230 + }, + { + "epoch": 1.1631136442359165, + "grad_norm": 0.29653674364089966, + "learning_rate": 0.0002305076221108463, + "loss": 0.4394, + "step": 9235 + }, + { + "epoch": 1.1637434266460938, + "grad_norm": 0.30506858229637146, + "learning_rate": 0.00023041485184684308, + "loss": 0.4645, + "step": 9240 + }, + { + "epoch": 1.164373209056271, + "grad_norm": 0.2603437304496765, + "learning_rate": 0.00023032203839822748, + "loss": 0.4536, + "step": 9245 + }, + { + "epoch": 1.1650029914664484, + "grad_norm": 0.3310236632823944, + "learning_rate": 0.00023022918181484254, + "loss": 0.4653, + "step": 9250 + }, + { + "epoch": 1.1656327738766257, + "grad_norm": 0.3645521104335785, + "learning_rate": 0.0002301362821465543, + "loss": 0.4404, + "step": 9255 + }, + { + "epoch": 1.166262556286803, + "grad_norm": 0.33431464433670044, + "learning_rate": 0.00023004333944325208, + "loss": 0.4389, + "step": 9260 + }, + { + "epoch": 1.1668923386969803, + "grad_norm": 0.31086647510528564, + "learning_rate": 0.00022995035375484817, + "loss": 0.493, + "step": 9265 + }, + { + "epoch": 1.1675221211071576, + "grad_norm": 0.34322085976600647, + "learning_rate": 0.00022985732513127805, + "loss": 0.4839, + "step": 9270 + }, + { + "epoch": 1.1681519035173347, + "grad_norm": 0.3111884593963623, + "learning_rate": 0.0002297642536225002, + "loss": 0.4473, + "step": 9275 + }, + { + "epoch": 1.168781685927512, + "grad_norm": 0.3494400084018707, + "learning_rate": 0.00022967113927849613, + "loss": 0.469, + "step": 9280 + }, + { + "epoch": 1.1694114683376893, + "grad_norm": 0.27351829409599304, + "learning_rate": 0.00022957798214927037, + "loss": 0.4617, + "step": 9285 + }, + { + "epoch": 1.1700412507478666, + "grad_norm": 0.3605945408344269, + "learning_rate": 0.00022948478228485046, + "loss": 0.444, + "step": 9290 + }, + { + "epoch": 1.1706710331580439, + "grad_norm": 0.31383225321769714, + "learning_rate": 0.0002293915397352869, + "loss": 0.4716, + "step": 9295 + }, + { + "epoch": 1.1713008155682212, + "grad_norm": 0.3261600732803345, + "learning_rate": 0.00022929825455065292, + "loss": 0.4646, + "step": 9300 + }, + { + "epoch": 1.1719305979783985, + "grad_norm": 0.29624396562576294, + "learning_rate": 0.00022920492678104492, + "loss": 0.4636, + "step": 9305 + }, + { + "epoch": 1.1725603803885758, + "grad_norm": 0.39078545570373535, + "learning_rate": 0.00022911155647658201, + "loss": 0.4933, + "step": 9310 + }, + { + "epoch": 1.173190162798753, + "grad_norm": 0.2990373373031616, + "learning_rate": 0.00022901814368740615, + "loss": 0.4726, + "step": 9315 + }, + { + "epoch": 1.1738199452089304, + "grad_norm": 0.28325891494750977, + "learning_rate": 0.00022892468846368217, + "loss": 0.4428, + "step": 9320 + }, + { + "epoch": 1.1744497276191077, + "grad_norm": 0.3357643187046051, + "learning_rate": 0.0002288311908555977, + "loss": 0.4618, + "step": 9325 + }, + { + "epoch": 1.1750795100292848, + "grad_norm": 0.31550613045692444, + "learning_rate": 0.00022873765091336302, + "loss": 0.4607, + "step": 9330 + }, + { + "epoch": 1.175709292439462, + "grad_norm": 0.30639806389808655, + "learning_rate": 0.00022864406868721118, + "loss": 0.458, + "step": 9335 + }, + { + "epoch": 1.1763390748496394, + "grad_norm": 0.3836449086666107, + "learning_rate": 0.0002285504442273981, + "loss": 0.4788, + "step": 9340 + }, + { + "epoch": 1.1769688572598167, + "grad_norm": 0.2955804467201233, + "learning_rate": 0.00022845677758420217, + "loss": 0.4636, + "step": 9345 + }, + { + "epoch": 1.177598639669994, + "grad_norm": 0.3264003098011017, + "learning_rate": 0.0002283630688079245, + "loss": 0.4769, + "step": 9350 + }, + { + "epoch": 1.1782284220801713, + "grad_norm": 0.34578555822372437, + "learning_rate": 0.00022826931794888894, + "loss": 0.4784, + "step": 9355 + }, + { + "epoch": 1.1788582044903486, + "grad_norm": 0.37039560079574585, + "learning_rate": 0.00022817552505744178, + "loss": 0.5042, + "step": 9360 + }, + { + "epoch": 1.179487986900526, + "grad_norm": 0.319118857383728, + "learning_rate": 0.00022808169018395192, + "loss": 0.4607, + "step": 9365 + }, + { + "epoch": 1.1801177693107032, + "grad_norm": 0.32380104064941406, + "learning_rate": 0.00022798781337881086, + "loss": 0.4606, + "step": 9370 + }, + { + "epoch": 1.1807475517208805, + "grad_norm": 0.3038274943828583, + "learning_rate": 0.00022789389469243256, + "loss": 0.448, + "step": 9375 + }, + { + "epoch": 1.1813773341310578, + "grad_norm": 0.3078247308731079, + "learning_rate": 0.00022779993417525356, + "loss": 0.4683, + "step": 9380 + }, + { + "epoch": 1.182007116541235, + "grad_norm": 0.2909676432609558, + "learning_rate": 0.00022770593187773275, + "loss": 0.4778, + "step": 9385 + }, + { + "epoch": 1.1826368989514122, + "grad_norm": 0.3095955550670624, + "learning_rate": 0.00022761188785035155, + "loss": 0.4523, + "step": 9390 + }, + { + "epoch": 1.1832666813615895, + "grad_norm": 0.2969966530799866, + "learning_rate": 0.0002275178021436137, + "loss": 0.4735, + "step": 9395 + }, + { + "epoch": 1.1838964637717668, + "grad_norm": 0.2896679937839508, + "learning_rate": 0.00022742367480804544, + "loss": 0.45, + "step": 9400 + }, + { + "epoch": 1.1845262461819441, + "grad_norm": 0.31511151790618896, + "learning_rate": 0.0002273295058941952, + "loss": 0.4614, + "step": 9405 + }, + { + "epoch": 1.1851560285921214, + "grad_norm": 0.3440285623073578, + "learning_rate": 0.00022723529545263399, + "loss": 0.4593, + "step": 9410 + }, + { + "epoch": 1.1857858110022987, + "grad_norm": 0.29399538040161133, + "learning_rate": 0.00022714104353395483, + "loss": 0.4519, + "step": 9415 + }, + { + "epoch": 1.186415593412476, + "grad_norm": 0.3958999812602997, + "learning_rate": 0.00022704675018877322, + "loss": 0.4838, + "step": 9420 + }, + { + "epoch": 1.1870453758226533, + "grad_norm": 0.2960554361343384, + "learning_rate": 0.0002269524154677268, + "loss": 0.459, + "step": 9425 + }, + { + "epoch": 1.1876751582328307, + "grad_norm": 0.32369253039360046, + "learning_rate": 0.00022685803942147555, + "loss": 0.4542, + "step": 9430 + }, + { + "epoch": 1.188304940643008, + "grad_norm": 0.320547491312027, + "learning_rate": 0.00022676362210070144, + "loss": 0.4853, + "step": 9435 + }, + { + "epoch": 1.188934723053185, + "grad_norm": 0.2984744906425476, + "learning_rate": 0.00022666916355610885, + "loss": 0.4201, + "step": 9440 + }, + { + "epoch": 1.1895645054633623, + "grad_norm": 0.34194597601890564, + "learning_rate": 0.00022657466383842407, + "loss": 0.4705, + "step": 9445 + }, + { + "epoch": 1.1901942878735396, + "grad_norm": 0.29718858003616333, + "learning_rate": 0.0002264801229983957, + "loss": 0.4403, + "step": 9450 + }, + { + "epoch": 1.190824070283717, + "grad_norm": 0.29723846912384033, + "learning_rate": 0.0002263855410867943, + "loss": 0.4841, + "step": 9455 + }, + { + "epoch": 1.1914538526938943, + "grad_norm": 0.31662440299987793, + "learning_rate": 0.00022629091815441245, + "loss": 0.456, + "step": 9460 + }, + { + "epoch": 1.1920836351040716, + "grad_norm": 0.3458605408668518, + "learning_rate": 0.0002261962542520649, + "loss": 0.4504, + "step": 9465 + }, + { + "epoch": 1.1927134175142489, + "grad_norm": 0.31829431653022766, + "learning_rate": 0.00022610154943058833, + "loss": 0.4821, + "step": 9470 + }, + { + "epoch": 1.1933431999244262, + "grad_norm": 0.3380287289619446, + "learning_rate": 0.00022600680374084138, + "loss": 0.4963, + "step": 9475 + }, + { + "epoch": 1.1939729823346035, + "grad_norm": 0.3048580288887024, + "learning_rate": 0.00022591201723370458, + "loss": 0.4443, + "step": 9480 + }, + { + "epoch": 1.1946027647447806, + "grad_norm": 0.34586548805236816, + "learning_rate": 0.0002258171899600806, + "loss": 0.473, + "step": 9485 + }, + { + "epoch": 1.195232547154958, + "grad_norm": 0.2828037440776825, + "learning_rate": 0.0002257223219708937, + "loss": 0.4539, + "step": 9490 + }, + { + "epoch": 1.1958623295651352, + "grad_norm": 0.31300345063209534, + "learning_rate": 0.00022562741331709024, + "loss": 0.4353, + "step": 9495 + }, + { + "epoch": 1.1964921119753125, + "grad_norm": 0.311260849237442, + "learning_rate": 0.0002255324640496383, + "loss": 0.4553, + "step": 9500 + }, + { + "epoch": 1.1971218943854898, + "grad_norm": 0.2941080331802368, + "learning_rate": 0.0002254374742195279, + "loss": 0.4464, + "step": 9505 + }, + { + "epoch": 1.197751676795667, + "grad_norm": 0.26669132709503174, + "learning_rate": 0.00022534244387777057, + "loss": 0.4368, + "step": 9510 + }, + { + "epoch": 1.1983814592058444, + "grad_norm": 0.2933709919452667, + "learning_rate": 0.00022524737307539995, + "loss": 0.4526, + "step": 9515 + }, + { + "epoch": 1.1990112416160217, + "grad_norm": 0.338360458612442, + "learning_rate": 0.0002251522618634711, + "loss": 0.4625, + "step": 9520 + }, + { + "epoch": 1.199641024026199, + "grad_norm": 0.31670835614204407, + "learning_rate": 0.00022505711029306098, + "loss": 0.4553, + "step": 9525 + }, + { + "epoch": 1.2002708064363763, + "grad_norm": 0.3221518099308014, + "learning_rate": 0.00022496191841526813, + "loss": 0.475, + "step": 9530 + }, + { + "epoch": 1.2009005888465536, + "grad_norm": 0.32984668016433716, + "learning_rate": 0.00022486668628121282, + "loss": 0.4526, + "step": 9535 + }, + { + "epoch": 1.2015303712567307, + "grad_norm": 0.2793140113353729, + "learning_rate": 0.00022477141394203678, + "loss": 0.4374, + "step": 9540 + }, + { + "epoch": 1.2021601536669082, + "grad_norm": 0.3125605881214142, + "learning_rate": 0.00022467610144890357, + "loss": 0.4569, + "step": 9545 + }, + { + "epoch": 1.2027899360770853, + "grad_norm": 0.2892754375934601, + "learning_rate": 0.00022458074885299808, + "loss": 0.4747, + "step": 9550 + }, + { + "epoch": 1.2034197184872626, + "grad_norm": 0.3224146068096161, + "learning_rate": 0.00022448535620552684, + "loss": 0.4372, + "step": 9555 + }, + { + "epoch": 1.20404950089744, + "grad_norm": 0.33973759412765503, + "learning_rate": 0.00022438992355771787, + "loss": 0.4368, + "step": 9560 + }, + { + "epoch": 1.2046792833076172, + "grad_norm": 0.37665504217147827, + "learning_rate": 0.00022429445096082073, + "loss": 0.4747, + "step": 9565 + }, + { + "epoch": 1.2053090657177945, + "grad_norm": 0.2834467589855194, + "learning_rate": 0.00022419893846610634, + "loss": 0.4841, + "step": 9570 + }, + { + "epoch": 1.2059388481279718, + "grad_norm": 0.3729229271411896, + "learning_rate": 0.00022410338612486715, + "loss": 0.475, + "step": 9575 + }, + { + "epoch": 1.2065686305381491, + "grad_norm": 0.30668923258781433, + "learning_rate": 0.00022400779398841684, + "loss": 0.4271, + "step": 9580 + }, + { + "epoch": 1.2071984129483264, + "grad_norm": 0.33016908168792725, + "learning_rate": 0.00022391216210809072, + "loss": 0.4553, + "step": 9585 + }, + { + "epoch": 1.2078281953585037, + "grad_norm": 0.30926114320755005, + "learning_rate": 0.00022381649053524518, + "loss": 0.4512, + "step": 9590 + }, + { + "epoch": 1.2084579777686808, + "grad_norm": 0.3481772840023041, + "learning_rate": 0.00022372077932125809, + "loss": 0.4707, + "step": 9595 + }, + { + "epoch": 1.2090877601788583, + "grad_norm": 0.2549537420272827, + "learning_rate": 0.0002236250285175285, + "loss": 0.4686, + "step": 9600 + }, + { + "epoch": 1.2097175425890354, + "grad_norm": 0.3111298978328705, + "learning_rate": 0.00022352923817547688, + "loss": 0.4535, + "step": 9605 + }, + { + "epoch": 1.2103473249992127, + "grad_norm": 0.29062095284461975, + "learning_rate": 0.00022343340834654472, + "loss": 0.4612, + "step": 9610 + }, + { + "epoch": 1.21097710740939, + "grad_norm": 0.3373335897922516, + "learning_rate": 0.0002233375390821949, + "loss": 0.4233, + "step": 9615 + }, + { + "epoch": 1.2116068898195673, + "grad_norm": 0.308648943901062, + "learning_rate": 0.0002232416304339114, + "loss": 0.4535, + "step": 9620 + }, + { + "epoch": 1.2122366722297446, + "grad_norm": 0.32941722869873047, + "learning_rate": 0.00022314568245319935, + "loss": 0.4564, + "step": 9625 + }, + { + "epoch": 1.212866454639922, + "grad_norm": 0.33229124546051025, + "learning_rate": 0.00022304969519158495, + "loss": 0.458, + "step": 9630 + }, + { + "epoch": 1.2134962370500992, + "grad_norm": 0.29093366861343384, + "learning_rate": 0.00022295366870061565, + "loss": 0.4315, + "step": 9635 + }, + { + "epoch": 1.2141260194602765, + "grad_norm": 0.3482106328010559, + "learning_rate": 0.00022285760303185982, + "loss": 0.4311, + "step": 9640 + }, + { + "epoch": 1.2147558018704538, + "grad_norm": 0.29717814922332764, + "learning_rate": 0.0002227614982369069, + "loss": 0.4261, + "step": 9645 + }, + { + "epoch": 1.215385584280631, + "grad_norm": 0.3359118700027466, + "learning_rate": 0.00022266535436736738, + "loss": 0.4698, + "step": 9650 + }, + { + "epoch": 1.2160153666908082, + "grad_norm": 0.3095514476299286, + "learning_rate": 0.0002225691714748727, + "loss": 0.4463, + "step": 9655 + }, + { + "epoch": 1.2166451491009855, + "grad_norm": 0.29095733165740967, + "learning_rate": 0.0002224729496110753, + "loss": 0.4662, + "step": 9660 + }, + { + "epoch": 1.2172749315111628, + "grad_norm": 0.34425532817840576, + "learning_rate": 0.00022237668882764847, + "loss": 0.4579, + "step": 9665 + }, + { + "epoch": 1.2179047139213401, + "grad_norm": 0.32856446504592896, + "learning_rate": 0.0002222803891762865, + "loss": 0.4648, + "step": 9670 + }, + { + "epoch": 1.2185344963315174, + "grad_norm": 0.35708895325660706, + "learning_rate": 0.00022218405070870451, + "loss": 0.4579, + "step": 9675 + }, + { + "epoch": 1.2191642787416948, + "grad_norm": 0.26759231090545654, + "learning_rate": 0.0002220876734766384, + "loss": 0.4321, + "step": 9680 + }, + { + "epoch": 1.219794061151872, + "grad_norm": 0.27995094656944275, + "learning_rate": 0.00022199125753184497, + "loss": 0.4552, + "step": 9685 + }, + { + "epoch": 1.2204238435620494, + "grad_norm": 0.3591984510421753, + "learning_rate": 0.00022189480292610187, + "loss": 0.4685, + "step": 9690 + }, + { + "epoch": 1.2210536259722267, + "grad_norm": 0.2892036736011505, + "learning_rate": 0.00022179830971120722, + "loss": 0.4609, + "step": 9695 + }, + { + "epoch": 1.221683408382404, + "grad_norm": 0.3287111520767212, + "learning_rate": 0.00022170177793898028, + "loss": 0.479, + "step": 9700 + }, + { + "epoch": 1.222313190792581, + "grad_norm": 0.3088148832321167, + "learning_rate": 0.00022160520766126074, + "loss": 0.4597, + "step": 9705 + }, + { + "epoch": 1.2229429732027584, + "grad_norm": 0.3263307511806488, + "learning_rate": 0.0002215085989299091, + "loss": 0.4801, + "step": 9710 + }, + { + "epoch": 1.2235727556129357, + "grad_norm": 0.283078134059906, + "learning_rate": 0.0002214119517968063, + "loss": 0.4476, + "step": 9715 + }, + { + "epoch": 1.224202538023113, + "grad_norm": 0.3226225674152374, + "learning_rate": 0.00022131526631385422, + "loss": 0.4644, + "step": 9720 + }, + { + "epoch": 1.2248323204332903, + "grad_norm": 0.32242435216903687, + "learning_rate": 0.00022121854253297514, + "loss": 0.4477, + "step": 9725 + }, + { + "epoch": 1.2254621028434676, + "grad_norm": 0.3373146057128906, + "learning_rate": 0.0002211217805061119, + "loss": 0.4541, + "step": 9730 + }, + { + "epoch": 1.2260918852536449, + "grad_norm": 0.28866246342658997, + "learning_rate": 0.00022102498028522786, + "loss": 0.4388, + "step": 9735 + }, + { + "epoch": 1.2267216676638222, + "grad_norm": 0.308704674243927, + "learning_rate": 0.00022092814192230711, + "loss": 0.425, + "step": 9740 + }, + { + "epoch": 1.2273514500739995, + "grad_norm": 0.3144040107727051, + "learning_rate": 0.00022083126546935394, + "loss": 0.4532, + "step": 9745 + }, + { + "epoch": 1.2279812324841768, + "grad_norm": 0.29848021268844604, + "learning_rate": 0.00022073435097839329, + "loss": 0.457, + "step": 9750 + }, + { + "epoch": 1.228611014894354, + "grad_norm": 0.35102754831314087, + "learning_rate": 0.00022063739850147036, + "loss": 0.4258, + "step": 9755 + }, + { + "epoch": 1.2292407973045312, + "grad_norm": 0.32105547189712524, + "learning_rate": 0.000220540408090651, + "loss": 0.4226, + "step": 9760 + }, + { + "epoch": 1.2298705797147085, + "grad_norm": 0.3647817075252533, + "learning_rate": 0.0002204433797980211, + "loss": 0.4556, + "step": 9765 + }, + { + "epoch": 1.2305003621248858, + "grad_norm": 0.3260333836078644, + "learning_rate": 0.00022034631367568718, + "loss": 0.4834, + "step": 9770 + }, + { + "epoch": 1.231130144535063, + "grad_norm": 0.30218422412872314, + "learning_rate": 0.00022024920977577596, + "loss": 0.4327, + "step": 9775 + }, + { + "epoch": 1.2317599269452404, + "grad_norm": 0.3666177988052368, + "learning_rate": 0.0002201520681504344, + "loss": 0.4361, + "step": 9780 + }, + { + "epoch": 1.2323897093554177, + "grad_norm": 0.3113807737827301, + "learning_rate": 0.00022005488885182975, + "loss": 0.4554, + "step": 9785 + }, + { + "epoch": 1.233019491765595, + "grad_norm": 0.31085875630378723, + "learning_rate": 0.00021995767193214963, + "loss": 0.4391, + "step": 9790 + }, + { + "epoch": 1.2336492741757723, + "grad_norm": 0.304509699344635, + "learning_rate": 0.0002198604174436017, + "loss": 0.4754, + "step": 9795 + }, + { + "epoch": 1.2342790565859496, + "grad_norm": 0.2930733263492584, + "learning_rate": 0.0002197631254384138, + "loss": 0.4194, + "step": 9800 + }, + { + "epoch": 1.234908838996127, + "grad_norm": 0.30277615785598755, + "learning_rate": 0.00021966579596883394, + "loss": 0.4506, + "step": 9805 + }, + { + "epoch": 1.2355386214063042, + "grad_norm": 0.2824211120605469, + "learning_rate": 0.00021956842908713037, + "loss": 0.4398, + "step": 9810 + }, + { + "epoch": 1.2361684038164813, + "grad_norm": 0.31834569573402405, + "learning_rate": 0.00021947102484559121, + "loss": 0.4756, + "step": 9815 + }, + { + "epoch": 1.2367981862266586, + "grad_norm": 0.355283260345459, + "learning_rate": 0.00021937358329652488, + "loss": 0.456, + "step": 9820 + }, + { + "epoch": 1.237427968636836, + "grad_norm": 0.2955317497253418, + "learning_rate": 0.00021927610449225962, + "loss": 0.4462, + "step": 9825 + }, + { + "epoch": 1.2380577510470132, + "grad_norm": 0.2653120756149292, + "learning_rate": 0.00021917858848514383, + "loss": 0.4197, + "step": 9830 + }, + { + "epoch": 1.2386875334571905, + "grad_norm": 0.3773416578769684, + "learning_rate": 0.0002190810353275458, + "loss": 0.4263, + "step": 9835 + }, + { + "epoch": 1.2393173158673678, + "grad_norm": 0.28635114431381226, + "learning_rate": 0.00021898344507185384, + "loss": 0.4705, + "step": 9840 + }, + { + "epoch": 1.2399470982775451, + "grad_norm": 0.3044835031032562, + "learning_rate": 0.00021888581777047608, + "loss": 0.4671, + "step": 9845 + }, + { + "epoch": 1.2405768806877224, + "grad_norm": 0.293748676776886, + "learning_rate": 0.0002187881534758407, + "loss": 0.436, + "step": 9850 + }, + { + "epoch": 1.2412066630978997, + "grad_norm": 0.3891184628009796, + "learning_rate": 0.00021869045224039564, + "loss": 0.456, + "step": 9855 + }, + { + "epoch": 1.241836445508077, + "grad_norm": 0.3140691816806793, + "learning_rate": 0.0002185927141166086, + "loss": 0.4402, + "step": 9860 + }, + { + "epoch": 1.2424662279182543, + "grad_norm": 0.33889827132225037, + "learning_rate": 0.00021849493915696738, + "loss": 0.4363, + "step": 9865 + }, + { + "epoch": 1.2430960103284314, + "grad_norm": 0.3084375858306885, + "learning_rate": 0.0002183971274139791, + "loss": 0.4295, + "step": 9870 + }, + { + "epoch": 1.2437257927386087, + "grad_norm": 0.3091178834438324, + "learning_rate": 0.00021829927894017115, + "loss": 0.4263, + "step": 9875 + }, + { + "epoch": 1.244355575148786, + "grad_norm": 0.3208729922771454, + "learning_rate": 0.00021820139378809025, + "loss": 0.4233, + "step": 9880 + }, + { + "epoch": 1.2449853575589633, + "grad_norm": 0.30196666717529297, + "learning_rate": 0.000218103472010303, + "loss": 0.4265, + "step": 9885 + }, + { + "epoch": 1.2456151399691406, + "grad_norm": 0.3044353127479553, + "learning_rate": 0.0002180055136593956, + "loss": 0.48, + "step": 9890 + }, + { + "epoch": 1.246244922379318, + "grad_norm": 0.31633850932121277, + "learning_rate": 0.000217907518787974, + "loss": 0.4708, + "step": 9895 + }, + { + "epoch": 1.2468747047894952, + "grad_norm": 0.29174062609672546, + "learning_rate": 0.0002178094874486636, + "loss": 0.4135, + "step": 9900 + }, + { + "epoch": 1.2475044871996726, + "grad_norm": 0.33092647790908813, + "learning_rate": 0.00021771141969410956, + "loss": 0.4541, + "step": 9905 + }, + { + "epoch": 1.2481342696098499, + "grad_norm": 0.30151379108428955, + "learning_rate": 0.00021761331557697635, + "loss": 0.4397, + "step": 9910 + }, + { + "epoch": 1.2487640520200272, + "grad_norm": 0.31203630566596985, + "learning_rate": 0.00021751517514994836, + "loss": 0.454, + "step": 9915 + }, + { + "epoch": 1.2493938344302045, + "grad_norm": 0.30847153067588806, + "learning_rate": 0.00021741699846572902, + "loss": 0.4309, + "step": 9920 + }, + { + "epoch": 1.2500236168403815, + "grad_norm": 0.2937026619911194, + "learning_rate": 0.00021731878557704158, + "loss": 0.4206, + "step": 9925 + }, + { + "epoch": 1.2506533992505589, + "grad_norm": 0.2875721752643585, + "learning_rate": 0.0002172205365366285, + "loss": 0.4385, + "step": 9930 + }, + { + "epoch": 1.2512831816607362, + "grad_norm": 0.2834903299808502, + "learning_rate": 0.00021712225139725188, + "loss": 0.423, + "step": 9935 + }, + { + "epoch": 1.2519129640709135, + "grad_norm": 0.3069617748260498, + "learning_rate": 0.000217023930211693, + "loss": 0.4536, + "step": 9940 + }, + { + "epoch": 1.2525427464810908, + "grad_norm": 0.32263246178627014, + "learning_rate": 0.0002169255730327526, + "loss": 0.4281, + "step": 9945 + }, + { + "epoch": 1.253172528891268, + "grad_norm": 0.2980237603187561, + "learning_rate": 0.00021682717991325075, + "loss": 0.4163, + "step": 9950 + }, + { + "epoch": 1.2538023113014454, + "grad_norm": 0.3552669584751129, + "learning_rate": 0.0002167287509060268, + "loss": 0.4378, + "step": 9955 + }, + { + "epoch": 1.2544320937116227, + "grad_norm": 0.3207598924636841, + "learning_rate": 0.00021663028606393932, + "loss": 0.4411, + "step": 9960 + }, + { + "epoch": 1.2550618761218, + "grad_norm": 0.3187711238861084, + "learning_rate": 0.0002165317854398663, + "loss": 0.4384, + "step": 9965 + }, + { + "epoch": 1.2556916585319773, + "grad_norm": 0.3156946897506714, + "learning_rate": 0.00021643324908670472, + "loss": 0.4227, + "step": 9970 + }, + { + "epoch": 1.2563214409421546, + "grad_norm": 0.3305997848510742, + "learning_rate": 0.00021633467705737085, + "loss": 0.4521, + "step": 9975 + }, + { + "epoch": 1.2569512233523317, + "grad_norm": 0.2964983880519867, + "learning_rate": 0.00021623606940480015, + "loss": 0.4373, + "step": 9980 + }, + { + "epoch": 1.257581005762509, + "grad_norm": 0.29807519912719727, + "learning_rate": 0.00021613742618194727, + "loss": 0.4591, + "step": 9985 + }, + { + "epoch": 1.2582107881726863, + "grad_norm": 0.29127413034439087, + "learning_rate": 0.00021603874744178576, + "loss": 0.43, + "step": 9990 + }, + { + "epoch": 1.2588405705828636, + "grad_norm": 0.339418888092041, + "learning_rate": 0.00021594003323730836, + "loss": 0.4407, + "step": 9995 + }, + { + "epoch": 1.2594703529930409, + "grad_norm": 0.3419913053512573, + "learning_rate": 0.0002158412836215269, + "loss": 0.4678, + "step": 10000 + }, + { + "epoch": 1.2594703529930409, + "eval_loss": 0.30844178795814514, + "eval_runtime": 6.157, + "eval_samples_per_second": 162.416, + "eval_steps_per_second": 10.232, + "step": 10000 + }, + { + "epoch": 1.2601001354032182, + "grad_norm": 0.3139461576938629, + "learning_rate": 0.00021574249864747216, + "loss": 0.4491, + "step": 10005 + }, + { + "epoch": 1.2607299178133955, + "grad_norm": 0.319892555475235, + "learning_rate": 0.00021564367836819393, + "loss": 0.4648, + "step": 10010 + }, + { + "epoch": 1.2613597002235728, + "grad_norm": 0.30732426047325134, + "learning_rate": 0.00021554482283676093, + "loss": 0.4113, + "step": 10015 + }, + { + "epoch": 1.26198948263375, + "grad_norm": 0.3234427571296692, + "learning_rate": 0.00021544593210626092, + "loss": 0.4461, + "step": 10020 + }, + { + "epoch": 1.2626192650439272, + "grad_norm": 0.3298225998878479, + "learning_rate": 0.00021534700622980038, + "loss": 0.4487, + "step": 10025 + }, + { + "epoch": 1.2632490474541047, + "grad_norm": 0.3394641578197479, + "learning_rate": 0.0002152480452605048, + "loss": 0.4653, + "step": 10030 + }, + { + "epoch": 1.2638788298642818, + "grad_norm": 0.29091107845306396, + "learning_rate": 0.00021514904925151854, + "loss": 0.4639, + "step": 10035 + }, + { + "epoch": 1.264508612274459, + "grad_norm": 0.27975961565971375, + "learning_rate": 0.00021505001825600461, + "loss": 0.4094, + "step": 10040 + }, + { + "epoch": 1.2651383946846364, + "grad_norm": 0.2882293164730072, + "learning_rate": 0.00021495095232714503, + "loss": 0.4212, + "step": 10045 + }, + { + "epoch": 1.2657681770948137, + "grad_norm": 0.31701260805130005, + "learning_rate": 0.0002148518515181404, + "loss": 0.4427, + "step": 10050 + }, + { + "epoch": 1.266397959504991, + "grad_norm": 0.33051052689552307, + "learning_rate": 0.00021475271588221014, + "loss": 0.4331, + "step": 10055 + }, + { + "epoch": 1.2670277419151683, + "grad_norm": 0.32075920701026917, + "learning_rate": 0.00021465354547259234, + "loss": 0.4486, + "step": 10060 + }, + { + "epoch": 1.2676575243253456, + "grad_norm": 0.3044838309288025, + "learning_rate": 0.00021455434034254375, + "loss": 0.4141, + "step": 10065 + }, + { + "epoch": 1.268287306735523, + "grad_norm": 0.31618407368659973, + "learning_rate": 0.00021445510054533983, + "loss": 0.446, + "step": 10070 + }, + { + "epoch": 1.2689170891457002, + "grad_norm": 0.3025960624217987, + "learning_rate": 0.0002143558261342746, + "loss": 0.4233, + "step": 10075 + }, + { + "epoch": 1.2695468715558773, + "grad_norm": 0.2974034249782562, + "learning_rate": 0.0002142565171626607, + "loss": 0.4078, + "step": 10080 + }, + { + "epoch": 1.2701766539660548, + "grad_norm": 0.34097397327423096, + "learning_rate": 0.0002141571736838293, + "loss": 0.4555, + "step": 10085 + }, + { + "epoch": 1.270806436376232, + "grad_norm": 0.30995890498161316, + "learning_rate": 0.0002140577957511302, + "loss": 0.4388, + "step": 10090 + }, + { + "epoch": 1.2714362187864092, + "grad_norm": 0.24191588163375854, + "learning_rate": 0.00021395838341793145, + "loss": 0.4114, + "step": 10095 + }, + { + "epoch": 1.2720660011965865, + "grad_norm": 0.31779953837394714, + "learning_rate": 0.00021385893673761986, + "loss": 0.4169, + "step": 10100 + }, + { + "epoch": 1.2726957836067638, + "grad_norm": 0.31599584221839905, + "learning_rate": 0.0002137594557636006, + "loss": 0.4081, + "step": 10105 + }, + { + "epoch": 1.2733255660169411, + "grad_norm": 0.31904011964797974, + "learning_rate": 0.00021365994054929713, + "loss": 0.4406, + "step": 10110 + }, + { + "epoch": 1.2739553484271184, + "grad_norm": 0.2923012375831604, + "learning_rate": 0.00021356039114815145, + "loss": 0.4335, + "step": 10115 + }, + { + "epoch": 1.2745851308372957, + "grad_norm": 0.27983418107032776, + "learning_rate": 0.00021346080761362385, + "loss": 0.4039, + "step": 10120 + }, + { + "epoch": 1.275214913247473, + "grad_norm": 0.29870182275772095, + "learning_rate": 0.000213361189999193, + "loss": 0.4311, + "step": 10125 + }, + { + "epoch": 1.2758446956576504, + "grad_norm": 0.3060225546360016, + "learning_rate": 0.00021326153835835574, + "loss": 0.4722, + "step": 10130 + }, + { + "epoch": 1.2764744780678274, + "grad_norm": 0.38860756158828735, + "learning_rate": 0.00021316185274462734, + "loss": 0.4276, + "step": 10135 + }, + { + "epoch": 1.277104260478005, + "grad_norm": 0.32171720266342163, + "learning_rate": 0.0002130621332115413, + "loss": 0.4334, + "step": 10140 + }, + { + "epoch": 1.277734042888182, + "grad_norm": 0.2947072684764862, + "learning_rate": 0.00021296237981264916, + "loss": 0.411, + "step": 10145 + }, + { + "epoch": 1.2783638252983593, + "grad_norm": 0.2904439866542816, + "learning_rate": 0.00021286259260152088, + "loss": 0.4222, + "step": 10150 + }, + { + "epoch": 1.2789936077085367, + "grad_norm": 0.2517947554588318, + "learning_rate": 0.00021276277163174444, + "loss": 0.4336, + "step": 10155 + }, + { + "epoch": 1.279623390118714, + "grad_norm": 0.295692503452301, + "learning_rate": 0.00021266291695692602, + "loss": 0.4617, + "step": 10160 + }, + { + "epoch": 1.2802531725288913, + "grad_norm": 0.3214627802371979, + "learning_rate": 0.00021256302863068976, + "loss": 0.4327, + "step": 10165 + }, + { + "epoch": 1.2808829549390686, + "grad_norm": 0.3030719459056854, + "learning_rate": 0.00021246310670667808, + "loss": 0.4289, + "step": 10170 + }, + { + "epoch": 1.2815127373492459, + "grad_norm": 0.32924139499664307, + "learning_rate": 0.00021236315123855128, + "loss": 0.4391, + "step": 10175 + }, + { + "epoch": 1.2821425197594232, + "grad_norm": 0.2978973984718323, + "learning_rate": 0.00021226316227998773, + "loss": 0.4356, + "step": 10180 + }, + { + "epoch": 1.2827723021696005, + "grad_norm": 0.289858341217041, + "learning_rate": 0.00021216313988468375, + "loss": 0.4302, + "step": 10185 + }, + { + "epoch": 1.2834020845797776, + "grad_norm": 0.28235578536987305, + "learning_rate": 0.00021206308410635376, + "loss": 0.4581, + "step": 10190 + }, + { + "epoch": 1.284031866989955, + "grad_norm": 0.28610706329345703, + "learning_rate": 0.0002119629949987299, + "loss": 0.4233, + "step": 10195 + }, + { + "epoch": 1.2846616494001322, + "grad_norm": 0.347464382648468, + "learning_rate": 0.00021186287261556238, + "loss": 0.4191, + "step": 10200 + }, + { + "epoch": 1.2852914318103095, + "grad_norm": 0.3228091299533844, + "learning_rate": 0.00021176271701061914, + "loss": 0.4162, + "step": 10205 + }, + { + "epoch": 1.2859212142204868, + "grad_norm": 0.34487780928611755, + "learning_rate": 0.00021166252823768606, + "loss": 0.4383, + "step": 10210 + }, + { + "epoch": 1.286550996630664, + "grad_norm": 0.34411466121673584, + "learning_rate": 0.00021156230635056676, + "loss": 0.4532, + "step": 10215 + }, + { + "epoch": 1.2871807790408414, + "grad_norm": 0.38219863176345825, + "learning_rate": 0.00021146205140308273, + "loss": 0.4656, + "step": 10220 + }, + { + "epoch": 1.2878105614510187, + "grad_norm": 0.3240879774093628, + "learning_rate": 0.00021136176344907322, + "loss": 0.4174, + "step": 10225 + }, + { + "epoch": 1.288440343861196, + "grad_norm": 0.34157487750053406, + "learning_rate": 0.00021126144254239503, + "loss": 0.4297, + "step": 10230 + }, + { + "epoch": 1.2890701262713733, + "grad_norm": 0.2788861095905304, + "learning_rate": 0.00021116108873692286, + "loss": 0.429, + "step": 10235 + }, + { + "epoch": 1.2896999086815506, + "grad_norm": 0.28119325637817383, + "learning_rate": 0.00021106070208654895, + "loss": 0.4145, + "step": 10240 + }, + { + "epoch": 1.2903296910917277, + "grad_norm": 0.32004043459892273, + "learning_rate": 0.00021096028264518325, + "loss": 0.4361, + "step": 10245 + }, + { + "epoch": 1.2909594735019052, + "grad_norm": 0.3054758310317993, + "learning_rate": 0.0002108598304667533, + "loss": 0.4331, + "step": 10250 + }, + { + "epoch": 1.2915892559120823, + "grad_norm": 0.3827783167362213, + "learning_rate": 0.0002107593456052042, + "loss": 0.4246, + "step": 10255 + }, + { + "epoch": 1.2922190383222596, + "grad_norm": 0.3008691370487213, + "learning_rate": 0.00021065882811449862, + "loss": 0.4448, + "step": 10260 + }, + { + "epoch": 1.292848820732437, + "grad_norm": 0.3227977752685547, + "learning_rate": 0.00021055827804861675, + "loss": 0.4308, + "step": 10265 + }, + { + "epoch": 1.2934786031426142, + "grad_norm": 0.32592520117759705, + "learning_rate": 0.00021045769546155623, + "loss": 0.4472, + "step": 10270 + }, + { + "epoch": 1.2941083855527915, + "grad_norm": 0.30866268277168274, + "learning_rate": 0.00021035708040733231, + "loss": 0.4193, + "step": 10275 + }, + { + "epoch": 1.2947381679629688, + "grad_norm": 0.36590054631233215, + "learning_rate": 0.0002102564329399775, + "loss": 0.4554, + "step": 10280 + }, + { + "epoch": 1.2953679503731461, + "grad_norm": 0.34002235531806946, + "learning_rate": 0.00021015575311354175, + "loss": 0.465, + "step": 10285 + }, + { + "epoch": 1.2959977327833234, + "grad_norm": 0.26847660541534424, + "learning_rate": 0.00021005504098209248, + "loss": 0.4226, + "step": 10290 + }, + { + "epoch": 1.2966275151935007, + "grad_norm": 0.2904103398323059, + "learning_rate": 0.00020995429659971445, + "loss": 0.4135, + "step": 10295 + }, + { + "epoch": 1.2972572976036778, + "grad_norm": 0.2799352705478668, + "learning_rate": 0.00020985352002050962, + "loss": 0.4241, + "step": 10300 + }, + { + "epoch": 1.2978870800138553, + "grad_norm": 0.3527425229549408, + "learning_rate": 0.00020975271129859734, + "loss": 0.4397, + "step": 10305 + }, + { + "epoch": 1.2985168624240324, + "grad_norm": 0.30795904994010925, + "learning_rate": 0.00020965187048811417, + "loss": 0.4248, + "step": 10310 + }, + { + "epoch": 1.2991466448342097, + "grad_norm": 0.31814008951187134, + "learning_rate": 0.00020955099764321402, + "loss": 0.4501, + "step": 10315 + }, + { + "epoch": 1.299776427244387, + "grad_norm": 0.29917100071907043, + "learning_rate": 0.0002094500928180678, + "loss": 0.4511, + "step": 10320 + }, + { + "epoch": 1.3004062096545643, + "grad_norm": 0.32853367924690247, + "learning_rate": 0.00020934915606686373, + "loss": 0.4055, + "step": 10325 + }, + { + "epoch": 1.3010359920647416, + "grad_norm": 0.420550137758255, + "learning_rate": 0.00020924818744380723, + "loss": 0.4417, + "step": 10330 + }, + { + "epoch": 1.301665774474919, + "grad_norm": 0.3183051347732544, + "learning_rate": 0.0002091471870031207, + "loss": 0.4256, + "step": 10335 + }, + { + "epoch": 1.3022955568850962, + "grad_norm": 0.30520761013031006, + "learning_rate": 0.00020904615479904362, + "loss": 0.4213, + "step": 10340 + }, + { + "epoch": 1.3029253392952735, + "grad_norm": 0.3484478294849396, + "learning_rate": 0.0002089450908858327, + "loss": 0.4202, + "step": 10345 + }, + { + "epoch": 1.3035551217054508, + "grad_norm": 0.3063777983188629, + "learning_rate": 0.00020884399531776154, + "loss": 0.4121, + "step": 10350 + }, + { + "epoch": 1.304184904115628, + "grad_norm": 0.35436901450157166, + "learning_rate": 0.00020874286814912072, + "loss": 0.4351, + "step": 10355 + }, + { + "epoch": 1.3048146865258055, + "grad_norm": 0.3233969211578369, + "learning_rate": 0.00020864170943421786, + "loss": 0.4326, + "step": 10360 + }, + { + "epoch": 1.3054444689359825, + "grad_norm": 0.34073448181152344, + "learning_rate": 0.0002085405192273776, + "loss": 0.4454, + "step": 10365 + }, + { + "epoch": 1.3060742513461598, + "grad_norm": 0.28455135226249695, + "learning_rate": 0.00020843929758294121, + "loss": 0.4511, + "step": 10370 + }, + { + "epoch": 1.3067040337563371, + "grad_norm": 0.31585589051246643, + "learning_rate": 0.0002083380445552672, + "loss": 0.4258, + "step": 10375 + }, + { + "epoch": 1.3073338161665145, + "grad_norm": 0.31528952717781067, + "learning_rate": 0.00020823676019873064, + "loss": 0.424, + "step": 10380 + }, + { + "epoch": 1.3079635985766918, + "grad_norm": 0.3014485836029053, + "learning_rate": 0.00020813544456772362, + "loss": 0.4429, + "step": 10385 + }, + { + "epoch": 1.308593380986869, + "grad_norm": 0.2870473861694336, + "learning_rate": 0.00020803409771665484, + "loss": 0.439, + "step": 10390 + }, + { + "epoch": 1.3092231633970464, + "grad_norm": 0.2971458435058594, + "learning_rate": 0.00020793271969994997, + "loss": 0.4233, + "step": 10395 + }, + { + "epoch": 1.3098529458072237, + "grad_norm": 0.2853131890296936, + "learning_rate": 0.00020783131057205135, + "loss": 0.4164, + "step": 10400 + }, + { + "epoch": 1.310482728217401, + "grad_norm": 0.29392004013061523, + "learning_rate": 0.00020772987038741793, + "loss": 0.4234, + "step": 10405 + }, + { + "epoch": 1.311112510627578, + "grad_norm": 0.2874060273170471, + "learning_rate": 0.00020762839920052543, + "loss": 0.4413, + "step": 10410 + }, + { + "epoch": 1.3117422930377556, + "grad_norm": 0.2806376516819, + "learning_rate": 0.00020752689706586615, + "loss": 0.4223, + "step": 10415 + }, + { + "epoch": 1.3123720754479327, + "grad_norm": 0.28510767221450806, + "learning_rate": 0.00020742536403794908, + "loss": 0.4183, + "step": 10420 + }, + { + "epoch": 1.31300185785811, + "grad_norm": 0.3087919056415558, + "learning_rate": 0.00020732380017129983, + "loss": 0.4241, + "step": 10425 + }, + { + "epoch": 1.3136316402682873, + "grad_norm": 0.2965323328971863, + "learning_rate": 0.00020722220552046048, + "loss": 0.4225, + "step": 10430 + }, + { + "epoch": 1.3142614226784646, + "grad_norm": 0.2907772660255432, + "learning_rate": 0.00020712058013998963, + "loss": 0.4176, + "step": 10435 + }, + { + "epoch": 1.3148912050886419, + "grad_norm": 0.3242434859275818, + "learning_rate": 0.0002070189240844625, + "loss": 0.4377, + "step": 10440 + }, + { + "epoch": 1.3155209874988192, + "grad_norm": 0.28129857778549194, + "learning_rate": 0.00020691723740847066, + "loss": 0.425, + "step": 10445 + }, + { + "epoch": 1.3161507699089965, + "grad_norm": 0.3053089380264282, + "learning_rate": 0.00020681552016662224, + "loss": 0.4066, + "step": 10450 + }, + { + "epoch": 1.3167805523191738, + "grad_norm": 0.27167361974716187, + "learning_rate": 0.00020671377241354168, + "loss": 0.4458, + "step": 10455 + }, + { + "epoch": 1.317410334729351, + "grad_norm": 0.29331174492836, + "learning_rate": 0.00020661199420386986, + "loss": 0.427, + "step": 10460 + }, + { + "epoch": 1.3180401171395282, + "grad_norm": 0.329908162355423, + "learning_rate": 0.00020651018559226394, + "loss": 0.4292, + "step": 10465 + }, + { + "epoch": 1.3186698995497057, + "grad_norm": 0.32669904828071594, + "learning_rate": 0.0002064083466333976, + "loss": 0.4118, + "step": 10470 + }, + { + "epoch": 1.3192996819598828, + "grad_norm": 0.35706159472465515, + "learning_rate": 0.00020630647738196058, + "loss": 0.4433, + "step": 10475 + }, + { + "epoch": 1.31992946437006, + "grad_norm": 0.3119877278804779, + "learning_rate": 0.00020620457789265905, + "loss": 0.4206, + "step": 10480 + }, + { + "epoch": 1.3205592467802374, + "grad_norm": 0.34798958897590637, + "learning_rate": 0.00020610264822021532, + "loss": 0.39, + "step": 10485 + }, + { + "epoch": 1.3211890291904147, + "grad_norm": 0.36972302198410034, + "learning_rate": 0.000206000688419368, + "loss": 0.4402, + "step": 10490 + }, + { + "epoch": 1.321818811600592, + "grad_norm": 0.27949050068855286, + "learning_rate": 0.00020589869854487175, + "loss": 0.4221, + "step": 10495 + }, + { + "epoch": 1.3224485940107693, + "grad_norm": 0.30757853388786316, + "learning_rate": 0.00020579667865149758, + "loss": 0.4402, + "step": 10500 + }, + { + "epoch": 1.3230783764209466, + "grad_norm": 0.3018808364868164, + "learning_rate": 0.0002056946287940324, + "loss": 0.4088, + "step": 10505 + }, + { + "epoch": 1.323708158831124, + "grad_norm": 0.2630440592765808, + "learning_rate": 0.00020559254902727942, + "loss": 0.4062, + "step": 10510 + }, + { + "epoch": 1.3243379412413012, + "grad_norm": 0.3145885169506073, + "learning_rate": 0.00020549043940605767, + "loss": 0.4301, + "step": 10515 + }, + { + "epoch": 1.3249677236514783, + "grad_norm": 0.3040730655193329, + "learning_rate": 0.0002053882999852025, + "loss": 0.4267, + "step": 10520 + }, + { + "epoch": 1.3255975060616558, + "grad_norm": 0.2861897945404053, + "learning_rate": 0.00020528613081956498, + "loss": 0.4115, + "step": 10525 + }, + { + "epoch": 1.326227288471833, + "grad_norm": 0.2938830256462097, + "learning_rate": 0.00020518393196401234, + "loss": 0.4315, + "step": 10530 + }, + { + "epoch": 1.3268570708820102, + "grad_norm": 0.24550281465053558, + "learning_rate": 0.0002050817034734277, + "loss": 0.4181, + "step": 10535 + }, + { + "epoch": 1.3274868532921875, + "grad_norm": 0.30074000358581543, + "learning_rate": 0.00020497944540271017, + "loss": 0.4016, + "step": 10540 + }, + { + "epoch": 1.3281166357023648, + "grad_norm": 0.34675145149230957, + "learning_rate": 0.0002048771578067745, + "loss": 0.4157, + "step": 10545 + }, + { + "epoch": 1.3287464181125421, + "grad_norm": 0.3144848644733429, + "learning_rate": 0.00020477484074055157, + "loss": 0.4024, + "step": 10550 + }, + { + "epoch": 1.3293762005227194, + "grad_norm": 0.32153722643852234, + "learning_rate": 0.00020467249425898805, + "loss": 0.4114, + "step": 10555 + }, + { + "epoch": 1.3300059829328967, + "grad_norm": 0.301707923412323, + "learning_rate": 0.0002045701184170462, + "loss": 0.423, + "step": 10560 + }, + { + "epoch": 1.330635765343074, + "grad_norm": 0.25224459171295166, + "learning_rate": 0.00020446771326970424, + "loss": 0.4037, + "step": 10565 + }, + { + "epoch": 1.3312655477532513, + "grad_norm": 0.3072243928909302, + "learning_rate": 0.00020436527887195607, + "loss": 0.4279, + "step": 10570 + }, + { + "epoch": 1.3318953301634284, + "grad_norm": 0.36949509382247925, + "learning_rate": 0.00020426281527881137, + "loss": 0.4259, + "step": 10575 + }, + { + "epoch": 1.332525112573606, + "grad_norm": 0.30465519428253174, + "learning_rate": 0.00020416032254529535, + "loss": 0.457, + "step": 10580 + }, + { + "epoch": 1.333154894983783, + "grad_norm": 0.2719140350818634, + "learning_rate": 0.00020405780072644896, + "loss": 0.3927, + "step": 10585 + }, + { + "epoch": 1.3337846773939603, + "grad_norm": 0.33556681871414185, + "learning_rate": 0.00020395524987732876, + "loss": 0.4341, + "step": 10590 + }, + { + "epoch": 1.3344144598041376, + "grad_norm": 0.3145639896392822, + "learning_rate": 0.0002038526700530069, + "loss": 0.4176, + "step": 10595 + }, + { + "epoch": 1.335044242214315, + "grad_norm": 0.31328147649765015, + "learning_rate": 0.00020375006130857111, + "loss": 0.4332, + "step": 10600 + }, + { + "epoch": 1.3356740246244923, + "grad_norm": 0.3016543388366699, + "learning_rate": 0.00020364742369912464, + "loss": 0.4173, + "step": 10605 + }, + { + "epoch": 1.3363038070346696, + "grad_norm": 0.31259703636169434, + "learning_rate": 0.0002035447572797862, + "loss": 0.4091, + "step": 10610 + }, + { + "epoch": 1.3369335894448469, + "grad_norm": 0.34624606370925903, + "learning_rate": 0.00020344206210569, + "loss": 0.4408, + "step": 10615 + }, + { + "epoch": 1.3375633718550242, + "grad_norm": 0.3144773542881012, + "learning_rate": 0.00020333933823198566, + "loss": 0.3863, + "step": 10620 + }, + { + "epoch": 1.3381931542652015, + "grad_norm": 0.3231208026409149, + "learning_rate": 0.00020323658571383833, + "loss": 0.4151, + "step": 10625 + }, + { + "epoch": 1.3388229366753785, + "grad_norm": 0.3022227883338928, + "learning_rate": 0.00020313380460642842, + "loss": 0.4108, + "step": 10630 + }, + { + "epoch": 1.339452719085556, + "grad_norm": 0.2899850606918335, + "learning_rate": 0.00020303099496495172, + "loss": 0.412, + "step": 10635 + }, + { + "epoch": 1.3400825014957332, + "grad_norm": 0.31005537509918213, + "learning_rate": 0.00020292815684461936, + "loss": 0.4114, + "step": 10640 + }, + { + "epoch": 1.3407122839059105, + "grad_norm": 0.29457420110702515, + "learning_rate": 0.00020282529030065784, + "loss": 0.4292, + "step": 10645 + }, + { + "epoch": 1.3413420663160878, + "grad_norm": 0.31712374091148376, + "learning_rate": 0.00020272239538830867, + "loss": 0.4029, + "step": 10650 + }, + { + "epoch": 1.341971848726265, + "grad_norm": 0.3228032886981964, + "learning_rate": 0.00020261947216282896, + "loss": 0.414, + "step": 10655 + }, + { + "epoch": 1.3426016311364424, + "grad_norm": 0.305351197719574, + "learning_rate": 0.00020251652067949068, + "loss": 0.4233, + "step": 10660 + }, + { + "epoch": 1.3432314135466197, + "grad_norm": 0.30317017436027527, + "learning_rate": 0.00020241354099358123, + "loss": 0.3816, + "step": 10665 + }, + { + "epoch": 1.343861195956797, + "grad_norm": 0.3036525845527649, + "learning_rate": 0.00020231053316040293, + "loss": 0.4115, + "step": 10670 + }, + { + "epoch": 1.3444909783669743, + "grad_norm": 0.33367687463760376, + "learning_rate": 0.00020220749723527353, + "loss": 0.449, + "step": 10675 + }, + { + "epoch": 1.3451207607771516, + "grad_norm": 0.28938767313957214, + "learning_rate": 0.00020210443327352553, + "loss": 0.3919, + "step": 10680 + }, + { + "epoch": 1.3457505431873287, + "grad_norm": 0.2946431338787079, + "learning_rate": 0.00020200134133050666, + "loss": 0.4043, + "step": 10685 + }, + { + "epoch": 1.3463803255975062, + "grad_norm": 0.31588709354400635, + "learning_rate": 0.00020189822146157962, + "loss": 0.4136, + "step": 10690 + }, + { + "epoch": 1.3470101080076833, + "grad_norm": 0.2830824851989746, + "learning_rate": 0.00020179507372212224, + "loss": 0.4164, + "step": 10695 + }, + { + "epoch": 1.3476398904178606, + "grad_norm": 0.31364426016807556, + "learning_rate": 0.0002016918981675271, + "loss": 0.4197, + "step": 10700 + }, + { + "epoch": 1.348269672828038, + "grad_norm": 0.32086437940597534, + "learning_rate": 0.00020158869485320194, + "loss": 0.4346, + "step": 10705 + }, + { + "epoch": 1.3488994552382152, + "grad_norm": 0.30549678206443787, + "learning_rate": 0.0002014854638345692, + "loss": 0.4134, + "step": 10710 + }, + { + "epoch": 1.3495292376483925, + "grad_norm": 0.2996455132961273, + "learning_rate": 0.00020138220516706634, + "loss": 0.3846, + "step": 10715 + }, + { + "epoch": 1.3501590200585698, + "grad_norm": 0.3013511002063751, + "learning_rate": 0.00020127891890614556, + "loss": 0.3994, + "step": 10720 + }, + { + "epoch": 1.350788802468747, + "grad_norm": 0.28055283427238464, + "learning_rate": 0.00020117560510727402, + "loss": 0.4163, + "step": 10725 + }, + { + "epoch": 1.3514185848789244, + "grad_norm": 0.3024522364139557, + "learning_rate": 0.00020107226382593357, + "loss": 0.4042, + "step": 10730 + }, + { + "epoch": 1.3520483672891017, + "grad_norm": 0.28080272674560547, + "learning_rate": 0.00020096889511762083, + "loss": 0.4176, + "step": 10735 + }, + { + "epoch": 1.3526781496992788, + "grad_norm": 0.3069353997707367, + "learning_rate": 0.00020086549903784715, + "loss": 0.4189, + "step": 10740 + }, + { + "epoch": 1.353307932109456, + "grad_norm": 0.2898117005825043, + "learning_rate": 0.00020076207564213866, + "loss": 0.4342, + "step": 10745 + }, + { + "epoch": 1.3539377145196334, + "grad_norm": 0.3365933299064636, + "learning_rate": 0.00020065862498603592, + "loss": 0.3944, + "step": 10750 + }, + { + "epoch": 1.3545674969298107, + "grad_norm": 0.29901427030563354, + "learning_rate": 0.00020055514712509446, + "loss": 0.4059, + "step": 10755 + }, + { + "epoch": 1.355197279339988, + "grad_norm": 0.2927230894565582, + "learning_rate": 0.00020045164211488417, + "loss": 0.4137, + "step": 10760 + }, + { + "epoch": 1.3558270617501653, + "grad_norm": 0.35867777466773987, + "learning_rate": 0.00020034811001098964, + "loss": 0.4108, + "step": 10765 + }, + { + "epoch": 1.3564568441603426, + "grad_norm": 0.2955409586429596, + "learning_rate": 0.00020024455086900994, + "loss": 0.4328, + "step": 10770 + }, + { + "epoch": 1.35708662657052, + "grad_norm": 0.29247814416885376, + "learning_rate": 0.00020014096474455873, + "loss": 0.4014, + "step": 10775 + }, + { + "epoch": 1.3577164089806972, + "grad_norm": 0.30858153104782104, + "learning_rate": 0.00020003735169326413, + "loss": 0.4112, + "step": 10780 + }, + { + "epoch": 1.3583461913908745, + "grad_norm": 0.4134693145751953, + "learning_rate": 0.0001999337117707687, + "loss": 0.4062, + "step": 10785 + }, + { + "epoch": 1.3589759738010518, + "grad_norm": 0.3120553195476532, + "learning_rate": 0.0001998300450327294, + "loss": 0.4049, + "step": 10790 + }, + { + "epoch": 1.359605756211229, + "grad_norm": 0.3146657645702362, + "learning_rate": 0.00019972635153481767, + "loss": 0.4029, + "step": 10795 + }, + { + "epoch": 1.3602355386214062, + "grad_norm": 0.2997225821018219, + "learning_rate": 0.00019962263133271933, + "loss": 0.3792, + "step": 10800 + }, + { + "epoch": 1.3608653210315835, + "grad_norm": 0.32136911153793335, + "learning_rate": 0.0001995188844821345, + "loss": 0.3987, + "step": 10805 + }, + { + "epoch": 1.3614951034417608, + "grad_norm": 0.30875489115715027, + "learning_rate": 0.0001994151110387775, + "loss": 0.4211, + "step": 10810 + }, + { + "epoch": 1.3621248858519381, + "grad_norm": 0.30939677357673645, + "learning_rate": 0.00019931131105837714, + "loss": 0.451, + "step": 10815 + }, + { + "epoch": 1.3627546682621154, + "grad_norm": 0.27874892950057983, + "learning_rate": 0.0001992074845966764, + "loss": 0.4102, + "step": 10820 + }, + { + "epoch": 1.3633844506722927, + "grad_norm": 0.28371527791023254, + "learning_rate": 0.00019910363170943233, + "loss": 0.4153, + "step": 10825 + }, + { + "epoch": 1.36401423308247, + "grad_norm": 0.2852970063686371, + "learning_rate": 0.00019899975245241643, + "loss": 0.409, + "step": 10830 + }, + { + "epoch": 1.3646440154926474, + "grad_norm": 0.300521582365036, + "learning_rate": 0.00019889584688141418, + "loss": 0.4032, + "step": 10835 + }, + { + "epoch": 1.3652737979028244, + "grad_norm": 0.30631181597709656, + "learning_rate": 0.00019879191505222526, + "loss": 0.4299, + "step": 10840 + }, + { + "epoch": 1.365903580313002, + "grad_norm": 0.3514620363712311, + "learning_rate": 0.00019868795702066342, + "loss": 0.4051, + "step": 10845 + }, + { + "epoch": 1.366533362723179, + "grad_norm": 0.27533403038978577, + "learning_rate": 0.00019858397284255657, + "loss": 0.4108, + "step": 10850 + }, + { + "epoch": 1.3671631451333564, + "grad_norm": 0.3143390119075775, + "learning_rate": 0.00019847996257374645, + "loss": 0.426, + "step": 10855 + }, + { + "epoch": 1.3677929275435337, + "grad_norm": 0.3388061821460724, + "learning_rate": 0.00019837592627008904, + "loss": 0.4163, + "step": 10860 + }, + { + "epoch": 1.368422709953711, + "grad_norm": 0.34078383445739746, + "learning_rate": 0.00019827186398745417, + "loss": 0.4015, + "step": 10865 + }, + { + "epoch": 1.3690524923638883, + "grad_norm": 0.33532068133354187, + "learning_rate": 0.00019816777578172582, + "loss": 0.4436, + "step": 10870 + }, + { + "epoch": 1.3696822747740656, + "grad_norm": 0.3230116069316864, + "learning_rate": 0.0001980636617088015, + "loss": 0.4239, + "step": 10875 + }, + { + "epoch": 1.3703120571842429, + "grad_norm": 0.31974872946739197, + "learning_rate": 0.00019795952182459297, + "loss": 0.4313, + "step": 10880 + }, + { + "epoch": 1.3709418395944202, + "grad_norm": 0.2825758159160614, + "learning_rate": 0.0001978553561850257, + "loss": 0.4045, + "step": 10885 + }, + { + "epoch": 1.3715716220045975, + "grad_norm": 0.2678980529308319, + "learning_rate": 0.00019775116484603908, + "loss": 0.3899, + "step": 10890 + }, + { + "epoch": 1.3722014044147746, + "grad_norm": 0.3492506146430969, + "learning_rate": 0.00019764694786358612, + "loss": 0.3807, + "step": 10895 + }, + { + "epoch": 1.372831186824952, + "grad_norm": 0.30808547139167786, + "learning_rate": 0.00019754270529363384, + "loss": 0.4163, + "step": 10900 + }, + { + "epoch": 1.3734609692351292, + "grad_norm": 0.30980342626571655, + "learning_rate": 0.0001974384371921628, + "loss": 0.3843, + "step": 10905 + }, + { + "epoch": 1.3740907516453065, + "grad_norm": 0.2915787100791931, + "learning_rate": 0.00019733414361516736, + "loss": 0.4208, + "step": 10910 + }, + { + "epoch": 1.3747205340554838, + "grad_norm": 0.30979228019714355, + "learning_rate": 0.00019722982461865555, + "loss": 0.4188, + "step": 10915 + }, + { + "epoch": 1.375350316465661, + "grad_norm": 0.28953999280929565, + "learning_rate": 0.00019712548025864918, + "loss": 0.3934, + "step": 10920 + }, + { + "epoch": 1.3759800988758384, + "grad_norm": 0.31495416164398193, + "learning_rate": 0.00019702111059118334, + "loss": 0.4117, + "step": 10925 + }, + { + "epoch": 1.3766098812860157, + "grad_norm": 0.38459569215774536, + "learning_rate": 0.00019691671567230714, + "loss": 0.4229, + "step": 10930 + }, + { + "epoch": 1.377239663696193, + "grad_norm": 0.31138870120048523, + "learning_rate": 0.00019681229555808285, + "loss": 0.4284, + "step": 10935 + }, + { + "epoch": 1.3778694461063703, + "grad_norm": 0.2761414051055908, + "learning_rate": 0.0001967078503045866, + "loss": 0.3838, + "step": 10940 + }, + { + "epoch": 1.3784992285165476, + "grad_norm": 0.31627506017684937, + "learning_rate": 0.00019660337996790772, + "loss": 0.4008, + "step": 10945 + }, + { + "epoch": 1.3791290109267247, + "grad_norm": 0.29025107622146606, + "learning_rate": 0.00019649888460414937, + "loss": 0.409, + "step": 10950 + }, + { + "epoch": 1.3797587933369022, + "grad_norm": 0.3379102349281311, + "learning_rate": 0.0001963943642694278, + "loss": 0.4213, + "step": 10955 + }, + { + "epoch": 1.3803885757470793, + "grad_norm": 0.3209204375743866, + "learning_rate": 0.00019628981901987285, + "loss": 0.3834, + "step": 10960 + }, + { + "epoch": 1.3810183581572566, + "grad_norm": 0.31717419624328613, + "learning_rate": 0.0001961852489116277, + "loss": 0.4499, + "step": 10965 + }, + { + "epoch": 1.381648140567434, + "grad_norm": 0.27936458587646484, + "learning_rate": 0.00019608065400084898, + "loss": 0.3987, + "step": 10970 + }, + { + "epoch": 1.3822779229776112, + "grad_norm": 0.28877684473991394, + "learning_rate": 0.00019597603434370637, + "loss": 0.4252, + "step": 10975 + }, + { + "epoch": 1.3829077053877885, + "grad_norm": 0.3423072397708893, + "learning_rate": 0.00019587138999638316, + "loss": 0.421, + "step": 10980 + }, + { + "epoch": 1.3835374877979658, + "grad_norm": 0.26486262679100037, + "learning_rate": 0.00019576672101507568, + "loss": 0.4104, + "step": 10985 + }, + { + "epoch": 1.3841672702081431, + "grad_norm": 0.2929472029209137, + "learning_rate": 0.00019566202745599365, + "loss": 0.4127, + "step": 10990 + }, + { + "epoch": 1.3847970526183204, + "grad_norm": 0.2696884870529175, + "learning_rate": 0.00019555730937535976, + "loss": 0.4067, + "step": 10995 + }, + { + "epoch": 1.3854268350284977, + "grad_norm": 0.32420167326927185, + "learning_rate": 0.0001954525668294102, + "loss": 0.4136, + "step": 11000 + }, + { + "epoch": 1.3854268350284977, + "eval_loss": 0.3039778470993042, + "eval_runtime": 6.1549, + "eval_samples_per_second": 162.472, + "eval_steps_per_second": 10.236, + "step": 11000 + }, + { + "epoch": 1.3860566174386748, + "grad_norm": 0.3149106502532959, + "learning_rate": 0.00019534779987439395, + "loss": 0.3954, + "step": 11005 + }, + { + "epoch": 1.3866863998488523, + "grad_norm": 0.332868367433548, + "learning_rate": 0.0001952430085665733, + "loss": 0.4178, + "step": 11010 + }, + { + "epoch": 1.3873161822590294, + "grad_norm": 0.285671591758728, + "learning_rate": 0.00019513819296222362, + "loss": 0.3788, + "step": 11015 + }, + { + "epoch": 1.3879459646692067, + "grad_norm": 0.3317325711250305, + "learning_rate": 0.0001950333531176332, + "loss": 0.4091, + "step": 11020 + }, + { + "epoch": 1.388575747079384, + "grad_norm": 0.27808326482772827, + "learning_rate": 0.00019492848908910356, + "loss": 0.4104, + "step": 11025 + }, + { + "epoch": 1.3892055294895613, + "grad_norm": 0.29725268483161926, + "learning_rate": 0.00019482360093294897, + "loss": 0.3981, + "step": 11030 + }, + { + "epoch": 1.3898353118997386, + "grad_norm": 0.24770186841487885, + "learning_rate": 0.0001947186887054968, + "loss": 0.4052, + "step": 11035 + }, + { + "epoch": 1.390465094309916, + "grad_norm": 0.31627580523490906, + "learning_rate": 0.00019461375246308734, + "loss": 0.4051, + "step": 11040 + }, + { + "epoch": 1.3910948767200932, + "grad_norm": 0.2721163332462311, + "learning_rate": 0.00019450879226207368, + "loss": 0.3962, + "step": 11045 + }, + { + "epoch": 1.3917246591302705, + "grad_norm": 0.31926798820495605, + "learning_rate": 0.00019440380815882187, + "loss": 0.3964, + "step": 11050 + }, + { + "epoch": 1.3923544415404479, + "grad_norm": 0.3047574460506439, + "learning_rate": 0.0001942988002097108, + "loss": 0.3818, + "step": 11055 + }, + { + "epoch": 1.392984223950625, + "grad_norm": 0.35394978523254395, + "learning_rate": 0.00019419376847113216, + "loss": 0.4398, + "step": 11060 + }, + { + "epoch": 1.3936140063608025, + "grad_norm": 0.2855307459831238, + "learning_rate": 0.00019408871299949037, + "loss": 0.4089, + "step": 11065 + }, + { + "epoch": 1.3942437887709795, + "grad_norm": 0.3066868484020233, + "learning_rate": 0.00019398363385120254, + "loss": 0.3987, + "step": 11070 + }, + { + "epoch": 1.3948735711811568, + "grad_norm": 0.312775194644928, + "learning_rate": 0.0001938785310826987, + "loss": 0.3794, + "step": 11075 + }, + { + "epoch": 1.3955033535913342, + "grad_norm": 0.3235652446746826, + "learning_rate": 0.00019377340475042136, + "loss": 0.3852, + "step": 11080 + }, + { + "epoch": 1.3961331360015115, + "grad_norm": 0.33732032775878906, + "learning_rate": 0.00019366825491082574, + "loss": 0.4003, + "step": 11085 + }, + { + "epoch": 1.3967629184116888, + "grad_norm": 0.33549800515174866, + "learning_rate": 0.00019356308162037976, + "loss": 0.3699, + "step": 11090 + }, + { + "epoch": 1.397392700821866, + "grad_norm": 0.3360839784145355, + "learning_rate": 0.00019345788493556394, + "loss": 0.394, + "step": 11095 + }, + { + "epoch": 1.3980224832320434, + "grad_norm": 0.3089699447154999, + "learning_rate": 0.00019335266491287112, + "loss": 0.4016, + "step": 11100 + }, + { + "epoch": 1.3986522656422207, + "grad_norm": 0.30863386392593384, + "learning_rate": 0.00019324742160880702, + "loss": 0.3973, + "step": 11105 + }, + { + "epoch": 1.399282048052398, + "grad_norm": 0.30803561210632324, + "learning_rate": 0.00019314215507988965, + "loss": 0.4119, + "step": 11110 + }, + { + "epoch": 1.399911830462575, + "grad_norm": 0.2869633138179779, + "learning_rate": 0.0001930368653826495, + "loss": 0.4098, + "step": 11115 + }, + { + "epoch": 1.4005416128727526, + "grad_norm": 0.25851666927337646, + "learning_rate": 0.00019293155257362957, + "loss": 0.4034, + "step": 11120 + }, + { + "epoch": 1.4011713952829297, + "grad_norm": 0.32763540744781494, + "learning_rate": 0.00019282621670938527, + "loss": 0.4121, + "step": 11125 + }, + { + "epoch": 1.401801177693107, + "grad_norm": 0.3531438410282135, + "learning_rate": 0.00019272085784648432, + "loss": 0.4021, + "step": 11130 + }, + { + "epoch": 1.4024309601032843, + "grad_norm": 0.27890294790267944, + "learning_rate": 0.00019261547604150687, + "loss": 0.3872, + "step": 11135 + }, + { + "epoch": 1.4030607425134616, + "grad_norm": 0.26616647839546204, + "learning_rate": 0.00019251007135104534, + "loss": 0.4293, + "step": 11140 + }, + { + "epoch": 1.4036905249236389, + "grad_norm": 0.3214140236377716, + "learning_rate": 0.0001924046438317045, + "loss": 0.3974, + "step": 11145 + }, + { + "epoch": 1.4043203073338162, + "grad_norm": 0.31075042486190796, + "learning_rate": 0.00019229919354010126, + "loss": 0.3978, + "step": 11150 + }, + { + "epoch": 1.4049500897439935, + "grad_norm": 0.31546491384506226, + "learning_rate": 0.00019219372053286485, + "loss": 0.3937, + "step": 11155 + }, + { + "epoch": 1.4055798721541708, + "grad_norm": 0.33116820454597473, + "learning_rate": 0.00019208822486663677, + "loss": 0.3779, + "step": 11160 + }, + { + "epoch": 1.406209654564348, + "grad_norm": 0.30159297585487366, + "learning_rate": 0.0001919827065980705, + "loss": 0.3822, + "step": 11165 + }, + { + "epoch": 1.4068394369745252, + "grad_norm": 0.29656147956848145, + "learning_rate": 0.00019187716578383178, + "loss": 0.4047, + "step": 11170 + }, + { + "epoch": 1.4074692193847027, + "grad_norm": 0.3193992078304291, + "learning_rate": 0.0001917716024805985, + "loss": 0.4088, + "step": 11175 + }, + { + "epoch": 1.4080990017948798, + "grad_norm": 0.29688236117362976, + "learning_rate": 0.0001916660167450605, + "loss": 0.3693, + "step": 11180 + }, + { + "epoch": 1.408728784205057, + "grad_norm": 0.33146485686302185, + "learning_rate": 0.00019156040863391977, + "loss": 0.3865, + "step": 11185 + }, + { + "epoch": 1.4093585666152344, + "grad_norm": 0.3015727698802948, + "learning_rate": 0.00019145477820389027, + "loss": 0.3857, + "step": 11190 + }, + { + "epoch": 1.4099883490254117, + "grad_norm": 0.27797931432724, + "learning_rate": 0.00019134912551169796, + "loss": 0.4148, + "step": 11195 + }, + { + "epoch": 1.410618131435589, + "grad_norm": 0.30010297894477844, + "learning_rate": 0.00019124345061408067, + "loss": 0.4076, + "step": 11200 + }, + { + "epoch": 1.4112479138457663, + "grad_norm": 0.29101455211639404, + "learning_rate": 0.00019113775356778833, + "loss": 0.3802, + "step": 11205 + }, + { + "epoch": 1.4118776962559436, + "grad_norm": 0.29706794023513794, + "learning_rate": 0.00019103203442958266, + "loss": 0.3867, + "step": 11210 + }, + { + "epoch": 1.412507478666121, + "grad_norm": 0.2546458840370178, + "learning_rate": 0.00019092629325623723, + "loss": 0.3964, + "step": 11215 + }, + { + "epoch": 1.4131372610762982, + "grad_norm": 0.3409089148044586, + "learning_rate": 0.0001908205301045375, + "loss": 0.4171, + "step": 11220 + }, + { + "epoch": 1.4137670434864753, + "grad_norm": 0.27688878774642944, + "learning_rate": 0.00019071474503128057, + "loss": 0.405, + "step": 11225 + }, + { + "epoch": 1.4143968258966528, + "grad_norm": 0.30704399943351746, + "learning_rate": 0.00019060893809327563, + "loss": 0.4024, + "step": 11230 + }, + { + "epoch": 1.41502660830683, + "grad_norm": 0.2823016941547394, + "learning_rate": 0.00019050310934734326, + "loss": 0.3908, + "step": 11235 + }, + { + "epoch": 1.4156563907170072, + "grad_norm": 0.3309246897697449, + "learning_rate": 0.000190397258850316, + "loss": 0.4049, + "step": 11240 + }, + { + "epoch": 1.4162861731271845, + "grad_norm": 0.2959790527820587, + "learning_rate": 0.00019029138665903794, + "loss": 0.4031, + "step": 11245 + }, + { + "epoch": 1.4169159555373618, + "grad_norm": 0.29836803674697876, + "learning_rate": 0.00019018549283036497, + "loss": 0.4103, + "step": 11250 + }, + { + "epoch": 1.4175457379475391, + "grad_norm": 0.3187415301799774, + "learning_rate": 0.00019007957742116433, + "loss": 0.4055, + "step": 11255 + }, + { + "epoch": 1.4181755203577164, + "grad_norm": 0.3521386981010437, + "learning_rate": 0.00018997364048831515, + "loss": 0.3839, + "step": 11260 + }, + { + "epoch": 1.4188053027678937, + "grad_norm": 0.3985449969768524, + "learning_rate": 0.00018986768208870792, + "loss": 0.4058, + "step": 11265 + }, + { + "epoch": 1.419435085178071, + "grad_norm": 0.30885374546051025, + "learning_rate": 0.00018976170227924473, + "loss": 0.394, + "step": 11270 + }, + { + "epoch": 1.4200648675882483, + "grad_norm": 0.2981209456920624, + "learning_rate": 0.00018965570111683917, + "loss": 0.3917, + "step": 11275 + }, + { + "epoch": 1.4206946499984254, + "grad_norm": 0.2993827164173126, + "learning_rate": 0.00018954967865841629, + "loss": 0.4016, + "step": 11280 + }, + { + "epoch": 1.421324432408603, + "grad_norm": 0.283632755279541, + "learning_rate": 0.00018944363496091254, + "loss": 0.3873, + "step": 11285 + }, + { + "epoch": 1.42195421481878, + "grad_norm": 0.2871907353401184, + "learning_rate": 0.0001893375700812758, + "loss": 0.4136, + "step": 11290 + }, + { + "epoch": 1.4225839972289573, + "grad_norm": 0.3341853618621826, + "learning_rate": 0.00018923148407646537, + "loss": 0.409, + "step": 11295 + }, + { + "epoch": 1.4232137796391346, + "grad_norm": 0.32463696599006653, + "learning_rate": 0.00018912537700345192, + "loss": 0.3912, + "step": 11300 + }, + { + "epoch": 1.423843562049312, + "grad_norm": 0.33242395520210266, + "learning_rate": 0.00018901924891921726, + "loss": 0.4158, + "step": 11305 + }, + { + "epoch": 1.4244733444594893, + "grad_norm": 0.301289439201355, + "learning_rate": 0.00018891309988075463, + "loss": 0.4012, + "step": 11310 + }, + { + "epoch": 1.4251031268696666, + "grad_norm": 0.28636494278907776, + "learning_rate": 0.00018880692994506845, + "loss": 0.3817, + "step": 11315 + }, + { + "epoch": 1.4257329092798439, + "grad_norm": 0.2837861478328705, + "learning_rate": 0.00018870073916917455, + "loss": 0.4116, + "step": 11320 + }, + { + "epoch": 1.4263626916900212, + "grad_norm": 0.31169527769088745, + "learning_rate": 0.0001885945276100996, + "loss": 0.3967, + "step": 11325 + }, + { + "epoch": 1.4269924741001985, + "grad_norm": 0.31035301089286804, + "learning_rate": 0.00018848829532488177, + "loss": 0.407, + "step": 11330 + }, + { + "epoch": 1.4276222565103756, + "grad_norm": 0.3047008812427521, + "learning_rate": 0.00018838204237057023, + "loss": 0.3939, + "step": 11335 + }, + { + "epoch": 1.428252038920553, + "grad_norm": 0.2646077871322632, + "learning_rate": 0.00018827576880422515, + "loss": 0.3881, + "step": 11340 + }, + { + "epoch": 1.4288818213307302, + "grad_norm": 0.31041520833969116, + "learning_rate": 0.00018816947468291788, + "loss": 0.3822, + "step": 11345 + }, + { + "epoch": 1.4295116037409075, + "grad_norm": 0.2699204385280609, + "learning_rate": 0.00018806316006373086, + "loss": 0.3895, + "step": 11350 + }, + { + "epoch": 1.4301413861510848, + "grad_norm": 0.285363107919693, + "learning_rate": 0.00018795682500375742, + "loss": 0.4027, + "step": 11355 + }, + { + "epoch": 1.430771168561262, + "grad_norm": 0.27154308557510376, + "learning_rate": 0.00018785046956010194, + "loss": 0.3815, + "step": 11360 + }, + { + "epoch": 1.4314009509714394, + "grad_norm": 0.29652640223503113, + "learning_rate": 0.00018774409378987972, + "loss": 0.4003, + "step": 11365 + }, + { + "epoch": 1.4320307333816167, + "grad_norm": 0.2921524941921234, + "learning_rate": 0.00018763769775021695, + "loss": 0.3828, + "step": 11370 + }, + { + "epoch": 1.432660515791794, + "grad_norm": 0.26934945583343506, + "learning_rate": 0.00018753128149825074, + "loss": 0.3999, + "step": 11375 + }, + { + "epoch": 1.4332902982019713, + "grad_norm": 0.29320502281188965, + "learning_rate": 0.00018742484509112907, + "loss": 0.4034, + "step": 11380 + }, + { + "epoch": 1.4339200806121486, + "grad_norm": 0.2842418849468231, + "learning_rate": 0.00018731838858601074, + "loss": 0.3877, + "step": 11385 + }, + { + "epoch": 1.4345498630223257, + "grad_norm": 0.31208139657974243, + "learning_rate": 0.00018721191204006525, + "loss": 0.3731, + "step": 11390 + }, + { + "epoch": 1.4351796454325032, + "grad_norm": 0.2809062600135803, + "learning_rate": 0.00018710541551047303, + "loss": 0.3939, + "step": 11395 + }, + { + "epoch": 1.4358094278426803, + "grad_norm": 0.308969646692276, + "learning_rate": 0.00018699889905442508, + "loss": 0.3874, + "step": 11400 + }, + { + "epoch": 1.4364392102528576, + "grad_norm": 0.3051275610923767, + "learning_rate": 0.00018689236272912316, + "loss": 0.3676, + "step": 11405 + }, + { + "epoch": 1.437068992663035, + "grad_norm": 0.31084486842155457, + "learning_rate": 0.0001867858065917798, + "loss": 0.3954, + "step": 11410 + }, + { + "epoch": 1.4376987750732122, + "grad_norm": 0.28356167674064636, + "learning_rate": 0.000186679230699618, + "loss": 0.3701, + "step": 11415 + }, + { + "epoch": 1.4383285574833895, + "grad_norm": 0.3026244044303894, + "learning_rate": 0.0001865726351098715, + "loss": 0.3797, + "step": 11420 + }, + { + "epoch": 1.4389583398935668, + "grad_norm": 0.2909928560256958, + "learning_rate": 0.00018646601987978452, + "loss": 0.4022, + "step": 11425 + }, + { + "epoch": 1.439588122303744, + "grad_norm": 0.3085511326789856, + "learning_rate": 0.00018635938506661183, + "loss": 0.4099, + "step": 11430 + }, + { + "epoch": 1.4402179047139214, + "grad_norm": 0.28047701716423035, + "learning_rate": 0.0001862527307276189, + "loss": 0.3789, + "step": 11435 + }, + { + "epoch": 1.4408476871240987, + "grad_norm": 0.2697209119796753, + "learning_rate": 0.00018614605692008146, + "loss": 0.3864, + "step": 11440 + }, + { + "epoch": 1.4414774695342758, + "grad_norm": 0.40744665265083313, + "learning_rate": 0.0001860393637012858, + "loss": 0.4085, + "step": 11445 + }, + { + "epoch": 1.4421072519444533, + "grad_norm": 0.25875118374824524, + "learning_rate": 0.00018593265112852854, + "loss": 0.4033, + "step": 11450 + }, + { + "epoch": 1.4427370343546304, + "grad_norm": 0.2960642874240875, + "learning_rate": 0.00018582591925911694, + "loss": 0.4214, + "step": 11455 + }, + { + "epoch": 1.4433668167648077, + "grad_norm": 0.2711925506591797, + "learning_rate": 0.00018571916815036824, + "loss": 0.3537, + "step": 11460 + }, + { + "epoch": 1.443996599174985, + "grad_norm": 0.28002485632896423, + "learning_rate": 0.0001856123978596104, + "loss": 0.3787, + "step": 11465 + }, + { + "epoch": 1.4446263815851623, + "grad_norm": 0.3143458366394043, + "learning_rate": 0.00018550560844418138, + "loss": 0.3553, + "step": 11470 + }, + { + "epoch": 1.4452561639953396, + "grad_norm": 0.3184334337711334, + "learning_rate": 0.00018539879996142962, + "loss": 0.385, + "step": 11475 + }, + { + "epoch": 1.445885946405517, + "grad_norm": 0.3327188789844513, + "learning_rate": 0.00018529197246871368, + "loss": 0.4074, + "step": 11480 + }, + { + "epoch": 1.4465157288156942, + "grad_norm": 0.317942351102829, + "learning_rate": 0.0001851851260234024, + "loss": 0.3995, + "step": 11485 + }, + { + "epoch": 1.4471455112258715, + "grad_norm": 0.2567351758480072, + "learning_rate": 0.00018507826068287473, + "loss": 0.3661, + "step": 11490 + }, + { + "epoch": 1.4477752936360488, + "grad_norm": 0.29439592361450195, + "learning_rate": 0.0001849713765045198, + "loss": 0.3759, + "step": 11495 + }, + { + "epoch": 1.448405076046226, + "grad_norm": 0.3125048279762268, + "learning_rate": 0.0001848644735457368, + "loss": 0.4107, + "step": 11500 + }, + { + "epoch": 1.4490348584564032, + "grad_norm": 0.2855313718318939, + "learning_rate": 0.00018475755186393516, + "loss": 0.4061, + "step": 11505 + }, + { + "epoch": 1.4496646408665805, + "grad_norm": 0.3040854036808014, + "learning_rate": 0.00018465061151653423, + "loss": 0.3902, + "step": 11510 + }, + { + "epoch": 1.4502944232767578, + "grad_norm": 0.28425633907318115, + "learning_rate": 0.0001845436525609634, + "loss": 0.3861, + "step": 11515 + }, + { + "epoch": 1.4509242056869351, + "grad_norm": 0.31335607171058655, + "learning_rate": 0.00018443667505466205, + "loss": 0.3949, + "step": 11520 + }, + { + "epoch": 1.4515539880971124, + "grad_norm": 0.2725260555744171, + "learning_rate": 0.00018432967905507967, + "loss": 0.3979, + "step": 11525 + }, + { + "epoch": 1.4521837705072898, + "grad_norm": 0.2674049437046051, + "learning_rate": 0.00018422266461967537, + "loss": 0.3747, + "step": 11530 + }, + { + "epoch": 1.452813552917467, + "grad_norm": 0.3076520562171936, + "learning_rate": 0.0001841156318059185, + "loss": 0.385, + "step": 11535 + }, + { + "epoch": 1.4534433353276444, + "grad_norm": 0.23340527713298798, + "learning_rate": 0.00018400858067128806, + "loss": 0.3736, + "step": 11540 + }, + { + "epoch": 1.4540731177378217, + "grad_norm": 0.29402169585227966, + "learning_rate": 0.00018390151127327295, + "loss": 0.3994, + "step": 11545 + }, + { + "epoch": 1.454702900147999, + "grad_norm": 0.32409217953681946, + "learning_rate": 0.00018379442366937187, + "loss": 0.3979, + "step": 11550 + }, + { + "epoch": 1.455332682558176, + "grad_norm": 0.28875911235809326, + "learning_rate": 0.00018368731791709337, + "loss": 0.365, + "step": 11555 + }, + { + "epoch": 1.4559624649683534, + "grad_norm": 0.26838234066963196, + "learning_rate": 0.0001835801940739556, + "loss": 0.3912, + "step": 11560 + }, + { + "epoch": 1.4565922473785307, + "grad_norm": 0.31797516345977783, + "learning_rate": 0.00018347305219748665, + "loss": 0.3622, + "step": 11565 + }, + { + "epoch": 1.457222029788708, + "grad_norm": 0.31115812063217163, + "learning_rate": 0.00018336589234522398, + "loss": 0.4283, + "step": 11570 + }, + { + "epoch": 1.4578518121988853, + "grad_norm": 0.2730168402194977, + "learning_rate": 0.00018325871457471496, + "loss": 0.3864, + "step": 11575 + }, + { + "epoch": 1.4584815946090626, + "grad_norm": 0.28333088755607605, + "learning_rate": 0.00018315151894351657, + "loss": 0.3451, + "step": 11580 + }, + { + "epoch": 1.4591113770192399, + "grad_norm": 0.3169468343257904, + "learning_rate": 0.00018304430550919522, + "loss": 0.3719, + "step": 11585 + }, + { + "epoch": 1.4597411594294172, + "grad_norm": 0.3411467969417572, + "learning_rate": 0.000182937074329327, + "loss": 0.4073, + "step": 11590 + }, + { + "epoch": 1.4603709418395945, + "grad_norm": 0.3131183385848999, + "learning_rate": 0.0001828298254614975, + "loss": 0.4117, + "step": 11595 + }, + { + "epoch": 1.4610007242497716, + "grad_norm": 0.25929832458496094, + "learning_rate": 0.0001827225589633018, + "loss": 0.3834, + "step": 11600 + }, + { + "epoch": 1.461630506659949, + "grad_norm": 0.32609832286834717, + "learning_rate": 0.00018261527489234444, + "loss": 0.3972, + "step": 11605 + }, + { + "epoch": 1.4622602890701262, + "grad_norm": 0.3089287579059601, + "learning_rate": 0.00018250797330623953, + "loss": 0.3727, + "step": 11610 + }, + { + "epoch": 1.4628900714803035, + "grad_norm": 0.2891997992992401, + "learning_rate": 0.00018240065426261033, + "loss": 0.3891, + "step": 11615 + }, + { + "epoch": 1.4635198538904808, + "grad_norm": 0.3119528293609619, + "learning_rate": 0.00018229331781908971, + "loss": 0.388, + "step": 11620 + }, + { + "epoch": 1.464149636300658, + "grad_norm": 0.3314844071865082, + "learning_rate": 0.00018218596403331977, + "loss": 0.3803, + "step": 11625 + }, + { + "epoch": 1.4647794187108354, + "grad_norm": 0.27267536520957947, + "learning_rate": 0.00018207859296295197, + "loss": 0.3665, + "step": 11630 + }, + { + "epoch": 1.4654092011210127, + "grad_norm": 0.30490440130233765, + "learning_rate": 0.00018197120466564693, + "loss": 0.4051, + "step": 11635 + }, + { + "epoch": 1.46603898353119, + "grad_norm": 0.3182273209095001, + "learning_rate": 0.00018186379919907472, + "loss": 0.38, + "step": 11640 + }, + { + "epoch": 1.4666687659413673, + "grad_norm": 0.3026832044124603, + "learning_rate": 0.00018175637662091448, + "loss": 0.3371, + "step": 11645 + }, + { + "epoch": 1.4672985483515446, + "grad_norm": 0.3287534713745117, + "learning_rate": 0.0001816489369888546, + "loss": 0.4234, + "step": 11650 + }, + { + "epoch": 1.4679283307617217, + "grad_norm": 0.28076720237731934, + "learning_rate": 0.00018154148036059263, + "loss": 0.3825, + "step": 11655 + }, + { + "epoch": 1.4685581131718992, + "grad_norm": 0.304766446352005, + "learning_rate": 0.0001814340067938352, + "loss": 0.3905, + "step": 11660 + }, + { + "epoch": 1.4691878955820763, + "grad_norm": 0.30473533272743225, + "learning_rate": 0.00018132651634629812, + "loss": 0.409, + "step": 11665 + }, + { + "epoch": 1.4698176779922536, + "grad_norm": 0.32186418771743774, + "learning_rate": 0.00018121900907570618, + "loss": 0.3741, + "step": 11670 + }, + { + "epoch": 1.470447460402431, + "grad_norm": 0.33314061164855957, + "learning_rate": 0.00018111148503979326, + "loss": 0.3981, + "step": 11675 + }, + { + "epoch": 1.4710772428126082, + "grad_norm": 0.3202495872974396, + "learning_rate": 0.00018100394429630223, + "loss": 0.4014, + "step": 11680 + }, + { + "epoch": 1.4717070252227855, + "grad_norm": 0.2801063656806946, + "learning_rate": 0.00018089638690298488, + "loss": 0.3827, + "step": 11685 + }, + { + "epoch": 1.4723368076329628, + "grad_norm": 0.3252180516719818, + "learning_rate": 0.000180788812917602, + "loss": 0.4207, + "step": 11690 + }, + { + "epoch": 1.4729665900431401, + "grad_norm": 0.279823899269104, + "learning_rate": 0.0001806812223979233, + "loss": 0.4092, + "step": 11695 + }, + { + "epoch": 1.4735963724533174, + "grad_norm": 0.29136526584625244, + "learning_rate": 0.00018057361540172733, + "loss": 0.3939, + "step": 11700 + }, + { + "epoch": 1.4742261548634947, + "grad_norm": 0.2708832621574402, + "learning_rate": 0.00018046599198680153, + "loss": 0.3645, + "step": 11705 + }, + { + "epoch": 1.4748559372736718, + "grad_norm": 0.34708496928215027, + "learning_rate": 0.00018035835221094214, + "loss": 0.3814, + "step": 11710 + }, + { + "epoch": 1.4754857196838493, + "grad_norm": 0.3081948161125183, + "learning_rate": 0.00018025069613195413, + "loss": 0.3738, + "step": 11715 + }, + { + "epoch": 1.4761155020940264, + "grad_norm": 0.26891911029815674, + "learning_rate": 0.0001801430238076513, + "loss": 0.3724, + "step": 11720 + }, + { + "epoch": 1.4767452845042037, + "grad_norm": 0.3266797363758087, + "learning_rate": 0.00018003533529585612, + "loss": 0.3749, + "step": 11725 + }, + { + "epoch": 1.477375066914381, + "grad_norm": 0.25788089632987976, + "learning_rate": 0.00017992763065439982, + "loss": 0.3661, + "step": 11730 + }, + { + "epoch": 1.4780048493245583, + "grad_norm": 0.301270067691803, + "learning_rate": 0.00017981990994112227, + "loss": 0.3832, + "step": 11735 + }, + { + "epoch": 1.4786346317347356, + "grad_norm": 0.2785583734512329, + "learning_rate": 0.0001797121732138719, + "loss": 0.357, + "step": 11740 + }, + { + "epoch": 1.479264414144913, + "grad_norm": 0.3153518736362457, + "learning_rate": 0.00017960442053050583, + "loss": 0.3964, + "step": 11745 + }, + { + "epoch": 1.4798941965550902, + "grad_norm": 0.2862750291824341, + "learning_rate": 0.00017949665194888972, + "loss": 0.3781, + "step": 11750 + }, + { + "epoch": 1.4805239789652676, + "grad_norm": 0.31263992190361023, + "learning_rate": 0.00017938886752689765, + "loss": 0.3822, + "step": 11755 + }, + { + "epoch": 1.4811537613754449, + "grad_norm": 0.31964340806007385, + "learning_rate": 0.00017928106732241248, + "loss": 0.3757, + "step": 11760 + }, + { + "epoch": 1.481783543785622, + "grad_norm": 0.29111340641975403, + "learning_rate": 0.0001791732513933253, + "loss": 0.362, + "step": 11765 + }, + { + "epoch": 1.4824133261957995, + "grad_norm": 0.32248637080192566, + "learning_rate": 0.00017906541979753572, + "loss": 0.3978, + "step": 11770 + }, + { + "epoch": 1.4830431086059765, + "grad_norm": 0.2964222729206085, + "learning_rate": 0.0001789575725929518, + "loss": 0.3853, + "step": 11775 + }, + { + "epoch": 1.4836728910161538, + "grad_norm": 0.32823482155799866, + "learning_rate": 0.0001788497098374899, + "loss": 0.3828, + "step": 11780 + }, + { + "epoch": 1.4843026734263312, + "grad_norm": 0.30054226517677307, + "learning_rate": 0.0001787418315890748, + "loss": 0.38, + "step": 11785 + }, + { + "epoch": 1.4849324558365085, + "grad_norm": 0.30829596519470215, + "learning_rate": 0.0001786339379056397, + "loss": 0.3645, + "step": 11790 + }, + { + "epoch": 1.4855622382466858, + "grad_norm": 0.3095497786998749, + "learning_rate": 0.00017852602884512584, + "loss": 0.3727, + "step": 11795 + }, + { + "epoch": 1.486192020656863, + "grad_norm": 0.29647621512413025, + "learning_rate": 0.00017841810446548283, + "loss": 0.3764, + "step": 11800 + }, + { + "epoch": 1.4868218030670404, + "grad_norm": 0.3227784037590027, + "learning_rate": 0.00017831016482466864, + "loss": 0.3797, + "step": 11805 + }, + { + "epoch": 1.4874515854772177, + "grad_norm": 0.32365646958351135, + "learning_rate": 0.00017820220998064927, + "loss": 0.3766, + "step": 11810 + }, + { + "epoch": 1.488081367887395, + "grad_norm": 0.36090198159217834, + "learning_rate": 0.0001780942399913989, + "loss": 0.4015, + "step": 11815 + }, + { + "epoch": 1.488711150297572, + "grad_norm": 0.28814610838890076, + "learning_rate": 0.00017798625491489994, + "loss": 0.3616, + "step": 11820 + }, + { + "epoch": 1.4893409327077496, + "grad_norm": 0.2654825747013092, + "learning_rate": 0.00017787825480914283, + "loss": 0.3462, + "step": 11825 + }, + { + "epoch": 1.4899707151179267, + "grad_norm": 0.2913071811199188, + "learning_rate": 0.000177770239732126, + "loss": 0.3707, + "step": 11830 + }, + { + "epoch": 1.490600497528104, + "grad_norm": 0.33099865913391113, + "learning_rate": 0.0001776622097418562, + "loss": 0.3644, + "step": 11835 + }, + { + "epoch": 1.4912302799382813, + "grad_norm": 0.2980974018573761, + "learning_rate": 0.0001775541648963478, + "loss": 0.3839, + "step": 11840 + }, + { + "epoch": 1.4918600623484586, + "grad_norm": 0.2673074007034302, + "learning_rate": 0.00017744610525362352, + "loss": 0.3736, + "step": 11845 + }, + { + "epoch": 1.4924898447586359, + "grad_norm": 0.26277023553848267, + "learning_rate": 0.00017733803087171372, + "loss": 0.3463, + "step": 11850 + }, + { + "epoch": 1.4931196271688132, + "grad_norm": 0.27924680709838867, + "learning_rate": 0.00017722994180865696, + "loss": 0.4095, + "step": 11855 + }, + { + "epoch": 1.4937494095789905, + "grad_norm": 0.2761695086956024, + "learning_rate": 0.00017712183812249938, + "loss": 0.3748, + "step": 11860 + }, + { + "epoch": 1.4943791919891678, + "grad_norm": 0.312854528427124, + "learning_rate": 0.00017701371987129523, + "loss": 0.3748, + "step": 11865 + }, + { + "epoch": 1.495008974399345, + "grad_norm": 0.3033592998981476, + "learning_rate": 0.00017690558711310644, + "loss": 0.3728, + "step": 11870 + }, + { + "epoch": 1.4956387568095222, + "grad_norm": 0.2711508572101593, + "learning_rate": 0.00017679743990600281, + "loss": 0.3748, + "step": 11875 + }, + { + "epoch": 1.4962685392196997, + "grad_norm": 0.28003159165382385, + "learning_rate": 0.00017668927830806177, + "loss": 0.3658, + "step": 11880 + }, + { + "epoch": 1.4968983216298768, + "grad_norm": 0.2750314772129059, + "learning_rate": 0.0001765811023773687, + "loss": 0.3705, + "step": 11885 + }, + { + "epoch": 1.497528104040054, + "grad_norm": 0.31037452816963196, + "learning_rate": 0.00017647291217201644, + "loss": 0.3718, + "step": 11890 + }, + { + "epoch": 1.4981578864502314, + "grad_norm": 0.33681520819664, + "learning_rate": 0.00017636470775010563, + "loss": 0.37, + "step": 11895 + }, + { + "epoch": 1.4987876688604087, + "grad_norm": 0.2735719084739685, + "learning_rate": 0.00017625648916974452, + "loss": 0.3898, + "step": 11900 + }, + { + "epoch": 1.499417451270586, + "grad_norm": 0.2873845398426056, + "learning_rate": 0.00017614825648904902, + "loss": 0.387, + "step": 11905 + }, + { + "epoch": 1.5000472336807633, + "grad_norm": 0.2826070189476013, + "learning_rate": 0.00017604000976614243, + "loss": 0.3656, + "step": 11910 + }, + { + "epoch": 1.5006770160909406, + "grad_norm": 0.2709527015686035, + "learning_rate": 0.00017593174905915581, + "loss": 0.3583, + "step": 11915 + }, + { + "epoch": 1.5013067985011177, + "grad_norm": 0.3088144063949585, + "learning_rate": 0.00017582347442622755, + "loss": 0.3715, + "step": 11920 + }, + { + "epoch": 1.5019365809112952, + "grad_norm": 0.27996301651000977, + "learning_rate": 0.0001757151859255038, + "loss": 0.3636, + "step": 11925 + }, + { + "epoch": 1.5025663633214723, + "grad_norm": 0.3117114007472992, + "learning_rate": 0.00017560688361513766, + "loss": 0.351, + "step": 11930 + }, + { + "epoch": 1.5031961457316498, + "grad_norm": 0.32614433765411377, + "learning_rate": 0.00017549856755329012, + "loss": 0.3711, + "step": 11935 + }, + { + "epoch": 1.503825928141827, + "grad_norm": 0.23831017315387726, + "learning_rate": 0.0001753902377981294, + "loss": 0.3645, + "step": 11940 + }, + { + "epoch": 1.5044557105520044, + "grad_norm": 0.27338019013404846, + "learning_rate": 0.000175281894407831, + "loss": 0.3606, + "step": 11945 + }, + { + "epoch": 1.5050854929621815, + "grad_norm": 0.2813990116119385, + "learning_rate": 0.0001751735374405778, + "loss": 0.3637, + "step": 11950 + }, + { + "epoch": 1.5057152753723588, + "grad_norm": 0.2607782781124115, + "learning_rate": 0.00017506516695455992, + "loss": 0.3493, + "step": 11955 + }, + { + "epoch": 1.5063450577825361, + "grad_norm": 0.2825680077075958, + "learning_rate": 0.0001749567830079749, + "loss": 0.3474, + "step": 11960 + }, + { + "epoch": 1.5069748401927134, + "grad_norm": 0.2957023084163666, + "learning_rate": 0.00017484838565902735, + "loss": 0.3852, + "step": 11965 + }, + { + "epoch": 1.5076046226028907, + "grad_norm": 0.31363338232040405, + "learning_rate": 0.00017473997496592904, + "loss": 0.3944, + "step": 11970 + }, + { + "epoch": 1.5082344050130678, + "grad_norm": 0.271010160446167, + "learning_rate": 0.00017463155098689908, + "loss": 0.3667, + "step": 11975 + }, + { + "epoch": 1.5088641874232454, + "grad_norm": 0.28360188007354736, + "learning_rate": 0.00017452311378016362, + "loss": 0.3564, + "step": 11980 + }, + { + "epoch": 1.5094939698334224, + "grad_norm": 0.28345590829849243, + "learning_rate": 0.00017441466340395583, + "loss": 0.358, + "step": 11985 + }, + { + "epoch": 1.5101237522436, + "grad_norm": 0.23574601113796234, + "learning_rate": 0.00017430619991651614, + "loss": 0.3588, + "step": 11990 + }, + { + "epoch": 1.510753534653777, + "grad_norm": 0.32633906602859497, + "learning_rate": 0.0001741977233760919, + "loss": 0.3786, + "step": 11995 + }, + { + "epoch": 1.5113833170639546, + "grad_norm": 0.31216609477996826, + "learning_rate": 0.00017408923384093746, + "loss": 0.3949, + "step": 12000 + }, + { + "epoch": 1.5113833170639546, + "eval_loss": 0.3009350597858429, + "eval_runtime": 6.1573, + "eval_samples_per_second": 162.409, + "eval_steps_per_second": 10.232, + "step": 12000 + }, + { + "epoch": 1.5120130994741316, + "grad_norm": 0.2735341191291809, + "learning_rate": 0.00017398073136931416, + "loss": 0.3667, + "step": 12005 + }, + { + "epoch": 1.512642881884309, + "grad_norm": 0.3168368637561798, + "learning_rate": 0.0001738722160194904, + "loss": 0.3693, + "step": 12010 + }, + { + "epoch": 1.5132726642944863, + "grad_norm": 0.27563655376434326, + "learning_rate": 0.0001737636878497413, + "loss": 0.3721, + "step": 12015 + }, + { + "epoch": 1.5139024467046636, + "grad_norm": 0.27887552976608276, + "learning_rate": 0.00017365514691834898, + "loss": 0.402, + "step": 12020 + }, + { + "epoch": 1.5145322291148409, + "grad_norm": 0.30676189064979553, + "learning_rate": 0.0001735465932836024, + "loss": 0.3875, + "step": 12025 + }, + { + "epoch": 1.515162011525018, + "grad_norm": 0.30623871088027954, + "learning_rate": 0.00017343802700379746, + "loss": 0.3644, + "step": 12030 + }, + { + "epoch": 1.5157917939351955, + "grad_norm": 0.2534305453300476, + "learning_rate": 0.00017332944813723658, + "loss": 0.3753, + "step": 12035 + }, + { + "epoch": 1.5164215763453726, + "grad_norm": 0.29374125599861145, + "learning_rate": 0.00017322085674222916, + "loss": 0.3964, + "step": 12040 + }, + { + "epoch": 1.51705135875555, + "grad_norm": 0.2833009362220764, + "learning_rate": 0.00017311225287709126, + "loss": 0.3778, + "step": 12045 + }, + { + "epoch": 1.5176811411657272, + "grad_norm": 0.273299902677536, + "learning_rate": 0.0001730036366001456, + "loss": 0.3661, + "step": 12050 + }, + { + "epoch": 1.5183109235759045, + "grad_norm": 0.32840752601623535, + "learning_rate": 0.00017289500796972165, + "loss": 0.3564, + "step": 12055 + }, + { + "epoch": 1.5189407059860818, + "grad_norm": 0.289202481508255, + "learning_rate": 0.00017278636704415545, + "loss": 0.3885, + "step": 12060 + }, + { + "epoch": 1.519570488396259, + "grad_norm": 0.28327277302742004, + "learning_rate": 0.0001726777138817896, + "loss": 0.376, + "step": 12065 + }, + { + "epoch": 1.5202002708064364, + "grad_norm": 0.2617267370223999, + "learning_rate": 0.00017256904854097343, + "loss": 0.353, + "step": 12070 + }, + { + "epoch": 1.5208300532166137, + "grad_norm": 0.2693130671977997, + "learning_rate": 0.00017246037108006266, + "loss": 0.386, + "step": 12075 + }, + { + "epoch": 1.521459835626791, + "grad_norm": 0.260217547416687, + "learning_rate": 0.00017235168155741956, + "loss": 0.3773, + "step": 12080 + }, + { + "epoch": 1.522089618036968, + "grad_norm": 0.2806963622570038, + "learning_rate": 0.0001722429800314129, + "loss": 0.3703, + "step": 12085 + }, + { + "epoch": 1.5227194004471456, + "grad_norm": 0.2797011435031891, + "learning_rate": 0.00017213426656041787, + "loss": 0.3523, + "step": 12090 + }, + { + "epoch": 1.5233491828573227, + "grad_norm": 0.3413710296154022, + "learning_rate": 0.00017202554120281612, + "loss": 0.3825, + "step": 12095 + }, + { + "epoch": 1.5239789652675002, + "grad_norm": 0.2759542167186737, + "learning_rate": 0.0001719168040169956, + "loss": 0.346, + "step": 12100 + }, + { + "epoch": 1.5246087476776773, + "grad_norm": 0.28816723823547363, + "learning_rate": 0.00017180805506135068, + "loss": 0.3772, + "step": 12105 + }, + { + "epoch": 1.5252385300878546, + "grad_norm": 0.2563376724720001, + "learning_rate": 0.00017169929439428207, + "loss": 0.3661, + "step": 12110 + }, + { + "epoch": 1.525868312498032, + "grad_norm": 0.29572755098342896, + "learning_rate": 0.0001715905220741967, + "loss": 0.3428, + "step": 12115 + }, + { + "epoch": 1.5264980949082092, + "grad_norm": 0.28491732478141785, + "learning_rate": 0.0001714817381595078, + "loss": 0.3778, + "step": 12120 + }, + { + "epoch": 1.5271278773183865, + "grad_norm": 0.28429144620895386, + "learning_rate": 0.0001713729427086348, + "loss": 0.351, + "step": 12125 + }, + { + "epoch": 1.5277576597285638, + "grad_norm": 0.3044835925102234, + "learning_rate": 0.00017126413578000342, + "loss": 0.3651, + "step": 12130 + }, + { + "epoch": 1.5283874421387411, + "grad_norm": 0.30945730209350586, + "learning_rate": 0.0001711553174320453, + "loss": 0.3731, + "step": 12135 + }, + { + "epoch": 1.5290172245489182, + "grad_norm": 0.26389655470848083, + "learning_rate": 0.00017104648772319853, + "loss": 0.3527, + "step": 12140 + }, + { + "epoch": 1.5296470069590957, + "grad_norm": 0.3144720196723938, + "learning_rate": 0.0001709376467119071, + "loss": 0.3776, + "step": 12145 + }, + { + "epoch": 1.5302767893692728, + "grad_norm": 0.2860710918903351, + "learning_rate": 0.00017082879445662113, + "loss": 0.3575, + "step": 12150 + }, + { + "epoch": 1.5309065717794503, + "grad_norm": 0.2869095802307129, + "learning_rate": 0.00017071993101579674, + "loss": 0.3322, + "step": 12155 + }, + { + "epoch": 1.5315363541896274, + "grad_norm": 0.2524400055408478, + "learning_rate": 0.00017061105644789612, + "loss": 0.3743, + "step": 12160 + }, + { + "epoch": 1.5321661365998047, + "grad_norm": 0.2670304477214813, + "learning_rate": 0.00017050217081138736, + "loss": 0.3735, + "step": 12165 + }, + { + "epoch": 1.532795919009982, + "grad_norm": 0.2701478898525238, + "learning_rate": 0.00017039327416474456, + "loss": 0.3467, + "step": 12170 + }, + { + "epoch": 1.5334257014201593, + "grad_norm": 0.2941682040691376, + "learning_rate": 0.0001702843665664477, + "loss": 0.3895, + "step": 12175 + }, + { + "epoch": 1.5340554838303366, + "grad_norm": 0.28004932403564453, + "learning_rate": 0.00017017544807498264, + "loss": 0.3666, + "step": 12180 + }, + { + "epoch": 1.534685266240514, + "grad_norm": 0.29110807180404663, + "learning_rate": 0.00017006651874884116, + "loss": 0.3628, + "step": 12185 + }, + { + "epoch": 1.5353150486506912, + "grad_norm": 0.2467578798532486, + "learning_rate": 0.00016995757864652066, + "loss": 0.35, + "step": 12190 + }, + { + "epoch": 1.5359448310608683, + "grad_norm": 0.3148331046104431, + "learning_rate": 0.00016984862782652463, + "loss": 0.3535, + "step": 12195 + }, + { + "epoch": 1.5365746134710458, + "grad_norm": 0.28578343987464905, + "learning_rate": 0.00016973966634736202, + "loss": 0.3477, + "step": 12200 + }, + { + "epoch": 1.537204395881223, + "grad_norm": 0.24588525295257568, + "learning_rate": 0.0001696306942675477, + "loss": 0.3459, + "step": 12205 + }, + { + "epoch": 1.5378341782914005, + "grad_norm": 0.2754054665565491, + "learning_rate": 0.00016952171164560213, + "loss": 0.3555, + "step": 12210 + }, + { + "epoch": 1.5384639607015775, + "grad_norm": 0.28237447142601013, + "learning_rate": 0.00016941271854005148, + "loss": 0.3446, + "step": 12215 + }, + { + "epoch": 1.5390937431117548, + "grad_norm": 0.27689647674560547, + "learning_rate": 0.00016930371500942755, + "loss": 0.3651, + "step": 12220 + }, + { + "epoch": 1.5397235255219321, + "grad_norm": 0.29644525051116943, + "learning_rate": 0.0001691947011122677, + "loss": 0.3482, + "step": 12225 + }, + { + "epoch": 1.5403533079321095, + "grad_norm": 0.3168468475341797, + "learning_rate": 0.0001690856769071149, + "loss": 0.3859, + "step": 12230 + }, + { + "epoch": 1.5409830903422868, + "grad_norm": 0.282879501581192, + "learning_rate": 0.0001689766424525177, + "loss": 0.3742, + "step": 12235 + }, + { + "epoch": 1.541612872752464, + "grad_norm": 0.2539578676223755, + "learning_rate": 0.00016886759780702996, + "loss": 0.3467, + "step": 12240 + }, + { + "epoch": 1.5422426551626414, + "grad_norm": 0.3353635370731354, + "learning_rate": 0.00016875854302921122, + "loss": 0.3831, + "step": 12245 + }, + { + "epoch": 1.5428724375728184, + "grad_norm": 0.2890516519546509, + "learning_rate": 0.0001686494781776264, + "loss": 0.3672, + "step": 12250 + }, + { + "epoch": 1.543502219982996, + "grad_norm": 0.3136516213417053, + "learning_rate": 0.00016854040331084583, + "loss": 0.37, + "step": 12255 + }, + { + "epoch": 1.544132002393173, + "grad_norm": 0.29757821559906006, + "learning_rate": 0.0001684313184874451, + "loss": 0.3681, + "step": 12260 + }, + { + "epoch": 1.5447617848033506, + "grad_norm": 0.3504684269428253, + "learning_rate": 0.0001683222237660054, + "loss": 0.3868, + "step": 12265 + }, + { + "epoch": 1.5453915672135277, + "grad_norm": 0.25241127610206604, + "learning_rate": 0.00016821311920511297, + "loss": 0.3425, + "step": 12270 + }, + { + "epoch": 1.546021349623705, + "grad_norm": 0.27286654710769653, + "learning_rate": 0.00016810400486335953, + "loss": 0.3604, + "step": 12275 + }, + { + "epoch": 1.5466511320338823, + "grad_norm": 0.3104652166366577, + "learning_rate": 0.0001679948807993419, + "loss": 0.3487, + "step": 12280 + }, + { + "epoch": 1.5472809144440596, + "grad_norm": 0.2972196042537689, + "learning_rate": 0.00016788574707166226, + "loss": 0.3555, + "step": 12285 + }, + { + "epoch": 1.5479106968542369, + "grad_norm": 0.29232388734817505, + "learning_rate": 0.00016777660373892787, + "loss": 0.3654, + "step": 12290 + }, + { + "epoch": 1.5485404792644142, + "grad_norm": 0.29798245429992676, + "learning_rate": 0.00016766745085975126, + "loss": 0.3575, + "step": 12295 + }, + { + "epoch": 1.5491702616745915, + "grad_norm": 0.2721775472164154, + "learning_rate": 0.0001675582884927499, + "loss": 0.3409, + "step": 12300 + }, + { + "epoch": 1.5498000440847686, + "grad_norm": 0.3131150007247925, + "learning_rate": 0.00016744911669654662, + "loss": 0.3695, + "step": 12305 + }, + { + "epoch": 1.550429826494946, + "grad_norm": 0.29543280601501465, + "learning_rate": 0.00016733993552976901, + "loss": 0.3572, + "step": 12310 + }, + { + "epoch": 1.5510596089051232, + "grad_norm": 0.3287052512168884, + "learning_rate": 0.00016723074505105, + "loss": 0.3681, + "step": 12315 + }, + { + "epoch": 1.5516893913153007, + "grad_norm": 0.2833183705806732, + "learning_rate": 0.0001671215453190273, + "loss": 0.3709, + "step": 12320 + }, + { + "epoch": 1.5523191737254778, + "grad_norm": 0.2558510899543762, + "learning_rate": 0.00016701233639234363, + "loss": 0.3404, + "step": 12325 + }, + { + "epoch": 1.552948956135655, + "grad_norm": 0.2524779438972473, + "learning_rate": 0.0001669031183296467, + "loss": 0.3492, + "step": 12330 + }, + { + "epoch": 1.5535787385458324, + "grad_norm": 0.2844880521297455, + "learning_rate": 0.00016679389118958918, + "loss": 0.3538, + "step": 12335 + }, + { + "epoch": 1.5542085209560097, + "grad_norm": 0.28060173988342285, + "learning_rate": 0.0001666846550308285, + "loss": 0.3615, + "step": 12340 + }, + { + "epoch": 1.554838303366187, + "grad_norm": 0.2490835040807724, + "learning_rate": 0.00016657540991202687, + "loss": 0.3655, + "step": 12345 + }, + { + "epoch": 1.5554680857763643, + "grad_norm": 0.27524054050445557, + "learning_rate": 0.00016646615589185153, + "loss": 0.3412, + "step": 12350 + }, + { + "epoch": 1.5560978681865416, + "grad_norm": 0.31142935156822205, + "learning_rate": 0.00016635689302897435, + "loss": 0.347, + "step": 12355 + }, + { + "epoch": 1.5567276505967187, + "grad_norm": 0.28053995966911316, + "learning_rate": 0.00016624762138207197, + "loss": 0.3838, + "step": 12360 + }, + { + "epoch": 1.5573574330068962, + "grad_norm": 0.2476169764995575, + "learning_rate": 0.0001661383410098258, + "loss": 0.3636, + "step": 12365 + }, + { + "epoch": 1.5579872154170733, + "grad_norm": 0.4054109752178192, + "learning_rate": 0.00016602905197092183, + "loss": 0.3657, + "step": 12370 + }, + { + "epoch": 1.5586169978272508, + "grad_norm": 0.2735072672367096, + "learning_rate": 0.00016591975432405084, + "loss": 0.3593, + "step": 12375 + }, + { + "epoch": 1.559246780237428, + "grad_norm": 0.2994532883167267, + "learning_rate": 0.00016581044812790817, + "loss": 0.3641, + "step": 12380 + }, + { + "epoch": 1.5598765626476052, + "grad_norm": 0.263090044260025, + "learning_rate": 0.0001657011334411936, + "loss": 0.3711, + "step": 12385 + }, + { + "epoch": 1.5605063450577825, + "grad_norm": 0.25073063373565674, + "learning_rate": 0.0001655918103226118, + "loss": 0.3554, + "step": 12390 + }, + { + "epoch": 1.5611361274679598, + "grad_norm": 0.2575080096721649, + "learning_rate": 0.00016548247883087168, + "loss": 0.3744, + "step": 12395 + }, + { + "epoch": 1.5617659098781371, + "grad_norm": 0.2630578577518463, + "learning_rate": 0.00016537313902468677, + "loss": 0.3501, + "step": 12400 + }, + { + "epoch": 1.5623956922883144, + "grad_norm": 0.3097805678844452, + "learning_rate": 0.00016526379096277503, + "loss": 0.3586, + "step": 12405 + }, + { + "epoch": 1.5630254746984917, + "grad_norm": 0.3104281723499298, + "learning_rate": 0.0001651544347038589, + "loss": 0.3643, + "step": 12410 + }, + { + "epoch": 1.5636552571086688, + "grad_norm": 0.3604758381843567, + "learning_rate": 0.0001650450703066652, + "loss": 0.3645, + "step": 12415 + }, + { + "epoch": 1.5642850395188463, + "grad_norm": 0.30638590455055237, + "learning_rate": 0.000164935697829925, + "loss": 0.3572, + "step": 12420 + }, + { + "epoch": 1.5649148219290234, + "grad_norm": 0.27940669655799866, + "learning_rate": 0.00016482631733237397, + "loss": 0.3636, + "step": 12425 + }, + { + "epoch": 1.565544604339201, + "grad_norm": 0.28857216238975525, + "learning_rate": 0.00016471692887275185, + "loss": 0.3601, + "step": 12430 + }, + { + "epoch": 1.566174386749378, + "grad_norm": 0.2992657721042633, + "learning_rate": 0.0001646075325098027, + "loss": 0.3621, + "step": 12435 + }, + { + "epoch": 1.5668041691595553, + "grad_norm": 0.28050917387008667, + "learning_rate": 0.00016449812830227498, + "loss": 0.3623, + "step": 12440 + }, + { + "epoch": 1.5674339515697326, + "grad_norm": 0.269634485244751, + "learning_rate": 0.0001643887163089212, + "loss": 0.3375, + "step": 12445 + }, + { + "epoch": 1.56806373397991, + "grad_norm": 0.2825991213321686, + "learning_rate": 0.00016427929658849807, + "loss": 0.3523, + "step": 12450 + }, + { + "epoch": 1.5686935163900873, + "grad_norm": 0.3219839334487915, + "learning_rate": 0.00016416986919976645, + "loss": 0.3588, + "step": 12455 + }, + { + "epoch": 1.5693232988002646, + "grad_norm": 0.2681691646575928, + "learning_rate": 0.00016406043420149146, + "loss": 0.3466, + "step": 12460 + }, + { + "epoch": 1.5699530812104419, + "grad_norm": 0.2719057500362396, + "learning_rate": 0.0001639509916524421, + "loss": 0.3599, + "step": 12465 + }, + { + "epoch": 1.570582863620619, + "grad_norm": 0.24405649304389954, + "learning_rate": 0.00016384154161139158, + "loss": 0.3402, + "step": 12470 + }, + { + "epoch": 1.5712126460307965, + "grad_norm": 0.306537002325058, + "learning_rate": 0.00016373208413711696, + "loss": 0.3283, + "step": 12475 + }, + { + "epoch": 1.5718424284409735, + "grad_norm": 0.28490665555000305, + "learning_rate": 0.0001636226192883996, + "loss": 0.3529, + "step": 12480 + }, + { + "epoch": 1.572472210851151, + "grad_norm": 0.2510652542114258, + "learning_rate": 0.00016351314712402442, + "loss": 0.3228, + "step": 12485 + }, + { + "epoch": 1.5731019932613282, + "grad_norm": 0.2670060694217682, + "learning_rate": 0.0001634036677027806, + "loss": 0.3592, + "step": 12490 + }, + { + "epoch": 1.5737317756715055, + "grad_norm": 0.29240545630455017, + "learning_rate": 0.00016329418108346105, + "loss": 0.3717, + "step": 12495 + }, + { + "epoch": 1.5743615580816828, + "grad_norm": 0.29088887572288513, + "learning_rate": 0.00016318468732486255, + "loss": 0.3679, + "step": 12500 + }, + { + "epoch": 1.57499134049186, + "grad_norm": 0.25105100870132446, + "learning_rate": 0.0001630751864857858, + "loss": 0.3464, + "step": 12505 + }, + { + "epoch": 1.5756211229020374, + "grad_norm": 0.26624953746795654, + "learning_rate": 0.00016296567862503526, + "loss": 0.3552, + "step": 12510 + }, + { + "epoch": 1.5762509053122147, + "grad_norm": 0.28500837087631226, + "learning_rate": 0.00016285616380141914, + "loss": 0.3591, + "step": 12515 + }, + { + "epoch": 1.576880687722392, + "grad_norm": 0.2937677502632141, + "learning_rate": 0.00016274664207374936, + "loss": 0.3664, + "step": 12520 + }, + { + "epoch": 1.577510470132569, + "grad_norm": 0.28588148951530457, + "learning_rate": 0.00016263711350084165, + "loss": 0.3767, + "step": 12525 + }, + { + "epoch": 1.5781402525427466, + "grad_norm": 0.31547772884368896, + "learning_rate": 0.0001625275781415153, + "loss": 0.3521, + "step": 12530 + }, + { + "epoch": 1.5787700349529237, + "grad_norm": 0.29322996735572815, + "learning_rate": 0.00016241803605459334, + "loss": 0.3777, + "step": 12535 + }, + { + "epoch": 1.5793998173631012, + "grad_norm": 0.29141756892204285, + "learning_rate": 0.00016230848729890238, + "loss": 0.3367, + "step": 12540 + }, + { + "epoch": 1.5800295997732783, + "grad_norm": 0.2583523094654083, + "learning_rate": 0.00016219893193327258, + "loss": 0.3473, + "step": 12545 + }, + { + "epoch": 1.5806593821834556, + "grad_norm": 0.26929906010627747, + "learning_rate": 0.00016208937001653765, + "loss": 0.3622, + "step": 12550 + }, + { + "epoch": 1.581289164593633, + "grad_norm": 0.2727062702178955, + "learning_rate": 0.0001619798016075349, + "loss": 0.3607, + "step": 12555 + }, + { + "epoch": 1.5819189470038102, + "grad_norm": 0.35366252064704895, + "learning_rate": 0.000161870226765105, + "loss": 0.3453, + "step": 12560 + }, + { + "epoch": 1.5825487294139875, + "grad_norm": 0.30689889192581177, + "learning_rate": 0.00016176064554809225, + "loss": 0.3672, + "step": 12565 + }, + { + "epoch": 1.5831785118241648, + "grad_norm": 0.2855357825756073, + "learning_rate": 0.00016165105801534414, + "loss": 0.3715, + "step": 12570 + }, + { + "epoch": 1.583808294234342, + "grad_norm": 0.33706697821617126, + "learning_rate": 0.00016154146422571176, + "loss": 0.3645, + "step": 12575 + }, + { + "epoch": 1.5844380766445192, + "grad_norm": 0.24410569667816162, + "learning_rate": 0.00016143186423804944, + "loss": 0.3576, + "step": 12580 + }, + { + "epoch": 1.5850678590546967, + "grad_norm": 0.33356451988220215, + "learning_rate": 0.00016132225811121492, + "loss": 0.3774, + "step": 12585 + }, + { + "epoch": 1.5856976414648738, + "grad_norm": 0.2804293632507324, + "learning_rate": 0.00016121264590406912, + "loss": 0.3656, + "step": 12590 + }, + { + "epoch": 1.5863274238750513, + "grad_norm": 0.29394668340682983, + "learning_rate": 0.0001611030276754764, + "loss": 0.3468, + "step": 12595 + }, + { + "epoch": 1.5869572062852284, + "grad_norm": 0.2657965421676636, + "learning_rate": 0.0001609934034843042, + "loss": 0.3518, + "step": 12600 + }, + { + "epoch": 1.5875869886954057, + "grad_norm": 0.25583842396736145, + "learning_rate": 0.00016088377338942318, + "loss": 0.361, + "step": 12605 + }, + { + "epoch": 1.588216771105583, + "grad_norm": 0.27986687421798706, + "learning_rate": 0.00016077413744970722, + "loss": 0.3771, + "step": 12610 + }, + { + "epoch": 1.5888465535157603, + "grad_norm": 0.3200220763683319, + "learning_rate": 0.0001606644957240334, + "loss": 0.3666, + "step": 12615 + }, + { + "epoch": 1.5894763359259376, + "grad_norm": 0.29622554779052734, + "learning_rate": 0.00016055484827128173, + "loss": 0.3469, + "step": 12620 + }, + { + "epoch": 1.590106118336115, + "grad_norm": 0.3073137700557709, + "learning_rate": 0.00016044519515033545, + "loss": 0.3382, + "step": 12625 + }, + { + "epoch": 1.5907359007462922, + "grad_norm": 0.31342241168022156, + "learning_rate": 0.00016033553642008077, + "loss": 0.357, + "step": 12630 + }, + { + "epoch": 1.5913656831564693, + "grad_norm": 0.2913351058959961, + "learning_rate": 0.00016022587213940698, + "loss": 0.3487, + "step": 12635 + }, + { + "epoch": 1.5919954655666468, + "grad_norm": 0.2823300361633301, + "learning_rate": 0.00016011620236720621, + "loss": 0.3367, + "step": 12640 + }, + { + "epoch": 1.592625247976824, + "grad_norm": 0.3134678304195404, + "learning_rate": 0.00016000652716237373, + "loss": 0.3393, + "step": 12645 + }, + { + "epoch": 1.5932550303870014, + "grad_norm": 0.3235761821269989, + "learning_rate": 0.0001598968465838076, + "loss": 0.3752, + "step": 12650 + }, + { + "epoch": 1.5938848127971785, + "grad_norm": 0.26606664061546326, + "learning_rate": 0.00015978716069040875, + "loss": 0.3413, + "step": 12655 + }, + { + "epoch": 1.5945145952073558, + "grad_norm": 0.30575528740882874, + "learning_rate": 0.0001596774695410811, + "loss": 0.3715, + "step": 12660 + }, + { + "epoch": 1.5951443776175331, + "grad_norm": 0.3017826974391937, + "learning_rate": 0.0001595677731947312, + "loss": 0.3586, + "step": 12665 + }, + { + "epoch": 1.5957741600277104, + "grad_norm": 0.3203217089176178, + "learning_rate": 0.00015945807171026855, + "loss": 0.3753, + "step": 12670 + }, + { + "epoch": 1.5964039424378877, + "grad_norm": 0.2615835666656494, + "learning_rate": 0.00015934836514660536, + "loss": 0.3641, + "step": 12675 + }, + { + "epoch": 1.5970337248480648, + "grad_norm": 0.2814265191555023, + "learning_rate": 0.00015923865356265652, + "loss": 0.3467, + "step": 12680 + }, + { + "epoch": 1.5976635072582424, + "grad_norm": 0.28240394592285156, + "learning_rate": 0.00015912893701733975, + "loss": 0.3405, + "step": 12685 + }, + { + "epoch": 1.5982932896684194, + "grad_norm": 0.2675967514514923, + "learning_rate": 0.0001590192155695752, + "loss": 0.3341, + "step": 12690 + }, + { + "epoch": 1.598923072078597, + "grad_norm": 0.33408063650131226, + "learning_rate": 0.00015890948927828593, + "loss": 0.3431, + "step": 12695 + }, + { + "epoch": 1.599552854488774, + "grad_norm": 0.2793383300304413, + "learning_rate": 0.00015879975820239737, + "loss": 0.3334, + "step": 12700 + }, + { + "epoch": 1.6001826368989516, + "grad_norm": 0.29299893975257874, + "learning_rate": 0.00015869002240083765, + "loss": 0.3479, + "step": 12705 + }, + { + "epoch": 1.6008124193091287, + "grad_norm": 0.2782432436943054, + "learning_rate": 0.0001585802819325374, + "loss": 0.3491, + "step": 12710 + }, + { + "epoch": 1.601442201719306, + "grad_norm": 0.2812289297580719, + "learning_rate": 0.00015847053685642977, + "loss": 0.3406, + "step": 12715 + }, + { + "epoch": 1.6020719841294833, + "grad_norm": 0.2464970499277115, + "learning_rate": 0.00015836078723145032, + "loss": 0.3539, + "step": 12720 + }, + { + "epoch": 1.6027017665396606, + "grad_norm": 0.3087675869464874, + "learning_rate": 0.0001582510331165372, + "loss": 0.356, + "step": 12725 + }, + { + "epoch": 1.6033315489498379, + "grad_norm": 0.2726532816886902, + "learning_rate": 0.0001581412745706308, + "loss": 0.3443, + "step": 12730 + }, + { + "epoch": 1.603961331360015, + "grad_norm": 0.28401410579681396, + "learning_rate": 0.00015803151165267397, + "loss": 0.3359, + "step": 12735 + }, + { + "epoch": 1.6045911137701925, + "grad_norm": 0.2700473666191101, + "learning_rate": 0.00015792174442161194, + "loss": 0.3523, + "step": 12740 + }, + { + "epoch": 1.6052208961803696, + "grad_norm": 0.32183146476745605, + "learning_rate": 0.00015781197293639223, + "loss": 0.3765, + "step": 12745 + }, + { + "epoch": 1.605850678590547, + "grad_norm": 0.2805304229259491, + "learning_rate": 0.0001577021972559646, + "loss": 0.3546, + "step": 12750 + }, + { + "epoch": 1.6064804610007242, + "grad_norm": 0.30137768387794495, + "learning_rate": 0.00015759241743928108, + "loss": 0.3721, + "step": 12755 + }, + { + "epoch": 1.6071102434109017, + "grad_norm": 0.25476494431495667, + "learning_rate": 0.00015748263354529597, + "loss": 0.3281, + "step": 12760 + }, + { + "epoch": 1.6077400258210788, + "grad_norm": 0.3311167061328888, + "learning_rate": 0.0001573728456329657, + "loss": 0.3875, + "step": 12765 + }, + { + "epoch": 1.608369808231256, + "grad_norm": 0.27148258686065674, + "learning_rate": 0.00015726305376124897, + "loss": 0.3547, + "step": 12770 + }, + { + "epoch": 1.6089995906414334, + "grad_norm": 0.25366437435150146, + "learning_rate": 0.00015715325798910644, + "loss": 0.3423, + "step": 12775 + }, + { + "epoch": 1.6096293730516107, + "grad_norm": 0.2699160873889923, + "learning_rate": 0.000157043458375501, + "loss": 0.3347, + "step": 12780 + }, + { + "epoch": 1.610259155461788, + "grad_norm": 0.2672334611415863, + "learning_rate": 0.00015693365497939743, + "loss": 0.3354, + "step": 12785 + }, + { + "epoch": 1.610888937871965, + "grad_norm": 0.3269018828868866, + "learning_rate": 0.00015682384785976284, + "loss": 0.3427, + "step": 12790 + }, + { + "epoch": 1.6115187202821426, + "grad_norm": 0.2637929320335388, + "learning_rate": 0.00015671403707556605, + "loss": 0.3501, + "step": 12795 + }, + { + "epoch": 1.6121485026923197, + "grad_norm": 0.2606005072593689, + "learning_rate": 0.00015660422268577801, + "loss": 0.3387, + "step": 12800 + }, + { + "epoch": 1.6127782851024972, + "grad_norm": 0.30220791697502136, + "learning_rate": 0.00015649440474937152, + "loss": 0.3489, + "step": 12805 + }, + { + "epoch": 1.6134080675126743, + "grad_norm": 0.29726284742355347, + "learning_rate": 0.0001563845833253213, + "loss": 0.3358, + "step": 12810 + }, + { + "epoch": 1.6140378499228518, + "grad_norm": 0.2928326427936554, + "learning_rate": 0.000156274758472604, + "loss": 0.3236, + "step": 12815 + }, + { + "epoch": 1.614667632333029, + "grad_norm": 0.31645599007606506, + "learning_rate": 0.0001561649302501981, + "loss": 0.3571, + "step": 12820 + }, + { + "epoch": 1.6152974147432062, + "grad_norm": 0.26705339550971985, + "learning_rate": 0.00015605509871708382, + "loss": 0.3671, + "step": 12825 + }, + { + "epoch": 1.6159271971533835, + "grad_norm": 0.2691219449043274, + "learning_rate": 0.00015594526393224322, + "loss": 0.3452, + "step": 12830 + }, + { + "epoch": 1.6165569795635608, + "grad_norm": 0.2822478413581848, + "learning_rate": 0.00015583542595466005, + "loss": 0.3273, + "step": 12835 + }, + { + "epoch": 1.6171867619737381, + "grad_norm": 0.2974461019039154, + "learning_rate": 0.00015572558484331994, + "loss": 0.3652, + "step": 12840 + }, + { + "epoch": 1.6178165443839152, + "grad_norm": 0.2611928880214691, + "learning_rate": 0.00015561574065720986, + "loss": 0.3445, + "step": 12845 + }, + { + "epoch": 1.6184463267940927, + "grad_norm": 0.2836850583553314, + "learning_rate": 0.00015550589345531885, + "loss": 0.3326, + "step": 12850 + }, + { + "epoch": 1.6190761092042698, + "grad_norm": 0.2735482156276703, + "learning_rate": 0.00015539604329663725, + "loss": 0.3532, + "step": 12855 + }, + { + "epoch": 1.6197058916144473, + "grad_norm": 0.34394770860671997, + "learning_rate": 0.0001552861902401572, + "loss": 0.3532, + "step": 12860 + }, + { + "epoch": 1.6203356740246244, + "grad_norm": 0.2786300778388977, + "learning_rate": 0.0001551763343448722, + "loss": 0.3591, + "step": 12865 + }, + { + "epoch": 1.6209654564348017, + "grad_norm": 0.26574140787124634, + "learning_rate": 0.00015506647566977737, + "loss": 0.3527, + "step": 12870 + }, + { + "epoch": 1.621595238844979, + "grad_norm": 0.23986276984214783, + "learning_rate": 0.00015495661427386944, + "loss": 0.3437, + "step": 12875 + }, + { + "epoch": 1.6222250212551563, + "grad_norm": 0.29332754015922546, + "learning_rate": 0.0001548467502161464, + "loss": 0.3323, + "step": 12880 + }, + { + "epoch": 1.6228548036653336, + "grad_norm": 0.34338971972465515, + "learning_rate": 0.0001547368835556078, + "loss": 0.3367, + "step": 12885 + }, + { + "epoch": 1.623484586075511, + "grad_norm": 0.3112575113773346, + "learning_rate": 0.00015462701435125451, + "loss": 0.3392, + "step": 12890 + }, + { + "epoch": 1.6241143684856882, + "grad_norm": 0.26299479603767395, + "learning_rate": 0.0001545171426620888, + "loss": 0.3194, + "step": 12895 + }, + { + "epoch": 1.6247441508958653, + "grad_norm": 0.27403828501701355, + "learning_rate": 0.00015440726854711436, + "loss": 0.3344, + "step": 12900 + }, + { + "epoch": 1.6253739333060429, + "grad_norm": 0.2603330910205841, + "learning_rate": 0.000154297392065336, + "loss": 0.3564, + "step": 12905 + }, + { + "epoch": 1.62600371571622, + "grad_norm": 0.2812626361846924, + "learning_rate": 0.00015418751327575994, + "loss": 0.3583, + "step": 12910 + }, + { + "epoch": 1.6266334981263975, + "grad_norm": 0.27280038595199585, + "learning_rate": 0.0001540776322373936, + "loss": 0.3568, + "step": 12915 + }, + { + "epoch": 1.6272632805365745, + "grad_norm": 0.3073441982269287, + "learning_rate": 0.0001539677490092456, + "loss": 0.3336, + "step": 12920 + }, + { + "epoch": 1.6278930629467518, + "grad_norm": 0.2868177890777588, + "learning_rate": 0.00015385786365032576, + "loss": 0.3455, + "step": 12925 + }, + { + "epoch": 1.6285228453569291, + "grad_norm": 0.2661624550819397, + "learning_rate": 0.000153747976219645, + "loss": 0.3377, + "step": 12930 + }, + { + "epoch": 1.6291526277671065, + "grad_norm": 0.3179323673248291, + "learning_rate": 0.0001536380867762154, + "loss": 0.3706, + "step": 12935 + }, + { + "epoch": 1.6297824101772838, + "grad_norm": 0.30941662192344666, + "learning_rate": 0.0001535281953790501, + "loss": 0.3514, + "step": 12940 + }, + { + "epoch": 1.630412192587461, + "grad_norm": 0.3018413782119751, + "learning_rate": 0.0001534183020871633, + "loss": 0.3516, + "step": 12945 + }, + { + "epoch": 1.6310419749976384, + "grad_norm": 0.34621462225914, + "learning_rate": 0.00015330840695957019, + "loss": 0.3522, + "step": 12950 + }, + { + "epoch": 1.6316717574078154, + "grad_norm": 0.2858521342277527, + "learning_rate": 0.000153198510055287, + "loss": 0.3378, + "step": 12955 + }, + { + "epoch": 1.632301539817993, + "grad_norm": 0.2880783975124359, + "learning_rate": 0.00015308861143333076, + "loss": 0.3615, + "step": 12960 + }, + { + "epoch": 1.63293132222817, + "grad_norm": 0.24324443936347961, + "learning_rate": 0.00015297871115271976, + "loss": 0.3346, + "step": 12965 + }, + { + "epoch": 1.6335611046383476, + "grad_norm": 0.26982635259628296, + "learning_rate": 0.00015286880927247273, + "loss": 0.3423, + "step": 12970 + }, + { + "epoch": 1.6341908870485247, + "grad_norm": 0.27813199162483215, + "learning_rate": 0.00015275890585160961, + "loss": 0.3545, + "step": 12975 + }, + { + "epoch": 1.634820669458702, + "grad_norm": 0.27575090527534485, + "learning_rate": 0.00015264900094915106, + "loss": 0.3357, + "step": 12980 + }, + { + "epoch": 1.6354504518688793, + "grad_norm": 0.25838521122932434, + "learning_rate": 0.00015253909462411847, + "loss": 0.3244, + "step": 12985 + }, + { + "epoch": 1.6360802342790566, + "grad_norm": 0.2889041602611542, + "learning_rate": 0.00015242918693553404, + "loss": 0.3297, + "step": 12990 + }, + { + "epoch": 1.6367100166892339, + "grad_norm": 0.3074316680431366, + "learning_rate": 0.0001523192779424208, + "loss": 0.3525, + "step": 12995 + }, + { + "epoch": 1.6373397990994112, + "grad_norm": 0.26425209641456604, + "learning_rate": 0.00015220936770380227, + "loss": 0.3493, + "step": 13000 + }, + { + "epoch": 1.6373397990994112, + "eval_loss": 0.30248695611953735, + "eval_runtime": 6.1682, + "eval_samples_per_second": 162.123, + "eval_steps_per_second": 10.214, + "step": 13000 + }, + { + "epoch": 1.6379695815095885, + "grad_norm": 0.2873767018318176, + "learning_rate": 0.00015209945627870283, + "loss": 0.3838, + "step": 13005 + }, + { + "epoch": 1.6385993639197656, + "grad_norm": 0.2895953059196472, + "learning_rate": 0.0001519895437261474, + "loss": 0.3509, + "step": 13010 + }, + { + "epoch": 1.639229146329943, + "grad_norm": 0.2910915017127991, + "learning_rate": 0.0001518796301051616, + "loss": 0.326, + "step": 13015 + }, + { + "epoch": 1.6398589287401202, + "grad_norm": 0.2735256552696228, + "learning_rate": 0.00015176971547477142, + "loss": 0.366, + "step": 13020 + }, + { + "epoch": 1.6404887111502977, + "grad_norm": 0.3099430501461029, + "learning_rate": 0.00015165979989400366, + "loss": 0.3226, + "step": 13025 + }, + { + "epoch": 1.6411184935604748, + "grad_norm": 0.2963193655014038, + "learning_rate": 0.00015154988342188543, + "loss": 0.3301, + "step": 13030 + }, + { + "epoch": 1.641748275970652, + "grad_norm": 0.28377631306648254, + "learning_rate": 0.0001514399661174444, + "loss": 0.3143, + "step": 13035 + }, + { + "epoch": 1.6423780583808294, + "grad_norm": 0.25847306847572327, + "learning_rate": 0.00015133004803970866, + "loss": 0.325, + "step": 13040 + }, + { + "epoch": 1.6430078407910067, + "grad_norm": 0.2919864058494568, + "learning_rate": 0.00015122012924770675, + "loss": 0.3543, + "step": 13045 + }, + { + "epoch": 1.643637623201184, + "grad_norm": 0.31185677647590637, + "learning_rate": 0.00015111020980046756, + "loss": 0.3546, + "step": 13050 + }, + { + "epoch": 1.6442674056113613, + "grad_norm": 0.27933362126350403, + "learning_rate": 0.00015100028975702036, + "loss": 0.3344, + "step": 13055 + }, + { + "epoch": 1.6448971880215386, + "grad_norm": 0.2898799777030945, + "learning_rate": 0.00015089036917639468, + "loss": 0.3473, + "step": 13060 + }, + { + "epoch": 1.6455269704317157, + "grad_norm": 0.31464672088623047, + "learning_rate": 0.00015078044811762047, + "loss": 0.3418, + "step": 13065 + }, + { + "epoch": 1.6461567528418932, + "grad_norm": 0.2676648199558258, + "learning_rate": 0.00015067052663972775, + "loss": 0.3331, + "step": 13070 + }, + { + "epoch": 1.6467865352520703, + "grad_norm": 0.30420759320259094, + "learning_rate": 0.0001505606048017469, + "loss": 0.3544, + "step": 13075 + }, + { + "epoch": 1.6474163176622478, + "grad_norm": 0.3160271942615509, + "learning_rate": 0.00015045068266270848, + "loss": 0.3526, + "step": 13080 + }, + { + "epoch": 1.648046100072425, + "grad_norm": 0.31276562809944153, + "learning_rate": 0.0001503407602816432, + "loss": 0.3213, + "step": 13085 + }, + { + "epoch": 1.6486758824826022, + "grad_norm": 0.316756933927536, + "learning_rate": 0.00015023083771758183, + "loss": 0.3446, + "step": 13090 + }, + { + "epoch": 1.6493056648927795, + "grad_norm": 0.23935994505882263, + "learning_rate": 0.00015012091502955533, + "loss": 0.3416, + "step": 13095 + }, + { + "epoch": 1.6499354473029568, + "grad_norm": 0.2719472348690033, + "learning_rate": 0.00015001099227659475, + "loss": 0.3567, + "step": 13100 + }, + { + "epoch": 1.6505652297131341, + "grad_norm": 0.3108009696006775, + "learning_rate": 0.00014990106951773098, + "loss": 0.3524, + "step": 13105 + }, + { + "epoch": 1.6511950121233114, + "grad_norm": 0.3002628982067108, + "learning_rate": 0.00014979114681199524, + "loss": 0.3314, + "step": 13110 + }, + { + "epoch": 1.6518247945334887, + "grad_norm": 0.32287389039993286, + "learning_rate": 0.0001496812242184184, + "loss": 0.3376, + "step": 13115 + }, + { + "epoch": 1.6524545769436658, + "grad_norm": 0.27193522453308105, + "learning_rate": 0.0001495713017960314, + "loss": 0.3443, + "step": 13120 + }, + { + "epoch": 1.6530843593538433, + "grad_norm": 0.30429700016975403, + "learning_rate": 0.00014946137960386512, + "loss": 0.3345, + "step": 13125 + }, + { + "epoch": 1.6537141417640204, + "grad_norm": 0.2757263481616974, + "learning_rate": 0.00014935145770095034, + "loss": 0.3405, + "step": 13130 + }, + { + "epoch": 1.654343924174198, + "grad_norm": 0.274728000164032, + "learning_rate": 0.00014924153614631754, + "loss": 0.3199, + "step": 13135 + }, + { + "epoch": 1.654973706584375, + "grad_norm": 0.2992052137851715, + "learning_rate": 0.0001491316149989972, + "loss": 0.3641, + "step": 13140 + }, + { + "epoch": 1.6556034889945523, + "grad_norm": 0.28687140345573425, + "learning_rate": 0.00014902169431801947, + "loss": 0.3586, + "step": 13145 + }, + { + "epoch": 1.6562332714047296, + "grad_norm": 0.31748563051223755, + "learning_rate": 0.00014891177416241416, + "loss": 0.3318, + "step": 13150 + }, + { + "epoch": 1.656863053814907, + "grad_norm": 0.2876995801925659, + "learning_rate": 0.00014880185459121103, + "loss": 0.3446, + "step": 13155 + }, + { + "epoch": 1.6574928362250843, + "grad_norm": 0.2874261736869812, + "learning_rate": 0.00014869193566343934, + "loss": 0.3058, + "step": 13160 + }, + { + "epoch": 1.6581226186352616, + "grad_norm": 0.2720824182033539, + "learning_rate": 0.00014858201743812806, + "loss": 0.3332, + "step": 13165 + }, + { + "epoch": 1.6587524010454389, + "grad_norm": 0.27765411138534546, + "learning_rate": 0.00014847209997430582, + "loss": 0.3428, + "step": 13170 + }, + { + "epoch": 1.659382183455616, + "grad_norm": 0.28871631622314453, + "learning_rate": 0.0001483621833310008, + "loss": 0.3325, + "step": 13175 + }, + { + "epoch": 1.6600119658657935, + "grad_norm": 0.2875865697860718, + "learning_rate": 0.00014825226756724077, + "loss": 0.3527, + "step": 13180 + }, + { + "epoch": 1.6606417482759706, + "grad_norm": 0.2774711549282074, + "learning_rate": 0.00014814235274205297, + "loss": 0.335, + "step": 13185 + }, + { + "epoch": 1.661271530686148, + "grad_norm": 0.2727283537387848, + "learning_rate": 0.00014803243891446416, + "loss": 0.3393, + "step": 13190 + }, + { + "epoch": 1.6619013130963252, + "grad_norm": 0.27977532148361206, + "learning_rate": 0.00014792252614350055, + "loss": 0.3566, + "step": 13195 + }, + { + "epoch": 1.6625310955065025, + "grad_norm": 0.29823413491249084, + "learning_rate": 0.0001478126144881879, + "loss": 0.3287, + "step": 13200 + }, + { + "epoch": 1.6631608779166798, + "grad_norm": 0.2849923372268677, + "learning_rate": 0.00014770270400755125, + "loss": 0.3166, + "step": 13205 + }, + { + "epoch": 1.663790660326857, + "grad_norm": 0.259219229221344, + "learning_rate": 0.00014759279476061503, + "loss": 0.336, + "step": 13210 + }, + { + "epoch": 1.6644204427370344, + "grad_norm": 0.2877882719039917, + "learning_rate": 0.00014748288680640302, + "loss": 0.3506, + "step": 13215 + }, + { + "epoch": 1.6650502251472117, + "grad_norm": 0.2952651381492615, + "learning_rate": 0.00014737298020393828, + "loss": 0.3562, + "step": 13220 + }, + { + "epoch": 1.665680007557389, + "grad_norm": 0.25878390669822693, + "learning_rate": 0.00014726307501224312, + "loss": 0.3289, + "step": 13225 + }, + { + "epoch": 1.666309789967566, + "grad_norm": 0.29914605617523193, + "learning_rate": 0.00014715317129033924, + "loss": 0.3321, + "step": 13230 + }, + { + "epoch": 1.6669395723777436, + "grad_norm": 0.27533242106437683, + "learning_rate": 0.00014704326909724738, + "loss": 0.3234, + "step": 13235 + }, + { + "epoch": 1.6675693547879207, + "grad_norm": 0.2584016025066376, + "learning_rate": 0.0001469333684919876, + "loss": 0.3181, + "step": 13240 + }, + { + "epoch": 1.6681991371980982, + "grad_norm": 0.262953519821167, + "learning_rate": 0.00014682346953357898, + "loss": 0.3127, + "step": 13245 + }, + { + "epoch": 1.6688289196082753, + "grad_norm": 0.3399054706096649, + "learning_rate": 0.00014671357228103978, + "loss": 0.3529, + "step": 13250 + }, + { + "epoch": 1.6694587020184526, + "grad_norm": 0.26437637209892273, + "learning_rate": 0.00014660367679338732, + "loss": 0.318, + "step": 13255 + }, + { + "epoch": 1.67008848442863, + "grad_norm": 0.28796815872192383, + "learning_rate": 0.000146493783129638, + "loss": 0.3226, + "step": 13260 + }, + { + "epoch": 1.6707182668388072, + "grad_norm": 0.3208424150943756, + "learning_rate": 0.00014638389134880722, + "loss": 0.3661, + "step": 13265 + }, + { + "epoch": 1.6713480492489845, + "grad_norm": 0.2934640347957611, + "learning_rate": 0.00014627400150990941, + "loss": 0.3414, + "step": 13270 + }, + { + "epoch": 1.6719778316591618, + "grad_norm": 0.28860223293304443, + "learning_rate": 0.0001461641136719579, + "loss": 0.3386, + "step": 13275 + }, + { + "epoch": 1.672607614069339, + "grad_norm": 0.2960747182369232, + "learning_rate": 0.00014605422789396494, + "loss": 0.3466, + "step": 13280 + }, + { + "epoch": 1.6732373964795162, + "grad_norm": 0.25040510296821594, + "learning_rate": 0.00014594434423494178, + "loss": 0.3366, + "step": 13285 + }, + { + "epoch": 1.6738671788896937, + "grad_norm": 0.2958894371986389, + "learning_rate": 0.0001458344627538984, + "loss": 0.3614, + "step": 13290 + }, + { + "epoch": 1.6744969612998708, + "grad_norm": 0.26937004923820496, + "learning_rate": 0.00014572458350984362, + "loss": 0.3499, + "step": 13295 + }, + { + "epoch": 1.6751267437100483, + "grad_norm": 0.267607182264328, + "learning_rate": 0.00014561470656178517, + "loss": 0.3268, + "step": 13300 + }, + { + "epoch": 1.6757565261202254, + "grad_norm": 0.30197760462760925, + "learning_rate": 0.0001455048319687295, + "loss": 0.3212, + "step": 13305 + }, + { + "epoch": 1.6763863085304027, + "grad_norm": 0.29999008774757385, + "learning_rate": 0.0001453949597896817, + "loss": 0.3492, + "step": 13310 + }, + { + "epoch": 1.67701609094058, + "grad_norm": 0.30626264214515686, + "learning_rate": 0.00014528509008364572, + "loss": 0.3541, + "step": 13315 + }, + { + "epoch": 1.6776458733507573, + "grad_norm": 0.2915571630001068, + "learning_rate": 0.0001451752229096241, + "loss": 0.3231, + "step": 13320 + }, + { + "epoch": 1.6782756557609346, + "grad_norm": 0.2660951018333435, + "learning_rate": 0.0001450653583266179, + "loss": 0.321, + "step": 13325 + }, + { + "epoch": 1.678905438171112, + "grad_norm": 0.2831597924232483, + "learning_rate": 0.00014495549639362707, + "loss": 0.3243, + "step": 13330 + }, + { + "epoch": 1.6795352205812892, + "grad_norm": 0.2856467664241791, + "learning_rate": 0.0001448456371696499, + "loss": 0.3134, + "step": 13335 + }, + { + "epoch": 1.6801650029914663, + "grad_norm": 0.31137335300445557, + "learning_rate": 0.00014473578071368324, + "loss": 0.3266, + "step": 13340 + }, + { + "epoch": 1.6807947854016438, + "grad_norm": 0.3102738857269287, + "learning_rate": 0.0001446259270847226, + "loss": 0.3368, + "step": 13345 + }, + { + "epoch": 1.681424567811821, + "grad_norm": 0.2788311839103699, + "learning_rate": 0.00014451607634176196, + "loss": 0.345, + "step": 13350 + }, + { + "epoch": 1.6820543502219985, + "grad_norm": 0.26762083172798157, + "learning_rate": 0.0001444062285437935, + "loss": 0.3112, + "step": 13355 + }, + { + "epoch": 1.6826841326321755, + "grad_norm": 0.30155837535858154, + "learning_rate": 0.00014429638374980814, + "loss": 0.3353, + "step": 13360 + }, + { + "epoch": 1.6833139150423528, + "grad_norm": 0.3196204602718353, + "learning_rate": 0.00014418654201879498, + "loss": 0.3738, + "step": 13365 + }, + { + "epoch": 1.6839436974525301, + "grad_norm": 0.29560673236846924, + "learning_rate": 0.0001440767034097415, + "loss": 0.3458, + "step": 13370 + }, + { + "epoch": 1.6845734798627074, + "grad_norm": 0.30189448595046997, + "learning_rate": 0.00014396686798163365, + "loss": 0.3577, + "step": 13375 + }, + { + "epoch": 1.6852032622728847, + "grad_norm": 0.29545098543167114, + "learning_rate": 0.00014385703579345544, + "loss": 0.3299, + "step": 13380 + }, + { + "epoch": 1.685833044683062, + "grad_norm": 0.3403629660606384, + "learning_rate": 0.00014374720690418942, + "loss": 0.3349, + "step": 13385 + }, + { + "epoch": 1.6864628270932394, + "grad_norm": 0.2561693489551544, + "learning_rate": 0.0001436373813728161, + "loss": 0.321, + "step": 13390 + }, + { + "epoch": 1.6870926095034164, + "grad_norm": 0.2968713641166687, + "learning_rate": 0.00014352755925831428, + "loss": 0.3314, + "step": 13395 + }, + { + "epoch": 1.687722391913594, + "grad_norm": 0.25213027000427246, + "learning_rate": 0.00014341774061966096, + "loss": 0.3245, + "step": 13400 + }, + { + "epoch": 1.688352174323771, + "grad_norm": 0.26504096388816833, + "learning_rate": 0.00014330792551583133, + "loss": 0.324, + "step": 13405 + }, + { + "epoch": 1.6889819567339486, + "grad_norm": 0.31459683179855347, + "learning_rate": 0.00014319811400579854, + "loss": 0.33, + "step": 13410 + }, + { + "epoch": 1.6896117391441257, + "grad_norm": 0.31566324830055237, + "learning_rate": 0.00014308830614853392, + "loss": 0.3097, + "step": 13415 + }, + { + "epoch": 1.690241521554303, + "grad_norm": 0.3083827793598175, + "learning_rate": 0.00014297850200300683, + "loss": 0.3345, + "step": 13420 + }, + { + "epoch": 1.6908713039644803, + "grad_norm": 0.29203763604164124, + "learning_rate": 0.0001428687016281845, + "loss": 0.3459, + "step": 13425 + }, + { + "epoch": 1.6915010863746576, + "grad_norm": 0.28596800565719604, + "learning_rate": 0.00014275890508303225, + "loss": 0.3188, + "step": 13430 + }, + { + "epoch": 1.6921308687848349, + "grad_norm": 0.3753102421760559, + "learning_rate": 0.00014264911242651342, + "loss": 0.3457, + "step": 13435 + }, + { + "epoch": 1.6927606511950122, + "grad_norm": 0.28502312302589417, + "learning_rate": 0.0001425393237175891, + "loss": 0.3295, + "step": 13440 + }, + { + "epoch": 1.6933904336051895, + "grad_norm": 0.3175462782382965, + "learning_rate": 0.00014242953901521838, + "loss": 0.3094, + "step": 13445 + }, + { + "epoch": 1.6940202160153666, + "grad_norm": 0.25370490550994873, + "learning_rate": 0.00014231975837835815, + "loss": 0.3446, + "step": 13450 + }, + { + "epoch": 1.694649998425544, + "grad_norm": 0.2589857876300812, + "learning_rate": 0.00014220998186596315, + "loss": 0.3258, + "step": 13455 + }, + { + "epoch": 1.6952797808357212, + "grad_norm": 0.31022030115127563, + "learning_rate": 0.00014210020953698573, + "loss": 0.344, + "step": 13460 + }, + { + "epoch": 1.6959095632458987, + "grad_norm": 0.3099876046180725, + "learning_rate": 0.0001419904414503763, + "loss": 0.3425, + "step": 13465 + }, + { + "epoch": 1.6965393456560758, + "grad_norm": 0.27715328335762024, + "learning_rate": 0.00014188067766508273, + "loss": 0.3309, + "step": 13470 + }, + { + "epoch": 1.697169128066253, + "grad_norm": 0.2700579762458801, + "learning_rate": 0.00014177091824005075, + "loss": 0.3191, + "step": 13475 + }, + { + "epoch": 1.6977989104764304, + "grad_norm": 0.2773703336715698, + "learning_rate": 0.00014166116323422365, + "loss": 0.3321, + "step": 13480 + }, + { + "epoch": 1.6984286928866077, + "grad_norm": 0.2699192464351654, + "learning_rate": 0.00014155141270654232, + "loss": 0.3318, + "step": 13485 + }, + { + "epoch": 1.699058475296785, + "grad_norm": 0.26127228140830994, + "learning_rate": 0.00014144166671594544, + "loss": 0.2982, + "step": 13490 + }, + { + "epoch": 1.699688257706962, + "grad_norm": 0.37218350172042847, + "learning_rate": 0.000141331925321369, + "loss": 0.3335, + "step": 13495 + }, + { + "epoch": 1.7003180401171396, + "grad_norm": 0.26352524757385254, + "learning_rate": 0.0001412221885817466, + "loss": 0.3246, + "step": 13500 + }, + { + "epoch": 1.7009478225273167, + "grad_norm": 0.27649009227752686, + "learning_rate": 0.00014111245655600948, + "loss": 0.3117, + "step": 13505 + }, + { + "epoch": 1.7015776049374942, + "grad_norm": 0.26316478848457336, + "learning_rate": 0.00014100272930308623, + "loss": 0.3268, + "step": 13510 + }, + { + "epoch": 1.7022073873476713, + "grad_norm": 0.26319512724876404, + "learning_rate": 0.0001408930068819028, + "loss": 0.3083, + "step": 13515 + }, + { + "epoch": 1.7028371697578488, + "grad_norm": 0.26792389154434204, + "learning_rate": 0.00014078328935138276, + "loss": 0.3317, + "step": 13520 + }, + { + "epoch": 1.703466952168026, + "grad_norm": 0.2627207338809967, + "learning_rate": 0.0001406735767704469, + "loss": 0.3225, + "step": 13525 + }, + { + "epoch": 1.7040967345782032, + "grad_norm": 0.30815207958221436, + "learning_rate": 0.00014056386919801325, + "loss": 0.3201, + "step": 13530 + }, + { + "epoch": 1.7047265169883805, + "grad_norm": 0.296520471572876, + "learning_rate": 0.00014045416669299747, + "loss": 0.3189, + "step": 13535 + }, + { + "epoch": 1.7053562993985578, + "grad_norm": 0.2739796042442322, + "learning_rate": 0.0001403444693143122, + "loss": 0.3023, + "step": 13540 + }, + { + "epoch": 1.7059860818087351, + "grad_norm": 0.311927855014801, + "learning_rate": 0.00014023477712086743, + "loss": 0.3311, + "step": 13545 + }, + { + "epoch": 1.7066158642189122, + "grad_norm": 0.2842674255371094, + "learning_rate": 0.0001401250901715704, + "loss": 0.3376, + "step": 13550 + }, + { + "epoch": 1.7072456466290897, + "grad_norm": 0.30459704995155334, + "learning_rate": 0.00014001540852532553, + "loss": 0.3276, + "step": 13555 + }, + { + "epoch": 1.7078754290392668, + "grad_norm": 0.26651817560195923, + "learning_rate": 0.00013990573224103442, + "loss": 0.3309, + "step": 13560 + }, + { + "epoch": 1.7085052114494443, + "grad_norm": 0.32419687509536743, + "learning_rate": 0.00013979606137759563, + "loss": 0.314, + "step": 13565 + }, + { + "epoch": 1.7091349938596214, + "grad_norm": 0.2715966999530792, + "learning_rate": 0.000139686395993905, + "loss": 0.3293, + "step": 13570 + }, + { + "epoch": 1.709764776269799, + "grad_norm": 0.29049497842788696, + "learning_rate": 0.0001395767361488552, + "loss": 0.3159, + "step": 13575 + }, + { + "epoch": 1.710394558679976, + "grad_norm": 0.3235701024532318, + "learning_rate": 0.00013946708190133627, + "loss": 0.3422, + "step": 13580 + }, + { + "epoch": 1.7110243410901533, + "grad_norm": 0.2732395529747009, + "learning_rate": 0.00013935743331023492, + "loss": 0.317, + "step": 13585 + }, + { + "epoch": 1.7116541235003306, + "grad_norm": 0.2833672761917114, + "learning_rate": 0.000139247790434435, + "loss": 0.3619, + "step": 13590 + }, + { + "epoch": 1.712283905910508, + "grad_norm": 0.2510261535644531, + "learning_rate": 0.00013913815333281728, + "loss": 0.3215, + "step": 13595 + }, + { + "epoch": 1.7129136883206852, + "grad_norm": 0.29638463258743286, + "learning_rate": 0.00013902852206425925, + "loss": 0.3341, + "step": 13600 + }, + { + "epoch": 1.7135434707308623, + "grad_norm": 0.26883918046951294, + "learning_rate": 0.0001389188966876355, + "loss": 0.3198, + "step": 13605 + }, + { + "epoch": 1.7141732531410399, + "grad_norm": 0.280301958322525, + "learning_rate": 0.00013880927726181737, + "loss": 0.3232, + "step": 13610 + }, + { + "epoch": 1.714803035551217, + "grad_norm": 0.25223594903945923, + "learning_rate": 0.00013869966384567293, + "loss": 0.3362, + "step": 13615 + }, + { + "epoch": 1.7154328179613945, + "grad_norm": 0.29902294278144836, + "learning_rate": 0.00013859005649806717, + "loss": 0.3169, + "step": 13620 + }, + { + "epoch": 1.7160626003715715, + "grad_norm": 0.3142664134502411, + "learning_rate": 0.00013848045527786168, + "loss": 0.3149, + "step": 13625 + }, + { + "epoch": 1.716692382781749, + "grad_norm": 0.312800794839859, + "learning_rate": 0.0001383708602439149, + "loss": 0.3327, + "step": 13630 + }, + { + "epoch": 1.7173221651919262, + "grad_norm": 0.3177478015422821, + "learning_rate": 0.00013826127145508176, + "loss": 0.3215, + "step": 13635 + }, + { + "epoch": 1.7179519476021035, + "grad_norm": 0.2900395691394806, + "learning_rate": 0.00013815168897021398, + "loss": 0.3169, + "step": 13640 + }, + { + "epoch": 1.7185817300122808, + "grad_norm": 0.2877413332462311, + "learning_rate": 0.00013804211284815986, + "loss": 0.3247, + "step": 13645 + }, + { + "epoch": 1.719211512422458, + "grad_norm": 0.25947847962379456, + "learning_rate": 0.00013793254314776432, + "loss": 0.3091, + "step": 13650 + }, + { + "epoch": 1.7198412948326354, + "grad_norm": 0.270942747592926, + "learning_rate": 0.00013782297992786873, + "loss": 0.3318, + "step": 13655 + }, + { + "epoch": 1.7204710772428125, + "grad_norm": 0.2605541944503784, + "learning_rate": 0.00013771342324731106, + "loss": 0.3247, + "step": 13660 + }, + { + "epoch": 1.72110085965299, + "grad_norm": 0.25236964225769043, + "learning_rate": 0.00013760387316492584, + "loss": 0.3111, + "step": 13665 + }, + { + "epoch": 1.721730642063167, + "grad_norm": 0.2639407217502594, + "learning_rate": 0.00013749432973954385, + "loss": 0.305, + "step": 13670 + }, + { + "epoch": 1.7223604244733446, + "grad_norm": 0.3111459016799927, + "learning_rate": 0.0001373847930299924, + "loss": 0.3367, + "step": 13675 + }, + { + "epoch": 1.7229902068835217, + "grad_norm": 0.31038767099380493, + "learning_rate": 0.00013727526309509531, + "loss": 0.3223, + "step": 13680 + }, + { + "epoch": 1.723619989293699, + "grad_norm": 0.2571181058883667, + "learning_rate": 0.00013716573999367259, + "loss": 0.3057, + "step": 13685 + }, + { + "epoch": 1.7242497717038763, + "grad_norm": 0.24940542876720428, + "learning_rate": 0.0001370562237845406, + "loss": 0.319, + "step": 13690 + }, + { + "epoch": 1.7248795541140536, + "grad_norm": 0.2301412671804428, + "learning_rate": 0.00013694671452651216, + "loss": 0.3099, + "step": 13695 + }, + { + "epoch": 1.7255093365242309, + "grad_norm": 0.27043718099594116, + "learning_rate": 0.00013683721227839623, + "loss": 0.3345, + "step": 13700 + }, + { + "epoch": 1.7261391189344082, + "grad_norm": 0.26595422625541687, + "learning_rate": 0.00013672771709899792, + "loss": 0.3162, + "step": 13705 + }, + { + "epoch": 1.7267689013445855, + "grad_norm": 0.26224714517593384, + "learning_rate": 0.0001366182290471187, + "loss": 0.322, + "step": 13710 + }, + { + "epoch": 1.7273986837547626, + "grad_norm": 0.26390886306762695, + "learning_rate": 0.00013650874818155618, + "loss": 0.2964, + "step": 13715 + }, + { + "epoch": 1.72802846616494, + "grad_norm": 0.3042176365852356, + "learning_rate": 0.00013639927456110402, + "loss": 0.3128, + "step": 13720 + }, + { + "epoch": 1.7286582485751172, + "grad_norm": 0.269771009683609, + "learning_rate": 0.00013628980824455212, + "loss": 0.2963, + "step": 13725 + }, + { + "epoch": 1.7292880309852947, + "grad_norm": 0.3462948203086853, + "learning_rate": 0.00013618034929068634, + "loss": 0.3445, + "step": 13730 + }, + { + "epoch": 1.7299178133954718, + "grad_norm": 0.270379900932312, + "learning_rate": 0.0001360708977582887, + "loss": 0.3174, + "step": 13735 + }, + { + "epoch": 1.730547595805649, + "grad_norm": 0.23746255040168762, + "learning_rate": 0.00013596145370613715, + "loss": 0.3006, + "step": 13740 + }, + { + "epoch": 1.7311773782158264, + "grad_norm": 0.30519574880599976, + "learning_rate": 0.00013585201719300562, + "loss": 0.3272, + "step": 13745 + }, + { + "epoch": 1.7318071606260037, + "grad_norm": 0.3508155941963196, + "learning_rate": 0.000135742588277664, + "loss": 0.3385, + "step": 13750 + }, + { + "epoch": 1.732436943036181, + "grad_norm": 0.2649688720703125, + "learning_rate": 0.00013563316701887816, + "loss": 0.3191, + "step": 13755 + }, + { + "epoch": 1.7330667254463583, + "grad_norm": 0.25044509768486023, + "learning_rate": 0.0001355237534754098, + "loss": 0.3114, + "step": 13760 + }, + { + "epoch": 1.7336965078565356, + "grad_norm": 0.27739325165748596, + "learning_rate": 0.00013541434770601653, + "loss": 0.3555, + "step": 13765 + }, + { + "epoch": 1.7343262902667127, + "grad_norm": 0.27952834963798523, + "learning_rate": 0.00013530494976945172, + "loss": 0.3287, + "step": 13770 + }, + { + "epoch": 1.7349560726768902, + "grad_norm": 0.29794949293136597, + "learning_rate": 0.00013519555972446454, + "loss": 0.3248, + "step": 13775 + }, + { + "epoch": 1.7355858550870673, + "grad_norm": 0.3177776634693146, + "learning_rate": 0.00013508617762979992, + "loss": 0.3311, + "step": 13780 + }, + { + "epoch": 1.7362156374972448, + "grad_norm": 0.29036352038383484, + "learning_rate": 0.0001349768035441986, + "loss": 0.3021, + "step": 13785 + }, + { + "epoch": 1.736845419907422, + "grad_norm": 0.2803820073604584, + "learning_rate": 0.00013486743752639694, + "loss": 0.3021, + "step": 13790 + }, + { + "epoch": 1.7374752023175992, + "grad_norm": 0.25854361057281494, + "learning_rate": 0.000134758079635127, + "loss": 0.3215, + "step": 13795 + }, + { + "epoch": 1.7381049847277765, + "grad_norm": 0.2606901228427887, + "learning_rate": 0.0001346487299291165, + "loss": 0.3093, + "step": 13800 + }, + { + "epoch": 1.7387347671379538, + "grad_norm": 0.25198522210121155, + "learning_rate": 0.00013453938846708864, + "loss": 0.2954, + "step": 13805 + }, + { + "epoch": 1.7393645495481311, + "grad_norm": 0.27399036288261414, + "learning_rate": 0.00013443005530776233, + "loss": 0.3212, + "step": 13810 + }, + { + "epoch": 1.7399943319583084, + "grad_norm": 0.2777753174304962, + "learning_rate": 0.000134320730509852, + "loss": 0.32, + "step": 13815 + }, + { + "epoch": 1.7406241143684857, + "grad_norm": 0.28130999207496643, + "learning_rate": 0.0001342114141320675, + "loss": 0.305, + "step": 13820 + }, + { + "epoch": 1.7412538967786628, + "grad_norm": 0.28102371096611023, + "learning_rate": 0.00013410210623311428, + "loss": 0.3066, + "step": 13825 + }, + { + "epoch": 1.7418836791888404, + "grad_norm": 0.21866032481193542, + "learning_rate": 0.00013399280687169312, + "loss": 0.3181, + "step": 13830 + }, + { + "epoch": 1.7425134615990174, + "grad_norm": 0.27159667015075684, + "learning_rate": 0.00013388351610650045, + "loss": 0.2983, + "step": 13835 + }, + { + "epoch": 1.743143244009195, + "grad_norm": 0.26473724842071533, + "learning_rate": 0.00013377423399622764, + "loss": 0.3041, + "step": 13840 + }, + { + "epoch": 1.743773026419372, + "grad_norm": 0.30044063925743103, + "learning_rate": 0.00013366496059956184, + "loss": 0.3391, + "step": 13845 + }, + { + "epoch": 1.7444028088295493, + "grad_norm": 0.3015748858451843, + "learning_rate": 0.00013355569597518532, + "loss": 0.3033, + "step": 13850 + }, + { + "epoch": 1.7450325912397266, + "grad_norm": 0.27009138464927673, + "learning_rate": 0.00013344644018177572, + "loss": 0.2973, + "step": 13855 + }, + { + "epoch": 1.745662373649904, + "grad_norm": 0.28925400972366333, + "learning_rate": 0.00013333719327800585, + "loss": 0.3137, + "step": 13860 + }, + { + "epoch": 1.7462921560600813, + "grad_norm": 0.27679139375686646, + "learning_rate": 0.00013322795532254379, + "loss": 0.3119, + "step": 13865 + }, + { + "epoch": 1.7469219384702586, + "grad_norm": 0.283965140581131, + "learning_rate": 0.0001331187263740529, + "loss": 0.3151, + "step": 13870 + }, + { + "epoch": 1.7475517208804359, + "grad_norm": 0.24927465617656708, + "learning_rate": 0.0001330095064911915, + "loss": 0.2968, + "step": 13875 + }, + { + "epoch": 1.748181503290613, + "grad_norm": 0.2976732850074768, + "learning_rate": 0.0001329002957326132, + "loss": 0.3257, + "step": 13880 + }, + { + "epoch": 1.7488112857007905, + "grad_norm": 0.27860409021377563, + "learning_rate": 0.00013279109415696672, + "loss": 0.2988, + "step": 13885 + }, + { + "epoch": 1.7494410681109676, + "grad_norm": 0.28782716393470764, + "learning_rate": 0.0001326819018228958, + "loss": 0.3098, + "step": 13890 + }, + { + "epoch": 1.750070850521145, + "grad_norm": 0.24729984998703003, + "learning_rate": 0.0001325727187890391, + "loss": 0.3123, + "step": 13895 + }, + { + "epoch": 1.7507006329313222, + "grad_norm": 0.23218853771686554, + "learning_rate": 0.00013246354511403058, + "loss": 0.3025, + "step": 13900 + }, + { + "epoch": 1.7513304153414995, + "grad_norm": 0.2634672522544861, + "learning_rate": 0.00013235438085649893, + "loss": 0.3123, + "step": 13905 + }, + { + "epoch": 1.7519601977516768, + "grad_norm": 0.3087509572505951, + "learning_rate": 0.00013224522607506776, + "loss": 0.3515, + "step": 13910 + }, + { + "epoch": 1.752589980161854, + "grad_norm": 0.28160160779953003, + "learning_rate": 0.00013213608082835576, + "loss": 0.3141, + "step": 13915 + }, + { + "epoch": 1.7532197625720314, + "grad_norm": 0.2643168866634369, + "learning_rate": 0.0001320269451749764, + "loss": 0.297, + "step": 13920 + }, + { + "epoch": 1.7538495449822087, + "grad_norm": 0.34547582268714905, + "learning_rate": 0.00013191781917353803, + "loss": 0.3194, + "step": 13925 + }, + { + "epoch": 1.754479327392386, + "grad_norm": 0.29079994559288025, + "learning_rate": 0.00013180870288264385, + "loss": 0.3334, + "step": 13930 + }, + { + "epoch": 1.755109109802563, + "grad_norm": 0.2323244959115982, + "learning_rate": 0.00013169959636089167, + "loss": 0.3106, + "step": 13935 + }, + { + "epoch": 1.7557388922127406, + "grad_norm": 0.29080161452293396, + "learning_rate": 0.00013159049966687437, + "loss": 0.2978, + "step": 13940 + }, + { + "epoch": 1.7563686746229177, + "grad_norm": 0.2688988149166107, + "learning_rate": 0.00013148141285917924, + "loss": 0.3184, + "step": 13945 + }, + { + "epoch": 1.7569984570330952, + "grad_norm": 0.25353583693504333, + "learning_rate": 0.0001313723359963884, + "loss": 0.2956, + "step": 13950 + }, + { + "epoch": 1.7576282394432723, + "grad_norm": 0.32606688141822815, + "learning_rate": 0.0001312632691370786, + "loss": 0.3136, + "step": 13955 + }, + { + "epoch": 1.7582580218534496, + "grad_norm": 0.24126961827278137, + "learning_rate": 0.0001311542123398213, + "loss": 0.304, + "step": 13960 + }, + { + "epoch": 1.758887804263627, + "grad_norm": 0.2840232253074646, + "learning_rate": 0.0001310451656631824, + "loss": 0.3126, + "step": 13965 + }, + { + "epoch": 1.7595175866738042, + "grad_norm": 0.30879929661750793, + "learning_rate": 0.0001309361291657226, + "loss": 0.3115, + "step": 13970 + }, + { + "epoch": 1.7601473690839815, + "grad_norm": 0.29478558897972107, + "learning_rate": 0.0001308271029059969, + "loss": 0.3035, + "step": 13975 + }, + { + "epoch": 1.7607771514941588, + "grad_norm": 0.29496970772743225, + "learning_rate": 0.00013071808694255484, + "loss": 0.3417, + "step": 13980 + }, + { + "epoch": 1.7614069339043361, + "grad_norm": 0.27189967036247253, + "learning_rate": 0.00013060908133394054, + "loss": 0.3146, + "step": 13985 + }, + { + "epoch": 1.7620367163145132, + "grad_norm": 0.2737963795661926, + "learning_rate": 0.00013050008613869256, + "loss": 0.3223, + "step": 13990 + }, + { + "epoch": 1.7626664987246907, + "grad_norm": 0.2881993055343628, + "learning_rate": 0.00013039110141534367, + "loss": 0.3039, + "step": 13995 + }, + { + "epoch": 1.7632962811348678, + "grad_norm": 0.29045918583869934, + "learning_rate": 0.00013028212722242127, + "loss": 0.3193, + "step": 14000 + }, + { + "epoch": 1.7632962811348678, + "eval_loss": 0.3040441870689392, + "eval_runtime": 6.1585, + "eval_samples_per_second": 162.378, + "eval_steps_per_second": 10.23, + "step": 14000 + }, + { + "epoch": 1.7639260635450453, + "grad_norm": 0.24037687480449677, + "learning_rate": 0.00013017316361844692, + "loss": 0.2918, + "step": 14005 + }, + { + "epoch": 1.7645558459552224, + "grad_norm": 0.25562503933906555, + "learning_rate": 0.0001300642106619367, + "loss": 0.2967, + "step": 14010 + }, + { + "epoch": 1.7651856283653997, + "grad_norm": 0.3410753905773163, + "learning_rate": 0.00012995526841140068, + "loss": 0.3158, + "step": 14015 + }, + { + "epoch": 1.765815410775577, + "grad_norm": 0.2569274306297302, + "learning_rate": 0.00012984633692534337, + "loss": 0.306, + "step": 14020 + }, + { + "epoch": 1.7664451931857543, + "grad_norm": 0.26620200276374817, + "learning_rate": 0.00012973741626226348, + "loss": 0.3122, + "step": 14025 + }, + { + "epoch": 1.7670749755959316, + "grad_norm": 0.2842133045196533, + "learning_rate": 0.00012962850648065393, + "loss": 0.3253, + "step": 14030 + }, + { + "epoch": 1.767704758006109, + "grad_norm": 0.27718397974967957, + "learning_rate": 0.00012951960763900173, + "loss": 0.3187, + "step": 14035 + }, + { + "epoch": 1.7683345404162862, + "grad_norm": 0.27699559926986694, + "learning_rate": 0.00012941071979578805, + "loss": 0.33, + "step": 14040 + }, + { + "epoch": 1.7689643228264633, + "grad_norm": 0.21499434113502502, + "learning_rate": 0.00012930184300948819, + "loss": 0.2765, + "step": 14045 + }, + { + "epoch": 1.7695941052366408, + "grad_norm": 0.29474014043807983, + "learning_rate": 0.00012919297733857138, + "loss": 0.32, + "step": 14050 + }, + { + "epoch": 1.770223887646818, + "grad_norm": 0.3570992052555084, + "learning_rate": 0.00012908412284150104, + "loss": 0.3088, + "step": 14055 + }, + { + "epoch": 1.7708536700569955, + "grad_norm": 0.2408706545829773, + "learning_rate": 0.00012897527957673446, + "loss": 0.2991, + "step": 14060 + }, + { + "epoch": 1.7714834524671725, + "grad_norm": 0.23086212575435638, + "learning_rate": 0.00012886644760272306, + "loss": 0.2959, + "step": 14065 + }, + { + "epoch": 1.7721132348773498, + "grad_norm": 0.25117409229278564, + "learning_rate": 0.00012875762697791199, + "loss": 0.2933, + "step": 14070 + }, + { + "epoch": 1.7727430172875271, + "grad_norm": 0.28731420636177063, + "learning_rate": 0.0001286488177607405, + "loss": 0.3234, + "step": 14075 + }, + { + "epoch": 1.7733727996977044, + "grad_norm": 0.23875364661216736, + "learning_rate": 0.0001285400200096416, + "loss": 0.2952, + "step": 14080 + }, + { + "epoch": 1.7740025821078818, + "grad_norm": 0.2722354829311371, + "learning_rate": 0.0001284312337830421, + "loss": 0.2997, + "step": 14085 + }, + { + "epoch": 1.774632364518059, + "grad_norm": 0.27776023745536804, + "learning_rate": 0.00012832245913936278, + "loss": 0.3256, + "step": 14090 + }, + { + "epoch": 1.7752621469282364, + "grad_norm": 0.26422828435897827, + "learning_rate": 0.00012821369613701808, + "loss": 0.2983, + "step": 14095 + }, + { + "epoch": 1.7758919293384134, + "grad_norm": 0.23418962955474854, + "learning_rate": 0.00012810494483441614, + "loss": 0.3024, + "step": 14100 + }, + { + "epoch": 1.776521711748591, + "grad_norm": 0.300912082195282, + "learning_rate": 0.000127996205289959, + "loss": 0.3001, + "step": 14105 + }, + { + "epoch": 1.777151494158768, + "grad_norm": 0.2872162461280823, + "learning_rate": 0.00012788747756204222, + "loss": 0.3074, + "step": 14110 + }, + { + "epoch": 1.7777812765689456, + "grad_norm": 0.2784421145915985, + "learning_rate": 0.00012777876170905515, + "loss": 0.2978, + "step": 14115 + }, + { + "epoch": 1.7784110589791227, + "grad_norm": 0.28062257170677185, + "learning_rate": 0.00012767005778938062, + "loss": 0.2993, + "step": 14120 + }, + { + "epoch": 1.7790408413893, + "grad_norm": 0.3496231734752655, + "learning_rate": 0.0001275613658613951, + "loss": 0.3147, + "step": 14125 + }, + { + "epoch": 1.7796706237994773, + "grad_norm": 0.2595261037349701, + "learning_rate": 0.00012745268598346864, + "loss": 0.2943, + "step": 14130 + }, + { + "epoch": 1.7803004062096546, + "grad_norm": 0.2795499563217163, + "learning_rate": 0.00012734401821396486, + "loss": 0.3123, + "step": 14135 + }, + { + "epoch": 1.7809301886198319, + "grad_norm": 0.2615763247013092, + "learning_rate": 0.0001272353626112408, + "loss": 0.3059, + "step": 14140 + }, + { + "epoch": 1.7815599710300092, + "grad_norm": 0.2783886790275574, + "learning_rate": 0.00012712671923364706, + "loss": 0.3134, + "step": 14145 + }, + { + "epoch": 1.7821897534401865, + "grad_norm": 0.2884584367275238, + "learning_rate": 0.0001270180881395276, + "loss": 0.3151, + "step": 14150 + }, + { + "epoch": 1.7828195358503636, + "grad_norm": 0.2677745521068573, + "learning_rate": 0.0001269094693872197, + "loss": 0.3146, + "step": 14155 + }, + { + "epoch": 1.783449318260541, + "grad_norm": 0.25956082344055176, + "learning_rate": 0.0001268008630350542, + "loss": 0.3118, + "step": 14160 + }, + { + "epoch": 1.7840791006707182, + "grad_norm": 0.2646723985671997, + "learning_rate": 0.0001266922691413552, + "loss": 0.2861, + "step": 14165 + }, + { + "epoch": 1.7847088830808957, + "grad_norm": 0.29946067929267883, + "learning_rate": 0.00012658368776444004, + "loss": 0.3349, + "step": 14170 + }, + { + "epoch": 1.7853386654910728, + "grad_norm": 0.24171167612075806, + "learning_rate": 0.00012647511896261943, + "loss": 0.2805, + "step": 14175 + }, + { + "epoch": 1.78596844790125, + "grad_norm": 0.26428696513175964, + "learning_rate": 0.0001263665627941973, + "loss": 0.3231, + "step": 14180 + }, + { + "epoch": 1.7865982303114274, + "grad_norm": 0.2787708044052124, + "learning_rate": 0.0001262580193174709, + "loss": 0.2961, + "step": 14185 + }, + { + "epoch": 1.7872280127216047, + "grad_norm": 0.2826111614704132, + "learning_rate": 0.00012614948859073036, + "loss": 0.3343, + "step": 14190 + }, + { + "epoch": 1.787857795131782, + "grad_norm": 0.278361052274704, + "learning_rate": 0.00012604097067225927, + "loss": 0.2919, + "step": 14195 + }, + { + "epoch": 1.7884875775419593, + "grad_norm": 0.24778404831886292, + "learning_rate": 0.00012593246562033419, + "loss": 0.316, + "step": 14200 + }, + { + "epoch": 1.7891173599521366, + "grad_norm": 0.28171002864837646, + "learning_rate": 0.00012582397349322484, + "loss": 0.3076, + "step": 14205 + }, + { + "epoch": 1.7897471423623137, + "grad_norm": 0.26361143589019775, + "learning_rate": 0.00012571549434919392, + "loss": 0.2953, + "step": 14210 + }, + { + "epoch": 1.7903769247724912, + "grad_norm": 0.27602389454841614, + "learning_rate": 0.0001256070282464973, + "loss": 0.3266, + "step": 14215 + }, + { + "epoch": 1.7910067071826683, + "grad_norm": 0.2887786328792572, + "learning_rate": 0.00012549857524338378, + "loss": 0.3166, + "step": 14220 + }, + { + "epoch": 1.7916364895928458, + "grad_norm": 0.272359162569046, + "learning_rate": 0.00012539013539809493, + "loss": 0.3053, + "step": 14225 + }, + { + "epoch": 1.792266272003023, + "grad_norm": 0.2615000903606415, + "learning_rate": 0.00012528170876886555, + "loss": 0.2974, + "step": 14230 + }, + { + "epoch": 1.7928960544132002, + "grad_norm": 0.2882770597934723, + "learning_rate": 0.00012517329541392316, + "loss": 0.301, + "step": 14235 + }, + { + "epoch": 1.7935258368233775, + "grad_norm": 0.29980406165122986, + "learning_rate": 0.00012506489539148823, + "loss": 0.3009, + "step": 14240 + }, + { + "epoch": 1.7941556192335548, + "grad_norm": 0.2714889943599701, + "learning_rate": 0.0001249565087597741, + "loss": 0.2897, + "step": 14245 + }, + { + "epoch": 1.7947854016437321, + "grad_norm": 0.3578423261642456, + "learning_rate": 0.00012484813557698678, + "loss": 0.3021, + "step": 14250 + }, + { + "epoch": 1.7954151840539094, + "grad_norm": 0.29889971017837524, + "learning_rate": 0.00012473977590132524, + "loss": 0.3039, + "step": 14255 + }, + { + "epoch": 1.7960449664640867, + "grad_norm": 0.27244943380355835, + "learning_rate": 0.000124631429790981, + "loss": 0.3068, + "step": 14260 + }, + { + "epoch": 1.7966747488742638, + "grad_norm": 0.2793833613395691, + "learning_rate": 0.00012452309730413843, + "loss": 0.3081, + "step": 14265 + }, + { + "epoch": 1.7973045312844413, + "grad_norm": 0.27198326587677, + "learning_rate": 0.00012441477849897461, + "loss": 0.2957, + "step": 14270 + }, + { + "epoch": 1.7979343136946184, + "grad_norm": 0.24795940518379211, + "learning_rate": 0.0001243064734336591, + "loss": 0.3094, + "step": 14275 + }, + { + "epoch": 1.798564096104796, + "grad_norm": 0.29008451104164124, + "learning_rate": 0.0001241981821663543, + "loss": 0.3306, + "step": 14280 + }, + { + "epoch": 1.799193878514973, + "grad_norm": 0.24478363990783691, + "learning_rate": 0.00012408990475521508, + "loss": 0.291, + "step": 14285 + }, + { + "epoch": 1.7998236609251503, + "grad_norm": 0.2566664218902588, + "learning_rate": 0.00012398164125838881, + "loss": 0.3087, + "step": 14290 + }, + { + "epoch": 1.8004534433353276, + "grad_norm": 0.24992555379867554, + "learning_rate": 0.00012387339173401552, + "loss": 0.318, + "step": 14295 + }, + { + "epoch": 1.801083225745505, + "grad_norm": 0.244164377450943, + "learning_rate": 0.00012376515624022767, + "loss": 0.3096, + "step": 14300 + }, + { + "epoch": 1.8017130081556822, + "grad_norm": 0.2495235651731491, + "learning_rate": 0.00012365693483515016, + "loss": 0.283, + "step": 14305 + }, + { + "epoch": 1.8023427905658593, + "grad_norm": 0.2685554027557373, + "learning_rate": 0.00012354872757690038, + "loss": 0.3359, + "step": 14310 + }, + { + "epoch": 1.8029725729760369, + "grad_norm": 0.23964886367321014, + "learning_rate": 0.0001234405345235881, + "loss": 0.3074, + "step": 14315 + }, + { + "epoch": 1.803602355386214, + "grad_norm": 0.24736544489860535, + "learning_rate": 0.00012333235573331556, + "loss": 0.2891, + "step": 14320 + }, + { + "epoch": 1.8042321377963915, + "grad_norm": 0.2994007170200348, + "learning_rate": 0.00012322419126417706, + "loss": 0.3109, + "step": 14325 + }, + { + "epoch": 1.8048619202065685, + "grad_norm": 0.26516586542129517, + "learning_rate": 0.0001231160411742595, + "loss": 0.2974, + "step": 14330 + }, + { + "epoch": 1.805491702616746, + "grad_norm": 0.27139636874198914, + "learning_rate": 0.0001230079055216419, + "loss": 0.3023, + "step": 14335 + }, + { + "epoch": 1.8061214850269232, + "grad_norm": 0.26109209656715393, + "learning_rate": 0.00012289978436439558, + "loss": 0.3059, + "step": 14340 + }, + { + "epoch": 1.8067512674371005, + "grad_norm": 0.29744458198547363, + "learning_rate": 0.0001227916777605841, + "loss": 0.3088, + "step": 14345 + }, + { + "epoch": 1.8073810498472778, + "grad_norm": 0.27332085371017456, + "learning_rate": 0.0001226835857682632, + "loss": 0.2888, + "step": 14350 + }, + { + "epoch": 1.808010832257455, + "grad_norm": 0.2586978077888489, + "learning_rate": 0.00012257550844548074, + "loss": 0.328, + "step": 14355 + }, + { + "epoch": 1.8086406146676324, + "grad_norm": 0.29042935371398926, + "learning_rate": 0.00012246744585027667, + "loss": 0.3113, + "step": 14360 + }, + { + "epoch": 1.8092703970778095, + "grad_norm": 0.271710067987442, + "learning_rate": 0.000122359398040683, + "loss": 0.2888, + "step": 14365 + }, + { + "epoch": 1.809900179487987, + "grad_norm": 0.2969205379486084, + "learning_rate": 0.00012225136507472406, + "loss": 0.312, + "step": 14370 + }, + { + "epoch": 1.810529961898164, + "grad_norm": 0.301145076751709, + "learning_rate": 0.00012214334701041586, + "loss": 0.2952, + "step": 14375 + }, + { + "epoch": 1.8111597443083416, + "grad_norm": 0.250630259513855, + "learning_rate": 0.00012203534390576666, + "loss": 0.3073, + "step": 14380 + }, + { + "epoch": 1.8117895267185187, + "grad_norm": 0.24282781779766083, + "learning_rate": 0.00012192735581877654, + "loss": 0.2863, + "step": 14385 + }, + { + "epoch": 1.8124193091286962, + "grad_norm": 0.2824462652206421, + "learning_rate": 0.00012181938280743769, + "loss": 0.2999, + "step": 14390 + }, + { + "epoch": 1.8130490915388733, + "grad_norm": 0.2740934491157532, + "learning_rate": 0.00012171142492973388, + "loss": 0.3131, + "step": 14395 + }, + { + "epoch": 1.8136788739490506, + "grad_norm": 0.23533669114112854, + "learning_rate": 0.00012160348224364109, + "loss": 0.2846, + "step": 14400 + }, + { + "epoch": 1.814308656359228, + "grad_norm": 0.26320409774780273, + "learning_rate": 0.00012149555480712697, + "loss": 0.2954, + "step": 14405 + }, + { + "epoch": 1.8149384387694052, + "grad_norm": 0.2816338837146759, + "learning_rate": 0.00012138764267815105, + "loss": 0.2811, + "step": 14410 + }, + { + "epoch": 1.8155682211795825, + "grad_norm": 0.23801551759243011, + "learning_rate": 0.00012127974591466455, + "loss": 0.2846, + "step": 14415 + }, + { + "epoch": 1.8161980035897596, + "grad_norm": 0.3131721317768097, + "learning_rate": 0.00012117186457461056, + "loss": 0.2969, + "step": 14420 + }, + { + "epoch": 1.816827785999937, + "grad_norm": 0.2892078757286072, + "learning_rate": 0.00012106399871592385, + "loss": 0.3, + "step": 14425 + }, + { + "epoch": 1.8174575684101142, + "grad_norm": 0.253273606300354, + "learning_rate": 0.00012095614839653074, + "loss": 0.3005, + "step": 14430 + }, + { + "epoch": 1.8180873508202917, + "grad_norm": 0.2675528824329376, + "learning_rate": 0.00012084831367434937, + "loss": 0.2947, + "step": 14435 + }, + { + "epoch": 1.8187171332304688, + "grad_norm": 0.2665347158908844, + "learning_rate": 0.00012074049460728945, + "loss": 0.3012, + "step": 14440 + }, + { + "epoch": 1.8193469156406463, + "grad_norm": 0.2987824082374573, + "learning_rate": 0.00012063269125325228, + "loss": 0.2986, + "step": 14445 + }, + { + "epoch": 1.8199766980508234, + "grad_norm": 0.2429313212633133, + "learning_rate": 0.00012052490367013076, + "loss": 0.3035, + "step": 14450 + }, + { + "epoch": 1.8206064804610007, + "grad_norm": 0.28424081206321716, + "learning_rate": 0.00012041713191580925, + "loss": 0.2948, + "step": 14455 + }, + { + "epoch": 1.821236262871178, + "grad_norm": 0.25087571144104004, + "learning_rate": 0.00012030937604816365, + "loss": 0.2949, + "step": 14460 + }, + { + "epoch": 1.8218660452813553, + "grad_norm": 0.23633217811584473, + "learning_rate": 0.00012020163612506127, + "loss": 0.2669, + "step": 14465 + }, + { + "epoch": 1.8224958276915326, + "grad_norm": 0.26396888494491577, + "learning_rate": 0.000120093912204361, + "loss": 0.2912, + "step": 14470 + }, + { + "epoch": 1.8231256101017097, + "grad_norm": 0.2898525297641754, + "learning_rate": 0.00011998620434391299, + "loss": 0.319, + "step": 14475 + }, + { + "epoch": 1.8237553925118872, + "grad_norm": 0.25507113337516785, + "learning_rate": 0.00011987851260155881, + "loss": 0.3028, + "step": 14480 + }, + { + "epoch": 1.8243851749220643, + "grad_norm": 0.2405284345149994, + "learning_rate": 0.00011977083703513145, + "loss": 0.2879, + "step": 14485 + }, + { + "epoch": 1.8250149573322418, + "grad_norm": 0.27114009857177734, + "learning_rate": 0.00011966317770245507, + "loss": 0.3094, + "step": 14490 + }, + { + "epoch": 1.825644739742419, + "grad_norm": 0.2708043158054352, + "learning_rate": 0.0001195555346613453, + "loss": 0.3062, + "step": 14495 + }, + { + "epoch": 1.8262745221525962, + "grad_norm": 0.2507513463497162, + "learning_rate": 0.00011944790796960878, + "loss": 0.2832, + "step": 14500 + }, + { + "epoch": 1.8269043045627735, + "grad_norm": 0.2864154577255249, + "learning_rate": 0.0001193402976850436, + "loss": 0.3067, + "step": 14505 + }, + { + "epoch": 1.8275340869729508, + "grad_norm": 0.26530271768569946, + "learning_rate": 0.00011923270386543886, + "loss": 0.2816, + "step": 14510 + }, + { + "epoch": 1.8281638693831281, + "grad_norm": 0.24444885551929474, + "learning_rate": 0.00011912512656857498, + "loss": 0.2993, + "step": 14515 + }, + { + "epoch": 1.8287936517933054, + "grad_norm": 0.2591851055622101, + "learning_rate": 0.00011901756585222334, + "loss": 0.2926, + "step": 14520 + }, + { + "epoch": 1.8294234342034827, + "grad_norm": 0.2942061424255371, + "learning_rate": 0.0001189100217741466, + "loss": 0.3032, + "step": 14525 + }, + { + "epoch": 1.8300532166136598, + "grad_norm": 0.28199318051338196, + "learning_rate": 0.00011880249439209836, + "loss": 0.291, + "step": 14530 + }, + { + "epoch": 1.8306829990238374, + "grad_norm": 0.2743484377861023, + "learning_rate": 0.00011869498376382324, + "loss": 0.3101, + "step": 14535 + }, + { + "epoch": 1.8313127814340144, + "grad_norm": 0.24012960493564606, + "learning_rate": 0.00011858748994705689, + "loss": 0.3, + "step": 14540 + }, + { + "epoch": 1.831942563844192, + "grad_norm": 0.2856425344944, + "learning_rate": 0.00011848001299952598, + "loss": 0.3042, + "step": 14545 + }, + { + "epoch": 1.832572346254369, + "grad_norm": 0.2720118463039398, + "learning_rate": 0.00011837255297894808, + "loss": 0.298, + "step": 14550 + }, + { + "epoch": 1.8332021286645463, + "grad_norm": 0.26973578333854675, + "learning_rate": 0.0001182651099430317, + "loss": 0.2734, + "step": 14555 + }, + { + "epoch": 1.8338319110747237, + "grad_norm": 0.35720425844192505, + "learning_rate": 0.00011815768394947616, + "loss": 0.3174, + "step": 14560 + }, + { + "epoch": 1.834461693484901, + "grad_norm": 0.2649666666984558, + "learning_rate": 0.00011805027505597178, + "loss": 0.3009, + "step": 14565 + }, + { + "epoch": 1.8350914758950783, + "grad_norm": 0.2809504270553589, + "learning_rate": 0.00011794288332019939, + "loss": 0.3075, + "step": 14570 + }, + { + "epoch": 1.8357212583052556, + "grad_norm": 0.247705340385437, + "learning_rate": 0.00011783550879983097, + "loss": 0.2929, + "step": 14575 + }, + { + "epoch": 1.8363510407154329, + "grad_norm": 0.3010486364364624, + "learning_rate": 0.00011772815155252901, + "loss": 0.2923, + "step": 14580 + }, + { + "epoch": 1.83698082312561, + "grad_norm": 0.29634296894073486, + "learning_rate": 0.00011762081163594686, + "loss": 0.2956, + "step": 14585 + }, + { + "epoch": 1.8376106055357875, + "grad_norm": 0.3235035538673401, + "learning_rate": 0.00011751348910772844, + "loss": 0.299, + "step": 14590 + }, + { + "epoch": 1.8382403879459646, + "grad_norm": 0.27069565653800964, + "learning_rate": 0.00011740618402550849, + "loss": 0.2885, + "step": 14595 + }, + { + "epoch": 1.838870170356142, + "grad_norm": 0.26986175775527954, + "learning_rate": 0.00011729889644691227, + "loss": 0.2974, + "step": 14600 + }, + { + "epoch": 1.8394999527663192, + "grad_norm": 0.24633704125881195, + "learning_rate": 0.00011719162642955559, + "loss": 0.3011, + "step": 14605 + }, + { + "epoch": 1.8401297351764965, + "grad_norm": 0.2659735381603241, + "learning_rate": 0.00011708437403104491, + "loss": 0.2802, + "step": 14610 + }, + { + "epoch": 1.8407595175866738, + "grad_norm": 0.2634638547897339, + "learning_rate": 0.00011697713930897728, + "loss": 0.2834, + "step": 14615 + }, + { + "epoch": 1.841389299996851, + "grad_norm": 0.2891436517238617, + "learning_rate": 0.00011686992232094012, + "loss": 0.2892, + "step": 14620 + }, + { + "epoch": 1.8420190824070284, + "grad_norm": 0.26533305644989014, + "learning_rate": 0.0001167627231245115, + "loss": 0.2954, + "step": 14625 + }, + { + "epoch": 1.8426488648172057, + "grad_norm": 0.26114416122436523, + "learning_rate": 0.00011665554177725977, + "loss": 0.2936, + "step": 14630 + }, + { + "epoch": 1.843278647227383, + "grad_norm": 0.24782754480838776, + "learning_rate": 0.00011654837833674379, + "loss": 0.283, + "step": 14635 + }, + { + "epoch": 1.84390842963756, + "grad_norm": 0.2653804123401642, + "learning_rate": 0.00011644123286051274, + "loss": 0.2911, + "step": 14640 + }, + { + "epoch": 1.8445382120477376, + "grad_norm": 0.2524818181991577, + "learning_rate": 0.00011633410540610621, + "loss": 0.29, + "step": 14645 + }, + { + "epoch": 1.8451679944579147, + "grad_norm": 0.2844378352165222, + "learning_rate": 0.00011622699603105404, + "loss": 0.298, + "step": 14650 + }, + { + "epoch": 1.8457977768680922, + "grad_norm": 0.2608543038368225, + "learning_rate": 0.0001161199047928765, + "loss": 0.2807, + "step": 14655 + }, + { + "epoch": 1.8464275592782693, + "grad_norm": 0.2596459984779358, + "learning_rate": 0.000116012831749084, + "loss": 0.29, + "step": 14660 + }, + { + "epoch": 1.8470573416884466, + "grad_norm": 0.2654721140861511, + "learning_rate": 0.00011590577695717717, + "loss": 0.2878, + "step": 14665 + }, + { + "epoch": 1.847687124098624, + "grad_norm": 0.283388614654541, + "learning_rate": 0.00011579874047464696, + "loss": 0.2751, + "step": 14670 + }, + { + "epoch": 1.8483169065088012, + "grad_norm": 0.24917341768741608, + "learning_rate": 0.00011569172235897433, + "loss": 0.3, + "step": 14675 + }, + { + "epoch": 1.8489466889189785, + "grad_norm": 0.2464076280593872, + "learning_rate": 0.00011558472266763049, + "loss": 0.2848, + "step": 14680 + }, + { + "epoch": 1.8495764713291558, + "grad_norm": 0.2884039282798767, + "learning_rate": 0.00011547774145807665, + "loss": 0.2698, + "step": 14685 + }, + { + "epoch": 1.8502062537393331, + "grad_norm": 0.2762083411216736, + "learning_rate": 0.00011537077878776425, + "loss": 0.3151, + "step": 14690 + }, + { + "epoch": 1.8508360361495102, + "grad_norm": 0.22906774282455444, + "learning_rate": 0.00011526383471413463, + "loss": 0.2669, + "step": 14695 + }, + { + "epoch": 1.8514658185596877, + "grad_norm": 0.28603047132492065, + "learning_rate": 0.00011515690929461928, + "loss": 0.2922, + "step": 14700 + }, + { + "epoch": 1.8520956009698648, + "grad_norm": 0.26245948672294617, + "learning_rate": 0.00011505000258663954, + "loss": 0.3095, + "step": 14705 + }, + { + "epoch": 1.8527253833800423, + "grad_norm": 0.2754320800304413, + "learning_rate": 0.00011494311464760673, + "loss": 0.2843, + "step": 14710 + }, + { + "epoch": 1.8533551657902194, + "grad_norm": 0.24283255636692047, + "learning_rate": 0.00011483624553492212, + "loss": 0.3039, + "step": 14715 + }, + { + "epoch": 1.8539849482003967, + "grad_norm": 0.299950510263443, + "learning_rate": 0.00011472939530597691, + "loss": 0.3108, + "step": 14720 + }, + { + "epoch": 1.854614730610574, + "grad_norm": 0.23872928321361542, + "learning_rate": 0.00011462256401815205, + "loss": 0.3221, + "step": 14725 + }, + { + "epoch": 1.8552445130207513, + "grad_norm": 0.32674193382263184, + "learning_rate": 0.00011451575172881845, + "loss": 0.3066, + "step": 14730 + }, + { + "epoch": 1.8558742954309286, + "grad_norm": 0.2620803415775299, + "learning_rate": 0.00011440895849533675, + "loss": 0.2855, + "step": 14735 + }, + { + "epoch": 1.856504077841106, + "grad_norm": 0.2653051018714905, + "learning_rate": 0.0001143021843750573, + "loss": 0.2827, + "step": 14740 + }, + { + "epoch": 1.8571338602512832, + "grad_norm": 0.29697105288505554, + "learning_rate": 0.00011419542942532023, + "loss": 0.2848, + "step": 14745 + }, + { + "epoch": 1.8577636426614603, + "grad_norm": 0.26711151003837585, + "learning_rate": 0.00011408869370345545, + "loss": 0.301, + "step": 14750 + }, + { + "epoch": 1.8583934250716379, + "grad_norm": 0.26371288299560547, + "learning_rate": 0.0001139819772667824, + "loss": 0.2994, + "step": 14755 + }, + { + "epoch": 1.859023207481815, + "grad_norm": 0.34920862317085266, + "learning_rate": 0.00011387528017261035, + "loss": 0.2968, + "step": 14760 + }, + { + "epoch": 1.8596529898919925, + "grad_norm": 0.2951182723045349, + "learning_rate": 0.000113768602478238, + "loss": 0.2935, + "step": 14765 + }, + { + "epoch": 1.8602827723021695, + "grad_norm": 0.22442401945590973, + "learning_rate": 0.00011366194424095381, + "loss": 0.2952, + "step": 14770 + }, + { + "epoch": 1.8609125547123468, + "grad_norm": 0.26102137565612793, + "learning_rate": 0.00011355530551803553, + "loss": 0.2823, + "step": 14775 + }, + { + "epoch": 1.8615423371225241, + "grad_norm": 0.27118000388145447, + "learning_rate": 0.0001134486863667507, + "loss": 0.286, + "step": 14780 + }, + { + "epoch": 1.8621721195327015, + "grad_norm": 0.2869999408721924, + "learning_rate": 0.00011334208684435617, + "loss": 0.2734, + "step": 14785 + }, + { + "epoch": 1.8628019019428788, + "grad_norm": 0.32944396138191223, + "learning_rate": 0.0001132355070080984, + "loss": 0.3038, + "step": 14790 + }, + { + "epoch": 1.863431684353056, + "grad_norm": 0.28535759449005127, + "learning_rate": 0.00011312894691521312, + "loss": 0.3213, + "step": 14795 + }, + { + "epoch": 1.8640614667632334, + "grad_norm": 0.23751592636108398, + "learning_rate": 0.00011302240662292561, + "loss": 0.2972, + "step": 14800 + }, + { + "epoch": 1.8646912491734104, + "grad_norm": 0.2352185994386673, + "learning_rate": 0.00011291588618845043, + "loss": 0.2772, + "step": 14805 + }, + { + "epoch": 1.865321031583588, + "grad_norm": 0.24066108465194702, + "learning_rate": 0.00011280938566899142, + "loss": 0.3053, + "step": 14810 + }, + { + "epoch": 1.865950813993765, + "grad_norm": 0.33842501044273376, + "learning_rate": 0.0001127029051217418, + "loss": 0.2992, + "step": 14815 + }, + { + "epoch": 1.8665805964039426, + "grad_norm": 0.24873322248458862, + "learning_rate": 0.00011259644460388412, + "loss": 0.2887, + "step": 14820 + }, + { + "epoch": 1.8672103788141197, + "grad_norm": 0.27127575874328613, + "learning_rate": 0.00011249000417259005, + "loss": 0.2619, + "step": 14825 + }, + { + "epoch": 1.867840161224297, + "grad_norm": 0.28289374709129333, + "learning_rate": 0.00011238358388502059, + "loss": 0.2815, + "step": 14830 + }, + { + "epoch": 1.8684699436344743, + "grad_norm": 0.29764994978904724, + "learning_rate": 0.00011227718379832583, + "loss": 0.3006, + "step": 14835 + }, + { + "epoch": 1.8690997260446516, + "grad_norm": 0.2869538366794586, + "learning_rate": 0.00011217080396964507, + "loss": 0.2707, + "step": 14840 + }, + { + "epoch": 1.8697295084548289, + "grad_norm": 0.2732262909412384, + "learning_rate": 0.00011206444445610663, + "loss": 0.2768, + "step": 14845 + }, + { + "epoch": 1.8703592908650062, + "grad_norm": 0.3032742738723755, + "learning_rate": 0.0001119581053148281, + "loss": 0.2715, + "step": 14850 + }, + { + "epoch": 1.8709890732751835, + "grad_norm": 0.26171359419822693, + "learning_rate": 0.00011185178660291594, + "loss": 0.2701, + "step": 14855 + }, + { + "epoch": 1.8716188556853606, + "grad_norm": 0.30940353870391846, + "learning_rate": 0.00011174548837746581, + "loss": 0.2843, + "step": 14860 + }, + { + "epoch": 1.872248638095538, + "grad_norm": 0.2774769067764282, + "learning_rate": 0.00011163921069556224, + "loss": 0.2951, + "step": 14865 + }, + { + "epoch": 1.8728784205057152, + "grad_norm": 0.29232633113861084, + "learning_rate": 0.00011153295361427876, + "loss": 0.2938, + "step": 14870 + }, + { + "epoch": 1.8735082029158927, + "grad_norm": 0.28283149003982544, + "learning_rate": 0.00011142671719067793, + "loss": 0.2875, + "step": 14875 + }, + { + "epoch": 1.8741379853260698, + "grad_norm": 0.24245183169841766, + "learning_rate": 0.00011132050148181103, + "loss": 0.2829, + "step": 14880 + }, + { + "epoch": 1.874767767736247, + "grad_norm": 0.27495938539505005, + "learning_rate": 0.00011121430654471837, + "loss": 0.2923, + "step": 14885 + }, + { + "epoch": 1.8753975501464244, + "grad_norm": 0.3106895089149475, + "learning_rate": 0.00011110813243642906, + "loss": 0.2855, + "step": 14890 + }, + { + "epoch": 1.8760273325566017, + "grad_norm": 0.263810396194458, + "learning_rate": 0.00011100197921396102, + "loss": 0.271, + "step": 14895 + }, + { + "epoch": 1.876657114966779, + "grad_norm": 0.23044048249721527, + "learning_rate": 0.00011089584693432091, + "loss": 0.2608, + "step": 14900 + }, + { + "epoch": 1.8772868973769563, + "grad_norm": 0.29268765449523926, + "learning_rate": 0.00011078973565450427, + "loss": 0.2835, + "step": 14905 + }, + { + "epoch": 1.8779166797871336, + "grad_norm": 0.2691350281238556, + "learning_rate": 0.00011068364543149527, + "loss": 0.291, + "step": 14910 + }, + { + "epoch": 1.8785464621973107, + "grad_norm": 0.26748213171958923, + "learning_rate": 0.00011057757632226672, + "loss": 0.2994, + "step": 14915 + }, + { + "epoch": 1.8791762446074882, + "grad_norm": 0.2624029815196991, + "learning_rate": 0.00011047152838378018, + "loss": 0.2832, + "step": 14920 + }, + { + "epoch": 1.8798060270176653, + "grad_norm": 0.2670036554336548, + "learning_rate": 0.00011036550167298583, + "loss": 0.284, + "step": 14925 + }, + { + "epoch": 1.8804358094278428, + "grad_norm": 0.2848396599292755, + "learning_rate": 0.0001102594962468224, + "loss": 0.2831, + "step": 14930 + }, + { + "epoch": 1.88106559183802, + "grad_norm": 0.2502748668193817, + "learning_rate": 0.0001101535121622173, + "loss": 0.3038, + "step": 14935 + }, + { + "epoch": 1.8816953742481972, + "grad_norm": 0.2998834252357483, + "learning_rate": 0.0001100475494760863, + "loss": 0.2847, + "step": 14940 + }, + { + "epoch": 1.8823251566583745, + "grad_norm": 0.229685977101326, + "learning_rate": 0.00010994160824533398, + "loss": 0.261, + "step": 14945 + }, + { + "epoch": 1.8829549390685518, + "grad_norm": 0.26833808422088623, + "learning_rate": 0.00010983568852685294, + "loss": 0.2923, + "step": 14950 + }, + { + "epoch": 1.8835847214787291, + "grad_norm": 0.2380465716123581, + "learning_rate": 0.00010972979037752465, + "loss": 0.2664, + "step": 14955 + }, + { + "epoch": 1.8842145038889064, + "grad_norm": 0.2505188286304474, + "learning_rate": 0.00010962391385421876, + "loss": 0.2914, + "step": 14960 + }, + { + "epoch": 1.8848442862990837, + "grad_norm": 0.33335885405540466, + "learning_rate": 0.00010951805901379346, + "loss": 0.3092, + "step": 14965 + }, + { + "epoch": 1.8854740687092608, + "grad_norm": 0.22425580024719238, + "learning_rate": 0.0001094122259130951, + "loss": 0.2583, + "step": 14970 + }, + { + "epoch": 1.8861038511194383, + "grad_norm": 0.25008514523506165, + "learning_rate": 0.00010930641460895863, + "loss": 0.2936, + "step": 14975 + }, + { + "epoch": 1.8867336335296154, + "grad_norm": 0.2543163299560547, + "learning_rate": 0.00010920062515820707, + "loss": 0.2855, + "step": 14980 + }, + { + "epoch": 1.887363415939793, + "grad_norm": 0.25144490599632263, + "learning_rate": 0.00010909485761765172, + "loss": 0.2788, + "step": 14985 + }, + { + "epoch": 1.88799319834997, + "grad_norm": 0.23470145463943481, + "learning_rate": 0.00010898911204409218, + "loss": 0.2709, + "step": 14990 + }, + { + "epoch": 1.8886229807601473, + "grad_norm": 0.27916932106018066, + "learning_rate": 0.00010888338849431629, + "loss": 0.279, + "step": 14995 + }, + { + "epoch": 1.8892527631703246, + "grad_norm": 0.24980424344539642, + "learning_rate": 0.00010877768702509996, + "loss": 0.2982, + "step": 15000 + }, + { + "epoch": 1.8892527631703246, + "eval_loss": 0.3032541871070862, + "eval_runtime": 6.1659, + "eval_samples_per_second": 162.182, + "eval_steps_per_second": 10.217, + "step": 15000 + }, + { + "epoch": 1.889882545580502, + "grad_norm": 0.24535268545150757, + "learning_rate": 0.00010867200769320732, + "loss": 0.2667, + "step": 15005 + }, + { + "epoch": 1.8905123279906793, + "grad_norm": 0.2690826654434204, + "learning_rate": 0.0001085663505553906, + "loss": 0.2703, + "step": 15010 + }, + { + "epoch": 1.8911421104008566, + "grad_norm": 0.2511346936225891, + "learning_rate": 0.00010846071566839008, + "loss": 0.3011, + "step": 15015 + }, + { + "epoch": 1.8917718928110339, + "grad_norm": 0.28077587485313416, + "learning_rate": 0.00010835510308893407, + "loss": 0.285, + "step": 15020 + }, + { + "epoch": 1.892401675221211, + "grad_norm": 0.309238463640213, + "learning_rate": 0.000108249512873739, + "loss": 0.2894, + "step": 15025 + }, + { + "epoch": 1.8930314576313885, + "grad_norm": 0.26940178871154785, + "learning_rate": 0.00010814394507950917, + "loss": 0.2864, + "step": 15030 + }, + { + "epoch": 1.8936612400415656, + "grad_norm": 0.27850431203842163, + "learning_rate": 0.00010803839976293694, + "loss": 0.2716, + "step": 15035 + }, + { + "epoch": 1.894291022451743, + "grad_norm": 0.24114792048931122, + "learning_rate": 0.00010793287698070256, + "loss": 0.2695, + "step": 15040 + }, + { + "epoch": 1.8949208048619202, + "grad_norm": 0.3137163817882538, + "learning_rate": 0.0001078273767894741, + "loss": 0.3063, + "step": 15045 + }, + { + "epoch": 1.8955505872720975, + "grad_norm": 0.27090078592300415, + "learning_rate": 0.00010772189924590773, + "loss": 0.2643, + "step": 15050 + }, + { + "epoch": 1.8961803696822748, + "grad_norm": 0.27956193685531616, + "learning_rate": 0.00010761644440664714, + "loss": 0.271, + "step": 15055 + }, + { + "epoch": 1.896810152092452, + "grad_norm": 0.24823328852653503, + "learning_rate": 0.00010751101232832401, + "loss": 0.2849, + "step": 15060 + }, + { + "epoch": 1.8974399345026294, + "grad_norm": 0.2675158977508545, + "learning_rate": 0.00010740560306755787, + "loss": 0.2744, + "step": 15065 + }, + { + "epoch": 1.8980697169128065, + "grad_norm": 0.2589218318462372, + "learning_rate": 0.0001073002166809558, + "loss": 0.2834, + "step": 15070 + }, + { + "epoch": 1.898699499322984, + "grad_norm": 0.277705579996109, + "learning_rate": 0.00010719485322511273, + "loss": 0.2826, + "step": 15075 + }, + { + "epoch": 1.899329281733161, + "grad_norm": 0.23539955914020538, + "learning_rate": 0.0001070895127566113, + "loss": 0.2589, + "step": 15080 + }, + { + "epoch": 1.8999590641433386, + "grad_norm": 0.3010064661502838, + "learning_rate": 0.00010698419533202172, + "loss": 0.2804, + "step": 15085 + }, + { + "epoch": 1.9005888465535157, + "grad_norm": 0.25453826785087585, + "learning_rate": 0.00010687890100790175, + "loss": 0.2863, + "step": 15090 + }, + { + "epoch": 1.9012186289636932, + "grad_norm": 0.2774878144264221, + "learning_rate": 0.00010677362984079699, + "loss": 0.2933, + "step": 15095 + }, + { + "epoch": 1.9018484113738703, + "grad_norm": 0.26002323627471924, + "learning_rate": 0.00010666838188724038, + "loss": 0.2891, + "step": 15100 + }, + { + "epoch": 1.9024781937840476, + "grad_norm": 0.25788870453834534, + "learning_rate": 0.00010656315720375246, + "loss": 0.2934, + "step": 15105 + }, + { + "epoch": 1.903107976194225, + "grad_norm": 0.24301236867904663, + "learning_rate": 0.00010645795584684138, + "loss": 0.2848, + "step": 15110 + }, + { + "epoch": 1.9037377586044022, + "grad_norm": 0.309514582157135, + "learning_rate": 0.00010635277787300256, + "loss": 0.2846, + "step": 15115 + }, + { + "epoch": 1.9043675410145795, + "grad_norm": 0.274870902299881, + "learning_rate": 0.00010624762333871913, + "loss": 0.2956, + "step": 15120 + }, + { + "epoch": 1.9049973234247566, + "grad_norm": 0.24861137568950653, + "learning_rate": 0.00010614249230046129, + "loss": 0.2777, + "step": 15125 + }, + { + "epoch": 1.905627105834934, + "grad_norm": 0.26125532388687134, + "learning_rate": 0.00010603738481468693, + "loss": 0.2794, + "step": 15130 + }, + { + "epoch": 1.9062568882451112, + "grad_norm": 0.24094760417938232, + "learning_rate": 0.0001059323009378411, + "loss": 0.2633, + "step": 15135 + }, + { + "epoch": 1.9068866706552887, + "grad_norm": 0.3418034315109253, + "learning_rate": 0.0001058272407263563, + "loss": 0.3045, + "step": 15140 + }, + { + "epoch": 1.9075164530654658, + "grad_norm": 0.2657215893268585, + "learning_rate": 0.00010572220423665222, + "loss": 0.3085, + "step": 15145 + }, + { + "epoch": 1.9081462354756433, + "grad_norm": 0.23728597164154053, + "learning_rate": 0.00010561719152513591, + "loss": 0.2788, + "step": 15150 + }, + { + "epoch": 1.9087760178858204, + "grad_norm": 0.2741139829158783, + "learning_rate": 0.0001055122026482016, + "loss": 0.2855, + "step": 15155 + }, + { + "epoch": 1.9094058002959977, + "grad_norm": 0.2415517419576645, + "learning_rate": 0.00010540723766223064, + "loss": 0.2799, + "step": 15160 + }, + { + "epoch": 1.910035582706175, + "grad_norm": 0.2724277675151825, + "learning_rate": 0.00010530229662359162, + "loss": 0.2821, + "step": 15165 + }, + { + "epoch": 1.9106653651163523, + "grad_norm": 0.28418639302253723, + "learning_rate": 0.00010519737958864036, + "loss": 0.2899, + "step": 15170 + }, + { + "epoch": 1.9112951475265296, + "grad_norm": 0.26423749327659607, + "learning_rate": 0.00010509248661371962, + "loss": 0.3033, + "step": 15175 + }, + { + "epoch": 1.9119249299367067, + "grad_norm": 0.2523916959762573, + "learning_rate": 0.00010498761775515941, + "loss": 0.2763, + "step": 15180 + }, + { + "epoch": 1.9125547123468842, + "grad_norm": 0.29665645956993103, + "learning_rate": 0.00010488277306927663, + "loss": 0.2918, + "step": 15185 + }, + { + "epoch": 1.9131844947570613, + "grad_norm": 0.2941978871822357, + "learning_rate": 0.00010477795261237537, + "loss": 0.2753, + "step": 15190 + }, + { + "epoch": 1.9138142771672388, + "grad_norm": 0.2701078951358795, + "learning_rate": 0.00010467315644074646, + "loss": 0.2925, + "step": 15195 + }, + { + "epoch": 1.914444059577416, + "grad_norm": 0.2497081160545349, + "learning_rate": 0.00010456838461066793, + "loss": 0.2669, + "step": 15200 + }, + { + "epoch": 1.9150738419875935, + "grad_norm": 0.2448865920305252, + "learning_rate": 0.00010446363717840462, + "loss": 0.2766, + "step": 15205 + }, + { + "epoch": 1.9157036243977705, + "grad_norm": 0.26188936829566956, + "learning_rate": 0.00010435891420020833, + "loss": 0.2935, + "step": 15210 + }, + { + "epoch": 1.9163334068079478, + "grad_norm": 0.3044489622116089, + "learning_rate": 0.00010425421573231767, + "loss": 0.2791, + "step": 15215 + }, + { + "epoch": 1.9169631892181251, + "grad_norm": 0.30361208319664, + "learning_rate": 0.00010414954183095813, + "loss": 0.277, + "step": 15220 + }, + { + "epoch": 1.9175929716283024, + "grad_norm": 0.31100359559059143, + "learning_rate": 0.00010404489255234191, + "loss": 0.2687, + "step": 15225 + }, + { + "epoch": 1.9182227540384797, + "grad_norm": 0.26500749588012695, + "learning_rate": 0.00010394026795266814, + "loss": 0.2804, + "step": 15230 + }, + { + "epoch": 1.9188525364486568, + "grad_norm": 0.33220374584198, + "learning_rate": 0.00010383566808812257, + "loss": 0.284, + "step": 15235 + }, + { + "epoch": 1.9194823188588344, + "grad_norm": 0.23146981000900269, + "learning_rate": 0.00010373109301487777, + "loss": 0.2949, + "step": 15240 + }, + { + "epoch": 1.9201121012690114, + "grad_norm": 0.24833330512046814, + "learning_rate": 0.00010362654278909292, + "loss": 0.2685, + "step": 15245 + }, + { + "epoch": 1.920741883679189, + "grad_norm": 0.22905099391937256, + "learning_rate": 0.00010352201746691381, + "loss": 0.248, + "step": 15250 + }, + { + "epoch": 1.921371666089366, + "grad_norm": 0.2544589936733246, + "learning_rate": 0.00010341751710447308, + "loss": 0.2763, + "step": 15255 + }, + { + "epoch": 1.9220014484995434, + "grad_norm": 0.24207763373851776, + "learning_rate": 0.0001033130417578897, + "loss": 0.2691, + "step": 15260 + }, + { + "epoch": 1.9226312309097207, + "grad_norm": 0.3025490939617157, + "learning_rate": 0.0001032085914832693, + "loss": 0.2902, + "step": 15265 + }, + { + "epoch": 1.923261013319898, + "grad_norm": 0.2563372552394867, + "learning_rate": 0.00010310416633670413, + "loss": 0.2937, + "step": 15270 + }, + { + "epoch": 1.9238907957300753, + "grad_norm": 0.22143816947937012, + "learning_rate": 0.00010299976637427285, + "loss": 0.2615, + "step": 15275 + }, + { + "epoch": 1.9245205781402526, + "grad_norm": 0.26383697986602783, + "learning_rate": 0.00010289539165204058, + "loss": 0.2834, + "step": 15280 + }, + { + "epoch": 1.9251503605504299, + "grad_norm": 0.2607567310333252, + "learning_rate": 0.00010279104222605903, + "loss": 0.2875, + "step": 15285 + }, + { + "epoch": 1.925780142960607, + "grad_norm": 0.23255427181720734, + "learning_rate": 0.0001026867181523662, + "loss": 0.2645, + "step": 15290 + }, + { + "epoch": 1.9264099253707845, + "grad_norm": 0.2203371226787567, + "learning_rate": 0.00010258241948698641, + "loss": 0.276, + "step": 15295 + }, + { + "epoch": 1.9270397077809616, + "grad_norm": 0.2557859718799591, + "learning_rate": 0.00010247814628593052, + "loss": 0.2877, + "step": 15300 + }, + { + "epoch": 1.927669490191139, + "grad_norm": 0.2551586925983429, + "learning_rate": 0.00010237389860519557, + "loss": 0.2678, + "step": 15305 + }, + { + "epoch": 1.9282992726013162, + "grad_norm": 0.2592737376689911, + "learning_rate": 0.00010226967650076495, + "loss": 0.2645, + "step": 15310 + }, + { + "epoch": 1.9289290550114935, + "grad_norm": 0.25076064467430115, + "learning_rate": 0.00010216548002860836, + "loss": 0.2595, + "step": 15315 + }, + { + "epoch": 1.9295588374216708, + "grad_norm": 0.28892189264297485, + "learning_rate": 0.0001020613092446816, + "loss": 0.2658, + "step": 15320 + }, + { + "epoch": 1.930188619831848, + "grad_norm": 0.28119730949401855, + "learning_rate": 0.00010195716420492692, + "loss": 0.2783, + "step": 15325 + }, + { + "epoch": 1.9308184022420254, + "grad_norm": 0.23143291473388672, + "learning_rate": 0.00010185304496527239, + "loss": 0.2745, + "step": 15330 + }, + { + "epoch": 1.9314481846522027, + "grad_norm": 0.23947221040725708, + "learning_rate": 0.00010174895158163252, + "loss": 0.2642, + "step": 15335 + }, + { + "epoch": 1.93207796706238, + "grad_norm": 0.27924421429634094, + "learning_rate": 0.00010164488410990779, + "loss": 0.2895, + "step": 15340 + }, + { + "epoch": 1.932707749472557, + "grad_norm": 0.2736763656139374, + "learning_rate": 0.00010154084260598488, + "loss": 0.2798, + "step": 15345 + }, + { + "epoch": 1.9333375318827346, + "grad_norm": 0.26288047432899475, + "learning_rate": 0.00010143682712573639, + "loss": 0.2799, + "step": 15350 + }, + { + "epoch": 1.9339673142929117, + "grad_norm": 0.2662082016468048, + "learning_rate": 0.00010133283772502105, + "loss": 0.2708, + "step": 15355 + }, + { + "epoch": 1.9345970967030892, + "grad_norm": 0.2595316767692566, + "learning_rate": 0.00010122887445968358, + "loss": 0.2631, + "step": 15360 + }, + { + "epoch": 1.9352268791132663, + "grad_norm": 0.22839054465293884, + "learning_rate": 0.00010112493738555453, + "loss": 0.2533, + "step": 15365 + }, + { + "epoch": 1.9358566615234436, + "grad_norm": 0.25195086002349854, + "learning_rate": 0.0001010210265584505, + "loss": 0.26, + "step": 15370 + }, + { + "epoch": 1.936486443933621, + "grad_norm": 0.2431613951921463, + "learning_rate": 0.00010091714203417404, + "loss": 0.2802, + "step": 15375 + }, + { + "epoch": 1.9371162263437982, + "grad_norm": 0.24503393471240997, + "learning_rate": 0.00010081328386851342, + "loss": 0.2968, + "step": 15380 + }, + { + "epoch": 1.9377460087539755, + "grad_norm": 0.26283174753189087, + "learning_rate": 0.00010070945211724298, + "loss": 0.2831, + "step": 15385 + }, + { + "epoch": 1.9383757911641528, + "grad_norm": 0.23644685745239258, + "learning_rate": 0.00010060564683612264, + "loss": 0.2843, + "step": 15390 + }, + { + "epoch": 1.9390055735743301, + "grad_norm": 0.271457314491272, + "learning_rate": 0.00010050186808089828, + "loss": 0.2736, + "step": 15395 + }, + { + "epoch": 1.9396353559845072, + "grad_norm": 0.2437523454427719, + "learning_rate": 0.00010039811590730137, + "loss": 0.2839, + "step": 15400 + }, + { + "epoch": 1.9402651383946847, + "grad_norm": 0.25611042976379395, + "learning_rate": 0.00010029439037104925, + "loss": 0.2671, + "step": 15405 + }, + { + "epoch": 1.9408949208048618, + "grad_norm": 0.2646775245666504, + "learning_rate": 0.00010019069152784486, + "loss": 0.3072, + "step": 15410 + }, + { + "epoch": 1.9415247032150393, + "grad_norm": 0.26959145069122314, + "learning_rate": 0.00010008701943337695, + "loss": 0.2655, + "step": 15415 + }, + { + "epoch": 1.9421544856252164, + "grad_norm": 0.28409838676452637, + "learning_rate": 9.998337414331971e-05, + "loss": 0.2643, + "step": 15420 + }, + { + "epoch": 1.9427842680353937, + "grad_norm": 0.288766086101532, + "learning_rate": 9.987975571333303e-05, + "loss": 0.2849, + "step": 15425 + }, + { + "epoch": 1.943414050445571, + "grad_norm": 0.28650057315826416, + "learning_rate": 9.977616419906247e-05, + "loss": 0.2672, + "step": 15430 + }, + { + "epoch": 1.9440438328557483, + "grad_norm": 0.28229546546936035, + "learning_rate": 9.967259965613893e-05, + "loss": 0.2649, + "step": 15435 + }, + { + "epoch": 1.9446736152659256, + "grad_norm": 0.21892526745796204, + "learning_rate": 9.956906214017894e-05, + "loss": 0.2668, + "step": 15440 + }, + { + "epoch": 1.945303397676103, + "grad_norm": 0.27021822333335876, + "learning_rate": 9.946555170678458e-05, + "loss": 0.2725, + "step": 15445 + }, + { + "epoch": 1.9459331800862802, + "grad_norm": 0.2574271857738495, + "learning_rate": 9.936206841154328e-05, + "loss": 0.2643, + "step": 15450 + }, + { + "epoch": 1.9465629624964573, + "grad_norm": 0.2907993495464325, + "learning_rate": 9.925861231002792e-05, + "loss": 0.3103, + "step": 15455 + }, + { + "epoch": 1.9471927449066349, + "grad_norm": 0.225221186876297, + "learning_rate": 9.915518345779681e-05, + "loss": 0.2804, + "step": 15460 + }, + { + "epoch": 1.947822527316812, + "grad_norm": 0.2557651400566101, + "learning_rate": 9.905178191039365e-05, + "loss": 0.2735, + "step": 15465 + }, + { + "epoch": 1.9484523097269895, + "grad_norm": 0.26498880982398987, + "learning_rate": 9.894840772334733e-05, + "loss": 0.2664, + "step": 15470 + }, + { + "epoch": 1.9490820921371665, + "grad_norm": 0.2424790859222412, + "learning_rate": 9.884506095217222e-05, + "loss": 0.2693, + "step": 15475 + }, + { + "epoch": 1.9497118745473438, + "grad_norm": 0.25802767276763916, + "learning_rate": 9.87417416523679e-05, + "loss": 0.2831, + "step": 15480 + }, + { + "epoch": 1.9503416569575212, + "grad_norm": 0.2601839005947113, + "learning_rate": 9.863844987941912e-05, + "loss": 0.2629, + "step": 15485 + }, + { + "epoch": 1.9509714393676985, + "grad_norm": 0.26015961170196533, + "learning_rate": 9.853518568879602e-05, + "loss": 0.2634, + "step": 15490 + }, + { + "epoch": 1.9516012217778758, + "grad_norm": 0.2370160073041916, + "learning_rate": 9.843194913595374e-05, + "loss": 0.2557, + "step": 15495 + }, + { + "epoch": 1.952231004188053, + "grad_norm": 0.2519363462924957, + "learning_rate": 9.832874027633281e-05, + "loss": 0.255, + "step": 15500 + }, + { + "epoch": 1.9528607865982304, + "grad_norm": 0.3419806659221649, + "learning_rate": 9.822555916535858e-05, + "loss": 0.2744, + "step": 15505 + }, + { + "epoch": 1.9534905690084075, + "grad_norm": 0.24397574365139008, + "learning_rate": 9.812240585844176e-05, + "loss": 0.2619, + "step": 15510 + }, + { + "epoch": 1.954120351418585, + "grad_norm": 0.2432924211025238, + "learning_rate": 9.801928041097795e-05, + "loss": 0.2581, + "step": 15515 + }, + { + "epoch": 1.954750133828762, + "grad_norm": 0.27478650212287903, + "learning_rate": 9.791618287834797e-05, + "loss": 0.2606, + "step": 15520 + }, + { + "epoch": 1.9553799162389396, + "grad_norm": 0.29080766439437866, + "learning_rate": 9.781311331591747e-05, + "loss": 0.2656, + "step": 15525 + }, + { + "epoch": 1.9560096986491167, + "grad_norm": 0.24801793694496155, + "learning_rate": 9.771007177903723e-05, + "loss": 0.2651, + "step": 15530 + }, + { + "epoch": 1.956639481059294, + "grad_norm": 0.22467739880084991, + "learning_rate": 9.76070583230429e-05, + "loss": 0.2663, + "step": 15535 + }, + { + "epoch": 1.9572692634694713, + "grad_norm": 0.24151213467121124, + "learning_rate": 9.750407300325502e-05, + "loss": 0.2612, + "step": 15540 + }, + { + "epoch": 1.9578990458796486, + "grad_norm": 0.262352854013443, + "learning_rate": 9.7401115874979e-05, + "loss": 0.2508, + "step": 15545 + }, + { + "epoch": 1.9585288282898259, + "grad_norm": 0.2491580843925476, + "learning_rate": 9.72981869935053e-05, + "loss": 0.2735, + "step": 15550 + }, + { + "epoch": 1.9591586107000032, + "grad_norm": 0.27000558376312256, + "learning_rate": 9.719528641410898e-05, + "loss": 0.2794, + "step": 15555 + }, + { + "epoch": 1.9597883931101805, + "grad_norm": 0.2562926113605499, + "learning_rate": 9.709241419205008e-05, + "loss": 0.2829, + "step": 15560 + }, + { + "epoch": 1.9604181755203576, + "grad_norm": 0.2559642493724823, + "learning_rate": 9.69895703825733e-05, + "loss": 0.2768, + "step": 15565 + }, + { + "epoch": 1.961047957930535, + "grad_norm": 0.23282787203788757, + "learning_rate": 9.688675504090811e-05, + "loss": 0.2648, + "step": 15570 + }, + { + "epoch": 1.9616777403407122, + "grad_norm": 0.2280416637659073, + "learning_rate": 9.678396822226868e-05, + "loss": 0.2474, + "step": 15575 + }, + { + "epoch": 1.9623075227508897, + "grad_norm": 0.2516798973083496, + "learning_rate": 9.668120998185392e-05, + "loss": 0.2855, + "step": 15580 + }, + { + "epoch": 1.9629373051610668, + "grad_norm": 0.24892964959144592, + "learning_rate": 9.657848037484726e-05, + "loss": 0.2731, + "step": 15585 + }, + { + "epoch": 1.963567087571244, + "grad_norm": 0.2524420917034149, + "learning_rate": 9.647577945641699e-05, + "loss": 0.275, + "step": 15590 + }, + { + "epoch": 1.9641968699814214, + "grad_norm": 0.2617582380771637, + "learning_rate": 9.637310728171577e-05, + "loss": 0.293, + "step": 15595 + }, + { + "epoch": 1.9648266523915987, + "grad_norm": 0.2635948061943054, + "learning_rate": 9.627046390588086e-05, + "loss": 0.2642, + "step": 15600 + }, + { + "epoch": 1.965456434801776, + "grad_norm": 0.22701425850391388, + "learning_rate": 9.61678493840342e-05, + "loss": 0.2647, + "step": 15605 + }, + { + "epoch": 1.9660862172119533, + "grad_norm": 0.2594752609729767, + "learning_rate": 9.606526377128207e-05, + "loss": 0.2846, + "step": 15610 + }, + { + "epoch": 1.9667159996221306, + "grad_norm": 0.25541216135025024, + "learning_rate": 9.596270712271524e-05, + "loss": 0.2712, + "step": 15615 + }, + { + "epoch": 1.9673457820323077, + "grad_norm": 0.26473337411880493, + "learning_rate": 9.586017949340909e-05, + "loss": 0.2515, + "step": 15620 + }, + { + "epoch": 1.9679755644424852, + "grad_norm": 0.2370501607656479, + "learning_rate": 9.575768093842321e-05, + "loss": 0.2569, + "step": 15625 + }, + { + "epoch": 1.9686053468526623, + "grad_norm": 0.25999268889427185, + "learning_rate": 9.565521151280168e-05, + "loss": 0.2846, + "step": 15630 + }, + { + "epoch": 1.9692351292628398, + "grad_norm": 0.2597227394580841, + "learning_rate": 9.555277127157294e-05, + "loss": 0.2814, + "step": 15635 + }, + { + "epoch": 1.969864911673017, + "grad_norm": 0.22267143428325653, + "learning_rate": 9.545036026974979e-05, + "loss": 0.2703, + "step": 15640 + }, + { + "epoch": 1.9704946940831942, + "grad_norm": 0.2599702477455139, + "learning_rate": 9.534797856232913e-05, + "loss": 0.2741, + "step": 15645 + }, + { + "epoch": 1.9711244764933715, + "grad_norm": 0.23703083395957947, + "learning_rate": 9.524562620429243e-05, + "loss": 0.2657, + "step": 15650 + }, + { + "epoch": 1.9717542589035488, + "grad_norm": 0.24194732308387756, + "learning_rate": 9.514330325060515e-05, + "loss": 0.2613, + "step": 15655 + }, + { + "epoch": 1.9723840413137261, + "grad_norm": 0.2648374140262604, + "learning_rate": 9.504100975621709e-05, + "loss": 0.2808, + "step": 15660 + }, + { + "epoch": 1.9730138237239034, + "grad_norm": 0.2491552084684372, + "learning_rate": 9.493874577606218e-05, + "loss": 0.2622, + "step": 15665 + }, + { + "epoch": 1.9736436061340807, + "grad_norm": 0.25322696566581726, + "learning_rate": 9.483651136505857e-05, + "loss": 0.2716, + "step": 15670 + }, + { + "epoch": 1.9742733885442578, + "grad_norm": 0.22565199434757233, + "learning_rate": 9.473430657810838e-05, + "loss": 0.2947, + "step": 15675 + }, + { + "epoch": 1.9749031709544353, + "grad_norm": 0.2594245672225952, + "learning_rate": 9.463213147009795e-05, + "loss": 0.2793, + "step": 15680 + }, + { + "epoch": 1.9755329533646124, + "grad_norm": 0.2432025521993637, + "learning_rate": 9.452998609589769e-05, + "loss": 0.2559, + "step": 15685 + }, + { + "epoch": 1.97616273577479, + "grad_norm": 0.2537454068660736, + "learning_rate": 9.442787051036192e-05, + "loss": 0.2842, + "step": 15690 + }, + { + "epoch": 1.976792518184967, + "grad_norm": 0.2597581446170807, + "learning_rate": 9.432578476832911e-05, + "loss": 0.2755, + "step": 15695 + }, + { + "epoch": 1.9774223005951443, + "grad_norm": 0.25382810831069946, + "learning_rate": 9.42237289246216e-05, + "loss": 0.2653, + "step": 15700 + }, + { + "epoch": 1.9780520830053216, + "grad_norm": 0.23822832107543945, + "learning_rate": 9.412170303404579e-05, + "loss": 0.2624, + "step": 15705 + }, + { + "epoch": 1.978681865415499, + "grad_norm": 0.2722800076007843, + "learning_rate": 9.40197071513918e-05, + "loss": 0.2712, + "step": 15710 + }, + { + "epoch": 1.9793116478256763, + "grad_norm": 0.2273283749818802, + "learning_rate": 9.39177413314338e-05, + "loss": 0.2545, + "step": 15715 + }, + { + "epoch": 1.9799414302358536, + "grad_norm": 0.24674946069717407, + "learning_rate": 9.381580562892972e-05, + "loss": 0.2606, + "step": 15720 + }, + { + "epoch": 1.9805712126460309, + "grad_norm": 0.23100855946540833, + "learning_rate": 9.371390009862145e-05, + "loss": 0.2632, + "step": 15725 + }, + { + "epoch": 1.981200995056208, + "grad_norm": 0.23489323258399963, + "learning_rate": 9.361202479523448e-05, + "loss": 0.2833, + "step": 15730 + }, + { + "epoch": 1.9818307774663855, + "grad_norm": 0.26526087522506714, + "learning_rate": 9.35101797734783e-05, + "loss": 0.2717, + "step": 15735 + }, + { + "epoch": 1.9824605598765626, + "grad_norm": 0.27042046189308167, + "learning_rate": 9.340836508804595e-05, + "loss": 0.2664, + "step": 15740 + }, + { + "epoch": 1.98309034228674, + "grad_norm": 0.28461650013923645, + "learning_rate": 9.330658079361422e-05, + "loss": 0.26, + "step": 15745 + }, + { + "epoch": 1.9837201246969172, + "grad_norm": 0.26529213786125183, + "learning_rate": 9.320482694484356e-05, + "loss": 0.2808, + "step": 15750 + }, + { + "epoch": 1.9843499071070945, + "grad_norm": 0.32026639580726624, + "learning_rate": 9.310310359637823e-05, + "loss": 0.2631, + "step": 15755 + }, + { + "epoch": 1.9849796895172718, + "grad_norm": 0.2596029043197632, + "learning_rate": 9.300141080284588e-05, + "loss": 0.2771, + "step": 15760 + }, + { + "epoch": 1.985609471927449, + "grad_norm": 0.25400853157043457, + "learning_rate": 9.289974861885796e-05, + "loss": 0.2532, + "step": 15765 + }, + { + "epoch": 1.9862392543376264, + "grad_norm": 0.29176244139671326, + "learning_rate": 9.279811709900934e-05, + "loss": 0.2719, + "step": 15770 + }, + { + "epoch": 1.9868690367478037, + "grad_norm": 0.2530720829963684, + "learning_rate": 9.26965162978785e-05, + "loss": 0.2676, + "step": 15775 + }, + { + "epoch": 1.987498819157981, + "grad_norm": 0.23311518132686615, + "learning_rate": 9.259494627002728e-05, + "loss": 0.2632, + "step": 15780 + }, + { + "epoch": 1.988128601568158, + "grad_norm": 0.2402007132768631, + "learning_rate": 9.249340707000123e-05, + "loss": 0.2419, + "step": 15785 + }, + { + "epoch": 1.9887583839783356, + "grad_norm": 0.24241755902767181, + "learning_rate": 9.239189875232914e-05, + "loss": 0.2567, + "step": 15790 + }, + { + "epoch": 1.9893881663885127, + "grad_norm": 0.31180015206336975, + "learning_rate": 9.229042137152337e-05, + "loss": 0.2864, + "step": 15795 + }, + { + "epoch": 1.9900179487986902, + "grad_norm": 0.29330986738204956, + "learning_rate": 9.218897498207952e-05, + "loss": 0.2626, + "step": 15800 + }, + { + "epoch": 1.9906477312088673, + "grad_norm": 0.24687117338180542, + "learning_rate": 9.208755963847663e-05, + "loss": 0.2721, + "step": 15805 + }, + { + "epoch": 1.9912775136190446, + "grad_norm": 0.22886700928211212, + "learning_rate": 9.198617539517714e-05, + "loss": 0.2626, + "step": 15810 + }, + { + "epoch": 1.991907296029222, + "grad_norm": 0.2331649363040924, + "learning_rate": 9.188482230662662e-05, + "loss": 0.2484, + "step": 15815 + }, + { + "epoch": 1.9925370784393992, + "grad_norm": 0.23371124267578125, + "learning_rate": 9.178350042725397e-05, + "loss": 0.291, + "step": 15820 + }, + { + "epoch": 1.9931668608495765, + "grad_norm": 0.2478175163269043, + "learning_rate": 9.168220981147143e-05, + "loss": 0.2748, + "step": 15825 + }, + { + "epoch": 1.9937966432597538, + "grad_norm": 0.25952714681625366, + "learning_rate": 9.158095051367433e-05, + "loss": 0.2568, + "step": 15830 + }, + { + "epoch": 1.9944264256699311, + "grad_norm": 0.2522846758365631, + "learning_rate": 9.14797225882412e-05, + "loss": 0.2414, + "step": 15835 + }, + { + "epoch": 1.9950562080801082, + "grad_norm": 0.24577966332435608, + "learning_rate": 9.137852608953384e-05, + "loss": 0.2573, + "step": 15840 + }, + { + "epoch": 1.9956859904902857, + "grad_norm": 0.2714809775352478, + "learning_rate": 9.127736107189705e-05, + "loss": 0.2703, + "step": 15845 + }, + { + "epoch": 1.9963157729004628, + "grad_norm": 0.25562793016433716, + "learning_rate": 9.117622758965866e-05, + "loss": 0.2601, + "step": 15850 + }, + { + "epoch": 1.9969455553106403, + "grad_norm": 0.23811009526252747, + "learning_rate": 9.107512569712975e-05, + "loss": 0.2474, + "step": 15855 + }, + { + "epoch": 1.9975753377208174, + "grad_norm": 0.24953188002109528, + "learning_rate": 9.097405544860437e-05, + "loss": 0.2582, + "step": 15860 + }, + { + "epoch": 1.9982051201309947, + "grad_norm": 0.23611120879650116, + "learning_rate": 9.087301689835944e-05, + "loss": 0.253, + "step": 15865 + }, + { + "epoch": 1.998834902541172, + "grad_norm": 0.24487170577049255, + "learning_rate": 9.077201010065509e-05, + "loss": 0.2508, + "step": 15870 + }, + { + "epoch": 1.9994646849513493, + "grad_norm": 0.2839270830154419, + "learning_rate": 9.06710351097342e-05, + "loss": 0.2748, + "step": 15875 + }, + { + "epoch": 2.0, + "grad_norm": 0.2304636538028717, + "learning_rate": 9.057009197982272e-05, + "loss": 0.2653, + "step": 15880 + }, + { + "epoch": 2.000629782410177, + "grad_norm": 0.19537301361560822, + "learning_rate": 9.046918076512935e-05, + "loss": 0.1844, + "step": 15885 + }, + { + "epoch": 2.0012595648203546, + "grad_norm": 0.22054894268512726, + "learning_rate": 9.036830151984571e-05, + "loss": 0.196, + "step": 15890 + }, + { + "epoch": 2.0018893472305317, + "grad_norm": 0.23987694084644318, + "learning_rate": 9.02674542981463e-05, + "loss": 0.1962, + "step": 15895 + }, + { + "epoch": 2.002519129640709, + "grad_norm": 0.24562768638134003, + "learning_rate": 9.016663915418835e-05, + "loss": 0.1826, + "step": 15900 + }, + { + "epoch": 2.0031489120508863, + "grad_norm": 0.27111175656318665, + "learning_rate": 9.00658561421119e-05, + "loss": 0.2025, + "step": 15905 + }, + { + "epoch": 2.003778694461064, + "grad_norm": 0.21321839094161987, + "learning_rate": 8.99651053160398e-05, + "loss": 0.1743, + "step": 15910 + }, + { + "epoch": 2.004408476871241, + "grad_norm": 0.2295263558626175, + "learning_rate": 8.986438673007749e-05, + "loss": 0.1856, + "step": 15915 + }, + { + "epoch": 2.0050382592814184, + "grad_norm": 0.22658327221870422, + "learning_rate": 8.976370043831313e-05, + "loss": 0.1896, + "step": 15920 + }, + { + "epoch": 2.0056680416915955, + "grad_norm": 0.21595464646816254, + "learning_rate": 8.966304649481753e-05, + "loss": 0.1865, + "step": 15925 + }, + { + "epoch": 2.006297824101773, + "grad_norm": 0.23339222371578217, + "learning_rate": 8.956242495364426e-05, + "loss": 0.1866, + "step": 15930 + }, + { + "epoch": 2.00692760651195, + "grad_norm": 0.20041927695274353, + "learning_rate": 8.946183586882929e-05, + "loss": 0.1745, + "step": 15935 + }, + { + "epoch": 2.007557388922127, + "grad_norm": 0.19914592802524567, + "learning_rate": 8.936127929439131e-05, + "loss": 0.1885, + "step": 15940 + }, + { + "epoch": 2.0081871713323047, + "grad_norm": 0.20688550174236298, + "learning_rate": 8.926075528433149e-05, + "loss": 0.1932, + "step": 15945 + }, + { + "epoch": 2.008816953742482, + "grad_norm": 0.23507048189640045, + "learning_rate": 8.916026389263358e-05, + "loss": 0.1865, + "step": 15950 + }, + { + "epoch": 2.0094467361526593, + "grad_norm": 0.2366725355386734, + "learning_rate": 8.905980517326358e-05, + "loss": 0.1867, + "step": 15955 + }, + { + "epoch": 2.0100765185628364, + "grad_norm": 0.20678187906742096, + "learning_rate": 8.895937918017028e-05, + "loss": 0.1785, + "step": 15960 + }, + { + "epoch": 2.010706300973014, + "grad_norm": 0.2642296850681305, + "learning_rate": 8.885898596728463e-05, + "loss": 0.1812, + "step": 15965 + }, + { + "epoch": 2.011336083383191, + "grad_norm": 0.20598894357681274, + "learning_rate": 8.875862558852016e-05, + "loss": 0.1861, + "step": 15970 + }, + { + "epoch": 2.0119658657933686, + "grad_norm": 0.23556114733219147, + "learning_rate": 8.865829809777265e-05, + "loss": 0.1873, + "step": 15975 + }, + { + "epoch": 2.0125956482035456, + "grad_norm": 0.25772175192832947, + "learning_rate": 8.855800354892022e-05, + "loss": 0.1858, + "step": 15980 + }, + { + "epoch": 2.013225430613723, + "grad_norm": 0.21538549661636353, + "learning_rate": 8.845774199582344e-05, + "loss": 0.1738, + "step": 15985 + }, + { + "epoch": 2.0138552130239002, + "grad_norm": 0.22819840908050537, + "learning_rate": 8.835751349232496e-05, + "loss": 0.1843, + "step": 15990 + }, + { + "epoch": 2.0144849954340773, + "grad_norm": 0.23319579660892487, + "learning_rate": 8.825731809224976e-05, + "loss": 0.1878, + "step": 15995 + }, + { + "epoch": 2.015114777844255, + "grad_norm": 0.24107947945594788, + "learning_rate": 8.815715584940511e-05, + "loss": 0.1867, + "step": 16000 + }, + { + "epoch": 2.015114777844255, + "eval_loss": 0.3415575921535492, + "eval_runtime": 6.166, + "eval_samples_per_second": 162.181, + "eval_steps_per_second": 10.217, + "step": 16000 + }, + { + "epoch": 2.015744560254432, + "grad_norm": 0.2272019386291504, + "learning_rate": 8.805702681758042e-05, + "loss": 0.1718, + "step": 16005 + }, + { + "epoch": 2.0163743426646095, + "grad_norm": 0.22147491574287415, + "learning_rate": 8.795693105054723e-05, + "loss": 0.175, + "step": 16010 + }, + { + "epoch": 2.0170041250747865, + "grad_norm": 0.21899926662445068, + "learning_rate": 8.785686860205929e-05, + "loss": 0.1749, + "step": 16015 + }, + { + "epoch": 2.017633907484964, + "grad_norm": 0.24299047887325287, + "learning_rate": 8.775683952585246e-05, + "loss": 0.1902, + "step": 16020 + }, + { + "epoch": 2.018263689895141, + "grad_norm": 0.24278461933135986, + "learning_rate": 8.765684387564454e-05, + "loss": 0.1872, + "step": 16025 + }, + { + "epoch": 2.0188934723053187, + "grad_norm": 0.24929705262184143, + "learning_rate": 8.75568817051355e-05, + "loss": 0.1838, + "step": 16030 + }, + { + "epoch": 2.0195232547154958, + "grad_norm": 0.20675018429756165, + "learning_rate": 8.745695306800738e-05, + "loss": 0.1734, + "step": 16035 + }, + { + "epoch": 2.0201530371256733, + "grad_norm": 0.25064778327941895, + "learning_rate": 8.73570580179241e-05, + "loss": 0.1821, + "step": 16040 + }, + { + "epoch": 2.0207828195358504, + "grad_norm": 0.23618988692760468, + "learning_rate": 8.725719660853157e-05, + "loss": 0.1935, + "step": 16045 + }, + { + "epoch": 2.0214126019460275, + "grad_norm": 0.2201015204191208, + "learning_rate": 8.715736889345766e-05, + "loss": 0.1806, + "step": 16050 + }, + { + "epoch": 2.022042384356205, + "grad_norm": 0.23748455941677094, + "learning_rate": 8.705757492631214e-05, + "loss": 0.1807, + "step": 16055 + }, + { + "epoch": 2.022672166766382, + "grad_norm": 0.2563530504703522, + "learning_rate": 8.695781476068664e-05, + "loss": 0.1825, + "step": 16060 + }, + { + "epoch": 2.0233019491765596, + "grad_norm": 0.27659016847610474, + "learning_rate": 8.685808845015464e-05, + "loss": 0.1861, + "step": 16065 + }, + { + "epoch": 2.0239317315867367, + "grad_norm": 0.19301186501979828, + "learning_rate": 8.675839604827146e-05, + "loss": 0.1804, + "step": 16070 + }, + { + "epoch": 2.024561513996914, + "grad_norm": 0.245374858379364, + "learning_rate": 8.665873760857415e-05, + "loss": 0.1785, + "step": 16075 + }, + { + "epoch": 2.0251912964070913, + "grad_norm": 0.21472232043743134, + "learning_rate": 8.655911318458166e-05, + "loss": 0.1785, + "step": 16080 + }, + { + "epoch": 2.025821078817269, + "grad_norm": 0.22257132828235626, + "learning_rate": 8.645952282979453e-05, + "loss": 0.1812, + "step": 16085 + }, + { + "epoch": 2.026450861227446, + "grad_norm": 0.25223472714424133, + "learning_rate": 8.635996659769512e-05, + "loss": 0.1934, + "step": 16090 + }, + { + "epoch": 2.0270806436376234, + "grad_norm": 0.22251825034618378, + "learning_rate": 8.626044454174724e-05, + "loss": 0.1895, + "step": 16095 + }, + { + "epoch": 2.0277104260478005, + "grad_norm": 0.2073337882757187, + "learning_rate": 8.616095671539663e-05, + "loss": 0.1851, + "step": 16100 + }, + { + "epoch": 2.0283402084579776, + "grad_norm": 0.21960042417049408, + "learning_rate": 8.606150317207053e-05, + "loss": 0.1809, + "step": 16105 + }, + { + "epoch": 2.028969990868155, + "grad_norm": 0.23633064329624176, + "learning_rate": 8.596208396517771e-05, + "loss": 0.1839, + "step": 16110 + }, + { + "epoch": 2.029599773278332, + "grad_norm": 0.21128375828266144, + "learning_rate": 8.586269914810855e-05, + "loss": 0.1828, + "step": 16115 + }, + { + "epoch": 2.0302295556885097, + "grad_norm": 0.24467304348945618, + "learning_rate": 8.576334877423505e-05, + "loss": 0.1784, + "step": 16120 + }, + { + "epoch": 2.030859338098687, + "grad_norm": 0.24976873397827148, + "learning_rate": 8.566403289691062e-05, + "loss": 0.1924, + "step": 16125 + }, + { + "epoch": 2.0314891205088643, + "grad_norm": 0.258323609828949, + "learning_rate": 8.556475156947008e-05, + "loss": 0.1889, + "step": 16130 + }, + { + "epoch": 2.0321189029190414, + "grad_norm": 0.24420535564422607, + "learning_rate": 8.546550484522973e-05, + "loss": 0.197, + "step": 16135 + }, + { + "epoch": 2.032748685329219, + "grad_norm": 0.2438700944185257, + "learning_rate": 8.536629277748746e-05, + "loss": 0.1958, + "step": 16140 + }, + { + "epoch": 2.033378467739396, + "grad_norm": 0.25343936681747437, + "learning_rate": 8.526711541952236e-05, + "loss": 0.1877, + "step": 16145 + }, + { + "epoch": 2.0340082501495735, + "grad_norm": 0.24403081834316254, + "learning_rate": 8.516797282459493e-05, + "loss": 0.1774, + "step": 16150 + }, + { + "epoch": 2.0346380325597506, + "grad_norm": 0.24733777344226837, + "learning_rate": 8.506886504594704e-05, + "loss": 0.1792, + "step": 16155 + }, + { + "epoch": 2.0352678149699277, + "grad_norm": 0.22619028389453888, + "learning_rate": 8.496979213680177e-05, + "loss": 0.1807, + "step": 16160 + }, + { + "epoch": 2.0358975973801052, + "grad_norm": 0.23040007054805756, + "learning_rate": 8.48707541503636e-05, + "loss": 0.1804, + "step": 16165 + }, + { + "epoch": 2.0365273797902823, + "grad_norm": 0.21034270524978638, + "learning_rate": 8.477175113981813e-05, + "loss": 0.1787, + "step": 16170 + }, + { + "epoch": 2.03715716220046, + "grad_norm": 0.21682168543338776, + "learning_rate": 8.467278315833224e-05, + "loss": 0.1817, + "step": 16175 + }, + { + "epoch": 2.037786944610637, + "grad_norm": 0.2700116038322449, + "learning_rate": 8.457385025905407e-05, + "loss": 0.1896, + "step": 16180 + }, + { + "epoch": 2.0384167270208144, + "grad_norm": 0.214239239692688, + "learning_rate": 8.44749524951128e-05, + "loss": 0.1827, + "step": 16185 + }, + { + "epoch": 2.0390465094309915, + "grad_norm": 0.2243194878101349, + "learning_rate": 8.437608991961885e-05, + "loss": 0.1833, + "step": 16190 + }, + { + "epoch": 2.039676291841169, + "grad_norm": 0.28487569093704224, + "learning_rate": 8.427726258566353e-05, + "loss": 0.1901, + "step": 16195 + }, + { + "epoch": 2.040306074251346, + "grad_norm": 0.24857446551322937, + "learning_rate": 8.41784705463195e-05, + "loss": 0.192, + "step": 16200 + }, + { + "epoch": 2.0409358566615237, + "grad_norm": 0.22208547592163086, + "learning_rate": 8.407971385464032e-05, + "loss": 0.1907, + "step": 16205 + }, + { + "epoch": 2.0415656390717007, + "grad_norm": 0.22752498090267181, + "learning_rate": 8.398099256366057e-05, + "loss": 0.1827, + "step": 16210 + }, + { + "epoch": 2.042195421481878, + "grad_norm": 0.25674304366111755, + "learning_rate": 8.388230672639584e-05, + "loss": 0.1889, + "step": 16215 + }, + { + "epoch": 2.0428252038920554, + "grad_norm": 0.22372281551361084, + "learning_rate": 8.378365639584264e-05, + "loss": 0.1816, + "step": 16220 + }, + { + "epoch": 2.0434549863022324, + "grad_norm": 0.25298216938972473, + "learning_rate": 8.368504162497859e-05, + "loss": 0.1813, + "step": 16225 + }, + { + "epoch": 2.04408476871241, + "grad_norm": 0.21058551967144012, + "learning_rate": 8.358646246676197e-05, + "loss": 0.1855, + "step": 16230 + }, + { + "epoch": 2.044714551122587, + "grad_norm": 0.2757975459098816, + "learning_rate": 8.348791897413196e-05, + "loss": 0.1749, + "step": 16235 + }, + { + "epoch": 2.0453443335327646, + "grad_norm": 0.22646676003932953, + "learning_rate": 8.338941120000884e-05, + "loss": 0.1852, + "step": 16240 + }, + { + "epoch": 2.0459741159429417, + "grad_norm": 0.23769816756248474, + "learning_rate": 8.329093919729342e-05, + "loss": 0.1869, + "step": 16245 + }, + { + "epoch": 2.046603898353119, + "grad_norm": 0.22907455265522003, + "learning_rate": 8.319250301886746e-05, + "loss": 0.1876, + "step": 16250 + }, + { + "epoch": 2.0472336807632963, + "grad_norm": 0.22925196588039398, + "learning_rate": 8.309410271759342e-05, + "loss": 0.1885, + "step": 16255 + }, + { + "epoch": 2.0478634631734733, + "grad_norm": 0.22043700516223907, + "learning_rate": 8.299573834631454e-05, + "loss": 0.181, + "step": 16260 + }, + { + "epoch": 2.048493245583651, + "grad_norm": 0.23858542740345, + "learning_rate": 8.289740995785468e-05, + "loss": 0.1898, + "step": 16265 + }, + { + "epoch": 2.049123027993828, + "grad_norm": 0.23982049524784088, + "learning_rate": 8.279911760501846e-05, + "loss": 0.1838, + "step": 16270 + }, + { + "epoch": 2.0497528104040055, + "grad_norm": 0.21694807708263397, + "learning_rate": 8.270086134059113e-05, + "loss": 0.1795, + "step": 16275 + }, + { + "epoch": 2.0503825928141826, + "grad_norm": 0.20050913095474243, + "learning_rate": 8.260264121733846e-05, + "loss": 0.175, + "step": 16280 + }, + { + "epoch": 2.05101237522436, + "grad_norm": 0.2118636816740036, + "learning_rate": 8.250445728800706e-05, + "loss": 0.1778, + "step": 16285 + }, + { + "epoch": 2.051642157634537, + "grad_norm": 0.2250407338142395, + "learning_rate": 8.240630960532382e-05, + "loss": 0.1885, + "step": 16290 + }, + { + "epoch": 2.0522719400447147, + "grad_norm": 0.2565051019191742, + "learning_rate": 8.230819822199642e-05, + "loss": 0.1901, + "step": 16295 + }, + { + "epoch": 2.0529017224548918, + "grad_norm": 0.24367564916610718, + "learning_rate": 8.221012319071268e-05, + "loss": 0.1798, + "step": 16300 + }, + { + "epoch": 2.0535315048650693, + "grad_norm": 0.24313905835151672, + "learning_rate": 8.211208456414135e-05, + "loss": 0.1908, + "step": 16305 + }, + { + "epoch": 2.0541612872752464, + "grad_norm": 0.23950958251953125, + "learning_rate": 8.201408239493131e-05, + "loss": 0.1815, + "step": 16310 + }, + { + "epoch": 2.0547910696854235, + "grad_norm": 0.24551273882389069, + "learning_rate": 8.1916116735712e-05, + "loss": 0.1941, + "step": 16315 + }, + { + "epoch": 2.055420852095601, + "grad_norm": 0.21070988476276398, + "learning_rate": 8.181818763909314e-05, + "loss": 0.1868, + "step": 16320 + }, + { + "epoch": 2.056050634505778, + "grad_norm": 0.21926933526992798, + "learning_rate": 8.172029515766502e-05, + "loss": 0.1848, + "step": 16325 + }, + { + "epoch": 2.0566804169159556, + "grad_norm": 0.22517934441566467, + "learning_rate": 8.162243934399812e-05, + "loss": 0.1912, + "step": 16330 + }, + { + "epoch": 2.0573101993261327, + "grad_norm": 0.2571990489959717, + "learning_rate": 8.152462025064315e-05, + "loss": 0.1834, + "step": 16335 + }, + { + "epoch": 2.05793998173631, + "grad_norm": 0.22555163502693176, + "learning_rate": 8.14268379301312e-05, + "loss": 0.19, + "step": 16340 + }, + { + "epoch": 2.0585697641464873, + "grad_norm": 0.2326682209968567, + "learning_rate": 8.13290924349737e-05, + "loss": 0.1786, + "step": 16345 + }, + { + "epoch": 2.059199546556665, + "grad_norm": 0.22472088038921356, + "learning_rate": 8.123138381766218e-05, + "loss": 0.1843, + "step": 16350 + }, + { + "epoch": 2.059829328966842, + "grad_norm": 0.2206810563802719, + "learning_rate": 8.113371213066838e-05, + "loss": 0.1781, + "step": 16355 + }, + { + "epoch": 2.0604591113770194, + "grad_norm": 0.2740577757358551, + "learning_rate": 8.103607742644426e-05, + "loss": 0.1875, + "step": 16360 + }, + { + "epoch": 2.0610888937871965, + "grad_norm": 0.22217485308647156, + "learning_rate": 8.093847975742185e-05, + "loss": 0.1748, + "step": 16365 + }, + { + "epoch": 2.0617186761973736, + "grad_norm": 0.2460946887731552, + "learning_rate": 8.084091917601336e-05, + "loss": 0.1839, + "step": 16370 + }, + { + "epoch": 2.062348458607551, + "grad_norm": 0.2489384114742279, + "learning_rate": 8.074339573461101e-05, + "loss": 0.1818, + "step": 16375 + }, + { + "epoch": 2.062978241017728, + "grad_norm": 0.22755055129528046, + "learning_rate": 8.06459094855871e-05, + "loss": 0.1885, + "step": 16380 + }, + { + "epoch": 2.0636080234279057, + "grad_norm": 0.22558000683784485, + "learning_rate": 8.054846048129406e-05, + "loss": 0.1805, + "step": 16385 + }, + { + "epoch": 2.064237805838083, + "grad_norm": 0.2083364725112915, + "learning_rate": 8.045104877406418e-05, + "loss": 0.1809, + "step": 16390 + }, + { + "epoch": 2.0648675882482603, + "grad_norm": 0.23679542541503906, + "learning_rate": 8.035367441620976e-05, + "loss": 0.181, + "step": 16395 + }, + { + "epoch": 2.0654973706584374, + "grad_norm": 0.2173621654510498, + "learning_rate": 8.025633746002311e-05, + "loss": 0.1857, + "step": 16400 + }, + { + "epoch": 2.066127153068615, + "grad_norm": 0.22376009821891785, + "learning_rate": 8.015903795777634e-05, + "loss": 0.1832, + "step": 16405 + }, + { + "epoch": 2.066756935478792, + "grad_norm": 0.24444858729839325, + "learning_rate": 8.00617759617215e-05, + "loss": 0.1959, + "step": 16410 + }, + { + "epoch": 2.0673867178889695, + "grad_norm": 0.21472635865211487, + "learning_rate": 7.996455152409055e-05, + "loss": 0.17, + "step": 16415 + }, + { + "epoch": 2.0680165002991466, + "grad_norm": 0.22463464736938477, + "learning_rate": 7.986736469709521e-05, + "loss": 0.1847, + "step": 16420 + }, + { + "epoch": 2.0686462827093237, + "grad_norm": 0.2251402884721756, + "learning_rate": 7.977021553292696e-05, + "loss": 0.1822, + "step": 16425 + }, + { + "epoch": 2.0692760651195012, + "grad_norm": 0.21793001890182495, + "learning_rate": 7.967310408375725e-05, + "loss": 0.1862, + "step": 16430 + }, + { + "epoch": 2.0699058475296783, + "grad_norm": 0.2344975620508194, + "learning_rate": 7.957603040173714e-05, + "loss": 0.1791, + "step": 16435 + }, + { + "epoch": 2.070535629939856, + "grad_norm": 0.23466047644615173, + "learning_rate": 7.947899453899725e-05, + "loss": 0.1867, + "step": 16440 + }, + { + "epoch": 2.071165412350033, + "grad_norm": 0.2190965861082077, + "learning_rate": 7.93819965476482e-05, + "loss": 0.1831, + "step": 16445 + }, + { + "epoch": 2.0717951947602105, + "grad_norm": 0.22384218871593475, + "learning_rate": 7.928503647978012e-05, + "loss": 0.1745, + "step": 16450 + }, + { + "epoch": 2.0724249771703875, + "grad_norm": 0.23837679624557495, + "learning_rate": 7.918811438746272e-05, + "loss": 0.1875, + "step": 16455 + }, + { + "epoch": 2.073054759580565, + "grad_norm": 0.2510152757167816, + "learning_rate": 7.909123032274542e-05, + "loss": 0.1849, + "step": 16460 + }, + { + "epoch": 2.073684541990742, + "grad_norm": 0.2514597475528717, + "learning_rate": 7.899438433765711e-05, + "loss": 0.1882, + "step": 16465 + }, + { + "epoch": 2.0743143244009197, + "grad_norm": 0.20441210269927979, + "learning_rate": 7.889757648420648e-05, + "loss": 0.1754, + "step": 16470 + }, + { + "epoch": 2.0749441068110968, + "grad_norm": 0.25783875584602356, + "learning_rate": 7.880080681438134e-05, + "loss": 0.1859, + "step": 16475 + }, + { + "epoch": 2.075573889221274, + "grad_norm": 0.2234499454498291, + "learning_rate": 7.870407538014933e-05, + "loss": 0.1842, + "step": 16480 + }, + { + "epoch": 2.0762036716314514, + "grad_norm": 0.24572981894016266, + "learning_rate": 7.860738223345734e-05, + "loss": 0.1728, + "step": 16485 + }, + { + "epoch": 2.0768334540416284, + "grad_norm": 0.23702028393745422, + "learning_rate": 7.851072742623194e-05, + "loss": 0.1748, + "step": 16490 + }, + { + "epoch": 2.077463236451806, + "grad_norm": 0.23450568318367004, + "learning_rate": 7.84141110103789e-05, + "loss": 0.1826, + "step": 16495 + }, + { + "epoch": 2.078093018861983, + "grad_norm": 0.23022450506687164, + "learning_rate": 7.831753303778342e-05, + "loss": 0.1684, + "step": 16500 + }, + { + "epoch": 2.0787228012721606, + "grad_norm": 0.22727181017398834, + "learning_rate": 7.822099356031014e-05, + "loss": 0.1751, + "step": 16505 + }, + { + "epoch": 2.0793525836823377, + "grad_norm": 0.20935000479221344, + "learning_rate": 7.812449262980289e-05, + "loss": 0.1748, + "step": 16510 + }, + { + "epoch": 2.079982366092515, + "grad_norm": 0.2445985972881317, + "learning_rate": 7.802803029808492e-05, + "loss": 0.1869, + "step": 16515 + }, + { + "epoch": 2.0806121485026923, + "grad_norm": 0.21021974086761475, + "learning_rate": 7.793160661695867e-05, + "loss": 0.1778, + "step": 16520 + }, + { + "epoch": 2.08124193091287, + "grad_norm": 0.20149335265159607, + "learning_rate": 7.783522163820587e-05, + "loss": 0.1685, + "step": 16525 + }, + { + "epoch": 2.081871713323047, + "grad_norm": 0.2342994064092636, + "learning_rate": 7.773887541358749e-05, + "loss": 0.1714, + "step": 16530 + }, + { + "epoch": 2.082501495733224, + "grad_norm": 0.2518448829650879, + "learning_rate": 7.764256799484364e-05, + "loss": 0.1899, + "step": 16535 + }, + { + "epoch": 2.0831312781434015, + "grad_norm": 0.22891752421855927, + "learning_rate": 7.754629943369365e-05, + "loss": 0.1724, + "step": 16540 + }, + { + "epoch": 2.0837610605535786, + "grad_norm": 0.2348988950252533, + "learning_rate": 7.74500697818358e-05, + "loss": 0.1772, + "step": 16545 + }, + { + "epoch": 2.084390842963756, + "grad_norm": 0.21126072108745575, + "learning_rate": 7.735387909094772e-05, + "loss": 0.182, + "step": 16550 + }, + { + "epoch": 2.085020625373933, + "grad_norm": 0.2134072482585907, + "learning_rate": 7.725772741268598e-05, + "loss": 0.1861, + "step": 16555 + }, + { + "epoch": 2.0856504077841107, + "grad_norm": 0.22559498250484467, + "learning_rate": 7.716161479868623e-05, + "loss": 0.1745, + "step": 16560 + }, + { + "epoch": 2.086280190194288, + "grad_norm": 0.2076030671596527, + "learning_rate": 7.706554130056315e-05, + "loss": 0.1811, + "step": 16565 + }, + { + "epoch": 2.0869099726044653, + "grad_norm": 0.24279461801052094, + "learning_rate": 7.696950696991032e-05, + "loss": 0.1829, + "step": 16570 + }, + { + "epoch": 2.0875397550146424, + "grad_norm": 0.21790249645709991, + "learning_rate": 7.687351185830058e-05, + "loss": 0.1835, + "step": 16575 + }, + { + "epoch": 2.08816953742482, + "grad_norm": 0.2210235744714737, + "learning_rate": 7.677755601728527e-05, + "loss": 0.1678, + "step": 16580 + }, + { + "epoch": 2.088799319834997, + "grad_norm": 0.21354030072689056, + "learning_rate": 7.668163949839492e-05, + "loss": 0.1863, + "step": 16585 + }, + { + "epoch": 2.089429102245174, + "grad_norm": 0.264240026473999, + "learning_rate": 7.658576235313896e-05, + "loss": 0.1879, + "step": 16590 + }, + { + "epoch": 2.0900588846553516, + "grad_norm": 0.2348974198102951, + "learning_rate": 7.648992463300561e-05, + "loss": 0.1796, + "step": 16595 + }, + { + "epoch": 2.0906886670655287, + "grad_norm": 0.23128418624401093, + "learning_rate": 7.639412638946186e-05, + "loss": 0.1793, + "step": 16600 + }, + { + "epoch": 2.091318449475706, + "grad_norm": 0.2405007928609848, + "learning_rate": 7.629836767395359e-05, + "loss": 0.1856, + "step": 16605 + }, + { + "epoch": 2.0919482318858833, + "grad_norm": 0.23123788833618164, + "learning_rate": 7.620264853790539e-05, + "loss": 0.1752, + "step": 16610 + }, + { + "epoch": 2.092578014296061, + "grad_norm": 0.22082751989364624, + "learning_rate": 7.610696903272062e-05, + "loss": 0.1731, + "step": 16615 + }, + { + "epoch": 2.093207796706238, + "grad_norm": 0.23356421291828156, + "learning_rate": 7.601132920978139e-05, + "loss": 0.1839, + "step": 16620 + }, + { + "epoch": 2.0938375791164154, + "grad_norm": 0.2418486326932907, + "learning_rate": 7.591572912044846e-05, + "loss": 0.1883, + "step": 16625 + }, + { + "epoch": 2.0944673615265925, + "grad_norm": 0.2357870191335678, + "learning_rate": 7.58201688160612e-05, + "loss": 0.176, + "step": 16630 + }, + { + "epoch": 2.09509714393677, + "grad_norm": 0.27169832587242126, + "learning_rate": 7.572464834793778e-05, + "loss": 0.1824, + "step": 16635 + }, + { + "epoch": 2.095726926346947, + "grad_norm": 0.23245801031589508, + "learning_rate": 7.562916776737488e-05, + "loss": 0.1937, + "step": 16640 + }, + { + "epoch": 2.096356708757124, + "grad_norm": 0.2312193661928177, + "learning_rate": 7.55337271256476e-05, + "loss": 0.1873, + "step": 16645 + }, + { + "epoch": 2.0969864911673017, + "grad_norm": 0.2394751012325287, + "learning_rate": 7.543832647400989e-05, + "loss": 0.1748, + "step": 16650 + }, + { + "epoch": 2.097616273577479, + "grad_norm": 0.2679862976074219, + "learning_rate": 7.534296586369402e-05, + "loss": 0.1868, + "step": 16655 + }, + { + "epoch": 2.0982460559876563, + "grad_norm": 0.2397966831922531, + "learning_rate": 7.524764534591086e-05, + "loss": 0.1768, + "step": 16660 + }, + { + "epoch": 2.0988758383978334, + "grad_norm": 0.22550681233406067, + "learning_rate": 7.515236497184965e-05, + "loss": 0.1764, + "step": 16665 + }, + { + "epoch": 2.099505620808011, + "grad_norm": 0.23124639689922333, + "learning_rate": 7.505712479267809e-05, + "loss": 0.1828, + "step": 16670 + }, + { + "epoch": 2.100135403218188, + "grad_norm": 0.2034096121788025, + "learning_rate": 7.496192485954254e-05, + "loss": 0.179, + "step": 16675 + }, + { + "epoch": 2.1007651856283656, + "grad_norm": 0.2237498164176941, + "learning_rate": 7.486676522356732e-05, + "loss": 0.1867, + "step": 16680 + }, + { + "epoch": 2.1013949680385426, + "grad_norm": 0.22583693265914917, + "learning_rate": 7.477164593585537e-05, + "loss": 0.1882, + "step": 16685 + }, + { + "epoch": 2.10202475044872, + "grad_norm": 0.20145735144615173, + "learning_rate": 7.467656704748792e-05, + "loss": 0.1749, + "step": 16690 + }, + { + "epoch": 2.1026545328588973, + "grad_norm": 0.204311341047287, + "learning_rate": 7.458152860952458e-05, + "loss": 0.1803, + "step": 16695 + }, + { + "epoch": 2.1032843152690743, + "grad_norm": 0.23768644034862518, + "learning_rate": 7.448653067300313e-05, + "loss": 0.1915, + "step": 16700 + }, + { + "epoch": 2.103914097679252, + "grad_norm": 0.21348991990089417, + "learning_rate": 7.439157328893961e-05, + "loss": 0.1778, + "step": 16705 + }, + { + "epoch": 2.104543880089429, + "grad_norm": 0.22427400946617126, + "learning_rate": 7.429665650832831e-05, + "loss": 0.1712, + "step": 16710 + }, + { + "epoch": 2.1051736624996065, + "grad_norm": 0.22512148320674896, + "learning_rate": 7.420178038214172e-05, + "loss": 0.1889, + "step": 16715 + }, + { + "epoch": 2.1058034449097836, + "grad_norm": 0.22715777158737183, + "learning_rate": 7.410694496133048e-05, + "loss": 0.1737, + "step": 16720 + }, + { + "epoch": 2.106433227319961, + "grad_norm": 0.2505483627319336, + "learning_rate": 7.401215029682339e-05, + "loss": 0.1809, + "step": 16725 + }, + { + "epoch": 2.107063009730138, + "grad_norm": 0.2218826860189438, + "learning_rate": 7.391739643952725e-05, + "loss": 0.1766, + "step": 16730 + }, + { + "epoch": 2.1076927921403157, + "grad_norm": 0.2085668295621872, + "learning_rate": 7.38226834403272e-05, + "loss": 0.1739, + "step": 16735 + }, + { + "epoch": 2.1083225745504928, + "grad_norm": 0.21690475940704346, + "learning_rate": 7.372801135008622e-05, + "loss": 0.1738, + "step": 16740 + }, + { + "epoch": 2.1089523569606703, + "grad_norm": 0.263988733291626, + "learning_rate": 7.363338021964545e-05, + "loss": 0.1951, + "step": 16745 + }, + { + "epoch": 2.1095821393708474, + "grad_norm": 0.24228844046592712, + "learning_rate": 7.353879009982377e-05, + "loss": 0.1775, + "step": 16750 + }, + { + "epoch": 2.1102119217810245, + "grad_norm": 0.2030615508556366, + "learning_rate": 7.344424104141843e-05, + "loss": 0.1754, + "step": 16755 + }, + { + "epoch": 2.110841704191202, + "grad_norm": 0.22505883872509003, + "learning_rate": 7.334973309520438e-05, + "loss": 0.1814, + "step": 16760 + }, + { + "epoch": 2.111471486601379, + "grad_norm": 0.28446871042251587, + "learning_rate": 7.32552663119345e-05, + "loss": 0.2009, + "step": 16765 + }, + { + "epoch": 2.1121012690115566, + "grad_norm": 0.2320084124803543, + "learning_rate": 7.316084074233968e-05, + "loss": 0.1866, + "step": 16770 + }, + { + "epoch": 2.1127310514217337, + "grad_norm": 0.23432306945323944, + "learning_rate": 7.306645643712851e-05, + "loss": 0.1838, + "step": 16775 + }, + { + "epoch": 2.113360833831911, + "grad_norm": 0.20252206921577454, + "learning_rate": 7.297211344698769e-05, + "loss": 0.1753, + "step": 16780 + }, + { + "epoch": 2.1139906162420883, + "grad_norm": 0.25251004099845886, + "learning_rate": 7.28778118225814e-05, + "loss": 0.1836, + "step": 16785 + }, + { + "epoch": 2.114620398652266, + "grad_norm": 0.2514311373233795, + "learning_rate": 7.278355161455176e-05, + "loss": 0.1838, + "step": 16790 + }, + { + "epoch": 2.115250181062443, + "grad_norm": 0.21513232588768005, + "learning_rate": 7.268933287351876e-05, + "loss": 0.1745, + "step": 16795 + }, + { + "epoch": 2.1158799634726204, + "grad_norm": 0.2200087606906891, + "learning_rate": 7.259515565007999e-05, + "loss": 0.1839, + "step": 16800 + }, + { + "epoch": 2.1165097458827975, + "grad_norm": 0.22383321821689606, + "learning_rate": 7.250101999481073e-05, + "loss": 0.1865, + "step": 16805 + }, + { + "epoch": 2.1171395282929746, + "grad_norm": 0.2382001131772995, + "learning_rate": 7.2406925958264e-05, + "loss": 0.1862, + "step": 16810 + }, + { + "epoch": 2.117769310703152, + "grad_norm": 0.2178415209054947, + "learning_rate": 7.231287359097045e-05, + "loss": 0.1799, + "step": 16815 + }, + { + "epoch": 2.118399093113329, + "grad_norm": 0.22616611421108246, + "learning_rate": 7.221886294343834e-05, + "loss": 0.1819, + "step": 16820 + }, + { + "epoch": 2.1190288755235067, + "grad_norm": 0.24810658395290375, + "learning_rate": 7.212489406615355e-05, + "loss": 0.181, + "step": 16825 + }, + { + "epoch": 2.119658657933684, + "grad_norm": 0.2408507764339447, + "learning_rate": 7.20309670095795e-05, + "loss": 0.1867, + "step": 16830 + }, + { + "epoch": 2.1202884403438613, + "grad_norm": 0.20721390843391418, + "learning_rate": 7.19370818241571e-05, + "loss": 0.175, + "step": 16835 + }, + { + "epoch": 2.1209182227540384, + "grad_norm": 0.22691728174686432, + "learning_rate": 7.184323856030497e-05, + "loss": 0.1753, + "step": 16840 + }, + { + "epoch": 2.121548005164216, + "grad_norm": 0.22788456082344055, + "learning_rate": 7.174943726841902e-05, + "loss": 0.1829, + "step": 16845 + }, + { + "epoch": 2.122177787574393, + "grad_norm": 0.21744227409362793, + "learning_rate": 7.165567799887268e-05, + "loss": 0.1797, + "step": 16850 + }, + { + "epoch": 2.1228075699845705, + "grad_norm": 0.211074560880661, + "learning_rate": 7.156196080201685e-05, + "loss": 0.1875, + "step": 16855 + }, + { + "epoch": 2.1234373523947476, + "grad_norm": 0.27859583497047424, + "learning_rate": 7.146828572817975e-05, + "loss": 0.1791, + "step": 16860 + }, + { + "epoch": 2.1240671348049247, + "grad_norm": 0.202862948179245, + "learning_rate": 7.13746528276671e-05, + "loss": 0.1752, + "step": 16865 + }, + { + "epoch": 2.1246969172151022, + "grad_norm": 0.2529730498790741, + "learning_rate": 7.128106215076187e-05, + "loss": 0.1734, + "step": 16870 + }, + { + "epoch": 2.1253266996252793, + "grad_norm": 0.22796177864074707, + "learning_rate": 7.118751374772433e-05, + "loss": 0.1807, + "step": 16875 + }, + { + "epoch": 2.125956482035457, + "grad_norm": 0.20112904906272888, + "learning_rate": 7.109400766879223e-05, + "loss": 0.1711, + "step": 16880 + }, + { + "epoch": 2.126586264445634, + "grad_norm": 0.22492708265781403, + "learning_rate": 7.100054396418048e-05, + "loss": 0.1784, + "step": 16885 + }, + { + "epoch": 2.1272160468558114, + "grad_norm": 0.25224363803863525, + "learning_rate": 7.09071226840811e-05, + "loss": 0.185, + "step": 16890 + }, + { + "epoch": 2.1278458292659885, + "grad_norm": 0.24734210968017578, + "learning_rate": 7.081374387866346e-05, + "loss": 0.1739, + "step": 16895 + }, + { + "epoch": 2.128475611676166, + "grad_norm": 0.21726474165916443, + "learning_rate": 7.07204075980742e-05, + "loss": 0.1695, + "step": 16900 + }, + { + "epoch": 2.129105394086343, + "grad_norm": 0.2073916345834732, + "learning_rate": 7.062711389243703e-05, + "loss": 0.1782, + "step": 16905 + }, + { + "epoch": 2.1297351764965207, + "grad_norm": 0.2361113578081131, + "learning_rate": 7.053386281185274e-05, + "loss": 0.1787, + "step": 16910 + }, + { + "epoch": 2.1303649589066977, + "grad_norm": 0.22586499154567719, + "learning_rate": 7.044065440639933e-05, + "loss": 0.1738, + "step": 16915 + }, + { + "epoch": 2.130994741316875, + "grad_norm": 0.23469188809394836, + "learning_rate": 7.034748872613184e-05, + "loss": 0.1805, + "step": 16920 + }, + { + "epoch": 2.1316245237270524, + "grad_norm": 0.1897682100534439, + "learning_rate": 7.025436582108234e-05, + "loss": 0.171, + "step": 16925 + }, + { + "epoch": 2.1322543061372294, + "grad_norm": 0.22100795805454254, + "learning_rate": 7.016128574126e-05, + "loss": 0.1736, + "step": 16930 + }, + { + "epoch": 2.132884088547407, + "grad_norm": 0.2332223504781723, + "learning_rate": 7.006824853665085e-05, + "loss": 0.1729, + "step": 16935 + }, + { + "epoch": 2.133513870957584, + "grad_norm": 0.23929065465927124, + "learning_rate": 6.997525425721814e-05, + "loss": 0.1736, + "step": 16940 + }, + { + "epoch": 2.1341436533677616, + "grad_norm": 0.26240813732147217, + "learning_rate": 6.988230295290185e-05, + "loss": 0.1798, + "step": 16945 + }, + { + "epoch": 2.1347734357779387, + "grad_norm": 0.22387517988681793, + "learning_rate": 6.978939467361895e-05, + "loss": 0.1734, + "step": 16950 + }, + { + "epoch": 2.135403218188116, + "grad_norm": 0.246952623128891, + "learning_rate": 6.969652946926332e-05, + "loss": 0.1834, + "step": 16955 + }, + { + "epoch": 2.1360330005982933, + "grad_norm": 0.25226834416389465, + "learning_rate": 6.960370738970568e-05, + "loss": 0.1798, + "step": 16960 + }, + { + "epoch": 2.136662783008471, + "grad_norm": 0.22118602693080902, + "learning_rate": 6.951092848479364e-05, + "loss": 0.1863, + "step": 16965 + }, + { + "epoch": 2.137292565418648, + "grad_norm": 0.2567583918571472, + "learning_rate": 6.941819280435155e-05, + "loss": 0.1828, + "step": 16970 + }, + { + "epoch": 2.137922347828825, + "grad_norm": 0.28791603446006775, + "learning_rate": 6.93255003981806e-05, + "loss": 0.1817, + "step": 16975 + }, + { + "epoch": 2.1385521302390025, + "grad_norm": 0.2655430734157562, + "learning_rate": 6.923285131605871e-05, + "loss": 0.1789, + "step": 16980 + }, + { + "epoch": 2.1391819126491796, + "grad_norm": 0.24513307213783264, + "learning_rate": 6.914024560774061e-05, + "loss": 0.1885, + "step": 16985 + }, + { + "epoch": 2.139811695059357, + "grad_norm": 0.211643248796463, + "learning_rate": 6.904768332295772e-05, + "loss": 0.188, + "step": 16990 + }, + { + "epoch": 2.140441477469534, + "grad_norm": 0.2373894900083542, + "learning_rate": 6.895516451141791e-05, + "loss": 0.1819, + "step": 16995 + }, + { + "epoch": 2.1410712598797117, + "grad_norm": 0.22991600632667542, + "learning_rate": 6.88626892228061e-05, + "loss": 0.189, + "step": 17000 + }, + { + "epoch": 2.1410712598797117, + "eval_loss": 0.3501429557800293, + "eval_runtime": 6.1606, + "eval_samples_per_second": 162.322, + "eval_steps_per_second": 10.226, + "step": 17000 + }, + { + "epoch": 2.141701042289889, + "grad_norm": 0.23578788340091705, + "learning_rate": 6.877025750678352e-05, + "loss": 0.1804, + "step": 17005 + }, + { + "epoch": 2.1423308247000663, + "grad_norm": 0.20814631879329681, + "learning_rate": 6.867786941298816e-05, + "loss": 0.1776, + "step": 17010 + }, + { + "epoch": 2.1429606071102434, + "grad_norm": 0.24113385379314423, + "learning_rate": 6.858552499103451e-05, + "loss": 0.171, + "step": 17015 + }, + { + "epoch": 2.143590389520421, + "grad_norm": 0.2317270189523697, + "learning_rate": 6.84932242905136e-05, + "loss": 0.1881, + "step": 17020 + }, + { + "epoch": 2.144220171930598, + "grad_norm": 0.26681753993034363, + "learning_rate": 6.840096736099314e-05, + "loss": 0.1792, + "step": 17025 + }, + { + "epoch": 2.144849954340775, + "grad_norm": 0.2119479924440384, + "learning_rate": 6.83087542520171e-05, + "loss": 0.178, + "step": 17030 + }, + { + "epoch": 2.1454797367509526, + "grad_norm": 0.20759105682373047, + "learning_rate": 6.821658501310604e-05, + "loss": 0.1754, + "step": 17035 + }, + { + "epoch": 2.1461095191611297, + "grad_norm": 0.23515643179416656, + "learning_rate": 6.812445969375691e-05, + "loss": 0.1854, + "step": 17040 + }, + { + "epoch": 2.146739301571307, + "grad_norm": 0.20694191753864288, + "learning_rate": 6.803237834344322e-05, + "loss": 0.1801, + "step": 17045 + }, + { + "epoch": 2.1473690839814843, + "grad_norm": 0.21541932225227356, + "learning_rate": 6.794034101161469e-05, + "loss": 0.1752, + "step": 17050 + }, + { + "epoch": 2.147998866391662, + "grad_norm": 0.20586980879306793, + "learning_rate": 6.784834774769748e-05, + "loss": 0.1803, + "step": 17055 + }, + { + "epoch": 2.148628648801839, + "grad_norm": 0.23750190436840057, + "learning_rate": 6.775639860109406e-05, + "loss": 0.1842, + "step": 17060 + }, + { + "epoch": 2.1492584312120164, + "grad_norm": 0.2041424959897995, + "learning_rate": 6.766449362118324e-05, + "loss": 0.1729, + "step": 17065 + }, + { + "epoch": 2.1498882136221935, + "grad_norm": 0.24630430340766907, + "learning_rate": 6.757263285732009e-05, + "loss": 0.1821, + "step": 17070 + }, + { + "epoch": 2.150517996032371, + "grad_norm": 0.23113587498664856, + "learning_rate": 6.748081635883594e-05, + "loss": 0.1821, + "step": 17075 + }, + { + "epoch": 2.151147778442548, + "grad_norm": 0.203240305185318, + "learning_rate": 6.738904417503829e-05, + "loss": 0.1767, + "step": 17080 + }, + { + "epoch": 2.151777560852725, + "grad_norm": 0.2500320374965668, + "learning_rate": 6.7297316355211e-05, + "loss": 0.1852, + "step": 17085 + }, + { + "epoch": 2.1524073432629027, + "grad_norm": 0.2349621206521988, + "learning_rate": 6.720563294861403e-05, + "loss": 0.1764, + "step": 17090 + }, + { + "epoch": 2.15303712567308, + "grad_norm": 0.2351408451795578, + "learning_rate": 6.71139940044833e-05, + "loss": 0.1835, + "step": 17095 + }, + { + "epoch": 2.1536669080832573, + "grad_norm": 0.2078278511762619, + "learning_rate": 6.702239957203108e-05, + "loss": 0.1783, + "step": 17100 + }, + { + "epoch": 2.1542966904934344, + "grad_norm": 0.23805204033851624, + "learning_rate": 6.693084970044574e-05, + "loss": 0.1858, + "step": 17105 + }, + { + "epoch": 2.154926472903612, + "grad_norm": 0.22789132595062256, + "learning_rate": 6.683934443889161e-05, + "loss": 0.1839, + "step": 17110 + }, + { + "epoch": 2.155556255313789, + "grad_norm": 0.27035263180732727, + "learning_rate": 6.674788383650911e-05, + "loss": 0.1878, + "step": 17115 + }, + { + "epoch": 2.1561860377239666, + "grad_norm": 0.21787506341934204, + "learning_rate": 6.665646794241468e-05, + "loss": 0.1854, + "step": 17120 + }, + { + "epoch": 2.1568158201341436, + "grad_norm": 0.2302270233631134, + "learning_rate": 6.656509680570073e-05, + "loss": 0.1822, + "step": 17125 + }, + { + "epoch": 2.157445602544321, + "grad_norm": 0.21228045225143433, + "learning_rate": 6.647377047543563e-05, + "loss": 0.1855, + "step": 17130 + }, + { + "epoch": 2.1580753849544982, + "grad_norm": 0.22131386399269104, + "learning_rate": 6.638248900066375e-05, + "loss": 0.1763, + "step": 17135 + }, + { + "epoch": 2.1587051673646753, + "grad_norm": 0.2691584527492523, + "learning_rate": 6.629125243040524e-05, + "loss": 0.1815, + "step": 17140 + }, + { + "epoch": 2.159334949774853, + "grad_norm": 0.22926035523414612, + "learning_rate": 6.620006081365634e-05, + "loss": 0.1833, + "step": 17145 + }, + { + "epoch": 2.15996473218503, + "grad_norm": 0.20654956996440887, + "learning_rate": 6.610891419938899e-05, + "loss": 0.1755, + "step": 17150 + }, + { + "epoch": 2.1605945145952075, + "grad_norm": 0.22390377521514893, + "learning_rate": 6.601781263655096e-05, + "loss": 0.1839, + "step": 17155 + }, + { + "epoch": 2.1612242970053845, + "grad_norm": 0.23877164721488953, + "learning_rate": 6.592675617406593e-05, + "loss": 0.1739, + "step": 17160 + }, + { + "epoch": 2.161854079415562, + "grad_norm": 0.24347762763500214, + "learning_rate": 6.583574486083325e-05, + "loss": 0.1863, + "step": 17165 + }, + { + "epoch": 2.162483861825739, + "grad_norm": 0.23407521843910217, + "learning_rate": 6.574477874572811e-05, + "loss": 0.1741, + "step": 17170 + }, + { + "epoch": 2.1631136442359167, + "grad_norm": 0.23338505625724792, + "learning_rate": 6.565385787760137e-05, + "loss": 0.1754, + "step": 17175 + }, + { + "epoch": 2.1637434266460938, + "grad_norm": 0.2206541895866394, + "learning_rate": 6.556298230527962e-05, + "loss": 0.1706, + "step": 17180 + }, + { + "epoch": 2.1643732090562713, + "grad_norm": 0.20819810032844543, + "learning_rate": 6.547215207756504e-05, + "loss": 0.1735, + "step": 17185 + }, + { + "epoch": 2.1650029914664484, + "grad_norm": 0.22891941666603088, + "learning_rate": 6.53813672432357e-05, + "loss": 0.187, + "step": 17190 + }, + { + "epoch": 2.1656327738766254, + "grad_norm": 0.2094859778881073, + "learning_rate": 6.52906278510451e-05, + "loss": 0.1795, + "step": 17195 + }, + { + "epoch": 2.166262556286803, + "grad_norm": 0.20969723165035248, + "learning_rate": 6.519993394972219e-05, + "loss": 0.1679, + "step": 17200 + }, + { + "epoch": 2.16689233869698, + "grad_norm": 0.25252285599708557, + "learning_rate": 6.510928558797185e-05, + "loss": 0.183, + "step": 17205 + }, + { + "epoch": 2.1675221211071576, + "grad_norm": 0.22556447982788086, + "learning_rate": 6.501868281447424e-05, + "loss": 0.1694, + "step": 17210 + }, + { + "epoch": 2.1681519035173347, + "grad_norm": 0.2429586797952652, + "learning_rate": 6.492812567788516e-05, + "loss": 0.18, + "step": 17215 + }, + { + "epoch": 2.168781685927512, + "grad_norm": 0.2400483787059784, + "learning_rate": 6.483761422683582e-05, + "loss": 0.1818, + "step": 17220 + }, + { + "epoch": 2.1694114683376893, + "grad_norm": 0.228154718875885, + "learning_rate": 6.47471485099329e-05, + "loss": 0.1744, + "step": 17225 + }, + { + "epoch": 2.170041250747867, + "grad_norm": 0.21748559176921844, + "learning_rate": 6.465672857575875e-05, + "loss": 0.1765, + "step": 17230 + }, + { + "epoch": 2.170671033158044, + "grad_norm": 0.2296319603919983, + "learning_rate": 6.456635447287073e-05, + "loss": 0.1881, + "step": 17235 + }, + { + "epoch": 2.1713008155682214, + "grad_norm": 0.2402602881193161, + "learning_rate": 6.447602624980186e-05, + "loss": 0.1769, + "step": 17240 + }, + { + "epoch": 2.1719305979783985, + "grad_norm": 0.2783866226673126, + "learning_rate": 6.438574395506043e-05, + "loss": 0.1836, + "step": 17245 + }, + { + "epoch": 2.1725603803885756, + "grad_norm": 0.20301677286624908, + "learning_rate": 6.429550763713017e-05, + "loss": 0.1655, + "step": 17250 + }, + { + "epoch": 2.173190162798753, + "grad_norm": 0.21163971722126007, + "learning_rate": 6.420531734447e-05, + "loss": 0.1764, + "step": 17255 + }, + { + "epoch": 2.17381994520893, + "grad_norm": 0.24942253530025482, + "learning_rate": 6.41151731255142e-05, + "loss": 0.1833, + "step": 17260 + }, + { + "epoch": 2.1744497276191077, + "grad_norm": 0.22958967089653015, + "learning_rate": 6.402507502867222e-05, + "loss": 0.1703, + "step": 17265 + }, + { + "epoch": 2.175079510029285, + "grad_norm": 0.21424312889575958, + "learning_rate": 6.393502310232886e-05, + "loss": 0.1757, + "step": 17270 + }, + { + "epoch": 2.1757092924394623, + "grad_norm": 0.20825864374637604, + "learning_rate": 6.384501739484401e-05, + "loss": 0.1715, + "step": 17275 + }, + { + "epoch": 2.1763390748496394, + "grad_norm": 0.21387939155101776, + "learning_rate": 6.375505795455281e-05, + "loss": 0.1697, + "step": 17280 + }, + { + "epoch": 2.176968857259817, + "grad_norm": 0.2073564976453781, + "learning_rate": 6.366514482976546e-05, + "loss": 0.1846, + "step": 17285 + }, + { + "epoch": 2.177598639669994, + "grad_norm": 0.21405762434005737, + "learning_rate": 6.35752780687675e-05, + "loss": 0.1777, + "step": 17290 + }, + { + "epoch": 2.1782284220801715, + "grad_norm": 0.22343981266021729, + "learning_rate": 6.348545771981938e-05, + "loss": 0.1801, + "step": 17295 + }, + { + "epoch": 2.1788582044903486, + "grad_norm": 0.22697073221206665, + "learning_rate": 6.339568383115668e-05, + "loss": 0.1829, + "step": 17300 + }, + { + "epoch": 2.1794879869005257, + "grad_norm": 0.2561056613922119, + "learning_rate": 6.330595645098996e-05, + "loss": 0.185, + "step": 17305 + }, + { + "epoch": 2.1801177693107032, + "grad_norm": 0.2563771903514862, + "learning_rate": 6.321627562750495e-05, + "loss": 0.1752, + "step": 17310 + }, + { + "epoch": 2.1807475517208803, + "grad_norm": 0.21171104907989502, + "learning_rate": 6.312664140886228e-05, + "loss": 0.166, + "step": 17315 + }, + { + "epoch": 2.181377334131058, + "grad_norm": 0.23899543285369873, + "learning_rate": 6.303705384319757e-05, + "loss": 0.1828, + "step": 17320 + }, + { + "epoch": 2.182007116541235, + "grad_norm": 0.26108884811401367, + "learning_rate": 6.29475129786214e-05, + "loss": 0.1829, + "step": 17325 + }, + { + "epoch": 2.1826368989514124, + "grad_norm": 0.2397276908159256, + "learning_rate": 6.285801886321919e-05, + "loss": 0.1733, + "step": 17330 + }, + { + "epoch": 2.1832666813615895, + "grad_norm": 0.22638286650180817, + "learning_rate": 6.27685715450515e-05, + "loss": 0.1719, + "step": 17335 + }, + { + "epoch": 2.183896463771767, + "grad_norm": 0.2424623966217041, + "learning_rate": 6.26791710721534e-05, + "loss": 0.1749, + "step": 17340 + }, + { + "epoch": 2.184526246181944, + "grad_norm": 0.23895704746246338, + "learning_rate": 6.2589817492535e-05, + "loss": 0.178, + "step": 17345 + }, + { + "epoch": 2.1851560285921217, + "grad_norm": 0.2223139852285385, + "learning_rate": 6.250051085418133e-05, + "loss": 0.1872, + "step": 17350 + }, + { + "epoch": 2.1857858110022987, + "grad_norm": 0.22255347669124603, + "learning_rate": 6.241125120505204e-05, + "loss": 0.1791, + "step": 17355 + }, + { + "epoch": 2.186415593412476, + "grad_norm": 0.23792186379432678, + "learning_rate": 6.232203859308157e-05, + "loss": 0.1738, + "step": 17360 + }, + { + "epoch": 2.1870453758226533, + "grad_norm": 0.24884961545467377, + "learning_rate": 6.223287306617915e-05, + "loss": 0.1778, + "step": 17365 + }, + { + "epoch": 2.1876751582328304, + "grad_norm": 0.2130117118358612, + "learning_rate": 6.214375467222873e-05, + "loss": 0.1666, + "step": 17370 + }, + { + "epoch": 2.188304940643008, + "grad_norm": 0.20538979768753052, + "learning_rate": 6.205468345908888e-05, + "loss": 0.1716, + "step": 17375 + }, + { + "epoch": 2.188934723053185, + "grad_norm": 0.2519354224205017, + "learning_rate": 6.196565947459292e-05, + "loss": 0.1885, + "step": 17380 + }, + { + "epoch": 2.1895645054633626, + "grad_norm": 0.2644721567630768, + "learning_rate": 6.187668276654872e-05, + "loss": 0.1923, + "step": 17385 + }, + { + "epoch": 2.1901942878735396, + "grad_norm": 0.22676245868206024, + "learning_rate": 6.178775338273876e-05, + "loss": 0.1745, + "step": 17390 + }, + { + "epoch": 2.190824070283717, + "grad_norm": 0.21329110860824585, + "learning_rate": 6.169887137092029e-05, + "loss": 0.1782, + "step": 17395 + }, + { + "epoch": 2.1914538526938943, + "grad_norm": 0.2096760869026184, + "learning_rate": 6.161003677882489e-05, + "loss": 0.1705, + "step": 17400 + }, + { + "epoch": 2.1920836351040713, + "grad_norm": 0.20192061364650726, + "learning_rate": 6.15212496541588e-05, + "loss": 0.1662, + "step": 17405 + }, + { + "epoch": 2.192713417514249, + "grad_norm": 0.2351575493812561, + "learning_rate": 6.14325100446027e-05, + "loss": 0.1716, + "step": 17410 + }, + { + "epoch": 2.193343199924426, + "grad_norm": 0.23202987015247345, + "learning_rate": 6.13438179978118e-05, + "loss": 0.1848, + "step": 17415 + }, + { + "epoch": 2.1939729823346035, + "grad_norm": 0.22229251265525818, + "learning_rate": 6.125517356141576e-05, + "loss": 0.1757, + "step": 17420 + }, + { + "epoch": 2.1946027647447806, + "grad_norm": 0.20741891860961914, + "learning_rate": 6.116657678301868e-05, + "loss": 0.1804, + "step": 17425 + }, + { + "epoch": 2.195232547154958, + "grad_norm": 0.2023356705904007, + "learning_rate": 6.107802771019895e-05, + "loss": 0.168, + "step": 17430 + }, + { + "epoch": 2.195862329565135, + "grad_norm": 0.30032244324684143, + "learning_rate": 6.098952639050961e-05, + "loss": 0.176, + "step": 17435 + }, + { + "epoch": 2.1964921119753127, + "grad_norm": 0.2093886286020279, + "learning_rate": 6.090107287147786e-05, + "loss": 0.171, + "step": 17440 + }, + { + "epoch": 2.1971218943854898, + "grad_norm": 0.20918086171150208, + "learning_rate": 6.081266720060517e-05, + "loss": 0.1705, + "step": 17445 + }, + { + "epoch": 2.1977516767956673, + "grad_norm": 0.2089412659406662, + "learning_rate": 6.072430942536737e-05, + "loss": 0.1797, + "step": 17450 + }, + { + "epoch": 2.1983814592058444, + "grad_norm": 0.2460128515958786, + "learning_rate": 6.0635999593214765e-05, + "loss": 0.1752, + "step": 17455 + }, + { + "epoch": 2.1990112416160215, + "grad_norm": 0.25952646136283875, + "learning_rate": 6.0547737751571654e-05, + "loss": 0.1784, + "step": 17460 + }, + { + "epoch": 2.199641024026199, + "grad_norm": 0.2011132687330246, + "learning_rate": 6.0459523947836674e-05, + "loss": 0.1714, + "step": 17465 + }, + { + "epoch": 2.200270806436376, + "grad_norm": 0.19077162444591522, + "learning_rate": 6.03713582293826e-05, + "loss": 0.174, + "step": 17470 + }, + { + "epoch": 2.2009005888465536, + "grad_norm": 0.22354647517204285, + "learning_rate": 6.02832406435566e-05, + "loss": 0.1754, + "step": 17475 + }, + { + "epoch": 2.2015303712567307, + "grad_norm": 0.22434799373149872, + "learning_rate": 6.019517123767968e-05, + "loss": 0.1747, + "step": 17480 + }, + { + "epoch": 2.202160153666908, + "grad_norm": 0.22911998629570007, + "learning_rate": 6.010715005904716e-05, + "loss": 0.1812, + "step": 17485 + }, + { + "epoch": 2.2027899360770853, + "grad_norm": 0.23919759690761566, + "learning_rate": 6.0019177154928364e-05, + "loss": 0.1771, + "step": 17490 + }, + { + "epoch": 2.203419718487263, + "grad_norm": 0.21539629995822906, + "learning_rate": 5.993125257256687e-05, + "loss": 0.1799, + "step": 17495 + }, + { + "epoch": 2.20404950089744, + "grad_norm": 0.22069337964057922, + "learning_rate": 5.984337635918014e-05, + "loss": 0.177, + "step": 17500 + }, + { + "epoch": 2.2046792833076174, + "grad_norm": 0.20763671398162842, + "learning_rate": 5.97555485619597e-05, + "loss": 0.1664, + "step": 17505 + }, + { + "epoch": 2.2053090657177945, + "grad_norm": 0.1950199007987976, + "learning_rate": 5.966776922807109e-05, + "loss": 0.1648, + "step": 17510 + }, + { + "epoch": 2.2059388481279716, + "grad_norm": 0.25142478942871094, + "learning_rate": 5.95800384046538e-05, + "loss": 0.1754, + "step": 17515 + }, + { + "epoch": 2.206568630538149, + "grad_norm": 0.2232702225446701, + "learning_rate": 5.94923561388213e-05, + "loss": 0.1716, + "step": 17520 + }, + { + "epoch": 2.207198412948326, + "grad_norm": 0.27657321095466614, + "learning_rate": 5.940472247766097e-05, + "loss": 0.1878, + "step": 17525 + }, + { + "epoch": 2.2078281953585037, + "grad_norm": 0.21436183154582977, + "learning_rate": 5.9317137468234083e-05, + "loss": 0.1727, + "step": 17530 + }, + { + "epoch": 2.208457977768681, + "grad_norm": 0.19741742312908173, + "learning_rate": 5.9229601157575744e-05, + "loss": 0.1694, + "step": 17535 + }, + { + "epoch": 2.2090877601788583, + "grad_norm": 0.2042321413755417, + "learning_rate": 5.914211359269509e-05, + "loss": 0.17, + "step": 17540 + }, + { + "epoch": 2.2097175425890354, + "grad_norm": 0.21126088500022888, + "learning_rate": 5.9054674820574814e-05, + "loss": 0.1703, + "step": 17545 + }, + { + "epoch": 2.210347324999213, + "grad_norm": 0.20463821291923523, + "learning_rate": 5.896728488817151e-05, + "loss": 0.172, + "step": 17550 + }, + { + "epoch": 2.21097710740939, + "grad_norm": 0.204604834318161, + "learning_rate": 5.887994384241569e-05, + "loss": 0.1723, + "step": 17555 + }, + { + "epoch": 2.2116068898195675, + "grad_norm": 0.18806815147399902, + "learning_rate": 5.879265173021141e-05, + "loss": 0.161, + "step": 17560 + }, + { + "epoch": 2.2122366722297446, + "grad_norm": 0.22745926678180695, + "learning_rate": 5.870540859843656e-05, + "loss": 0.1653, + "step": 17565 + }, + { + "epoch": 2.2128664546399217, + "grad_norm": 0.1888933777809143, + "learning_rate": 5.8618214493942675e-05, + "loss": 0.1685, + "step": 17570 + }, + { + "epoch": 2.2134962370500992, + "grad_norm": 0.19480280578136444, + "learning_rate": 5.853106946355501e-05, + "loss": 0.1676, + "step": 17575 + }, + { + "epoch": 2.2141260194602763, + "grad_norm": 0.2703428864479065, + "learning_rate": 5.8443973554072383e-05, + "loss": 0.1788, + "step": 17580 + }, + { + "epoch": 2.214755801870454, + "grad_norm": 0.21035927534103394, + "learning_rate": 5.8356926812267335e-05, + "loss": 0.1806, + "step": 17585 + }, + { + "epoch": 2.215385584280631, + "grad_norm": 0.21794281899929047, + "learning_rate": 5.826992928488594e-05, + "loss": 0.1641, + "step": 17590 + }, + { + "epoch": 2.2160153666908085, + "grad_norm": 0.2512260675430298, + "learning_rate": 5.818298101864779e-05, + "loss": 0.1697, + "step": 17595 + }, + { + "epoch": 2.2166451491009855, + "grad_norm": 0.2089598923921585, + "learning_rate": 5.8096082060246226e-05, + "loss": 0.1656, + "step": 17600 + }, + { + "epoch": 2.217274931511163, + "grad_norm": 0.2160467952489853, + "learning_rate": 5.80092324563479e-05, + "loss": 0.185, + "step": 17605 + }, + { + "epoch": 2.21790471392134, + "grad_norm": 0.20858334004878998, + "learning_rate": 5.7922432253593025e-05, + "loss": 0.1721, + "step": 17610 + }, + { + "epoch": 2.2185344963315172, + "grad_norm": 0.2090991735458374, + "learning_rate": 5.7835681498595327e-05, + "loss": 0.1706, + "step": 17615 + }, + { + "epoch": 2.2191642787416948, + "grad_norm": 0.21040284633636475, + "learning_rate": 5.77489802379419e-05, + "loss": 0.1789, + "step": 17620 + }, + { + "epoch": 2.219794061151872, + "grad_norm": 0.22497640550136566, + "learning_rate": 5.766232851819332e-05, + "loss": 0.1779, + "step": 17625 + }, + { + "epoch": 2.2204238435620494, + "grad_norm": 0.2845938801765442, + "learning_rate": 5.757572638588356e-05, + "loss": 0.1771, + "step": 17630 + }, + { + "epoch": 2.2210536259722264, + "grad_norm": 0.21166571974754333, + "learning_rate": 5.748917388751985e-05, + "loss": 0.1741, + "step": 17635 + }, + { + "epoch": 2.221683408382404, + "grad_norm": 0.26706454157829285, + "learning_rate": 5.7402671069583004e-05, + "loss": 0.1715, + "step": 17640 + }, + { + "epoch": 2.222313190792581, + "grad_norm": 0.2745297849178314, + "learning_rate": 5.731621797852698e-05, + "loss": 0.1843, + "step": 17645 + }, + { + "epoch": 2.2229429732027586, + "grad_norm": 0.2507629990577698, + "learning_rate": 5.7229814660778985e-05, + "loss": 0.186, + "step": 17650 + }, + { + "epoch": 2.2235727556129357, + "grad_norm": 0.21768365800380707, + "learning_rate": 5.7143461162739545e-05, + "loss": 0.1731, + "step": 17655 + }, + { + "epoch": 2.224202538023113, + "grad_norm": 0.22099876403808594, + "learning_rate": 5.705715753078259e-05, + "loss": 0.1802, + "step": 17660 + }, + { + "epoch": 2.2248323204332903, + "grad_norm": 0.20643608272075653, + "learning_rate": 5.697090381125507e-05, + "loss": 0.1769, + "step": 17665 + }, + { + "epoch": 2.2254621028434673, + "grad_norm": 0.2723044455051422, + "learning_rate": 5.688470005047722e-05, + "loss": 0.1882, + "step": 17670 + }, + { + "epoch": 2.226091885253645, + "grad_norm": 0.23548351228237152, + "learning_rate": 5.679854629474238e-05, + "loss": 0.1702, + "step": 17675 + }, + { + "epoch": 2.226721667663822, + "grad_norm": 0.24578404426574707, + "learning_rate": 5.671244259031722e-05, + "loss": 0.1736, + "step": 17680 + }, + { + "epoch": 2.2273514500739995, + "grad_norm": 0.21030524373054504, + "learning_rate": 5.662638898344125e-05, + "loss": 0.1711, + "step": 17685 + }, + { + "epoch": 2.2279812324841766, + "grad_norm": 0.24249999225139618, + "learning_rate": 5.6540385520327275e-05, + "loss": 0.1742, + "step": 17690 + }, + { + "epoch": 2.228611014894354, + "grad_norm": 0.23971515893936157, + "learning_rate": 5.645443224716106e-05, + "loss": 0.1655, + "step": 17695 + }, + { + "epoch": 2.229240797304531, + "grad_norm": 0.2133120596408844, + "learning_rate": 5.636852921010161e-05, + "loss": 0.1786, + "step": 17700 + }, + { + "epoch": 2.2298705797147087, + "grad_norm": 0.23475615680217743, + "learning_rate": 5.628267645528073e-05, + "loss": 0.1753, + "step": 17705 + }, + { + "epoch": 2.230500362124886, + "grad_norm": 0.22111907601356506, + "learning_rate": 5.619687402880332e-05, + "loss": 0.1617, + "step": 17710 + }, + { + "epoch": 2.2311301445350633, + "grad_norm": 0.2323450744152069, + "learning_rate": 5.611112197674725e-05, + "loss": 0.167, + "step": 17715 + }, + { + "epoch": 2.2317599269452404, + "grad_norm": 0.18698996305465698, + "learning_rate": 5.602542034516333e-05, + "loss": 0.1632, + "step": 17720 + }, + { + "epoch": 2.2323897093554175, + "grad_norm": 0.2252064198255539, + "learning_rate": 5.5939769180075286e-05, + "loss": 0.1709, + "step": 17725 + }, + { + "epoch": 2.233019491765595, + "grad_norm": 0.2561705410480499, + "learning_rate": 5.5854168527479756e-05, + "loss": 0.1826, + "step": 17730 + }, + { + "epoch": 2.233649274175772, + "grad_norm": 0.2448531985282898, + "learning_rate": 5.576861843334625e-05, + "loss": 0.1819, + "step": 17735 + }, + { + "epoch": 2.2342790565859496, + "grad_norm": 0.238671213388443, + "learning_rate": 5.568311894361707e-05, + "loss": 0.1839, + "step": 17740 + }, + { + "epoch": 2.2349088389961267, + "grad_norm": 0.22651298344135284, + "learning_rate": 5.5597670104207485e-05, + "loss": 0.172, + "step": 17745 + }, + { + "epoch": 2.235538621406304, + "grad_norm": 0.23881249129772186, + "learning_rate": 5.551227196100549e-05, + "loss": 0.1698, + "step": 17750 + }, + { + "epoch": 2.2361684038164813, + "grad_norm": 0.23065873980522156, + "learning_rate": 5.542692455987167e-05, + "loss": 0.1727, + "step": 17755 + }, + { + "epoch": 2.236798186226659, + "grad_norm": 0.19607169926166534, + "learning_rate": 5.534162794663969e-05, + "loss": 0.1719, + "step": 17760 + }, + { + "epoch": 2.237427968636836, + "grad_norm": 0.2033766806125641, + "learning_rate": 5.525638216711573e-05, + "loss": 0.171, + "step": 17765 + }, + { + "epoch": 2.2380577510470134, + "grad_norm": 0.20412589609622955, + "learning_rate": 5.5171187267078733e-05, + "loss": 0.1633, + "step": 17770 + }, + { + "epoch": 2.2386875334571905, + "grad_norm": 0.21895913779735565, + "learning_rate": 5.508604329228028e-05, + "loss": 0.1801, + "step": 17775 + }, + { + "epoch": 2.2393173158673676, + "grad_norm": 0.19198501110076904, + "learning_rate": 5.50009502884446e-05, + "loss": 0.1764, + "step": 17780 + }, + { + "epoch": 2.239947098277545, + "grad_norm": 0.21897682547569275, + "learning_rate": 5.4915908301268724e-05, + "loss": 0.1719, + "step": 17785 + }, + { + "epoch": 2.240576880687722, + "grad_norm": 0.22070536017417908, + "learning_rate": 5.483091737642198e-05, + "loss": 0.1678, + "step": 17790 + }, + { + "epoch": 2.2412066630978997, + "grad_norm": 0.2158748209476471, + "learning_rate": 5.474597755954651e-05, + "loss": 0.1703, + "step": 17795 + }, + { + "epoch": 2.241836445508077, + "grad_norm": 0.21174906194210052, + "learning_rate": 5.466108889625687e-05, + "loss": 0.1698, + "step": 17800 + }, + { + "epoch": 2.2424662279182543, + "grad_norm": 0.23331063985824585, + "learning_rate": 5.457625143214029e-05, + "loss": 0.1855, + "step": 17805 + }, + { + "epoch": 2.2430960103284314, + "grad_norm": 0.2186896651983261, + "learning_rate": 5.449146521275643e-05, + "loss": 0.1629, + "step": 17810 + }, + { + "epoch": 2.243725792738609, + "grad_norm": 0.22406966984272003, + "learning_rate": 5.440673028363738e-05, + "loss": 0.1731, + "step": 17815 + }, + { + "epoch": 2.244355575148786, + "grad_norm": 0.21894322335720062, + "learning_rate": 5.432204669028777e-05, + "loss": 0.1671, + "step": 17820 + }, + { + "epoch": 2.2449853575589636, + "grad_norm": 0.19151312112808228, + "learning_rate": 5.4237414478184585e-05, + "loss": 0.1634, + "step": 17825 + }, + { + "epoch": 2.2456151399691406, + "grad_norm": 0.20597226917743683, + "learning_rate": 5.415283369277729e-05, + "loss": 0.1594, + "step": 17830 + }, + { + "epoch": 2.2462449223793177, + "grad_norm": 0.23415236175060272, + "learning_rate": 5.406830437948767e-05, + "loss": 0.1667, + "step": 17835 + }, + { + "epoch": 2.2468747047894952, + "grad_norm": 0.21160747110843658, + "learning_rate": 5.398382658370986e-05, + "loss": 0.1694, + "step": 17840 + }, + { + "epoch": 2.2475044871996723, + "grad_norm": 0.2644958198070526, + "learning_rate": 5.3899400350810466e-05, + "loss": 0.1767, + "step": 17845 + }, + { + "epoch": 2.24813426960985, + "grad_norm": 0.23654960095882416, + "learning_rate": 5.381502572612826e-05, + "loss": 0.1684, + "step": 17850 + }, + { + "epoch": 2.248764052020027, + "grad_norm": 0.22581151127815247, + "learning_rate": 5.373070275497439e-05, + "loss": 0.1805, + "step": 17855 + }, + { + "epoch": 2.2493938344302045, + "grad_norm": 0.21524479985237122, + "learning_rate": 5.364643148263205e-05, + "loss": 0.1753, + "step": 17860 + }, + { + "epoch": 2.2500236168403815, + "grad_norm": 0.22853802144527435, + "learning_rate": 5.3562211954357006e-05, + "loss": 0.1752, + "step": 17865 + }, + { + "epoch": 2.250653399250559, + "grad_norm": 0.19708101451396942, + "learning_rate": 5.347804421537701e-05, + "loss": 0.1701, + "step": 17870 + }, + { + "epoch": 2.251283181660736, + "grad_norm": 0.22857971489429474, + "learning_rate": 5.339392831089209e-05, + "loss": 0.1662, + "step": 17875 + }, + { + "epoch": 2.2519129640709137, + "grad_norm": 0.2373005598783493, + "learning_rate": 5.33098642860743e-05, + "loss": 0.1878, + "step": 17880 + }, + { + "epoch": 2.2525427464810908, + "grad_norm": 0.22458739578723907, + "learning_rate": 5.322585218606811e-05, + "loss": 0.1711, + "step": 17885 + }, + { + "epoch": 2.253172528891268, + "grad_norm": 0.24684731662273407, + "learning_rate": 5.314189205598987e-05, + "loss": 0.1833, + "step": 17890 + }, + { + "epoch": 2.2538023113014454, + "grad_norm": 0.22604569792747498, + "learning_rate": 5.3057983940928046e-05, + "loss": 0.1683, + "step": 17895 + }, + { + "epoch": 2.2544320937116225, + "grad_norm": 0.23015649616718292, + "learning_rate": 5.2974127885943166e-05, + "loss": 0.1793, + "step": 17900 + }, + { + "epoch": 2.2550618761218, + "grad_norm": 0.2156984657049179, + "learning_rate": 5.289032393606797e-05, + "loss": 0.1816, + "step": 17905 + }, + { + "epoch": 2.255691658531977, + "grad_norm": 0.2468300610780716, + "learning_rate": 5.280657213630704e-05, + "loss": 0.1795, + "step": 17910 + }, + { + "epoch": 2.2563214409421546, + "grad_norm": 0.19326730072498322, + "learning_rate": 5.2722872531637024e-05, + "loss": 0.1726, + "step": 17915 + }, + { + "epoch": 2.2569512233523317, + "grad_norm": 0.19111455976963043, + "learning_rate": 5.2639225167006475e-05, + "loss": 0.1709, + "step": 17920 + }, + { + "epoch": 2.257581005762509, + "grad_norm": 0.24302569031715393, + "learning_rate": 5.255563008733599e-05, + "loss": 0.1752, + "step": 17925 + }, + { + "epoch": 2.2582107881726863, + "grad_norm": 0.20797547698020935, + "learning_rate": 5.247208733751801e-05, + "loss": 0.1792, + "step": 17930 + }, + { + "epoch": 2.258840570582864, + "grad_norm": 0.21642006933689117, + "learning_rate": 5.238859696241689e-05, + "loss": 0.1673, + "step": 17935 + }, + { + "epoch": 2.259470352993041, + "grad_norm": 0.22728614509105682, + "learning_rate": 5.2305159006868885e-05, + "loss": 0.1793, + "step": 17940 + }, + { + "epoch": 2.260100135403218, + "grad_norm": 0.24052174389362335, + "learning_rate": 5.2221773515682035e-05, + "loss": 0.1791, + "step": 17945 + }, + { + "epoch": 2.2607299178133955, + "grad_norm": 0.21312139928340912, + "learning_rate": 5.213844053363635e-05, + "loss": 0.177, + "step": 17950 + }, + { + "epoch": 2.2613597002235726, + "grad_norm": 0.22087723016738892, + "learning_rate": 5.205516010548349e-05, + "loss": 0.1764, + "step": 17955 + }, + { + "epoch": 2.26198948263375, + "grad_norm": 0.24077439308166504, + "learning_rate": 5.1971932275946967e-05, + "loss": 0.1884, + "step": 17960 + }, + { + "epoch": 2.262619265043927, + "grad_norm": 0.2120356261730194, + "learning_rate": 5.188875708972198e-05, + "loss": 0.173, + "step": 17965 + }, + { + "epoch": 2.2632490474541047, + "grad_norm": 0.24573729932308197, + "learning_rate": 5.1805634591475555e-05, + "loss": 0.1824, + "step": 17970 + }, + { + "epoch": 2.263878829864282, + "grad_norm": 0.20354896783828735, + "learning_rate": 5.1722564825846336e-05, + "loss": 0.1738, + "step": 17975 + }, + { + "epoch": 2.2645086122744593, + "grad_norm": 0.2105248123407364, + "learning_rate": 5.1639547837444725e-05, + "loss": 0.1694, + "step": 17980 + }, + { + "epoch": 2.2651383946846364, + "grad_norm": 0.21009747684001923, + "learning_rate": 5.1556583670852636e-05, + "loss": 0.1773, + "step": 17985 + }, + { + "epoch": 2.265768177094814, + "grad_norm": 0.21542850136756897, + "learning_rate": 5.147367237062387e-05, + "loss": 0.1682, + "step": 17990 + }, + { + "epoch": 2.266397959504991, + "grad_norm": 0.20584627985954285, + "learning_rate": 5.1390813981283676e-05, + "loss": 0.1734, + "step": 17995 + }, + { + "epoch": 2.267027741915168, + "grad_norm": 0.2486305981874466, + "learning_rate": 5.130800854732877e-05, + "loss": 0.1825, + "step": 18000 + }, + { + "epoch": 2.267027741915168, + "eval_loss": 0.35427358746528625, + "eval_runtime": 6.1591, + "eval_samples_per_second": 162.361, + "eval_steps_per_second": 10.229, + "step": 18000 + }, + { + "epoch": 2.2676575243253456, + "grad_norm": 0.19808907806873322, + "learning_rate": 5.122525611322761e-05, + "loss": 0.1625, + "step": 18005 + }, + { + "epoch": 2.2682873067355227, + "grad_norm": 0.24098962545394897, + "learning_rate": 5.114255672342022e-05, + "loss": 0.1687, + "step": 18010 + }, + { + "epoch": 2.2689170891457002, + "grad_norm": 0.22834831476211548, + "learning_rate": 5.105991042231799e-05, + "loss": 0.1695, + "step": 18015 + }, + { + "epoch": 2.2695468715558773, + "grad_norm": 0.19950784742832184, + "learning_rate": 5.097731725430392e-05, + "loss": 0.1692, + "step": 18020 + }, + { + "epoch": 2.270176653966055, + "grad_norm": 0.23613286018371582, + "learning_rate": 5.0894777263732405e-05, + "loss": 0.176, + "step": 18025 + }, + { + "epoch": 2.270806436376232, + "grad_norm": 0.2248247116804123, + "learning_rate": 5.081229049492929e-05, + "loss": 0.1638, + "step": 18030 + }, + { + "epoch": 2.2714362187864094, + "grad_norm": 0.21063442528247833, + "learning_rate": 5.072985699219186e-05, + "loss": 0.1696, + "step": 18035 + }, + { + "epoch": 2.2720660011965865, + "grad_norm": 0.26251456141471863, + "learning_rate": 5.064747679978881e-05, + "loss": 0.1784, + "step": 18040 + }, + { + "epoch": 2.272695783606764, + "grad_norm": 0.20396436750888824, + "learning_rate": 5.056514996196011e-05, + "loss": 0.1733, + "step": 18045 + }, + { + "epoch": 2.273325566016941, + "grad_norm": 0.21515126526355743, + "learning_rate": 5.048287652291728e-05, + "loss": 0.1625, + "step": 18050 + }, + { + "epoch": 2.273955348427118, + "grad_norm": 0.24371370673179626, + "learning_rate": 5.0400656526842946e-05, + "loss": 0.1739, + "step": 18055 + }, + { + "epoch": 2.2745851308372957, + "grad_norm": 0.22852087020874023, + "learning_rate": 5.03184900178912e-05, + "loss": 0.171, + "step": 18060 + }, + { + "epoch": 2.275214913247473, + "grad_norm": 0.22659562528133392, + "learning_rate": 5.023637704018719e-05, + "loss": 0.1769, + "step": 18065 + }, + { + "epoch": 2.2758446956576504, + "grad_norm": 0.2462269514799118, + "learning_rate": 5.01543176378276e-05, + "loss": 0.1731, + "step": 18070 + }, + { + "epoch": 2.2764744780678274, + "grad_norm": 0.21395175158977509, + "learning_rate": 5.007231185488016e-05, + "loss": 0.1705, + "step": 18075 + }, + { + "epoch": 2.277104260478005, + "grad_norm": 0.2166956514120102, + "learning_rate": 4.9990359735383837e-05, + "loss": 0.1671, + "step": 18080 + }, + { + "epoch": 2.277734042888182, + "grad_norm": 0.23139755427837372, + "learning_rate": 4.9908461323348754e-05, + "loss": 0.1785, + "step": 18085 + }, + { + "epoch": 2.2783638252983596, + "grad_norm": 0.23193643987178802, + "learning_rate": 4.982661666275632e-05, + "loss": 0.1746, + "step": 18090 + }, + { + "epoch": 2.2789936077085367, + "grad_norm": 0.21008536219596863, + "learning_rate": 4.974482579755899e-05, + "loss": 0.1784, + "step": 18095 + }, + { + "epoch": 2.279623390118714, + "grad_norm": 0.23688139021396637, + "learning_rate": 4.9663088771680235e-05, + "loss": 0.1812, + "step": 18100 + }, + { + "epoch": 2.2802531725288913, + "grad_norm": 0.20811019837856293, + "learning_rate": 4.958140562901468e-05, + "loss": 0.1721, + "step": 18105 + }, + { + "epoch": 2.2808829549390683, + "grad_norm": 0.2096734642982483, + "learning_rate": 4.9499776413428167e-05, + "loss": 0.1697, + "step": 18110 + }, + { + "epoch": 2.281512737349246, + "grad_norm": 0.22839121520519257, + "learning_rate": 4.9418201168757386e-05, + "loss": 0.1729, + "step": 18115 + }, + { + "epoch": 2.282142519759423, + "grad_norm": 0.21908484399318695, + "learning_rate": 4.9336679938810106e-05, + "loss": 0.1659, + "step": 18120 + }, + { + "epoch": 2.2827723021696005, + "grad_norm": 0.20620904862880707, + "learning_rate": 4.925521276736511e-05, + "loss": 0.1636, + "step": 18125 + }, + { + "epoch": 2.2834020845797776, + "grad_norm": 0.28344854712486267, + "learning_rate": 4.9173799698172095e-05, + "loss": 0.1753, + "step": 18130 + }, + { + "epoch": 2.284031866989955, + "grad_norm": 0.2172774374485016, + "learning_rate": 4.909244077495175e-05, + "loss": 0.1702, + "step": 18135 + }, + { + "epoch": 2.284661649400132, + "grad_norm": 0.19668060541152954, + "learning_rate": 4.90111360413957e-05, + "loss": 0.1715, + "step": 18140 + }, + { + "epoch": 2.2852914318103097, + "grad_norm": 0.19766007363796234, + "learning_rate": 4.892988554116642e-05, + "loss": 0.1608, + "step": 18145 + }, + { + "epoch": 2.2859212142204868, + "grad_norm": 0.2108301967382431, + "learning_rate": 4.884868931789724e-05, + "loss": 0.1633, + "step": 18150 + }, + { + "epoch": 2.2865509966306643, + "grad_norm": 0.25781720876693726, + "learning_rate": 4.8767547415192476e-05, + "loss": 0.1634, + "step": 18155 + }, + { + "epoch": 2.2871807790408414, + "grad_norm": 0.21515868604183197, + "learning_rate": 4.8686459876627164e-05, + "loss": 0.1687, + "step": 18160 + }, + { + "epoch": 2.2878105614510185, + "grad_norm": 0.23936854302883148, + "learning_rate": 4.860542674574713e-05, + "loss": 0.1786, + "step": 18165 + }, + { + "epoch": 2.288440343861196, + "grad_norm": 0.2083710879087448, + "learning_rate": 4.852444806606904e-05, + "loss": 0.1727, + "step": 18170 + }, + { + "epoch": 2.289070126271373, + "grad_norm": 0.24087072908878326, + "learning_rate": 4.844352388108028e-05, + "loss": 0.1646, + "step": 18175 + }, + { + "epoch": 2.2896999086815506, + "grad_norm": 0.22956833243370056, + "learning_rate": 4.836265423423898e-05, + "loss": 0.1667, + "step": 18180 + }, + { + "epoch": 2.2903296910917277, + "grad_norm": 0.2500525414943695, + "learning_rate": 4.828183916897402e-05, + "loss": 0.1788, + "step": 18185 + }, + { + "epoch": 2.290959473501905, + "grad_norm": 0.23779354989528656, + "learning_rate": 4.820107872868486e-05, + "loss": 0.1687, + "step": 18190 + }, + { + "epoch": 2.2915892559120823, + "grad_norm": 0.21519017219543457, + "learning_rate": 4.81203729567418e-05, + "loss": 0.173, + "step": 18195 + }, + { + "epoch": 2.29221903832226, + "grad_norm": 0.2123459428548813, + "learning_rate": 4.803972189648568e-05, + "loss": 0.1648, + "step": 18200 + }, + { + "epoch": 2.292848820732437, + "grad_norm": 0.2364078015089035, + "learning_rate": 4.795912559122789e-05, + "loss": 0.1743, + "step": 18205 + }, + { + "epoch": 2.2934786031426144, + "grad_norm": 0.23717305064201355, + "learning_rate": 4.787858408425045e-05, + "loss": 0.1827, + "step": 18210 + }, + { + "epoch": 2.2941083855527915, + "grad_norm": 0.197091206908226, + "learning_rate": 4.7798097418806134e-05, + "loss": 0.1713, + "step": 18215 + }, + { + "epoch": 2.2947381679629686, + "grad_norm": 0.19760344922542572, + "learning_rate": 4.771766563811803e-05, + "loss": 0.1612, + "step": 18220 + }, + { + "epoch": 2.295367950373146, + "grad_norm": 0.22046242654323578, + "learning_rate": 4.763728878537984e-05, + "loss": 0.1691, + "step": 18225 + }, + { + "epoch": 2.295997732783323, + "grad_norm": 0.22356641292572021, + "learning_rate": 4.755696690375574e-05, + "loss": 0.1684, + "step": 18230 + }, + { + "epoch": 2.2966275151935007, + "grad_norm": 0.20664890110492706, + "learning_rate": 4.7476700036380565e-05, + "loss": 0.1656, + "step": 18235 + }, + { + "epoch": 2.297257297603678, + "grad_norm": 0.2873956859111786, + "learning_rate": 4.73964882263593e-05, + "loss": 0.1811, + "step": 18240 + }, + { + "epoch": 2.2978870800138553, + "grad_norm": 0.23324726521968842, + "learning_rate": 4.7316331516767575e-05, + "loss": 0.17, + "step": 18245 + }, + { + "epoch": 2.2985168624240324, + "grad_norm": 0.22407886385917664, + "learning_rate": 4.7236229950651314e-05, + "loss": 0.1589, + "step": 18250 + }, + { + "epoch": 2.29914664483421, + "grad_norm": 0.2202986776828766, + "learning_rate": 4.7156183571026985e-05, + "loss": 0.1806, + "step": 18255 + }, + { + "epoch": 2.299776427244387, + "grad_norm": 0.1998445987701416, + "learning_rate": 4.707619242088129e-05, + "loss": 0.1571, + "step": 18260 + }, + { + "epoch": 2.3004062096545645, + "grad_norm": 0.24477636814117432, + "learning_rate": 4.69962565431713e-05, + "loss": 0.1788, + "step": 18265 + }, + { + "epoch": 2.3010359920647416, + "grad_norm": 0.2186649590730667, + "learning_rate": 4.691637598082439e-05, + "loss": 0.1837, + "step": 18270 + }, + { + "epoch": 2.3016657744749187, + "grad_norm": 0.19296254217624664, + "learning_rate": 4.683655077673826e-05, + "loss": 0.1609, + "step": 18275 + }, + { + "epoch": 2.3022955568850962, + "grad_norm": 0.234447181224823, + "learning_rate": 4.675678097378086e-05, + "loss": 0.1711, + "step": 18280 + }, + { + "epoch": 2.3029253392952733, + "grad_norm": 0.19974513351917267, + "learning_rate": 4.667706661479041e-05, + "loss": 0.1666, + "step": 18285 + }, + { + "epoch": 2.303555121705451, + "grad_norm": 0.23064357042312622, + "learning_rate": 4.659740774257527e-05, + "loss": 0.1684, + "step": 18290 + }, + { + "epoch": 2.304184904115628, + "grad_norm": 0.19428302347660065, + "learning_rate": 4.6517804399914214e-05, + "loss": 0.166, + "step": 18295 + }, + { + "epoch": 2.3048146865258055, + "grad_norm": 0.23040397465229034, + "learning_rate": 4.6438256629555956e-05, + "loss": 0.1687, + "step": 18300 + }, + { + "epoch": 2.3054444689359825, + "grad_norm": 0.22161847352981567, + "learning_rate": 4.635876447421955e-05, + "loss": 0.1784, + "step": 18305 + }, + { + "epoch": 2.30607425134616, + "grad_norm": 0.22831936180591583, + "learning_rate": 4.6279327976593924e-05, + "loss": 0.1731, + "step": 18310 + }, + { + "epoch": 2.306704033756337, + "grad_norm": 0.25957801938056946, + "learning_rate": 4.619994717933848e-05, + "loss": 0.1823, + "step": 18315 + }, + { + "epoch": 2.3073338161665147, + "grad_norm": 0.23449194431304932, + "learning_rate": 4.6120622125082426e-05, + "loss": 0.1725, + "step": 18320 + }, + { + "epoch": 2.3079635985766918, + "grad_norm": 0.24584275484085083, + "learning_rate": 4.604135285642514e-05, + "loss": 0.1857, + "step": 18325 + }, + { + "epoch": 2.308593380986869, + "grad_norm": 0.21245352923870087, + "learning_rate": 4.5962139415936056e-05, + "loss": 0.164, + "step": 18330 + }, + { + "epoch": 2.3092231633970464, + "grad_norm": 0.2068212777376175, + "learning_rate": 4.588298184615453e-05, + "loss": 0.1661, + "step": 18335 + }, + { + "epoch": 2.3098529458072234, + "grad_norm": 0.21349553763866425, + "learning_rate": 4.580388018959013e-05, + "loss": 0.1707, + "step": 18340 + }, + { + "epoch": 2.310482728217401, + "grad_norm": 0.2073366641998291, + "learning_rate": 4.5724834488722106e-05, + "loss": 0.1608, + "step": 18345 + }, + { + "epoch": 2.311112510627578, + "grad_norm": 0.2493850737810135, + "learning_rate": 4.564584478599982e-05, + "loss": 0.176, + "step": 18350 + }, + { + "epoch": 2.3117422930377556, + "grad_norm": 0.25253990292549133, + "learning_rate": 4.556691112384262e-05, + "loss": 0.1744, + "step": 18355 + }, + { + "epoch": 2.3123720754479327, + "grad_norm": 0.24499280750751495, + "learning_rate": 4.548803354463967e-05, + "loss": 0.1755, + "step": 18360 + }, + { + "epoch": 2.31300185785811, + "grad_norm": 0.21188803017139435, + "learning_rate": 4.540921209075e-05, + "loss": 0.1675, + "step": 18365 + }, + { + "epoch": 2.3136316402682873, + "grad_norm": 0.2255249321460724, + "learning_rate": 4.5330446804502543e-05, + "loss": 0.1668, + "step": 18370 + }, + { + "epoch": 2.314261422678465, + "grad_norm": 0.2088666409254074, + "learning_rate": 4.525173772819606e-05, + "loss": 0.173, + "step": 18375 + }, + { + "epoch": 2.314891205088642, + "grad_norm": 0.24474313855171204, + "learning_rate": 4.517308490409912e-05, + "loss": 0.1672, + "step": 18380 + }, + { + "epoch": 2.315520987498819, + "grad_norm": 0.2033611238002777, + "learning_rate": 4.5094488374450085e-05, + "loss": 0.1677, + "step": 18385 + }, + { + "epoch": 2.3161507699089965, + "grad_norm": 0.22693341970443726, + "learning_rate": 4.50159481814571e-05, + "loss": 0.1653, + "step": 18390 + }, + { + "epoch": 2.3167805523191736, + "grad_norm": 0.24162709712982178, + "learning_rate": 4.493746436729797e-05, + "loss": 0.1668, + "step": 18395 + }, + { + "epoch": 2.317410334729351, + "grad_norm": 0.21281133592128754, + "learning_rate": 4.485903697412041e-05, + "loss": 0.167, + "step": 18400 + }, + { + "epoch": 2.318040117139528, + "grad_norm": 0.2348182648420334, + "learning_rate": 4.478066604404168e-05, + "loss": 0.1683, + "step": 18405 + }, + { + "epoch": 2.3186698995497057, + "grad_norm": 0.2391456663608551, + "learning_rate": 4.470235161914878e-05, + "loss": 0.1708, + "step": 18410 + }, + { + "epoch": 2.319299681959883, + "grad_norm": 0.2014867216348648, + "learning_rate": 4.462409374149822e-05, + "loss": 0.1679, + "step": 18415 + }, + { + "epoch": 2.3199294643700603, + "grad_norm": 0.19464534521102905, + "learning_rate": 4.4545892453116414e-05, + "loss": 0.167, + "step": 18420 + }, + { + "epoch": 2.3205592467802374, + "grad_norm": 0.18525034189224243, + "learning_rate": 4.446774779599918e-05, + "loss": 0.16, + "step": 18425 + }, + { + "epoch": 2.321189029190415, + "grad_norm": 0.220379039645195, + "learning_rate": 4.438965981211201e-05, + "loss": 0.1728, + "step": 18430 + }, + { + "epoch": 2.321818811600592, + "grad_norm": 0.22186563909053802, + "learning_rate": 4.431162854338985e-05, + "loss": 0.1651, + "step": 18435 + }, + { + "epoch": 2.322448594010769, + "grad_norm": 0.22272159159183502, + "learning_rate": 4.423365403173739e-05, + "loss": 0.171, + "step": 18440 + }, + { + "epoch": 2.3230783764209466, + "grad_norm": 0.220636785030365, + "learning_rate": 4.4155736319028725e-05, + "loss": 0.1691, + "step": 18445 + }, + { + "epoch": 2.3237081588311237, + "grad_norm": 0.22500810027122498, + "learning_rate": 4.4077875447107356e-05, + "loss": 0.1648, + "step": 18450 + }, + { + "epoch": 2.324337941241301, + "grad_norm": 0.2163766771554947, + "learning_rate": 4.4000071457786335e-05, + "loss": 0.1655, + "step": 18455 + }, + { + "epoch": 2.3249677236514783, + "grad_norm": 0.2258923053741455, + "learning_rate": 4.392232439284829e-05, + "loss": 0.1704, + "step": 18460 + }, + { + "epoch": 2.325597506061656, + "grad_norm": 0.23461341857910156, + "learning_rate": 4.384463429404511e-05, + "loss": 0.1686, + "step": 18465 + }, + { + "epoch": 2.326227288471833, + "grad_norm": 0.22406549751758575, + "learning_rate": 4.376700120309816e-05, + "loss": 0.1655, + "step": 18470 + }, + { + "epoch": 2.3268570708820104, + "grad_norm": 0.21646642684936523, + "learning_rate": 4.368942516169819e-05, + "loss": 0.1682, + "step": 18475 + }, + { + "epoch": 2.3274868532921875, + "grad_norm": 0.23925819993019104, + "learning_rate": 4.3611906211505284e-05, + "loss": 0.1746, + "step": 18480 + }, + { + "epoch": 2.328116635702365, + "grad_norm": 0.19920630753040314, + "learning_rate": 4.35344443941489e-05, + "loss": 0.158, + "step": 18485 + }, + { + "epoch": 2.328746418112542, + "grad_norm": 0.2575379014015198, + "learning_rate": 4.345703975122783e-05, + "loss": 0.1708, + "step": 18490 + }, + { + "epoch": 2.329376200522719, + "grad_norm": 0.19556741416454315, + "learning_rate": 4.3379692324310056e-05, + "loss": 0.1677, + "step": 18495 + }, + { + "epoch": 2.3300059829328967, + "grad_norm": 0.2595387101173401, + "learning_rate": 4.3302402154933005e-05, + "loss": 0.1705, + "step": 18500 + }, + { + "epoch": 2.330635765343074, + "grad_norm": 0.21318422257900238, + "learning_rate": 4.322516928460325e-05, + "loss": 0.1676, + "step": 18505 + }, + { + "epoch": 2.3312655477532513, + "grad_norm": 0.2212359607219696, + "learning_rate": 4.3147993754796624e-05, + "loss": 0.1661, + "step": 18510 + }, + { + "epoch": 2.3318953301634284, + "grad_norm": 0.1886136680841446, + "learning_rate": 4.3070875606958006e-05, + "loss": 0.1613, + "step": 18515 + }, + { + "epoch": 2.332525112573606, + "grad_norm": 0.23505628108978271, + "learning_rate": 4.2993814882501754e-05, + "loss": 0.1687, + "step": 18520 + }, + { + "epoch": 2.333154894983783, + "grad_norm": 0.18686296045780182, + "learning_rate": 4.2916811622811195e-05, + "loss": 0.1613, + "step": 18525 + }, + { + "epoch": 2.3337846773939606, + "grad_norm": 0.21165959537029266, + "learning_rate": 4.2839865869238845e-05, + "loss": 0.1604, + "step": 18530 + }, + { + "epoch": 2.3344144598041376, + "grad_norm": 0.29806169867515564, + "learning_rate": 4.27629776631063e-05, + "loss": 0.1682, + "step": 18535 + }, + { + "epoch": 2.335044242214315, + "grad_norm": 0.2488899528980255, + "learning_rate": 4.268614704570426e-05, + "loss": 0.1758, + "step": 18540 + }, + { + "epoch": 2.3356740246244923, + "grad_norm": 0.21834008395671844, + "learning_rate": 4.2609374058292666e-05, + "loss": 0.1587, + "step": 18545 + }, + { + "epoch": 2.3363038070346693, + "grad_norm": 0.22900566458702087, + "learning_rate": 4.253265874210022e-05, + "loss": 0.1798, + "step": 18550 + }, + { + "epoch": 2.336933589444847, + "grad_norm": 0.22346030175685883, + "learning_rate": 4.2456001138324794e-05, + "loss": 0.1656, + "step": 18555 + }, + { + "epoch": 2.337563371855024, + "grad_norm": 0.22244654595851898, + "learning_rate": 4.237940128813336e-05, + "loss": 0.1734, + "step": 18560 + }, + { + "epoch": 2.3381931542652015, + "grad_norm": 0.19254350662231445, + "learning_rate": 4.230285923266175e-05, + "loss": 0.1619, + "step": 18565 + }, + { + "epoch": 2.3388229366753785, + "grad_norm": 0.22871673107147217, + "learning_rate": 4.222637501301481e-05, + "loss": 0.166, + "step": 18570 + }, + { + "epoch": 2.339452719085556, + "grad_norm": 0.20270411670207977, + "learning_rate": 4.2149948670266284e-05, + "loss": 0.1637, + "step": 18575 + }, + { + "epoch": 2.340082501495733, + "grad_norm": 0.23636558651924133, + "learning_rate": 4.2073580245458874e-05, + "loss": 0.1839, + "step": 18580 + }, + { + "epoch": 2.3407122839059107, + "grad_norm": 0.24934862554073334, + "learning_rate": 4.1997269779604185e-05, + "loss": 0.1661, + "step": 18585 + }, + { + "epoch": 2.3413420663160878, + "grad_norm": 0.2234071046113968, + "learning_rate": 4.192101731368267e-05, + "loss": 0.1699, + "step": 18590 + }, + { + "epoch": 2.3419718487262653, + "grad_norm": 0.20725548267364502, + "learning_rate": 4.1844822888643634e-05, + "loss": 0.1663, + "step": 18595 + }, + { + "epoch": 2.3426016311364424, + "grad_norm": 0.22668230533599854, + "learning_rate": 4.1768686545405186e-05, + "loss": 0.1647, + "step": 18600 + }, + { + "epoch": 2.3432314135466195, + "grad_norm": 0.23123641312122345, + "learning_rate": 4.1692608324854384e-05, + "loss": 0.171, + "step": 18605 + }, + { + "epoch": 2.343861195956797, + "grad_norm": 0.21715596318244934, + "learning_rate": 4.161658826784692e-05, + "loss": 0.1631, + "step": 18610 + }, + { + "epoch": 2.344490978366974, + "grad_norm": 0.24206319451332092, + "learning_rate": 4.154062641520732e-05, + "loss": 0.1724, + "step": 18615 + }, + { + "epoch": 2.3451207607771516, + "grad_norm": 0.21535861492156982, + "learning_rate": 4.1464722807728724e-05, + "loss": 0.1673, + "step": 18620 + }, + { + "epoch": 2.3457505431873287, + "grad_norm": 0.24345341324806213, + "learning_rate": 4.1388877486173245e-05, + "loss": 0.1648, + "step": 18625 + }, + { + "epoch": 2.346380325597506, + "grad_norm": 0.2361554056406021, + "learning_rate": 4.131309049127149e-05, + "loss": 0.1624, + "step": 18630 + }, + { + "epoch": 2.3470101080076833, + "grad_norm": 0.20666177570819855, + "learning_rate": 4.1237361863722816e-05, + "loss": 0.1662, + "step": 18635 + }, + { + "epoch": 2.347639890417861, + "grad_norm": 0.22876566648483276, + "learning_rate": 4.1161691644195165e-05, + "loss": 0.1767, + "step": 18640 + }, + { + "epoch": 2.348269672828038, + "grad_norm": 0.19370432198047638, + "learning_rate": 4.108607987332529e-05, + "loss": 0.1604, + "step": 18645 + }, + { + "epoch": 2.3488994552382154, + "grad_norm": 0.22485142946243286, + "learning_rate": 4.101052659171842e-05, + "loss": 0.1667, + "step": 18650 + }, + { + "epoch": 2.3495292376483925, + "grad_norm": 0.2446049600839615, + "learning_rate": 4.0935031839948315e-05, + "loss": 0.1719, + "step": 18655 + }, + { + "epoch": 2.3501590200585696, + "grad_norm": 0.22652800381183624, + "learning_rate": 4.0859595658557367e-05, + "loss": 0.1666, + "step": 18660 + }, + { + "epoch": 2.350788802468747, + "grad_norm": 0.1760840266942978, + "learning_rate": 4.078421808805663e-05, + "loss": 0.1516, + "step": 18665 + }, + { + "epoch": 2.351418584878924, + "grad_norm": 0.20791617035865784, + "learning_rate": 4.070889916892553e-05, + "loss": 0.164, + "step": 18670 + }, + { + "epoch": 2.3520483672891017, + "grad_norm": 0.2205626517534256, + "learning_rate": 4.063363894161206e-05, + "loss": 0.1669, + "step": 18675 + }, + { + "epoch": 2.352678149699279, + "grad_norm": 0.23379269242286682, + "learning_rate": 4.055843744653266e-05, + "loss": 0.1593, + "step": 18680 + }, + { + "epoch": 2.3533079321094563, + "grad_norm": 0.23451068997383118, + "learning_rate": 4.0483294724072254e-05, + "loss": 0.1633, + "step": 18685 + }, + { + "epoch": 2.3539377145196334, + "grad_norm": 0.28889602422714233, + "learning_rate": 4.040821081458422e-05, + "loss": 0.1752, + "step": 18690 + }, + { + "epoch": 2.354567496929811, + "grad_norm": 0.2054235339164734, + "learning_rate": 4.0333185758390307e-05, + "loss": 0.1666, + "step": 18695 + }, + { + "epoch": 2.355197279339988, + "grad_norm": 0.19770711660385132, + "learning_rate": 4.025821959578067e-05, + "loss": 0.1701, + "step": 18700 + }, + { + "epoch": 2.3558270617501655, + "grad_norm": 0.25233033299446106, + "learning_rate": 4.0183312367013906e-05, + "loss": 0.1722, + "step": 18705 + }, + { + "epoch": 2.3564568441603426, + "grad_norm": 0.20867769420146942, + "learning_rate": 4.010846411231689e-05, + "loss": 0.1601, + "step": 18710 + }, + { + "epoch": 2.3570866265705197, + "grad_norm": 0.21671661734580994, + "learning_rate": 4.003367487188483e-05, + "loss": 0.1658, + "step": 18715 + }, + { + "epoch": 2.3577164089806972, + "grad_norm": 0.17957130074501038, + "learning_rate": 3.9958944685881265e-05, + "loss": 0.1619, + "step": 18720 + }, + { + "epoch": 2.3583461913908743, + "grad_norm": 0.21048414707183838, + "learning_rate": 3.988427359443802e-05, + "loss": 0.1668, + "step": 18725 + }, + { + "epoch": 2.358975973801052, + "grad_norm": 0.21969716250896454, + "learning_rate": 3.980966163765513e-05, + "loss": 0.1619, + "step": 18730 + }, + { + "epoch": 2.359605756211229, + "grad_norm": 0.22368858754634857, + "learning_rate": 3.9735108855600984e-05, + "loss": 0.168, + "step": 18735 + }, + { + "epoch": 2.3602355386214064, + "grad_norm": 0.2626504600048065, + "learning_rate": 3.966061528831209e-05, + "loss": 0.1651, + "step": 18740 + }, + { + "epoch": 2.3608653210315835, + "grad_norm": 0.21985310316085815, + "learning_rate": 3.958618097579316e-05, + "loss": 0.1671, + "step": 18745 + }, + { + "epoch": 2.361495103441761, + "grad_norm": 0.22451792657375336, + "learning_rate": 3.9511805958017205e-05, + "loss": 0.1609, + "step": 18750 + }, + { + "epoch": 2.362124885851938, + "grad_norm": 0.2123977243900299, + "learning_rate": 3.943749027492532e-05, + "loss": 0.1719, + "step": 18755 + }, + { + "epoch": 2.3627546682621157, + "grad_norm": 0.2234313040971756, + "learning_rate": 3.936323396642658e-05, + "loss": 0.1556, + "step": 18760 + }, + { + "epoch": 2.3633844506722927, + "grad_norm": 0.19645099341869354, + "learning_rate": 3.928903707239846e-05, + "loss": 0.1673, + "step": 18765 + }, + { + "epoch": 2.36401423308247, + "grad_norm": 0.22249870002269745, + "learning_rate": 3.9214899632686334e-05, + "loss": 0.1589, + "step": 18770 + }, + { + "epoch": 2.3646440154926474, + "grad_norm": 0.2180803418159485, + "learning_rate": 3.914082168710369e-05, + "loss": 0.1685, + "step": 18775 + }, + { + "epoch": 2.3652737979028244, + "grad_norm": 0.2156234085559845, + "learning_rate": 3.906680327543212e-05, + "loss": 0.1613, + "step": 18780 + }, + { + "epoch": 2.365903580313002, + "grad_norm": 0.2180781066417694, + "learning_rate": 3.899284443742112e-05, + "loss": 0.1654, + "step": 18785 + }, + { + "epoch": 2.366533362723179, + "grad_norm": 0.2102290391921997, + "learning_rate": 3.89189452127884e-05, + "loss": 0.1635, + "step": 18790 + }, + { + "epoch": 2.3671631451333566, + "grad_norm": 0.26211512088775635, + "learning_rate": 3.884510564121944e-05, + "loss": 0.174, + "step": 18795 + }, + { + "epoch": 2.3677929275435337, + "grad_norm": 0.1999218463897705, + "learning_rate": 3.877132576236778e-05, + "loss": 0.1619, + "step": 18800 + }, + { + "epoch": 2.368422709953711, + "grad_norm": 0.21774223446846008, + "learning_rate": 3.8697605615854875e-05, + "loss": 0.1616, + "step": 18805 + }, + { + "epoch": 2.3690524923638883, + "grad_norm": 0.2304651439189911, + "learning_rate": 3.862394524127023e-05, + "loss": 0.1705, + "step": 18810 + }, + { + "epoch": 2.369682274774066, + "grad_norm": 0.24826854467391968, + "learning_rate": 3.8550344678171084e-05, + "loss": 0.1734, + "step": 18815 + }, + { + "epoch": 2.370312057184243, + "grad_norm": 0.21676623821258545, + "learning_rate": 3.847680396608262e-05, + "loss": 0.1669, + "step": 18820 + }, + { + "epoch": 2.37094183959442, + "grad_norm": 0.21203717589378357, + "learning_rate": 3.840332314449788e-05, + "loss": 0.1633, + "step": 18825 + }, + { + "epoch": 2.3715716220045975, + "grad_norm": 0.2409755140542984, + "learning_rate": 3.832990225287776e-05, + "loss": 0.1687, + "step": 18830 + }, + { + "epoch": 2.3722014044147746, + "grad_norm": 0.19998323917388916, + "learning_rate": 3.825654133065094e-05, + "loss": 0.1578, + "step": 18835 + }, + { + "epoch": 2.372831186824952, + "grad_norm": 0.22083517909049988, + "learning_rate": 3.818324041721391e-05, + "loss": 0.1721, + "step": 18840 + }, + { + "epoch": 2.373460969235129, + "grad_norm": 0.19865678250789642, + "learning_rate": 3.8109999551930914e-05, + "loss": 0.1613, + "step": 18845 + }, + { + "epoch": 2.3740907516453067, + "grad_norm": 0.2167719304561615, + "learning_rate": 3.8036818774134037e-05, + "loss": 0.1569, + "step": 18850 + }, + { + "epoch": 2.374720534055484, + "grad_norm": 0.2173914611339569, + "learning_rate": 3.796369812312298e-05, + "loss": 0.1676, + "step": 18855 + }, + { + "epoch": 2.3753503164656613, + "grad_norm": 0.22559495270252228, + "learning_rate": 3.7890637638165255e-05, + "loss": 0.169, + "step": 18860 + }, + { + "epoch": 2.3759800988758384, + "grad_norm": 0.2124035507440567, + "learning_rate": 3.781763735849589e-05, + "loss": 0.1715, + "step": 18865 + }, + { + "epoch": 2.376609881286016, + "grad_norm": 0.23133991658687592, + "learning_rate": 3.774469732331782e-05, + "loss": 0.162, + "step": 18870 + }, + { + "epoch": 2.377239663696193, + "grad_norm": 0.1754513680934906, + "learning_rate": 3.7671817571801464e-05, + "loss": 0.1602, + "step": 18875 + }, + { + "epoch": 2.37786944610637, + "grad_norm": 0.2158019244670868, + "learning_rate": 3.7598998143084924e-05, + "loss": 0.1571, + "step": 18880 + }, + { + "epoch": 2.3784992285165476, + "grad_norm": 0.20694270730018616, + "learning_rate": 3.752623907627388e-05, + "loss": 0.162, + "step": 18885 + }, + { + "epoch": 2.3791290109267247, + "grad_norm": 0.250929057598114, + "learning_rate": 3.7453540410441604e-05, + "loss": 0.1744, + "step": 18890 + }, + { + "epoch": 2.379758793336902, + "grad_norm": 0.23653000593185425, + "learning_rate": 3.738090218462903e-05, + "loss": 0.1789, + "step": 18895 + }, + { + "epoch": 2.3803885757470793, + "grad_norm": 0.1936427801847458, + "learning_rate": 3.730832443784443e-05, + "loss": 0.1532, + "step": 18900 + }, + { + "epoch": 2.381018358157257, + "grad_norm": 0.188064306974411, + "learning_rate": 3.7235807209063716e-05, + "loss": 0.1629, + "step": 18905 + }, + { + "epoch": 2.381648140567434, + "grad_norm": 0.2069697082042694, + "learning_rate": 3.71633505372304e-05, + "loss": 0.1568, + "step": 18910 + }, + { + "epoch": 2.3822779229776114, + "grad_norm": 0.21809029579162598, + "learning_rate": 3.709095446125529e-05, + "loss": 0.1717, + "step": 18915 + }, + { + "epoch": 2.3829077053877885, + "grad_norm": 0.23560817539691925, + "learning_rate": 3.701861902001675e-05, + "loss": 0.1662, + "step": 18920 + }, + { + "epoch": 2.383537487797966, + "grad_norm": 0.19693933427333832, + "learning_rate": 3.694634425236057e-05, + "loss": 0.1558, + "step": 18925 + }, + { + "epoch": 2.384167270208143, + "grad_norm": 0.19060872495174408, + "learning_rate": 3.687413019709994e-05, + "loss": 0.1621, + "step": 18930 + }, + { + "epoch": 2.38479705261832, + "grad_norm": 0.2021481990814209, + "learning_rate": 3.680197689301548e-05, + "loss": 0.1551, + "step": 18935 + }, + { + "epoch": 2.3854268350284977, + "grad_norm": 0.22511224448680878, + "learning_rate": 3.672988437885512e-05, + "loss": 0.1587, + "step": 18940 + }, + { + "epoch": 2.386056617438675, + "grad_norm": 0.2018289864063263, + "learning_rate": 3.665785269333423e-05, + "loss": 0.1654, + "step": 18945 + }, + { + "epoch": 2.3866863998488523, + "grad_norm": 0.21350149810314178, + "learning_rate": 3.65858818751354e-05, + "loss": 0.1673, + "step": 18950 + }, + { + "epoch": 2.3873161822590294, + "grad_norm": 0.21213771402835846, + "learning_rate": 3.65139719629087e-05, + "loss": 0.1791, + "step": 18955 + }, + { + "epoch": 2.387945964669207, + "grad_norm": 0.24175149202346802, + "learning_rate": 3.644212299527139e-05, + "loss": 0.1714, + "step": 18960 + }, + { + "epoch": 2.388575747079384, + "grad_norm": 0.2541513741016388, + "learning_rate": 3.63703350108079e-05, + "loss": 0.1685, + "step": 18965 + }, + { + "epoch": 2.389205529489561, + "grad_norm": 0.24447733163833618, + "learning_rate": 3.629860804807011e-05, + "loss": 0.1728, + "step": 18970 + }, + { + "epoch": 2.3898353118997386, + "grad_norm": 0.1830032914876938, + "learning_rate": 3.622694214557702e-05, + "loss": 0.1698, + "step": 18975 + }, + { + "epoch": 2.390465094309916, + "grad_norm": 0.23851166665554047, + "learning_rate": 3.6155337341814844e-05, + "loss": 0.1754, + "step": 18980 + }, + { + "epoch": 2.3910948767200932, + "grad_norm": 0.1973876655101776, + "learning_rate": 3.608379367523702e-05, + "loss": 0.1703, + "step": 18985 + }, + { + "epoch": 2.3917246591302703, + "grad_norm": 0.2209198772907257, + "learning_rate": 3.6012311184264046e-05, + "loss": 0.1674, + "step": 18990 + }, + { + "epoch": 2.392354441540448, + "grad_norm": 0.216825932264328, + "learning_rate": 3.5940889907283834e-05, + "loss": 0.1677, + "step": 18995 + }, + { + "epoch": 2.392984223950625, + "grad_norm": 0.1855764538049698, + "learning_rate": 3.586952988265106e-05, + "loss": 0.1592, + "step": 19000 + }, + { + "epoch": 2.392984223950625, + "eval_loss": 0.35235053300857544, + "eval_runtime": 6.1677, + "eval_samples_per_second": 162.135, + "eval_steps_per_second": 10.215, + "step": 19000 + }, + { + "epoch": 2.3936140063608025, + "grad_norm": 0.25512510538101196, + "learning_rate": 3.579823114868778e-05, + "loss": 0.1649, + "step": 19005 + }, + { + "epoch": 2.3942437887709795, + "grad_norm": 0.20220012962818146, + "learning_rate": 3.572699374368296e-05, + "loss": 0.1638, + "step": 19010 + }, + { + "epoch": 2.394873571181157, + "grad_norm": 0.21141274273395538, + "learning_rate": 3.5655817705892814e-05, + "loss": 0.1697, + "step": 19015 + }, + { + "epoch": 2.395503353591334, + "grad_norm": 0.23368066549301147, + "learning_rate": 3.558470307354046e-05, + "loss": 0.1653, + "step": 19020 + }, + { + "epoch": 2.3961331360015112, + "grad_norm": 0.20436784625053406, + "learning_rate": 3.5513649884816064e-05, + "loss": 0.1561, + "step": 19025 + }, + { + "epoch": 2.3967629184116888, + "grad_norm": 0.223700612783432, + "learning_rate": 3.5442658177876835e-05, + "loss": 0.1693, + "step": 19030 + }, + { + "epoch": 2.3973927008218663, + "grad_norm": 0.26057204604148865, + "learning_rate": 3.5371727990846944e-05, + "loss": 0.1767, + "step": 19035 + }, + { + "epoch": 2.3980224832320434, + "grad_norm": 0.21637168526649475, + "learning_rate": 3.53008593618175e-05, + "loss": 0.1671, + "step": 19040 + }, + { + "epoch": 2.3986522656422204, + "grad_norm": 0.23995353281497955, + "learning_rate": 3.5230052328846585e-05, + "loss": 0.1788, + "step": 19045 + }, + { + "epoch": 2.399282048052398, + "grad_norm": 0.2069759964942932, + "learning_rate": 3.5159306929959144e-05, + "loss": 0.1655, + "step": 19050 + }, + { + "epoch": 2.399911830462575, + "grad_norm": 0.20498618483543396, + "learning_rate": 3.508862320314717e-05, + "loss": 0.1589, + "step": 19055 + }, + { + "epoch": 2.4005416128727526, + "grad_norm": 0.20835870504379272, + "learning_rate": 3.501800118636939e-05, + "loss": 0.1556, + "step": 19060 + }, + { + "epoch": 2.4011713952829297, + "grad_norm": 0.22261159121990204, + "learning_rate": 3.4947440917551475e-05, + "loss": 0.1645, + "step": 19065 + }, + { + "epoch": 2.401801177693107, + "grad_norm": 0.20514576137065887, + "learning_rate": 3.487694243458578e-05, + "loss": 0.1558, + "step": 19070 + }, + { + "epoch": 2.4024309601032843, + "grad_norm": 0.18798956274986267, + "learning_rate": 3.480650577533175e-05, + "loss": 0.1635, + "step": 19075 + }, + { + "epoch": 2.4030607425134614, + "grad_norm": 0.1777620017528534, + "learning_rate": 3.47361309776154e-05, + "loss": 0.1613, + "step": 19080 + }, + { + "epoch": 2.403690524923639, + "grad_norm": 0.22258161008358002, + "learning_rate": 3.466581807922962e-05, + "loss": 0.1657, + "step": 19085 + }, + { + "epoch": 2.4043203073338164, + "grad_norm": 0.20584604144096375, + "learning_rate": 3.4595567117934045e-05, + "loss": 0.1609, + "step": 19090 + }, + { + "epoch": 2.4049500897439935, + "grad_norm": 0.26850444078445435, + "learning_rate": 3.452537813145501e-05, + "loss": 0.165, + "step": 19095 + }, + { + "epoch": 2.4055798721541706, + "grad_norm": 0.1789688616991043, + "learning_rate": 3.4455251157485706e-05, + "loss": 0.1597, + "step": 19100 + }, + { + "epoch": 2.406209654564348, + "grad_norm": 0.24336829781532288, + "learning_rate": 3.438518623368581e-05, + "loss": 0.1582, + "step": 19105 + }, + { + "epoch": 2.406839436974525, + "grad_norm": 0.20159520208835602, + "learning_rate": 3.4315183397681806e-05, + "loss": 0.1572, + "step": 19110 + }, + { + "epoch": 2.4074692193847027, + "grad_norm": 0.22423477470874786, + "learning_rate": 3.424524268706686e-05, + "loss": 0.1611, + "step": 19115 + }, + { + "epoch": 2.40809900179488, + "grad_norm": 0.22861574590206146, + "learning_rate": 3.417536413940073e-05, + "loss": 0.1708, + "step": 19120 + }, + { + "epoch": 2.4087287842050573, + "grad_norm": 0.22517502307891846, + "learning_rate": 3.4105547792209766e-05, + "loss": 0.1498, + "step": 19125 + }, + { + "epoch": 2.4093585666152344, + "grad_norm": 0.22406402230262756, + "learning_rate": 3.403579368298694e-05, + "loss": 0.1722, + "step": 19130 + }, + { + "epoch": 2.4099883490254115, + "grad_norm": 0.21624189615249634, + "learning_rate": 3.3966101849191807e-05, + "loss": 0.165, + "step": 19135 + }, + { + "epoch": 2.410618131435589, + "grad_norm": 0.2186998724937439, + "learning_rate": 3.389647232825048e-05, + "loss": 0.1545, + "step": 19140 + }, + { + "epoch": 2.4112479138457665, + "grad_norm": 0.20615451037883759, + "learning_rate": 3.38269051575556e-05, + "loss": 0.1653, + "step": 19145 + }, + { + "epoch": 2.4118776962559436, + "grad_norm": 0.21351304650306702, + "learning_rate": 3.3757400374466323e-05, + "loss": 0.1667, + "step": 19150 + }, + { + "epoch": 2.4125074786661207, + "grad_norm": 0.2263455092906952, + "learning_rate": 3.368795801630826e-05, + "loss": 0.1635, + "step": 19155 + }, + { + "epoch": 2.4131372610762982, + "grad_norm": 0.20655429363250732, + "learning_rate": 3.361857812037365e-05, + "loss": 0.1657, + "step": 19160 + }, + { + "epoch": 2.4137670434864753, + "grad_norm": 0.1987982541322708, + "learning_rate": 3.354926072392101e-05, + "loss": 0.1554, + "step": 19165 + }, + { + "epoch": 2.414396825896653, + "grad_norm": 0.20431582629680634, + "learning_rate": 3.348000586417539e-05, + "loss": 0.1552, + "step": 19170 + }, + { + "epoch": 2.41502660830683, + "grad_norm": 0.241183340549469, + "learning_rate": 3.34108135783282e-05, + "loss": 0.1758, + "step": 19175 + }, + { + "epoch": 2.4156563907170074, + "grad_norm": 0.1910007894039154, + "learning_rate": 3.3341683903537295e-05, + "loss": 0.1609, + "step": 19180 + }, + { + "epoch": 2.4162861731271845, + "grad_norm": 0.2089349776506424, + "learning_rate": 3.3272616876926916e-05, + "loss": 0.1608, + "step": 19185 + }, + { + "epoch": 2.4169159555373616, + "grad_norm": 0.20799914002418518, + "learning_rate": 3.3203612535587594e-05, + "loss": 0.1636, + "step": 19190 + }, + { + "epoch": 2.417545737947539, + "grad_norm": 0.2071768194437027, + "learning_rate": 3.313467091657622e-05, + "loss": 0.1643, + "step": 19195 + }, + { + "epoch": 2.4181755203577167, + "grad_norm": 0.22981540858745575, + "learning_rate": 3.3065792056916077e-05, + "loss": 0.1749, + "step": 19200 + }, + { + "epoch": 2.4188053027678937, + "grad_norm": 0.22096100449562073, + "learning_rate": 3.2996975993596706e-05, + "loss": 0.1671, + "step": 19205 + }, + { + "epoch": 2.419435085178071, + "grad_norm": 0.19851092994213104, + "learning_rate": 3.292822276357382e-05, + "loss": 0.1605, + "step": 19210 + }, + { + "epoch": 2.4200648675882483, + "grad_norm": 0.21692755818367004, + "learning_rate": 3.285953240376947e-05, + "loss": 0.1629, + "step": 19215 + }, + { + "epoch": 2.4206946499984254, + "grad_norm": 0.1912955939769745, + "learning_rate": 3.279090495107204e-05, + "loss": 0.1626, + "step": 19220 + }, + { + "epoch": 2.421324432408603, + "grad_norm": 0.17941045761108398, + "learning_rate": 3.2722340442335993e-05, + "loss": 0.1528, + "step": 19225 + }, + { + "epoch": 2.42195421481878, + "grad_norm": 0.24879097938537598, + "learning_rate": 3.265383891438203e-05, + "loss": 0.1622, + "step": 19230 + }, + { + "epoch": 2.4225839972289576, + "grad_norm": 0.27064043283462524, + "learning_rate": 3.258540040399703e-05, + "loss": 0.1677, + "step": 19235 + }, + { + "epoch": 2.4232137796391346, + "grad_norm": 0.20788533985614777, + "learning_rate": 3.2517024947934046e-05, + "loss": 0.1742, + "step": 19240 + }, + { + "epoch": 2.4238435620493117, + "grad_norm": 0.20137952268123627, + "learning_rate": 3.2448712582912265e-05, + "loss": 0.1656, + "step": 19245 + }, + { + "epoch": 2.4244733444594893, + "grad_norm": 0.22439540922641754, + "learning_rate": 3.2380463345616986e-05, + "loss": 0.1704, + "step": 19250 + }, + { + "epoch": 2.4251031268696663, + "grad_norm": 0.22877377271652222, + "learning_rate": 3.231227727269956e-05, + "loss": 0.1655, + "step": 19255 + }, + { + "epoch": 2.425732909279844, + "grad_norm": 0.19454975426197052, + "learning_rate": 3.224415440077757e-05, + "loss": 0.1711, + "step": 19260 + }, + { + "epoch": 2.426362691690021, + "grad_norm": 0.21811099350452423, + "learning_rate": 3.217609476643447e-05, + "loss": 0.1602, + "step": 19265 + }, + { + "epoch": 2.4269924741001985, + "grad_norm": 0.21663612127304077, + "learning_rate": 3.2108098406219884e-05, + "loss": 0.1626, + "step": 19270 + }, + { + "epoch": 2.4276222565103756, + "grad_norm": 0.21124348044395447, + "learning_rate": 3.204016535664937e-05, + "loss": 0.1621, + "step": 19275 + }, + { + "epoch": 2.428252038920553, + "grad_norm": 0.20466844737529755, + "learning_rate": 3.1972295654204554e-05, + "loss": 0.1608, + "step": 19280 + }, + { + "epoch": 2.42888182133073, + "grad_norm": 0.20153960585594177, + "learning_rate": 3.1904489335333014e-05, + "loss": 0.1699, + "step": 19285 + }, + { + "epoch": 2.4295116037409077, + "grad_norm": 0.2766586244106293, + "learning_rate": 3.1836746436448294e-05, + "loss": 0.1716, + "step": 19290 + }, + { + "epoch": 2.4301413861510848, + "grad_norm": 0.1904149353504181, + "learning_rate": 3.176906699392986e-05, + "loss": 0.1756, + "step": 19295 + }, + { + "epoch": 2.430771168561262, + "grad_norm": 0.19312819838523865, + "learning_rate": 3.170145104412309e-05, + "loss": 0.1666, + "step": 19300 + }, + { + "epoch": 2.4314009509714394, + "grad_norm": 0.19623906910419464, + "learning_rate": 3.163389862333939e-05, + "loss": 0.1541, + "step": 19305 + }, + { + "epoch": 2.4320307333816165, + "grad_norm": 0.1920466423034668, + "learning_rate": 3.156640976785592e-05, + "loss": 0.1575, + "step": 19310 + }, + { + "epoch": 2.432660515791794, + "grad_norm": 0.2178039401769638, + "learning_rate": 3.149898451391565e-05, + "loss": 0.1533, + "step": 19315 + }, + { + "epoch": 2.433290298201971, + "grad_norm": 0.21117891371250153, + "learning_rate": 3.143162289772757e-05, + "loss": 0.1529, + "step": 19320 + }, + { + "epoch": 2.4339200806121486, + "grad_norm": 0.21997326612472534, + "learning_rate": 3.1364324955466405e-05, + "loss": 0.167, + "step": 19325 + }, + { + "epoch": 2.4345498630223257, + "grad_norm": 0.2015310823917389, + "learning_rate": 3.129709072327264e-05, + "loss": 0.1608, + "step": 19330 + }, + { + "epoch": 2.435179645432503, + "grad_norm": 0.21516267955303192, + "learning_rate": 3.122992023725263e-05, + "loss": 0.159, + "step": 19335 + }, + { + "epoch": 2.4358094278426803, + "grad_norm": 0.22670945525169373, + "learning_rate": 3.116281353347841e-05, + "loss": 0.1703, + "step": 19340 + }, + { + "epoch": 2.436439210252858, + "grad_norm": 0.19361603260040283, + "learning_rate": 3.109577064798793e-05, + "loss": 0.1647, + "step": 19345 + }, + { + "epoch": 2.437068992663035, + "grad_norm": 0.18795832991600037, + "learning_rate": 3.1028791616784624e-05, + "loss": 0.1532, + "step": 19350 + }, + { + "epoch": 2.437698775073212, + "grad_norm": 0.24311493337154388, + "learning_rate": 3.0961876475837814e-05, + "loss": 0.1599, + "step": 19355 + }, + { + "epoch": 2.4383285574833895, + "grad_norm": 0.20328237116336823, + "learning_rate": 3.089502526108242e-05, + "loss": 0.1604, + "step": 19360 + }, + { + "epoch": 2.4389583398935666, + "grad_norm": 0.2067318707704544, + "learning_rate": 3.082823800841914e-05, + "loss": 0.161, + "step": 19365 + }, + { + "epoch": 2.439588122303744, + "grad_norm": 0.22590148448944092, + "learning_rate": 3.0761514753714235e-05, + "loss": 0.1711, + "step": 19370 + }, + { + "epoch": 2.440217904713921, + "grad_norm": 0.2264234572649002, + "learning_rate": 3.069485553279958e-05, + "loss": 0.1625, + "step": 19375 + }, + { + "epoch": 2.4408476871240987, + "grad_norm": 0.21357667446136475, + "learning_rate": 3.062826038147274e-05, + "loss": 0.162, + "step": 19380 + }, + { + "epoch": 2.441477469534276, + "grad_norm": 0.19787681102752686, + "learning_rate": 3.0561729335496816e-05, + "loss": 0.1566, + "step": 19385 + }, + { + "epoch": 2.4421072519444533, + "grad_norm": 0.20055502653121948, + "learning_rate": 3.0495262430600487e-05, + "loss": 0.1612, + "step": 19390 + }, + { + "epoch": 2.4427370343546304, + "grad_norm": 0.2178819328546524, + "learning_rate": 3.0428859702478003e-05, + "loss": 0.1701, + "step": 19395 + }, + { + "epoch": 2.443366816764808, + "grad_norm": 0.2206692099571228, + "learning_rate": 3.0362521186789125e-05, + "loss": 0.1668, + "step": 19400 + }, + { + "epoch": 2.443996599174985, + "grad_norm": 0.22452767193317413, + "learning_rate": 3.0296246919159218e-05, + "loss": 0.1713, + "step": 19405 + }, + { + "epoch": 2.444626381585162, + "grad_norm": 0.23711274564266205, + "learning_rate": 3.023003693517908e-05, + "loss": 0.1637, + "step": 19410 + }, + { + "epoch": 2.4452561639953396, + "grad_norm": 0.2252456545829773, + "learning_rate": 3.0163891270404904e-05, + "loss": 0.1685, + "step": 19415 + }, + { + "epoch": 2.4458859464055167, + "grad_norm": 0.2477557361125946, + "learning_rate": 3.0097809960358427e-05, + "loss": 0.1669, + "step": 19420 + }, + { + "epoch": 2.4465157288156942, + "grad_norm": 0.21543872356414795, + "learning_rate": 3.003179304052689e-05, + "loss": 0.1624, + "step": 19425 + }, + { + "epoch": 2.4471455112258713, + "grad_norm": 0.18810850381851196, + "learning_rate": 2.9965840546362858e-05, + "loss": 0.1531, + "step": 19430 + }, + { + "epoch": 2.447775293636049, + "grad_norm": 0.2468540370464325, + "learning_rate": 2.9899952513284307e-05, + "loss": 0.1644, + "step": 19435 + }, + { + "epoch": 2.448405076046226, + "grad_norm": 0.2639712393283844, + "learning_rate": 2.9834128976674643e-05, + "loss": 0.166, + "step": 19440 + }, + { + "epoch": 2.4490348584564035, + "grad_norm": 0.18254607915878296, + "learning_rate": 2.9768369971882598e-05, + "loss": 0.1478, + "step": 19445 + }, + { + "epoch": 2.4496646408665805, + "grad_norm": 0.2060953974723816, + "learning_rate": 2.9702675534222265e-05, + "loss": 0.161, + "step": 19450 + }, + { + "epoch": 2.450294423276758, + "grad_norm": 0.20919503271579742, + "learning_rate": 2.963704569897305e-05, + "loss": 0.1635, + "step": 19455 + }, + { + "epoch": 2.450924205686935, + "grad_norm": 0.20381583273410797, + "learning_rate": 2.957148050137963e-05, + "loss": 0.1677, + "step": 19460 + }, + { + "epoch": 2.4515539880971122, + "grad_norm": 0.23379412293434143, + "learning_rate": 2.9505979976652106e-05, + "loss": 0.1669, + "step": 19465 + }, + { + "epoch": 2.4521837705072898, + "grad_norm": 0.21713408827781677, + "learning_rate": 2.9440544159965707e-05, + "loss": 0.1639, + "step": 19470 + }, + { + "epoch": 2.452813552917467, + "grad_norm": 0.2360960692167282, + "learning_rate": 2.9375173086460975e-05, + "loss": 0.1682, + "step": 19475 + }, + { + "epoch": 2.4534433353276444, + "grad_norm": 0.21496212482452393, + "learning_rate": 2.9309866791243643e-05, + "loss": 0.1508, + "step": 19480 + }, + { + "epoch": 2.4540731177378214, + "grad_norm": 0.19526614248752594, + "learning_rate": 2.9244625309384706e-05, + "loss": 0.1607, + "step": 19485 + }, + { + "epoch": 2.454702900147999, + "grad_norm": 0.2625288665294647, + "learning_rate": 2.917944867592031e-05, + "loss": 0.1708, + "step": 19490 + }, + { + "epoch": 2.455332682558176, + "grad_norm": 0.19196555018424988, + "learning_rate": 2.9114336925851818e-05, + "loss": 0.1715, + "step": 19495 + }, + { + "epoch": 2.4559624649683536, + "grad_norm": 0.21597431600093842, + "learning_rate": 2.9049290094145726e-05, + "loss": 0.1508, + "step": 19500 + }, + { + "epoch": 2.4565922473785307, + "grad_norm": 0.2433023750782013, + "learning_rate": 2.8984308215733615e-05, + "loss": 0.1568, + "step": 19505 + }, + { + "epoch": 2.457222029788708, + "grad_norm": 0.231834277510643, + "learning_rate": 2.8919391325512314e-05, + "loss": 0.1552, + "step": 19510 + }, + { + "epoch": 2.4578518121988853, + "grad_norm": 0.21281488239765167, + "learning_rate": 2.885453945834369e-05, + "loss": 0.161, + "step": 19515 + }, + { + "epoch": 2.4584815946090623, + "grad_norm": 0.21355679631233215, + "learning_rate": 2.878975264905455e-05, + "loss": 0.1515, + "step": 19520 + }, + { + "epoch": 2.45911137701924, + "grad_norm": 0.20718532800674438, + "learning_rate": 2.8725030932437025e-05, + "loss": 0.1622, + "step": 19525 + }, + { + "epoch": 2.459741159429417, + "grad_norm": 0.21609242260456085, + "learning_rate": 2.8660374343248087e-05, + "loss": 0.1531, + "step": 19530 + }, + { + "epoch": 2.4603709418395945, + "grad_norm": 0.2453998625278473, + "learning_rate": 2.8595782916209825e-05, + "loss": 0.1605, + "step": 19535 + }, + { + "epoch": 2.4610007242497716, + "grad_norm": 0.27632614970207214, + "learning_rate": 2.8531256686009306e-05, + "loss": 0.1598, + "step": 19540 + }, + { + "epoch": 2.461630506659949, + "grad_norm": 0.19357621669769287, + "learning_rate": 2.846679568729855e-05, + "loss": 0.1527, + "step": 19545 + }, + { + "epoch": 2.462260289070126, + "grad_norm": 0.19920161366462708, + "learning_rate": 2.8402399954694692e-05, + "loss": 0.1561, + "step": 19550 + }, + { + "epoch": 2.4628900714803037, + "grad_norm": 0.19081860780715942, + "learning_rate": 2.8338069522779595e-05, + "loss": 0.1524, + "step": 19555 + }, + { + "epoch": 2.463519853890481, + "grad_norm": 0.22451332211494446, + "learning_rate": 2.8273804426100234e-05, + "loss": 0.1628, + "step": 19560 + }, + { + "epoch": 2.4641496363006583, + "grad_norm": 0.19204290211200714, + "learning_rate": 2.820960469916837e-05, + "loss": 0.1499, + "step": 19565 + }, + { + "epoch": 2.4647794187108354, + "grad_norm": 0.20258976519107819, + "learning_rate": 2.814547037646081e-05, + "loss": 0.1514, + "step": 19570 + }, + { + "epoch": 2.4654092011210125, + "grad_norm": 0.21591047942638397, + "learning_rate": 2.8081401492419102e-05, + "loss": 0.1555, + "step": 19575 + }, + { + "epoch": 2.46603898353119, + "grad_norm": 0.20639857649803162, + "learning_rate": 2.8017398081449728e-05, + "loss": 0.1597, + "step": 19580 + }, + { + "epoch": 2.466668765941367, + "grad_norm": 0.18190859258174896, + "learning_rate": 2.7953460177923953e-05, + "loss": 0.1676, + "step": 19585 + }, + { + "epoch": 2.4672985483515446, + "grad_norm": 0.23196272552013397, + "learning_rate": 2.7889587816177884e-05, + "loss": 0.1644, + "step": 19590 + }, + { + "epoch": 2.4679283307617217, + "grad_norm": 0.23499402403831482, + "learning_rate": 2.782578103051248e-05, + "loss": 0.1596, + "step": 19595 + }, + { + "epoch": 2.468558113171899, + "grad_norm": 0.19516189396381378, + "learning_rate": 2.7762039855193398e-05, + "loss": 0.1592, + "step": 19600 + }, + { + "epoch": 2.4691878955820763, + "grad_norm": 0.25550252199172974, + "learning_rate": 2.769836432445109e-05, + "loss": 0.1652, + "step": 19605 + }, + { + "epoch": 2.469817677992254, + "grad_norm": 0.20900960266590118, + "learning_rate": 2.7634754472480852e-05, + "loss": 0.1576, + "step": 19610 + }, + { + "epoch": 2.470447460402431, + "grad_norm": 0.19483284652233124, + "learning_rate": 2.757121033344258e-05, + "loss": 0.1671, + "step": 19615 + }, + { + "epoch": 2.4710772428126084, + "grad_norm": 0.21054719388484955, + "learning_rate": 2.7507731941460952e-05, + "loss": 0.1572, + "step": 19620 + }, + { + "epoch": 2.4717070252227855, + "grad_norm": 0.23577210307121277, + "learning_rate": 2.7444319330625243e-05, + "loss": 0.1657, + "step": 19625 + }, + { + "epoch": 2.4723368076329626, + "grad_norm": 0.21099181473255157, + "learning_rate": 2.7380972534989538e-05, + "loss": 0.1696, + "step": 19630 + }, + { + "epoch": 2.47296659004314, + "grad_norm": 0.20165832340717316, + "learning_rate": 2.7317691588572495e-05, + "loss": 0.1529, + "step": 19635 + }, + { + "epoch": 2.473596372453317, + "grad_norm": 0.19725088775157928, + "learning_rate": 2.7254476525357443e-05, + "loss": 0.1503, + "step": 19640 + }, + { + "epoch": 2.4742261548634947, + "grad_norm": 0.23867055773735046, + "learning_rate": 2.7191327379292283e-05, + "loss": 0.1766, + "step": 19645 + }, + { + "epoch": 2.474855937273672, + "grad_norm": 0.21567271649837494, + "learning_rate": 2.712824418428955e-05, + "loss": 0.1562, + "step": 19650 + }, + { + "epoch": 2.4754857196838493, + "grad_norm": 0.2127571702003479, + "learning_rate": 2.7065226974226444e-05, + "loss": 0.1588, + "step": 19655 + }, + { + "epoch": 2.4761155020940264, + "grad_norm": 0.192424476146698, + "learning_rate": 2.700227578294455e-05, + "loss": 0.1632, + "step": 19660 + }, + { + "epoch": 2.476745284504204, + "grad_norm": 0.19549550116062164, + "learning_rate": 2.693939064425007e-05, + "loss": 0.1666, + "step": 19665 + }, + { + "epoch": 2.477375066914381, + "grad_norm": 0.21715867519378662, + "learning_rate": 2.6876571591913874e-05, + "loss": 0.1637, + "step": 19670 + }, + { + "epoch": 2.4780048493245586, + "grad_norm": 0.246476948261261, + "learning_rate": 2.6813818659671167e-05, + "loss": 0.1691, + "step": 19675 + }, + { + "epoch": 2.4786346317347356, + "grad_norm": 0.19329246878623962, + "learning_rate": 2.6751131881221698e-05, + "loss": 0.1576, + "step": 19680 + }, + { + "epoch": 2.4792644141449127, + "grad_norm": 0.1897173672914505, + "learning_rate": 2.6688511290229714e-05, + "loss": 0.1566, + "step": 19685 + }, + { + "epoch": 2.4798941965550902, + "grad_norm": 0.19795387983322144, + "learning_rate": 2.662595692032391e-05, + "loss": 0.159, + "step": 19690 + }, + { + "epoch": 2.4805239789652673, + "grad_norm": 0.19520628452301025, + "learning_rate": 2.65634688050974e-05, + "loss": 0.1577, + "step": 19695 + }, + { + "epoch": 2.481153761375445, + "grad_norm": 0.21223746240139008, + "learning_rate": 2.650104697810772e-05, + "loss": 0.1674, + "step": 19700 + }, + { + "epoch": 2.481783543785622, + "grad_norm": 0.19204822182655334, + "learning_rate": 2.6438691472876828e-05, + "loss": 0.1492, + "step": 19705 + }, + { + "epoch": 2.4824133261957995, + "grad_norm": 0.2568466067314148, + "learning_rate": 2.6376402322891032e-05, + "loss": 0.1557, + "step": 19710 + }, + { + "epoch": 2.4830431086059765, + "grad_norm": 0.21695761382579803, + "learning_rate": 2.6314179561601078e-05, + "loss": 0.1715, + "step": 19715 + }, + { + "epoch": 2.483672891016154, + "grad_norm": 0.21485815942287445, + "learning_rate": 2.625202322242197e-05, + "loss": 0.1599, + "step": 19720 + }, + { + "epoch": 2.484302673426331, + "grad_norm": 0.18373069167137146, + "learning_rate": 2.6189933338733122e-05, + "loss": 0.1636, + "step": 19725 + }, + { + "epoch": 2.4849324558365087, + "grad_norm": 0.2190975546836853, + "learning_rate": 2.6127909943878177e-05, + "loss": 0.1613, + "step": 19730 + }, + { + "epoch": 2.4855622382466858, + "grad_norm": 0.22146424651145935, + "learning_rate": 2.606595307116513e-05, + "loss": 0.1554, + "step": 19735 + }, + { + "epoch": 2.486192020656863, + "grad_norm": 0.22576889395713806, + "learning_rate": 2.6004062753866228e-05, + "loss": 0.1723, + "step": 19740 + }, + { + "epoch": 2.4868218030670404, + "grad_norm": 0.22661438584327698, + "learning_rate": 2.5942239025218004e-05, + "loss": 0.1616, + "step": 19745 + }, + { + "epoch": 2.4874515854772175, + "grad_norm": 0.20992781221866608, + "learning_rate": 2.588048191842118e-05, + "loss": 0.1666, + "step": 19750 + }, + { + "epoch": 2.488081367887395, + "grad_norm": 0.18685118854045868, + "learning_rate": 2.581879146664078e-05, + "loss": 0.163, + "step": 19755 + }, + { + "epoch": 2.488711150297572, + "grad_norm": 0.2547582983970642, + "learning_rate": 2.5757167703005987e-05, + "loss": 0.1683, + "step": 19760 + }, + { + "epoch": 2.4893409327077496, + "grad_norm": 0.18066510558128357, + "learning_rate": 2.569561066061013e-05, + "loss": 0.1581, + "step": 19765 + }, + { + "epoch": 2.4899707151179267, + "grad_norm": 0.22709952294826508, + "learning_rate": 2.5634120372510708e-05, + "loss": 0.1655, + "step": 19770 + }, + { + "epoch": 2.490600497528104, + "grad_norm": 0.18300481140613556, + "learning_rate": 2.5572696871729496e-05, + "loss": 0.1634, + "step": 19775 + }, + { + "epoch": 2.4912302799382813, + "grad_norm": 0.23889437317848206, + "learning_rate": 2.5511340191252294e-05, + "loss": 0.1653, + "step": 19780 + }, + { + "epoch": 2.491860062348459, + "grad_norm": 0.18972428143024445, + "learning_rate": 2.545005036402904e-05, + "loss": 0.1522, + "step": 19785 + }, + { + "epoch": 2.492489844758636, + "grad_norm": 0.1869877278804779, + "learning_rate": 2.5388827422973722e-05, + "loss": 0.1587, + "step": 19790 + }, + { + "epoch": 2.493119627168813, + "grad_norm": 0.200529083609581, + "learning_rate": 2.5327671400964562e-05, + "loss": 0.1621, + "step": 19795 + }, + { + "epoch": 2.4937494095789905, + "grad_norm": 0.20414294302463531, + "learning_rate": 2.526658233084365e-05, + "loss": 0.1619, + "step": 19800 + }, + { + "epoch": 2.4943791919891676, + "grad_norm": 0.2506503760814667, + "learning_rate": 2.5205560245417227e-05, + "loss": 0.1711, + "step": 19805 + }, + { + "epoch": 2.495008974399345, + "grad_norm": 0.2258518785238266, + "learning_rate": 2.5144605177455534e-05, + "loss": 0.1718, + "step": 19810 + }, + { + "epoch": 2.495638756809522, + "grad_norm": 0.22719348967075348, + "learning_rate": 2.5083717159692902e-05, + "loss": 0.1611, + "step": 19815 + }, + { + "epoch": 2.4962685392196997, + "grad_norm": 0.18670164048671722, + "learning_rate": 2.502289622482752e-05, + "loss": 0.155, + "step": 19820 + }, + { + "epoch": 2.496898321629877, + "grad_norm": 0.19051162898540497, + "learning_rate": 2.4962142405521666e-05, + "loss": 0.1528, + "step": 19825 + }, + { + "epoch": 2.4975281040400543, + "grad_norm": 0.2364228218793869, + "learning_rate": 2.4901455734401508e-05, + "loss": 0.1642, + "step": 19830 + }, + { + "epoch": 2.4981578864502314, + "grad_norm": 0.1748083382844925, + "learning_rate": 2.484083624405716e-05, + "loss": 0.1536, + "step": 19835 + }, + { + "epoch": 2.498787668860409, + "grad_norm": 0.21124523878097534, + "learning_rate": 2.4780283967042697e-05, + "loss": 0.1641, + "step": 19840 + }, + { + "epoch": 2.499417451270586, + "grad_norm": 0.21559958159923553, + "learning_rate": 2.4719798935876073e-05, + "loss": 0.1522, + "step": 19845 + }, + { + "epoch": 2.500047233680763, + "grad_norm": 0.20545977354049683, + "learning_rate": 2.4659381183039105e-05, + "loss": 0.1492, + "step": 19850 + }, + { + "epoch": 2.5006770160909406, + "grad_norm": 0.21759046614170074, + "learning_rate": 2.459903074097749e-05, + "loss": 0.1637, + "step": 19855 + }, + { + "epoch": 2.5013067985011177, + "grad_norm": 0.2807125151157379, + "learning_rate": 2.4538747642100927e-05, + "loss": 0.1701, + "step": 19860 + }, + { + "epoch": 2.5019365809112952, + "grad_norm": 0.1915740966796875, + "learning_rate": 2.4478531918782656e-05, + "loss": 0.1551, + "step": 19865 + }, + { + "epoch": 2.5025663633214723, + "grad_norm": 0.1929636150598526, + "learning_rate": 2.441838360335992e-05, + "loss": 0.1561, + "step": 19870 + }, + { + "epoch": 2.50319614573165, + "grad_norm": 0.23392513394355774, + "learning_rate": 2.4358302728133827e-05, + "loss": 0.1606, + "step": 19875 + }, + { + "epoch": 2.503825928141827, + "grad_norm": 0.21680179238319397, + "learning_rate": 2.4298289325369137e-05, + "loss": 0.166, + "step": 19880 + }, + { + "epoch": 2.5044557105520044, + "grad_norm": 0.20863774418830872, + "learning_rate": 2.42383434272944e-05, + "loss": 0.153, + "step": 19885 + }, + { + "epoch": 2.5050854929621815, + "grad_norm": 0.2562030851840973, + "learning_rate": 2.4178465066101933e-05, + "loss": 0.1591, + "step": 19890 + }, + { + "epoch": 2.505715275372359, + "grad_norm": 0.22802165150642395, + "learning_rate": 2.4118654273947796e-05, + "loss": 0.1664, + "step": 19895 + }, + { + "epoch": 2.506345057782536, + "grad_norm": 0.23240098357200623, + "learning_rate": 2.4058911082951764e-05, + "loss": 0.1585, + "step": 19900 + }, + { + "epoch": 2.506974840192713, + "grad_norm": 0.21342670917510986, + "learning_rate": 2.3999235525197275e-05, + "loss": 0.1471, + "step": 19905 + }, + { + "epoch": 2.5076046226028907, + "grad_norm": 0.20659485459327698, + "learning_rate": 2.3939627632731458e-05, + "loss": 0.1593, + "step": 19910 + }, + { + "epoch": 2.508234405013068, + "grad_norm": 0.21293510496616364, + "learning_rate": 2.3880087437565104e-05, + "loss": 0.1575, + "step": 19915 + }, + { + "epoch": 2.5088641874232454, + "grad_norm": 0.2346251904964447, + "learning_rate": 2.382061497167271e-05, + "loss": 0.1639, + "step": 19920 + }, + { + "epoch": 2.5094939698334224, + "grad_norm": 0.22029395401477814, + "learning_rate": 2.376121026699232e-05, + "loss": 0.1537, + "step": 19925 + }, + { + "epoch": 2.5101237522436, + "grad_norm": 0.1979423314332962, + "learning_rate": 2.3701873355425606e-05, + "loss": 0.154, + "step": 19930 + }, + { + "epoch": 2.510753534653777, + "grad_norm": 0.1969837099313736, + "learning_rate": 2.3642604268837873e-05, + "loss": 0.1623, + "step": 19935 + }, + { + "epoch": 2.5113833170639546, + "grad_norm": 0.23190250992774963, + "learning_rate": 2.3583403039057946e-05, + "loss": 0.1673, + "step": 19940 + }, + { + "epoch": 2.5120130994741316, + "grad_norm": 0.20579595863819122, + "learning_rate": 2.3524269697878244e-05, + "loss": 0.1638, + "step": 19945 + }, + { + "epoch": 2.512642881884309, + "grad_norm": 0.2181597501039505, + "learning_rate": 2.3465204277054734e-05, + "loss": 0.1535, + "step": 19950 + }, + { + "epoch": 2.5132726642944863, + "grad_norm": 0.27504584193229675, + "learning_rate": 2.3406206808306854e-05, + "loss": 0.1687, + "step": 19955 + }, + { + "epoch": 2.5139024467046633, + "grad_norm": 0.21288301050662994, + "learning_rate": 2.334727732331765e-05, + "loss": 0.1611, + "step": 19960 + }, + { + "epoch": 2.514532229114841, + "grad_norm": 0.20768193900585175, + "learning_rate": 2.3288415853733615e-05, + "loss": 0.1595, + "step": 19965 + }, + { + "epoch": 2.515162011525018, + "grad_norm": 0.1934243142604828, + "learning_rate": 2.322962243116464e-05, + "loss": 0.1573, + "step": 19970 + }, + { + "epoch": 2.5157917939351955, + "grad_norm": 0.21198545396327972, + "learning_rate": 2.3170897087184133e-05, + "loss": 0.1549, + "step": 19975 + }, + { + "epoch": 2.5164215763453726, + "grad_norm": 0.25898632407188416, + "learning_rate": 2.3112239853328996e-05, + "loss": 0.171, + "step": 19980 + }, + { + "epoch": 2.51705135875555, + "grad_norm": 0.2114986777305603, + "learning_rate": 2.3053650761099485e-05, + "loss": 0.1544, + "step": 19985 + }, + { + "epoch": 2.517681141165727, + "grad_norm": 0.21560825407505035, + "learning_rate": 2.2995129841959266e-05, + "loss": 0.1736, + "step": 19990 + }, + { + "epoch": 2.5183109235759042, + "grad_norm": 0.18198496103286743, + "learning_rate": 2.2936677127335395e-05, + "loss": 0.154, + "step": 19995 + }, + { + "epoch": 2.5189407059860818, + "grad_norm": 0.20246680080890656, + "learning_rate": 2.287829264861842e-05, + "loss": 0.1598, + "step": 20000 + }, + { + "epoch": 2.5189407059860818, + "eval_loss": 0.3499235212802887, + "eval_runtime": 6.1623, + "eval_samples_per_second": 162.278, + "eval_steps_per_second": 10.224, + "step": 20000 + }, + { + "epoch": 2.5195704883962593, + "grad_norm": 0.2162911742925644, + "learning_rate": 2.2819976437162e-05, + "loss": 0.1623, + "step": 20005 + }, + { + "epoch": 2.5202002708064364, + "grad_norm": 0.16897226870059967, + "learning_rate": 2.2761728524283344e-05, + "loss": 0.1511, + "step": 20010 + }, + { + "epoch": 2.5208300532166135, + "grad_norm": 0.19399495422840118, + "learning_rate": 2.2703548941262877e-05, + "loss": 0.1615, + "step": 20015 + }, + { + "epoch": 2.521459835626791, + "grad_norm": 0.24832330644130707, + "learning_rate": 2.2645437719344424e-05, + "loss": 0.1596, + "step": 20020 + }, + { + "epoch": 2.522089618036968, + "grad_norm": 0.1990746706724167, + "learning_rate": 2.2587394889734982e-05, + "loss": 0.1517, + "step": 20025 + }, + { + "epoch": 2.5227194004471456, + "grad_norm": 0.2356463521718979, + "learning_rate": 2.252942048360491e-05, + "loss": 0.1666, + "step": 20030 + }, + { + "epoch": 2.5233491828573227, + "grad_norm": 0.2032928168773651, + "learning_rate": 2.2471514532087766e-05, + "loss": 0.1463, + "step": 20035 + }, + { + "epoch": 2.5239789652675, + "grad_norm": 0.24515411257743835, + "learning_rate": 2.2413677066280388e-05, + "loss": 0.1629, + "step": 20040 + }, + { + "epoch": 2.5246087476776773, + "grad_norm": 0.24597153067588806, + "learning_rate": 2.2355908117242803e-05, + "loss": 0.1507, + "step": 20045 + }, + { + "epoch": 2.5252385300878544, + "grad_norm": 0.1958838254213333, + "learning_rate": 2.2298207715998246e-05, + "loss": 0.167, + "step": 20050 + }, + { + "epoch": 2.525868312498032, + "grad_norm": 0.18343359231948853, + "learning_rate": 2.2240575893533176e-05, + "loss": 0.1582, + "step": 20055 + }, + { + "epoch": 2.5264980949082094, + "grad_norm": 0.2554282248020172, + "learning_rate": 2.218301268079715e-05, + "loss": 0.1701, + "step": 20060 + }, + { + "epoch": 2.5271278773183865, + "grad_norm": 0.2655259072780609, + "learning_rate": 2.2125518108703e-05, + "loss": 0.1666, + "step": 20065 + }, + { + "epoch": 2.5277576597285636, + "grad_norm": 0.23147699236869812, + "learning_rate": 2.206809220812662e-05, + "loss": 0.1646, + "step": 20070 + }, + { + "epoch": 2.528387442138741, + "grad_norm": 0.19453732669353485, + "learning_rate": 2.2010735009906926e-05, + "loss": 0.1595, + "step": 20075 + }, + { + "epoch": 2.529017224548918, + "grad_norm": 0.21716727316379547, + "learning_rate": 2.195344654484615e-05, + "loss": 0.1656, + "step": 20080 + }, + { + "epoch": 2.5296470069590957, + "grad_norm": 0.19851936399936676, + "learning_rate": 2.1896226843709475e-05, + "loss": 0.1545, + "step": 20085 + }, + { + "epoch": 2.530276789369273, + "grad_norm": 0.20362606644630432, + "learning_rate": 2.1839075937225192e-05, + "loss": 0.1534, + "step": 20090 + }, + { + "epoch": 2.5309065717794503, + "grad_norm": 0.23197387158870697, + "learning_rate": 2.1781993856084633e-05, + "loss": 0.1624, + "step": 20095 + }, + { + "epoch": 2.5315363541896274, + "grad_norm": 0.2547961473464966, + "learning_rate": 2.1724980630942145e-05, + "loss": 0.1539, + "step": 20100 + }, + { + "epoch": 2.5321661365998045, + "grad_norm": 0.1867532879114151, + "learning_rate": 2.1668036292415237e-05, + "loss": 0.1518, + "step": 20105 + }, + { + "epoch": 2.532795919009982, + "grad_norm": 0.19402964413166046, + "learning_rate": 2.161116087108421e-05, + "loss": 0.1522, + "step": 20110 + }, + { + "epoch": 2.5334257014201595, + "grad_norm": 0.20450226962566376, + "learning_rate": 2.1554354397492517e-05, + "loss": 0.155, + "step": 20115 + }, + { + "epoch": 2.5340554838303366, + "grad_norm": 0.22179925441741943, + "learning_rate": 2.149761690214649e-05, + "loss": 0.1557, + "step": 20120 + }, + { + "epoch": 2.5346852662405137, + "grad_norm": 0.2105506807565689, + "learning_rate": 2.1440948415515524e-05, + "loss": 0.1668, + "step": 20125 + }, + { + "epoch": 2.5353150486506912, + "grad_norm": 0.24963414669036865, + "learning_rate": 2.1384348968031857e-05, + "loss": 0.1597, + "step": 20130 + }, + { + "epoch": 2.5359448310608683, + "grad_norm": 0.23433445394039154, + "learning_rate": 2.132781859009069e-05, + "loss": 0.1579, + "step": 20135 + }, + { + "epoch": 2.536574613471046, + "grad_norm": 0.19620360434055328, + "learning_rate": 2.1271357312050126e-05, + "loss": 0.1492, + "step": 20140 + }, + { + "epoch": 2.537204395881223, + "grad_norm": 0.23040203750133514, + "learning_rate": 2.1214965164231157e-05, + "loss": 0.1585, + "step": 20145 + }, + { + "epoch": 2.5378341782914005, + "grad_norm": 0.23273873329162598, + "learning_rate": 2.1158642176917647e-05, + "loss": 0.1589, + "step": 20150 + }, + { + "epoch": 2.5384639607015775, + "grad_norm": 0.2472730576992035, + "learning_rate": 2.1102388380356344e-05, + "loss": 0.1677, + "step": 20155 + }, + { + "epoch": 2.5390937431117546, + "grad_norm": 0.19982990622520447, + "learning_rate": 2.104620380475679e-05, + "loss": 0.1515, + "step": 20160 + }, + { + "epoch": 2.539723525521932, + "grad_norm": 0.21257297694683075, + "learning_rate": 2.099008848029143e-05, + "loss": 0.165, + "step": 20165 + }, + { + "epoch": 2.5403533079321097, + "grad_norm": 0.20112313330173492, + "learning_rate": 2.0934042437095457e-05, + "loss": 0.1497, + "step": 20170 + }, + { + "epoch": 2.5409830903422868, + "grad_norm": 0.24434730410575867, + "learning_rate": 2.087806570526691e-05, + "loss": 0.1583, + "step": 20175 + }, + { + "epoch": 2.541612872752464, + "grad_norm": 0.20866596698760986, + "learning_rate": 2.0822158314866467e-05, + "loss": 0.1584, + "step": 20180 + }, + { + "epoch": 2.5422426551626414, + "grad_norm": 0.1903751641511917, + "learning_rate": 2.076632029591777e-05, + "loss": 0.1447, + "step": 20185 + }, + { + "epoch": 2.5428724375728184, + "grad_norm": 0.24377766251564026, + "learning_rate": 2.071055167840709e-05, + "loss": 0.1636, + "step": 20190 + }, + { + "epoch": 2.543502219982996, + "grad_norm": 0.25960245728492737, + "learning_rate": 2.0654852492283446e-05, + "loss": 0.164, + "step": 20195 + }, + { + "epoch": 2.544132002393173, + "grad_norm": 0.23870185017585754, + "learning_rate": 2.0599222767458533e-05, + "loss": 0.1579, + "step": 20200 + }, + { + "epoch": 2.5447617848033506, + "grad_norm": 0.2245192676782608, + "learning_rate": 2.0543662533806855e-05, + "loss": 0.1655, + "step": 20205 + }, + { + "epoch": 2.5453915672135277, + "grad_norm": 0.23136839270591736, + "learning_rate": 2.048817182116554e-05, + "loss": 0.1591, + "step": 20210 + }, + { + "epoch": 2.5460213496237047, + "grad_norm": 0.21092520654201508, + "learning_rate": 2.043275065933427e-05, + "loss": 0.1536, + "step": 20215 + }, + { + "epoch": 2.5466511320338823, + "grad_norm": 0.18601630628108978, + "learning_rate": 2.0377399078075485e-05, + "loss": 0.1523, + "step": 20220 + }, + { + "epoch": 2.54728091444406, + "grad_norm": 0.21489211916923523, + "learning_rate": 2.0322117107114343e-05, + "loss": 0.1554, + "step": 20225 + }, + { + "epoch": 2.547910696854237, + "grad_norm": 0.2098049521446228, + "learning_rate": 2.026690477613845e-05, + "loss": 0.1522, + "step": 20230 + }, + { + "epoch": 2.548540479264414, + "grad_norm": 0.2187887281179428, + "learning_rate": 2.021176211479813e-05, + "loss": 0.15, + "step": 20235 + }, + { + "epoch": 2.5491702616745915, + "grad_norm": 0.2641262710094452, + "learning_rate": 2.0156689152706216e-05, + "loss": 0.1725, + "step": 20240 + }, + { + "epoch": 2.5498000440847686, + "grad_norm": 0.22713615000247955, + "learning_rate": 2.010168591943817e-05, + "loss": 0.1528, + "step": 20245 + }, + { + "epoch": 2.550429826494946, + "grad_norm": 0.20724020898342133, + "learning_rate": 2.0046752444531976e-05, + "loss": 0.1646, + "step": 20250 + }, + { + "epoch": 2.551059608905123, + "grad_norm": 0.19516219198703766, + "learning_rate": 1.9991888757488156e-05, + "loss": 0.1574, + "step": 20255 + }, + { + "epoch": 2.5516893913153007, + "grad_norm": 0.22299246490001678, + "learning_rate": 1.993709488776979e-05, + "loss": 0.1656, + "step": 20260 + }, + { + "epoch": 2.552319173725478, + "grad_norm": 0.1897648572921753, + "learning_rate": 1.9882370864802373e-05, + "loss": 0.1639, + "step": 20265 + }, + { + "epoch": 2.552948956135655, + "grad_norm": 0.23607775568962097, + "learning_rate": 1.9827716717974048e-05, + "loss": 0.1618, + "step": 20270 + }, + { + "epoch": 2.5535787385458324, + "grad_norm": 0.250823438167572, + "learning_rate": 1.9773132476635285e-05, + "loss": 0.1628, + "step": 20275 + }, + { + "epoch": 2.55420852095601, + "grad_norm": 0.2012414038181305, + "learning_rate": 1.9718618170099087e-05, + "loss": 0.1536, + "step": 20280 + }, + { + "epoch": 2.554838303366187, + "grad_norm": 0.17350980639457703, + "learning_rate": 1.9664173827640873e-05, + "loss": 0.1524, + "step": 20285 + }, + { + "epoch": 2.555468085776364, + "grad_norm": 0.18761439621448517, + "learning_rate": 1.96097994784985e-05, + "loss": 0.1452, + "step": 20290 + }, + { + "epoch": 2.5560978681865416, + "grad_norm": 0.2061910331249237, + "learning_rate": 1.955549515187223e-05, + "loss": 0.1507, + "step": 20295 + }, + { + "epoch": 2.5567276505967187, + "grad_norm": 0.20667202770709991, + "learning_rate": 1.9501260876924736e-05, + "loss": 0.1484, + "step": 20300 + }, + { + "epoch": 2.557357433006896, + "grad_norm": 0.19904933869838715, + "learning_rate": 1.9447096682781015e-05, + "loss": 0.1562, + "step": 20305 + }, + { + "epoch": 2.5579872154170733, + "grad_norm": 0.20500166714191437, + "learning_rate": 1.9393002598528555e-05, + "loss": 0.1505, + "step": 20310 + }, + { + "epoch": 2.558616997827251, + "grad_norm": 0.21382258832454681, + "learning_rate": 1.933897865321712e-05, + "loss": 0.1606, + "step": 20315 + }, + { + "epoch": 2.559246780237428, + "grad_norm": 0.22117263078689575, + "learning_rate": 1.928502487585873e-05, + "loss": 0.1626, + "step": 20320 + }, + { + "epoch": 2.559876562647605, + "grad_norm": 0.2301877737045288, + "learning_rate": 1.9231141295427794e-05, + "loss": 0.1566, + "step": 20325 + }, + { + "epoch": 2.5605063450577825, + "grad_norm": 0.23893754184246063, + "learning_rate": 1.917732794086108e-05, + "loss": 0.1571, + "step": 20330 + }, + { + "epoch": 2.56113612746796, + "grad_norm": 0.2627946734428406, + "learning_rate": 1.9123584841057578e-05, + "loss": 0.1658, + "step": 20335 + }, + { + "epoch": 2.561765909878137, + "grad_norm": 0.19281533360481262, + "learning_rate": 1.906991202487854e-05, + "loss": 0.1525, + "step": 20340 + }, + { + "epoch": 2.562395692288314, + "grad_norm": 0.2772383689880371, + "learning_rate": 1.901630952114752e-05, + "loss": 0.1661, + "step": 20345 + }, + { + "epoch": 2.5630254746984917, + "grad_norm": 0.216465026140213, + "learning_rate": 1.896277735865027e-05, + "loss": 0.1538, + "step": 20350 + }, + { + "epoch": 2.563655257108669, + "grad_norm": 0.23878604173660278, + "learning_rate": 1.8909315566134782e-05, + "loss": 0.1601, + "step": 20355 + }, + { + "epoch": 2.5642850395188463, + "grad_norm": 0.2141411155462265, + "learning_rate": 1.8855924172311248e-05, + "loss": 0.1631, + "step": 20360 + }, + { + "epoch": 2.5649148219290234, + "grad_norm": 0.2064596712589264, + "learning_rate": 1.8802603205852073e-05, + "loss": 0.1578, + "step": 20365 + }, + { + "epoch": 2.565544604339201, + "grad_norm": 0.19963258504867554, + "learning_rate": 1.8749352695391867e-05, + "loss": 0.1609, + "step": 20370 + }, + { + "epoch": 2.566174386749378, + "grad_norm": 0.1846475601196289, + "learning_rate": 1.8696172669527336e-05, + "loss": 0.1544, + "step": 20375 + }, + { + "epoch": 2.566804169159555, + "grad_norm": 0.1911933869123459, + "learning_rate": 1.8643063156817423e-05, + "loss": 0.1565, + "step": 20380 + }, + { + "epoch": 2.5674339515697326, + "grad_norm": 0.2263742834329605, + "learning_rate": 1.8590024185783042e-05, + "loss": 0.1673, + "step": 20385 + }, + { + "epoch": 2.56806373397991, + "grad_norm": 0.21621178090572357, + "learning_rate": 1.8537055784907413e-05, + "loss": 0.1587, + "step": 20390 + }, + { + "epoch": 2.5686935163900873, + "grad_norm": 0.20875446498394012, + "learning_rate": 1.848415798263576e-05, + "loss": 0.1564, + "step": 20395 + }, + { + "epoch": 2.5693232988002643, + "grad_norm": 0.20144003629684448, + "learning_rate": 1.8431330807375417e-05, + "loss": 0.156, + "step": 20400 + }, + { + "epoch": 2.569953081210442, + "grad_norm": 0.24883227050304413, + "learning_rate": 1.837857428749575e-05, + "loss": 0.1568, + "step": 20405 + }, + { + "epoch": 2.570582863620619, + "grad_norm": 0.18426820635795593, + "learning_rate": 1.832588845132827e-05, + "loss": 0.1549, + "step": 20410 + }, + { + "epoch": 2.5712126460307965, + "grad_norm": 0.2462303191423416, + "learning_rate": 1.827327332716649e-05, + "loss": 0.1625, + "step": 20415 + }, + { + "epoch": 2.5718424284409735, + "grad_norm": 0.21659249067306519, + "learning_rate": 1.8220728943265837e-05, + "loss": 0.1521, + "step": 20420 + }, + { + "epoch": 2.572472210851151, + "grad_norm": 0.17811377346515656, + "learning_rate": 1.8168255327843882e-05, + "loss": 0.1586, + "step": 20425 + }, + { + "epoch": 2.573101993261328, + "grad_norm": 0.19524861872196198, + "learning_rate": 1.8115852509080197e-05, + "loss": 0.1474, + "step": 20430 + }, + { + "epoch": 2.5737317756715052, + "grad_norm": 0.17528071999549866, + "learning_rate": 1.806352051511627e-05, + "loss": 0.1599, + "step": 20435 + }, + { + "epoch": 2.5743615580816828, + "grad_norm": 0.19344571232795715, + "learning_rate": 1.801125937405557e-05, + "loss": 0.1579, + "step": 20440 + }, + { + "epoch": 2.5749913404918603, + "grad_norm": 0.20909984409809113, + "learning_rate": 1.795906911396353e-05, + "loss": 0.1584, + "step": 20445 + }, + { + "epoch": 2.5756211229020374, + "grad_norm": 0.17548586428165436, + "learning_rate": 1.790694976286752e-05, + "loss": 0.1535, + "step": 20450 + }, + { + "epoch": 2.5762509053122145, + "grad_norm": 0.25540080666542053, + "learning_rate": 1.7854901348756807e-05, + "loss": 0.1637, + "step": 20455 + }, + { + "epoch": 2.576880687722392, + "grad_norm": 0.1803160160779953, + "learning_rate": 1.780292389958257e-05, + "loss": 0.1526, + "step": 20460 + }, + { + "epoch": 2.577510470132569, + "grad_norm": 0.260122686624527, + "learning_rate": 1.775101744325792e-05, + "loss": 0.1704, + "step": 20465 + }, + { + "epoch": 2.5781402525427466, + "grad_norm": 0.19697842001914978, + "learning_rate": 1.7699182007657736e-05, + "loss": 0.1568, + "step": 20470 + }, + { + "epoch": 2.5787700349529237, + "grad_norm": 0.2179180532693863, + "learning_rate": 1.7647417620618936e-05, + "loss": 0.1612, + "step": 20475 + }, + { + "epoch": 2.579399817363101, + "grad_norm": 0.2509031593799591, + "learning_rate": 1.7595724309940117e-05, + "loss": 0.1531, + "step": 20480 + }, + { + "epoch": 2.5800295997732783, + "grad_norm": 0.19090527296066284, + "learning_rate": 1.754410210338179e-05, + "loss": 0.1477, + "step": 20485 + }, + { + "epoch": 2.5806593821834554, + "grad_norm": 0.20662526786327362, + "learning_rate": 1.749255102866623e-05, + "loss": 0.1584, + "step": 20490 + }, + { + "epoch": 2.581289164593633, + "grad_norm": 0.2258034199476242, + "learning_rate": 1.7441071113477572e-05, + "loss": 0.1597, + "step": 20495 + }, + { + "epoch": 2.5819189470038104, + "grad_norm": 0.22661426663398743, + "learning_rate": 1.738966238546169e-05, + "loss": 0.1582, + "step": 20500 + }, + { + "epoch": 2.5825487294139875, + "grad_norm": 0.24459710717201233, + "learning_rate": 1.7338324872226227e-05, + "loss": 0.1523, + "step": 20505 + }, + { + "epoch": 2.5831785118241646, + "grad_norm": 0.18816480040550232, + "learning_rate": 1.728705860134062e-05, + "loss": 0.1554, + "step": 20510 + }, + { + "epoch": 2.583808294234342, + "grad_norm": 0.18354368209838867, + "learning_rate": 1.7235863600336042e-05, + "loss": 0.1413, + "step": 20515 + }, + { + "epoch": 2.584438076644519, + "grad_norm": 0.1984662562608719, + "learning_rate": 1.71847398967054e-05, + "loss": 0.1566, + "step": 20520 + }, + { + "epoch": 2.5850678590546967, + "grad_norm": 0.2628153860569, + "learning_rate": 1.713368751790322e-05, + "loss": 0.1592, + "step": 20525 + }, + { + "epoch": 2.585697641464874, + "grad_norm": 0.18952016532421112, + "learning_rate": 1.7082706491345806e-05, + "loss": 0.1531, + "step": 20530 + }, + { + "epoch": 2.5863274238750513, + "grad_norm": 0.16905049979686737, + "learning_rate": 1.7031796844411198e-05, + "loss": 0.1556, + "step": 20535 + }, + { + "epoch": 2.5869572062852284, + "grad_norm": 0.20969530940055847, + "learning_rate": 1.6980958604438988e-05, + "loss": 0.1585, + "step": 20540 + }, + { + "epoch": 2.5875869886954055, + "grad_norm": 0.2143043577671051, + "learning_rate": 1.693019179873048e-05, + "loss": 0.1636, + "step": 20545 + }, + { + "epoch": 2.588216771105583, + "grad_norm": 0.24208824336528778, + "learning_rate": 1.6879496454548585e-05, + "loss": 0.1572, + "step": 20550 + }, + { + "epoch": 2.5888465535157605, + "grad_norm": 0.22409161925315857, + "learning_rate": 1.6828872599117958e-05, + "loss": 0.162, + "step": 20555 + }, + { + "epoch": 2.5894763359259376, + "grad_norm": 0.20685546100139618, + "learning_rate": 1.6778320259624654e-05, + "loss": 0.1587, + "step": 20560 + }, + { + "epoch": 2.5901061183361147, + "grad_norm": 0.19393740594387054, + "learning_rate": 1.672783946321649e-05, + "loss": 0.1491, + "step": 20565 + }, + { + "epoch": 2.5907359007462922, + "grad_norm": 0.1944616734981537, + "learning_rate": 1.667743023700275e-05, + "loss": 0.1565, + "step": 20570 + }, + { + "epoch": 2.5913656831564693, + "grad_norm": 0.21134309470653534, + "learning_rate": 1.662709260805442e-05, + "loss": 0.1575, + "step": 20575 + }, + { + "epoch": 2.591995465566647, + "grad_norm": 0.20300306379795074, + "learning_rate": 1.657682660340392e-05, + "loss": 0.1509, + "step": 20580 + }, + { + "epoch": 2.592625247976824, + "grad_norm": 0.209407240152359, + "learning_rate": 1.6526632250045237e-05, + "loss": 0.1568, + "step": 20585 + }, + { + "epoch": 2.5932550303870014, + "grad_norm": 0.18960040807724, + "learning_rate": 1.6476509574933888e-05, + "loss": 0.1561, + "step": 20590 + }, + { + "epoch": 2.5938848127971785, + "grad_norm": 0.2009792923927307, + "learning_rate": 1.6426458604986897e-05, + "loss": 0.1584, + "step": 20595 + }, + { + "epoch": 2.5945145952073556, + "grad_norm": 0.2359851896762848, + "learning_rate": 1.6376479367082796e-05, + "loss": 0.1573, + "step": 20600 + }, + { + "epoch": 2.595144377617533, + "grad_norm": 0.2108912616968155, + "learning_rate": 1.632657188806153e-05, + "loss": 0.1537, + "step": 20605 + }, + { + "epoch": 2.5957741600277107, + "grad_norm": 0.22792066633701324, + "learning_rate": 1.6276736194724575e-05, + "loss": 0.1611, + "step": 20610 + }, + { + "epoch": 2.5964039424378877, + "grad_norm": 0.1896820068359375, + "learning_rate": 1.622697231383488e-05, + "loss": 0.1623, + "step": 20615 + }, + { + "epoch": 2.597033724848065, + "grad_norm": 0.19234326481819153, + "learning_rate": 1.6177280272116728e-05, + "loss": 0.1448, + "step": 20620 + }, + { + "epoch": 2.5976635072582424, + "grad_norm": 0.17547307908535004, + "learning_rate": 1.6127660096255955e-05, + "loss": 0.1479, + "step": 20625 + }, + { + "epoch": 2.5982932896684194, + "grad_norm": 0.20076265931129456, + "learning_rate": 1.6078111812899618e-05, + "loss": 0.1504, + "step": 20630 + }, + { + "epoch": 2.598923072078597, + "grad_norm": 0.1888744831085205, + "learning_rate": 1.6028635448656364e-05, + "loss": 0.1587, + "step": 20635 + }, + { + "epoch": 2.599552854488774, + "grad_norm": 0.2309001237154007, + "learning_rate": 1.59792310300961e-05, + "loss": 0.1662, + "step": 20640 + }, + { + "epoch": 2.6001826368989516, + "grad_norm": 0.29581940174102783, + "learning_rate": 1.592989858375013e-05, + "loss": 0.1708, + "step": 20645 + }, + { + "epoch": 2.6008124193091287, + "grad_norm": 0.19039921462535858, + "learning_rate": 1.588063813611112e-05, + "loss": 0.1548, + "step": 20650 + }, + { + "epoch": 2.6014422017193057, + "grad_norm": 0.18266427516937256, + "learning_rate": 1.5831449713632993e-05, + "loss": 0.1523, + "step": 20655 + }, + { + "epoch": 2.6020719841294833, + "grad_norm": 0.1932811439037323, + "learning_rate": 1.5782333342731174e-05, + "loss": 0.1516, + "step": 20660 + }, + { + "epoch": 2.602701766539661, + "grad_norm": 0.17890222370624542, + "learning_rate": 1.5733289049782177e-05, + "loss": 0.1511, + "step": 20665 + }, + { + "epoch": 2.603331548949838, + "grad_norm": 0.20573283731937408, + "learning_rate": 1.5684316861123935e-05, + "loss": 0.1525, + "step": 20670 + }, + { + "epoch": 2.603961331360015, + "grad_norm": 0.21194593608379364, + "learning_rate": 1.5635416803055596e-05, + "loss": 0.1599, + "step": 20675 + }, + { + "epoch": 2.6045911137701925, + "grad_norm": 0.17930278182029724, + "learning_rate": 1.558658890183768e-05, + "loss": 0.1571, + "step": 20680 + }, + { + "epoch": 2.6052208961803696, + "grad_norm": 0.1965799480676651, + "learning_rate": 1.5537833183691857e-05, + "loss": 0.1552, + "step": 20685 + }, + { + "epoch": 2.605850678590547, + "grad_norm": 0.20715682208538055, + "learning_rate": 1.5489149674801054e-05, + "loss": 0.1588, + "step": 20690 + }, + { + "epoch": 2.606480461000724, + "grad_norm": 0.1894584596157074, + "learning_rate": 1.544053840130943e-05, + "loss": 0.1547, + "step": 20695 + }, + { + "epoch": 2.6071102434109017, + "grad_norm": 0.20791690051555634, + "learning_rate": 1.539199938932234e-05, + "loss": 0.1479, + "step": 20700 + }, + { + "epoch": 2.6077400258210788, + "grad_norm": 0.20393605530261993, + "learning_rate": 1.534353266490636e-05, + "loss": 0.1491, + "step": 20705 + }, + { + "epoch": 2.608369808231256, + "grad_norm": 0.20201466977596283, + "learning_rate": 1.5295138254089206e-05, + "loss": 0.1664, + "step": 20710 + }, + { + "epoch": 2.6089995906414334, + "grad_norm": 0.220575213432312, + "learning_rate": 1.5246816182859773e-05, + "loss": 0.16, + "step": 20715 + }, + { + "epoch": 2.609629373051611, + "grad_norm": 0.1888882964849472, + "learning_rate": 1.5198566477168166e-05, + "loss": 0.1592, + "step": 20720 + }, + { + "epoch": 2.610259155461788, + "grad_norm": 0.2035285383462906, + "learning_rate": 1.5150389162925564e-05, + "loss": 0.149, + "step": 20725 + }, + { + "epoch": 2.610888937871965, + "grad_norm": 0.21430674195289612, + "learning_rate": 1.5102284266004282e-05, + "loss": 0.1568, + "step": 20730 + }, + { + "epoch": 2.6115187202821426, + "grad_norm": 0.2220098227262497, + "learning_rate": 1.5054251812237695e-05, + "loss": 0.1601, + "step": 20735 + }, + { + "epoch": 2.6121485026923197, + "grad_norm": 0.18914029002189636, + "learning_rate": 1.5006291827420397e-05, + "loss": 0.1524, + "step": 20740 + }, + { + "epoch": 2.612778285102497, + "grad_norm": 0.19741562008857727, + "learning_rate": 1.4958404337307972e-05, + "loss": 0.1418, + "step": 20745 + }, + { + "epoch": 2.6134080675126743, + "grad_norm": 0.22962430119514465, + "learning_rate": 1.49105893676171e-05, + "loss": 0.1523, + "step": 20750 + }, + { + "epoch": 2.614037849922852, + "grad_norm": 0.17770111560821533, + "learning_rate": 1.4862846944025469e-05, + "loss": 0.1585, + "step": 20755 + }, + { + "epoch": 2.614667632333029, + "grad_norm": 0.22975338995456696, + "learning_rate": 1.481517709217191e-05, + "loss": 0.1608, + "step": 20760 + }, + { + "epoch": 2.615297414743206, + "grad_norm": 0.21070002019405365, + "learning_rate": 1.476757983765624e-05, + "loss": 0.1442, + "step": 20765 + }, + { + "epoch": 2.6159271971533835, + "grad_norm": 0.19414427876472473, + "learning_rate": 1.47200552060392e-05, + "loss": 0.1445, + "step": 20770 + }, + { + "epoch": 2.616556979563561, + "grad_norm": 0.18657416105270386, + "learning_rate": 1.4672603222842605e-05, + "loss": 0.1534, + "step": 20775 + }, + { + "epoch": 2.617186761973738, + "grad_norm": 0.2389591485261917, + "learning_rate": 1.4625223913549323e-05, + "loss": 0.158, + "step": 20780 + }, + { + "epoch": 2.617816544383915, + "grad_norm": 0.19741186499595642, + "learning_rate": 1.4577917303603081e-05, + "loss": 0.1585, + "step": 20785 + }, + { + "epoch": 2.6184463267940927, + "grad_norm": 0.18730677664279938, + "learning_rate": 1.4530683418408612e-05, + "loss": 0.1487, + "step": 20790 + }, + { + "epoch": 2.61907610920427, + "grad_norm": 0.2060120701789856, + "learning_rate": 1.4483522283331606e-05, + "loss": 0.1499, + "step": 20795 + }, + { + "epoch": 2.6197058916144473, + "grad_norm": 0.2186814844608307, + "learning_rate": 1.4436433923698638e-05, + "loss": 0.1562, + "step": 20800 + }, + { + "epoch": 2.6203356740246244, + "grad_norm": 0.21503032743930817, + "learning_rate": 1.4389418364797279e-05, + "loss": 0.1456, + "step": 20805 + }, + { + "epoch": 2.620965456434802, + "grad_norm": 0.17447194457054138, + "learning_rate": 1.4342475631875916e-05, + "loss": 0.1454, + "step": 20810 + }, + { + "epoch": 2.621595238844979, + "grad_norm": 0.18272021412849426, + "learning_rate": 1.4295605750143851e-05, + "loss": 0.149, + "step": 20815 + }, + { + "epoch": 2.622225021255156, + "grad_norm": 0.2014734447002411, + "learning_rate": 1.424880874477135e-05, + "loss": 0.1582, + "step": 20820 + }, + { + "epoch": 2.6228548036653336, + "grad_norm": 0.21231862902641296, + "learning_rate": 1.4202084640889443e-05, + "loss": 0.152, + "step": 20825 + }, + { + "epoch": 2.623484586075511, + "grad_norm": 0.19817417860031128, + "learning_rate": 1.415543346359006e-05, + "loss": 0.1492, + "step": 20830 + }, + { + "epoch": 2.6241143684856882, + "grad_norm": 0.20216423273086548, + "learning_rate": 1.410885523792586e-05, + "loss": 0.1452, + "step": 20835 + }, + { + "epoch": 2.6247441508958653, + "grad_norm": 0.20939548313617706, + "learning_rate": 1.4062349988910515e-05, + "loss": 0.1512, + "step": 20840 + }, + { + "epoch": 2.625373933306043, + "grad_norm": 0.19018815457820892, + "learning_rate": 1.4015917741518384e-05, + "loss": 0.1579, + "step": 20845 + }, + { + "epoch": 2.62600371571622, + "grad_norm": 0.20512887835502625, + "learning_rate": 1.396955852068462e-05, + "loss": 0.1624, + "step": 20850 + }, + { + "epoch": 2.6266334981263975, + "grad_norm": 0.24390068650245667, + "learning_rate": 1.3923272351305193e-05, + "loss": 0.1663, + "step": 20855 + }, + { + "epoch": 2.6272632805365745, + "grad_norm": 0.21338611841201782, + "learning_rate": 1.38770592582368e-05, + "loss": 0.1695, + "step": 20860 + }, + { + "epoch": 2.627893062946752, + "grad_norm": 0.21631261706352234, + "learning_rate": 1.3830919266297025e-05, + "loss": 0.1598, + "step": 20865 + }, + { + "epoch": 2.628522845356929, + "grad_norm": 0.21297498047351837, + "learning_rate": 1.3784852400264013e-05, + "loss": 0.1554, + "step": 20870 + }, + { + "epoch": 2.6291526277671062, + "grad_norm": 0.22182469069957733, + "learning_rate": 1.3738858684876724e-05, + "loss": 0.1618, + "step": 20875 + }, + { + "epoch": 2.6297824101772838, + "grad_norm": 0.2602301239967346, + "learning_rate": 1.369293814483487e-05, + "loss": 0.1616, + "step": 20880 + }, + { + "epoch": 2.6304121925874613, + "grad_norm": 0.20623180270195007, + "learning_rate": 1.3647090804798822e-05, + "loss": 0.1574, + "step": 20885 + }, + { + "epoch": 2.6310419749976384, + "grad_norm": 0.22888506948947906, + "learning_rate": 1.3601316689389635e-05, + "loss": 0.1476, + "step": 20890 + }, + { + "epoch": 2.6316717574078154, + "grad_norm": 0.1814454197883606, + "learning_rate": 1.3555615823189065e-05, + "loss": 0.1505, + "step": 20895 + }, + { + "epoch": 2.632301539817993, + "grad_norm": 0.19201195240020752, + "learning_rate": 1.350998823073951e-05, + "loss": 0.1559, + "step": 20900 + }, + { + "epoch": 2.63293132222817, + "grad_norm": 0.21456103026866913, + "learning_rate": 1.3464433936544055e-05, + "loss": 0.1519, + "step": 20905 + }, + { + "epoch": 2.6335611046383476, + "grad_norm": 0.21142180263996124, + "learning_rate": 1.3418952965066365e-05, + "loss": 0.153, + "step": 20910 + }, + { + "epoch": 2.6341908870485247, + "grad_norm": 0.2231752574443817, + "learning_rate": 1.3373545340730785e-05, + "loss": 0.1641, + "step": 20915 + }, + { + "epoch": 2.634820669458702, + "grad_norm": 0.19116418063640594, + "learning_rate": 1.3328211087922192e-05, + "loss": 0.1503, + "step": 20920 + }, + { + "epoch": 2.6354504518688793, + "grad_norm": 0.18010330200195312, + "learning_rate": 1.3282950230986194e-05, + "loss": 0.1434, + "step": 20925 + }, + { + "epoch": 2.6360802342790564, + "grad_norm": 0.179785817861557, + "learning_rate": 1.3237762794228884e-05, + "loss": 0.1502, + "step": 20930 + }, + { + "epoch": 2.636710016689234, + "grad_norm": 0.1842677742242813, + "learning_rate": 1.319264880191695e-05, + "loss": 0.1421, + "step": 20935 + }, + { + "epoch": 2.6373397990994114, + "grad_norm": 0.22725196182727814, + "learning_rate": 1.314760827827756e-05, + "loss": 0.1464, + "step": 20940 + }, + { + "epoch": 2.6379695815095885, + "grad_norm": 0.21761812269687653, + "learning_rate": 1.3102641247498585e-05, + "loss": 0.1492, + "step": 20945 + }, + { + "epoch": 2.6385993639197656, + "grad_norm": 0.2054702490568161, + "learning_rate": 1.305774773372834e-05, + "loss": 0.1626, + "step": 20950 + }, + { + "epoch": 2.639229146329943, + "grad_norm": 0.20189544558525085, + "learning_rate": 1.3012927761075658e-05, + "loss": 0.1672, + "step": 20955 + }, + { + "epoch": 2.63985892874012, + "grad_norm": 0.2214374542236328, + "learning_rate": 1.2968181353609852e-05, + "loss": 0.159, + "step": 20960 + }, + { + "epoch": 2.6404887111502977, + "grad_norm": 0.20227175951004028, + "learning_rate": 1.2923508535360833e-05, + "loss": 0.1668, + "step": 20965 + }, + { + "epoch": 2.641118493560475, + "grad_norm": 0.2125934362411499, + "learning_rate": 1.2878909330318893e-05, + "loss": 0.1587, + "step": 20970 + }, + { + "epoch": 2.6417482759706523, + "grad_norm": 0.20071247220039368, + "learning_rate": 1.2834383762434807e-05, + "loss": 0.1515, + "step": 20975 + }, + { + "epoch": 2.6423780583808294, + "grad_norm": 0.18576478958129883, + "learning_rate": 1.2789931855619817e-05, + "loss": 0.152, + "step": 20980 + }, + { + "epoch": 2.6430078407910065, + "grad_norm": 0.2210751622915268, + "learning_rate": 1.2745553633745642e-05, + "loss": 0.1542, + "step": 20985 + }, + { + "epoch": 2.643637623201184, + "grad_norm": 0.20466601848602295, + "learning_rate": 1.2701249120644402e-05, + "loss": 0.1599, + "step": 20990 + }, + { + "epoch": 2.6442674056113615, + "grad_norm": 0.18101972341537476, + "learning_rate": 1.2657018340108616e-05, + "loss": 0.1434, + "step": 20995 + }, + { + "epoch": 2.6448971880215386, + "grad_norm": 0.19254201650619507, + "learning_rate": 1.2612861315891215e-05, + "loss": 0.1492, + "step": 21000 + }, + { + "epoch": 2.6448971880215386, + "eval_loss": 0.35662394762039185, + "eval_runtime": 6.1657, + "eval_samples_per_second": 162.189, + "eval_steps_per_second": 10.218, + "step": 21000 + }, + { + "epoch": 2.6455269704317157, + "grad_norm": 0.21622531116008759, + "learning_rate": 1.2568778071705564e-05, + "loss": 0.1508, + "step": 21005 + }, + { + "epoch": 2.6461567528418932, + "grad_norm": 0.20254169404506683, + "learning_rate": 1.2524768631225329e-05, + "loss": 0.1541, + "step": 21010 + }, + { + "epoch": 2.6467865352520703, + "grad_norm": 0.18706361949443817, + "learning_rate": 1.2480833018084619e-05, + "loss": 0.1554, + "step": 21015 + }, + { + "epoch": 2.647416317662248, + "grad_norm": 0.19682380557060242, + "learning_rate": 1.2436971255877825e-05, + "loss": 0.1527, + "step": 21020 + }, + { + "epoch": 2.648046100072425, + "grad_norm": 0.193098783493042, + "learning_rate": 1.2393183368159759e-05, + "loss": 0.1505, + "step": 21025 + }, + { + "epoch": 2.6486758824826024, + "grad_norm": 0.1954520344734192, + "learning_rate": 1.2349469378445493e-05, + "loss": 0.1463, + "step": 21030 + }, + { + "epoch": 2.6493056648927795, + "grad_norm": 0.2523531913757324, + "learning_rate": 1.2305829310210446e-05, + "loss": 0.1655, + "step": 21035 + }, + { + "epoch": 2.6499354473029566, + "grad_norm": 0.20331156253814697, + "learning_rate": 1.2262263186890325e-05, + "loss": 0.1514, + "step": 21040 + }, + { + "epoch": 2.650565229713134, + "grad_norm": 0.2400408834218979, + "learning_rate": 1.221877103188113e-05, + "loss": 0.1673, + "step": 21045 + }, + { + "epoch": 2.6511950121233117, + "grad_norm": 0.18541163206100464, + "learning_rate": 1.2175352868539162e-05, + "loss": 0.1582, + "step": 21050 + }, + { + "epoch": 2.6518247945334887, + "grad_norm": 0.24442121386528015, + "learning_rate": 1.2132008720180953e-05, + "loss": 0.1525, + "step": 21055 + }, + { + "epoch": 2.652454576943666, + "grad_norm": 0.23133227229118347, + "learning_rate": 1.2088738610083282e-05, + "loss": 0.1547, + "step": 21060 + }, + { + "epoch": 2.6530843593538433, + "grad_norm": 0.21159769594669342, + "learning_rate": 1.2045542561483196e-05, + "loss": 0.1451, + "step": 21065 + }, + { + "epoch": 2.6537141417640204, + "grad_norm": 0.19382759928703308, + "learning_rate": 1.2002420597577972e-05, + "loss": 0.1532, + "step": 21070 + }, + { + "epoch": 2.654343924174198, + "grad_norm": 0.18077696859836578, + "learning_rate": 1.1959372741525135e-05, + "loss": 0.1493, + "step": 21075 + }, + { + "epoch": 2.654973706584375, + "grad_norm": 0.24647746980190277, + "learning_rate": 1.1916399016442264e-05, + "loss": 0.1533, + "step": 21080 + }, + { + "epoch": 2.6556034889945526, + "grad_norm": 0.19929052889347076, + "learning_rate": 1.1873499445407291e-05, + "loss": 0.1418, + "step": 21085 + }, + { + "epoch": 2.6562332714047296, + "grad_norm": 0.2208561897277832, + "learning_rate": 1.1830674051458277e-05, + "loss": 0.1628, + "step": 21090 + }, + { + "epoch": 2.6568630538149067, + "grad_norm": 0.18301671743392944, + "learning_rate": 1.1787922857593406e-05, + "loss": 0.1495, + "step": 21095 + }, + { + "epoch": 2.6574928362250843, + "grad_norm": 0.1876724660396576, + "learning_rate": 1.1745245886771065e-05, + "loss": 0.1526, + "step": 21100 + }, + { + "epoch": 2.658122618635262, + "grad_norm": 0.18740510940551758, + "learning_rate": 1.1702643161909736e-05, + "loss": 0.1469, + "step": 21105 + }, + { + "epoch": 2.658752401045439, + "grad_norm": 0.2159937024116516, + "learning_rate": 1.1660114705888119e-05, + "loss": 0.1534, + "step": 21110 + }, + { + "epoch": 2.659382183455616, + "grad_norm": 0.19949863851070404, + "learning_rate": 1.1617660541544893e-05, + "loss": 0.1446, + "step": 21115 + }, + { + "epoch": 2.6600119658657935, + "grad_norm": 0.18760685622692108, + "learning_rate": 1.1575280691678956e-05, + "loss": 0.1495, + "step": 21120 + }, + { + "epoch": 2.6606417482759706, + "grad_norm": 0.1935281902551651, + "learning_rate": 1.153297517904922e-05, + "loss": 0.1563, + "step": 21125 + }, + { + "epoch": 2.661271530686148, + "grad_norm": 0.20625917613506317, + "learning_rate": 1.1490744026374743e-05, + "loss": 0.1527, + "step": 21130 + }, + { + "epoch": 2.661901313096325, + "grad_norm": 0.22293558716773987, + "learning_rate": 1.1448587256334618e-05, + "loss": 0.1573, + "step": 21135 + }, + { + "epoch": 2.6625310955065027, + "grad_norm": 0.22753605246543884, + "learning_rate": 1.1406504891567986e-05, + "loss": 0.1563, + "step": 21140 + }, + { + "epoch": 2.6631608779166798, + "grad_norm": 0.1857980489730835, + "learning_rate": 1.1364496954674035e-05, + "loss": 0.1542, + "step": 21145 + }, + { + "epoch": 2.663790660326857, + "grad_norm": 0.20376616716384888, + "learning_rate": 1.1322563468212003e-05, + "loss": 0.1533, + "step": 21150 + }, + { + "epoch": 2.6644204427370344, + "grad_norm": 0.18928895890712738, + "learning_rate": 1.1280704454701111e-05, + "loss": 0.151, + "step": 21155 + }, + { + "epoch": 2.665050225147212, + "grad_norm": 0.2215338945388794, + "learning_rate": 1.1238919936620593e-05, + "loss": 0.1484, + "step": 21160 + }, + { + "epoch": 2.665680007557389, + "grad_norm": 0.26164811849594116, + "learning_rate": 1.1197209936409702e-05, + "loss": 0.1604, + "step": 21165 + }, + { + "epoch": 2.666309789967566, + "grad_norm": 0.1930347979068756, + "learning_rate": 1.1155574476467682e-05, + "loss": 0.1578, + "step": 21170 + }, + { + "epoch": 2.6669395723777436, + "grad_norm": 0.18400873243808746, + "learning_rate": 1.1114013579153719e-05, + "loss": 0.1559, + "step": 21175 + }, + { + "epoch": 2.6675693547879207, + "grad_norm": 0.19113576412200928, + "learning_rate": 1.1072527266786974e-05, + "loss": 0.1583, + "step": 21180 + }, + { + "epoch": 2.668199137198098, + "grad_norm": 0.1980462521314621, + "learning_rate": 1.1031115561646476e-05, + "loss": 0.1516, + "step": 21185 + }, + { + "epoch": 2.6688289196082753, + "grad_norm": 0.24164487421512604, + "learning_rate": 1.0989778485971334e-05, + "loss": 0.1578, + "step": 21190 + }, + { + "epoch": 2.669458702018453, + "grad_norm": 0.18318641185760498, + "learning_rate": 1.0948516061960478e-05, + "loss": 0.1517, + "step": 21195 + }, + { + "epoch": 2.67008848442863, + "grad_norm": 0.20504848659038544, + "learning_rate": 1.0907328311772778e-05, + "loss": 0.1619, + "step": 21200 + }, + { + "epoch": 2.670718266838807, + "grad_norm": 0.214483380317688, + "learning_rate": 1.0866215257526978e-05, + "loss": 0.1445, + "step": 21205 + }, + { + "epoch": 2.6713480492489845, + "grad_norm": 0.24230434000492096, + "learning_rate": 1.0825176921301698e-05, + "loss": 0.1521, + "step": 21210 + }, + { + "epoch": 2.671977831659162, + "grad_norm": 0.22668616473674774, + "learning_rate": 1.0784213325135577e-05, + "loss": 0.1539, + "step": 21215 + }, + { + "epoch": 2.672607614069339, + "grad_norm": 0.18815076351165771, + "learning_rate": 1.0743324491026883e-05, + "loss": 0.1496, + "step": 21220 + }, + { + "epoch": 2.673237396479516, + "grad_norm": 0.24532602727413177, + "learning_rate": 1.070251044093387e-05, + "loss": 0.1572, + "step": 21225 + }, + { + "epoch": 2.6738671788896937, + "grad_norm": 0.2050776183605194, + "learning_rate": 1.066177119677467e-05, + "loss": 0.1585, + "step": 21230 + }, + { + "epoch": 2.674496961299871, + "grad_norm": 0.1992231011390686, + "learning_rate": 1.062110678042717e-05, + "loss": 0.1493, + "step": 21235 + }, + { + "epoch": 2.6751267437100483, + "grad_norm": 0.2188093513250351, + "learning_rate": 1.0580517213729062e-05, + "loss": 0.1526, + "step": 21240 + }, + { + "epoch": 2.6757565261202254, + "grad_norm": 0.17839093506336212, + "learning_rate": 1.0540002518477898e-05, + "loss": 0.146, + "step": 21245 + }, + { + "epoch": 2.676386308530403, + "grad_norm": 0.20759402215480804, + "learning_rate": 1.0499562716430987e-05, + "loss": 0.1527, + "step": 21250 + }, + { + "epoch": 2.67701609094058, + "grad_norm": 0.20209045708179474, + "learning_rate": 1.0459197829305427e-05, + "loss": 0.1507, + "step": 21255 + }, + { + "epoch": 2.677645873350757, + "grad_norm": 0.24553018808364868, + "learning_rate": 1.0418907878778077e-05, + "loss": 0.1568, + "step": 21260 + }, + { + "epoch": 2.6782756557609346, + "grad_norm": 0.24322043359279633, + "learning_rate": 1.0378692886485563e-05, + "loss": 0.1527, + "step": 21265 + }, + { + "epoch": 2.678905438171112, + "grad_norm": 0.20755696296691895, + "learning_rate": 1.0338552874024242e-05, + "loss": 0.1497, + "step": 21270 + }, + { + "epoch": 2.6795352205812892, + "grad_norm": 0.19075340032577515, + "learning_rate": 1.0298487862950256e-05, + "loss": 0.1514, + "step": 21275 + }, + { + "epoch": 2.6801650029914663, + "grad_norm": 0.20733466744422913, + "learning_rate": 1.0258497874779426e-05, + "loss": 0.1531, + "step": 21280 + }, + { + "epoch": 2.680794785401644, + "grad_norm": 0.20700550079345703, + "learning_rate": 1.0218582930987224e-05, + "loss": 0.1547, + "step": 21285 + }, + { + "epoch": 2.681424567811821, + "grad_norm": 0.1864207237958908, + "learning_rate": 1.0178743053008969e-05, + "loss": 0.1507, + "step": 21290 + }, + { + "epoch": 2.6820543502219985, + "grad_norm": 0.21107596158981323, + "learning_rate": 1.0138978262239532e-05, + "loss": 0.1511, + "step": 21295 + }, + { + "epoch": 2.6826841326321755, + "grad_norm": 0.25058072805404663, + "learning_rate": 1.0099288580033548e-05, + "loss": 0.1573, + "step": 21300 + }, + { + "epoch": 2.683313915042353, + "grad_norm": 0.18913084268569946, + "learning_rate": 1.005967402770525e-05, + "loss": 0.1415, + "step": 21305 + }, + { + "epoch": 2.68394369745253, + "grad_norm": 0.18435829877853394, + "learning_rate": 1.002013462652857e-05, + "loss": 0.1432, + "step": 21310 + }, + { + "epoch": 2.6845734798627072, + "grad_norm": 0.19929499924182892, + "learning_rate": 9.980670397737106e-06, + "loss": 0.1562, + "step": 21315 + }, + { + "epoch": 2.6852032622728847, + "grad_norm": 0.2412646859884262, + "learning_rate": 9.941281362524007e-06, + "loss": 0.1544, + "step": 21320 + }, + { + "epoch": 2.6858330446830623, + "grad_norm": 0.23384952545166016, + "learning_rate": 9.9019675420421e-06, + "loss": 0.1598, + "step": 21325 + }, + { + "epoch": 2.6864628270932394, + "grad_norm": 0.1778135895729065, + "learning_rate": 9.862728957403766e-06, + "loss": 0.1515, + "step": 21330 + }, + { + "epoch": 2.6870926095034164, + "grad_norm": 0.20782922208309174, + "learning_rate": 9.823565629681079e-06, + "loss": 0.1504, + "step": 21335 + }, + { + "epoch": 2.687722391913594, + "grad_norm": 0.18523196876049042, + "learning_rate": 9.78447757990562e-06, + "loss": 0.1425, + "step": 21340 + }, + { + "epoch": 2.688352174323771, + "grad_norm": 0.18965183198451996, + "learning_rate": 9.745464829068561e-06, + "loss": 0.1541, + "step": 21345 + }, + { + "epoch": 2.6889819567339486, + "grad_norm": 0.18834419548511505, + "learning_rate": 9.706527398120645e-06, + "loss": 0.1536, + "step": 21350 + }, + { + "epoch": 2.6896117391441257, + "grad_norm": 0.18705077469348907, + "learning_rate": 9.66766530797216e-06, + "loss": 0.1451, + "step": 21355 + }, + { + "epoch": 2.690241521554303, + "grad_norm": 0.1886008232831955, + "learning_rate": 9.628878579492932e-06, + "loss": 0.1484, + "step": 21360 + }, + { + "epoch": 2.6908713039644803, + "grad_norm": 0.19375784695148468, + "learning_rate": 9.590167233512314e-06, + "loss": 0.1554, + "step": 21365 + }, + { + "epoch": 2.6915010863746573, + "grad_norm": 0.18573135137557983, + "learning_rate": 9.551531290819192e-06, + "loss": 0.1608, + "step": 21370 + }, + { + "epoch": 2.692130868784835, + "grad_norm": 0.18215128779411316, + "learning_rate": 9.512970772161955e-06, + "loss": 0.1564, + "step": 21375 + }, + { + "epoch": 2.6927606511950124, + "grad_norm": 0.1941639482975006, + "learning_rate": 9.474485698248469e-06, + "loss": 0.1551, + "step": 21380 + }, + { + "epoch": 2.6933904336051895, + "grad_norm": 0.19289493560791016, + "learning_rate": 9.436076089746153e-06, + "loss": 0.1537, + "step": 21385 + }, + { + "epoch": 2.6940202160153666, + "grad_norm": 0.19275395572185516, + "learning_rate": 9.397741967281724e-06, + "loss": 0.1441, + "step": 21390 + }, + { + "epoch": 2.694649998425544, + "grad_norm": 0.18316887319087982, + "learning_rate": 9.359483351441599e-06, + "loss": 0.1496, + "step": 21395 + }, + { + "epoch": 2.695279780835721, + "grad_norm": 0.16594599187374115, + "learning_rate": 9.321300262771475e-06, + "loss": 0.1408, + "step": 21400 + }, + { + "epoch": 2.6959095632458987, + "grad_norm": 0.2479625940322876, + "learning_rate": 9.28319272177655e-06, + "loss": 0.1565, + "step": 21405 + }, + { + "epoch": 2.696539345656076, + "grad_norm": 0.18492808938026428, + "learning_rate": 9.245160748921454e-06, + "loss": 0.143, + "step": 21410 + }, + { + "epoch": 2.6971691280662533, + "grad_norm": 0.22853007912635803, + "learning_rate": 9.207204364630182e-06, + "loss": 0.1668, + "step": 21415 + }, + { + "epoch": 2.6977989104764304, + "grad_norm": 0.1997872143983841, + "learning_rate": 9.169323589286264e-06, + "loss": 0.1563, + "step": 21420 + }, + { + "epoch": 2.6984286928866075, + "grad_norm": 0.23863272368907928, + "learning_rate": 9.131518443232476e-06, + "loss": 0.1554, + "step": 21425 + }, + { + "epoch": 2.699058475296785, + "grad_norm": 0.17353664338588715, + "learning_rate": 9.09378894677103e-06, + "loss": 0.147, + "step": 21430 + }, + { + "epoch": 2.699688257706962, + "grad_norm": 0.2168291211128235, + "learning_rate": 9.056135120163582e-06, + "loss": 0.1553, + "step": 21435 + }, + { + "epoch": 2.7003180401171396, + "grad_norm": 0.23211082816123962, + "learning_rate": 9.018556983631076e-06, + "loss": 0.1493, + "step": 21440 + }, + { + "epoch": 2.7009478225273167, + "grad_norm": 0.22088773548603058, + "learning_rate": 8.981054557353834e-06, + "loss": 0.1567, + "step": 21445 + }, + { + "epoch": 2.701577604937494, + "grad_norm": 0.20668818056583405, + "learning_rate": 8.943627861471497e-06, + "loss": 0.1559, + "step": 21450 + }, + { + "epoch": 2.7022073873476713, + "grad_norm": 0.22993560135364532, + "learning_rate": 8.906276916083072e-06, + "loss": 0.1628, + "step": 21455 + }, + { + "epoch": 2.702837169757849, + "grad_norm": 0.214871346950531, + "learning_rate": 8.869001741246862e-06, + "loss": 0.1567, + "step": 21460 + }, + { + "epoch": 2.703466952168026, + "grad_norm": 0.20056143403053284, + "learning_rate": 8.831802356980505e-06, + "loss": 0.1494, + "step": 21465 + }, + { + "epoch": 2.7040967345782034, + "grad_norm": 0.18365876376628876, + "learning_rate": 8.79467878326089e-06, + "loss": 0.1547, + "step": 21470 + }, + { + "epoch": 2.7047265169883805, + "grad_norm": 0.1938326060771942, + "learning_rate": 8.757631040024215e-06, + "loss": 0.1591, + "step": 21475 + }, + { + "epoch": 2.7053562993985576, + "grad_norm": 0.207264244556427, + "learning_rate": 8.72065914716602e-06, + "loss": 0.1588, + "step": 21480 + }, + { + "epoch": 2.705986081808735, + "grad_norm": 0.23815831542015076, + "learning_rate": 8.683763124541021e-06, + "loss": 0.1551, + "step": 21485 + }, + { + "epoch": 2.706615864218912, + "grad_norm": 0.20644132792949677, + "learning_rate": 8.646942991963236e-06, + "loss": 0.1496, + "step": 21490 + }, + { + "epoch": 2.7072456466290897, + "grad_norm": 0.19380377233028412, + "learning_rate": 8.610198769205895e-06, + "loss": 0.1499, + "step": 21495 + }, + { + "epoch": 2.707875429039267, + "grad_norm": 0.1877509504556656, + "learning_rate": 8.5735304760015e-06, + "loss": 0.1582, + "step": 21500 + }, + { + "epoch": 2.7085052114494443, + "grad_norm": 0.20092125236988068, + "learning_rate": 8.536938132041781e-06, + "loss": 0.1541, + "step": 21505 + }, + { + "epoch": 2.7091349938596214, + "grad_norm": 0.20917046070098877, + "learning_rate": 8.500421756977637e-06, + "loss": 0.1555, + "step": 21510 + }, + { + "epoch": 2.709764776269799, + "grad_norm": 0.18814347684383392, + "learning_rate": 8.463981370419165e-06, + "loss": 0.1511, + "step": 21515 + }, + { + "epoch": 2.710394558679976, + "grad_norm": 0.2021394819021225, + "learning_rate": 8.427616991935759e-06, + "loss": 0.1539, + "step": 21520 + }, + { + "epoch": 2.7110243410901536, + "grad_norm": 0.19899116456508636, + "learning_rate": 8.3913286410559e-06, + "loss": 0.1553, + "step": 21525 + }, + { + "epoch": 2.7116541235003306, + "grad_norm": 0.2093294858932495, + "learning_rate": 8.355116337267231e-06, + "loss": 0.1581, + "step": 21530 + }, + { + "epoch": 2.7122839059105077, + "grad_norm": 0.215724378824234, + "learning_rate": 8.318980100016564e-06, + "loss": 0.1516, + "step": 21535 + }, + { + "epoch": 2.7129136883206852, + "grad_norm": 0.21019119024276733, + "learning_rate": 8.28291994870996e-06, + "loss": 0.1521, + "step": 21540 + }, + { + "epoch": 2.7135434707308623, + "grad_norm": 0.20992571115493774, + "learning_rate": 8.246935902712493e-06, + "loss": 0.1401, + "step": 21545 + }, + { + "epoch": 2.71417325314104, + "grad_norm": 0.1939440220594406, + "learning_rate": 8.21102798134844e-06, + "loss": 0.1572, + "step": 21550 + }, + { + "epoch": 2.714803035551217, + "grad_norm": 0.2128129005432129, + "learning_rate": 8.175196203901157e-06, + "loss": 0.1624, + "step": 21555 + }, + { + "epoch": 2.7154328179613945, + "grad_norm": 0.22001588344573975, + "learning_rate": 8.139440589613122e-06, + "loss": 0.1498, + "step": 21560 + }, + { + "epoch": 2.7160626003715715, + "grad_norm": 0.24958358705043793, + "learning_rate": 8.103761157685939e-06, + "loss": 0.1614, + "step": 21565 + }, + { + "epoch": 2.716692382781749, + "grad_norm": 0.21756353974342346, + "learning_rate": 8.068157927280284e-06, + "loss": 0.1515, + "step": 21570 + }, + { + "epoch": 2.717322165191926, + "grad_norm": 0.19753116369247437, + "learning_rate": 8.032630917515842e-06, + "loss": 0.1504, + "step": 21575 + }, + { + "epoch": 2.7179519476021037, + "grad_norm": 0.2083761841058731, + "learning_rate": 7.997180147471505e-06, + "loss": 0.1488, + "step": 21580 + }, + { + "epoch": 2.7185817300122808, + "grad_norm": 0.2009708434343338, + "learning_rate": 7.961805636185126e-06, + "loss": 0.1475, + "step": 21585 + }, + { + "epoch": 2.719211512422458, + "grad_norm": 0.23513175547122955, + "learning_rate": 7.926507402653609e-06, + "loss": 0.1479, + "step": 21590 + }, + { + "epoch": 2.7198412948326354, + "grad_norm": 0.1990012526512146, + "learning_rate": 7.891285465832909e-06, + "loss": 0.1498, + "step": 21595 + }, + { + "epoch": 2.7204710772428125, + "grad_norm": 0.2000730186700821, + "learning_rate": 7.856139844638044e-06, + "loss": 0.1553, + "step": 21600 + }, + { + "epoch": 2.72110085965299, + "grad_norm": 0.17009419202804565, + "learning_rate": 7.821070557942966e-06, + "loss": 0.138, + "step": 21605 + }, + { + "epoch": 2.721730642063167, + "grad_norm": 0.19666020572185516, + "learning_rate": 7.786077624580728e-06, + "loss": 0.1505, + "step": 21610 + }, + { + "epoch": 2.7223604244733446, + "grad_norm": 0.20230218768119812, + "learning_rate": 7.751161063343314e-06, + "loss": 0.1459, + "step": 21615 + }, + { + "epoch": 2.7229902068835217, + "grad_norm": 0.20249028503894806, + "learning_rate": 7.716320892981692e-06, + "loss": 0.1481, + "step": 21620 + }, + { + "epoch": 2.7236199892936988, + "grad_norm": 0.183380126953125, + "learning_rate": 7.681557132205861e-06, + "loss": 0.1513, + "step": 21625 + }, + { + "epoch": 2.7242497717038763, + "grad_norm": 0.22188283503055573, + "learning_rate": 7.646869799684791e-06, + "loss": 0.1534, + "step": 21630 + }, + { + "epoch": 2.724879554114054, + "grad_norm": 0.19538500905036926, + "learning_rate": 7.6122589140462766e-06, + "loss": 0.1524, + "step": 21635 + }, + { + "epoch": 2.725509336524231, + "grad_norm": 0.1824834644794464, + "learning_rate": 7.577724493877219e-06, + "loss": 0.1564, + "step": 21640 + }, + { + "epoch": 2.726139118934408, + "grad_norm": 0.18397974967956543, + "learning_rate": 7.543266557723398e-06, + "loss": 0.1467, + "step": 21645 + }, + { + "epoch": 2.7267689013445855, + "grad_norm": 0.22993116080760956, + "learning_rate": 7.508885124089481e-06, + "loss": 0.1546, + "step": 21650 + }, + { + "epoch": 2.7273986837547626, + "grad_norm": 0.18351049721240997, + "learning_rate": 7.47458021143908e-06, + "loss": 0.1616, + "step": 21655 + }, + { + "epoch": 2.72802846616494, + "grad_norm": 0.20072756707668304, + "learning_rate": 7.440351838194724e-06, + "loss": 0.1451, + "step": 21660 + }, + { + "epoch": 2.728658248575117, + "grad_norm": 0.19199103116989136, + "learning_rate": 7.406200022737879e-06, + "loss": 0.1518, + "step": 21665 + }, + { + "epoch": 2.7292880309852947, + "grad_norm": 0.21039634943008423, + "learning_rate": 7.372124783408789e-06, + "loss": 0.154, + "step": 21670 + }, + { + "epoch": 2.729917813395472, + "grad_norm": 0.2162015289068222, + "learning_rate": 7.33812613850665e-06, + "loss": 0.1459, + "step": 21675 + }, + { + "epoch": 2.730547595805649, + "grad_norm": 0.192021444439888, + "learning_rate": 7.304204106289507e-06, + "loss": 0.1547, + "step": 21680 + }, + { + "epoch": 2.7311773782158264, + "grad_norm": 0.20860375463962555, + "learning_rate": 7.270358704974289e-06, + "loss": 0.1501, + "step": 21685 + }, + { + "epoch": 2.731807160626004, + "grad_norm": 0.1841016709804535, + "learning_rate": 7.236589952736738e-06, + "loss": 0.1538, + "step": 21690 + }, + { + "epoch": 2.732436943036181, + "grad_norm": 0.23411309719085693, + "learning_rate": 7.202897867711449e-06, + "loss": 0.153, + "step": 21695 + }, + { + "epoch": 2.733066725446358, + "grad_norm": 0.2005651742219925, + "learning_rate": 7.1692824679918325e-06, + "loss": 0.1505, + "step": 21700 + }, + { + "epoch": 2.7336965078565356, + "grad_norm": 0.18157663941383362, + "learning_rate": 7.135743771630131e-06, + "loss": 0.1424, + "step": 21705 + }, + { + "epoch": 2.7343262902667127, + "grad_norm": 0.20939917862415314, + "learning_rate": 7.102281796637388e-06, + "loss": 0.1585, + "step": 21710 + }, + { + "epoch": 2.7349560726768902, + "grad_norm": 0.17006689310073853, + "learning_rate": 7.068896560983445e-06, + "loss": 0.1529, + "step": 21715 + }, + { + "epoch": 2.7355858550870673, + "grad_norm": 0.23061016201972961, + "learning_rate": 7.035588082596927e-06, + "loss": 0.1556, + "step": 21720 + }, + { + "epoch": 2.736215637497245, + "grad_norm": 0.20175643265247345, + "learning_rate": 7.002356379365276e-06, + "loss": 0.1559, + "step": 21725 + }, + { + "epoch": 2.736845419907422, + "grad_norm": 0.19943305850028992, + "learning_rate": 6.969201469134683e-06, + "loss": 0.147, + "step": 21730 + }, + { + "epoch": 2.737475202317599, + "grad_norm": 0.22196878492832184, + "learning_rate": 6.936123369710056e-06, + "loss": 0.1517, + "step": 21735 + }, + { + "epoch": 2.7381049847277765, + "grad_norm": 0.19505414366722107, + "learning_rate": 6.903122098855085e-06, + "loss": 0.1464, + "step": 21740 + }, + { + "epoch": 2.738734767137954, + "grad_norm": 0.19797982275485992, + "learning_rate": 6.870197674292227e-06, + "loss": 0.1407, + "step": 21745 + }, + { + "epoch": 2.739364549548131, + "grad_norm": 0.2223568856716156, + "learning_rate": 6.837350113702672e-06, + "loss": 0.1524, + "step": 21750 + }, + { + "epoch": 2.739994331958308, + "grad_norm": 0.22087423503398895, + "learning_rate": 6.804579434726276e-06, + "loss": 0.1578, + "step": 21755 + }, + { + "epoch": 2.7406241143684857, + "grad_norm": 0.19389192759990692, + "learning_rate": 6.771885654961662e-06, + "loss": 0.1445, + "step": 21760 + }, + { + "epoch": 2.741253896778663, + "grad_norm": 0.20979470014572144, + "learning_rate": 6.739268791966118e-06, + "loss": 0.1548, + "step": 21765 + }, + { + "epoch": 2.7418836791888404, + "grad_norm": 0.22365309298038483, + "learning_rate": 6.7067288632556505e-06, + "loss": 0.1471, + "step": 21770 + }, + { + "epoch": 2.7425134615990174, + "grad_norm": 0.20007841289043427, + "learning_rate": 6.674265886304964e-06, + "loss": 0.1548, + "step": 21775 + }, + { + "epoch": 2.743143244009195, + "grad_norm": 0.1756853312253952, + "learning_rate": 6.641879878547379e-06, + "loss": 0.1443, + "step": 21780 + }, + { + "epoch": 2.743773026419372, + "grad_norm": 0.21500404179096222, + "learning_rate": 6.609570857374952e-06, + "loss": 0.1584, + "step": 21785 + }, + { + "epoch": 2.744402808829549, + "grad_norm": 0.1938805729150772, + "learning_rate": 6.577338840138369e-06, + "loss": 0.155, + "step": 21790 + }, + { + "epoch": 2.7450325912397266, + "grad_norm": 0.20673929154872894, + "learning_rate": 6.545183844146951e-06, + "loss": 0.1526, + "step": 21795 + }, + { + "epoch": 2.745662373649904, + "grad_norm": 0.19749803841114044, + "learning_rate": 6.513105886668668e-06, + "loss": 0.1533, + "step": 21800 + }, + { + "epoch": 2.7462921560600813, + "grad_norm": 0.212607279419899, + "learning_rate": 6.481104984930107e-06, + "loss": 0.1565, + "step": 21805 + }, + { + "epoch": 2.7469219384702583, + "grad_norm": 0.1796950250864029, + "learning_rate": 6.449181156116473e-06, + "loss": 0.1464, + "step": 21810 + }, + { + "epoch": 2.747551720880436, + "grad_norm": 0.18281513452529907, + "learning_rate": 6.417334417371616e-06, + "loss": 0.1482, + "step": 21815 + }, + { + "epoch": 2.748181503290613, + "grad_norm": 0.23321060836315155, + "learning_rate": 6.385564785797958e-06, + "loss": 0.1489, + "step": 21820 + }, + { + "epoch": 2.7488112857007905, + "grad_norm": 0.2202220857143402, + "learning_rate": 6.353872278456501e-06, + "loss": 0.1477, + "step": 21825 + }, + { + "epoch": 2.7494410681109676, + "grad_norm": 0.283456951379776, + "learning_rate": 6.3222569123668635e-06, + "loss": 0.1582, + "step": 21830 + }, + { + "epoch": 2.750070850521145, + "grad_norm": 0.18883143365383148, + "learning_rate": 6.29071870450723e-06, + "loss": 0.1469, + "step": 21835 + }, + { + "epoch": 2.750700632931322, + "grad_norm": 0.20364224910736084, + "learning_rate": 6.259257671814272e-06, + "loss": 0.1567, + "step": 21840 + }, + { + "epoch": 2.7513304153414992, + "grad_norm": 0.19058570265769958, + "learning_rate": 6.227873831183355e-06, + "loss": 0.1449, + "step": 21845 + }, + { + "epoch": 2.7519601977516768, + "grad_norm": 0.20439192652702332, + "learning_rate": 6.196567199468299e-06, + "loss": 0.1486, + "step": 21850 + }, + { + "epoch": 2.7525899801618543, + "grad_norm": 0.1962665468454361, + "learning_rate": 6.165337793481473e-06, + "loss": 0.1499, + "step": 21855 + }, + { + "epoch": 2.7532197625720314, + "grad_norm": 0.22097113728523254, + "learning_rate": 6.134185629993793e-06, + "loss": 0.153, + "step": 21860 + }, + { + "epoch": 2.7538495449822085, + "grad_norm": 0.20070448517799377, + "learning_rate": 6.103110725734644e-06, + "loss": 0.1463, + "step": 21865 + }, + { + "epoch": 2.754479327392386, + "grad_norm": 0.20577707886695862, + "learning_rate": 6.072113097392028e-06, + "loss": 0.1549, + "step": 21870 + }, + { + "epoch": 2.755109109802563, + "grad_norm": 0.1798795461654663, + "learning_rate": 6.041192761612313e-06, + "loss": 0.1454, + "step": 21875 + }, + { + "epoch": 2.7557388922127406, + "grad_norm": 0.20694920420646667, + "learning_rate": 6.010349735000464e-06, + "loss": 0.1524, + "step": 21880 + }, + { + "epoch": 2.7563686746229177, + "grad_norm": 0.19873858988285065, + "learning_rate": 5.979584034119867e-06, + "loss": 0.1523, + "step": 21885 + }, + { + "epoch": 2.756998457033095, + "grad_norm": 0.2215358465909958, + "learning_rate": 5.948895675492421e-06, + "loss": 0.1508, + "step": 21890 + }, + { + "epoch": 2.7576282394432723, + "grad_norm": 0.21731533110141754, + "learning_rate": 5.918284675598478e-06, + "loss": 0.149, + "step": 21895 + }, + { + "epoch": 2.7582580218534494, + "grad_norm": 0.21298860013484955, + "learning_rate": 5.887751050876837e-06, + "loss": 0.156, + "step": 21900 + }, + { + "epoch": 2.758887804263627, + "grad_norm": 0.20131991803646088, + "learning_rate": 5.85729481772475e-06, + "loss": 0.1403, + "step": 21905 + }, + { + "epoch": 2.7595175866738044, + "grad_norm": 0.17870669066905975, + "learning_rate": 5.826915992497932e-06, + "loss": 0.1483, + "step": 21910 + }, + { + "epoch": 2.7601473690839815, + "grad_norm": 0.2430955022573471, + "learning_rate": 5.796614591510468e-06, + "loss": 0.1484, + "step": 21915 + }, + { + "epoch": 2.7607771514941586, + "grad_norm": 0.1986503154039383, + "learning_rate": 5.766390631034939e-06, + "loss": 0.1524, + "step": 21920 + }, + { + "epoch": 2.761406933904336, + "grad_norm": 0.1926422268152237, + "learning_rate": 5.7362441273022645e-06, + "loss": 0.1484, + "step": 21925 + }, + { + "epoch": 2.762036716314513, + "grad_norm": 0.23347438871860504, + "learning_rate": 5.706175096501825e-06, + "loss": 0.1512, + "step": 21930 + }, + { + "epoch": 2.7626664987246907, + "grad_norm": 0.20513305068016052, + "learning_rate": 5.676183554781405e-06, + "loss": 0.1518, + "step": 21935 + }, + { + "epoch": 2.763296281134868, + "grad_norm": 0.18283484876155853, + "learning_rate": 5.64626951824712e-06, + "loss": 0.1381, + "step": 21940 + }, + { + "epoch": 2.7639260635450453, + "grad_norm": 0.17075172066688538, + "learning_rate": 5.616433002963472e-06, + "loss": 0.1501, + "step": 21945 + }, + { + "epoch": 2.7645558459552224, + "grad_norm": 0.2107374221086502, + "learning_rate": 5.5866740249533746e-06, + "loss": 0.1581, + "step": 21950 + }, + { + "epoch": 2.7651856283653995, + "grad_norm": 0.23205851018428802, + "learning_rate": 5.556992600198079e-06, + "loss": 0.1467, + "step": 21955 + }, + { + "epoch": 2.765815410775577, + "grad_norm": 0.1973281055688858, + "learning_rate": 5.527388744637201e-06, + "loss": 0.1434, + "step": 21960 + }, + { + "epoch": 2.7664451931857545, + "grad_norm": 0.20235906541347504, + "learning_rate": 5.497862474168657e-06, + "loss": 0.1454, + "step": 21965 + }, + { + "epoch": 2.7670749755959316, + "grad_norm": 0.21266506612300873, + "learning_rate": 5.4684138046487134e-06, + "loss": 0.1454, + "step": 21970 + }, + { + "epoch": 2.7677047580061087, + "grad_norm": 0.1890571415424347, + "learning_rate": 5.43904275189207e-06, + "loss": 0.1414, + "step": 21975 + }, + { + "epoch": 2.7683345404162862, + "grad_norm": 0.1897963136434555, + "learning_rate": 5.409749331671559e-06, + "loss": 0.1493, + "step": 21980 + }, + { + "epoch": 2.7689643228264633, + "grad_norm": 0.18935035169124603, + "learning_rate": 5.380533559718414e-06, + "loss": 0.1543, + "step": 21985 + }, + { + "epoch": 2.769594105236641, + "grad_norm": 0.20879988372325897, + "learning_rate": 5.351395451722251e-06, + "loss": 0.151, + "step": 21990 + }, + { + "epoch": 2.770223887646818, + "grad_norm": 0.20008423924446106, + "learning_rate": 5.322335023330837e-06, + "loss": 0.1515, + "step": 21995 + }, + { + "epoch": 2.7708536700569955, + "grad_norm": 0.18473681807518005, + "learning_rate": 5.293352290150321e-06, + "loss": 0.1464, + "step": 22000 + }, + { + "epoch": 2.7708536700569955, + "eval_loss": 0.3584047257900238, + "eval_runtime": 6.1661, + "eval_samples_per_second": 162.176, + "eval_steps_per_second": 10.217, + "step": 22000 + }, + { + "epoch": 2.7714834524671725, + "grad_norm": 0.19308076798915863, + "learning_rate": 5.264447267745053e-06, + "loss": 0.1582, + "step": 22005 + }, + { + "epoch": 2.7721132348773496, + "grad_norm": 0.23008759319782257, + "learning_rate": 5.235619971637734e-06, + "loss": 0.1546, + "step": 22010 + }, + { + "epoch": 2.772743017287527, + "grad_norm": 0.21323955059051514, + "learning_rate": 5.206870417309245e-06, + "loss": 0.1536, + "step": 22015 + }, + { + "epoch": 2.7733727996977047, + "grad_norm": 0.23257404565811157, + "learning_rate": 5.17819862019877e-06, + "loss": 0.1516, + "step": 22020 + }, + { + "epoch": 2.7740025821078818, + "grad_norm": 0.22094878554344177, + "learning_rate": 5.14960459570371e-06, + "loss": 0.1546, + "step": 22025 + }, + { + "epoch": 2.774632364518059, + "grad_norm": 0.21868959069252014, + "learning_rate": 5.121088359179698e-06, + "loss": 0.1567, + "step": 22030 + }, + { + "epoch": 2.7752621469282364, + "grad_norm": 0.19147329032421112, + "learning_rate": 5.09264992594065e-06, + "loss": 0.1502, + "step": 22035 + }, + { + "epoch": 2.7758919293384134, + "grad_norm": 0.17076317965984344, + "learning_rate": 5.064289311258618e-06, + "loss": 0.1511, + "step": 22040 + }, + { + "epoch": 2.776521711748591, + "grad_norm": 0.23041397333145142, + "learning_rate": 5.036006530363917e-06, + "loss": 0.1611, + "step": 22045 + }, + { + "epoch": 2.777151494158768, + "grad_norm": 0.21972966194152832, + "learning_rate": 5.007801598445033e-06, + "loss": 0.1493, + "step": 22050 + }, + { + "epoch": 2.7777812765689456, + "grad_norm": 0.17348721623420715, + "learning_rate": 4.979674530648664e-06, + "loss": 0.1481, + "step": 22055 + }, + { + "epoch": 2.7784110589791227, + "grad_norm": 0.19225727021694183, + "learning_rate": 4.9516253420796795e-06, + "loss": 0.1493, + "step": 22060 + }, + { + "epoch": 2.7790408413892997, + "grad_norm": 0.19729195535182953, + "learning_rate": 4.9236540478011625e-06, + "loss": 0.1442, + "step": 22065 + }, + { + "epoch": 2.7796706237994773, + "grad_norm": 0.17798985540866852, + "learning_rate": 4.8957606628342805e-06, + "loss": 0.1507, + "step": 22070 + }, + { + "epoch": 2.780300406209655, + "grad_norm": 0.19311825931072235, + "learning_rate": 4.867945202158469e-06, + "loss": 0.149, + "step": 22075 + }, + { + "epoch": 2.780930188619832, + "grad_norm": 0.18525920808315277, + "learning_rate": 4.840207680711278e-06, + "loss": 0.1635, + "step": 22080 + }, + { + "epoch": 2.781559971030009, + "grad_norm": 0.18988420069217682, + "learning_rate": 4.812548113388342e-06, + "loss": 0.153, + "step": 22085 + }, + { + "epoch": 2.7821897534401865, + "grad_norm": 0.18699151277542114, + "learning_rate": 4.784966515043498e-06, + "loss": 0.147, + "step": 22090 + }, + { + "epoch": 2.7828195358503636, + "grad_norm": 0.23182329535484314, + "learning_rate": 4.757462900488695e-06, + "loss": 0.1496, + "step": 22095 + }, + { + "epoch": 2.783449318260541, + "grad_norm": 0.20079541206359863, + "learning_rate": 4.730037284494021e-06, + "loss": 0.1583, + "step": 22100 + }, + { + "epoch": 2.784079100670718, + "grad_norm": 0.21548844873905182, + "learning_rate": 4.702689681787625e-06, + "loss": 0.1481, + "step": 22105 + }, + { + "epoch": 2.7847088830808957, + "grad_norm": 0.1968826800584793, + "learning_rate": 4.6754201070558105e-06, + "loss": 0.1452, + "step": 22110 + }, + { + "epoch": 2.785338665491073, + "grad_norm": 0.20061470568180084, + "learning_rate": 4.648228574942997e-06, + "loss": 0.1472, + "step": 22115 + }, + { + "epoch": 2.78596844790125, + "grad_norm": 0.19061359763145447, + "learning_rate": 4.621115100051604e-06, + "loss": 0.1478, + "step": 22120 + }, + { + "epoch": 2.7865982303114274, + "grad_norm": 0.23252861201763153, + "learning_rate": 4.594079696942199e-06, + "loss": 0.1527, + "step": 22125 + }, + { + "epoch": 2.787228012721605, + "grad_norm": 0.1698002964258194, + "learning_rate": 4.56712238013342e-06, + "loss": 0.1379, + "step": 22130 + }, + { + "epoch": 2.787857795131782, + "grad_norm": 0.19811010360717773, + "learning_rate": 4.540243164101954e-06, + "loss": 0.1417, + "step": 22135 + }, + { + "epoch": 2.788487577541959, + "grad_norm": 0.2089819759130478, + "learning_rate": 4.513442063282585e-06, + "loss": 0.1517, + "step": 22140 + }, + { + "epoch": 2.7891173599521366, + "grad_norm": 0.21028514206409454, + "learning_rate": 4.486719092068086e-06, + "loss": 0.1536, + "step": 22145 + }, + { + "epoch": 2.7897471423623137, + "grad_norm": 0.20895244181156158, + "learning_rate": 4.46007426480931e-06, + "loss": 0.1421, + "step": 22150 + }, + { + "epoch": 2.790376924772491, + "grad_norm": 0.1925353854894638, + "learning_rate": 4.4335075958151275e-06, + "loss": 0.1506, + "step": 22155 + }, + { + "epoch": 2.7910067071826683, + "grad_norm": 0.21809720993041992, + "learning_rate": 4.407019099352477e-06, + "loss": 0.1537, + "step": 22160 + }, + { + "epoch": 2.791636489592846, + "grad_norm": 0.23316286504268646, + "learning_rate": 4.380608789646245e-06, + "loss": 0.1593, + "step": 22165 + }, + { + "epoch": 2.792266272003023, + "grad_norm": 0.20298117399215698, + "learning_rate": 4.354276680879404e-06, + "loss": 0.1469, + "step": 22170 + }, + { + "epoch": 2.7928960544132, + "grad_norm": 0.18828284740447998, + "learning_rate": 4.328022787192875e-06, + "loss": 0.1478, + "step": 22175 + }, + { + "epoch": 2.7935258368233775, + "grad_norm": 0.19351090490818024, + "learning_rate": 4.301847122685614e-06, + "loss": 0.139, + "step": 22180 + }, + { + "epoch": 2.794155619233555, + "grad_norm": 0.19426658749580383, + "learning_rate": 4.27574970141456e-06, + "loss": 0.148, + "step": 22185 + }, + { + "epoch": 2.794785401643732, + "grad_norm": 0.18554694950580597, + "learning_rate": 4.2497305373945855e-06, + "loss": 0.1484, + "step": 22190 + }, + { + "epoch": 2.795415184053909, + "grad_norm": 0.21555371582508087, + "learning_rate": 4.223789644598613e-06, + "loss": 0.1537, + "step": 22195 + }, + { + "epoch": 2.7960449664640867, + "grad_norm": 0.20736396312713623, + "learning_rate": 4.197927036957499e-06, + "loss": 0.1533, + "step": 22200 + }, + { + "epoch": 2.796674748874264, + "grad_norm": 0.2143113762140274, + "learning_rate": 4.172142728360017e-06, + "loss": 0.1509, + "step": 22205 + }, + { + "epoch": 2.7973045312844413, + "grad_norm": 0.1888829916715622, + "learning_rate": 4.146436732652958e-06, + "loss": 0.1507, + "step": 22210 + }, + { + "epoch": 2.7979343136946184, + "grad_norm": 0.19072696566581726, + "learning_rate": 4.1208090636410286e-06, + "loss": 0.153, + "step": 22215 + }, + { + "epoch": 2.798564096104796, + "grad_norm": 0.23674504458904266, + "learning_rate": 4.09525973508687e-06, + "loss": 0.1584, + "step": 22220 + }, + { + "epoch": 2.799193878514973, + "grad_norm": 0.23174551129341125, + "learning_rate": 4.06978876071104e-06, + "loss": 0.1475, + "step": 22225 + }, + { + "epoch": 2.79982366092515, + "grad_norm": 0.2185906022787094, + "learning_rate": 4.044396154192031e-06, + "loss": 0.1494, + "step": 22230 + }, + { + "epoch": 2.8004534433353276, + "grad_norm": 0.1940082162618637, + "learning_rate": 4.019081929166268e-06, + "loss": 0.1497, + "step": 22235 + }, + { + "epoch": 2.801083225745505, + "grad_norm": 0.1945921629667282, + "learning_rate": 3.993846099228093e-06, + "loss": 0.1524, + "step": 22240 + }, + { + "epoch": 2.8017130081556822, + "grad_norm": 0.21760894358158112, + "learning_rate": 3.968688677929682e-06, + "loss": 0.1459, + "step": 22245 + }, + { + "epoch": 2.8023427905658593, + "grad_norm": 0.19670112431049347, + "learning_rate": 3.943609678781162e-06, + "loss": 0.151, + "step": 22250 + }, + { + "epoch": 2.802972572976037, + "grad_norm": 0.2076457440853119, + "learning_rate": 3.918609115250509e-06, + "loss": 0.1515, + "step": 22255 + }, + { + "epoch": 2.803602355386214, + "grad_norm": 0.20138059556484222, + "learning_rate": 3.893687000763635e-06, + "loss": 0.1492, + "step": 22260 + }, + { + "epoch": 2.8042321377963915, + "grad_norm": 0.20619480311870575, + "learning_rate": 3.868843348704265e-06, + "loss": 0.1516, + "step": 22265 + }, + { + "epoch": 2.8048619202065685, + "grad_norm": 0.17885464429855347, + "learning_rate": 3.844078172413994e-06, + "loss": 0.1413, + "step": 22270 + }, + { + "epoch": 2.805491702616746, + "grad_norm": 0.17029553651809692, + "learning_rate": 3.8193914851922855e-06, + "loss": 0.143, + "step": 22275 + }, + { + "epoch": 2.806121485026923, + "grad_norm": 0.18624289333820343, + "learning_rate": 3.794783300296483e-06, + "loss": 0.1448, + "step": 22280 + }, + { + "epoch": 2.8067512674371002, + "grad_norm": 0.20082144439220428, + "learning_rate": 3.7702536309417497e-06, + "loss": 0.1498, + "step": 22285 + }, + { + "epoch": 2.8073810498472778, + "grad_norm": 0.213558167219162, + "learning_rate": 3.745802490301031e-06, + "loss": 0.165, + "step": 22290 + }, + { + "epoch": 2.8080108322574553, + "grad_norm": 0.23692555725574493, + "learning_rate": 3.721429891505173e-06, + "loss": 0.1568, + "step": 22295 + }, + { + "epoch": 2.8086406146676324, + "grad_norm": 0.18088509142398834, + "learning_rate": 3.6971358476428237e-06, + "loss": 0.1508, + "step": 22300 + }, + { + "epoch": 2.8092703970778095, + "grad_norm": 0.20369915664196014, + "learning_rate": 3.672920371760446e-06, + "loss": 0.1469, + "step": 22305 + }, + { + "epoch": 2.809900179487987, + "grad_norm": 0.18801896274089813, + "learning_rate": 3.6487834768622883e-06, + "loss": 0.1417, + "step": 22310 + }, + { + "epoch": 2.810529961898164, + "grad_norm": 0.2028091549873352, + "learning_rate": 3.6247251759104145e-06, + "loss": 0.157, + "step": 22315 + }, + { + "epoch": 2.8111597443083416, + "grad_norm": 0.16689006984233856, + "learning_rate": 3.600745481824707e-06, + "loss": 0.1393, + "step": 22320 + }, + { + "epoch": 2.8117895267185187, + "grad_norm": 0.20889881253242493, + "learning_rate": 3.576844407482765e-06, + "loss": 0.1586, + "step": 22325 + }, + { + "epoch": 2.812419309128696, + "grad_norm": 0.21049942076206207, + "learning_rate": 3.5530219657200543e-06, + "loss": 0.155, + "step": 22330 + }, + { + "epoch": 2.8130490915388733, + "grad_norm": 0.23332563042640686, + "learning_rate": 3.5292781693297247e-06, + "loss": 0.1557, + "step": 22335 + }, + { + "epoch": 2.8136788739490504, + "grad_norm": 0.1706390082836151, + "learning_rate": 3.505613031062776e-06, + "loss": 0.1421, + "step": 22340 + }, + { + "epoch": 2.814308656359228, + "grad_norm": 0.17925478518009186, + "learning_rate": 3.4820265636279265e-06, + "loss": 0.1433, + "step": 22345 + }, + { + "epoch": 2.8149384387694054, + "grad_norm": 0.15641047060489655, + "learning_rate": 3.458518779691627e-06, + "loss": 0.1423, + "step": 22350 + }, + { + "epoch": 2.8155682211795825, + "grad_norm": 0.18733102083206177, + "learning_rate": 3.435089691878112e-06, + "loss": 0.1533, + "step": 22355 + }, + { + "epoch": 2.8161980035897596, + "grad_norm": 0.22065778076648712, + "learning_rate": 3.4117393127693183e-06, + "loss": 0.157, + "step": 22360 + }, + { + "epoch": 2.816827785999937, + "grad_norm": 0.20604351162910461, + "learning_rate": 3.388467654904947e-06, + "loss": 0.1438, + "step": 22365 + }, + { + "epoch": 2.817457568410114, + "grad_norm": 0.17883001267910004, + "learning_rate": 3.365274730782419e-06, + "loss": 0.1465, + "step": 22370 + }, + { + "epoch": 2.8180873508202917, + "grad_norm": 0.18118852376937866, + "learning_rate": 3.3421605528568374e-06, + "loss": 0.1501, + "step": 22375 + }, + { + "epoch": 2.818717133230469, + "grad_norm": 0.2178465574979782, + "learning_rate": 3.3191251335410564e-06, + "loss": 0.1467, + "step": 22380 + }, + { + "epoch": 2.8193469156406463, + "grad_norm": 0.18433082103729248, + "learning_rate": 3.29616848520563e-06, + "loss": 0.1478, + "step": 22385 + }, + { + "epoch": 2.8199766980508234, + "grad_norm": 0.19671761989593506, + "learning_rate": 3.273290620178831e-06, + "loss": 0.144, + "step": 22390 + }, + { + "epoch": 2.8206064804610005, + "grad_norm": 0.2000323235988617, + "learning_rate": 3.2504915507465144e-06, + "loss": 0.1443, + "step": 22395 + }, + { + "epoch": 2.821236262871178, + "grad_norm": 0.19443731009960175, + "learning_rate": 3.22777128915237e-06, + "loss": 0.1537, + "step": 22400 + }, + { + "epoch": 2.8218660452813555, + "grad_norm": 0.19904273748397827, + "learning_rate": 3.2051298475976707e-06, + "loss": 0.1581, + "step": 22405 + }, + { + "epoch": 2.8224958276915326, + "grad_norm": 0.1972033828496933, + "learning_rate": 3.18256723824139e-06, + "loss": 0.1383, + "step": 22410 + }, + { + "epoch": 2.8231256101017097, + "grad_norm": 0.21138480305671692, + "learning_rate": 3.16008347320017e-06, + "loss": 0.1442, + "step": 22415 + }, + { + "epoch": 2.8237553925118872, + "grad_norm": 0.22747448086738586, + "learning_rate": 3.1376785645483016e-06, + "loss": 0.1485, + "step": 22420 + }, + { + "epoch": 2.8243851749220643, + "grad_norm": 0.23757314682006836, + "learning_rate": 3.11535252431776e-06, + "loss": 0.1568, + "step": 22425 + }, + { + "epoch": 2.825014957332242, + "grad_norm": 0.2193070352077484, + "learning_rate": 3.0931053644980885e-06, + "loss": 0.1605, + "step": 22430 + }, + { + "epoch": 2.825644739742419, + "grad_norm": 0.2223901003599167, + "learning_rate": 3.0709370970365464e-06, + "loss": 0.1453, + "step": 22435 + }, + { + "epoch": 2.8262745221525964, + "grad_norm": 0.23655427992343903, + "learning_rate": 3.0488477338379944e-06, + "loss": 0.1484, + "step": 22440 + }, + { + "epoch": 2.8269043045627735, + "grad_norm": 0.20859979093074799, + "learning_rate": 3.026837286764944e-06, + "loss": 0.154, + "step": 22445 + }, + { + "epoch": 2.8275340869729506, + "grad_norm": 0.1994808316230774, + "learning_rate": 3.004905767637472e-06, + "loss": 0.1634, + "step": 22450 + }, + { + "epoch": 2.828163869383128, + "grad_norm": 0.19530266523361206, + "learning_rate": 2.983053188233342e-06, + "loss": 0.1458, + "step": 22455 + }, + { + "epoch": 2.8287936517933057, + "grad_norm": 0.19528019428253174, + "learning_rate": 2.9612795602878827e-06, + "loss": 0.1472, + "step": 22460 + }, + { + "epoch": 2.8294234342034827, + "grad_norm": 0.20543955266475677, + "learning_rate": 2.939584895494007e-06, + "loss": 0.1544, + "step": 22465 + }, + { + "epoch": 2.83005321661366, + "grad_norm": 0.18907050788402557, + "learning_rate": 2.917969205502263e-06, + "loss": 0.1469, + "step": 22470 + }, + { + "epoch": 2.8306829990238374, + "grad_norm": 0.2009141594171524, + "learning_rate": 2.896432501920748e-06, + "loss": 0.1463, + "step": 22475 + }, + { + "epoch": 2.8313127814340144, + "grad_norm": 0.1845710128545761, + "learning_rate": 2.8749747963151937e-06, + "loss": 0.1523, + "step": 22480 + }, + { + "epoch": 2.831942563844192, + "grad_norm": 0.22671662271022797, + "learning_rate": 2.853596100208866e-06, + "loss": 0.1553, + "step": 22485 + }, + { + "epoch": 2.832572346254369, + "grad_norm": 0.1716582477092743, + "learning_rate": 2.832296425082614e-06, + "loss": 0.1423, + "step": 22490 + }, + { + "epoch": 2.8332021286645466, + "grad_norm": 0.17477920651435852, + "learning_rate": 2.8110757823748554e-06, + "loss": 0.142, + "step": 22495 + }, + { + "epoch": 2.8338319110747237, + "grad_norm": 0.22391197085380554, + "learning_rate": 2.7899341834815236e-06, + "loss": 0.1576, + "step": 22500 + }, + { + "epoch": 2.8344616934849007, + "grad_norm": 0.19235247373580933, + "learning_rate": 2.7688716397561874e-06, + "loss": 0.1432, + "step": 22505 + }, + { + "epoch": 2.8350914758950783, + "grad_norm": 0.21828468143939972, + "learning_rate": 2.747888162509898e-06, + "loss": 0.1461, + "step": 22510 + }, + { + "epoch": 2.835721258305256, + "grad_norm": 0.19712364673614502, + "learning_rate": 2.726983763011259e-06, + "loss": 0.1461, + "step": 22515 + }, + { + "epoch": 2.836351040715433, + "grad_norm": 0.22868654131889343, + "learning_rate": 2.7061584524864066e-06, + "loss": 0.1546, + "step": 22520 + }, + { + "epoch": 2.83698082312561, + "grad_norm": 0.18876421451568604, + "learning_rate": 2.685412242119012e-06, + "loss": 0.1481, + "step": 22525 + }, + { + "epoch": 2.8376106055357875, + "grad_norm": 0.21973784267902374, + "learning_rate": 2.664745143050295e-06, + "loss": 0.1568, + "step": 22530 + }, + { + "epoch": 2.8382403879459646, + "grad_norm": 0.24478502571582794, + "learning_rate": 2.6441571663788963e-06, + "loss": 0.1558, + "step": 22535 + }, + { + "epoch": 2.838870170356142, + "grad_norm": 0.18952693045139313, + "learning_rate": 2.6236483231610707e-06, + "loss": 0.1461, + "step": 22540 + }, + { + "epoch": 2.839499952766319, + "grad_norm": 0.20026876032352448, + "learning_rate": 2.603218624410525e-06, + "loss": 0.1466, + "step": 22545 + }, + { + "epoch": 2.8401297351764967, + "grad_norm": 0.21935871243476868, + "learning_rate": 2.5828680810984824e-06, + "loss": 0.1563, + "step": 22550 + }, + { + "epoch": 2.8407595175866738, + "grad_norm": 0.22446821630001068, + "learning_rate": 2.5625967041536354e-06, + "loss": 0.1462, + "step": 22555 + }, + { + "epoch": 2.841389299996851, + "grad_norm": 0.2072252631187439, + "learning_rate": 2.5424045044621922e-06, + "loss": 0.1505, + "step": 22560 + }, + { + "epoch": 2.8420190824070284, + "grad_norm": 0.19828562438488007, + "learning_rate": 2.5222914928678285e-06, + "loss": 0.1462, + "step": 22565 + }, + { + "epoch": 2.842648864817206, + "grad_norm": 0.18411174416542053, + "learning_rate": 2.502257680171671e-06, + "loss": 0.1415, + "step": 22570 + }, + { + "epoch": 2.843278647227383, + "grad_norm": 0.20017574727535248, + "learning_rate": 2.482303077132347e-06, + "loss": 0.1556, + "step": 22575 + }, + { + "epoch": 2.84390842963756, + "grad_norm": 0.1881314069032669, + "learning_rate": 2.462427694465935e-06, + "loss": 0.1464, + "step": 22580 + }, + { + "epoch": 2.8445382120477376, + "grad_norm": 0.2211647629737854, + "learning_rate": 2.4426315428459466e-06, + "loss": 0.1471, + "step": 22585 + }, + { + "epoch": 2.8451679944579147, + "grad_norm": 0.20288364589214325, + "learning_rate": 2.4229146329033944e-06, + "loss": 0.146, + "step": 22590 + }, + { + "epoch": 2.845797776868092, + "grad_norm": 0.22115926444530487, + "learning_rate": 2.4032769752267087e-06, + "loss": 0.1422, + "step": 22595 + }, + { + "epoch": 2.8464275592782693, + "grad_norm": 0.196670264005661, + "learning_rate": 2.3837185803617544e-06, + "loss": 0.153, + "step": 22600 + }, + { + "epoch": 2.847057341688447, + "grad_norm": 0.23514890670776367, + "learning_rate": 2.3642394588118285e-06, + "loss": 0.1573, + "step": 22605 + }, + { + "epoch": 2.847687124098624, + "grad_norm": 0.1987423151731491, + "learning_rate": 2.3448396210376807e-06, + "loss": 0.1457, + "step": 22610 + }, + { + "epoch": 2.848316906508801, + "grad_norm": 0.18859946727752686, + "learning_rate": 2.3255190774574605e-06, + "loss": 0.1533, + "step": 22615 + }, + { + "epoch": 2.8489466889189785, + "grad_norm": 0.21700045466423035, + "learning_rate": 2.306277838446735e-06, + "loss": 0.1416, + "step": 22620 + }, + { + "epoch": 2.849576471329156, + "grad_norm": 0.17610225081443787, + "learning_rate": 2.2871159143384723e-06, + "loss": 0.1498, + "step": 22625 + }, + { + "epoch": 2.850206253739333, + "grad_norm": 0.2066749781370163, + "learning_rate": 2.26803331542309e-06, + "loss": 0.1587, + "step": 22630 + }, + { + "epoch": 2.85083603614951, + "grad_norm": 0.19877871870994568, + "learning_rate": 2.2490300519484082e-06, + "loss": 0.1526, + "step": 22635 + }, + { + "epoch": 2.8514658185596877, + "grad_norm": 0.19332483410835266, + "learning_rate": 2.230106134119547e-06, + "loss": 0.1562, + "step": 22640 + }, + { + "epoch": 2.852095600969865, + "grad_norm": 0.21806974709033966, + "learning_rate": 2.21126157209911e-06, + "loss": 0.1508, + "step": 22645 + }, + { + "epoch": 2.8527253833800423, + "grad_norm": 0.20896165072917938, + "learning_rate": 2.192496376007069e-06, + "loss": 0.1612, + "step": 22650 + }, + { + "epoch": 2.8533551657902194, + "grad_norm": 0.2381521761417389, + "learning_rate": 2.1738105559207465e-06, + "loss": 0.1545, + "step": 22655 + }, + { + "epoch": 2.853984948200397, + "grad_norm": 0.24022352695465088, + "learning_rate": 2.155204121874882e-06, + "loss": 0.1548, + "step": 22660 + }, + { + "epoch": 2.854614730610574, + "grad_norm": 0.20042377710342407, + "learning_rate": 2.1366770838615322e-06, + "loss": 0.1423, + "step": 22665 + }, + { + "epoch": 2.855244513020751, + "grad_norm": 0.1943242996931076, + "learning_rate": 2.118229451830139e-06, + "loss": 0.1453, + "step": 22670 + }, + { + "epoch": 2.8558742954309286, + "grad_norm": 0.20173771679401398, + "learning_rate": 2.0998612356874944e-06, + "loss": 0.1406, + "step": 22675 + }, + { + "epoch": 2.856504077841106, + "grad_norm": 0.21339194476604462, + "learning_rate": 2.081572445297791e-06, + "loss": 0.1447, + "step": 22680 + }, + { + "epoch": 2.8571338602512832, + "grad_norm": 0.18814577162265778, + "learning_rate": 2.0633630904824727e-06, + "loss": 0.144, + "step": 22685 + }, + { + "epoch": 2.8577636426614603, + "grad_norm": 0.1956281065940857, + "learning_rate": 2.045233181020417e-06, + "loss": 0.1503, + "step": 22690 + }, + { + "epoch": 2.858393425071638, + "grad_norm": 0.22954149544239044, + "learning_rate": 2.027182726647786e-06, + "loss": 0.1491, + "step": 22695 + }, + { + "epoch": 2.859023207481815, + "grad_norm": 0.18004447221755981, + "learning_rate": 2.009211737058092e-06, + "loss": 0.1492, + "step": 22700 + }, + { + "epoch": 2.8596529898919925, + "grad_norm": 0.226220041513443, + "learning_rate": 1.991320221902165e-06, + "loss": 0.159, + "step": 22705 + }, + { + "epoch": 2.8602827723021695, + "grad_norm": 0.1808856725692749, + "learning_rate": 1.9735081907881367e-06, + "loss": 0.1473, + "step": 22710 + }, + { + "epoch": 2.860912554712347, + "grad_norm": 0.2538818418979645, + "learning_rate": 1.9557756532815216e-06, + "loss": 0.1523, + "step": 22715 + }, + { + "epoch": 2.861542337122524, + "grad_norm": 0.18744130432605743, + "learning_rate": 1.9381226189050524e-06, + "loss": 0.1501, + "step": 22720 + }, + { + "epoch": 2.8621721195327012, + "grad_norm": 0.2162604182958603, + "learning_rate": 1.920549097138813e-06, + "loss": 0.1505, + "step": 22725 + }, + { + "epoch": 2.8628019019428788, + "grad_norm": 0.2076927125453949, + "learning_rate": 1.9030550974202197e-06, + "loss": 0.1493, + "step": 22730 + }, + { + "epoch": 2.8634316843530563, + "grad_norm": 0.2153797596693039, + "learning_rate": 1.885640629143942e-06, + "loss": 0.1409, + "step": 22735 + }, + { + "epoch": 2.8640614667632334, + "grad_norm": 0.19790925085544586, + "learning_rate": 1.868305701661932e-06, + "loss": 0.1596, + "step": 22740 + }, + { + "epoch": 2.8646912491734104, + "grad_norm": 0.18141327798366547, + "learning_rate": 1.8510503242834263e-06, + "loss": 0.1459, + "step": 22745 + }, + { + "epoch": 2.865321031583588, + "grad_norm": 0.20295578241348267, + "learning_rate": 1.833874506274996e-06, + "loss": 0.1485, + "step": 22750 + }, + { + "epoch": 2.865950813993765, + "grad_norm": 0.20226307213306427, + "learning_rate": 1.8167782568604127e-06, + "loss": 0.1507, + "step": 22755 + }, + { + "epoch": 2.8665805964039426, + "grad_norm": 0.19584356248378754, + "learning_rate": 1.7997615852207825e-06, + "loss": 0.1526, + "step": 22760 + }, + { + "epoch": 2.8672103788141197, + "grad_norm": 0.17093075811862946, + "learning_rate": 1.7828245004944286e-06, + "loss": 0.1481, + "step": 22765 + }, + { + "epoch": 2.867840161224297, + "grad_norm": 0.17845821380615234, + "learning_rate": 1.7659670117769587e-06, + "loss": 0.1459, + "step": 22770 + }, + { + "epoch": 2.8684699436344743, + "grad_norm": 0.1874646097421646, + "learning_rate": 1.749189128121231e-06, + "loss": 0.1493, + "step": 22775 + }, + { + "epoch": 2.8690997260446514, + "grad_norm": 0.18263909220695496, + "learning_rate": 1.7324908585373387e-06, + "loss": 0.1438, + "step": 22780 + }, + { + "epoch": 2.869729508454829, + "grad_norm": 0.1789528727531433, + "learning_rate": 1.7158722119926583e-06, + "loss": 0.1476, + "step": 22785 + }, + { + "epoch": 2.8703592908650064, + "grad_norm": 0.21683086454868317, + "learning_rate": 1.6993331974117508e-06, + "loss": 0.1499, + "step": 22790 + }, + { + "epoch": 2.8709890732751835, + "grad_norm": 0.24182718992233276, + "learning_rate": 1.6828738236764617e-06, + "loss": 0.1541, + "step": 22795 + }, + { + "epoch": 2.8716188556853606, + "grad_norm": 0.21868962049484253, + "learning_rate": 1.6664940996258702e-06, + "loss": 0.1471, + "step": 22800 + }, + { + "epoch": 2.872248638095538, + "grad_norm": 0.1993272453546524, + "learning_rate": 1.6501940340562236e-06, + "loss": 0.1526, + "step": 22805 + }, + { + "epoch": 2.872878420505715, + "grad_norm": 0.201304093003273, + "learning_rate": 1.6339736357210697e-06, + "loss": 0.1516, + "step": 22810 + }, + { + "epoch": 2.8735082029158927, + "grad_norm": 0.25056761503219604, + "learning_rate": 1.6178329133310908e-06, + "loss": 0.151, + "step": 22815 + }, + { + "epoch": 2.87413798532607, + "grad_norm": 0.19561152160167694, + "learning_rate": 1.6017718755542696e-06, + "loss": 0.143, + "step": 22820 + }, + { + "epoch": 2.8747677677362473, + "grad_norm": 0.22097674012184143, + "learning_rate": 1.5857905310157071e-06, + "loss": 0.1512, + "step": 22825 + }, + { + "epoch": 2.8753975501464244, + "grad_norm": 0.21212686598300934, + "learning_rate": 1.5698888882977712e-06, + "loss": 0.1541, + "step": 22830 + }, + { + "epoch": 2.8760273325566015, + "grad_norm": 0.20324502885341644, + "learning_rate": 1.5540669559399977e-06, + "loss": 0.1533, + "step": 22835 + }, + { + "epoch": 2.876657114966779, + "grad_norm": 0.169882133603096, + "learning_rate": 1.5383247424391564e-06, + "loss": 0.1406, + "step": 22840 + }, + { + "epoch": 2.8772868973769565, + "grad_norm": 0.23402316868305206, + "learning_rate": 1.5226622562491352e-06, + "loss": 0.1569, + "step": 22845 + }, + { + "epoch": 2.8779166797871336, + "grad_norm": 0.17247354984283447, + "learning_rate": 1.5070795057810559e-06, + "loss": 0.1432, + "step": 22850 + }, + { + "epoch": 2.8785464621973107, + "grad_norm": 0.21733173727989197, + "learning_rate": 1.4915764994032409e-06, + "loss": 0.1523, + "step": 22855 + }, + { + "epoch": 2.879176244607488, + "grad_norm": 0.19319911301136017, + "learning_rate": 1.4761532454411306e-06, + "loss": 0.1391, + "step": 22860 + }, + { + "epoch": 2.8798060270176653, + "grad_norm": 0.19645391404628754, + "learning_rate": 1.4608097521773664e-06, + "loss": 0.1499, + "step": 22865 + }, + { + "epoch": 2.880435809427843, + "grad_norm": 0.18772046267986298, + "learning_rate": 1.4455460278517572e-06, + "loss": 0.1483, + "step": 22870 + }, + { + "epoch": 2.88106559183802, + "grad_norm": 0.22282320261001587, + "learning_rate": 1.4303620806612792e-06, + "loss": 0.1468, + "step": 22875 + }, + { + "epoch": 2.8816953742481974, + "grad_norm": 0.19287440180778503, + "learning_rate": 1.4152579187600599e-06, + "loss": 0.1495, + "step": 22880 + }, + { + "epoch": 2.8823251566583745, + "grad_norm": 0.1981481909751892, + "learning_rate": 1.400233550259361e-06, + "loss": 0.1506, + "step": 22885 + }, + { + "epoch": 2.8829549390685516, + "grad_norm": 0.21331623196601868, + "learning_rate": 1.385288983227628e-06, + "loss": 0.1483, + "step": 22890 + }, + { + "epoch": 2.883584721478729, + "grad_norm": 0.20138582587242126, + "learning_rate": 1.3704242256904252e-06, + "loss": 0.1504, + "step": 22895 + }, + { + "epoch": 2.8842145038889067, + "grad_norm": 0.18493309617042542, + "learning_rate": 1.3556392856304831e-06, + "loss": 0.1539, + "step": 22900 + }, + { + "epoch": 2.8848442862990837, + "grad_norm": 0.22465452551841736, + "learning_rate": 1.3409341709876343e-06, + "loss": 0.1511, + "step": 22905 + }, + { + "epoch": 2.885474068709261, + "grad_norm": 0.19788892567157745, + "learning_rate": 1.326308889658878e-06, + "loss": 0.1504, + "step": 22910 + }, + { + "epoch": 2.8861038511194383, + "grad_norm": 0.20821528136730194, + "learning_rate": 1.3117634494982986e-06, + "loss": 0.1511, + "step": 22915 + }, + { + "epoch": 2.8867336335296154, + "grad_norm": 0.24520978331565857, + "learning_rate": 1.2972978583171644e-06, + "loss": 0.1503, + "step": 22920 + }, + { + "epoch": 2.887363415939793, + "grad_norm": 0.18331633508205414, + "learning_rate": 1.2829121238837947e-06, + "loss": 0.1485, + "step": 22925 + }, + { + "epoch": 2.88799319834997, + "grad_norm": 0.23501911759376526, + "learning_rate": 1.2686062539236762e-06, + "loss": 0.1452, + "step": 22930 + }, + { + "epoch": 2.8886229807601476, + "grad_norm": 0.19778122007846832, + "learning_rate": 1.2543802561193806e-06, + "loss": 0.1523, + "step": 22935 + }, + { + "epoch": 2.8892527631703246, + "grad_norm": 0.21170009672641754, + "learning_rate": 1.2402341381105962e-06, + "loss": 0.1627, + "step": 22940 + }, + { + "epoch": 2.8898825455805017, + "grad_norm": 0.2205863893032074, + "learning_rate": 1.22616790749413e-06, + "loss": 0.1507, + "step": 22945 + }, + { + "epoch": 2.8905123279906793, + "grad_norm": 0.1905989944934845, + "learning_rate": 1.2121815718238393e-06, + "loss": 0.15, + "step": 22950 + }, + { + "epoch": 2.891142110400857, + "grad_norm": 0.19767914712429047, + "learning_rate": 1.1982751386107159e-06, + "loss": 0.1499, + "step": 22955 + }, + { + "epoch": 2.891771892811034, + "grad_norm": 0.23298701643943787, + "learning_rate": 1.1844486153228361e-06, + "loss": 0.1507, + "step": 22960 + }, + { + "epoch": 2.892401675221211, + "grad_norm": 0.17696991562843323, + "learning_rate": 1.1707020093853602e-06, + "loss": 0.1458, + "step": 22965 + }, + { + "epoch": 2.8930314576313885, + "grad_norm": 0.22472181916236877, + "learning_rate": 1.1570353281805334e-06, + "loss": 0.154, + "step": 22970 + }, + { + "epoch": 2.8936612400415656, + "grad_norm": 0.18597114086151123, + "learning_rate": 1.1434485790476512e-06, + "loss": 0.1531, + "step": 22975 + }, + { + "epoch": 2.894291022451743, + "grad_norm": 0.20187944173812866, + "learning_rate": 1.1299417692831436e-06, + "loss": 0.1558, + "step": 22980 + }, + { + "epoch": 2.89492080486192, + "grad_norm": 0.19914616644382477, + "learning_rate": 1.1165149061404422e-06, + "loss": 0.1418, + "step": 22985 + }, + { + "epoch": 2.8955505872720977, + "grad_norm": 0.2554416060447693, + "learning_rate": 1.1031679968301122e-06, + "loss": 0.1556, + "step": 22990 + }, + { + "epoch": 2.8961803696822748, + "grad_norm": 0.2221318930387497, + "learning_rate": 1.08990104851972e-06, + "loss": 0.1523, + "step": 22995 + }, + { + "epoch": 2.896810152092452, + "grad_norm": 0.2058124542236328, + "learning_rate": 1.0767140683339336e-06, + "loss": 0.1403, + "step": 23000 + }, + { + "epoch": 2.896810152092452, + "eval_loss": 0.36229029297828674, + "eval_runtime": 6.1614, + "eval_samples_per_second": 162.302, + "eval_steps_per_second": 10.225, + "step": 23000 + }, + { + "epoch": 2.8974399345026294, + "grad_norm": 0.2248660773038864, + "learning_rate": 1.0636070633544547e-06, + "loss": 0.1514, + "step": 23005 + }, + { + "epoch": 2.8980697169128065, + "grad_norm": 0.1835104525089264, + "learning_rate": 1.0505800406200526e-06, + "loss": 0.1405, + "step": 23010 + }, + { + "epoch": 2.898699499322984, + "grad_norm": 0.22200733423233032, + "learning_rate": 1.0376330071265482e-06, + "loss": 0.1503, + "step": 23015 + }, + { + "epoch": 2.899329281733161, + "grad_norm": 0.18144001066684723, + "learning_rate": 1.024765969826763e-06, + "loss": 0.1516, + "step": 23020 + }, + { + "epoch": 2.8999590641433386, + "grad_norm": 0.17609558999538422, + "learning_rate": 1.0119789356306196e-06, + "loss": 0.1517, + "step": 23025 + }, + { + "epoch": 2.9005888465535157, + "grad_norm": 0.2412068098783493, + "learning_rate": 9.99271911405025e-07, + "loss": 0.157, + "step": 23030 + }, + { + "epoch": 2.901218628963693, + "grad_norm": 0.19257797300815582, + "learning_rate": 9.866449039739544e-07, + "loss": 0.1486, + "step": 23035 + }, + { + "epoch": 2.9018484113738703, + "grad_norm": 0.22018341720104218, + "learning_rate": 9.74097920118383e-07, + "loss": 0.1517, + "step": 23040 + }, + { + "epoch": 2.902478193784048, + "grad_norm": 0.19569897651672363, + "learning_rate": 9.616309665763544e-07, + "loss": 0.144, + "step": 23045 + }, + { + "epoch": 2.903107976194225, + "grad_norm": 0.20541365444660187, + "learning_rate": 9.492440500428966e-07, + "loss": 0.1535, + "step": 23050 + }, + { + "epoch": 2.903737758604402, + "grad_norm": 0.1934703141450882, + "learning_rate": 9.369371771700552e-07, + "loss": 0.1369, + "step": 23055 + }, + { + "epoch": 2.9043675410145795, + "grad_norm": 0.16967949271202087, + "learning_rate": 9.247103545669266e-07, + "loss": 0.1447, + "step": 23060 + }, + { + "epoch": 2.9049973234247566, + "grad_norm": 0.15972602367401123, + "learning_rate": 9.125635887995586e-07, + "loss": 0.1484, + "step": 23065 + }, + { + "epoch": 2.905627105834934, + "grad_norm": 0.21369343996047974, + "learning_rate": 9.004968863910667e-07, + "loss": 0.153, + "step": 23070 + }, + { + "epoch": 2.906256888245111, + "grad_norm": 0.18001650273799896, + "learning_rate": 8.885102538215338e-07, + "loss": 0.1499, + "step": 23075 + }, + { + "epoch": 2.9068866706552887, + "grad_norm": 0.20539362728595734, + "learning_rate": 8.766036975280777e-07, + "loss": 0.1495, + "step": 23080 + }, + { + "epoch": 2.907516453065466, + "grad_norm": 0.2048049122095108, + "learning_rate": 8.647772239047667e-07, + "loss": 0.1477, + "step": 23085 + }, + { + "epoch": 2.9081462354756433, + "grad_norm": 0.2018105536699295, + "learning_rate": 8.530308393027041e-07, + "loss": 0.1457, + "step": 23090 + }, + { + "epoch": 2.9087760178858204, + "grad_norm": 0.21647921204566956, + "learning_rate": 8.413645500299437e-07, + "loss": 0.1511, + "step": 23095 + }, + { + "epoch": 2.909405800295998, + "grad_norm": 0.19362643361091614, + "learning_rate": 8.297783623515741e-07, + "loss": 0.1491, + "step": 23100 + }, + { + "epoch": 2.910035582706175, + "grad_norm": 0.18819986283779144, + "learning_rate": 8.182722824896182e-07, + "loss": 0.1482, + "step": 23105 + }, + { + "epoch": 2.910665365116352, + "grad_norm": 0.18341930210590363, + "learning_rate": 8.068463166231332e-07, + "loss": 0.1468, + "step": 23110 + }, + { + "epoch": 2.9112951475265296, + "grad_norm": 0.17555510997772217, + "learning_rate": 7.955004708881107e-07, + "loss": 0.1538, + "step": 23115 + }, + { + "epoch": 2.9119249299367067, + "grad_norm": 0.1862919181585312, + "learning_rate": 7.842347513775271e-07, + "loss": 0.1485, + "step": 23120 + }, + { + "epoch": 2.9125547123468842, + "grad_norm": 0.23576001822948456, + "learning_rate": 7.730491641413262e-07, + "loss": 0.15, + "step": 23125 + }, + { + "epoch": 2.9131844947570613, + "grad_norm": 0.20176522433757782, + "learning_rate": 7.619437151864194e-07, + "loss": 0.1528, + "step": 23130 + }, + { + "epoch": 2.913814277167239, + "grad_norm": 0.1988651603460312, + "learning_rate": 7.50918410476703e-07, + "loss": 0.1513, + "step": 23135 + }, + { + "epoch": 2.914444059577416, + "grad_norm": 0.19462084770202637, + "learning_rate": 7.399732559330074e-07, + "loss": 0.1471, + "step": 23140 + }, + { + "epoch": 2.9150738419875935, + "grad_norm": 0.2095441371202469, + "learning_rate": 7.291082574331309e-07, + "loss": 0.1488, + "step": 23145 + }, + { + "epoch": 2.9157036243977705, + "grad_norm": 0.19712376594543457, + "learning_rate": 7.18323420811856e-07, + "loss": 0.146, + "step": 23150 + }, + { + "epoch": 2.916333406807948, + "grad_norm": 0.20228298008441925, + "learning_rate": 7.076187518608168e-07, + "loss": 0.1408, + "step": 23155 + }, + { + "epoch": 2.916963189218125, + "grad_norm": 0.21097783744335175, + "learning_rate": 6.969942563287311e-07, + "loss": 0.1463, + "step": 23160 + }, + { + "epoch": 2.9175929716283022, + "grad_norm": 0.1911788433790207, + "learning_rate": 6.864499399211687e-07, + "loss": 0.1359, + "step": 23165 + }, + { + "epoch": 2.9182227540384797, + "grad_norm": 0.19333137571811676, + "learning_rate": 6.759858083006831e-07, + "loss": 0.1521, + "step": 23170 + }, + { + "epoch": 2.918852536448657, + "grad_norm": 0.20187996327877045, + "learning_rate": 6.656018670867125e-07, + "loss": 0.1391, + "step": 23175 + }, + { + "epoch": 2.9194823188588344, + "grad_norm": 0.26705697178840637, + "learning_rate": 6.55298121855713e-07, + "loss": 0.1531, + "step": 23180 + }, + { + "epoch": 2.9201121012690114, + "grad_norm": 0.2374356985092163, + "learning_rate": 6.450745781410249e-07, + "loss": 0.1604, + "step": 23185 + }, + { + "epoch": 2.920741883679189, + "grad_norm": 0.18227587640285492, + "learning_rate": 6.349312414329067e-07, + "loss": 0.1486, + "step": 23190 + }, + { + "epoch": 2.921371666089366, + "grad_norm": 0.22778551280498505, + "learning_rate": 6.248681171785675e-07, + "loss": 0.1475, + "step": 23195 + }, + { + "epoch": 2.922001448499543, + "grad_norm": 0.2099718153476715, + "learning_rate": 6.148852107821511e-07, + "loss": 0.1442, + "step": 23200 + }, + { + "epoch": 2.9226312309097207, + "grad_norm": 0.19987498223781586, + "learning_rate": 6.04982527604686e-07, + "loss": 0.1504, + "step": 23205 + }, + { + "epoch": 2.923261013319898, + "grad_norm": 0.1993655115365982, + "learning_rate": 5.951600729641515e-07, + "loss": 0.154, + "step": 23210 + }, + { + "epoch": 2.9238907957300753, + "grad_norm": 0.19234336912631989, + "learning_rate": 5.854178521354113e-07, + "loss": 0.1531, + "step": 23215 + }, + { + "epoch": 2.9245205781402523, + "grad_norm": 0.20532859861850739, + "learning_rate": 5.757558703502973e-07, + "loss": 0.1522, + "step": 23220 + }, + { + "epoch": 2.92515036055043, + "grad_norm": 0.23982007801532745, + "learning_rate": 5.661741327974755e-07, + "loss": 0.15, + "step": 23225 + }, + { + "epoch": 2.925780142960607, + "grad_norm": 0.18457266688346863, + "learning_rate": 5.5667264462258e-07, + "loss": 0.1457, + "step": 23230 + }, + { + "epoch": 2.9264099253707845, + "grad_norm": 0.1986248642206192, + "learning_rate": 5.472514109281123e-07, + "loss": 0.145, + "step": 23235 + }, + { + "epoch": 2.9270397077809616, + "grad_norm": 0.2265506535768509, + "learning_rate": 5.379104367735087e-07, + "loss": 0.147, + "step": 23240 + }, + { + "epoch": 2.927669490191139, + "grad_norm": 0.21421676874160767, + "learning_rate": 5.286497271750733e-07, + "loss": 0.1492, + "step": 23245 + }, + { + "epoch": 2.928299272601316, + "grad_norm": 0.1608610451221466, + "learning_rate": 5.19469287106028e-07, + "loss": 0.1339, + "step": 23250 + }, + { + "epoch": 2.9289290550114933, + "grad_norm": 0.18140849471092224, + "learning_rate": 5.103691214964789e-07, + "loss": 0.1469, + "step": 23255 + }, + { + "epoch": 2.929558837421671, + "grad_norm": 0.21651338040828705, + "learning_rate": 5.013492352334003e-07, + "loss": 0.1596, + "step": 23260 + }, + { + "epoch": 2.9301886198318483, + "grad_norm": 0.18771570920944214, + "learning_rate": 4.924096331607008e-07, + "loss": 0.1482, + "step": 23265 + }, + { + "epoch": 2.9308184022420254, + "grad_norm": 0.2401566356420517, + "learning_rate": 4.835503200791402e-07, + "loss": 0.1557, + "step": 23270 + }, + { + "epoch": 2.9314481846522025, + "grad_norm": 0.22064423561096191, + "learning_rate": 4.747713007463627e-07, + "loss": 0.1621, + "step": 23275 + }, + { + "epoch": 2.93207796706238, + "grad_norm": 0.18799829483032227, + "learning_rate": 4.660725798769305e-07, + "loss": 0.1517, + "step": 23280 + }, + { + "epoch": 2.932707749472557, + "grad_norm": 0.24584966897964478, + "learning_rate": 4.574541621422401e-07, + "loss": 0.1563, + "step": 23285 + }, + { + "epoch": 2.9333375318827346, + "grad_norm": 0.18013089895248413, + "learning_rate": 4.489160521705726e-07, + "loss": 0.1388, + "step": 23290 + }, + { + "epoch": 2.9339673142929117, + "grad_norm": 0.20351989567279816, + "learning_rate": 4.404582545470936e-07, + "loss": 0.1451, + "step": 23295 + }, + { + "epoch": 2.934597096703089, + "grad_norm": 0.17512726783752441, + "learning_rate": 4.3208077381383655e-07, + "loss": 0.1413, + "step": 23300 + }, + { + "epoch": 2.9352268791132663, + "grad_norm": 0.2058653086423874, + "learning_rate": 4.2378361446970267e-07, + "loss": 0.1578, + "step": 23305 + }, + { + "epoch": 2.9358566615234434, + "grad_norm": 0.23548051714897156, + "learning_rate": 4.155667809704444e-07, + "loss": 0.154, + "step": 23310 + }, + { + "epoch": 2.936486443933621, + "grad_norm": 0.19606271386146545, + "learning_rate": 4.074302777286986e-07, + "loss": 0.1523, + "step": 23315 + }, + { + "epoch": 2.9371162263437984, + "grad_norm": 0.19439321756362915, + "learning_rate": 3.993741091139369e-07, + "loss": 0.1447, + "step": 23320 + }, + { + "epoch": 2.9377460087539755, + "grad_norm": 0.20347769558429718, + "learning_rate": 3.9139827945253167e-07, + "loss": 0.1466, + "step": 23325 + }, + { + "epoch": 2.9383757911641526, + "grad_norm": 0.23724155128002167, + "learning_rate": 3.835027930276735e-07, + "loss": 0.1597, + "step": 23330 + }, + { + "epoch": 2.93900557357433, + "grad_norm": 0.1813487559556961, + "learning_rate": 3.7568765407940406e-07, + "loss": 0.1358, + "step": 23335 + }, + { + "epoch": 2.939635355984507, + "grad_norm": 0.21012306213378906, + "learning_rate": 3.679528668046494e-07, + "loss": 0.1508, + "step": 23340 + }, + { + "epoch": 2.9402651383946847, + "grad_norm": 0.21550029516220093, + "learning_rate": 3.602984353571703e-07, + "loss": 0.1449, + "step": 23345 + }, + { + "epoch": 2.940894920804862, + "grad_norm": 0.203638955950737, + "learning_rate": 3.5272436384756186e-07, + "loss": 0.1491, + "step": 23350 + }, + { + "epoch": 2.9415247032150393, + "grad_norm": 0.21173645555973053, + "learning_rate": 3.452306563432872e-07, + "loss": 0.1553, + "step": 23355 + }, + { + "epoch": 2.9421544856252164, + "grad_norm": 0.22810731828212738, + "learning_rate": 3.3781731686861047e-07, + "loss": 0.1606, + "step": 23360 + }, + { + "epoch": 2.9427842680353935, + "grad_norm": 0.19044247269630432, + "learning_rate": 3.3048434940469713e-07, + "loss": 0.1421, + "step": 23365 + }, + { + "epoch": 2.943414050445571, + "grad_norm": 0.21632073819637299, + "learning_rate": 3.232317578894805e-07, + "loss": 0.1473, + "step": 23370 + }, + { + "epoch": 2.9440438328557486, + "grad_norm": 0.17703349888324738, + "learning_rate": 3.160595462178117e-07, + "loss": 0.1503, + "step": 23375 + }, + { + "epoch": 2.9446736152659256, + "grad_norm": 0.19983936846256256, + "learning_rate": 3.089677182412931e-07, + "loss": 0.1427, + "step": 23380 + }, + { + "epoch": 2.9453033976761027, + "grad_norm": 0.18913906812667847, + "learning_rate": 3.019562777684115e-07, + "loss": 0.1443, + "step": 23385 + }, + { + "epoch": 2.9459331800862802, + "grad_norm": 0.2024787813425064, + "learning_rate": 2.950252285644883e-07, + "loss": 0.1501, + "step": 23390 + }, + { + "epoch": 2.9465629624964573, + "grad_norm": 0.22307011485099792, + "learning_rate": 2.8817457435164614e-07, + "loss": 0.1526, + "step": 23395 + }, + { + "epoch": 2.947192744906635, + "grad_norm": 0.23350244760513306, + "learning_rate": 2.814043188088255e-07, + "loss": 0.1583, + "step": 23400 + }, + { + "epoch": 2.947822527316812, + "grad_norm": 0.18705366551876068, + "learning_rate": 2.7471446557181807e-07, + "loss": 0.1515, + "step": 23405 + }, + { + "epoch": 2.9484523097269895, + "grad_norm": 0.18902996182441711, + "learning_rate": 2.681050182332334e-07, + "loss": 0.1489, + "step": 23410 + }, + { + "epoch": 2.9490820921371665, + "grad_norm": 0.18764075636863708, + "learning_rate": 2.6157598034249885e-07, + "loss": 0.1519, + "step": 23415 + }, + { + "epoch": 2.9497118745473436, + "grad_norm": 0.20529431104660034, + "learning_rate": 2.5512735540584305e-07, + "loss": 0.1504, + "step": 23420 + }, + { + "epoch": 2.950341656957521, + "grad_norm": 0.21828597784042358, + "learning_rate": 2.487591468863293e-07, + "loss": 0.147, + "step": 23425 + }, + { + "epoch": 2.9509714393676987, + "grad_norm": 0.2224801480770111, + "learning_rate": 2.424713582038551e-07, + "loss": 0.1542, + "step": 23430 + }, + { + "epoch": 2.9516012217778758, + "grad_norm": 0.1950775682926178, + "learning_rate": 2.3626399273506957e-07, + "loss": 0.1462, + "step": 23435 + }, + { + "epoch": 2.952231004188053, + "grad_norm": 0.17124883830547333, + "learning_rate": 2.3013705381348946e-07, + "loss": 0.1374, + "step": 23440 + }, + { + "epoch": 2.9528607865982304, + "grad_norm": 0.20156201720237732, + "learning_rate": 2.2409054472941613e-07, + "loss": 0.1471, + "step": 23445 + }, + { + "epoch": 2.9534905690084075, + "grad_norm": 0.19449764490127563, + "learning_rate": 2.1812446872995214e-07, + "loss": 0.158, + "step": 23450 + }, + { + "epoch": 2.954120351418585, + "grad_norm": 0.18049843609333038, + "learning_rate": 2.1223882901905132e-07, + "loss": 0.1518, + "step": 23455 + }, + { + "epoch": 2.954750133828762, + "grad_norm": 0.19583791494369507, + "learning_rate": 2.06433628757402e-07, + "loss": 0.1556, + "step": 23460 + }, + { + "epoch": 2.9553799162389396, + "grad_norm": 0.1842242181301117, + "learning_rate": 2.0070887106254373e-07, + "loss": 0.1494, + "step": 23465 + }, + { + "epoch": 2.9560096986491167, + "grad_norm": 0.22290416061878204, + "learning_rate": 1.950645590088007e-07, + "loss": 0.1454, + "step": 23470 + }, + { + "epoch": 2.9566394810592938, + "grad_norm": 0.18742886185646057, + "learning_rate": 1.895006956272982e-07, + "loss": 0.1432, + "step": 23475 + }, + { + "epoch": 2.9572692634694713, + "grad_norm": 0.181674525141716, + "learning_rate": 1.8401728390594617e-07, + "loss": 0.1534, + "step": 23480 + }, + { + "epoch": 2.957899045879649, + "grad_norm": 0.2029273808002472, + "learning_rate": 1.786143267894724e-07, + "loss": 0.1515, + "step": 23485 + }, + { + "epoch": 2.958528828289826, + "grad_norm": 0.23291803896427155, + "learning_rate": 1.7329182717940594e-07, + "loss": 0.144, + "step": 23490 + }, + { + "epoch": 2.959158610700003, + "grad_norm": 0.24005140364170074, + "learning_rate": 1.6804978793401036e-07, + "loss": 0.1508, + "step": 23495 + }, + { + "epoch": 2.9597883931101805, + "grad_norm": 0.18763796985149384, + "learning_rate": 1.6288821186841716e-07, + "loss": 0.1477, + "step": 23500 + }, + { + "epoch": 2.9604181755203576, + "grad_norm": 0.18949641287326813, + "learning_rate": 1.578071017544924e-07, + "loss": 0.1501, + "step": 23505 + }, + { + "epoch": 2.961047957930535, + "grad_norm": 0.21106213331222534, + "learning_rate": 1.5280646032092003e-07, + "loss": 0.152, + "step": 23510 + }, + { + "epoch": 2.961677740340712, + "grad_norm": 0.231742724776268, + "learning_rate": 1.4788629025313526e-07, + "loss": 0.1546, + "step": 23515 + }, + { + "epoch": 2.9623075227508897, + "grad_norm": 0.21462422609329224, + "learning_rate": 1.430465941934078e-07, + "loss": 0.149, + "step": 23520 + }, + { + "epoch": 2.962937305161067, + "grad_norm": 0.18480440974235535, + "learning_rate": 1.382873747407587e-07, + "loss": 0.1486, + "step": 23525 + }, + { + "epoch": 2.963567087571244, + "grad_norm": 0.18864907324314117, + "learning_rate": 1.3360863445097682e-07, + "loss": 0.1556, + "step": 23530 + }, + { + "epoch": 2.9641968699814214, + "grad_norm": 0.17919088900089264, + "learning_rate": 1.2901037583668562e-07, + "loss": 0.1435, + "step": 23535 + }, + { + "epoch": 2.964826652391599, + "grad_norm": 0.22338031232357025, + "learning_rate": 1.2449260136722649e-07, + "loss": 0.1538, + "step": 23540 + }, + { + "epoch": 2.965456434801776, + "grad_norm": 0.23020599782466888, + "learning_rate": 1.200553134687754e-07, + "loss": 0.1559, + "step": 23545 + }, + { + "epoch": 2.966086217211953, + "grad_norm": 0.20723643898963928, + "learning_rate": 1.1569851452422618e-07, + "loss": 0.1415, + "step": 23550 + }, + { + "epoch": 2.9667159996221306, + "grad_norm": 0.2118474692106247, + "learning_rate": 1.1142220687330727e-07, + "loss": 0.1567, + "step": 23555 + }, + { + "epoch": 2.9673457820323077, + "grad_norm": 0.20279090106487274, + "learning_rate": 1.0722639281246503e-07, + "loss": 0.145, + "step": 23560 + }, + { + "epoch": 2.9679755644424852, + "grad_norm": 0.21842657029628754, + "learning_rate": 1.0311107459498035e-07, + "loss": 0.1557, + "step": 23565 + }, + { + "epoch": 2.9686053468526623, + "grad_norm": 0.2776351571083069, + "learning_rate": 9.90762544308521e-08, + "loss": 0.1638, + "step": 23570 + }, + { + "epoch": 2.96923512926284, + "grad_norm": 0.19698885083198547, + "learning_rate": 9.512193448686367e-08, + "loss": 0.1457, + "step": 23575 + }, + { + "epoch": 2.969864911673017, + "grad_norm": 0.1835564374923706, + "learning_rate": 9.124811688659972e-08, + "loss": 0.1569, + "step": 23580 + }, + { + "epoch": 2.970494694083194, + "grad_norm": 0.20081757009029388, + "learning_rate": 8.745480371036284e-08, + "loss": 0.1451, + "step": 23585 + }, + { + "epoch": 2.9711244764933715, + "grad_norm": 0.2095508873462677, + "learning_rate": 8.37419969952735e-08, + "loss": 0.1461, + "step": 23590 + }, + { + "epoch": 2.971754258903549, + "grad_norm": 0.24606873095035553, + "learning_rate": 8.010969873517015e-08, + "loss": 0.1555, + "step": 23595 + }, + { + "epoch": 2.972384041313726, + "grad_norm": 0.2295389175415039, + "learning_rate": 7.65579108806924e-08, + "loss": 0.1534, + "step": 23600 + }, + { + "epoch": 2.973013823723903, + "grad_norm": 0.217758446931839, + "learning_rate": 7.308663533924786e-08, + "loss": 0.1647, + "step": 23605 + }, + { + "epoch": 2.9736436061340807, + "grad_norm": 0.21592317521572113, + "learning_rate": 6.969587397496201e-08, + "loss": 0.1482, + "step": 23610 + }, + { + "epoch": 2.974273388544258, + "grad_norm": 0.2219853401184082, + "learning_rate": 6.638562860876162e-08, + "loss": 0.151, + "step": 23615 + }, + { + "epoch": 2.9749031709544353, + "grad_norm": 0.21385011076927185, + "learning_rate": 6.315590101832468e-08, + "loss": 0.1477, + "step": 23620 + }, + { + "epoch": 2.9755329533646124, + "grad_norm": 0.18287068605422974, + "learning_rate": 6.000669293808048e-08, + "loss": 0.1478, + "step": 23625 + }, + { + "epoch": 2.97616273577479, + "grad_norm": 0.23458221554756165, + "learning_rate": 5.693800605924281e-08, + "loss": 0.1521, + "step": 23630 + }, + { + "epoch": 2.976792518184967, + "grad_norm": 0.2393937110900879, + "learning_rate": 5.394984202976016e-08, + "loss": 0.1443, + "step": 23635 + }, + { + "epoch": 2.977422300595144, + "grad_norm": 0.2523866891860962, + "learning_rate": 5.104220245434887e-08, + "loss": 0.16, + "step": 23640 + }, + { + "epoch": 2.9780520830053216, + "grad_norm": 0.20749832689762115, + "learning_rate": 4.821508889445991e-08, + "loss": 0.1493, + "step": 23645 + }, + { + "epoch": 2.978681865415499, + "grad_norm": 0.20586150884628296, + "learning_rate": 4.546850286834547e-08, + "loss": 0.1425, + "step": 23650 + }, + { + "epoch": 2.9793116478256763, + "grad_norm": 0.20742247998714447, + "learning_rate": 4.2802445850959046e-08, + "loss": 0.1556, + "step": 23655 + }, + { + "epoch": 2.9799414302358533, + "grad_norm": 0.178171768784523, + "learning_rate": 4.0216919274038696e-08, + "loss": 0.1532, + "step": 23660 + }, + { + "epoch": 2.980571212646031, + "grad_norm": 0.24389080703258514, + "learning_rate": 3.771192452607374e-08, + "loss": 0.1551, + "step": 23665 + }, + { + "epoch": 2.981200995056208, + "grad_norm": 0.18905188143253326, + "learning_rate": 3.528746295232143e-08, + "loss": 0.144, + "step": 23670 + }, + { + "epoch": 2.9818307774663855, + "grad_norm": 0.258633017539978, + "learning_rate": 3.2943535854756956e-08, + "loss": 0.1624, + "step": 23675 + }, + { + "epoch": 2.9824605598765626, + "grad_norm": 0.19491221010684967, + "learning_rate": 3.0680144492123416e-08, + "loss": 0.1429, + "step": 23680 + }, + { + "epoch": 2.98309034228674, + "grad_norm": 0.19454774260520935, + "learning_rate": 2.8497290079898537e-08, + "loss": 0.1439, + "step": 23685 + }, + { + "epoch": 2.983720124696917, + "grad_norm": 0.21658724546432495, + "learning_rate": 2.6394973790361262e-08, + "loss": 0.1397, + "step": 23690 + }, + { + "epoch": 2.9843499071070942, + "grad_norm": 0.18729209899902344, + "learning_rate": 2.4373196752475177e-08, + "loss": 0.1494, + "step": 23695 + }, + { + "epoch": 2.9849796895172718, + "grad_norm": 0.19640378654003143, + "learning_rate": 2.243196005198844e-08, + "loss": 0.1497, + "step": 23700 + }, + { + "epoch": 2.9856094719274493, + "grad_norm": 0.20427659153938293, + "learning_rate": 2.0571264731383817e-08, + "loss": 0.1533, + "step": 23705 + }, + { + "epoch": 2.9862392543376264, + "grad_norm": 0.217566579580307, + "learning_rate": 1.8791111789911995e-08, + "loss": 0.1539, + "step": 23710 + }, + { + "epoch": 2.9868690367478035, + "grad_norm": 0.2069326490163803, + "learning_rate": 1.7091502183541606e-08, + "loss": 0.1481, + "step": 23715 + }, + { + "epoch": 2.987498819157981, + "grad_norm": 0.17276856303215027, + "learning_rate": 1.5472436825009205e-08, + "loss": 0.1457, + "step": 23720 + }, + { + "epoch": 2.988128601568158, + "grad_norm": 0.21325160562992096, + "learning_rate": 1.3933916583785954e-08, + "loss": 0.1453, + "step": 23725 + }, + { + "epoch": 2.9887583839783356, + "grad_norm": 0.2071818858385086, + "learning_rate": 1.2475942286094275e-08, + "loss": 0.1522, + "step": 23730 + }, + { + "epoch": 2.9893881663885127, + "grad_norm": 0.19544194638729095, + "learning_rate": 1.1098514714891205e-08, + "loss": 0.1427, + "step": 23735 + }, + { + "epoch": 2.99001794879869, + "grad_norm": 0.2268587052822113, + "learning_rate": 9.801634609901688e-09, + "loss": 0.1574, + "step": 23740 + }, + { + "epoch": 2.9906477312088673, + "grad_norm": 0.1800483763217926, + "learning_rate": 8.585302667585281e-09, + "loss": 0.1491, + "step": 23745 + }, + { + "epoch": 2.9912775136190444, + "grad_norm": 0.1946074366569519, + "learning_rate": 7.449519541119498e-09, + "loss": 0.1504, + "step": 23750 + }, + { + "epoch": 2.991907296029222, + "grad_norm": 0.19125299155712128, + "learning_rate": 6.394285840449764e-09, + "loss": 0.148, + "step": 23755 + }, + { + "epoch": 2.9925370784393994, + "grad_norm": 0.18596796691417694, + "learning_rate": 5.419602132272771e-09, + "loss": 0.1418, + "step": 23760 + }, + { + "epoch": 2.9931668608495765, + "grad_norm": 0.20286522805690765, + "learning_rate": 4.525468940003163e-09, + "loss": 0.1425, + "step": 23765 + }, + { + "epoch": 2.9937966432597536, + "grad_norm": 0.1934228241443634, + "learning_rate": 3.7118867438068465e-09, + "loss": 0.1456, + "step": 23770 + }, + { + "epoch": 2.994426425669931, + "grad_norm": 0.19864603877067566, + "learning_rate": 2.9788559806176447e-09, + "loss": 0.141, + "step": 23775 + }, + { + "epoch": 2.995056208080108, + "grad_norm": 0.1812632828950882, + "learning_rate": 2.326377044070682e-09, + "loss": 0.1414, + "step": 23780 + }, + { + "epoch": 2.9956859904902857, + "grad_norm": 0.18837498128414154, + "learning_rate": 1.7544502845856512e-09, + "loss": 0.1466, + "step": 23785 + }, + { + "epoch": 2.996315772900463, + "grad_norm": 0.19479897618293762, + "learning_rate": 1.2630760092668946e-09, + "loss": 0.1434, + "step": 23790 + }, + { + "epoch": 2.9969455553106403, + "grad_norm": 0.2550322413444519, + "learning_rate": 8.522544820199761e-10, + "loss": 0.1527, + "step": 23795 + }, + { + "epoch": 2.9975753377208174, + "grad_norm": 0.22663426399230957, + "learning_rate": 5.21985923451762e-10, + "loss": 0.154, + "step": 23800 + }, + { + "epoch": 2.9982051201309945, + "grad_norm": 0.19043132662773132, + "learning_rate": 2.7227051092038043e-10, + "loss": 0.1413, + "step": 23805 + }, + { + "epoch": 2.998834902541172, + "grad_norm": 0.17332880198955536, + "learning_rate": 1.0310837855187492e-10, + "loss": 0.137, + "step": 23810 + }, + { + "epoch": 2.9994646849513495, + "grad_norm": 0.2147851288318634, + "learning_rate": 1.4499617156937815e-11, + "loss": 0.1501, + "step": 23815 + } + ], + "logging_steps": 5, + "max_steps": 23817, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0258751160588435e+19, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..bc1eadc --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62d10522f577ccde7133c9f6fc2ec8af0e0b4a33068ba484c9e2478c4fd87588 +size 7633