commit c29d8559c06038d0d97b433e63cc4395d76f1053 Author: ModelHub XC Date: Sun May 3 10:17:08 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: rbelanec/train_boolq_42_1776331558 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..c7c19e6 --- /dev/null +++ b/README.md @@ -0,0 +1,81 @@ +--- +library_name: transformers +license: llama3.2 +base_model: meta-llama/Llama-3.2-1B-Instruct +tags: +- peft-factory +- full +- llama-factory +- generated_from_trainer +model-index: +- name: train_boolq_42_1776331558 + results: [] +--- + + + +# train_boolq_42_1776331558 + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on the boolq dataset. +It achieves the following results on the evaluation set: +- Loss: 0.1885 +- Num Input Tokens Seen: 12333600 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 5e-06 +- train_batch_size: 8 +- eval_batch_size: 8 +- seed: 42 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 5 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | Input Tokens Seen | +|:-------------:|:------:|:----:|:---------------:|:-----------------:| +| 0.2277 | 0.2507 | 266 | 0.2505 | 618432 | +| 0.2193 | 0.5014 | 532 | 0.3166 | 1225408 | +| 0.2554 | 0.7521 | 798 | 0.2179 | 1851072 | +| 0.3676 | 1.0028 | 1064 | 0.1885 | 2475808 | +| 0.165 | 1.2535 | 1330 | 0.4608 | 3091552 | +| 0.2207 | 1.5042 | 1596 | 0.3545 | 3699104 | +| 0.1138 | 1.7549 | 1862 | 0.3500 | 4324256 | +| 0.0762 | 2.0057 | 2128 | 0.3345 | 4940992 | +| 0.0898 | 2.2564 | 2394 | 0.4647 | 5558144 | +| 0.0692 | 2.5071 | 2660 | 0.4098 | 6183872 | +| 0.227 | 2.7578 | 2926 | 0.4303 | 6806208 | +| 0.0004 | 3.0085 | 3192 | 0.3937 | 7421856 | +| 0.0 | 3.2592 | 3458 | 0.5191 | 8043744 | +| 0.0002 | 3.5099 | 3724 | 0.4636 | 8660768 | +| 0.0 | 3.7606 | 3990 | 0.5201 | 9286304 | +| 0.0001 | 4.0113 | 4256 | 0.5146 | 9894624 | +| 0.0782 | 4.2620 | 4522 | 0.5548 | 10512416 | +| 0.0 | 4.5127 | 4788 | 0.5418 | 11115040 | +| 0.0 | 4.7634 | 5054 | 0.5422 | 11736672 | + + +### Framework versions + +- Transformers 4.51.3 +- Pytorch 2.10.0+cu128 +- Datasets 4.0.0 +- Tokenizers 0.21.4 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..b409842 --- /dev/null +++ b/all_results.json @@ -0,0 +1,13 @@ +{ + "epoch": 5.0, + "eval_loss": 0.18848362565040588, + "eval_runtime": 2.7208, + "eval_samples_per_second": 346.59, + "eval_steps_per_second": 43.37, + "num_input_tokens_seen": 12333600, + "total_flos": 7.20143693217792e+16, + "train_loss": 0.11108403178044919, + "train_runtime": 1575.5047, + "train_samples_per_second": 26.925, + "train_steps_per_second": 3.367 +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..5a2b93f --- /dev/null +++ b/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/eval_results.json b/eval_results.json new file mode 100644 index 0000000..f0c0fdd --- /dev/null +++ b/eval_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 5.0, + "eval_loss": 0.18848362565040588, + "eval_runtime": 2.7208, + "eval_samples_per_second": 346.59, + "eval_steps_per_second": 43.37, + "num_input_tokens_seen": 12333600 +} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..2b8ae57 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.3" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..aeb02b2 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4da06e50263a9fa89db7141d357687cbad06fe618344dc99d66e34eeb5327e36 +size 4943274328 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..14daf45 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..ddc3ce0 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2069 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/train.yaml b/train.yaml new file mode 100644 index 0000000..797f2bb --- /dev/null +++ b/train.yaml @@ -0,0 +1,55 @@ +seed: 42 + +### model +model_name_or_path: meta-llama/Llama-3.2-1B-Instruct +trust_remote_code: true +flash_attn: auto +use_cache: false + +### method +stage: sft +do_train: true +finetuning_type: full + +### dataset +dataset: boolq +template: llama3 +cutoff_len: 2048 +overwrite_cache: true +preprocessing_num_workers: 4 +dataloader_num_workers: 4 +packing: false + +### output +output_dir: saves_bts_preliminary/base/llama-3.2-1b-instruct/train_boolq_42_1776331558 +logging_steps: 5 +save_steps: 0.05 +overwrite_output_dir: true +save_only_model: false +plot_loss: true +include_num_input_tokens_seen: true +push_to_hub: true +push_to_hub_organization: rbelanec +load_best_model_at_end: true +save_total_limit: 1 + +### train +per_device_train_batch_size: 8 +learning_rate: 5.0e-6 +num_train_epochs: 5 +weight_decay: 1.0e-5 +lr_scheduler_type: cosine +bf16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null +warmup_ratio: 0.1 +optim: adamw_torch +report_to: +- wandb +run_name: base_llama-3.2-1b-instruct_train_boolq_42_1776331558 + +### eval +per_device_eval_batch_size: 8 +eval_strategy: steps +eval_steps: 0.05 +val_size: 0.1 \ No newline at end of file diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..b16b6ec --- /dev/null +++ b/train_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 5.0, + "num_input_tokens_seen": 12333600, + "total_flos": 7.20143693217792e+16, + "train_loss": 0.11108403178044919, + "train_runtime": 1575.5047, + "train_samples_per_second": 26.925, + "train_steps_per_second": 3.367 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000..6ffe2b1 --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,1081 @@ +{"current_steps": 5, "total_steps": 5305, "loss": 0.8967, "lr": 3.766478342749529e-08, "epoch": 0.00471253534401508, "percentage": 0.09, "elapsed_time": "0:00:00", "remaining_time": "0:15:25", "throughput": 12319.81, "total_tokens": 10752} +{"current_steps": 10, "total_steps": 5305, "loss": 0.865, "lr": 8.474576271186442e-08, "epoch": 0.00942507068803016, "percentage": 0.19, "elapsed_time": "0:00:01", "remaining_time": "0:12:19", "throughput": 14852.79, "total_tokens": 20736} +{"current_steps": 15, "total_steps": 5305, "loss": 0.8712, "lr": 1.3182674199623353e-07, "epoch": 0.01413760603204524, "percentage": 0.28, "elapsed_time": "0:00:01", "remaining_time": "0:11:24", "throughput": 16082.47, "total_tokens": 31232} +{"current_steps": 20, "total_steps": 5305, "loss": 0.7056, "lr": 1.7890772128060264e-07, "epoch": 0.01885014137606032, "percentage": 0.38, "elapsed_time": "0:00:02", "remaining_time": "0:10:44", "throughput": 16536.51, "total_tokens": 40320} +{"current_steps": 25, "total_steps": 5305, "loss": 0.5698, "lr": 2.2598870056497177e-07, "epoch": 0.0235626767200754, "percentage": 0.47, "elapsed_time": "0:00:02", "remaining_time": "0:10:30", "throughput": 17139.68, "total_tokens": 51136} +{"current_steps": 30, "total_steps": 5305, "loss": 0.4948, "lr": 2.730696798493409e-07, "epoch": 0.02827521206409048, "percentage": 0.57, "elapsed_time": "0:00:03", "remaining_time": "0:10:28", "throughput": 17731.14, "total_tokens": 63424} +{"current_steps": 35, "total_steps": 5305, "loss": 0.3516, "lr": 3.2015065913371e-07, "epoch": 0.03298774740810556, "percentage": 0.66, "elapsed_time": "0:00:04", "remaining_time": "0:10:25", "throughput": 18135.68, "total_tokens": 75328} +{"current_steps": 40, "total_steps": 5305, "loss": 0.3321, "lr": 3.6723163841807916e-07, "epoch": 0.03770028275212064, "percentage": 0.75, "elapsed_time": "0:00:04", "remaining_time": "0:10:20", "throughput": 18408.17, "total_tokens": 86784} +{"current_steps": 45, "total_steps": 5305, "loss": 0.3329, "lr": 4.1431261770244826e-07, "epoch": 0.04241281809613572, "percentage": 0.85, "elapsed_time": "0:00:05", "remaining_time": "0:10:19", "throughput": 18696.48, "total_tokens": 99136} +{"current_steps": 50, "total_steps": 5305, "loss": 0.4395, "lr": 4.613935969868174e-07, "epoch": 0.0471253534401508, "percentage": 0.94, "elapsed_time": "0:00:05", "remaining_time": "0:10:10", "throughput": 18708.59, "total_tokens": 108672} +{"current_steps": 55, "total_steps": 5305, "loss": 0.4549, "lr": 5.084745762711865e-07, "epoch": 0.051837888784165884, "percentage": 1.04, "elapsed_time": "0:00:06", "remaining_time": "0:10:06", "throughput": 18856.96, "total_tokens": 119808} +{"current_steps": 60, "total_steps": 5305, "loss": 0.3434, "lr": 5.555555555555555e-07, "epoch": 0.05655042412818096, "percentage": 1.13, "elapsed_time": "0:00:06", "remaining_time": "0:10:00", "throughput": 18923.12, "total_tokens": 130048} +{"current_steps": 65, "total_steps": 5305, "loss": 0.3434, "lr": 6.026365348399247e-07, "epoch": 0.061262959472196045, "percentage": 1.23, "elapsed_time": "0:00:07", "remaining_time": "0:10:01", "throughput": 19096.46, "total_tokens": 142464} +{"current_steps": 70, "total_steps": 5305, "loss": 0.3516, "lr": 6.497175141242938e-07, "epoch": 0.06597549481621112, "percentage": 1.32, "elapsed_time": "0:00:08", "remaining_time": "0:09:59", "throughput": 19211.51, "total_tokens": 154048} +{"current_steps": 75, "total_steps": 5305, "loss": 0.3088, "lr": 6.96798493408663e-07, "epoch": 0.0706880301602262, "percentage": 1.41, "elapsed_time": "0:00:08", "remaining_time": "0:10:00", "throughput": 19354.97, "total_tokens": 166720} +{"current_steps": 80, "total_steps": 5305, "loss": 0.3218, "lr": 7.43879472693032e-07, "epoch": 0.07540056550424128, "percentage": 1.51, "elapsed_time": "0:00:09", "remaining_time": "0:10:00", "throughput": 19489.27, "total_tokens": 179200} +{"current_steps": 85, "total_steps": 5305, "loss": 0.3962, "lr": 7.909604519774013e-07, "epoch": 0.08011310084825636, "percentage": 1.6, "elapsed_time": "0:00:09", "remaining_time": "0:09:58", "throughput": 19551.39, "total_tokens": 190464} +{"current_steps": 90, "total_steps": 5305, "loss": 0.3243, "lr": 8.380414312617704e-07, "epoch": 0.08482563619227144, "percentage": 1.7, "elapsed_time": "0:00:10", "remaining_time": "0:10:03", "throughput": 19708.81, "total_tokens": 205376} +{"current_steps": 95, "total_steps": 5305, "loss": 0.383, "lr": 8.851224105461394e-07, "epoch": 0.08953817153628653, "percentage": 1.79, "elapsed_time": "0:00:10", "remaining_time": "0:09:59", "throughput": 19706.64, "total_tokens": 215424} +{"current_steps": 100, "total_steps": 5305, "loss": 0.2987, "lr": 9.322033898305086e-07, "epoch": 0.0942507068803016, "percentage": 1.89, "elapsed_time": "0:00:11", "remaining_time": "0:09:57", "throughput": 19740.77, "total_tokens": 226688} +{"current_steps": 105, "total_steps": 5305, "loss": 0.2859, "lr": 9.792843691148776e-07, "epoch": 0.09896324222431668, "percentage": 1.98, "elapsed_time": "0:00:12", "remaining_time": "0:09:54", "throughput": 19757.7, "total_tokens": 237248} +{"current_steps": 110, "total_steps": 5305, "loss": 0.3517, "lr": 1.0263653483992468e-06, "epoch": 0.10367577756833177, "percentage": 2.07, "elapsed_time": "0:00:12", "remaining_time": "0:10:00", "throughput": 19974.86, "total_tokens": 254144} +{"current_steps": 115, "total_steps": 5305, "loss": 0.296, "lr": 1.073446327683616e-06, "epoch": 0.10838831291234684, "percentage": 2.17, "elapsed_time": "0:00:13", "remaining_time": "0:09:59", "throughput": 20015.85, "total_tokens": 265920} +{"current_steps": 120, "total_steps": 5305, "loss": 0.4375, "lr": 1.120527306967985e-06, "epoch": 0.11310084825636192, "percentage": 2.26, "elapsed_time": "0:00:13", "remaining_time": "0:09:57", "throughput": 20038.79, "total_tokens": 277184} +{"current_steps": 125, "total_steps": 5305, "loss": 0.2998, "lr": 1.167608286252354e-06, "epoch": 0.117813383600377, "percentage": 2.36, "elapsed_time": "0:00:14", "remaining_time": "0:09:56", "throughput": 20063.22, "total_tokens": 289024} +{"current_steps": 130, "total_steps": 5305, "loss": 0.2883, "lr": 1.2146892655367234e-06, "epoch": 0.12252591894439209, "percentage": 2.45, "elapsed_time": "0:00:14", "remaining_time": "0:09:54", "throughput": 20050.7, "total_tokens": 299456} +{"current_steps": 135, "total_steps": 5305, "loss": 0.3329, "lr": 1.2617702448210926e-06, "epoch": 0.12723845428840716, "percentage": 2.54, "elapsed_time": "0:00:15", "remaining_time": "0:09:56", "throughput": 20140.53, "total_tokens": 313728} +{"current_steps": 140, "total_steps": 5305, "loss": 0.2533, "lr": 1.3088512241054615e-06, "epoch": 0.13195098963242224, "percentage": 2.64, "elapsed_time": "0:00:16", "remaining_time": "0:09:56", "throughput": 20178.41, "total_tokens": 326080} +{"current_steps": 145, "total_steps": 5305, "loss": 0.2705, "lr": 1.3559322033898307e-06, "epoch": 0.13666352497643733, "percentage": 2.73, "elapsed_time": "0:00:16", "remaining_time": "0:09:55", "throughput": 20227.49, "total_tokens": 338688} +{"current_steps": 150, "total_steps": 5305, "loss": 0.3576, "lr": 1.4030131826741996e-06, "epoch": 0.1413760603204524, "percentage": 2.83, "elapsed_time": "0:00:17", "remaining_time": "0:09:54", "throughput": 20223.12, "total_tokens": 349632} +{"current_steps": 155, "total_steps": 5305, "loss": 0.2256, "lr": 1.4500941619585688e-06, "epoch": 0.1460885956644675, "percentage": 2.92, "elapsed_time": "0:00:17", "remaining_time": "0:09:55", "throughput": 20290.72, "total_tokens": 363968} +{"current_steps": 160, "total_steps": 5305, "loss": 0.4483, "lr": 1.4971751412429381e-06, "epoch": 0.15080113100848255, "percentage": 3.02, "elapsed_time": "0:00:18", "remaining_time": "0:09:54", "throughput": 20306.75, "total_tokens": 375680} +{"current_steps": 165, "total_steps": 5305, "loss": 0.269, "lr": 1.544256120527307e-06, "epoch": 0.15551366635249764, "percentage": 3.11, "elapsed_time": "0:00:19", "remaining_time": "0:09:52", "throughput": 20301.51, "total_tokens": 386368} +{"current_steps": 170, "total_steps": 5305, "loss": 0.3019, "lr": 1.5913370998116762e-06, "epoch": 0.16022620169651272, "percentage": 3.2, "elapsed_time": "0:00:19", "remaining_time": "0:09:50", "throughput": 20297.31, "total_tokens": 396992} +{"current_steps": 175, "total_steps": 5305, "loss": 0.3423, "lr": 1.6384180790960452e-06, "epoch": 0.1649387370405278, "percentage": 3.3, "elapsed_time": "0:00:20", "remaining_time": "0:09:50", "throughput": 20316.18, "total_tokens": 408960} +{"current_steps": 180, "total_steps": 5305, "loss": 0.2813, "lr": 1.6854990583804145e-06, "epoch": 0.1696512723845429, "percentage": 3.39, "elapsed_time": "0:00:20", "remaining_time": "0:09:47", "throughput": 20293.69, "total_tokens": 419008} +{"current_steps": 185, "total_steps": 5305, "loss": 0.2445, "lr": 1.7325800376647837e-06, "epoch": 0.17436380772855797, "percentage": 3.49, "elapsed_time": "0:00:21", "remaining_time": "0:09:46", "throughput": 20298.59, "total_tokens": 430144} +{"current_steps": 190, "total_steps": 5305, "loss": 0.2385, "lr": 1.7796610169491526e-06, "epoch": 0.17907634307257306, "percentage": 3.58, "elapsed_time": "0:00:21", "remaining_time": "0:09:45", "throughput": 20300.11, "total_tokens": 441216} +{"current_steps": 195, "total_steps": 5305, "loss": 0.2216, "lr": 1.8267419962335218e-06, "epoch": 0.18378887841658811, "percentage": 3.68, "elapsed_time": "0:00:22", "remaining_time": "0:09:43", "throughput": 20290.17, "total_tokens": 451584} +{"current_steps": 200, "total_steps": 5305, "loss": 0.4569, "lr": 1.873822975517891e-06, "epoch": 0.1885014137606032, "percentage": 3.77, "elapsed_time": "0:00:22", "remaining_time": "0:09:43", "throughput": 20317.38, "total_tokens": 464384} +{"current_steps": 205, "total_steps": 5305, "loss": 0.4075, "lr": 1.92090395480226e-06, "epoch": 0.19321394910461828, "percentage": 3.86, "elapsed_time": "0:00:23", "remaining_time": "0:09:47", "throughput": 20415.14, "total_tokens": 481792} +{"current_steps": 210, "total_steps": 5305, "loss": 0.2703, "lr": 1.9679849340866293e-06, "epoch": 0.19792648444863337, "percentage": 3.96, "elapsed_time": "0:00:24", "remaining_time": "0:09:46", "throughput": 20430.22, "total_tokens": 493952} +{"current_steps": 215, "total_steps": 5305, "loss": 0.2604, "lr": 2.015065913370998e-06, "epoch": 0.20263901979264845, "percentage": 4.05, "elapsed_time": "0:00:24", "remaining_time": "0:09:45", "throughput": 20429.09, "total_tokens": 504832} +{"current_steps": 220, "total_steps": 5305, "loss": 0.277, "lr": 2.062146892655367e-06, "epoch": 0.20735155513666353, "percentage": 4.15, "elapsed_time": "0:00:25", "remaining_time": "0:09:42", "throughput": 20397.41, "total_tokens": 514368} +{"current_steps": 225, "total_steps": 5305, "loss": 0.2405, "lr": 2.1092278719397365e-06, "epoch": 0.21206409048067862, "percentage": 4.24, "elapsed_time": "0:00:25", "remaining_time": "0:09:41", "throughput": 20404.06, "total_tokens": 525568} +{"current_steps": 230, "total_steps": 5305, "loss": 0.259, "lr": 2.1563088512241055e-06, "epoch": 0.21677662582469368, "percentage": 4.34, "elapsed_time": "0:00:26", "remaining_time": "0:09:40", "throughput": 20427.31, "total_tokens": 537664} +{"current_steps": 235, "total_steps": 5305, "loss": 0.2561, "lr": 2.203389830508475e-06, "epoch": 0.22148916116870876, "percentage": 4.43, "elapsed_time": "0:00:26", "remaining_time": "0:09:38", "throughput": 20409.56, "total_tokens": 547584} +{"current_steps": 240, "total_steps": 5305, "loss": 0.3491, "lr": 2.2504708097928438e-06, "epoch": 0.22620169651272384, "percentage": 4.52, "elapsed_time": "0:00:27", "remaining_time": "0:09:37", "throughput": 20403.11, "total_tokens": 558144} +{"current_steps": 245, "total_steps": 5305, "loss": 0.2543, "lr": 2.297551789077213e-06, "epoch": 0.23091423185673893, "percentage": 4.62, "elapsed_time": "0:00:27", "remaining_time": "0:09:36", "throughput": 20398.15, "total_tokens": 569024} +{"current_steps": 250, "total_steps": 5305, "loss": 0.3033, "lr": 2.344632768361582e-06, "epoch": 0.235626767200754, "percentage": 4.71, "elapsed_time": "0:00:28", "remaining_time": "0:09:35", "throughput": 20412.07, "total_tokens": 580864} +{"current_steps": 255, "total_steps": 5305, "loss": 0.2747, "lr": 2.391713747645951e-06, "epoch": 0.2403393025447691, "percentage": 4.81, "elapsed_time": "0:00:29", "remaining_time": "0:09:34", "throughput": 20423.53, "total_tokens": 592768} +{"current_steps": 260, "total_steps": 5305, "loss": 0.336, "lr": 2.4387947269303204e-06, "epoch": 0.24505183788878418, "percentage": 4.9, "elapsed_time": "0:00:29", "remaining_time": "0:09:33", "throughput": 20428.71, "total_tokens": 604032} +{"current_steps": 265, "total_steps": 5305, "loss": 0.2277, "lr": 2.4858757062146898e-06, "epoch": 0.24976437323279924, "percentage": 5.0, "elapsed_time": "0:00:30", "remaining_time": "0:09:33", "throughput": 20440.08, "total_tokens": 616256} +{"current_steps": 266, "total_steps": 5305, "eval_loss": 0.25048765540122986, "epoch": 0.25070688030160226, "percentage": 5.01, "elapsed_time": "0:00:32", "remaining_time": "0:10:24", "throughput": 18744.75, "total_tokens": 618432} +{"current_steps": 270, "total_steps": 5305, "loss": 0.2331, "lr": 2.5329566854990583e-06, "epoch": 0.2544769085768143, "percentage": 5.09, "elapsed_time": "0:01:58", "remaining_time": "0:36:52", "throughput": 5285.23, "total_tokens": 627072} +{"current_steps": 275, "total_steps": 5305, "loss": 0.157, "lr": 2.5800376647834272e-06, "epoch": 0.25918944392082943, "percentage": 5.18, "elapsed_time": "0:01:59", "remaining_time": "0:36:20", "throughput": 5357.1, "total_tokens": 638592} +{"current_steps": 280, "total_steps": 5305, "loss": 0.3209, "lr": 2.627118644067797e-06, "epoch": 0.2639019792648445, "percentage": 5.28, "elapsed_time": "0:01:59", "remaining_time": "0:35:48", "throughput": 5416.58, "total_tokens": 648448} +{"current_steps": 285, "total_steps": 5305, "loss": 0.2578, "lr": 2.674199623352166e-06, "epoch": 0.26861451460885954, "percentage": 5.37, "elapsed_time": "0:02:00", "remaining_time": "0:35:19", "throughput": 5506.97, "total_tokens": 662784} +{"current_steps": 290, "total_steps": 5305, "loss": 0.3557, "lr": 2.7212806026365353e-06, "epoch": 0.27332704995287466, "percentage": 5.47, "elapsed_time": "0:02:00", "remaining_time": "0:34:50", "throughput": 5573.82, "total_tokens": 673856} +{"current_steps": 295, "total_steps": 5305, "loss": 0.2089, "lr": 2.7683615819209043e-06, "epoch": 0.2780395852968897, "percentage": 5.56, "elapsed_time": "0:02:01", "remaining_time": "0:34:21", "throughput": 5627.84, "total_tokens": 683136} +{"current_steps": 300, "total_steps": 5305, "loss": 0.2989, "lr": 2.8154425612052732e-06, "epoch": 0.2827521206409048, "percentage": 5.66, "elapsed_time": "0:02:01", "remaining_time": "0:33:54", "throughput": 5697.58, "total_tokens": 694784} +{"current_steps": 305, "total_steps": 5305, "loss": 0.2632, "lr": 2.862523540489642e-06, "epoch": 0.2874646559849199, "percentage": 5.75, "elapsed_time": "0:02:02", "remaining_time": "0:33:28", "throughput": 5768.13, "total_tokens": 706624} +{"current_steps": 310, "total_steps": 5305, "loss": 0.2979, "lr": 2.9096045197740115e-06, "epoch": 0.292177191328935, "percentage": 5.84, "elapsed_time": "0:02:03", "remaining_time": "0:33:02", "throughput": 5826.65, "total_tokens": 716800} +{"current_steps": 315, "total_steps": 5305, "loss": 0.3261, "lr": 2.9566854990583805e-06, "epoch": 0.29688972667295005, "percentage": 5.94, "elapsed_time": "0:02:03", "remaining_time": "0:32:37", "throughput": 5896.2, "total_tokens": 728704} +{"current_steps": 320, "total_steps": 5305, "loss": 0.1851, "lr": 3.00376647834275e-06, "epoch": 0.3016022620169651, "percentage": 6.03, "elapsed_time": "0:02:04", "remaining_time": "0:32:14", "throughput": 5963.2, "total_tokens": 740352} +{"current_steps": 325, "total_steps": 5305, "loss": 0.2727, "lr": 3.0508474576271192e-06, "epoch": 0.3063147973609802, "percentage": 6.13, "elapsed_time": "0:02:04", "remaining_time": "0:31:50", "throughput": 6029.47, "total_tokens": 751936} +{"current_steps": 330, "total_steps": 5305, "loss": 0.3077, "lr": 3.097928436911488e-06, "epoch": 0.3110273327049953, "percentage": 6.22, "elapsed_time": "0:02:05", "remaining_time": "0:31:28", "throughput": 6093.41, "total_tokens": 763264} +{"current_steps": 335, "total_steps": 5305, "loss": 0.3285, "lr": 3.145009416195857e-06, "epoch": 0.3157398680490104, "percentage": 6.31, "elapsed_time": "0:02:05", "remaining_time": "0:31:05", "throughput": 6146.11, "total_tokens": 772992} +{"current_steps": 340, "total_steps": 5305, "loss": 0.2493, "lr": 3.192090395480226e-06, "epoch": 0.32045240339302544, "percentage": 6.41, "elapsed_time": "0:02:06", "remaining_time": "0:30:45", "throughput": 6226.27, "total_tokens": 787008} +{"current_steps": 345, "total_steps": 5305, "loss": 0.233, "lr": 3.2391713747645954e-06, "epoch": 0.32516493873704055, "percentage": 6.5, "elapsed_time": "0:02:06", "remaining_time": "0:30:25", "throughput": 6291.61, "total_tokens": 798848} +{"current_steps": 350, "total_steps": 5305, "loss": 0.3409, "lr": 3.2862523540489644e-06, "epoch": 0.3298774740810556, "percentage": 6.6, "elapsed_time": "0:02:07", "remaining_time": "0:30:05", "throughput": 6362.1, "total_tokens": 811584} +{"current_steps": 355, "total_steps": 5305, "loss": 0.1946, "lr": 3.3333333333333333e-06, "epoch": 0.33459000942507067, "percentage": 6.69, "elapsed_time": "0:02:08", "remaining_time": "0:29:46", "throughput": 6418.8, "total_tokens": 822208} +{"current_steps": 360, "total_steps": 5305, "loss": 0.3024, "lr": 3.3804143126177023e-06, "epoch": 0.3393025447690858, "percentage": 6.79, "elapsed_time": "0:02:08", "remaining_time": "0:29:27", "throughput": 6480.74, "total_tokens": 833792} +{"current_steps": 365, "total_steps": 5305, "loss": 0.347, "lr": 3.427495291902072e-06, "epoch": 0.34401508011310084, "percentage": 6.88, "elapsed_time": "0:02:09", "remaining_time": "0:29:08", "throughput": 6543.51, "total_tokens": 845568} +{"current_steps": 370, "total_steps": 5305, "loss": 0.3787, "lr": 3.474576271186441e-06, "epoch": 0.34872761545711595, "percentage": 6.97, "elapsed_time": "0:02:09", "remaining_time": "0:28:50", "throughput": 6591.04, "total_tokens": 855040} +{"current_steps": 375, "total_steps": 5305, "loss": 0.3066, "lr": 3.5216572504708104e-06, "epoch": 0.353440150801131, "percentage": 7.07, "elapsed_time": "0:02:10", "remaining_time": "0:28:33", "throughput": 6658.45, "total_tokens": 867712} +{"current_steps": 380, "total_steps": 5305, "loss": 0.2843, "lr": 3.5687382297551793e-06, "epoch": 0.3581526861451461, "percentage": 7.16, "elapsed_time": "0:02:10", "remaining_time": "0:28:16", "throughput": 6729.13, "total_tokens": 880960} +{"current_steps": 385, "total_steps": 5305, "loss": 0.2422, "lr": 3.6158192090395483e-06, "epoch": 0.36286522148916117, "percentage": 7.26, "elapsed_time": "0:02:11", "remaining_time": "0:28:01", "throughput": 6802.08, "total_tokens": 894784} +{"current_steps": 390, "total_steps": 5305, "loss": 0.2664, "lr": 3.662900188323917e-06, "epoch": 0.36757775683317623, "percentage": 7.35, "elapsed_time": "0:02:12", "remaining_time": "0:27:44", "throughput": 6856.47, "total_tokens": 905600} +{"current_steps": 395, "total_steps": 5305, "loss": 0.3223, "lr": 3.7099811676082866e-06, "epoch": 0.37229029217719134, "percentage": 7.45, "elapsed_time": "0:02:12", "remaining_time": "0:27:28", "throughput": 6901.97, "total_tokens": 915072} +{"current_steps": 400, "total_steps": 5305, "loss": 0.2066, "lr": 3.7570621468926555e-06, "epoch": 0.3770028275212064, "percentage": 7.54, "elapsed_time": "0:02:13", "remaining_time": "0:27:12", "throughput": 6965.23, "total_tokens": 927552} +{"current_steps": 405, "total_steps": 5305, "loss": 0.2964, "lr": 3.8041431261770245e-06, "epoch": 0.3817153628652215, "percentage": 7.63, "elapsed_time": "0:02:13", "remaining_time": "0:26:58", "throughput": 7028.74, "total_tokens": 940160} +{"current_steps": 410, "total_steps": 5305, "loss": 0.1989, "lr": 3.851224105461394e-06, "epoch": 0.38642789820923656, "percentage": 7.73, "elapsed_time": "0:02:14", "remaining_time": "0:26:43", "throughput": 7073.71, "total_tokens": 949760} +{"current_steps": 415, "total_steps": 5305, "loss": 0.2152, "lr": 3.898305084745763e-06, "epoch": 0.3911404335532517, "percentage": 7.82, "elapsed_time": "0:02:14", "remaining_time": "0:26:28", "throughput": 7127.73, "total_tokens": 960896} +{"current_steps": 420, "total_steps": 5305, "loss": 0.3237, "lr": 3.945386064030132e-06, "epoch": 0.39585296889726673, "percentage": 7.92, "elapsed_time": "0:02:15", "remaining_time": "0:26:14", "throughput": 7179.1, "total_tokens": 971648} +{"current_steps": 425, "total_steps": 5305, "loss": 0.2182, "lr": 3.992467043314501e-06, "epoch": 0.4005655042412818, "percentage": 8.01, "elapsed_time": "0:02:15", "remaining_time": "0:25:59", "throughput": 7224.71, "total_tokens": 981504} +{"current_steps": 430, "total_steps": 5305, "loss": 0.1811, "lr": 4.03954802259887e-06, "epoch": 0.4052780395852969, "percentage": 8.11, "elapsed_time": "0:02:16", "remaining_time": "0:25:46", "throughput": 7283.05, "total_tokens": 993664} +{"current_steps": 435, "total_steps": 5305, "loss": 0.2151, "lr": 4.08662900188324e-06, "epoch": 0.40999057492931196, "percentage": 8.2, "elapsed_time": "0:02:16", "remaining_time": "0:25:33", "throughput": 7324.7, "total_tokens": 1003008} +{"current_steps": 440, "total_steps": 5305, "loss": 0.2333, "lr": 4.133709981167609e-06, "epoch": 0.41470311027332707, "percentage": 8.29, "elapsed_time": "0:02:17", "remaining_time": "0:25:19", "throughput": 7371.39, "total_tokens": 1013248} +{"current_steps": 445, "total_steps": 5305, "loss": 0.2694, "lr": 4.180790960451978e-06, "epoch": 0.4194156456173421, "percentage": 8.39, "elapsed_time": "0:02:17", "remaining_time": "0:25:06", "throughput": 7416.62, "total_tokens": 1023296} +{"current_steps": 450, "total_steps": 5305, "loss": 0.3493, "lr": 4.2278719397363475e-06, "epoch": 0.42412818096135724, "percentage": 8.48, "elapsed_time": "0:02:18", "remaining_time": "0:24:54", "throughput": 7460.63, "total_tokens": 1033152} +{"current_steps": 455, "total_steps": 5305, "loss": 0.2147, "lr": 4.2749529190207165e-06, "epoch": 0.4288407163053723, "percentage": 8.58, "elapsed_time": "0:02:19", "remaining_time": "0:24:42", "throughput": 7516.75, "total_tokens": 1045248} +{"current_steps": 460, "total_steps": 5305, "loss": 0.3071, "lr": 4.322033898305085e-06, "epoch": 0.43355325164938735, "percentage": 8.67, "elapsed_time": "0:02:19", "remaining_time": "0:24:31", "throughput": 7581.24, "total_tokens": 1058944} +{"current_steps": 465, "total_steps": 5305, "loss": 0.2118, "lr": 4.369114877589454e-06, "epoch": 0.43826578699340246, "percentage": 8.77, "elapsed_time": "0:02:20", "remaining_time": "0:24:20", "throughput": 7640.14, "total_tokens": 1071680} +{"current_steps": 470, "total_steps": 5305, "loss": 0.2319, "lr": 4.416195856873823e-06, "epoch": 0.4429783223374175, "percentage": 8.86, "elapsed_time": "0:02:20", "remaining_time": "0:24:08", "throughput": 7692.75, "total_tokens": 1083328} +{"current_steps": 475, "total_steps": 5305, "loss": 0.4004, "lr": 4.463276836158192e-06, "epoch": 0.44769085768143263, "percentage": 8.95, "elapsed_time": "0:02:21", "remaining_time": "0:23:58", "throughput": 7759.39, "total_tokens": 1097600} +{"current_steps": 480, "total_steps": 5305, "loss": 0.218, "lr": 4.510357815442561e-06, "epoch": 0.4524033930254477, "percentage": 9.05, "elapsed_time": "0:02:22", "remaining_time": "0:23:47", "throughput": 7811.71, "total_tokens": 1109376} +{"current_steps": 485, "total_steps": 5305, "loss": 0.2579, "lr": 4.55743879472693e-06, "epoch": 0.4571159283694628, "percentage": 9.14, "elapsed_time": "0:02:22", "remaining_time": "0:23:36", "throughput": 7859.63, "total_tokens": 1120448} +{"current_steps": 490, "total_steps": 5305, "loss": 0.298, "lr": 4.6045197740113e-06, "epoch": 0.46182846371347785, "percentage": 9.24, "elapsed_time": "0:02:23", "remaining_time": "0:23:26", "throughput": 7906.46, "total_tokens": 1131392} +{"current_steps": 495, "total_steps": 5305, "loss": 0.2203, "lr": 4.651600753295669e-06, "epoch": 0.4665409990574929, "percentage": 9.33, "elapsed_time": "0:02:23", "remaining_time": "0:23:16", "throughput": 7959.39, "total_tokens": 1143552} +{"current_steps": 500, "total_steps": 5305, "loss": 0.1785, "lr": 4.698681732580039e-06, "epoch": 0.471253534401508, "percentage": 9.43, "elapsed_time": "0:02:24", "remaining_time": "0:23:05", "throughput": 7997.71, "total_tokens": 1153088} +{"current_steps": 505, "total_steps": 5305, "loss": 0.2125, "lr": 4.745762711864408e-06, "epoch": 0.4759660697455231, "percentage": 9.52, "elapsed_time": "0:02:24", "remaining_time": "0:22:56", "throughput": 8054.61, "total_tokens": 1166080} +{"current_steps": 510, "total_steps": 5305, "loss": 0.3882, "lr": 4.7928436911487765e-06, "epoch": 0.4806786050895382, "percentage": 9.61, "elapsed_time": "0:02:25", "remaining_time": "0:22:46", "throughput": 8099.32, "total_tokens": 1176896} +{"current_steps": 515, "total_steps": 5305, "loss": 0.2743, "lr": 4.8399246704331455e-06, "epoch": 0.48539114043355325, "percentage": 9.71, "elapsed_time": "0:02:25", "remaining_time": "0:22:36", "throughput": 8141.79, "total_tokens": 1187392} +{"current_steps": 520, "total_steps": 5305, "loss": 0.3309, "lr": 4.8870056497175144e-06, "epoch": 0.49010367577756836, "percentage": 9.8, "elapsed_time": "0:02:26", "remaining_time": "0:22:26", "throughput": 8177.45, "total_tokens": 1196672} +{"current_steps": 525, "total_steps": 5305, "loss": 0.3264, "lr": 4.934086629001883e-06, "epoch": 0.4948162111215834, "percentage": 9.9, "elapsed_time": "0:02:26", "remaining_time": "0:22:17", "throughput": 8231.01, "total_tokens": 1209344} +{"current_steps": 530, "total_steps": 5305, "loss": 0.2193, "lr": 4.981167608286252e-06, "epoch": 0.49952874646559847, "percentage": 9.99, "elapsed_time": "0:02:27", "remaining_time": "0:22:08", "throughput": 8281.78, "total_tokens": 1221504} +{"current_steps": 532, "total_steps": 5305, "eval_loss": 0.31662699580192566, "epoch": 0.5014137606032045, "percentage": 10.03, "elapsed_time": "0:02:31", "remaining_time": "0:22:38", "throughput": 8094.58, "total_tokens": 1225408} +{"current_steps": 535, "total_steps": 5305, "loss": 0.3059, "lr": 4.999995128224159e-06, "epoch": 0.5042412818096136, "percentage": 10.08, "elapsed_time": "0:03:14", "remaining_time": "0:28:58", "throughput": 6319.95, "total_tokens": 1232256} +{"current_steps": 540, "total_steps": 5305, "loss": 0.2494, "lr": 4.999965356329446e-06, "epoch": 0.5089538171536286, "percentage": 10.18, "elapsed_time": "0:03:15", "remaining_time": "0:28:45", "throughput": 6357.32, "total_tokens": 1242880} +{"current_steps": 545, "total_steps": 5305, "loss": 0.2812, "lr": 4.99990851940408e-06, "epoch": 0.5136663524976437, "percentage": 10.27, "elapsed_time": "0:03:16", "remaining_time": "0:28:32", "throughput": 6393.27, "total_tokens": 1253248} +{"current_steps": 550, "total_steps": 5305, "loss": 0.2639, "lr": 4.999824618063384e-06, "epoch": 0.5183788878416589, "percentage": 10.37, "elapsed_time": "0:03:16", "remaining_time": "0:28:19", "throughput": 6436.08, "total_tokens": 1265280} +{"current_steps": 555, "total_steps": 5305, "loss": 0.3403, "lr": 4.99971365321569e-06, "epoch": 0.5230914231856739, "percentage": 10.46, "elapsed_time": "0:03:17", "remaining_time": "0:28:06", "throughput": 6470.11, "total_tokens": 1275328} +{"current_steps": 560, "total_steps": 5305, "loss": 0.2942, "lr": 4.9995756260623194e-06, "epoch": 0.527803958529689, "percentage": 10.56, "elapsed_time": "0:03:17", "remaining_time": "0:27:54", "throughput": 6507.86, "total_tokens": 1286272} +{"current_steps": 565, "total_steps": 5305, "loss": 0.2036, "lr": 4.999410538097579e-06, "epoch": 0.532516493873704, "percentage": 10.65, "elapsed_time": "0:03:18", "remaining_time": "0:27:43", "throughput": 6551.91, "total_tokens": 1298816} +{"current_steps": 570, "total_steps": 5305, "loss": 0.2656, "lr": 4.999218391108735e-06, "epoch": 0.5372290292177191, "percentage": 10.74, "elapsed_time": "0:03:18", "remaining_time": "0:27:31", "throughput": 6596.9, "total_tokens": 1311680} +{"current_steps": 575, "total_steps": 5305, "loss": 0.2828, "lr": 4.9989991871760054e-06, "epoch": 0.5419415645617343, "percentage": 10.84, "elapsed_time": "0:03:19", "remaining_time": "0:27:20", "throughput": 6642.43, "total_tokens": 1324672} +{"current_steps": 580, "total_steps": 5305, "loss": 0.2081, "lr": 4.998752928672525e-06, "epoch": 0.5466540999057493, "percentage": 10.93, "elapsed_time": "0:03:20", "remaining_time": "0:27:09", "throughput": 6684.45, "total_tokens": 1336896} +{"current_steps": 585, "total_steps": 5305, "loss": 0.1485, "lr": 4.9984796182643285e-06, "epoch": 0.5513666352497644, "percentage": 11.03, "elapsed_time": "0:03:20", "remaining_time": "0:26:58", "throughput": 6725.43, "total_tokens": 1348928} +{"current_steps": 590, "total_steps": 5305, "loss": 0.6339, "lr": 4.99817925891032e-06, "epoch": 0.5560791705937794, "percentage": 11.12, "elapsed_time": "0:03:21", "remaining_time": "0:26:47", "throughput": 6772.31, "total_tokens": 1362496} +{"current_steps": 595, "total_steps": 5305, "loss": 0.5079, "lr": 4.997851853862237e-06, "epoch": 0.5607917059377945, "percentage": 11.22, "elapsed_time": "0:03:21", "remaining_time": "0:26:37", "throughput": 6823.24, "total_tokens": 1377152} +{"current_steps": 600, "total_steps": 5305, "loss": 0.2396, "lr": 4.997497406664621e-06, "epoch": 0.5655042412818096, "percentage": 11.31, "elapsed_time": "0:03:22", "remaining_time": "0:26:27", "throughput": 6866.92, "total_tokens": 1390016} +{"current_steps": 605, "total_steps": 5305, "loss": 0.2335, "lr": 4.997115921154774e-06, "epoch": 0.5702167766258247, "percentage": 11.4, "elapsed_time": "0:03:22", "remaining_time": "0:26:16", "throughput": 6906.19, "total_tokens": 1401856} +{"current_steps": 610, "total_steps": 5305, "loss": 0.1719, "lr": 4.9967074014627206e-06, "epoch": 0.5749293119698398, "percentage": 11.5, "elapsed_time": "0:03:23", "remaining_time": "0:26:06", "throughput": 6941.46, "total_tokens": 1412736} +{"current_steps": 615, "total_steps": 5305, "loss": 0.3201, "lr": 4.996271852011161e-06, "epoch": 0.5796418473138548, "percentage": 11.59, "elapsed_time": "0:03:24", "remaining_time": "0:25:56", "throughput": 6983.08, "total_tokens": 1425280} +{"current_steps": 620, "total_steps": 5305, "loss": 0.1993, "lr": 4.995809277515424e-06, "epoch": 0.58435438265787, "percentage": 11.69, "elapsed_time": "0:03:24", "remaining_time": "0:25:46", "throughput": 7019.12, "total_tokens": 1436480} +{"current_steps": 625, "total_steps": 5305, "loss": 0.3072, "lr": 4.995319682983417e-06, "epoch": 0.589066918001885, "percentage": 11.78, "elapsed_time": "0:03:25", "remaining_time": "0:25:36", "throughput": 7055.29, "total_tokens": 1447808} +{"current_steps": 630, "total_steps": 5305, "loss": 0.3263, "lr": 4.99480307371557e-06, "epoch": 0.5937794533459001, "percentage": 11.88, "elapsed_time": "0:03:25", "remaining_time": "0:25:27", "throughput": 7096.4, "total_tokens": 1460352} +{"current_steps": 635, "total_steps": 5305, "loss": 0.2747, "lr": 4.9942594553047775e-06, "epoch": 0.5984919886899152, "percentage": 11.97, "elapsed_time": "0:03:26", "remaining_time": "0:25:17", "throughput": 7136.28, "total_tokens": 1472640} +{"current_steps": 640, "total_steps": 5305, "loss": 0.2984, "lr": 4.993688833636341e-06, "epoch": 0.6032045240339302, "percentage": 12.06, "elapsed_time": "0:03:26", "remaining_time": "0:25:07", "throughput": 7167.0, "total_tokens": 1482688} +{"current_steps": 645, "total_steps": 5305, "loss": 0.2671, "lr": 4.993091214887904e-06, "epoch": 0.6079170593779454, "percentage": 12.16, "elapsed_time": "0:03:27", "remaining_time": "0:24:58", "throughput": 7203.76, "total_tokens": 1494336} +{"current_steps": 650, "total_steps": 5305, "loss": 0.1511, "lr": 4.992466605529384e-06, "epoch": 0.6126295947219604, "percentage": 12.25, "elapsed_time": "0:03:27", "remaining_time": "0:24:49", "throughput": 7236.24, "total_tokens": 1504896} +{"current_steps": 655, "total_steps": 5305, "loss": 0.3427, "lr": 4.991815012322902e-06, "epoch": 0.6173421300659755, "percentage": 12.35, "elapsed_time": "0:03:28", "remaining_time": "0:24:40", "throughput": 7280.32, "total_tokens": 1518592} +{"current_steps": 660, "total_steps": 5305, "loss": 0.2164, "lr": 4.991136442322713e-06, "epoch": 0.6220546654099905, "percentage": 12.44, "elapsed_time": "0:03:29", "remaining_time": "0:24:32", "throughput": 7320.52, "total_tokens": 1531264} +{"current_steps": 665, "total_steps": 5305, "loss": 0.2187, "lr": 4.990430902875125e-06, "epoch": 0.6267672007540056, "percentage": 12.54, "elapsed_time": "0:03:29", "remaining_time": "0:24:23", "throughput": 7350.6, "total_tokens": 1541376} +{"current_steps": 670, "total_steps": 5305, "loss": 0.2911, "lr": 4.989698401618423e-06, "epoch": 0.6314797360980208, "percentage": 12.63, "elapsed_time": "0:03:30", "remaining_time": "0:24:14", "throughput": 7380.31, "total_tokens": 1551424} +{"current_steps": 675, "total_steps": 5305, "loss": 0.1331, "lr": 4.988938946482786e-06, "epoch": 0.6361922714420358, "percentage": 12.72, "elapsed_time": "0:03:30", "remaining_time": "0:24:05", "throughput": 7414.16, "total_tokens": 1562624} +{"current_steps": 680, "total_steps": 5305, "loss": 0.2686, "lr": 4.988152545690197e-06, "epoch": 0.6409048067860509, "percentage": 12.82, "elapsed_time": "0:03:31", "remaining_time": "0:23:57", "throughput": 7448.8, "total_tokens": 1574016} +{"current_steps": 685, "total_steps": 5305, "loss": 0.3307, "lr": 4.987339207754358e-06, "epoch": 0.6456173421300659, "percentage": 12.91, "elapsed_time": "0:03:31", "remaining_time": "0:23:49", "throughput": 7487.78, "total_tokens": 1586688} +{"current_steps": 690, "total_steps": 5305, "loss": 0.247, "lr": 4.9864989414806e-06, "epoch": 0.6503298774740811, "percentage": 13.01, "elapsed_time": "0:03:32", "remaining_time": "0:23:40", "throughput": 7517.75, "total_tokens": 1596992} +{"current_steps": 695, "total_steps": 5305, "loss": 0.3232, "lr": 4.985631755965779e-06, "epoch": 0.6550424128180962, "percentage": 13.1, "elapsed_time": "0:03:33", "remaining_time": "0:23:33", "throughput": 7557.12, "total_tokens": 1609920} +{"current_steps": 700, "total_steps": 5305, "loss": 0.2132, "lr": 4.984737660598187e-06, "epoch": 0.6597549481621112, "percentage": 13.2, "elapsed_time": "0:03:33", "remaining_time": "0:23:25", "throughput": 7588.61, "total_tokens": 1620736} +{"current_steps": 705, "total_steps": 5305, "loss": 0.2797, "lr": 4.983816665057447e-06, "epoch": 0.6644674835061263, "percentage": 13.29, "elapsed_time": "0:03:34", "remaining_time": "0:23:17", "throughput": 7623.39, "total_tokens": 1632512} +{"current_steps": 710, "total_steps": 5305, "loss": 0.3142, "lr": 4.982868779314405e-06, "epoch": 0.6691800188501413, "percentage": 13.38, "elapsed_time": "0:03:34", "remaining_time": "0:23:09", "throughput": 7654.24, "total_tokens": 1643264} +{"current_steps": 715, "total_steps": 5305, "loss": 0.1914, "lr": 4.981894013631026e-06, "epoch": 0.6738925541941565, "percentage": 13.48, "elapsed_time": "0:03:35", "remaining_time": "0:23:01", "throughput": 7685.74, "total_tokens": 1654208} +{"current_steps": 720, "total_steps": 5305, "loss": 0.1985, "lr": 4.980892378560281e-06, "epoch": 0.6786050895381716, "percentage": 13.57, "elapsed_time": "0:03:35", "remaining_time": "0:22:53", "throughput": 7715.33, "total_tokens": 1664640} +{"current_steps": 725, "total_steps": 5305, "loss": 0.2831, "lr": 4.979863884946034e-06, "epoch": 0.6833176248821866, "percentage": 13.67, "elapsed_time": "0:03:36", "remaining_time": "0:22:46", "throughput": 7751.16, "total_tokens": 1676864} +{"current_steps": 730, "total_steps": 5305, "loss": 0.2082, "lr": 4.978808543922925e-06, "epoch": 0.6880301602262017, "percentage": 13.76, "elapsed_time": "0:03:36", "remaining_time": "0:22:39", "throughput": 7793.72, "total_tokens": 1691072} +{"current_steps": 735, "total_steps": 5305, "loss": 0.1227, "lr": 4.9777263669162465e-06, "epoch": 0.6927426955702167, "percentage": 13.85, "elapsed_time": "0:03:37", "remaining_time": "0:22:32", "throughput": 7825.88, "total_tokens": 1702400} +{"current_steps": 740, "total_steps": 5305, "loss": 0.1471, "lr": 4.976617365641822e-06, "epoch": 0.6974552309142319, "percentage": 13.95, "elapsed_time": "0:03:38", "remaining_time": "0:22:25", "throughput": 7862.35, "total_tokens": 1714944} +{"current_steps": 745, "total_steps": 5305, "loss": 0.3082, "lr": 4.97548155210588e-06, "epoch": 0.702167766258247, "percentage": 14.04, "elapsed_time": "0:03:38", "remaining_time": "0:22:18", "throughput": 7891.2, "total_tokens": 1725376} +{"current_steps": 750, "total_steps": 5305, "loss": 0.4432, "lr": 4.974318938604921e-06, "epoch": 0.706880301602262, "percentage": 14.14, "elapsed_time": "0:03:39", "remaining_time": "0:22:11", "throughput": 7924.52, "total_tokens": 1737152} +{"current_steps": 755, "total_steps": 5305, "loss": 0.1969, "lr": 4.9731295377255885e-06, "epoch": 0.7115928369462771, "percentage": 14.23, "elapsed_time": "0:03:39", "remaining_time": "0:22:04", "throughput": 7958.51, "total_tokens": 1749120} +{"current_steps": 760, "total_steps": 5305, "loss": 0.272, "lr": 4.971913362344529e-06, "epoch": 0.7163053722902922, "percentage": 14.33, "elapsed_time": "0:03:40", "remaining_time": "0:21:57", "throughput": 7989.48, "total_tokens": 1760384} +{"current_steps": 765, "total_steps": 5305, "loss": 0.1454, "lr": 4.970670425628255e-06, "epoch": 0.7210179076343073, "percentage": 14.42, "elapsed_time": "0:03:40", "remaining_time": "0:21:51", "throughput": 8026.42, "total_tokens": 1773632} +{"current_steps": 770, "total_steps": 5305, "loss": 0.184, "lr": 4.969400741032999e-06, "epoch": 0.7257304429783223, "percentage": 14.51, "elapsed_time": "0:03:41", "remaining_time": "0:21:45", "throughput": 8067.26, "total_tokens": 1787776} +{"current_steps": 775, "total_steps": 5305, "loss": 0.2148, "lr": 4.968104322304575e-06, "epoch": 0.7304429783223374, "percentage": 14.61, "elapsed_time": "0:03:42", "remaining_time": "0:21:38", "throughput": 8095.45, "total_tokens": 1798336} +{"current_steps": 780, "total_steps": 5305, "loss": 0.2897, "lr": 4.966781183478223e-06, "epoch": 0.7351555136663525, "percentage": 14.7, "elapsed_time": "0:03:42", "remaining_time": "0:21:31", "throughput": 8124.81, "total_tokens": 1809216} +{"current_steps": 785, "total_steps": 5305, "loss": 0.2981, "lr": 4.965431338878456e-06, "epoch": 0.7398680490103676, "percentage": 14.8, "elapsed_time": "0:03:43", "remaining_time": "0:21:25", "throughput": 8160.88, "total_tokens": 1822144} +{"current_steps": 790, "total_steps": 5305, "loss": 0.2476, "lr": 4.9640548031189125e-06, "epoch": 0.7445805843543827, "percentage": 14.89, "elapsed_time": "0:03:43", "remaining_time": "0:21:19", "throughput": 8189.84, "total_tokens": 1833088} +{"current_steps": 795, "total_steps": 5305, "loss": 0.2554, "lr": 4.962651591102191e-06, "epoch": 0.7492931196983977, "percentage": 14.99, "elapsed_time": "0:03:44", "remaining_time": "0:21:12", "throughput": 8222.32, "total_tokens": 1845056} +{"current_steps": 798, "total_steps": 5305, "eval_loss": 0.2178538739681244, "epoch": 0.7521206409048068, "percentage": 15.04, "elapsed_time": "0:03:47", "remaining_time": "0:21:24", "throughput": 8138.24, "total_tokens": 1851072} +{"current_steps": 800, "total_steps": 5305, "loss": 0.2507, "lr": 4.961221718019695e-06, "epoch": 0.7540056550424128, "percentage": 15.08, "elapsed_time": "0:04:17", "remaining_time": "0:24:10", "throughput": 7203.34, "total_tokens": 1855168} +{"current_steps": 805, "total_steps": 5305, "loss": 0.3006, "lr": 4.9597651993514585e-06, "epoch": 0.7587181903864278, "percentage": 15.17, "elapsed_time": "0:04:18", "remaining_time": "0:24:02", "throughput": 7234.36, "total_tokens": 1867328} +{"current_steps": 810, "total_steps": 5305, "loss": 0.1949, "lr": 4.9582820508659924e-06, "epoch": 0.763430725730443, "percentage": 15.27, "elapsed_time": "0:04:18", "remaining_time": "0:23:56", "throughput": 7274.47, "total_tokens": 1882560} +{"current_steps": 815, "total_steps": 5305, "loss": 0.1866, "lr": 4.956772288620101e-06, "epoch": 0.7681432610744581, "percentage": 15.36, "elapsed_time": "0:04:19", "remaining_time": "0:23:48", "throughput": 7301.06, "total_tokens": 1893376} +{"current_steps": 820, "total_steps": 5305, "loss": 0.1114, "lr": 4.955235928958716e-06, "epoch": 0.7728557964184731, "percentage": 15.46, "elapsed_time": "0:04:19", "remaining_time": "0:23:41", "throughput": 7333.26, "total_tokens": 1906048} +{"current_steps": 825, "total_steps": 5305, "loss": 0.2425, "lr": 4.953672988514716e-06, "epoch": 0.7775683317624882, "percentage": 15.55, "elapsed_time": "0:04:20", "remaining_time": "0:23:34", "throughput": 7361.84, "total_tokens": 1917568} +{"current_steps": 830, "total_steps": 5305, "loss": 0.4121, "lr": 4.95208348420875e-06, "epoch": 0.7822808671065034, "percentage": 15.65, "elapsed_time": "0:04:21", "remaining_time": "0:23:27", "throughput": 7390.73, "total_tokens": 1929216} +{"current_steps": 835, "total_steps": 5305, "loss": 0.1859, "lr": 4.950467433249046e-06, "epoch": 0.7869934024505184, "percentage": 15.74, "elapsed_time": "0:04:21", "remaining_time": "0:23:20", "throughput": 7417.87, "total_tokens": 1940416} +{"current_steps": 840, "total_steps": 5305, "loss": 0.2065, "lr": 4.948824853131237e-06, "epoch": 0.7917059377945335, "percentage": 15.83, "elapsed_time": "0:04:22", "remaining_time": "0:23:13", "throughput": 7439.16, "total_tokens": 1949632} +{"current_steps": 845, "total_steps": 5305, "loss": 0.2102, "lr": 4.94715576163816e-06, "epoch": 0.7964184731385485, "percentage": 15.93, "elapsed_time": "0:04:22", "remaining_time": "0:23:06", "throughput": 7469.51, "total_tokens": 1961920} +{"current_steps": 850, "total_steps": 5305, "loss": 0.2975, "lr": 4.945460176839671e-06, "epoch": 0.8011310084825636, "percentage": 16.02, "elapsed_time": "0:04:23", "remaining_time": "0:22:59", "throughput": 7498.35, "total_tokens": 1973696} +{"current_steps": 855, "total_steps": 5305, "loss": 0.294, "lr": 4.943738117092447e-06, "epoch": 0.8058435438265787, "percentage": 16.12, "elapsed_time": "0:04:23", "remaining_time": "0:22:52", "throughput": 7526.53, "total_tokens": 1985280} +{"current_steps": 860, "total_steps": 5305, "loss": 0.2107, "lr": 4.941989601039785e-06, "epoch": 0.8105560791705938, "percentage": 16.21, "elapsed_time": "0:04:24", "remaining_time": "0:22:46", "throughput": 7556.57, "total_tokens": 1997504} +{"current_steps": 865, "total_steps": 5305, "loss": 0.2815, "lr": 4.940214647611405e-06, "epoch": 0.8152686145146089, "percentage": 16.31, "elapsed_time": "0:04:24", "remaining_time": "0:22:39", "throughput": 7585.89, "total_tokens": 2009600} +{"current_steps": 870, "total_steps": 5305, "loss": 0.1509, "lr": 4.9384132760232395e-06, "epoch": 0.8199811498586239, "percentage": 16.4, "elapsed_time": "0:04:25", "remaining_time": "0:22:33", "throughput": 7611.99, "total_tokens": 2020672} +{"current_steps": 875, "total_steps": 5305, "loss": 0.258, "lr": 4.93658550577723e-06, "epoch": 0.824693685202639, "percentage": 16.49, "elapsed_time": "0:04:26", "remaining_time": "0:22:26", "throughput": 7643.01, "total_tokens": 2033408} +{"current_steps": 880, "total_steps": 5305, "loss": 0.2403, "lr": 4.9347313566611145e-06, "epoch": 0.8294062205466541, "percentage": 16.59, "elapsed_time": "0:04:26", "remaining_time": "0:22:20", "throughput": 7665.57, "total_tokens": 2043328} +{"current_steps": 885, "total_steps": 5305, "loss": 0.2631, "lr": 4.9328508487482115e-06, "epoch": 0.8341187558906692, "percentage": 16.68, "elapsed_time": "0:04:27", "remaining_time": "0:22:14", "throughput": 7692.3, "total_tokens": 2054656} +{"current_steps": 890, "total_steps": 5305, "loss": 0.2302, "lr": 4.930944002397204e-06, "epoch": 0.8388312912346843, "percentage": 16.78, "elapsed_time": "0:04:27", "remaining_time": "0:22:07", "throughput": 7713.4, "total_tokens": 2064128} +{"current_steps": 895, "total_steps": 5305, "loss": 0.2009, "lr": 4.929010838251923e-06, "epoch": 0.8435438265786993, "percentage": 16.87, "elapsed_time": "0:04:28", "remaining_time": "0:22:01", "throughput": 7743.97, "total_tokens": 2076864} +{"current_steps": 900, "total_steps": 5305, "loss": 0.1868, "lr": 4.927051377241115e-06, "epoch": 0.8482563619227145, "percentage": 16.97, "elapsed_time": "0:04:28", "remaining_time": "0:21:55", "throughput": 7766.98, "total_tokens": 2087104} +{"current_steps": 905, "total_steps": 5305, "loss": 0.3066, "lr": 4.9250656405782215e-06, "epoch": 0.8529688972667295, "percentage": 17.06, "elapsed_time": "0:04:29", "remaining_time": "0:21:49", "throughput": 7791.14, "total_tokens": 2097728} +{"current_steps": 910, "total_steps": 5305, "loss": 0.1685, "lr": 4.9230536497611525e-06, "epoch": 0.8576814326107446, "percentage": 17.15, "elapsed_time": "0:04:29", "remaining_time": "0:21:42", "throughput": 7813.94, "total_tokens": 2107904} +{"current_steps": 915, "total_steps": 5305, "loss": 0.3358, "lr": 4.921015426572047e-06, "epoch": 0.8623939679547596, "percentage": 17.25, "elapsed_time": "0:04:30", "remaining_time": "0:21:37", "throughput": 7842.59, "total_tokens": 2120192} +{"current_steps": 920, "total_steps": 5305, "loss": 0.2411, "lr": 4.918950993077039e-06, "epoch": 0.8671065032987747, "percentage": 17.34, "elapsed_time": "0:04:30", "remaining_time": "0:21:31", "throughput": 7869.53, "total_tokens": 2131904} +{"current_steps": 925, "total_steps": 5305, "loss": 0.3069, "lr": 4.91686037162602e-06, "epoch": 0.8718190386427899, "percentage": 17.44, "elapsed_time": "0:04:31", "remaining_time": "0:21:25", "throughput": 7899.2, "total_tokens": 2144640} +{"current_steps": 930, "total_steps": 5305, "loss": 0.1587, "lr": 4.9147435848523975e-06, "epoch": 0.8765315739868049, "percentage": 17.53, "elapsed_time": "0:04:32", "remaining_time": "0:21:19", "throughput": 7919.45, "total_tokens": 2154112} +{"current_steps": 935, "total_steps": 5305, "loss": 0.1468, "lr": 4.91260065567285e-06, "epoch": 0.88124410933082, "percentage": 17.62, "elapsed_time": "0:04:32", "remaining_time": "0:21:14", "throughput": 7950.2, "total_tokens": 2167232} +{"current_steps": 940, "total_steps": 5305, "loss": 0.2699, "lr": 4.910431607287075e-06, "epoch": 0.885956644674835, "percentage": 17.72, "elapsed_time": "0:04:33", "remaining_time": "0:21:08", "throughput": 7977.74, "total_tokens": 2179264} +{"current_steps": 945, "total_steps": 5305, "loss": 0.3797, "lr": 4.908236463177544e-06, "epoch": 0.8906691800188501, "percentage": 17.81, "elapsed_time": "0:04:33", "remaining_time": "0:21:03", "throughput": 8005.44, "total_tokens": 2191488} +{"current_steps": 950, "total_steps": 5305, "loss": 0.1988, "lr": 4.906015247109242e-06, "epoch": 0.8953817153628653, "percentage": 17.91, "elapsed_time": "0:04:34", "remaining_time": "0:20:57", "throughput": 8027.96, "total_tokens": 2201856} +{"current_steps": 955, "total_steps": 5305, "loss": 0.3161, "lr": 4.903767983129414e-06, "epoch": 0.9000942507068803, "percentage": 18.0, "elapsed_time": "0:04:34", "remaining_time": "0:20:51", "throughput": 8056.57, "total_tokens": 2214464} +{"current_steps": 960, "total_steps": 5305, "loss": 0.2565, "lr": 4.901494695567306e-06, "epoch": 0.9048067860508954, "percentage": 18.1, "elapsed_time": "0:04:35", "remaining_time": "0:20:46", "throughput": 8091.03, "total_tokens": 2229184} +{"current_steps": 965, "total_steps": 5305, "loss": 0.2214, "lr": 4.899195409033897e-06, "epoch": 0.9095193213949104, "percentage": 18.19, "elapsed_time": "0:04:36", "remaining_time": "0:20:41", "throughput": 8111.86, "total_tokens": 2239104} +{"current_steps": 970, "total_steps": 5305, "loss": 0.1992, "lr": 4.896870148421637e-06, "epoch": 0.9142318567389256, "percentage": 18.28, "elapsed_time": "0:04:36", "remaining_time": "0:20:35", "throughput": 8133.12, "total_tokens": 2249152} +{"current_steps": 975, "total_steps": 5305, "loss": 0.1527, "lr": 4.894518938904175e-06, "epoch": 0.9189443920829407, "percentage": 18.38, "elapsed_time": "0:04:37", "remaining_time": "0:20:30", "throughput": 8159.97, "total_tokens": 2261312} +{"current_steps": 980, "total_steps": 5305, "loss": 0.1398, "lr": 4.892141805936085e-06, "epoch": 0.9236569274269557, "percentage": 18.47, "elapsed_time": "0:04:37", "remaining_time": "0:20:25", "throughput": 8191.34, "total_tokens": 2275008} +{"current_steps": 985, "total_steps": 5305, "loss": 0.276, "lr": 4.889738775252596e-06, "epoch": 0.9283694627709708, "percentage": 18.57, "elapsed_time": "0:04:38", "remaining_time": "0:20:20", "throughput": 8219.48, "total_tokens": 2287680} +{"current_steps": 990, "total_steps": 5305, "loss": 0.2869, "lr": 4.887309872869308e-06, "epoch": 0.9330819981149858, "percentage": 18.66, "elapsed_time": "0:04:38", "remaining_time": "0:20:15", "throughput": 8246.18, "total_tokens": 2299840} +{"current_steps": 995, "total_steps": 5305, "loss": 0.2347, "lr": 4.884855125081912e-06, "epoch": 0.937794533459001, "percentage": 18.76, "elapsed_time": "0:04:39", "remaining_time": "0:20:10", "throughput": 8270.44, "total_tokens": 2311104} +{"current_steps": 1000, "total_steps": 5305, "loss": 0.326, "lr": 4.882374558465906e-06, "epoch": 0.942507068803016, "percentage": 18.85, "elapsed_time": "0:04:39", "remaining_time": "0:20:05", "throughput": 8294.58, "total_tokens": 2322432} +{"current_steps": 1005, "total_steps": 5305, "loss": 0.2946, "lr": 4.8798681998763056e-06, "epoch": 0.9472196041470311, "percentage": 18.94, "elapsed_time": "0:04:40", "remaining_time": "0:20:00", "throughput": 8316.99, "total_tokens": 2333120} +{"current_steps": 1010, "total_steps": 5305, "loss": 0.2846, "lr": 4.877336076447358e-06, "epoch": 0.9519321394910462, "percentage": 19.04, "elapsed_time": "0:04:41", "remaining_time": "0:19:55", "throughput": 8343.73, "total_tokens": 2345472} +{"current_steps": 1015, "total_steps": 5305, "loss": 0.1988, "lr": 4.87477821559224e-06, "epoch": 0.9566446748350612, "percentage": 19.13, "elapsed_time": "0:04:41", "remaining_time": "0:19:50", "throughput": 8369.85, "total_tokens": 2357568} +{"current_steps": 1020, "total_steps": 5305, "loss": 0.2295, "lr": 4.87219464500277e-06, "epoch": 0.9613572101790764, "percentage": 19.23, "elapsed_time": "0:04:42", "remaining_time": "0:19:45", "throughput": 8391.35, "total_tokens": 2368064} +{"current_steps": 1025, "total_steps": 5305, "loss": 0.2166, "lr": 4.869585392649102e-06, "epoch": 0.9660697455230914, "percentage": 19.32, "elapsed_time": "0:04:42", "remaining_time": "0:19:40", "throughput": 8420.05, "total_tokens": 2381184} +{"current_steps": 1030, "total_steps": 5305, "loss": 0.1964, "lr": 4.866950486779425e-06, "epoch": 0.9707822808671065, "percentage": 19.42, "elapsed_time": "0:04:43", "remaining_time": "0:19:36", "throughput": 8446.06, "total_tokens": 2393408} +{"current_steps": 1035, "total_steps": 5305, "loss": 0.2603, "lr": 4.864289955919658e-06, "epoch": 0.9754948162111216, "percentage": 19.51, "elapsed_time": "0:04:43", "remaining_time": "0:19:31", "throughput": 8474.95, "total_tokens": 2406720} +{"current_steps": 1040, "total_steps": 5305, "loss": 0.3101, "lr": 4.8616038288731394e-06, "epoch": 0.9802073515551367, "percentage": 19.6, "elapsed_time": "0:04:44", "remaining_time": "0:19:27", "throughput": 8504.38, "total_tokens": 2420288} +{"current_steps": 1045, "total_steps": 5305, "loss": 0.1463, "lr": 4.8588921347203175e-06, "epoch": 0.9849198868991518, "percentage": 19.7, "elapsed_time": "0:04:45", "remaining_time": "0:19:22", "throughput": 8527.42, "total_tokens": 2431488} +{"current_steps": 1050, "total_steps": 5305, "loss": 0.2497, "lr": 4.8561549028184315e-06, "epoch": 0.9896324222431668, "percentage": 19.79, "elapsed_time": "0:04:45", "remaining_time": "0:19:17", "throughput": 8553.88, "total_tokens": 2444032} +{"current_steps": 1055, "total_steps": 5305, "loss": 0.1574, "lr": 4.8533921628012e-06, "epoch": 0.9943449575871819, "percentage": 19.89, "elapsed_time": "0:04:46", "remaining_time": "0:19:13", "throughput": 8575.73, "total_tokens": 2454912} +{"current_steps": 1060, "total_steps": 5305, "loss": 0.3676, "lr": 4.850603944578494e-06, "epoch": 0.9990574929311969, "percentage": 19.98, "elapsed_time": "0:04:46", "remaining_time": "0:19:08", "throughput": 8602.42, "total_tokens": 2467584} +{"current_steps": 1064, "total_steps": 5305, "eval_loss": 0.18848362565040588, "epoch": 1.002827521206409, "percentage": 20.06, "elapsed_time": "0:04:50", "remaining_time": "0:19:16", "throughput": 8534.86, "total_tokens": 2475808} +{"current_steps": 1065, "total_steps": 5305, "loss": 0.1493, "lr": 4.847790278336017e-06, "epoch": 1.003770028275212, "percentage": 20.08, "elapsed_time": "0:05:33", "remaining_time": "0:22:06", "throughput": 7440.06, "total_tokens": 2478048} +{"current_steps": 1070, "total_steps": 5305, "loss": 0.1749, "lr": 4.844951194534975e-06, "epoch": 1.0084825636192272, "percentage": 20.17, "elapsed_time": "0:05:33", "remaining_time": "0:22:00", "throughput": 7469.27, "total_tokens": 2492576} +{"current_steps": 1075, "total_steps": 5305, "loss": 0.1307, "lr": 4.842086723911751e-06, "epoch": 1.0131950989632421, "percentage": 20.26, "elapsed_time": "0:05:34", "remaining_time": "0:21:55", "throughput": 7494.44, "total_tokens": 2505440} +{"current_steps": 1080, "total_steps": 5305, "loss": 0.1119, "lr": 4.839196897477569e-06, "epoch": 1.0179076343072573, "percentage": 20.36, "elapsed_time": "0:05:34", "remaining_time": "0:21:49", "throughput": 7512.96, "total_tokens": 2515488} +{"current_steps": 1085, "total_steps": 5305, "loss": 0.1664, "lr": 4.836281746518159e-06, "epoch": 1.0226201696512724, "percentage": 20.45, "elapsed_time": "0:05:35", "remaining_time": "0:21:44", "throughput": 7540.8, "total_tokens": 2529504} +{"current_steps": 1090, "total_steps": 5305, "loss": 0.1393, "lr": 4.833341302593417e-06, "epoch": 1.0273327049952874, "percentage": 20.55, "elapsed_time": "0:05:35", "remaining_time": "0:21:39", "throughput": 7559.85, "total_tokens": 2539872} +{"current_steps": 1095, "total_steps": 5305, "loss": 0.0376, "lr": 4.830375597537068e-06, "epoch": 1.0320452403393026, "percentage": 20.64, "elapsed_time": "0:05:36", "remaining_time": "0:21:33", "throughput": 7577.87, "total_tokens": 2549856} +{"current_steps": 1100, "total_steps": 5305, "loss": 0.1836, "lr": 4.827384663456315e-06, "epoch": 1.0367577756833177, "percentage": 20.74, "elapsed_time": "0:05:36", "remaining_time": "0:21:28", "throughput": 7594.8, "total_tokens": 2559328} +{"current_steps": 1105, "total_steps": 5305, "loss": 0.369, "lr": 4.824368532731496e-06, "epoch": 1.0414703110273327, "percentage": 20.83, "elapsed_time": "0:05:37", "remaining_time": "0:21:22", "throughput": 7613.14, "total_tokens": 2569440} +{"current_steps": 1110, "total_steps": 5305, "loss": 0.084, "lr": 4.821327238015732e-06, "epoch": 1.0461828463713478, "percentage": 20.92, "elapsed_time": "0:05:38", "remaining_time": "0:21:17", "throughput": 7633.67, "total_tokens": 2580448} +{"current_steps": 1115, "total_steps": 5305, "loss": 0.4176, "lr": 4.818260812234572e-06, "epoch": 1.0508953817153628, "percentage": 21.02, "elapsed_time": "0:05:38", "remaining_time": "0:21:12", "throughput": 7650.62, "total_tokens": 2590752} +{"current_steps": 1120, "total_steps": 5305, "loss": 0.0664, "lr": 4.815169288585641e-06, "epoch": 1.055607917059378, "percentage": 21.11, "elapsed_time": "0:05:39", "remaining_time": "0:21:07", "throughput": 7667.19, "total_tokens": 2600160} +{"current_steps": 1125, "total_steps": 5305, "loss": 0.1558, "lr": 4.812052700538274e-06, "epoch": 1.0603204524033931, "percentage": 21.21, "elapsed_time": "0:05:39", "remaining_time": "0:21:02", "throughput": 7687.55, "total_tokens": 2611232} +{"current_steps": 1130, "total_steps": 5305, "loss": 0.1476, "lr": 4.808911081833161e-06, "epoch": 1.065032987747408, "percentage": 21.3, "elapsed_time": "0:05:40", "remaining_time": "0:20:57", "throughput": 7710.97, "total_tokens": 2623712} +{"current_steps": 1135, "total_steps": 5305, "loss": 0.0875, "lr": 4.805744466481974e-06, "epoch": 1.0697455230914232, "percentage": 21.39, "elapsed_time": "0:05:40", "remaining_time": "0:20:52", "throughput": 7733.82, "total_tokens": 2635936} +{"current_steps": 1140, "total_steps": 5305, "loss": 0.1297, "lr": 4.802552888767005e-06, "epoch": 1.0744580584354382, "percentage": 21.49, "elapsed_time": "0:05:41", "remaining_time": "0:20:47", "throughput": 7751.44, "total_tokens": 2645920} +{"current_steps": 1145, "total_steps": 5305, "loss": 0.2563, "lr": 4.799336383240793e-06, "epoch": 1.0791705937794533, "percentage": 21.58, "elapsed_time": "0:05:41", "remaining_time": "0:20:42", "throughput": 7780.07, "total_tokens": 2660768} +{"current_steps": 1150, "total_steps": 5305, "loss": 0.1484, "lr": 4.796094984725749e-06, "epoch": 1.0838831291234685, "percentage": 21.68, "elapsed_time": "0:05:42", "remaining_time": "0:20:37", "throughput": 7798.63, "total_tokens": 2671200} +{"current_steps": 1155, "total_steps": 5305, "loss": 0.1145, "lr": 4.792828728313778e-06, "epoch": 1.0885956644674835, "percentage": 21.77, "elapsed_time": "0:05:43", "remaining_time": "0:20:32", "throughput": 7820.37, "total_tokens": 2683040} +{"current_steps": 1160, "total_steps": 5305, "loss": 0.0767, "lr": 4.789537649365904e-06, "epoch": 1.0933081998114986, "percentage": 21.87, "elapsed_time": "0:05:43", "remaining_time": "0:20:27", "throughput": 7841.1, "total_tokens": 2694432} +{"current_steps": 1165, "total_steps": 5305, "loss": 0.0079, "lr": 4.78622178351188e-06, "epoch": 1.0980207351555136, "percentage": 21.96, "elapsed_time": "0:05:44", "remaining_time": "0:20:23", "throughput": 7864.69, "total_tokens": 2707168} +{"current_steps": 1170, "total_steps": 5305, "loss": 0.1644, "lr": 4.782881166649808e-06, "epoch": 1.1027332704995287, "percentage": 22.05, "elapsed_time": "0:05:44", "remaining_time": "0:20:18", "throughput": 7883.68, "total_tokens": 2717984} +{"current_steps": 1175, "total_steps": 5305, "loss": 0.2543, "lr": 4.77951583494575e-06, "epoch": 1.107445805843544, "percentage": 22.15, "elapsed_time": "0:05:45", "remaining_time": "0:20:13", "throughput": 7907.38, "total_tokens": 2730784} +{"current_steps": 1180, "total_steps": 5305, "loss": 0.4821, "lr": 4.77612582483333e-06, "epoch": 1.1121583411875589, "percentage": 22.24, "elapsed_time": "0:05:45", "remaining_time": "0:20:09", "throughput": 7924.23, "total_tokens": 2740704} +{"current_steps": 1185, "total_steps": 5305, "loss": 0.2498, "lr": 4.772711173013352e-06, "epoch": 1.116870876531574, "percentage": 22.34, "elapsed_time": "0:05:46", "remaining_time": "0:20:04", "throughput": 7944.13, "total_tokens": 2751968} +{"current_steps": 1190, "total_steps": 5305, "loss": 0.1649, "lr": 4.769271916453387e-06, "epoch": 1.121583411875589, "percentage": 22.43, "elapsed_time": "0:05:46", "remaining_time": "0:19:59", "throughput": 7965.25, "total_tokens": 2763808} +{"current_steps": 1195, "total_steps": 5305, "loss": 0.0735, "lr": 4.765808092387385e-06, "epoch": 1.1262959472196041, "percentage": 22.53, "elapsed_time": "0:05:47", "remaining_time": "0:19:55", "throughput": 7984.15, "total_tokens": 2774624} +{"current_steps": 1200, "total_steps": 5305, "loss": 0.2639, "lr": 4.762319738315269e-06, "epoch": 1.1310084825636193, "percentage": 22.62, "elapsed_time": "0:05:48", "remaining_time": "0:19:50", "throughput": 8003.92, "total_tokens": 2785888} +{"current_steps": 1205, "total_steps": 5305, "loss": 0.3194, "lr": 4.758806892002526e-06, "epoch": 1.1357210179076342, "percentage": 22.71, "elapsed_time": "0:05:48", "remaining_time": "0:19:46", "throughput": 8023.84, "total_tokens": 2797216} +{"current_steps": 1210, "total_steps": 5305, "loss": 0.1395, "lr": 4.7552695914798e-06, "epoch": 1.1404335532516494, "percentage": 22.81, "elapsed_time": "0:05:49", "remaining_time": "0:19:41", "throughput": 8042.47, "total_tokens": 2808032} +{"current_steps": 1215, "total_steps": 5305, "loss": 0.2734, "lr": 4.751707875042481e-06, "epoch": 1.1451460885956646, "percentage": 22.9, "elapsed_time": "0:05:49", "remaining_time": "0:19:37", "throughput": 8070.04, "total_tokens": 2823008} +{"current_steps": 1220, "total_steps": 5305, "loss": 0.0883, "lr": 4.748121781250288e-06, "epoch": 1.1498586239396795, "percentage": 23.0, "elapsed_time": "0:05:50", "remaining_time": "0:19:33", "throughput": 8093.18, "total_tokens": 2835936} +{"current_steps": 1225, "total_steps": 5305, "loss": 0.169, "lr": 4.744511348926855e-06, "epoch": 1.1545711592836947, "percentage": 23.09, "elapsed_time": "0:05:50", "remaining_time": "0:19:28", "throughput": 8113.49, "total_tokens": 2847584} +{"current_steps": 1230, "total_steps": 5305, "loss": 0.1451, "lr": 4.740876617159308e-06, "epoch": 1.1592836946277096, "percentage": 23.19, "elapsed_time": "0:05:51", "remaining_time": "0:19:24", "throughput": 8130.84, "total_tokens": 2857952} +{"current_steps": 1235, "total_steps": 5305, "loss": 0.2114, "lr": 4.737217625297844e-06, "epoch": 1.1639962299717248, "percentage": 23.28, "elapsed_time": "0:05:52", "remaining_time": "0:19:20", "throughput": 8147.9, "total_tokens": 2868192} +{"current_steps": 1240, "total_steps": 5305, "loss": 0.1145, "lr": 4.733534412955301e-06, "epoch": 1.1687087653157398, "percentage": 23.37, "elapsed_time": "0:05:52", "remaining_time": "0:19:15", "throughput": 8168.09, "total_tokens": 2879904} +{"current_steps": 1245, "total_steps": 5305, "loss": 0.1768, "lr": 4.729827020006735e-06, "epoch": 1.173421300659755, "percentage": 23.47, "elapsed_time": "0:05:53", "remaining_time": "0:19:11", "throughput": 8189.94, "total_tokens": 2892384} +{"current_steps": 1250, "total_steps": 5305, "loss": 0.1507, "lr": 4.726095486588983e-06, "epoch": 1.17813383600377, "percentage": 23.56, "elapsed_time": "0:05:53", "remaining_time": "0:19:07", "throughput": 8212.31, "total_tokens": 2905184} +{"current_steps": 1255, "total_steps": 5305, "loss": 0.0958, "lr": 4.722339853100232e-06, "epoch": 1.182846371347785, "percentage": 23.66, "elapsed_time": "0:05:54", "remaining_time": "0:19:03", "throughput": 8231.86, "total_tokens": 2916640} +{"current_steps": 1260, "total_steps": 5305, "loss": 0.1192, "lr": 4.718560160199579e-06, "epoch": 1.1875589066918002, "percentage": 23.75, "elapsed_time": "0:05:54", "remaining_time": "0:18:59", "throughput": 8249.0, "total_tokens": 2927072} +{"current_steps": 1265, "total_steps": 5305, "loss": 0.2693, "lr": 4.714756448806592e-06, "epoch": 1.1922714420358154, "percentage": 23.85, "elapsed_time": "0:05:55", "remaining_time": "0:18:54", "throughput": 8266.79, "total_tokens": 2937888} +{"current_steps": 1270, "total_steps": 5305, "loss": 0.1689, "lr": 4.71092876010087e-06, "epoch": 1.1969839773798303, "percentage": 23.94, "elapsed_time": "0:05:55", "remaining_time": "0:18:51", "throughput": 8288.98, "total_tokens": 2950752} +{"current_steps": 1275, "total_steps": 5305, "loss": 0.0997, "lr": 4.70707713552159e-06, "epoch": 1.2016965127238455, "percentage": 24.03, "elapsed_time": "0:05:56", "remaining_time": "0:18:46", "throughput": 8305.66, "total_tokens": 2961056} +{"current_steps": 1280, "total_steps": 5305, "loss": 0.1164, "lr": 4.703201616767067e-06, "epoch": 1.2064090480678604, "percentage": 24.13, "elapsed_time": "0:05:57", "remaining_time": "0:18:42", "throughput": 8322.67, "total_tokens": 2971552} +{"current_steps": 1285, "total_steps": 5305, "loss": 0.0178, "lr": 4.699302245794293e-06, "epoch": 1.2111215834118756, "percentage": 24.22, "elapsed_time": "0:05:57", "remaining_time": "0:18:38", "throughput": 8346.32, "total_tokens": 2985120} +{"current_steps": 1290, "total_steps": 5305, "loss": 0.1821, "lr": 4.6953790648184924e-06, "epoch": 1.2158341187558908, "percentage": 24.32, "elapsed_time": "0:05:58", "remaining_time": "0:18:34", "throughput": 8364.29, "total_tokens": 2996128} +{"current_steps": 1295, "total_steps": 5305, "loss": 0.0199, "lr": 4.691432116312661e-06, "epoch": 1.2205466540999057, "percentage": 24.41, "elapsed_time": "0:05:58", "remaining_time": "0:18:30", "throughput": 8382.15, "total_tokens": 3007072} +{"current_steps": 1300, "total_steps": 5305, "loss": 0.006, "lr": 4.687461443007101e-06, "epoch": 1.2252591894439209, "percentage": 24.51, "elapsed_time": "0:05:59", "remaining_time": "0:18:26", "throughput": 8401.26, "total_tokens": 3018656} +{"current_steps": 1305, "total_steps": 5305, "loss": 0.1915, "lr": 4.683467087888967e-06, "epoch": 1.2299717247879358, "percentage": 24.6, "elapsed_time": "0:05:59", "remaining_time": "0:18:23", "throughput": 8421.27, "total_tokens": 3030624} +{"current_steps": 1310, "total_steps": 5305, "loss": 0.2276, "lr": 4.6794490942017955e-06, "epoch": 1.234684260131951, "percentage": 24.69, "elapsed_time": "0:06:00", "remaining_time": "0:18:19", "throughput": 8442.16, "total_tokens": 3043040} +{"current_steps": 1315, "total_steps": 5305, "loss": 0.0236, "lr": 4.6754075054450385e-06, "epoch": 1.2393967954759662, "percentage": 24.79, "elapsed_time": "0:06:01", "remaining_time": "0:18:15", "throughput": 8467.5, "total_tokens": 3057632} +{"current_steps": 1320, "total_steps": 5305, "loss": 0.1376, "lr": 4.671342365373592e-06, "epoch": 1.244109330819981, "percentage": 24.88, "elapsed_time": "0:06:01", "remaining_time": "0:18:11", "throughput": 8487.61, "total_tokens": 3069792} +{"current_steps": 1325, "total_steps": 5305, "loss": 0.2062, "lr": 4.667253717997324e-06, "epoch": 1.2488218661639963, "percentage": 24.98, "elapsed_time": "0:06:02", "remaining_time": "0:18:08", "throughput": 8504.83, "total_tokens": 3080608} +{"current_steps": 1330, "total_steps": 5305, "loss": 0.165, "lr": 4.663141607580589e-06, "epoch": 1.2535344015080114, "percentage": 25.07, "elapsed_time": "0:06:02", "remaining_time": "0:18:04", "throughput": 8522.31, "total_tokens": 3091552} +{"current_steps": 1330, "total_steps": 5305, "eval_loss": 0.4607957601547241, "epoch": 1.2535344015080114, "percentage": 25.07, "elapsed_time": "0:06:05", "remaining_time": "0:18:12", "throughput": 8458.8, "total_tokens": 3091552} +{"current_steps": 1335, "total_steps": 5305, "loss": 0.222, "lr": 4.659006078641766e-06, "epoch": 1.2582469368520264, "percentage": 25.16, "elapsed_time": "0:06:52", "remaining_time": "0:20:26", "throughput": 7527.24, "total_tokens": 3103712} +{"current_steps": 1340, "total_steps": 5305, "loss": 0.2312, "lr": 4.6548471759527634e-06, "epoch": 1.2629594721960415, "percentage": 25.26, "elapsed_time": "0:06:52", "remaining_time": "0:20:21", "throughput": 7544.66, "total_tokens": 3115104} +{"current_steps": 1345, "total_steps": 5305, "loss": 0.011, "lr": 4.6506649445385335e-06, "epoch": 1.2676720075400565, "percentage": 25.35, "elapsed_time": "0:06:53", "remaining_time": "0:20:17", "throughput": 7564.33, "total_tokens": 3127648} +{"current_steps": 1350, "total_steps": 5305, "loss": 0.2732, "lr": 4.646459429676594e-06, "epoch": 1.2723845428840717, "percentage": 25.45, "elapsed_time": "0:06:54", "remaining_time": "0:20:12", "throughput": 7580.02, "total_tokens": 3138208} +{"current_steps": 1355, "total_steps": 5305, "loss": 0.148, "lr": 4.642230676896531e-06, "epoch": 1.2770970782280866, "percentage": 25.54, "elapsed_time": "0:06:54", "remaining_time": "0:20:08", "throughput": 7594.7, "total_tokens": 3148256} +{"current_steps": 1360, "total_steps": 5305, "loss": 0.0901, "lr": 4.6379787319795076e-06, "epoch": 1.2818096135721018, "percentage": 25.64, "elapsed_time": "0:06:55", "remaining_time": "0:20:03", "throughput": 7608.47, "total_tokens": 3157856} +{"current_steps": 1365, "total_steps": 5305, "loss": 0.24, "lr": 4.6337036409577705e-06, "epoch": 1.286522148916117, "percentage": 25.73, "elapsed_time": "0:06:55", "remaining_time": "0:19:59", "throughput": 7621.68, "total_tokens": 3167136} +{"current_steps": 1370, "total_steps": 5305, "loss": 0.0842, "lr": 4.62940545011415e-06, "epoch": 1.2912346842601319, "percentage": 25.82, "elapsed_time": "0:06:56", "remaining_time": "0:19:55", "throughput": 7645.08, "total_tokens": 3181984} +{"current_steps": 1375, "total_steps": 5305, "loss": 0.1368, "lr": 4.625084205981554e-06, "epoch": 1.295947219604147, "percentage": 25.92, "elapsed_time": "0:06:56", "remaining_time": "0:19:51", "throughput": 7666.59, "total_tokens": 3195744} +{"current_steps": 1380, "total_steps": 5305, "loss": 0.2497, "lr": 4.620739955342476e-06, "epoch": 1.3006597549481622, "percentage": 26.01, "elapsed_time": "0:06:57", "remaining_time": "0:19:47", "throughput": 7684.83, "total_tokens": 3207776} +{"current_steps": 1385, "total_steps": 5305, "loss": 0.0782, "lr": 4.616372745228477e-06, "epoch": 1.3053722902921772, "percentage": 26.11, "elapsed_time": "0:06:57", "remaining_time": "0:19:43", "throughput": 7702.02, "total_tokens": 3219296} +{"current_steps": 1390, "total_steps": 5305, "loss": 0.3956, "lr": 4.611982622919684e-06, "epoch": 1.3100848256361923, "percentage": 26.2, "elapsed_time": "0:06:58", "remaining_time": "0:19:38", "throughput": 7717.78, "total_tokens": 3230048} +{"current_steps": 1395, "total_steps": 5305, "loss": 0.1166, "lr": 4.607569635944271e-06, "epoch": 1.3147973609802073, "percentage": 26.3, "elapsed_time": "0:06:59", "remaining_time": "0:19:34", "throughput": 7730.55, "total_tokens": 3239200} +{"current_steps": 1400, "total_steps": 5305, "loss": 0.2557, "lr": 4.603133832077953e-06, "epoch": 1.3195098963242224, "percentage": 26.39, "elapsed_time": "0:06:59", "remaining_time": "0:19:30", "throughput": 7755.46, "total_tokens": 3255008} +{"current_steps": 1405, "total_steps": 5305, "loss": 0.2547, "lr": 4.598675259343462e-06, "epoch": 1.3242224316682374, "percentage": 26.48, "elapsed_time": "0:07:00", "remaining_time": "0:19:26", "throughput": 7773.58, "total_tokens": 3267040} +{"current_steps": 1410, "total_steps": 5305, "loss": 0.2374, "lr": 4.594193966010031e-06, "epoch": 1.3289349670122526, "percentage": 26.58, "elapsed_time": "0:07:00", "remaining_time": "0:19:22", "throughput": 7787.7, "total_tokens": 3276960} +{"current_steps": 1415, "total_steps": 5305, "loss": 0.0795, "lr": 4.589690000592868e-06, "epoch": 1.3336475023562677, "percentage": 26.67, "elapsed_time": "0:07:01", "remaining_time": "0:19:18", "throughput": 7803.5, "total_tokens": 3287840} +{"current_steps": 1420, "total_steps": 5305, "loss": 0.2095, "lr": 4.585163411852632e-06, "epoch": 1.3383600377002827, "percentage": 26.77, "elapsed_time": "0:07:01", "remaining_time": "0:19:14", "throughput": 7822.06, "total_tokens": 3300256} +{"current_steps": 1425, "total_steps": 5305, "loss": 0.3144, "lr": 4.58061424879491e-06, "epoch": 1.3430725730442978, "percentage": 26.86, "elapsed_time": "0:07:02", "remaining_time": "0:19:10", "throughput": 7838.93, "total_tokens": 3311712} +{"current_steps": 1430, "total_steps": 5305, "loss": 0.1113, "lr": 4.576042560669678e-06, "epoch": 1.347785108388313, "percentage": 26.96, "elapsed_time": "0:07:02", "remaining_time": "0:19:06", "throughput": 7853.88, "total_tokens": 3322144} +{"current_steps": 1435, "total_steps": 5305, "loss": 0.4022, "lr": 4.571448396970773e-06, "epoch": 1.352497643732328, "percentage": 27.05, "elapsed_time": "0:07:03", "remaining_time": "0:19:02", "throughput": 7871.19, "total_tokens": 3333856} +{"current_steps": 1440, "total_steps": 5305, "loss": 0.1542, "lr": 4.566831807435359e-06, "epoch": 1.3572101790763431, "percentage": 27.14, "elapsed_time": "0:07:04", "remaining_time": "0:18:58", "throughput": 7888.5, "total_tokens": 3345696} +{"current_steps": 1445, "total_steps": 5305, "loss": 0.2594, "lr": 4.562192842043381e-06, "epoch": 1.3619227144203583, "percentage": 27.24, "elapsed_time": "0:07:04", "remaining_time": "0:18:54", "throughput": 7904.84, "total_tokens": 3357024} +{"current_steps": 1450, "total_steps": 5305, "loss": 0.1721, "lr": 4.557531551017034e-06, "epoch": 1.3666352497643732, "percentage": 27.33, "elapsed_time": "0:07:05", "remaining_time": "0:18:50", "throughput": 7921.4, "total_tokens": 3368480} +{"current_steps": 1455, "total_steps": 5305, "loss": 0.1418, "lr": 4.552847984820208e-06, "epoch": 1.3713477851083884, "percentage": 27.43, "elapsed_time": "0:07:05", "remaining_time": "0:18:46", "throughput": 7935.75, "total_tokens": 3378720} +{"current_steps": 1460, "total_steps": 5305, "loss": 0.1344, "lr": 4.548142194157951e-06, "epoch": 1.3760603204524033, "percentage": 27.52, "elapsed_time": "0:07:06", "remaining_time": "0:18:42", "throughput": 7953.21, "total_tokens": 3390688} +{"current_steps": 1465, "total_steps": 5305, "loss": 0.2518, "lr": 4.54341422997592e-06, "epoch": 1.3807728557964185, "percentage": 27.62, "elapsed_time": "0:07:06", "remaining_time": "0:18:39", "throughput": 7972.12, "total_tokens": 3403488} +{"current_steps": 1470, "total_steps": 5305, "loss": 0.1194, "lr": 4.538664143459819e-06, "epoch": 1.3854853911404335, "percentage": 27.71, "elapsed_time": "0:07:07", "remaining_time": "0:18:35", "throughput": 7989.84, "total_tokens": 3415648} +{"current_steps": 1475, "total_steps": 5305, "loss": 0.1113, "lr": 4.5338919860348565e-06, "epoch": 1.3901979264844486, "percentage": 27.8, "elapsed_time": "0:07:08", "remaining_time": "0:18:31", "throughput": 8006.45, "total_tokens": 3427168} +{"current_steps": 1480, "total_steps": 5305, "loss": 0.1426, "lr": 4.529097809365184e-06, "epoch": 1.3949104618284638, "percentage": 27.9, "elapsed_time": "0:07:08", "remaining_time": "0:18:27", "throughput": 8021.11, "total_tokens": 3437664} +{"current_steps": 1485, "total_steps": 5305, "loss": 0.3136, "lr": 4.524281665353334e-06, "epoch": 1.3996229971724787, "percentage": 27.99, "elapsed_time": "0:07:09", "remaining_time": "0:18:23", "throughput": 8039.27, "total_tokens": 3450144} +{"current_steps": 1490, "total_steps": 5305, "loss": 0.1617, "lr": 4.519443606139665e-06, "epoch": 1.404335532516494, "percentage": 28.09, "elapsed_time": "0:07:09", "remaining_time": "0:18:20", "throughput": 8055.0, "total_tokens": 3461280} +{"current_steps": 1495, "total_steps": 5305, "loss": 0.2666, "lr": 4.514583684101792e-06, "epoch": 1.409048067860509, "percentage": 28.18, "elapsed_time": "0:07:10", "remaining_time": "0:18:16", "throughput": 8070.97, "total_tokens": 3472608} +{"current_steps": 1500, "total_steps": 5305, "loss": 0.105, "lr": 4.509701951854018e-06, "epoch": 1.413760603204524, "percentage": 28.28, "elapsed_time": "0:07:10", "remaining_time": "0:18:12", "throughput": 8088.92, "total_tokens": 3485024} +{"current_steps": 1505, "total_steps": 5305, "loss": 0.2341, "lr": 4.504798462246768e-06, "epoch": 1.4184731385485392, "percentage": 28.37, "elapsed_time": "0:07:11", "remaining_time": "0:18:09", "throughput": 8104.34, "total_tokens": 3496096} +{"current_steps": 1510, "total_steps": 5305, "loss": 0.2829, "lr": 4.499873268366017e-06, "epoch": 1.4231856738925541, "percentage": 28.46, "elapsed_time": "0:07:11", "remaining_time": "0:18:05", "throughput": 8119.21, "total_tokens": 3506848} +{"current_steps": 1515, "total_steps": 5305, "loss": 0.1819, "lr": 4.494926423532715e-06, "epoch": 1.4278982092365693, "percentage": 28.56, "elapsed_time": "0:07:12", "remaining_time": "0:18:02", "throughput": 8140.94, "total_tokens": 3521568} +{"current_steps": 1520, "total_steps": 5305, "loss": 0.1103, "lr": 4.4899579813022046e-06, "epoch": 1.4326107445805842, "percentage": 28.65, "elapsed_time": "0:07:13", "remaining_time": "0:17:58", "throughput": 8158.46, "total_tokens": 3533856} +{"current_steps": 1525, "total_steps": 5305, "loss": 0.216, "lr": 4.484967995463648e-06, "epoch": 1.4373232799245994, "percentage": 28.75, "elapsed_time": "0:07:13", "remaining_time": "0:17:54", "throughput": 8173.11, "total_tokens": 3544544} +{"current_steps": 1530, "total_steps": 5305, "loss": 0.303, "lr": 4.479956520039443e-06, "epoch": 1.4420358152686146, "percentage": 28.84, "elapsed_time": "0:07:14", "remaining_time": "0:17:51", "throughput": 8186.1, "total_tokens": 3554336} +{"current_steps": 1535, "total_steps": 5305, "loss": 0.0434, "lr": 4.474923609284635e-06, "epoch": 1.4467483506126295, "percentage": 28.93, "elapsed_time": "0:07:14", "remaining_time": "0:17:47", "throughput": 8199.44, "total_tokens": 3564384} +{"current_steps": 1540, "total_steps": 5305, "loss": 0.1438, "lr": 4.469869317686332e-06, "epoch": 1.4514608859566447, "percentage": 29.03, "elapsed_time": "0:07:15", "remaining_time": "0:17:44", "throughput": 8217.32, "total_tokens": 3576992} +{"current_steps": 1545, "total_steps": 5305, "loss": 0.1766, "lr": 4.464793699963116e-06, "epoch": 1.4561734213006599, "percentage": 29.12, "elapsed_time": "0:07:15", "remaining_time": "0:17:40", "throughput": 8232.21, "total_tokens": 3587872} +{"current_steps": 1550, "total_steps": 5305, "loss": 0.0997, "lr": 4.4596968110644484e-06, "epoch": 1.4608859566446748, "percentage": 29.22, "elapsed_time": "0:07:16", "remaining_time": "0:17:37", "throughput": 8246.64, "total_tokens": 3598560} +{"current_steps": 1555, "total_steps": 5305, "loss": 0.1595, "lr": 4.454578706170075e-06, "epoch": 1.46559849198869, "percentage": 29.31, "elapsed_time": "0:07:16", "remaining_time": "0:17:33", "throughput": 8260.4, "total_tokens": 3608864} +{"current_steps": 1560, "total_steps": 5305, "loss": 0.0274, "lr": 4.44943944068943e-06, "epoch": 1.4703110273327051, "percentage": 29.41, "elapsed_time": "0:07:17", "remaining_time": "0:17:30", "throughput": 8277.18, "total_tokens": 3620960} +{"current_steps": 1565, "total_steps": 5305, "loss": 0.4584, "lr": 4.444279070261035e-06, "epoch": 1.47502356267672, "percentage": 29.5, "elapsed_time": "0:07:18", "remaining_time": "0:17:26", "throughput": 8292.37, "total_tokens": 3632096} +{"current_steps": 1570, "total_steps": 5305, "loss": 0.2423, "lr": 4.4390976507518994e-06, "epoch": 1.479736098020735, "percentage": 29.59, "elapsed_time": "0:07:18", "remaining_time": "0:17:23", "throughput": 8307.85, "total_tokens": 3643424} +{"current_steps": 1575, "total_steps": 5305, "loss": 0.046, "lr": 4.433895238256909e-06, "epoch": 1.4844486333647502, "percentage": 29.69, "elapsed_time": "0:07:19", "remaining_time": "0:17:19", "throughput": 8323.02, "total_tokens": 3654624} +{"current_steps": 1580, "total_steps": 5305, "loss": 0.0609, "lr": 4.4286718890982275e-06, "epoch": 1.4891611687087654, "percentage": 29.78, "elapsed_time": "0:07:19", "remaining_time": "0:17:16", "throughput": 8337.56, "total_tokens": 3665504} +{"current_steps": 1585, "total_steps": 5305, "loss": 0.2488, "lr": 4.423427659824681e-06, "epoch": 1.4938737040527803, "percentage": 29.88, "elapsed_time": "0:07:20", "remaining_time": "0:17:13", "throughput": 8352.21, "total_tokens": 3676448} +{"current_steps": 1590, "total_steps": 5305, "loss": 0.4721, "lr": 4.418162607211146e-06, "epoch": 1.4985862393967955, "percentage": 29.97, "elapsed_time": "0:07:20", "remaining_time": "0:17:09", "throughput": 8365.14, "total_tokens": 3686432} +{"current_steps": 1595, "total_steps": 5305, "loss": 0.2207, "lr": 4.412876788257936e-06, "epoch": 1.5032987747408106, "percentage": 30.07, "elapsed_time": "0:07:21", "remaining_time": "0:17:06", "throughput": 8379.66, "total_tokens": 3697312} +{"current_steps": 1596, "total_steps": 5305, "eval_loss": 0.35448023676872253, "epoch": 1.5042412818096136, "percentage": 30.08, "elapsed_time": "0:07:24", "remaining_time": "0:17:11", "throughput": 8330.57, "total_tokens": 3699104} +{"current_steps": 1600, "total_steps": 5305, "loss": 0.2648, "lr": 4.407570260190186e-06, "epoch": 1.5080113100848256, "percentage": 30.16, "elapsed_time": "0:08:31", "remaining_time": "0:19:44", "throughput": 7247.48, "total_tokens": 3707808} +{"current_steps": 1605, "total_steps": 5305, "loss": 0.3225, "lr": 4.402243080457229e-06, "epoch": 1.5127238454288408, "percentage": 30.25, "elapsed_time": "0:08:32", "remaining_time": "0:19:40", "throughput": 7262.9, "total_tokens": 3719840} +{"current_steps": 1610, "total_steps": 5305, "loss": 0.2234, "lr": 4.396895306731978e-06, "epoch": 1.517436380772856, "percentage": 30.35, "elapsed_time": "0:08:32", "remaining_time": "0:19:36", "throughput": 7277.21, "total_tokens": 3731168} +{"current_steps": 1615, "total_steps": 5305, "loss": 0.2199, "lr": 4.391526996910298e-06, "epoch": 1.5221489161168709, "percentage": 30.44, "elapsed_time": "0:08:33", "remaining_time": "0:19:32", "throughput": 7294.07, "total_tokens": 3744160} +{"current_steps": 1620, "total_steps": 5305, "loss": 0.1515, "lr": 4.386138209110385e-06, "epoch": 1.5268614514608858, "percentage": 30.54, "elapsed_time": "0:08:33", "remaining_time": "0:19:28", "throughput": 7307.41, "total_tokens": 3754912} +{"current_steps": 1625, "total_steps": 5305, "loss": 0.1179, "lr": 4.3807290016721265e-06, "epoch": 1.5315739868049012, "percentage": 30.63, "elapsed_time": "0:08:34", "remaining_time": "0:19:25", "throughput": 7323.94, "total_tokens": 3767776} +{"current_steps": 1630, "total_steps": 5305, "loss": 0.1079, "lr": 4.375299433156483e-06, "epoch": 1.5362865221489161, "percentage": 30.73, "elapsed_time": "0:08:35", "remaining_time": "0:19:21", "throughput": 7338.06, "total_tokens": 3779104} +{"current_steps": 1635, "total_steps": 5305, "loss": 0.359, "lr": 4.3698495623448424e-06, "epoch": 1.540999057492931, "percentage": 30.82, "elapsed_time": "0:08:35", "remaining_time": "0:19:17", "throughput": 7350.65, "total_tokens": 3789408} +{"current_steps": 1640, "total_steps": 5305, "loss": 0.1058, "lr": 4.364379448238392e-06, "epoch": 1.5457115928369463, "percentage": 30.91, "elapsed_time": "0:08:36", "remaining_time": "0:19:13", "throughput": 7362.96, "total_tokens": 3799584} +{"current_steps": 1645, "total_steps": 5305, "loss": 0.3319, "lr": 4.358889150057476e-06, "epoch": 1.5504241281809614, "percentage": 31.01, "elapsed_time": "0:08:36", "remaining_time": "0:19:09", "throughput": 7380.73, "total_tokens": 3813344} +{"current_steps": 1650, "total_steps": 5305, "loss": 0.1354, "lr": 4.35337872724095e-06, "epoch": 1.5551366635249764, "percentage": 31.1, "elapsed_time": "0:08:37", "remaining_time": "0:19:05", "throughput": 7392.68, "total_tokens": 3823328} +{"current_steps": 1655, "total_steps": 5305, "loss": 0.1612, "lr": 4.347848239445548e-06, "epoch": 1.5598491988689915, "percentage": 31.2, "elapsed_time": "0:08:37", "remaining_time": "0:19:01", "throughput": 7407.61, "total_tokens": 3835232} +{"current_steps": 1660, "total_steps": 5305, "loss": 0.2858, "lr": 4.342297746545228e-06, "epoch": 1.5645617342130067, "percentage": 31.29, "elapsed_time": "0:08:38", "remaining_time": "0:18:58", "throughput": 7421.28, "total_tokens": 3846368} +{"current_steps": 1665, "total_steps": 5305, "loss": 0.0313, "lr": 4.336727308630527e-06, "epoch": 1.5692742695570217, "percentage": 31.39, "elapsed_time": "0:08:38", "remaining_time": "0:18:54", "throughput": 7436.81, "total_tokens": 3858656} +{"current_steps": 1670, "total_steps": 5305, "loss": 0.1587, "lr": 4.33113698600791e-06, "epoch": 1.5739868049010366, "percentage": 31.48, "elapsed_time": "0:08:39", "remaining_time": "0:18:50", "throughput": 7453.41, "total_tokens": 3871776} +{"current_steps": 1675, "total_steps": 5305, "loss": 0.0377, "lr": 4.325526839199115e-06, "epoch": 1.578699340245052, "percentage": 31.57, "elapsed_time": "0:08:40", "remaining_time": "0:18:47", "throughput": 7469.21, "total_tokens": 3884384} +{"current_steps": 1680, "total_steps": 5305, "loss": 0.2741, "lr": 4.319896928940505e-06, "epoch": 1.583411875589067, "percentage": 31.67, "elapsed_time": "0:08:40", "remaining_time": "0:18:43", "throughput": 7483.85, "total_tokens": 3896224} +{"current_steps": 1685, "total_steps": 5305, "loss": 0.1037, "lr": 4.3142473161824e-06, "epoch": 1.5881244109330819, "percentage": 31.76, "elapsed_time": "0:08:41", "remaining_time": "0:18:39", "throughput": 7496.08, "total_tokens": 3906528} +{"current_steps": 1690, "total_steps": 5305, "loss": 0.1437, "lr": 4.308578062088426e-06, "epoch": 1.592836946277097, "percentage": 31.86, "elapsed_time": "0:08:41", "remaining_time": "0:18:35", "throughput": 7509.63, "total_tokens": 3917728} +{"current_steps": 1695, "total_steps": 5305, "loss": 0.3957, "lr": 4.302889228034846e-06, "epoch": 1.5975494816211122, "percentage": 31.95, "elapsed_time": "0:08:42", "remaining_time": "0:18:32", "throughput": 7521.91, "total_tokens": 3928032} +{"current_steps": 1700, "total_steps": 5305, "loss": 0.1641, "lr": 4.297180875609902e-06, "epoch": 1.6022620169651272, "percentage": 32.05, "elapsed_time": "0:08:42", "remaining_time": "0:18:28", "throughput": 7537.22, "total_tokens": 3940384} +{"current_steps": 1705, "total_steps": 5305, "loss": 0.0949, "lr": 4.2914530666131436e-06, "epoch": 1.6069745523091423, "percentage": 32.14, "elapsed_time": "0:08:43", "remaining_time": "0:18:25", "throughput": 7551.26, "total_tokens": 3951904} +{"current_steps": 1710, "total_steps": 5305, "loss": 0.2799, "lr": 4.285705863054759e-06, "epoch": 1.6116870876531575, "percentage": 32.23, "elapsed_time": "0:08:43", "remaining_time": "0:18:21", "throughput": 7565.08, "total_tokens": 3963360} +{"current_steps": 1715, "total_steps": 5305, "loss": 0.3126, "lr": 4.279939327154909e-06, "epoch": 1.6163996229971724, "percentage": 32.33, "elapsed_time": "0:08:44", "remaining_time": "0:18:17", "throughput": 7578.36, "total_tokens": 3974432} +{"current_steps": 1720, "total_steps": 5305, "loss": 0.2358, "lr": 4.274153521343047e-06, "epoch": 1.6211121583411876, "percentage": 32.42, "elapsed_time": "0:08:44", "remaining_time": "0:18:14", "throughput": 7589.91, "total_tokens": 3984352} +{"current_steps": 1725, "total_steps": 5305, "loss": 0.0892, "lr": 4.268348508257243e-06, "epoch": 1.6258246936852028, "percentage": 32.52, "elapsed_time": "0:08:45", "remaining_time": "0:18:10", "throughput": 7601.06, "total_tokens": 3994016} +{"current_steps": 1730, "total_steps": 5305, "loss": 0.3199, "lr": 4.262524350743512e-06, "epoch": 1.6305372290292177, "percentage": 32.61, "elapsed_time": "0:08:46", "remaining_time": "0:18:06", "throughput": 7615.46, "total_tokens": 4005856} +{"current_steps": 1735, "total_steps": 5305, "loss": 0.1497, "lr": 4.25668111185513e-06, "epoch": 1.6352497643732327, "percentage": 32.7, "elapsed_time": "0:08:46", "remaining_time": "0:18:03", "throughput": 7629.08, "total_tokens": 4017248} +{"current_steps": 1740, "total_steps": 5305, "loss": 0.1124, "lr": 4.250818854851948e-06, "epoch": 1.6399622997172478, "percentage": 32.8, "elapsed_time": "0:08:47", "remaining_time": "0:17:59", "throughput": 7641.97, "total_tokens": 4028128} +{"current_steps": 1745, "total_steps": 5305, "loss": 0.1923, "lr": 4.244937643199711e-06, "epoch": 1.644674835061263, "percentage": 32.89, "elapsed_time": "0:08:47", "remaining_time": "0:17:56", "throughput": 7663.14, "total_tokens": 4044768} +{"current_steps": 1750, "total_steps": 5305, "loss": 0.1026, "lr": 4.239037540569373e-06, "epoch": 1.649387370405278, "percentage": 32.99, "elapsed_time": "0:08:48", "remaining_time": "0:17:53", "throughput": 7685.91, "total_tokens": 4062432} +{"current_steps": 1755, "total_steps": 5305, "loss": 0.0699, "lr": 4.233118610836401e-06, "epoch": 1.654099905749293, "percentage": 33.08, "elapsed_time": "0:08:49", "remaining_time": "0:17:50", "throughput": 7699.73, "total_tokens": 4074016} +{"current_steps": 1760, "total_steps": 5305, "loss": 0.1875, "lr": 4.227180918080089e-06, "epoch": 1.6588124410933083, "percentage": 33.18, "elapsed_time": "0:08:49", "remaining_time": "0:17:46", "throughput": 7712.12, "total_tokens": 4084704} +{"current_steps": 1765, "total_steps": 5305, "loss": 0.0828, "lr": 4.221224526582863e-06, "epoch": 1.6635249764373232, "percentage": 33.27, "elapsed_time": "0:08:50", "remaining_time": "0:17:43", "throughput": 7724.13, "total_tokens": 4095136} +{"current_steps": 1770, "total_steps": 5305, "loss": 0.1379, "lr": 4.215249500829583e-06, "epoch": 1.6682375117813384, "percentage": 33.36, "elapsed_time": "0:08:50", "remaining_time": "0:17:40", "throughput": 7739.34, "total_tokens": 4107744} +{"current_steps": 1775, "total_steps": 5305, "loss": 0.2322, "lr": 4.209255905506847e-06, "epoch": 1.6729500471253536, "percentage": 33.46, "elapsed_time": "0:08:51", "remaining_time": "0:17:36", "throughput": 7752.03, "total_tokens": 4118624} +{"current_steps": 1780, "total_steps": 5305, "loss": 0.1804, "lr": 4.2032438055022925e-06, "epoch": 1.6776625824693685, "percentage": 33.55, "elapsed_time": "0:08:51", "remaining_time": "0:17:33", "throughput": 7764.19, "total_tokens": 4129184} +{"current_steps": 1785, "total_steps": 5305, "loss": 0.3414, "lr": 4.197213265903889e-06, "epoch": 1.6823751178133834, "percentage": 33.65, "elapsed_time": "0:08:52", "remaining_time": "0:17:29", "throughput": 7778.18, "total_tokens": 4141024} +{"current_steps": 1790, "total_steps": 5305, "loss": 0.3523, "lr": 4.191164351999236e-06, "epoch": 1.6870876531573988, "percentage": 33.74, "elapsed_time": "0:08:52", "remaining_time": "0:17:26", "throughput": 7790.59, "total_tokens": 4151840} +{"current_steps": 1795, "total_steps": 5305, "loss": 0.2797, "lr": 4.18509712927486e-06, "epoch": 1.6918001885014138, "percentage": 33.84, "elapsed_time": "0:08:53", "remaining_time": "0:17:23", "throughput": 7805.98, "total_tokens": 4164704} +{"current_steps": 1800, "total_steps": 5305, "loss": 0.2943, "lr": 4.179011663415494e-06, "epoch": 1.6965127238454287, "percentage": 33.93, "elapsed_time": "0:08:54", "remaining_time": "0:17:20", "throughput": 7820.91, "total_tokens": 4177184} +{"current_steps": 1805, "total_steps": 5305, "loss": 0.0589, "lr": 4.172908020303384e-06, "epoch": 1.701225259189444, "percentage": 34.02, "elapsed_time": "0:08:54", "remaining_time": "0:17:16", "throughput": 7834.44, "total_tokens": 4188768} +{"current_steps": 1810, "total_steps": 5305, "loss": 0.1865, "lr": 4.166786266017557e-06, "epoch": 1.705937794533459, "percentage": 34.12, "elapsed_time": "0:08:55", "remaining_time": "0:17:13", "throughput": 7848.17, "total_tokens": 4200480} +{"current_steps": 1815, "total_steps": 5305, "loss": 0.1045, "lr": 4.160646466833121e-06, "epoch": 1.710650329877474, "percentage": 34.21, "elapsed_time": "0:08:55", "remaining_time": "0:17:10", "throughput": 7861.66, "total_tokens": 4212064} +{"current_steps": 1820, "total_steps": 5305, "loss": 0.2373, "lr": 4.154488689220536e-06, "epoch": 1.7153628652214892, "percentage": 34.31, "elapsed_time": "0:08:56", "remaining_time": "0:17:06", "throughput": 7872.22, "total_tokens": 4221728} +{"current_steps": 1825, "total_steps": 5305, "loss": 0.216, "lr": 4.1483129998449035e-06, "epoch": 1.7200754005655043, "percentage": 34.4, "elapsed_time": "0:08:56", "remaining_time": "0:17:03", "throughput": 7886.45, "total_tokens": 4233888} +{"current_steps": 1830, "total_steps": 5305, "loss": 0.2308, "lr": 4.142119465565238e-06, "epoch": 1.7247879359095193, "percentage": 34.5, "elapsed_time": "0:08:57", "remaining_time": "0:17:00", "throughput": 7899.66, "total_tokens": 4245344} +{"current_steps": 1835, "total_steps": 5305, "loss": 0.0663, "lr": 4.135908153433748e-06, "epoch": 1.7295004712535345, "percentage": 34.59, "elapsed_time": "0:08:57", "remaining_time": "0:16:57", "throughput": 7913.13, "total_tokens": 4256992} +{"current_steps": 1840, "total_steps": 5305, "loss": 0.0795, "lr": 4.129679130695105e-06, "epoch": 1.7342130065975496, "percentage": 34.68, "elapsed_time": "0:08:58", "remaining_time": "0:16:54", "throughput": 7923.88, "total_tokens": 4266784} +{"current_steps": 1845, "total_steps": 5305, "loss": 0.0953, "lr": 4.123432464785721e-06, "epoch": 1.7389255419415646, "percentage": 34.78, "elapsed_time": "0:08:59", "remaining_time": "0:16:51", "throughput": 7941.71, "total_tokens": 4281504} +{"current_steps": 1850, "total_steps": 5305, "loss": 0.3657, "lr": 4.117168223333015e-06, "epoch": 1.7436380772855795, "percentage": 34.87, "elapsed_time": "0:08:59", "remaining_time": "0:16:48", "throughput": 7959.28, "total_tokens": 4296032} +{"current_steps": 1855, "total_steps": 5305, "loss": 0.0417, "lr": 4.1108864741546815e-06, "epoch": 1.7483506126295947, "percentage": 34.97, "elapsed_time": "0:09:00", "remaining_time": "0:16:44", "throughput": 7974.85, "total_tokens": 4309280} +{"current_steps": 1860, "total_steps": 5305, "loss": 0.1138, "lr": 4.1045872852579546e-06, "epoch": 1.7530631479736098, "percentage": 35.06, "elapsed_time": "0:09:00", "remaining_time": "0:16:41", "throughput": 7986.29, "total_tokens": 4319648} +{"current_steps": 1862, "total_steps": 5305, "eval_loss": 0.3500010073184967, "epoch": 1.7549481621112157, "percentage": 35.1, "elapsed_time": "0:09:03", "remaining_time": "0:16:45", "throughput": 7951.47, "total_tokens": 4324256} +{"current_steps": 1865, "total_steps": 5305, "loss": 0.0767, "lr": 4.098270724838879e-06, "epoch": 1.7577756833176248, "percentage": 35.16, "elapsed_time": "0:09:34", "remaining_time": "0:17:40", "throughput": 7533.11, "total_tokens": 4330144} +{"current_steps": 1870, "total_steps": 5305, "loss": 0.0415, "lr": 4.091936861281561e-06, "epoch": 1.76248821866164, "percentage": 35.25, "elapsed_time": "0:09:35", "remaining_time": "0:17:37", "throughput": 7548.63, "total_tokens": 4343712} +{"current_steps": 1875, "total_steps": 5305, "loss": 0.4214, "lr": 4.085585763157435e-06, "epoch": 1.7672007540056551, "percentage": 35.34, "elapsed_time": "0:09:35", "remaining_time": "0:17:33", "throughput": 7559.85, "total_tokens": 4354144} +{"current_steps": 1880, "total_steps": 5305, "loss": 0.013, "lr": 4.07921749922452e-06, "epoch": 1.77191328934967, "percentage": 35.44, "elapsed_time": "0:09:36", "remaining_time": "0:17:30", "throughput": 7571.51, "total_tokens": 4364896} +{"current_steps": 1885, "total_steps": 5305, "loss": 0.1879, "lr": 4.0728321384266764e-06, "epoch": 1.7766258246936852, "percentage": 35.53, "elapsed_time": "0:09:37", "remaining_time": "0:17:26", "throughput": 7585.19, "total_tokens": 4377120} +{"current_steps": 1890, "total_steps": 5305, "loss": 0.1512, "lr": 4.066429749892854e-06, "epoch": 1.7813383600377004, "percentage": 35.63, "elapsed_time": "0:09:37", "remaining_time": "0:17:23", "throughput": 7597.16, "total_tokens": 4388128} +{"current_steps": 1895, "total_steps": 5305, "loss": 0.1946, "lr": 4.060010402936353e-06, "epoch": 1.7860508953817154, "percentage": 35.72, "elapsed_time": "0:09:38", "remaining_time": "0:17:20", "throughput": 7613.32, "total_tokens": 4402272} +{"current_steps": 1900, "total_steps": 5305, "loss": 0.0513, "lr": 4.053574167054063e-06, "epoch": 1.7907634307257303, "percentage": 35.82, "elapsed_time": "0:09:38", "remaining_time": "0:17:17", "throughput": 7623.91, "total_tokens": 4412640} +{"current_steps": 1905, "total_steps": 5305, "loss": 0.2935, "lr": 4.047121111925718e-06, "epoch": 1.7954759660697457, "percentage": 35.91, "elapsed_time": "0:09:39", "remaining_time": "0:17:13", "throughput": 7636.41, "total_tokens": 4424096} +{"current_steps": 1910, "total_steps": 5305, "loss": 0.1499, "lr": 4.040651307413142e-06, "epoch": 1.8001885014137606, "percentage": 36.0, "elapsed_time": "0:09:39", "remaining_time": "0:17:10", "throughput": 7646.98, "total_tokens": 4434144} +{"current_steps": 1915, "total_steps": 5305, "loss": 0.1671, "lr": 4.034164823559487e-06, "epoch": 1.8049010367577756, "percentage": 36.1, "elapsed_time": "0:09:40", "remaining_time": "0:17:07", "throughput": 7660.29, "total_tokens": 4446240} +{"current_steps": 1920, "total_steps": 5305, "loss": 0.183, "lr": 4.02766173058848e-06, "epoch": 1.8096135721017907, "percentage": 36.19, "elapsed_time": "0:09:40", "remaining_time": "0:17:04", "throughput": 7670.01, "total_tokens": 4455712} +{"current_steps": 1925, "total_steps": 5305, "loss": 0.2619, "lr": 4.021142098903662e-06, "epoch": 1.814326107445806, "percentage": 36.29, "elapsed_time": "0:09:41", "remaining_time": "0:17:00", "throughput": 7681.07, "total_tokens": 4466144} +{"current_steps": 1930, "total_steps": 5305, "loss": 0.2168, "lr": 4.014605999087623e-06, "epoch": 1.8190386427898209, "percentage": 36.38, "elapsed_time": "0:09:41", "remaining_time": "0:16:57", "throughput": 7691.33, "total_tokens": 4476064} +{"current_steps": 1935, "total_steps": 5305, "loss": 0.1402, "lr": 4.008053501901239e-06, "epoch": 1.823751178133836, "percentage": 36.48, "elapsed_time": "0:09:42", "remaining_time": "0:16:54", "throughput": 7703.64, "total_tokens": 4487456} +{"current_steps": 1940, "total_steps": 5305, "loss": 0.2318, "lr": 4.001484678282911e-06, "epoch": 1.8284637134778512, "percentage": 36.57, "elapsed_time": "0:09:43", "remaining_time": "0:16:51", "throughput": 7715.3, "total_tokens": 4498400} +{"current_steps": 1945, "total_steps": 5305, "loss": 0.1527, "lr": 3.994899599347787e-06, "epoch": 1.8331762488218661, "percentage": 36.66, "elapsed_time": "0:09:43", "remaining_time": "0:16:48", "throughput": 7729.78, "total_tokens": 4511520} +{"current_steps": 1950, "total_steps": 5305, "loss": 0.151, "lr": 3.9882983363869995e-06, "epoch": 1.837888784165881, "percentage": 36.76, "elapsed_time": "0:09:44", "remaining_time": "0:16:45", "throughput": 7742.41, "total_tokens": 4523232} +{"current_steps": 1955, "total_steps": 5305, "loss": 0.084, "lr": 3.981680960866896e-06, "epoch": 1.8426013195098965, "percentage": 36.85, "elapsed_time": "0:09:44", "remaining_time": "0:16:42", "throughput": 7757.02, "total_tokens": 4536416} +{"current_steps": 1960, "total_steps": 5305, "loss": 0.1193, "lr": 3.9750475444282545e-06, "epoch": 1.8473138548539114, "percentage": 36.95, "elapsed_time": "0:09:45", "remaining_time": "0:16:38", "throughput": 7767.46, "total_tokens": 4546528} +{"current_steps": 1965, "total_steps": 5305, "loss": 0.0301, "lr": 3.968398158885519e-06, "epoch": 1.8520263901979264, "percentage": 37.04, "elapsed_time": "0:09:45", "remaining_time": "0:16:35", "throughput": 7781.07, "total_tokens": 4559008} +{"current_steps": 1970, "total_steps": 5305, "loss": 0.1272, "lr": 3.961732876226016e-06, "epoch": 1.8567389255419415, "percentage": 37.13, "elapsed_time": "0:09:46", "remaining_time": "0:16:32", "throughput": 7792.41, "total_tokens": 4569824} +{"current_steps": 1975, "total_steps": 5305, "loss": 0.0125, "lr": 3.955051768609179e-06, "epoch": 1.8614514608859567, "percentage": 37.23, "elapsed_time": "0:09:47", "remaining_time": "0:16:29", "throughput": 7805.06, "total_tokens": 4581664} +{"current_steps": 1980, "total_steps": 5305, "loss": 0.2273, "lr": 3.948354908365762e-06, "epoch": 1.8661639962299716, "percentage": 37.32, "elapsed_time": "0:09:47", "remaining_time": "0:16:26", "throughput": 7817.9, "total_tokens": 4593696} +{"current_steps": 1985, "total_steps": 5305, "loss": 0.3306, "lr": 3.941642367997062e-06, "epoch": 1.8708765315739868, "percentage": 37.42, "elapsed_time": "0:09:48", "remaining_time": "0:16:23", "throughput": 7828.61, "total_tokens": 4604064} +{"current_steps": 1990, "total_steps": 5305, "loss": 0.2246, "lr": 3.934914220174128e-06, "epoch": 1.875589066918002, "percentage": 37.51, "elapsed_time": "0:09:48", "remaining_time": "0:16:20", "throughput": 7838.43, "total_tokens": 4613856} +{"current_steps": 1995, "total_steps": 5305, "loss": 0.262, "lr": 3.9281705377369814e-06, "epoch": 1.880301602262017, "percentage": 37.61, "elapsed_time": "0:09:49", "remaining_time": "0:16:17", "throughput": 7849.45, "total_tokens": 4624480} +{"current_steps": 2000, "total_steps": 5305, "loss": 0.0359, "lr": 3.921411393693823e-06, "epoch": 1.885014137606032, "percentage": 37.7, "elapsed_time": "0:09:49", "remaining_time": "0:16:14", "throughput": 7859.79, "total_tokens": 4634720} +{"current_steps": 2005, "total_steps": 5305, "loss": 0.1522, "lr": 3.9146368612202425e-06, "epoch": 1.8897266729500473, "percentage": 37.79, "elapsed_time": "0:09:50", "remaining_time": "0:16:11", "throughput": 7869.34, "total_tokens": 4644320} +{"current_steps": 2010, "total_steps": 5305, "loss": 0.1144, "lr": 3.907847013658429e-06, "epoch": 1.8944392082940622, "percentage": 37.89, "elapsed_time": "0:09:50", "remaining_time": "0:16:08", "throughput": 7882.48, "total_tokens": 4656672} +{"current_steps": 2015, "total_steps": 5305, "loss": 0.152, "lr": 3.901041924516372e-06, "epoch": 1.8991517436380771, "percentage": 37.98, "elapsed_time": "0:09:51", "remaining_time": "0:16:05", "throughput": 7895.41, "total_tokens": 4668832} +{"current_steps": 2020, "total_steps": 5305, "loss": 0.0683, "lr": 3.894221667467074e-06, "epoch": 1.9038642789820923, "percentage": 38.08, "elapsed_time": "0:09:51", "remaining_time": "0:16:02", "throughput": 7907.15, "total_tokens": 4680096} +{"current_steps": 2025, "total_steps": 5305, "loss": 0.0966, "lr": 3.887386316347742e-06, "epoch": 1.9085768143261075, "percentage": 38.17, "elapsed_time": "0:09:52", "remaining_time": "0:15:59", "throughput": 7920.05, "total_tokens": 4692320} +{"current_steps": 2030, "total_steps": 5305, "loss": 0.1503, "lr": 3.880535945158997e-06, "epoch": 1.9132893496701224, "percentage": 38.27, "elapsed_time": "0:09:53", "remaining_time": "0:15:56", "throughput": 7939.16, "total_tokens": 4709344} +{"current_steps": 2035, "total_steps": 5305, "loss": 0.0726, "lr": 3.873670628064071e-06, "epoch": 1.9180018850141376, "percentage": 38.36, "elapsed_time": "0:09:53", "remaining_time": "0:15:54", "throughput": 7952.47, "total_tokens": 4721888} +{"current_steps": 2040, "total_steps": 5305, "loss": 0.117, "lr": 3.866790439387998e-06, "epoch": 1.9227144203581528, "percentage": 38.45, "elapsed_time": "0:09:54", "remaining_time": "0:15:51", "throughput": 7963.06, "total_tokens": 4732384} +{"current_steps": 2045, "total_steps": 5305, "loss": 0.2188, "lr": 3.85989545361682e-06, "epoch": 1.9274269557021677, "percentage": 38.55, "elapsed_time": "0:09:54", "remaining_time": "0:15:48", "throughput": 7974.13, "total_tokens": 4743264} +{"current_steps": 2050, "total_steps": 5305, "loss": 0.1091, "lr": 3.85298574539677e-06, "epoch": 1.9321394910461829, "percentage": 38.64, "elapsed_time": "0:09:55", "remaining_time": "0:15:45", "throughput": 7983.97, "total_tokens": 4753248} +{"current_steps": 2055, "total_steps": 5305, "loss": 0.0907, "lr": 3.846061389533472e-06, "epoch": 1.936852026390198, "percentage": 38.74, "elapsed_time": "0:09:55", "remaining_time": "0:15:42", "throughput": 7995.81, "total_tokens": 4764768} +{"current_steps": 2060, "total_steps": 5305, "loss": 0.2683, "lr": 3.839122460991124e-06, "epoch": 1.941564561734213, "percentage": 38.83, "elapsed_time": "0:09:56", "remaining_time": "0:15:39", "throughput": 8006.65, "total_tokens": 4775456} +{"current_steps": 2065, "total_steps": 5305, "loss": 0.3549, "lr": 3.832169034891695e-06, "epoch": 1.946277097078228, "percentage": 38.93, "elapsed_time": "0:09:57", "remaining_time": "0:15:36", "throughput": 8021.3, "total_tokens": 4789152} +{"current_steps": 2070, "total_steps": 5305, "loss": 0.0639, "lr": 3.825201186514103e-06, "epoch": 1.9509896324222433, "percentage": 39.02, "elapsed_time": "0:09:57", "remaining_time": "0:15:34", "throughput": 8036.72, "total_tokens": 4803488} +{"current_steps": 2075, "total_steps": 5305, "loss": 0.2019, "lr": 3.818218991293406e-06, "epoch": 1.9557021677662583, "percentage": 39.11, "elapsed_time": "0:09:58", "remaining_time": "0:15:31", "throughput": 8046.21, "total_tokens": 4813216} +{"current_steps": 2080, "total_steps": 5305, "loss": 0.1943, "lr": 3.811222524819983e-06, "epoch": 1.9604147031102732, "percentage": 39.21, "elapsed_time": "0:09:58", "remaining_time": "0:15:28", "throughput": 8056.6, "total_tokens": 4823584} +{"current_steps": 2085, "total_steps": 5305, "loss": 0.0531, "lr": 3.8042118628387138e-06, "epoch": 1.9651272384542884, "percentage": 39.3, "elapsed_time": "0:09:59", "remaining_time": "0:15:25", "throughput": 8072.83, "total_tokens": 4838624} +{"current_steps": 2090, "total_steps": 5305, "loss": 0.0121, "lr": 3.7971870812481636e-06, "epoch": 1.9698397737983036, "percentage": 39.4, "elapsed_time": "0:09:59", "remaining_time": "0:15:22", "throughput": 8086.32, "total_tokens": 4851552} +{"current_steps": 2095, "total_steps": 5305, "loss": 0.1929, "lr": 3.7901482560997577e-06, "epoch": 1.9745523091423185, "percentage": 39.49, "elapsed_time": "0:10:00", "remaining_time": "0:15:20", "throughput": 8099.68, "total_tokens": 4864352} +{"current_steps": 2100, "total_steps": 5305, "loss": 0.2053, "lr": 3.78309546359696e-06, "epoch": 1.9792648444863337, "percentage": 39.59, "elapsed_time": "0:10:01", "remaining_time": "0:15:17", "throughput": 8110.99, "total_tokens": 4875616} +{"current_steps": 2105, "total_steps": 5305, "loss": 0.0107, "lr": 3.776028780094446e-06, "epoch": 1.9839773798303488, "percentage": 39.68, "elapsed_time": "0:10:01", "remaining_time": "0:15:14", "throughput": 8121.94, "total_tokens": 4886560} +{"current_steps": 2110, "total_steps": 5305, "loss": 0.2379, "lr": 3.7689482820972797e-06, "epoch": 1.9886899151743638, "percentage": 39.77, "elapsed_time": "0:10:02", "remaining_time": "0:15:11", "throughput": 8134.21, "total_tokens": 4898592} +{"current_steps": 2115, "total_steps": 5305, "loss": 0.2504, "lr": 3.7618540462600792e-06, "epoch": 1.993402450518379, "percentage": 39.87, "elapsed_time": "0:10:02", "remaining_time": "0:15:09", "throughput": 8148.48, "total_tokens": 4912160} +{"current_steps": 2120, "total_steps": 5305, "loss": 0.1832, "lr": 3.7547461493861948e-06, "epoch": 1.998114985862394, "percentage": 39.96, "elapsed_time": "0:10:03", "remaining_time": "0:15:06", "throughput": 8159.79, "total_tokens": 4923424} +{"current_steps": 2125, "total_steps": 5305, "loss": 0.0762, "lr": 3.7476246684268703e-06, "epoch": 2.002827521206409, "percentage": 40.06, "elapsed_time": "0:10:04", "remaining_time": "0:15:05", "throughput": 8154.58, "total_tokens": 4932416} +{"current_steps": 2128, "total_steps": 5305, "eval_loss": 0.33445462584495544, "epoch": 2.005655042412818, "percentage": 40.11, "elapsed_time": "0:10:08", "remaining_time": "0:15:08", "throughput": 8118.67, "total_tokens": 4940992} +{"current_steps": 2130, "total_steps": 5305, "loss": 0.0528, "lr": 3.740489680480415e-06, "epoch": 2.007540056550424, "percentage": 40.15, "elapsed_time": "0:11:00", "remaining_time": "0:16:25", "throughput": 7487.96, "total_tokens": 4948288} +{"current_steps": 2135, "total_steps": 5305, "loss": 0.0067, "lr": 3.733341262791366e-06, "epoch": 2.0122525918944394, "percentage": 40.25, "elapsed_time": "0:11:01", "remaining_time": "0:16:22", "throughput": 7499.95, "total_tokens": 4960512} +{"current_steps": 2140, "total_steps": 5305, "loss": 0.0027, "lr": 3.7261794927496535e-06, "epoch": 2.0169651272384543, "percentage": 40.34, "elapsed_time": "0:11:01", "remaining_time": "0:16:19", "throughput": 7511.43, "total_tokens": 4972352} +{"current_steps": 2145, "total_steps": 5305, "loss": 0.0681, "lr": 3.719004447889762e-06, "epoch": 2.0216776625824693, "percentage": 40.43, "elapsed_time": "0:11:02", "remaining_time": "0:16:15", "throughput": 7520.63, "total_tokens": 4982272} +{"current_steps": 2150, "total_steps": 5305, "loss": 0.1795, "lr": 3.7118162058898915e-06, "epoch": 2.0263901979264842, "percentage": 40.53, "elapsed_time": "0:11:03", "remaining_time": "0:16:12", "throughput": 7530.9, "total_tokens": 4993088} +{"current_steps": 2155, "total_steps": 5305, "loss": 0.0124, "lr": 3.704614844571117e-06, "epoch": 2.0311027332704996, "percentage": 40.62, "elapsed_time": "0:11:03", "remaining_time": "0:16:09", "throughput": 7540.49, "total_tokens": 5003392} +{"current_steps": 2160, "total_steps": 5305, "loss": 0.0007, "lr": 3.6974004418965435e-06, "epoch": 2.0358152686145146, "percentage": 40.72, "elapsed_time": "0:11:04", "remaining_time": "0:16:06", "throughput": 7551.12, "total_tokens": 5014592} +{"current_steps": 2165, "total_steps": 5305, "loss": 0.1943, "lr": 3.6901730759704674e-06, "epoch": 2.0405278039585295, "percentage": 40.81, "elapsed_time": "0:11:04", "remaining_time": "0:16:04", "throughput": 7564.58, "total_tokens": 5028160} +{"current_steps": 2170, "total_steps": 5305, "loss": 0.1365, "lr": 3.682932825037523e-06, "epoch": 2.045240339302545, "percentage": 40.9, "elapsed_time": "0:11:05", "remaining_time": "0:16:01", "throughput": 7572.98, "total_tokens": 5037504} +{"current_steps": 2175, "total_steps": 5305, "loss": 0.0894, "lr": 3.675679767481842e-06, "epoch": 2.04995287464656, "percentage": 41.0, "elapsed_time": "0:11:05", "remaining_time": "0:15:58", "throughput": 7587.85, "total_tokens": 5052288} +{"current_steps": 2180, "total_steps": 5305, "loss": 0.1397, "lr": 3.6684139818262045e-06, "epoch": 2.054665409990575, "percentage": 41.09, "elapsed_time": "0:11:06", "remaining_time": "0:15:55", "throughput": 7599.5, "total_tokens": 5064384} +{"current_steps": 2185, "total_steps": 5305, "loss": 0.0268, "lr": 3.6611355467311825e-06, "epoch": 2.05937794533459, "percentage": 41.19, "elapsed_time": "0:11:06", "remaining_time": "0:15:52", "throughput": 7608.46, "total_tokens": 5074240} +{"current_steps": 2190, "total_steps": 5305, "loss": 0.0081, "lr": 3.653844540994298e-06, "epoch": 2.064090480678605, "percentage": 41.28, "elapsed_time": "0:11:07", "remaining_time": "0:15:49", "throughput": 7618.86, "total_tokens": 5085312} +{"current_steps": 2195, "total_steps": 5305, "loss": 0.0006, "lr": 3.6465410435491603e-06, "epoch": 2.06880301602262, "percentage": 41.38, "elapsed_time": "0:11:07", "remaining_time": "0:15:46", "throughput": 7627.12, "total_tokens": 5094592} +{"current_steps": 2200, "total_steps": 5305, "loss": 0.0012, "lr": 3.6392251334646194e-06, "epoch": 2.0735155513666355, "percentage": 41.47, "elapsed_time": "0:11:08", "remaining_time": "0:15:43", "throughput": 7640.88, "total_tokens": 5108544} +{"current_steps": 2205, "total_steps": 5305, "loss": 0.2164, "lr": 3.6318968899439042e-06, "epoch": 2.0782280867106504, "percentage": 41.56, "elapsed_time": "0:11:09", "remaining_time": "0:15:40", "throughput": 7648.84, "total_tokens": 5118976} +{"current_steps": 2210, "total_steps": 5305, "loss": 0.0004, "lr": 3.6245563923237692e-06, "epoch": 2.0829406220546653, "percentage": 41.66, "elapsed_time": "0:11:09", "remaining_time": "0:15:38", "throughput": 7664.08, "total_tokens": 5134272} +{"current_steps": 2215, "total_steps": 5305, "loss": 0.0463, "lr": 3.617203720073633e-06, "epoch": 2.0876531573986803, "percentage": 41.75, "elapsed_time": "0:11:10", "remaining_time": "0:15:35", "throughput": 7674.4, "total_tokens": 5145408} +{"current_steps": 2220, "total_steps": 5305, "loss": 0.1413, "lr": 3.6098389527947164e-06, "epoch": 2.0923656927426957, "percentage": 41.85, "elapsed_time": "0:11:11", "remaining_time": "0:15:32", "throughput": 7685.87, "total_tokens": 5157440} +{"current_steps": 2225, "total_steps": 5305, "loss": 0.0007, "lr": 3.6024621702191876e-06, "epoch": 2.0970782280867106, "percentage": 41.94, "elapsed_time": "0:11:11", "remaining_time": "0:15:29", "throughput": 7698.04, "total_tokens": 5170176} +{"current_steps": 2230, "total_steps": 5305, "loss": 0.2877, "lr": 3.5950734522092908e-06, "epoch": 2.1017907634307256, "percentage": 42.04, "elapsed_time": "0:11:12", "remaining_time": "0:15:26", "throughput": 7705.6, "total_tokens": 5178944} +{"current_steps": 2235, "total_steps": 5305, "loss": 0.0007, "lr": 3.587672878756487e-06, "epoch": 2.106503298774741, "percentage": 42.13, "elapsed_time": "0:11:12", "remaining_time": "0:15:23", "throughput": 7716.13, "total_tokens": 5190272} +{"current_steps": 2240, "total_steps": 5305, "loss": 0.0004, "lr": 3.5802605299805843e-06, "epoch": 2.111215834118756, "percentage": 42.22, "elapsed_time": "0:11:13", "remaining_time": "0:15:21", "throughput": 7727.47, "total_tokens": 5202304} +{"current_steps": 2245, "total_steps": 5305, "loss": 0.1757, "lr": 3.5728364861288743e-06, "epoch": 2.115928369462771, "percentage": 42.32, "elapsed_time": "0:11:13", "remaining_time": "0:15:18", "throughput": 7740.51, "total_tokens": 5215808} +{"current_steps": 2250, "total_steps": 5305, "loss": 0.0003, "lr": 3.5654008275752607e-06, "epoch": 2.1206409048067862, "percentage": 42.41, "elapsed_time": "0:11:14", "remaining_time": "0:15:15", "throughput": 7753.23, "total_tokens": 5229056} +{"current_steps": 2255, "total_steps": 5305, "loss": 0.0007, "lr": 3.557953634819389e-06, "epoch": 2.125353440150801, "percentage": 42.51, "elapsed_time": "0:11:14", "remaining_time": "0:15:12", "throughput": 7762.86, "total_tokens": 5239616} +{"current_steps": 2260, "total_steps": 5305, "loss": 0.1511, "lr": 3.550494988485777e-06, "epoch": 2.130065975494816, "percentage": 42.6, "elapsed_time": "0:11:15", "remaining_time": "0:15:10", "throughput": 7771.73, "total_tokens": 5249600} +{"current_steps": 2265, "total_steps": 5305, "loss": 0.2004, "lr": 3.5430249693229403e-06, "epoch": 2.1347785108388315, "percentage": 42.7, "elapsed_time": "0:11:16", "remaining_time": "0:15:07", "throughput": 7783.28, "total_tokens": 5261888} +{"current_steps": 2270, "total_steps": 5305, "loss": 0.0272, "lr": 3.5355436582025184e-06, "epoch": 2.1394910461828465, "percentage": 42.79, "elapsed_time": "0:11:16", "remaining_time": "0:15:04", "throughput": 7793.17, "total_tokens": 5272768} +{"current_steps": 2275, "total_steps": 5305, "loss": 0.142, "lr": 3.5280511361183995e-06, "epoch": 2.1442035815268614, "percentage": 42.88, "elapsed_time": "0:11:17", "remaining_time": "0:15:01", "throughput": 7802.88, "total_tokens": 5283520} +{"current_steps": 2280, "total_steps": 5305, "loss": 0.0003, "lr": 3.5205474841858444e-06, "epoch": 2.1489161168708764, "percentage": 42.98, "elapsed_time": "0:11:17", "remaining_time": "0:14:59", "throughput": 7812.59, "total_tokens": 5294336} +{"current_steps": 2285, "total_steps": 5305, "loss": 0.0445, "lr": 3.513032783640605e-06, "epoch": 2.1536286522148917, "percentage": 43.07, "elapsed_time": "0:11:18", "remaining_time": "0:14:56", "throughput": 7822.2, "total_tokens": 5304960} +{"current_steps": 2290, "total_steps": 5305, "loss": 0.0002, "lr": 3.5055071158380512e-06, "epoch": 2.1583411875589067, "percentage": 43.17, "elapsed_time": "0:11:18", "remaining_time": "0:14:53", "throughput": 7833.59, "total_tokens": 5317184} +{"current_steps": 2295, "total_steps": 5305, "loss": 0.0003, "lr": 3.497970562252282e-06, "epoch": 2.1630537229029216, "percentage": 43.26, "elapsed_time": "0:11:19", "remaining_time": "0:14:50", "throughput": 7844.67, "total_tokens": 5329152} +{"current_steps": 2300, "total_steps": 5305, "loss": 0.232, "lr": 3.4904232044752507e-06, "epoch": 2.167766258246937, "percentage": 43.36, "elapsed_time": "0:11:19", "remaining_time": "0:14:48", "throughput": 7856.77, "total_tokens": 5342016} +{"current_steps": 2305, "total_steps": 5305, "loss": 0.1157, "lr": 3.4828651242158764e-06, "epoch": 2.172478793590952, "percentage": 43.45, "elapsed_time": "0:11:20", "remaining_time": "0:14:45", "throughput": 7866.41, "total_tokens": 5352768} +{"current_steps": 2310, "total_steps": 5305, "loss": 0.1506, "lr": 3.4752964032991638e-06, "epoch": 2.177191328934967, "percentage": 43.54, "elapsed_time": "0:11:21", "remaining_time": "0:14:42", "throughput": 7876.76, "total_tokens": 5364160} +{"current_steps": 2315, "total_steps": 5305, "loss": 0.1442, "lr": 3.4677171236653133e-06, "epoch": 2.181903864278982, "percentage": 43.64, "elapsed_time": "0:11:21", "remaining_time": "0:14:40", "throughput": 7888.16, "total_tokens": 5376448} +{"current_steps": 2320, "total_steps": 5305, "loss": 0.0562, "lr": 3.460127367368836e-06, "epoch": 2.1866163996229973, "percentage": 43.73, "elapsed_time": "0:11:22", "remaining_time": "0:14:37", "throughput": 7897.01, "total_tokens": 5386560} +{"current_steps": 2325, "total_steps": 5305, "loss": 0.1956, "lr": 3.452527216577665e-06, "epoch": 2.191328934967012, "percentage": 43.83, "elapsed_time": "0:11:22", "remaining_time": "0:14:35", "throughput": 7908.79, "total_tokens": 5399296} +{"current_steps": 2330, "total_steps": 5305, "loss": 0.1061, "lr": 3.444916753572267e-06, "epoch": 2.196041470311027, "percentage": 43.92, "elapsed_time": "0:11:23", "remaining_time": "0:14:32", "throughput": 7919.33, "total_tokens": 5410944} +{"current_steps": 2335, "total_steps": 5305, "loss": 0.0012, "lr": 3.4372960607447493e-06, "epoch": 2.2007540056550425, "percentage": 44.02, "elapsed_time": "0:11:23", "remaining_time": "0:14:29", "throughput": 7930.56, "total_tokens": 5423168} +{"current_steps": 2340, "total_steps": 5305, "loss": 0.0111, "lr": 3.429665220597968e-06, "epoch": 2.2054665409990575, "percentage": 44.11, "elapsed_time": "0:11:24", "remaining_time": "0:14:27", "throughput": 7943.07, "total_tokens": 5436544} +{"current_steps": 2345, "total_steps": 5305, "loss": 0.0934, "lr": 3.4220243157446388e-06, "epoch": 2.2101790763430724, "percentage": 44.2, "elapsed_time": "0:11:25", "remaining_time": "0:14:24", "throughput": 7954.0, "total_tokens": 5448512} +{"current_steps": 2350, "total_steps": 5305, "loss": 0.0139, "lr": 3.4143734289064363e-06, "epoch": 2.214891611687088, "percentage": 44.3, "elapsed_time": "0:11:25", "remaining_time": "0:14:22", "throughput": 7964.37, "total_tokens": 5460032} +{"current_steps": 2355, "total_steps": 5305, "loss": 0.0004, "lr": 3.4067126429131035e-06, "epoch": 2.2196041470311028, "percentage": 44.39, "elapsed_time": "0:11:26", "remaining_time": "0:14:19", "throughput": 7976.18, "total_tokens": 5472896} +{"current_steps": 2360, "total_steps": 5305, "loss": 0.0005, "lr": 3.3990420407015534e-06, "epoch": 2.2243166823751177, "percentage": 44.49, "elapsed_time": "0:11:26", "remaining_time": "0:14:16", "throughput": 7984.8, "total_tokens": 5482944} +{"current_steps": 2365, "total_steps": 5305, "loss": 0.0536, "lr": 3.3913617053149694e-06, "epoch": 2.229029217719133, "percentage": 44.58, "elapsed_time": "0:11:27", "remaining_time": "0:14:14", "throughput": 7994.92, "total_tokens": 5494336} +{"current_steps": 2370, "total_steps": 5305, "loss": 0.0001, "lr": 3.3836717199019087e-06, "epoch": 2.233741753063148, "percentage": 44.67, "elapsed_time": "0:11:27", "remaining_time": "0:14:11", "throughput": 8005.03, "total_tokens": 5505728} +{"current_steps": 2375, "total_steps": 5305, "loss": 0.0861, "lr": 3.3759721677154022e-06, "epoch": 2.238454288407163, "percentage": 44.77, "elapsed_time": "0:11:28", "remaining_time": "0:14:09", "throughput": 8013.21, "total_tokens": 5515328} +{"current_steps": 2380, "total_steps": 5305, "loss": 0.0002, "lr": 3.3682631321120507e-06, "epoch": 2.243166823751178, "percentage": 44.86, "elapsed_time": "0:11:28", "remaining_time": "0:14:06", "throughput": 8022.23, "total_tokens": 5525760} +{"current_steps": 2385, "total_steps": 5305, "loss": 0.168, "lr": 3.3605446965511256e-06, "epoch": 2.2478793590951933, "percentage": 44.96, "elapsed_time": "0:11:29", "remaining_time": "0:14:03", "throughput": 8032.52, "total_tokens": 5537280} +{"current_steps": 2390, "total_steps": 5305, "loss": 0.0898, "lr": 3.3528169445936616e-06, "epoch": 2.2525918944392083, "percentage": 45.05, "elapsed_time": "0:11:29", "remaining_time": "0:14:01", "throughput": 8042.92, "total_tokens": 5548928} +{"current_steps": 2394, "total_steps": 5305, "eval_loss": 0.46465176343917847, "epoch": 2.2563619227144205, "percentage": 45.13, "elapsed_time": "0:11:33", "remaining_time": "0:14:02", "throughput": 8019.56, "total_tokens": 5558144} +{"current_steps": 2395, "total_steps": 5305, "loss": 0.1847, "lr": 3.3450799599015567e-06, "epoch": 2.257304429783223, "percentage": 45.15, "elapsed_time": "0:12:25", "remaining_time": "0:15:06", "throughput": 7453.31, "total_tokens": 5559872} +{"current_steps": 2400, "total_steps": 5305, "loss": 0.0234, "lr": 3.3373338262366617e-06, "epoch": 2.2620169651272386, "percentage": 45.24, "elapsed_time": "0:12:26", "remaining_time": "0:15:03", "throughput": 7463.04, "total_tokens": 5571264} +{"current_steps": 2405, "total_steps": 5305, "loss": 0.0881, "lr": 3.329578627459878e-06, "epoch": 2.2667295004712535, "percentage": 45.33, "elapsed_time": "0:12:27", "remaining_time": "0:15:00", "throughput": 7471.35, "total_tokens": 5581312} +{"current_steps": 2410, "total_steps": 5305, "loss": 0.0004, "lr": 3.3218144475302444e-06, "epoch": 2.2714420358152685, "percentage": 45.43, "elapsed_time": "0:12:27", "remaining_time": "0:14:58", "throughput": 7480.7, "total_tokens": 5592384} +{"current_steps": 2415, "total_steps": 5305, "loss": 0.1036, "lr": 3.314041370504034e-06, "epoch": 2.276154571159284, "percentage": 45.52, "elapsed_time": "0:12:28", "remaining_time": "0:14:55", "throughput": 7490.13, "total_tokens": 5603456} +{"current_steps": 2420, "total_steps": 5305, "loss": 0.0579, "lr": 3.30625948053384e-06, "epoch": 2.280867106503299, "percentage": 45.62, "elapsed_time": "0:12:28", "remaining_time": "0:14:52", "throughput": 7499.39, "total_tokens": 5614464} +{"current_steps": 2425, "total_steps": 5305, "loss": 0.089, "lr": 3.2984688618676665e-06, "epoch": 2.2855796418473138, "percentage": 45.71, "elapsed_time": "0:12:29", "remaining_time": "0:14:49", "throughput": 7509.32, "total_tokens": 5626112} +{"current_steps": 2430, "total_steps": 5305, "loss": 0.0886, "lr": 3.2906695988480144e-06, "epoch": 2.290292177191329, "percentage": 45.81, "elapsed_time": "0:12:29", "remaining_time": "0:14:47", "throughput": 7518.75, "total_tokens": 5637248} +{"current_steps": 2435, "total_steps": 5305, "loss": 0.0709, "lr": 3.2828617759109715e-06, "epoch": 2.295004712535344, "percentage": 45.9, "elapsed_time": "0:12:30", "remaining_time": "0:14:44", "throughput": 7527.22, "total_tokens": 5647552} +{"current_steps": 2440, "total_steps": 5305, "loss": 0.0006, "lr": 3.2750454775852956e-06, "epoch": 2.299717247879359, "percentage": 45.99, "elapsed_time": "0:12:30", "remaining_time": "0:14:41", "throughput": 7540.11, "total_tokens": 5662080} +{"current_steps": 2445, "total_steps": 5305, "loss": 0.0005, "lr": 3.2672207884915017e-06, "epoch": 2.304429783223374, "percentage": 46.09, "elapsed_time": "0:12:31", "remaining_time": "0:14:39", "throughput": 7550.19, "total_tokens": 5673856} +{"current_steps": 2450, "total_steps": 5305, "loss": 0.107, "lr": 3.2593877933409436e-06, "epoch": 2.3091423185673894, "percentage": 46.18, "elapsed_time": "0:12:32", "remaining_time": "0:14:36", "throughput": 7558.37, "total_tokens": 5683904} +{"current_steps": 2455, "total_steps": 5305, "loss": 0.0003, "lr": 3.251546576934897e-06, "epoch": 2.3138548539114043, "percentage": 46.28, "elapsed_time": "0:12:32", "remaining_time": "0:14:33", "throughput": 7567.03, "total_tokens": 5694400} +{"current_steps": 2460, "total_steps": 5305, "loss": 0.1635, "lr": 3.2436972241636443e-06, "epoch": 2.3185673892554193, "percentage": 46.37, "elapsed_time": "0:12:33", "remaining_time": "0:14:30", "throughput": 7576.42, "total_tokens": 5705664} +{"current_steps": 2465, "total_steps": 5305, "loss": 0.0001, "lr": 3.2358398200055515e-06, "epoch": 2.3232799245994347, "percentage": 46.47, "elapsed_time": "0:12:33", "remaining_time": "0:14:28", "throughput": 7587.87, "total_tokens": 5718848} +{"current_steps": 2470, "total_steps": 5305, "loss": 0.0504, "lr": 3.227974449526152e-06, "epoch": 2.3279924599434496, "percentage": 46.56, "elapsed_time": "0:12:34", "remaining_time": "0:14:25", "throughput": 7599.34, "total_tokens": 5732096} +{"current_steps": 2475, "total_steps": 5305, "loss": 0.09, "lr": 3.2201011978772224e-06, "epoch": 2.3327049952874646, "percentage": 46.65, "elapsed_time": "0:12:34", "remaining_time": "0:14:23", "throughput": 7607.44, "total_tokens": 5742144} +{"current_steps": 2480, "total_steps": 5305, "loss": 0.0647, "lr": 3.2122201502958635e-06, "epoch": 2.3374175306314795, "percentage": 46.75, "elapsed_time": "0:12:35", "remaining_time": "0:14:20", "throughput": 7617.6, "total_tokens": 5754176} +{"current_steps": 2485, "total_steps": 5305, "loss": 0.0155, "lr": 3.2043313921035747e-06, "epoch": 2.342130065975495, "percentage": 46.84, "elapsed_time": "0:12:35", "remaining_time": "0:14:17", "throughput": 7628.61, "total_tokens": 5767104} +{"current_steps": 2490, "total_steps": 5305, "loss": 0.3015, "lr": 3.1964350087053323e-06, "epoch": 2.34684260131951, "percentage": 46.94, "elapsed_time": "0:12:36", "remaining_time": "0:14:15", "throughput": 7639.16, "total_tokens": 5779520} +{"current_steps": 2495, "total_steps": 5305, "loss": 0.0284, "lr": 3.1885310855886655e-06, "epoch": 2.3515551366635252, "percentage": 47.03, "elapsed_time": "0:12:37", "remaining_time": "0:14:12", "throughput": 7650.39, "total_tokens": 5792640} +{"current_steps": 2500, "total_steps": 5305, "loss": 0.0001, "lr": 3.1806197083227276e-06, "epoch": 2.35626767200754, "percentage": 47.13, "elapsed_time": "0:12:37", "remaining_time": "0:14:10", "throughput": 7661.54, "total_tokens": 5805696} +{"current_steps": 2505, "total_steps": 5305, "loss": 0.168, "lr": 3.172700962557373e-06, "epoch": 2.360980207351555, "percentage": 47.22, "elapsed_time": "0:12:38", "remaining_time": "0:14:07", "throughput": 7673.81, "total_tokens": 5819840} +{"current_steps": 2510, "total_steps": 5305, "loss": 0.1209, "lr": 3.1647749340222288e-06, "epoch": 2.36569274269557, "percentage": 47.31, "elapsed_time": "0:12:38", "remaining_time": "0:14:05", "throughput": 7681.99, "total_tokens": 5830016} +{"current_steps": 2515, "total_steps": 5305, "loss": 0.0744, "lr": 3.1568417085257653e-06, "epoch": 2.3704052780395855, "percentage": 47.41, "elapsed_time": "0:12:39", "remaining_time": "0:14:02", "throughput": 7689.91, "total_tokens": 5840000} +{"current_steps": 2520, "total_steps": 5305, "loss": 0.0681, "lr": 3.1489013719543703e-06, "epoch": 2.3751178133836004, "percentage": 47.5, "elapsed_time": "0:12:39", "remaining_time": "0:13:59", "throughput": 7697.8, "total_tokens": 5849920} +{"current_steps": 2525, "total_steps": 5305, "loss": 0.2567, "lr": 3.140954010271416e-06, "epoch": 2.3798303487276153, "percentage": 47.6, "elapsed_time": "0:12:40", "remaining_time": "0:13:57", "throughput": 7706.35, "total_tokens": 5860480} +{"current_steps": 2530, "total_steps": 5305, "loss": 0.0055, "lr": 3.132999709516329e-06, "epoch": 2.3845428840716307, "percentage": 47.69, "elapsed_time": "0:12:41", "remaining_time": "0:13:54", "throughput": 7717.24, "total_tokens": 5873408} +{"current_steps": 2535, "total_steps": 5305, "loss": 0.0887, "lr": 3.1250385558036606e-06, "epoch": 2.3892554194156457, "percentage": 47.79, "elapsed_time": "0:12:41", "remaining_time": "0:13:52", "throughput": 7726.37, "total_tokens": 5884608} +{"current_steps": 2540, "total_steps": 5305, "loss": 0.2362, "lr": 3.1170706353221525e-06, "epoch": 2.3939679547596606, "percentage": 47.88, "elapsed_time": "0:12:42", "remaining_time": "0:13:49", "throughput": 7735.81, "total_tokens": 5896064} +{"current_steps": 2545, "total_steps": 5305, "loss": 0.0014, "lr": 3.109096034333805e-06, "epoch": 2.3986804901036756, "percentage": 47.97, "elapsed_time": "0:12:42", "remaining_time": "0:13:47", "throughput": 7745.46, "total_tokens": 5907776} +{"current_steps": 2550, "total_steps": 5305, "loss": 0.0292, "lr": 3.1011148391729434e-06, "epoch": 2.403393025447691, "percentage": 48.07, "elapsed_time": "0:12:43", "remaining_time": "0:13:44", "throughput": 7755.38, "total_tokens": 5919744} +{"current_steps": 2555, "total_steps": 5305, "loss": 0.18, "lr": 3.0931271362452803e-06, "epoch": 2.408105560791706, "percentage": 48.16, "elapsed_time": "0:12:43", "remaining_time": "0:13:42", "throughput": 7765.8, "total_tokens": 5932224} +{"current_steps": 2560, "total_steps": 5305, "loss": 0.001, "lr": 3.085133012026985e-06, "epoch": 2.412818096135721, "percentage": 48.26, "elapsed_time": "0:12:44", "remaining_time": "0:13:39", "throughput": 7774.88, "total_tokens": 5943424} +{"current_steps": 2565, "total_steps": 5305, "loss": 0.1243, "lr": 3.0771325530637434e-06, "epoch": 2.4175306314797362, "percentage": 48.35, "elapsed_time": "0:12:45", "remaining_time": "0:13:37", "throughput": 7785.29, "total_tokens": 5955904} +{"current_steps": 2570, "total_steps": 5305, "loss": 0.0789, "lr": 3.0691258459698227e-06, "epoch": 2.422243166823751, "percentage": 48.44, "elapsed_time": "0:12:45", "remaining_time": "0:13:34", "throughput": 7794.63, "total_tokens": 5967360} +{"current_steps": 2575, "total_steps": 5305, "loss": 0.1948, "lr": 3.0611129774271318e-06, "epoch": 2.426955702167766, "percentage": 48.54, "elapsed_time": "0:12:46", "remaining_time": "0:13:32", "throughput": 7805.78, "total_tokens": 5980608} +{"current_steps": 2580, "total_steps": 5305, "loss": 0.0003, "lr": 3.0530940341842883e-06, "epoch": 2.4316682375117815, "percentage": 48.63, "elapsed_time": "0:12:46", "remaining_time": "0:13:29", "throughput": 7816.51, "total_tokens": 5993472} +{"current_steps": 2585, "total_steps": 5305, "loss": 0.0005, "lr": 3.045069103055672e-06, "epoch": 2.4363807728557965, "percentage": 48.73, "elapsed_time": "0:12:47", "remaining_time": "0:13:27", "throughput": 7824.35, "total_tokens": 6003520} +{"current_steps": 2590, "total_steps": 5305, "loss": 0.0118, "lr": 3.037038270920489e-06, "epoch": 2.4410933081998114, "percentage": 48.82, "elapsed_time": "0:12:47", "remaining_time": "0:13:24", "throughput": 7833.37, "total_tokens": 6014720} +{"current_steps": 2595, "total_steps": 5305, "loss": 0.0956, "lr": 3.0290016247218323e-06, "epoch": 2.445805843543827, "percentage": 48.92, "elapsed_time": "0:12:48", "remaining_time": "0:13:22", "throughput": 7848.64, "total_tokens": 6032192} +{"current_steps": 2600, "total_steps": 5305, "loss": 0.2412, "lr": 3.0209592514657365e-06, "epoch": 2.4505183788878417, "percentage": 49.01, "elapsed_time": "0:12:49", "remaining_time": "0:13:20", "throughput": 7857.57, "total_tokens": 6043328} +{"current_steps": 2605, "total_steps": 5305, "loss": 0.0061, "lr": 3.012911238220241e-06, "epoch": 2.4552309142318567, "percentage": 49.1, "elapsed_time": "0:12:49", "remaining_time": "0:13:17", "throughput": 7867.39, "total_tokens": 6055424} +{"current_steps": 2610, "total_steps": 5305, "loss": 0.2284, "lr": 3.004857672114443e-06, "epoch": 2.4599434495758716, "percentage": 49.2, "elapsed_time": "0:12:50", "remaining_time": "0:13:15", "throughput": 7875.23, "total_tokens": 6065472} +{"current_steps": 2615, "total_steps": 5305, "loss": 0.0007, "lr": 2.996798640337556e-06, "epoch": 2.464655984919887, "percentage": 49.29, "elapsed_time": "0:12:50", "remaining_time": "0:13:12", "throughput": 7885.5, "total_tokens": 6078016} +{"current_steps": 2620, "total_steps": 5305, "loss": 0.0974, "lr": 2.9887342301379653e-06, "epoch": 2.469368520263902, "percentage": 49.39, "elapsed_time": "0:12:51", "remaining_time": "0:13:10", "throughput": 7894.69, "total_tokens": 6089472} +{"current_steps": 2625, "total_steps": 5305, "loss": 0.1484, "lr": 2.9806645288222854e-06, "epoch": 2.474081055607917, "percentage": 49.48, "elapsed_time": "0:12:51", "remaining_time": "0:13:08", "throughput": 7903.94, "total_tokens": 6100992} +{"current_steps": 2630, "total_steps": 5305, "loss": 0.0821, "lr": 2.9725896237544115e-06, "epoch": 2.4787935909519323, "percentage": 49.58, "elapsed_time": "0:12:52", "remaining_time": "0:13:05", "throughput": 7913.43, "total_tokens": 6112768} +{"current_steps": 2635, "total_steps": 5305, "loss": 0.0017, "lr": 2.9645096023545774e-06, "epoch": 2.4835061262959472, "percentage": 49.67, "elapsed_time": "0:12:52", "remaining_time": "0:13:03", "throughput": 7921.11, "total_tokens": 6122752} +{"current_steps": 2640, "total_steps": 5305, "loss": 0.05, "lr": 2.956424552098405e-06, "epoch": 2.488218661639962, "percentage": 49.76, "elapsed_time": "0:12:53", "remaining_time": "0:13:00", "throughput": 7932.31, "total_tokens": 6136256} +{"current_steps": 2645, "total_steps": 5305, "loss": 0.0714, "lr": 2.94833456051596e-06, "epoch": 2.492931196983977, "percentage": 49.86, "elapsed_time": "0:12:54", "remaining_time": "0:12:58", "throughput": 7940.99, "total_tokens": 6147264} +{"current_steps": 2650, "total_steps": 5305, "loss": 0.0012, "lr": 2.9402397151908056e-06, "epoch": 2.4976437323279925, "percentage": 49.95, "elapsed_time": "0:12:54", "remaining_time": "0:12:56", "throughput": 7952.49, "total_tokens": 6161088} +{"current_steps": 2655, "total_steps": 5305, "loss": 0.0567, "lr": 2.93214010375905e-06, "epoch": 2.5023562676720075, "percentage": 50.05, "elapsed_time": "0:12:55", "remaining_time": "0:12:53", "throughput": 7962.62, "total_tokens": 6173568} +{"current_steps": 2660, "total_steps": 5305, "loss": 0.0692, "lr": 2.924035813908402e-06, "epoch": 2.507068803016023, "percentage": 50.14, "elapsed_time": "0:12:55", "remaining_time": "0:12:51", "throughput": 7970.61, "total_tokens": 6183872} +{"current_steps": 2660, "total_steps": 5305, "eval_loss": 0.40977799892425537, "epoch": 2.507068803016023, "percentage": 50.14, "elapsed_time": "0:12:58", "remaining_time": "0:12:54", "throughput": 7942.58, "total_tokens": 6183872} +{"current_steps": 2665, "total_steps": 5305, "loss": 0.0693, "lr": 2.9159269333772173e-06, "epoch": 2.511781338360038, "percentage": 50.24, "elapsed_time": "0:13:40", "remaining_time": "0:13:32", "throughput": 7554.65, "total_tokens": 6195648} +{"current_steps": 2670, "total_steps": 5305, "loss": 0.0003, "lr": 2.9078135499535535e-06, "epoch": 2.5164938737040528, "percentage": 50.33, "elapsed_time": "0:13:40", "remaining_time": "0:13:29", "throughput": 7562.14, "total_tokens": 6205696} +{"current_steps": 2675, "total_steps": 5305, "loss": 0.0993, "lr": 2.8996957514742164e-06, "epoch": 2.5212064090480677, "percentage": 50.42, "elapsed_time": "0:13:41", "remaining_time": "0:13:27", "throughput": 7573.41, "total_tokens": 6219648} +{"current_steps": 2680, "total_steps": 5305, "loss": 0.0016, "lr": 2.891573625823808e-06, "epoch": 2.525918944392083, "percentage": 50.52, "elapsed_time": "0:13:41", "remaining_time": "0:13:25", "throughput": 7584.65, "total_tokens": 6233664} +{"current_steps": 2685, "total_steps": 5305, "loss": 0.0002, "lr": 2.883447260933781e-06, "epoch": 2.530631479736098, "percentage": 50.61, "elapsed_time": "0:13:42", "remaining_time": "0:13:22", "throughput": 7594.69, "total_tokens": 6246400} +{"current_steps": 2690, "total_steps": 5305, "loss": 0.0776, "lr": 2.875316744781479e-06, "epoch": 2.535344015080113, "percentage": 50.71, "elapsed_time": "0:13:42", "remaining_time": "0:13:20", "throughput": 7602.21, "total_tokens": 6256576} +{"current_steps": 2695, "total_steps": 5305, "loss": 0.0909, "lr": 2.8671821653891903e-06, "epoch": 2.5400565504241284, "percentage": 50.8, "elapsed_time": "0:13:43", "remaining_time": "0:13:17", "throughput": 7609.28, "total_tokens": 6266240} +{"current_steps": 2700, "total_steps": 5305, "loss": 0.1384, "lr": 2.85904361082319e-06, "epoch": 2.5447690857681433, "percentage": 50.9, "elapsed_time": "0:13:44", "remaining_time": "0:13:15", "throughput": 7620.12, "total_tokens": 6279872} +{"current_steps": 2705, "total_steps": 5305, "loss": 0.0001, "lr": 2.8509011691927923e-06, "epoch": 2.5494816211121583, "percentage": 50.99, "elapsed_time": "0:13:44", "remaining_time": "0:13:12", "throughput": 7627.66, "total_tokens": 6290048} +{"current_steps": 2710, "total_steps": 5305, "loss": 0.0368, "lr": 2.8427549286493906e-06, "epoch": 2.554194156456173, "percentage": 51.08, "elapsed_time": "0:13:45", "remaining_time": "0:13:10", "throughput": 7636.02, "total_tokens": 6301120} +{"current_steps": 2715, "total_steps": 5305, "loss": 0.1002, "lr": 2.8346049773855077e-06, "epoch": 2.5589066918001886, "percentage": 51.18, "elapsed_time": "0:13:45", "remaining_time": "0:13:07", "throughput": 7644.74, "total_tokens": 6312512} +{"current_steps": 2720, "total_steps": 5305, "loss": 0.0002, "lr": 2.8264514036338385e-06, "epoch": 2.5636192271442035, "percentage": 51.27, "elapsed_time": "0:13:46", "remaining_time": "0:13:05", "throughput": 7653.23, "total_tokens": 6323776} +{"current_steps": 2725, "total_steps": 5305, "loss": 0.0003, "lr": 2.818294295666295e-06, "epoch": 2.568331762488219, "percentage": 51.37, "elapsed_time": "0:13:46", "remaining_time": "0:13:02", "throughput": 7661.0, "total_tokens": 6334208} +{"current_steps": 2730, "total_steps": 5305, "loss": 0.0952, "lr": 2.8101337417930523e-06, "epoch": 2.573044297832234, "percentage": 51.46, "elapsed_time": "0:13:47", "remaining_time": "0:13:00", "throughput": 7669.29, "total_tokens": 6345216} +{"current_steps": 2735, "total_steps": 5305, "loss": 0.2239, "lr": 2.8019698303615912e-06, "epoch": 2.577756833176249, "percentage": 51.56, "elapsed_time": "0:13:47", "remaining_time": "0:12:57", "throughput": 7675.69, "total_tokens": 6354304} +{"current_steps": 2740, "total_steps": 5305, "loss": 0.0628, "lr": 2.7938026497557414e-06, "epoch": 2.5824693685202638, "percentage": 51.65, "elapsed_time": "0:13:48", "remaining_time": "0:12:55", "throughput": 7686.64, "total_tokens": 6368192} +{"current_steps": 2745, "total_steps": 5305, "loss": 0.0454, "lr": 2.7856322883947253e-06, "epoch": 2.5871819038642787, "percentage": 51.74, "elapsed_time": "0:13:49", "remaining_time": "0:12:53", "throughput": 7697.92, "total_tokens": 6382400} +{"current_steps": 2750, "total_steps": 5305, "loss": 0.0836, "lr": 2.7774588347322016e-06, "epoch": 2.591894439208294, "percentage": 51.84, "elapsed_time": "0:13:49", "remaining_time": "0:12:50", "throughput": 7708.21, "total_tokens": 6395584} +{"current_steps": 2755, "total_steps": 5305, "loss": 0.1468, "lr": 2.7692823772553057e-06, "epoch": 2.596606974552309, "percentage": 51.93, "elapsed_time": "0:13:50", "remaining_time": "0:12:48", "throughput": 7716.54, "total_tokens": 6406720} +{"current_steps": 2760, "total_steps": 5305, "loss": 0.1705, "lr": 2.7611030044836927e-06, "epoch": 2.6013195098963244, "percentage": 52.03, "elapsed_time": "0:13:50", "remaining_time": "0:12:46", "throughput": 7725.16, "total_tokens": 6418112} +{"current_steps": 2765, "total_steps": 5305, "loss": 0.0602, "lr": 2.752920804968581e-06, "epoch": 2.6060320452403394, "percentage": 52.12, "elapsed_time": "0:13:51", "remaining_time": "0:12:43", "throughput": 7735.26, "total_tokens": 6431104} +{"current_steps": 2770, "total_steps": 5305, "loss": 0.0038, "lr": 2.744735867291789e-06, "epoch": 2.6107445805843543, "percentage": 52.21, "elapsed_time": "0:13:51", "remaining_time": "0:12:41", "throughput": 7743.16, "total_tokens": 6441792} +{"current_steps": 2775, "total_steps": 5305, "loss": 0.167, "lr": 2.736548280064781e-06, "epoch": 2.6154571159283693, "percentage": 52.31, "elapsed_time": "0:13:52", "remaining_time": "0:12:38", "throughput": 7751.17, "total_tokens": 6452672} +{"current_steps": 2780, "total_steps": 5305, "loss": 0.1083, "lr": 2.728358131927704e-06, "epoch": 2.6201696512723847, "percentage": 52.4, "elapsed_time": "0:13:53", "remaining_time": "0:12:36", "throughput": 7761.12, "total_tokens": 6465600} +{"current_steps": 2785, "total_steps": 5305, "loss": 0.0731, "lr": 2.720165511548433e-06, "epoch": 2.6248821866163996, "percentage": 52.5, "elapsed_time": "0:13:53", "remaining_time": "0:12:34", "throughput": 7769.94, "total_tokens": 6477312} +{"current_steps": 2790, "total_steps": 5305, "loss": 0.179, "lr": 2.711970507621603e-06, "epoch": 2.6295947219604145, "percentage": 52.59, "elapsed_time": "0:13:54", "remaining_time": "0:12:31", "throughput": 7776.52, "total_tokens": 6486592} +{"current_steps": 2795, "total_steps": 5305, "loss": 0.0011, "lr": 2.7037732088676583e-06, "epoch": 2.63430725730443, "percentage": 52.69, "elapsed_time": "0:13:54", "remaining_time": "0:12:29", "throughput": 7784.15, "total_tokens": 6497088} +{"current_steps": 2800, "total_steps": 5305, "loss": 0.0035, "lr": 2.6955737040318853e-06, "epoch": 2.639019792648445, "percentage": 52.78, "elapsed_time": "0:13:55", "remaining_time": "0:12:27", "throughput": 7790.27, "total_tokens": 6505984} +{"current_steps": 2805, "total_steps": 5305, "loss": 0.0009, "lr": 2.687372081883454e-06, "epoch": 2.64373232799246, "percentage": 52.87, "elapsed_time": "0:13:55", "remaining_time": "0:12:24", "throughput": 7798.31, "total_tokens": 6516928} +{"current_steps": 2810, "total_steps": 5305, "loss": 0.0096, "lr": 2.6791684312144565e-06, "epoch": 2.6484448633364748, "percentage": 52.97, "elapsed_time": "0:13:56", "remaining_time": "0:12:22", "throughput": 7805.94, "total_tokens": 6527424} +{"current_steps": 2815, "total_steps": 5305, "loss": 0.0955, "lr": 2.670962840838946e-06, "epoch": 2.65315739868049, "percentage": 53.06, "elapsed_time": "0:13:56", "remaining_time": "0:12:20", "throughput": 7813.99, "total_tokens": 6538432} +{"current_steps": 2820, "total_steps": 5305, "loss": 0.0341, "lr": 2.6627553995919763e-06, "epoch": 2.657869934024505, "percentage": 53.16, "elapsed_time": "0:13:57", "remaining_time": "0:12:17", "throughput": 7824.08, "total_tokens": 6551552} +{"current_steps": 2825, "total_steps": 5305, "loss": 0.0005, "lr": 2.6545461963286374e-06, "epoch": 2.6625824693685205, "percentage": 53.25, "elapsed_time": "0:13:58", "remaining_time": "0:12:15", "throughput": 7835.49, "total_tokens": 6566208} +{"current_steps": 2830, "total_steps": 5305, "loss": 0.1887, "lr": 2.646335319923097e-06, "epoch": 2.6672950047125354, "percentage": 53.35, "elapsed_time": "0:13:58", "remaining_time": "0:12:13", "throughput": 7843.77, "total_tokens": 6577472} +{"current_steps": 2835, "total_steps": 5305, "loss": 0.1243, "lr": 2.6381228592676343e-06, "epoch": 2.6720075400565504, "percentage": 53.44, "elapsed_time": "0:13:59", "remaining_time": "0:12:11", "throughput": 7851.92, "total_tokens": 6588608} +{"current_steps": 2840, "total_steps": 5305, "loss": 0.1048, "lr": 2.629908903271683e-06, "epoch": 2.6767200754005653, "percentage": 53.53, "elapsed_time": "0:13:59", "remaining_time": "0:12:08", "throughput": 7861.37, "total_tokens": 6601088} +{"current_steps": 2845, "total_steps": 5305, "loss": 0.0005, "lr": 2.6216935408608617e-06, "epoch": 2.6814326107445807, "percentage": 53.63, "elapsed_time": "0:14:00", "remaining_time": "0:12:06", "throughput": 7868.76, "total_tokens": 6611392} +{"current_steps": 2850, "total_steps": 5305, "loss": 0.001, "lr": 2.6134768609760187e-06, "epoch": 2.6861451460885957, "percentage": 53.72, "elapsed_time": "0:14:00", "remaining_time": "0:12:04", "throughput": 7877.0, "total_tokens": 6622656} +{"current_steps": 2855, "total_steps": 5305, "loss": 0.0916, "lr": 2.605258952572263e-06, "epoch": 2.6908576814326106, "percentage": 53.82, "elapsed_time": "0:14:01", "remaining_time": "0:12:01", "throughput": 7886.48, "total_tokens": 6635264} +{"current_steps": 2860, "total_steps": 5305, "loss": 0.0028, "lr": 2.5970399046180043e-06, "epoch": 2.695570216776626, "percentage": 53.91, "elapsed_time": "0:14:01", "remaining_time": "0:11:59", "throughput": 7895.73, "total_tokens": 6647680} +{"current_steps": 2865, "total_steps": 5305, "loss": 0.0001, "lr": 2.588819806093991e-06, "epoch": 2.700282752120641, "percentage": 54.01, "elapsed_time": "0:14:02", "remaining_time": "0:11:57", "throughput": 7906.79, "total_tokens": 6662016} +{"current_steps": 2870, "total_steps": 5305, "loss": 0.1805, "lr": 2.580598745992342e-06, "epoch": 2.704995287464656, "percentage": 54.1, "elapsed_time": "0:14:03", "remaining_time": "0:11:55", "throughput": 7914.71, "total_tokens": 6673024} +{"current_steps": 2875, "total_steps": 5305, "loss": 0.0001, "lr": 2.5723768133155894e-06, "epoch": 2.709707822808671, "percentage": 54.19, "elapsed_time": "0:14:03", "remaining_time": "0:11:53", "throughput": 7923.04, "total_tokens": 6684416} +{"current_steps": 2880, "total_steps": 5305, "loss": 0.0783, "lr": 2.5641540970757105e-06, "epoch": 2.7144203581526862, "percentage": 54.29, "elapsed_time": "0:14:04", "remaining_time": "0:11:50", "throughput": 7931.95, "total_tokens": 6696448} +{"current_steps": 2885, "total_steps": 5305, "loss": 0.0002, "lr": 2.555930686293165e-06, "epoch": 2.719132893496701, "percentage": 54.38, "elapsed_time": "0:14:04", "remaining_time": "0:11:48", "throughput": 7942.64, "total_tokens": 6710528} +{"current_steps": 2890, "total_steps": 5305, "loss": 0.0004, "lr": 2.547706669995933e-06, "epoch": 2.7238454288407166, "percentage": 54.48, "elapsed_time": "0:14:05", "remaining_time": "0:11:46", "throughput": 7951.15, "total_tokens": 6722176} +{"current_steps": 2895, "total_steps": 5305, "loss": 0.1775, "lr": 2.53948213721855e-06, "epoch": 2.7285579641847315, "percentage": 54.57, "elapsed_time": "0:14:05", "remaining_time": "0:11:44", "throughput": 7958.31, "total_tokens": 6732416} +{"current_steps": 2900, "total_steps": 5305, "loss": 0.1137, "lr": 2.531257177001141e-06, "epoch": 2.7332704995287465, "percentage": 54.67, "elapsed_time": "0:14:06", "remaining_time": "0:11:42", "throughput": 7968.39, "total_tokens": 6745728} +{"current_steps": 2905, "total_steps": 5305, "loss": 0.0956, "lr": 2.523031878388463e-06, "epoch": 2.7379830348727614, "percentage": 54.76, "elapsed_time": "0:14:07", "remaining_time": "0:11:39", "throughput": 7975.66, "total_tokens": 6756096} +{"current_steps": 2910, "total_steps": 5305, "loss": 0.063, "lr": 2.5148063304289306e-06, "epoch": 2.742695570216777, "percentage": 54.85, "elapsed_time": "0:14:07", "remaining_time": "0:11:37", "throughput": 7983.39, "total_tokens": 6766976} +{"current_steps": 2915, "total_steps": 5305, "loss": 0.1039, "lr": 2.5065806221736617e-06, "epoch": 2.7474081055607917, "percentage": 54.95, "elapsed_time": "0:14:08", "remaining_time": "0:11:35", "throughput": 7991.06, "total_tokens": 6777792} +{"current_steps": 2920, "total_steps": 5305, "loss": 0.0003, "lr": 2.4983548426755104e-06, "epoch": 2.7521206409048067, "percentage": 55.04, "elapsed_time": "0:14:08", "remaining_time": "0:11:33", "throughput": 7999.6, "total_tokens": 6789568} +{"current_steps": 2925, "total_steps": 5305, "loss": 0.227, "lr": 2.4901290809880984e-06, "epoch": 2.756833176248822, "percentage": 55.14, "elapsed_time": "0:14:09", "remaining_time": "0:11:31", "throughput": 8010.05, "total_tokens": 6803392} +{"current_steps": 2926, "total_steps": 5305, "eval_loss": 0.43027257919311523, "epoch": 2.757775683317625, "percentage": 55.16, "elapsed_time": "0:14:12", "remaining_time": "0:11:32", "throughput": 7986.35, "total_tokens": 6806208} +{"current_steps": 2930, "total_steps": 5305, "loss": 0.0645, "lr": 2.4819034261648574e-06, "epoch": 2.761545711592837, "percentage": 55.23, "elapsed_time": "0:16:08", "remaining_time": "0:13:04", "throughput": 7044.22, "total_tokens": 6821760} +{"current_steps": 2935, "total_steps": 5305, "loss": 0.2084, "lr": 2.4736779672580625e-06, "epoch": 2.766258246936852, "percentage": 55.33, "elapsed_time": "0:16:09", "remaining_time": "0:13:02", "throughput": 7053.22, "total_tokens": 6834688} +{"current_steps": 2940, "total_steps": 5305, "loss": 0.0731, "lr": 2.465452793317865e-06, "epoch": 2.770970782280867, "percentage": 55.42, "elapsed_time": "0:16:09", "remaining_time": "0:12:59", "throughput": 7061.52, "total_tokens": 6846784} +{"current_steps": 2945, "total_steps": 5305, "loss": 0.0866, "lr": 2.457227993391333e-06, "epoch": 2.7756833176248823, "percentage": 55.51, "elapsed_time": "0:16:10", "remaining_time": "0:12:57", "throughput": 7070.3, "total_tokens": 6859520} +{"current_steps": 2950, "total_steps": 5305, "loss": 0.0008, "lr": 2.4490036565214876e-06, "epoch": 2.7803958529688972, "percentage": 55.61, "elapsed_time": "0:16:10", "remaining_time": "0:12:54", "throughput": 7078.31, "total_tokens": 6871296} +{"current_steps": 2955, "total_steps": 5305, "loss": 0.0151, "lr": 2.440779871746331e-06, "epoch": 2.785108388312912, "percentage": 55.7, "elapsed_time": "0:16:11", "remaining_time": "0:12:52", "throughput": 7085.83, "total_tokens": 6882496} +{"current_steps": 2960, "total_steps": 5305, "loss": 0.0708, "lr": 2.4325567280978937e-06, "epoch": 2.7898209236569276, "percentage": 55.8, "elapsed_time": "0:16:11", "remaining_time": "0:12:49", "throughput": 7094.03, "total_tokens": 6894528} +{"current_steps": 2965, "total_steps": 5305, "loss": 0.1738, "lr": 2.424334314601263e-06, "epoch": 2.7945334590009425, "percentage": 55.89, "elapsed_time": "0:16:12", "remaining_time": "0:12:47", "throughput": 7100.92, "total_tokens": 6904960} +{"current_steps": 2970, "total_steps": 5305, "loss": 0.155, "lr": 2.416112720273623e-06, "epoch": 2.7992459943449575, "percentage": 55.98, "elapsed_time": "0:16:12", "remaining_time": "0:12:44", "throughput": 7107.45, "total_tokens": 6914944} +{"current_steps": 2975, "total_steps": 5305, "loss": 0.0006, "lr": 2.4078920341232856e-06, "epoch": 2.8039585296889724, "percentage": 56.08, "elapsed_time": "0:16:13", "remaining_time": "0:12:42", "throughput": 7114.88, "total_tokens": 6926080} +{"current_steps": 2980, "total_steps": 5305, "loss": 0.0028, "lr": 2.3996723451487344e-06, "epoch": 2.808671065032988, "percentage": 56.17, "elapsed_time": "0:16:13", "remaining_time": "0:12:39", "throughput": 7122.04, "total_tokens": 6936832} +{"current_steps": 2985, "total_steps": 5305, "loss": 0.2284, "lr": 2.391453742337657e-06, "epoch": 2.8133836003770027, "percentage": 56.27, "elapsed_time": "0:16:14", "remaining_time": "0:12:37", "throughput": 7129.64, "total_tokens": 6948160} +{"current_steps": 2990, "total_steps": 5305, "loss": 0.0003, "lr": 2.3832363146659806e-06, "epoch": 2.818096135721018, "percentage": 56.36, "elapsed_time": "0:16:15", "remaining_time": "0:12:34", "throughput": 7136.71, "total_tokens": 6958848} +{"current_steps": 2995, "total_steps": 5305, "loss": 0.1133, "lr": 2.37502015109691e-06, "epoch": 2.822808671065033, "percentage": 56.46, "elapsed_time": "0:16:15", "remaining_time": "0:12:32", "throughput": 7144.5, "total_tokens": 6970432} +{"current_steps": 3000, "total_steps": 5305, "loss": 0.0691, "lr": 2.3668053405799667e-06, "epoch": 2.827521206409048, "percentage": 56.55, "elapsed_time": "0:16:16", "remaining_time": "0:12:30", "throughput": 7151.03, "total_tokens": 6980480} +{"current_steps": 3005, "total_steps": 5305, "loss": 0.0368, "lr": 2.3585919720500214e-06, "epoch": 2.832233741753063, "percentage": 56.64, "elapsed_time": "0:16:16", "remaining_time": "0:12:27", "throughput": 7156.87, "total_tokens": 6989760} +{"current_steps": 3010, "total_steps": 5305, "loss": 0.093, "lr": 2.3503801344263347e-06, "epoch": 2.8369462770970784, "percentage": 56.74, "elapsed_time": "0:16:17", "remaining_time": "0:12:25", "throughput": 7162.88, "total_tokens": 6999232} +{"current_steps": 3015, "total_steps": 5305, "loss": 0.2148, "lr": 2.3421699166115946e-06, "epoch": 2.8416588124410933, "percentage": 56.83, "elapsed_time": "0:16:17", "remaining_time": "0:12:22", "throughput": 7170.74, "total_tokens": 7010944} +{"current_steps": 3020, "total_steps": 5305, "loss": 0.1475, "lr": 2.3339614074909495e-06, "epoch": 2.8463713477851083, "percentage": 56.93, "elapsed_time": "0:16:18", "remaining_time": "0:12:20", "throughput": 7177.94, "total_tokens": 7021824} +{"current_steps": 3025, "total_steps": 5305, "loss": 0.1085, "lr": 2.325754695931054e-06, "epoch": 2.8510838831291236, "percentage": 57.02, "elapsed_time": "0:16:18", "remaining_time": "0:12:17", "throughput": 7184.11, "total_tokens": 7031488} +{"current_steps": 3030, "total_steps": 5305, "loss": 0.0536, "lr": 2.3175498707790964e-06, "epoch": 2.8557964184731386, "percentage": 57.12, "elapsed_time": "0:16:19", "remaining_time": "0:12:15", "throughput": 7190.22, "total_tokens": 7041088} +{"current_steps": 3035, "total_steps": 5305, "loss": 0.1759, "lr": 2.3093470208618467e-06, "epoch": 2.8605089538171535, "percentage": 57.21, "elapsed_time": "0:16:19", "remaining_time": "0:12:12", "throughput": 7197.31, "total_tokens": 7051840} +{"current_steps": 3040, "total_steps": 5305, "loss": 0.0005, "lr": 2.3011462349846907e-06, "epoch": 2.8652214891611685, "percentage": 57.3, "elapsed_time": "0:16:20", "remaining_time": "0:12:10", "throughput": 7204.56, "total_tokens": 7062848} +{"current_steps": 3045, "total_steps": 5305, "loss": 0.0006, "lr": 2.292947601930664e-06, "epoch": 2.869934024505184, "percentage": 57.4, "elapsed_time": "0:16:21", "remaining_time": "0:12:08", "throughput": 7216.11, "total_tokens": 7079296} +{"current_steps": 3050, "total_steps": 5305, "loss": 0.1614, "lr": 2.2847512104595005e-06, "epoch": 2.874646559849199, "percentage": 57.49, "elapsed_time": "0:16:21", "remaining_time": "0:12:05", "throughput": 7223.66, "total_tokens": 7090752} +{"current_steps": 3055, "total_steps": 5305, "loss": 0.0003, "lr": 2.2765571493066647e-06, "epoch": 2.879359095193214, "percentage": 57.59, "elapsed_time": "0:16:22", "remaining_time": "0:12:03", "throughput": 7231.43, "total_tokens": 7102464} +{"current_steps": 3060, "total_steps": 5305, "loss": 0.038, "lr": 2.2683655071823925e-06, "epoch": 2.884071630537229, "percentage": 57.68, "elapsed_time": "0:16:22", "remaining_time": "0:12:01", "throughput": 7241.77, "total_tokens": 7117376} +{"current_steps": 3065, "total_steps": 5305, "loss": 0.0809, "lr": 2.2601763727707295e-06, "epoch": 2.888784165881244, "percentage": 57.78, "elapsed_time": "0:16:23", "remaining_time": "0:11:58", "throughput": 7251.54, "total_tokens": 7131584} +{"current_steps": 3070, "total_steps": 5305, "loss": 0.1831, "lr": 2.2519898347285745e-06, "epoch": 2.893496701225259, "percentage": 57.87, "elapsed_time": "0:16:24", "remaining_time": "0:11:56", "throughput": 7258.85, "total_tokens": 7142720} +{"current_steps": 3075, "total_steps": 5305, "loss": 0.1239, "lr": 2.2438059816847165e-06, "epoch": 2.8982092365692744, "percentage": 57.96, "elapsed_time": "0:16:24", "remaining_time": "0:11:54", "throughput": 7267.48, "total_tokens": 7155520} +{"current_steps": 3080, "total_steps": 5305, "loss": 0.0753, "lr": 2.235624902238879e-06, "epoch": 2.9029217719132894, "percentage": 58.06, "elapsed_time": "0:16:25", "remaining_time": "0:11:51", "throughput": 7273.81, "total_tokens": 7165504} +{"current_steps": 3085, "total_steps": 5305, "loss": 0.118, "lr": 2.2274466849607526e-06, "epoch": 2.9076343072573043, "percentage": 58.15, "elapsed_time": "0:16:25", "remaining_time": "0:11:49", "throughput": 7280.91, "total_tokens": 7176384} +{"current_steps": 3090, "total_steps": 5305, "loss": 0.0012, "lr": 2.219271418389046e-06, "epoch": 2.9123468426013197, "percentage": 58.25, "elapsed_time": "0:16:26", "remaining_time": "0:11:46", "throughput": 7288.83, "total_tokens": 7188288} +{"current_steps": 3095, "total_steps": 5305, "loss": 0.1523, "lr": 2.2110991910305233e-06, "epoch": 2.9170593779453347, "percentage": 58.34, "elapsed_time": "0:16:26", "remaining_time": "0:11:44", "throughput": 7296.29, "total_tokens": 7199680} +{"current_steps": 3100, "total_steps": 5305, "loss": 0.0548, "lr": 2.2029300913590413e-06, "epoch": 2.9217719132893496, "percentage": 58.44, "elapsed_time": "0:16:27", "remaining_time": "0:11:42", "throughput": 7304.11, "total_tokens": 7211520} +{"current_steps": 3105, "total_steps": 5305, "loss": 0.0932, "lr": 2.1947642078146005e-06, "epoch": 2.9264844486333645, "percentage": 58.53, "elapsed_time": "0:16:27", "remaining_time": "0:11:39", "throughput": 7310.39, "total_tokens": 7221440} +{"current_steps": 3110, "total_steps": 5305, "loss": 0.0528, "lr": 2.1866016288023815e-06, "epoch": 2.93119698397738, "percentage": 58.62, "elapsed_time": "0:16:28", "remaining_time": "0:11:37", "throughput": 7317.31, "total_tokens": 7232128} +{"current_steps": 3115, "total_steps": 5305, "loss": 0.1414, "lr": 2.178442442691789e-06, "epoch": 2.935909519321395, "percentage": 58.72, "elapsed_time": "0:16:28", "remaining_time": "0:11:35", "throughput": 7323.52, "total_tokens": 7241984} +{"current_steps": 3120, "total_steps": 5305, "loss": 0.0745, "lr": 2.170286737815495e-06, "epoch": 2.9406220546654103, "percentage": 58.81, "elapsed_time": "0:16:29", "remaining_time": "0:11:32", "throughput": 7330.41, "total_tokens": 7252672} +{"current_steps": 3125, "total_steps": 5305, "loss": 0.0453, "lr": 2.1621346024684854e-06, "epoch": 2.945334590009425, "percentage": 58.91, "elapsed_time": "0:16:29", "remaining_time": "0:11:30", "throughput": 7337.82, "total_tokens": 7264064} +{"current_steps": 3130, "total_steps": 5305, "loss": 0.0268, "lr": 2.1539861249071004e-06, "epoch": 2.95004712535344, "percentage": 59.0, "elapsed_time": "0:16:30", "remaining_time": "0:11:28", "throughput": 7345.51, "total_tokens": 7275776} +{"current_steps": 3135, "total_steps": 5305, "loss": 0.0361, "lr": 2.145841393348079e-06, "epoch": 2.954759660697455, "percentage": 59.1, "elapsed_time": "0:16:31", "remaining_time": "0:11:26", "throughput": 7353.35, "total_tokens": 7287680} +{"current_steps": 3140, "total_steps": 5305, "loss": 0.001, "lr": 2.1377004959676086e-06, "epoch": 2.95947219604147, "percentage": 59.19, "elapsed_time": "0:16:31", "remaining_time": "0:11:23", "throughput": 7361.52, "total_tokens": 7300032} +{"current_steps": 3145, "total_steps": 5305, "loss": 0.0632, "lr": 2.129563520900364e-06, "epoch": 2.9641847313854854, "percentage": 59.28, "elapsed_time": "0:16:32", "remaining_time": "0:11:21", "throughput": 7369.08, "total_tokens": 7311616} +{"current_steps": 3150, "total_steps": 5305, "loss": 0.1604, "lr": 2.1214305562385592e-06, "epoch": 2.9688972667295004, "percentage": 59.38, "elapsed_time": "0:16:32", "remaining_time": "0:11:19", "throughput": 7375.36, "total_tokens": 7321600} +{"current_steps": 3155, "total_steps": 5305, "loss": 0.0003, "lr": 2.1133016900309876e-06, "epoch": 2.9736098020735158, "percentage": 59.47, "elapsed_time": "0:16:33", "remaining_time": "0:11:16", "throughput": 7383.0, "total_tokens": 7333376} +{"current_steps": 3160, "total_steps": 5305, "loss": 0.0002, "lr": 2.1051770102820755e-06, "epoch": 2.9783223374175307, "percentage": 59.57, "elapsed_time": "0:16:33", "remaining_time": "0:11:14", "throughput": 7390.04, "total_tokens": 7344384} +{"current_steps": 3165, "total_steps": 5305, "loss": 0.0799, "lr": 2.0970566049509236e-06, "epoch": 2.9830348727615457, "percentage": 59.66, "elapsed_time": "0:16:34", "remaining_time": "0:11:12", "throughput": 7397.42, "total_tokens": 7355840} +{"current_steps": 3170, "total_steps": 5305, "loss": 0.0002, "lr": 2.088940561950359e-06, "epoch": 2.9877474081055606, "percentage": 59.75, "elapsed_time": "0:16:34", "remaining_time": "0:11:10", "throughput": 7405.5, "total_tokens": 7368128} +{"current_steps": 3175, "total_steps": 5305, "loss": 0.1426, "lr": 2.080828969145979e-06, "epoch": 2.992459943449576, "percentage": 59.85, "elapsed_time": "0:16:35", "remaining_time": "0:11:07", "throughput": 7414.02, "total_tokens": 7381056} +{"current_steps": 3180, "total_steps": 5305, "loss": 0.094, "lr": 2.0727219143552034e-06, "epoch": 2.997172478793591, "percentage": 59.94, "elapsed_time": "0:16:36", "remaining_time": "0:11:05", "throughput": 7422.19, "total_tokens": 7393536} +{"current_steps": 3185, "total_steps": 5305, "loss": 0.0923, "lr": 2.0646194853463255e-06, "epoch": 3.001885014137606, "percentage": 60.04, "elapsed_time": "0:16:36", "remaining_time": "0:11:03", "throughput": 7426.83, "total_tokens": 7402656} +{"current_steps": 3190, "total_steps": 5305, "loss": 0.0004, "lr": 2.056521769837553e-06, "epoch": 3.0065975494816213, "percentage": 60.13, "elapsed_time": "0:16:37", "remaining_time": "0:11:01", "throughput": 7436.05, "total_tokens": 7416480} +{"current_steps": 3192, "total_steps": 5305, "eval_loss": 0.3936729431152344, "epoch": 3.008482563619227, "percentage": 60.17, "elapsed_time": "0:16:40", "remaining_time": "0:11:02", "throughput": 7419.33, "total_tokens": 7421856} +{"current_steps": 3195, "total_steps": 5305, "loss": 0.0003, "lr": 2.0484288554960707e-06, "epoch": 3.0113100848256362, "percentage": 60.23, "elapsed_time": "0:17:06", "remaining_time": "0:11:17", "throughput": 7237.87, "total_tokens": 7430304} +{"current_steps": 3200, "total_steps": 5305, "loss": 0.052, "lr": 2.040340829937082e-06, "epoch": 3.016022620169651, "percentage": 60.32, "elapsed_time": "0:17:07", "remaining_time": "0:11:15", "throughput": 7244.93, "total_tokens": 7441568} +{"current_steps": 3205, "total_steps": 5305, "loss": 0.0003, "lr": 2.032257780722865e-06, "epoch": 3.0207351555136666, "percentage": 60.41, "elapsed_time": "0:17:07", "remaining_time": "0:11:13", "throughput": 7251.14, "total_tokens": 7451744} +{"current_steps": 3210, "total_steps": 5305, "loss": 0.0002, "lr": 2.0241797953618204e-06, "epoch": 3.0254476908576815, "percentage": 60.51, "elapsed_time": "0:17:08", "remaining_time": "0:11:11", "throughput": 7258.22, "total_tokens": 7463008} +{"current_steps": 3215, "total_steps": 5305, "loss": 0.0001, "lr": 2.0161069613075295e-06, "epoch": 3.0301602262016964, "percentage": 60.6, "elapsed_time": "0:17:08", "remaining_time": "0:11:08", "throughput": 7266.15, "total_tokens": 7475424} +{"current_steps": 3220, "total_steps": 5305, "loss": 0.0002, "lr": 2.008039365957804e-06, "epoch": 3.0348727615457114, "percentage": 60.7, "elapsed_time": "0:17:09", "remaining_time": "0:11:06", "throughput": 7273.0, "total_tokens": 7486368} +{"current_steps": 3225, "total_steps": 5305, "loss": 0.0005, "lr": 1.9999770966537416e-06, "epoch": 3.039585296889727, "percentage": 60.79, "elapsed_time": "0:17:09", "remaining_time": "0:11:04", "throughput": 7279.83, "total_tokens": 7497312} +{"current_steps": 3230, "total_steps": 5305, "loss": 0.0457, "lr": 1.991920240678776e-06, "epoch": 3.0442978322337417, "percentage": 60.89, "elapsed_time": "0:17:10", "remaining_time": "0:11:01", "throughput": 7286.09, "total_tokens": 7507552} +{"current_steps": 3235, "total_steps": 5305, "loss": 0.0001, "lr": 1.983868885257739e-06, "epoch": 3.0490103675777567, "percentage": 60.98, "elapsed_time": "0:17:10", "remaining_time": "0:10:59", "throughput": 7293.28, "total_tokens": 7519008} +{"current_steps": 3240, "total_steps": 5305, "loss": 0.0908, "lr": 1.97582311755591e-06, "epoch": 3.053722902921772, "percentage": 61.07, "elapsed_time": "0:17:11", "remaining_time": "0:10:57", "throughput": 7300.4, "total_tokens": 7530400} +{"current_steps": 3245, "total_steps": 5305, "loss": 0.0002, "lr": 1.9677830246780764e-06, "epoch": 3.058435438265787, "percentage": 61.17, "elapsed_time": "0:17:12", "remaining_time": "0:10:55", "throughput": 7309.26, "total_tokens": 7544096} +{"current_steps": 3250, "total_steps": 5305, "loss": 0.0044, "lr": 1.9597486936675886e-06, "epoch": 3.063147973609802, "percentage": 61.26, "elapsed_time": "0:17:12", "remaining_time": "0:10:52", "throughput": 7315.79, "total_tokens": 7554784} +{"current_steps": 3255, "total_steps": 5305, "loss": 0.0001, "lr": 1.9517202115054174e-06, "epoch": 3.0678605089538173, "percentage": 61.36, "elapsed_time": "0:17:13", "remaining_time": "0:10:50", "throughput": 7323.84, "total_tokens": 7567392} +{"current_steps": 3260, "total_steps": 5305, "loss": 0.0001, "lr": 1.9436976651092143e-06, "epoch": 3.0725730442978323, "percentage": 61.45, "elapsed_time": "0:17:13", "remaining_time": "0:10:48", "throughput": 7330.26, "total_tokens": 7578016} +{"current_steps": 3265, "total_steps": 5305, "loss": 0.0689, "lr": 1.9356811413323686e-06, "epoch": 3.0772855796418472, "percentage": 61.55, "elapsed_time": "0:17:14", "remaining_time": "0:10:46", "throughput": 7337.62, "total_tokens": 7589728} +{"current_steps": 3270, "total_steps": 5305, "loss": 0.0006, "lr": 1.9276707269630664e-06, "epoch": 3.081998114985862, "percentage": 61.64, "elapsed_time": "0:17:14", "remaining_time": "0:10:44", "throughput": 7344.77, "total_tokens": 7601184} +{"current_steps": 3275, "total_steps": 5305, "loss": 0.0001, "lr": 1.9196665087233548e-06, "epoch": 3.0867106503298776, "percentage": 61.73, "elapsed_time": "0:17:15", "remaining_time": "0:10:41", "throughput": 7351.49, "total_tokens": 7612128} +{"current_steps": 3280, "total_steps": 5305, "loss": 0.0004, "lr": 1.9116685732681995e-06, "epoch": 3.0914231856738925, "percentage": 61.83, "elapsed_time": "0:17:16", "remaining_time": "0:10:39", "throughput": 7358.78, "total_tokens": 7623776} +{"current_steps": 3285, "total_steps": 5305, "loss": 0.0001, "lr": 1.9036770071845467e-06, "epoch": 3.0961357210179075, "percentage": 61.92, "elapsed_time": "0:17:16", "remaining_time": "0:10:37", "throughput": 7366.58, "total_tokens": 7636128} +{"current_steps": 3290, "total_steps": 5305, "loss": 0.0002, "lr": 1.8956918969903881e-06, "epoch": 3.100848256361923, "percentage": 62.02, "elapsed_time": "0:17:17", "remaining_time": "0:10:35", "throughput": 7372.83, "total_tokens": 7646432} +{"current_steps": 3295, "total_steps": 5305, "loss": 0.0, "lr": 1.887713329133824e-06, "epoch": 3.105560791705938, "percentage": 62.11, "elapsed_time": "0:17:17", "remaining_time": "0:10:32", "throughput": 7379.86, "total_tokens": 7657824} +{"current_steps": 3300, "total_steps": 5305, "loss": 0.0829, "lr": 1.8797413899921224e-06, "epoch": 3.1102733270499527, "percentage": 62.21, "elapsed_time": "0:17:18", "remaining_time": "0:10:30", "throughput": 7387.42, "total_tokens": 7669920} +{"current_steps": 3305, "total_steps": 5305, "loss": 0.0054, "lr": 1.8717761658707916e-06, "epoch": 3.114985862393968, "percentage": 62.3, "elapsed_time": "0:17:18", "remaining_time": "0:10:28", "throughput": 7394.95, "total_tokens": 7681952} +{"current_steps": 3310, "total_steps": 5305, "loss": 0.0, "lr": 1.86381774300264e-06, "epoch": 3.119698397737983, "percentage": 62.39, "elapsed_time": "0:17:19", "remaining_time": "0:10:26", "throughput": 7401.6, "total_tokens": 7692832} +{"current_steps": 3315, "total_steps": 5305, "loss": 0.1029, "lr": 1.8558662075468468e-06, "epoch": 3.124410933081998, "percentage": 62.49, "elapsed_time": "0:17:19", "remaining_time": "0:10:24", "throughput": 7407.78, "total_tokens": 7703072} +{"current_steps": 3320, "total_steps": 5305, "loss": 0.0, "lr": 1.8479216455880225e-06, "epoch": 3.1291234684260134, "percentage": 62.58, "elapsed_time": "0:17:20", "remaining_time": "0:10:22", "throughput": 7414.44, "total_tokens": 7714016} +{"current_steps": 3325, "total_steps": 5305, "loss": 0.0002, "lr": 1.8399841431352855e-06, "epoch": 3.1338360037700284, "percentage": 62.68, "elapsed_time": "0:17:20", "remaining_time": "0:10:19", "throughput": 7422.4, "total_tokens": 7726688} +{"current_steps": 3330, "total_steps": 5305, "loss": 0.0001, "lr": 1.8320537861213267e-06, "epoch": 3.1385485391140433, "percentage": 62.77, "elapsed_time": "0:17:21", "remaining_time": "0:10:17", "throughput": 7430.57, "total_tokens": 7739680} +{"current_steps": 3335, "total_steps": 5305, "loss": 0.0001, "lr": 1.8241306604014761e-06, "epoch": 3.1432610744580582, "percentage": 62.87, "elapsed_time": "0:17:22", "remaining_time": "0:10:15", "throughput": 7435.98, "total_tokens": 7749024} +{"current_steps": 3340, "total_steps": 5305, "loss": 0.0008, "lr": 1.816214851752779e-06, "epoch": 3.1479736098020736, "percentage": 62.96, "elapsed_time": "0:17:22", "remaining_time": "0:10:13", "throughput": 7443.81, "total_tokens": 7761568} +{"current_steps": 3345, "total_steps": 5305, "loss": 0.0001, "lr": 1.8083064458730651e-06, "epoch": 3.1526861451460886, "percentage": 63.05, "elapsed_time": "0:17:23", "remaining_time": "0:10:11", "throughput": 7450.5, "total_tokens": 7772640} +{"current_steps": 3350, "total_steps": 5305, "loss": 0.0004, "lr": 1.8004055283800204e-06, "epoch": 3.1573986804901035, "percentage": 63.15, "elapsed_time": "0:17:23", "remaining_time": "0:10:09", "throughput": 7457.88, "total_tokens": 7784672} +{"current_steps": 3355, "total_steps": 5305, "loss": 0.0, "lr": 1.7925121848102583e-06, "epoch": 3.162111215834119, "percentage": 63.24, "elapsed_time": "0:17:24", "remaining_time": "0:10:07", "throughput": 7464.66, "total_tokens": 7795872} +{"current_steps": 3360, "total_steps": 5305, "loss": 0.0, "lr": 1.7846265006183976e-06, "epoch": 3.166823751178134, "percentage": 63.34, "elapsed_time": "0:17:24", "remaining_time": "0:10:04", "throughput": 7472.42, "total_tokens": 7808416} +{"current_steps": 3365, "total_steps": 5305, "loss": 0.0, "lr": 1.776748561176137e-06, "epoch": 3.171536286522149, "percentage": 63.43, "elapsed_time": "0:17:25", "remaining_time": "0:10:02", "throughput": 7479.95, "total_tokens": 7820640} +{"current_steps": 3370, "total_steps": 5305, "loss": 0.0, "lr": 1.7688784517713247e-06, "epoch": 3.176248821866164, "percentage": 63.52, "elapsed_time": "0:17:26", "remaining_time": "0:10:00", "throughput": 7486.12, "total_tokens": 7831072} +{"current_steps": 3375, "total_steps": 5305, "loss": 0.0969, "lr": 1.761016257607044e-06, "epoch": 3.180961357210179, "percentage": 63.62, "elapsed_time": "0:17:26", "remaining_time": "0:09:58", "throughput": 7492.59, "total_tokens": 7841888} +{"current_steps": 3380, "total_steps": 5305, "loss": 0.0488, "lr": 1.7531620638006834e-06, "epoch": 3.185673892554194, "percentage": 63.71, "elapsed_time": "0:17:27", "remaining_time": "0:09:56", "throughput": 7499.2, "total_tokens": 7852896} +{"current_steps": 3385, "total_steps": 5305, "loss": 0.0013, "lr": 1.7453159553830217e-06, "epoch": 3.190386427898209, "percentage": 63.81, "elapsed_time": "0:17:27", "remaining_time": "0:09:54", "throughput": 7509.09, "total_tokens": 7868384} +{"current_steps": 3390, "total_steps": 5305, "loss": 0.0001, "lr": 1.7374780172973004e-06, "epoch": 3.1950989632422244, "percentage": 63.9, "elapsed_time": "0:17:28", "remaining_time": "0:09:52", "throughput": 7517.1, "total_tokens": 7881312} +{"current_steps": 3395, "total_steps": 5305, "loss": 0.0564, "lr": 1.7296483343983095e-06, "epoch": 3.1998114985862394, "percentage": 64.0, "elapsed_time": "0:17:28", "remaining_time": "0:09:50", "throughput": 7523.53, "total_tokens": 7892128} +{"current_steps": 3400, "total_steps": 5305, "loss": 0.0002, "lr": 1.7218269914514668e-06, "epoch": 3.2045240339302543, "percentage": 64.09, "elapsed_time": "0:17:29", "remaining_time": "0:09:48", "throughput": 7529.74, "total_tokens": 7902624} +{"current_steps": 3405, "total_steps": 5305, "loss": 0.0001, "lr": 1.714014073131901e-06, "epoch": 3.2092365692742697, "percentage": 64.18, "elapsed_time": "0:17:30", "remaining_time": "0:09:45", "throughput": 7537.44, "total_tokens": 7915168} +{"current_steps": 3410, "total_steps": 5305, "loss": 0.0002, "lr": 1.7062096640235327e-06, "epoch": 3.2139491046182846, "percentage": 64.28, "elapsed_time": "0:17:30", "remaining_time": "0:09:43", "throughput": 7543.41, "total_tokens": 7925472} +{"current_steps": 3415, "total_steps": 5305, "loss": 0.0001, "lr": 1.6984138486181612e-06, "epoch": 3.2186616399622996, "percentage": 64.37, "elapsed_time": "0:17:31", "remaining_time": "0:09:41", "throughput": 7553.01, "total_tokens": 7940576} +{"current_steps": 3420, "total_steps": 5305, "loss": 0.0323, "lr": 1.6906267113145514e-06, "epoch": 3.223374175306315, "percentage": 64.47, "elapsed_time": "0:17:32", "remaining_time": "0:09:39", "throughput": 7562.75, "total_tokens": 7956064} +{"current_steps": 3425, "total_steps": 5305, "loss": 0.0, "lr": 1.6828483364175127e-06, "epoch": 3.22808671065033, "percentage": 64.56, "elapsed_time": "0:17:32", "remaining_time": "0:09:37", "throughput": 7569.42, "total_tokens": 7967264} +{"current_steps": 3430, "total_steps": 5305, "loss": 0.0003, "lr": 1.6750788081369951e-06, "epoch": 3.232799245994345, "percentage": 64.66, "elapsed_time": "0:17:33", "remaining_time": "0:09:35", "throughput": 7575.86, "total_tokens": 7978144} +{"current_steps": 3435, "total_steps": 5305, "loss": 0.0443, "lr": 1.6673182105871733e-06, "epoch": 3.23751178133836, "percentage": 64.75, "elapsed_time": "0:17:33", "remaining_time": "0:09:33", "throughput": 7582.39, "total_tokens": 7989152} +{"current_steps": 3440, "total_steps": 5305, "loss": 0.0, "lr": 1.659566627785536e-06, "epoch": 3.242224316682375, "percentage": 64.84, "elapsed_time": "0:17:34", "remaining_time": "0:09:31", "throughput": 7589.41, "total_tokens": 8000800} +{"current_steps": 3445, "total_steps": 5305, "loss": 0.0004, "lr": 1.651824143651975e-06, "epoch": 3.24693685202639, "percentage": 64.94, "elapsed_time": "0:17:34", "remaining_time": "0:09:29", "throughput": 7598.18, "total_tokens": 8014816} +{"current_steps": 3450, "total_steps": 5305, "loss": 0.0, "lr": 1.644090842007881e-06, "epoch": 3.251649387370405, "percentage": 65.03, "elapsed_time": "0:17:35", "remaining_time": "0:09:27", "throughput": 7604.19, "total_tokens": 8025120} +{"current_steps": 3455, "total_steps": 5305, "loss": 0.0, "lr": 1.6363668065752336e-06, "epoch": 3.2563619227144205, "percentage": 65.13, "elapsed_time": "0:17:35", "remaining_time": "0:09:25", "throughput": 7611.61, "total_tokens": 8037344} +{"current_steps": 3458, "total_steps": 5305, "eval_loss": 0.5191035270690918, "epoch": 3.2591894439208295, "percentage": 65.18, "elapsed_time": "0:17:38", "remaining_time": "0:09:25", "throughput": 7595.76, "total_tokens": 8043744} +{"current_steps": 3460, "total_steps": 5305, "loss": 0.0875, "lr": 1.6286521209756917e-06, "epoch": 3.2610744580584354, "percentage": 65.22, "elapsed_time": "0:18:25", "remaining_time": "0:09:49", "throughput": 7280.9, "total_tokens": 8048096} +{"current_steps": 3465, "total_steps": 5305, "loss": 0.0, "lr": 1.6209468687296947e-06, "epoch": 3.2657869934024504, "percentage": 65.32, "elapsed_time": "0:18:25", "remaining_time": "0:09:47", "throughput": 7288.84, "total_tokens": 8061344} +{"current_steps": 3470, "total_steps": 5305, "loss": 0.0, "lr": 1.613251133255554e-06, "epoch": 3.2704995287464658, "percentage": 65.41, "elapsed_time": "0:18:26", "remaining_time": "0:09:45", "throughput": 7295.85, "total_tokens": 8073184} +{"current_steps": 3475, "total_steps": 5305, "loss": 0.0, "lr": 1.6055649978685517e-06, "epoch": 3.2752120640904807, "percentage": 65.5, "elapsed_time": "0:18:27", "remaining_time": "0:09:42", "throughput": 7301.32, "total_tokens": 8082976} +{"current_steps": 3480, "total_steps": 5305, "loss": 0.0, "lr": 1.5978885457800348e-06, "epoch": 3.2799245994344957, "percentage": 65.6, "elapsed_time": "0:18:27", "remaining_time": "0:09:40", "throughput": 7308.16, "total_tokens": 8094624} +{"current_steps": 3485, "total_steps": 5305, "loss": 0.0843, "lr": 1.59022186009652e-06, "epoch": 3.284637134778511, "percentage": 65.69, "elapsed_time": "0:18:28", "remaining_time": "0:09:38", "throughput": 7314.03, "total_tokens": 8104928} +{"current_steps": 3490, "total_steps": 5305, "loss": 0.0, "lr": 1.5825650238187918e-06, "epoch": 3.289349670122526, "percentage": 65.79, "elapsed_time": "0:18:28", "remaining_time": "0:09:36", "throughput": 7321.09, "total_tokens": 8116896} +{"current_steps": 3495, "total_steps": 5305, "loss": 0.0875, "lr": 1.5749181198410014e-06, "epoch": 3.294062205466541, "percentage": 65.88, "elapsed_time": "0:18:29", "remaining_time": "0:09:34", "throughput": 7327.48, "total_tokens": 8127968} +{"current_steps": 3500, "total_steps": 5305, "loss": 0.0326, "lr": 1.5672812309497722e-06, "epoch": 3.298774740810556, "percentage": 65.98, "elapsed_time": "0:18:29", "remaining_time": "0:09:32", "throughput": 7334.51, "total_tokens": 8139936} +{"current_steps": 3505, "total_steps": 5305, "loss": 0.0001, "lr": 1.5596544398233028e-06, "epoch": 3.3034872761545713, "percentage": 66.07, "elapsed_time": "0:18:30", "remaining_time": "0:09:30", "throughput": 7341.16, "total_tokens": 8151392} +{"current_steps": 3510, "total_steps": 5305, "loss": 0.0, "lr": 1.5520378290304723e-06, "epoch": 3.308199811498586, "percentage": 66.16, "elapsed_time": "0:18:30", "remaining_time": "0:09:28", "throughput": 7349.52, "total_tokens": 8165280} +{"current_steps": 3515, "total_steps": 5305, "loss": 0.0, "lr": 1.544431481029944e-06, "epoch": 3.312912346842601, "percentage": 66.26, "elapsed_time": "0:18:31", "remaining_time": "0:09:26", "throughput": 7356.91, "total_tokens": 8177696} +{"current_steps": 3520, "total_steps": 5305, "loss": 0.0, "lr": 1.5368354781692764e-06, "epoch": 3.3176248821866166, "percentage": 66.35, "elapsed_time": "0:18:32", "remaining_time": "0:09:23", "throughput": 7363.62, "total_tokens": 8189280} +{"current_steps": 3525, "total_steps": 5305, "loss": 0.0001, "lr": 1.5292499026840292e-06, "epoch": 3.3223374175306315, "percentage": 66.45, "elapsed_time": "0:18:32", "remaining_time": "0:09:21", "throughput": 7371.7, "total_tokens": 8202784} +{"current_steps": 3530, "total_steps": 5305, "loss": 0.1032, "lr": 1.5216748366968743e-06, "epoch": 3.3270499528746464, "percentage": 66.54, "elapsed_time": "0:18:33", "remaining_time": "0:09:19", "throughput": 7379.58, "total_tokens": 8216032} +{"current_steps": 3535, "total_steps": 5305, "loss": 0.0001, "lr": 1.5141103622167042e-06, "epoch": 3.331762488218662, "percentage": 66.64, "elapsed_time": "0:18:33", "remaining_time": "0:09:17", "throughput": 7386.79, "total_tokens": 8228320} +{"current_steps": 3540, "total_steps": 5305, "loss": 0.0487, "lr": 1.5065565611377472e-06, "epoch": 3.336475023562677, "percentage": 66.73, "elapsed_time": "0:18:34", "remaining_time": "0:09:15", "throughput": 7393.88, "total_tokens": 8240416} +{"current_steps": 3545, "total_steps": 5305, "loss": 0.0, "lr": 1.4990135152386814e-06, "epoch": 3.3411875589066917, "percentage": 66.82, "elapsed_time": "0:18:35", "remaining_time": "0:09:13", "throughput": 7401.04, "total_tokens": 8252640} +{"current_steps": 3550, "total_steps": 5305, "loss": 0.0001, "lr": 1.4914813061817434e-06, "epoch": 3.345900094250707, "percentage": 66.92, "elapsed_time": "0:18:35", "remaining_time": "0:09:11", "throughput": 7406.13, "total_tokens": 8261984} +{"current_steps": 3555, "total_steps": 5305, "loss": 0.0036, "lr": 1.4839600155118525e-06, "epoch": 3.350612629594722, "percentage": 67.01, "elapsed_time": "0:18:36", "remaining_time": "0:09:09", "throughput": 7412.83, "total_tokens": 8273568} +{"current_steps": 3560, "total_steps": 5305, "loss": 0.0001, "lr": 1.4764497246557214e-06, "epoch": 3.355325164938737, "percentage": 67.11, "elapsed_time": "0:18:36", "remaining_time": "0:09:07", "throughput": 7419.76, "total_tokens": 8285472} +{"current_steps": 3565, "total_steps": 5305, "loss": 0.0008, "lr": 1.4689505149209788e-06, "epoch": 3.360037700282752, "percentage": 67.2, "elapsed_time": "0:18:37", "remaining_time": "0:09:05", "throughput": 7424.86, "total_tokens": 8294816} +{"current_steps": 3570, "total_steps": 5305, "loss": 0.0, "lr": 1.4614624674952843e-06, "epoch": 3.3647502356267673, "percentage": 67.3, "elapsed_time": "0:18:37", "remaining_time": "0:09:03", "throughput": 7430.89, "total_tokens": 8305504} +{"current_steps": 3575, "total_steps": 5305, "loss": 0.0518, "lr": 1.4539856634454558e-06, "epoch": 3.3694627709707823, "percentage": 67.39, "elapsed_time": "0:18:38", "remaining_time": "0:09:01", "throughput": 7436.97, "total_tokens": 8316320} +{"current_steps": 3580, "total_steps": 5305, "loss": 0.0384, "lr": 1.4465201837165876e-06, "epoch": 3.3741753063147972, "percentage": 67.48, "elapsed_time": "0:18:38", "remaining_time": "0:08:59", "throughput": 7443.04, "total_tokens": 8327200} +{"current_steps": 3585, "total_steps": 5305, "loss": 0.0, "lr": 1.4390661091311742e-06, "epoch": 3.3788878416588126, "percentage": 67.58, "elapsed_time": "0:18:39", "remaining_time": "0:08:57", "throughput": 7450.19, "total_tokens": 8339488} +{"current_steps": 3590, "total_steps": 5305, "loss": 0.0642, "lr": 1.4316235203882373e-06, "epoch": 3.3836003770028276, "percentage": 67.67, "elapsed_time": "0:18:39", "remaining_time": "0:08:55", "throughput": 7458.25, "total_tokens": 8353120} +{"current_steps": 3595, "total_steps": 5305, "loss": 0.0, "lr": 1.4241924980624485e-06, "epoch": 3.3883129123468425, "percentage": 67.77, "elapsed_time": "0:18:40", "remaining_time": "0:08:52", "throughput": 7464.93, "total_tokens": 8364768} +{"current_steps": 3600, "total_steps": 5305, "loss": 0.0029, "lr": 1.4167731226032656e-06, "epoch": 3.3930254476908575, "percentage": 67.86, "elapsed_time": "0:18:41", "remaining_time": "0:08:50", "throughput": 7471.66, "total_tokens": 8376480} +{"current_steps": 3605, "total_steps": 5305, "loss": 0.0122, "lr": 1.4093654743340462e-06, "epoch": 3.397737983034873, "percentage": 67.95, "elapsed_time": "0:18:41", "remaining_time": "0:08:48", "throughput": 7477.38, "total_tokens": 8386784} +{"current_steps": 3610, "total_steps": 5305, "loss": 0.0, "lr": 1.4019696334511962e-06, "epoch": 3.402450518378888, "percentage": 68.05, "elapsed_time": "0:18:42", "remaining_time": "0:08:46", "throughput": 7483.7, "total_tokens": 8397984} +{"current_steps": 3615, "total_steps": 5305, "loss": 0.0, "lr": 1.3945856800232874e-06, "epoch": 3.4071630537229027, "percentage": 68.14, "elapsed_time": "0:18:42", "remaining_time": "0:08:44", "throughput": 7489.57, "total_tokens": 8408544} +{"current_steps": 3620, "total_steps": 5305, "loss": 0.0, "lr": 1.3872136939902004e-06, "epoch": 3.411875589066918, "percentage": 68.24, "elapsed_time": "0:18:43", "remaining_time": "0:08:42", "throughput": 7495.77, "total_tokens": 8419552} +{"current_steps": 3625, "total_steps": 5305, "loss": 0.0001, "lr": 1.379853755162249e-06, "epoch": 3.416588124410933, "percentage": 68.33, "elapsed_time": "0:18:43", "remaining_time": "0:08:40", "throughput": 7501.32, "total_tokens": 8429664} +{"current_steps": 3630, "total_steps": 5305, "loss": 0.0, "lr": 1.3725059432193278e-06, "epoch": 3.421300659754948, "percentage": 68.43, "elapsed_time": "0:18:44", "remaining_time": "0:08:38", "throughput": 7508.01, "total_tokens": 8441376} +{"current_steps": 3635, "total_steps": 5305, "loss": 0.0, "lr": 1.3651703377100406e-06, "epoch": 3.4260131950989634, "percentage": 68.52, "elapsed_time": "0:18:44", "remaining_time": "0:08:36", "throughput": 7514.55, "total_tokens": 8452896} +{"current_steps": 3640, "total_steps": 5305, "loss": 0.0, "lr": 1.3578470180508432e-06, "epoch": 3.4307257304429783, "percentage": 68.61, "elapsed_time": "0:18:45", "remaining_time": "0:08:34", "throughput": 7520.3, "total_tokens": 8463328} +{"current_steps": 3645, "total_steps": 5305, "loss": 0.0, "lr": 1.3505360635251813e-06, "epoch": 3.4354382657869933, "percentage": 68.71, "elapsed_time": "0:18:45", "remaining_time": "0:08:32", "throughput": 7527.49, "total_tokens": 8475808} +{"current_steps": 3650, "total_steps": 5305, "loss": 0.0122, "lr": 1.3432375532826374e-06, "epoch": 3.4401508011310087, "percentage": 68.8, "elapsed_time": "0:18:46", "remaining_time": "0:08:30", "throughput": 7534.11, "total_tokens": 8487456} +{"current_steps": 3655, "total_steps": 5305, "loss": 0.0, "lr": 1.3359515663380668e-06, "epoch": 3.4448633364750236, "percentage": 68.9, "elapsed_time": "0:18:47", "remaining_time": "0:08:28", "throughput": 7543.81, "total_tokens": 8503712} +{"current_steps": 3660, "total_steps": 5305, "loss": 0.2188, "lr": 1.3286781815707465e-06, "epoch": 3.4495758718190386, "percentage": 68.99, "elapsed_time": "0:18:47", "remaining_time": "0:08:26", "throughput": 7550.05, "total_tokens": 8514848} +{"current_steps": 3665, "total_steps": 5305, "loss": 0.0985, "lr": 1.3214174777235192e-06, "epoch": 3.4542884071630535, "percentage": 69.09, "elapsed_time": "0:18:48", "remaining_time": "0:08:24", "throughput": 7555.57, "total_tokens": 8524960} +{"current_steps": 3670, "total_steps": 5305, "loss": 0.0001, "lr": 1.3141695334019453e-06, "epoch": 3.459000942507069, "percentage": 69.18, "elapsed_time": "0:18:48", "remaining_time": "0:08:22", "throughput": 7561.34, "total_tokens": 8535520} +{"current_steps": 3675, "total_steps": 5305, "loss": 0.0023, "lr": 1.3069344270734452e-06, "epoch": 3.463713477851084, "percentage": 69.27, "elapsed_time": "0:18:49", "remaining_time": "0:08:20", "throughput": 7566.27, "total_tokens": 8544864} +{"current_steps": 3680, "total_steps": 5305, "loss": 0.0001, "lr": 1.2997122370664538e-06, "epoch": 3.468426013195099, "percentage": 69.37, "elapsed_time": "0:18:49", "remaining_time": "0:08:18", "throughput": 7573.14, "total_tokens": 8556960} +{"current_steps": 3685, "total_steps": 5305, "loss": 0.0001, "lr": 1.2925030415695727e-06, "epoch": 3.473138548539114, "percentage": 69.46, "elapsed_time": "0:18:50", "remaining_time": "0:08:16", "throughput": 7579.18, "total_tokens": 8567968} +{"current_steps": 3690, "total_steps": 5305, "loss": 0.0595, "lr": 1.285306918630722e-06, "epoch": 3.477851083883129, "percentage": 69.56, "elapsed_time": "0:18:51", "remaining_time": "0:08:15", "throughput": 7587.33, "total_tokens": 8581920} +{"current_steps": 3695, "total_steps": 5305, "loss": 0.0442, "lr": 1.2781239461562966e-06, "epoch": 3.482563619227144, "percentage": 69.65, "elapsed_time": "0:18:51", "remaining_time": "0:08:13", "throughput": 7594.7, "total_tokens": 8594720} +{"current_steps": 3700, "total_steps": 5305, "loss": 0.0001, "lr": 1.2709542019103211e-06, "epoch": 3.4872761545711595, "percentage": 69.75, "elapsed_time": "0:18:52", "remaining_time": "0:08:11", "throughput": 7601.36, "total_tokens": 8606560} +{"current_steps": 3705, "total_steps": 5305, "loss": 0.0017, "lr": 1.2637977635136123e-06, "epoch": 3.4919886899151744, "percentage": 69.84, "elapsed_time": "0:18:52", "remaining_time": "0:08:09", "throughput": 7607.86, "total_tokens": 8618208} +{"current_steps": 3710, "total_steps": 5305, "loss": 0.0089, "lr": 1.2566547084429326e-06, "epoch": 3.4967012252591894, "percentage": 69.93, "elapsed_time": "0:18:53", "remaining_time": "0:08:07", "throughput": 7615.56, "total_tokens": 8631584} +{"current_steps": 3715, "total_steps": 5305, "loss": 0.0338, "lr": 1.2495251140301553e-06, "epoch": 3.5014137606032048, "percentage": 70.03, "elapsed_time": "0:18:53", "remaining_time": "0:08:05", "throughput": 7621.88, "total_tokens": 8642912} +{"current_steps": 3720, "total_steps": 5305, "loss": 0.0002, "lr": 1.2424090574614262e-06, "epoch": 3.5061262959472197, "percentage": 70.12, "elapsed_time": "0:18:54", "remaining_time": "0:08:03", "throughput": 7626.83, "total_tokens": 8652384} +{"current_steps": 3724, "total_steps": 5305, "eval_loss": 0.4635506868362427, "epoch": 3.5098963242224315, "percentage": 70.2, "elapsed_time": "0:18:57", "remaining_time": "0:08:02", "throughput": 7612.81, "total_tokens": 8660768} +{"current_steps": 3725, "total_steps": 5305, "loss": 0.0008, "lr": 1.2353066157763305e-06, "epoch": 3.5108388312912346, "percentage": 70.22, "elapsed_time": "0:19:39", "remaining_time": "0:08:20", "throughput": 7343.26, "total_tokens": 8662624} +{"current_steps": 3730, "total_steps": 5305, "loss": 0.0001, "lr": 1.2282178658670514e-06, "epoch": 3.5155513666352496, "percentage": 70.31, "elapsed_time": "0:19:40", "remaining_time": "0:08:18", "throughput": 7348.71, "total_tokens": 8672864} +{"current_steps": 3735, "total_steps": 5305, "loss": 0.0001, "lr": 1.221142884477548e-06, "epoch": 3.520263901979265, "percentage": 70.41, "elapsed_time": "0:19:40", "remaining_time": "0:08:16", "throughput": 7355.02, "total_tokens": 8684448} +{"current_steps": 3740, "total_steps": 5305, "loss": 0.0001, "lr": 1.2140817482027155e-06, "epoch": 3.52497643732328, "percentage": 70.5, "elapsed_time": "0:19:41", "remaining_time": "0:08:14", "throughput": 7362.93, "total_tokens": 8698336} +{"current_steps": 3745, "total_steps": 5305, "loss": 0.0, "lr": 1.207034533487564e-06, "epoch": 3.529688972667295, "percentage": 70.59, "elapsed_time": "0:19:41", "remaining_time": "0:08:12", "throughput": 7370.05, "total_tokens": 8711072} +{"current_steps": 3750, "total_steps": 5305, "loss": 0.0001, "lr": 1.2000013166263803e-06, "epoch": 3.5344015080113103, "percentage": 70.69, "elapsed_time": "0:19:42", "remaining_time": "0:08:10", "throughput": 7377.18, "total_tokens": 8723872} +{"current_steps": 3755, "total_steps": 5305, "loss": 0.0013, "lr": 1.1929821737619132e-06, "epoch": 3.539114043355325, "percentage": 70.78, "elapsed_time": "0:19:43", "remaining_time": "0:08:08", "throughput": 7383.71, "total_tokens": 8735776} +{"current_steps": 3760, "total_steps": 5305, "loss": 0.0, "lr": 1.1859771808845417e-06, "epoch": 3.54382657869934, "percentage": 70.88, "elapsed_time": "0:19:43", "remaining_time": "0:08:06", "throughput": 7393.57, "total_tokens": 8752736} +{"current_steps": 3765, "total_steps": 5305, "loss": 0.0001, "lr": 1.1789864138314577e-06, "epoch": 3.548539114043355, "percentage": 70.97, "elapsed_time": "0:19:44", "remaining_time": "0:08:04", "throughput": 7401.44, "total_tokens": 8766688} +{"current_steps": 3770, "total_steps": 5305, "loss": 0.0, "lr": 1.1720099482858364e-06, "epoch": 3.5532516493873705, "percentage": 71.07, "elapsed_time": "0:19:45", "remaining_time": "0:08:02", "throughput": 7409.9, "total_tokens": 8781536} +{"current_steps": 3775, "total_steps": 5305, "loss": 0.0001, "lr": 1.1650478597760284e-06, "epoch": 3.5579641847313854, "percentage": 71.16, "elapsed_time": "0:19:45", "remaining_time": "0:08:00", "throughput": 7415.58, "total_tokens": 8792224} +{"current_steps": 3780, "total_steps": 5305, "loss": 0.0704, "lr": 1.158100223674733e-06, "epoch": 3.562676720075401, "percentage": 71.25, "elapsed_time": "0:19:46", "remaining_time": "0:07:58", "throughput": 7421.42, "total_tokens": 8803168} +{"current_steps": 3785, "total_steps": 5305, "loss": 0.0001, "lr": 1.1511671151981861e-06, "epoch": 3.5673892554194158, "percentage": 71.35, "elapsed_time": "0:19:46", "remaining_time": "0:07:56", "throughput": 7426.89, "total_tokens": 8813536} +{"current_steps": 3790, "total_steps": 5305, "loss": 0.0, "lr": 1.1442486094053445e-06, "epoch": 3.5721017907634307, "percentage": 71.44, "elapsed_time": "0:19:47", "remaining_time": "0:07:54", "throughput": 7432.32, "total_tokens": 8823840} +{"current_steps": 3795, "total_steps": 5305, "loss": 0.0, "lr": 1.1373447811970762e-06, "epoch": 3.5768143261074457, "percentage": 71.54, "elapsed_time": "0:19:47", "remaining_time": "0:07:52", "throughput": 7439.34, "total_tokens": 8836576} +{"current_steps": 3800, "total_steps": 5305, "loss": 0.0, "lr": 1.130455705315345e-06, "epoch": 3.581526861451461, "percentage": 71.63, "elapsed_time": "0:19:48", "remaining_time": "0:07:50", "throughput": 7446.68, "total_tokens": 8849824} +{"current_steps": 3805, "total_steps": 5305, "loss": 0.1829, "lr": 1.1235814563424046e-06, "epoch": 3.586239396795476, "percentage": 71.72, "elapsed_time": "0:19:48", "remaining_time": "0:07:48", "throughput": 7452.29, "total_tokens": 8860448} +{"current_steps": 3810, "total_steps": 5305, "loss": 0.0001, "lr": 1.1167221086999897e-06, "epoch": 3.590951932139491, "percentage": 71.82, "elapsed_time": "0:19:49", "remaining_time": "0:07:46", "throughput": 7458.38, "total_tokens": 8871776} +{"current_steps": 3815, "total_steps": 5305, "loss": 0.0006, "lr": 1.10987773664851e-06, "epoch": 3.5956644674835063, "percentage": 71.91, "elapsed_time": "0:19:50", "remaining_time": "0:07:44", "throughput": 7466.21, "total_tokens": 8885728} +{"current_steps": 3820, "total_steps": 5305, "loss": 0.0, "lr": 1.1030484142862511e-06, "epoch": 3.6003770028275213, "percentage": 72.01, "elapsed_time": "0:19:50", "remaining_time": "0:07:42", "throughput": 7471.5, "total_tokens": 8895904} +{"current_steps": 3825, "total_steps": 5305, "loss": 0.0006, "lr": 1.0962342155485613e-06, "epoch": 3.605089538171536, "percentage": 72.1, "elapsed_time": "0:19:51", "remaining_time": "0:07:40", "throughput": 7477.93, "total_tokens": 8907808} +{"current_steps": 3830, "total_steps": 5305, "loss": 0.0, "lr": 1.0894352142070652e-06, "epoch": 3.609802073515551, "percentage": 72.2, "elapsed_time": "0:19:51", "remaining_time": "0:07:38", "throughput": 7483.51, "total_tokens": 8918432} +{"current_steps": 3835, "total_steps": 5305, "loss": 0.072, "lr": 1.0826514838688533e-06, "epoch": 3.6145146088595665, "percentage": 72.29, "elapsed_time": "0:19:52", "remaining_time": "0:07:37", "throughput": 7489.19, "total_tokens": 8929248} +{"current_steps": 3840, "total_steps": 5305, "loss": 0.0001, "lr": 1.075883097975691e-06, "epoch": 3.6192271442035815, "percentage": 72.38, "elapsed_time": "0:19:52", "remaining_time": "0:07:35", "throughput": 7495.1, "total_tokens": 8940384} +{"current_steps": 3845, "total_steps": 5305, "loss": 0.0, "lr": 1.0691301298032218e-06, "epoch": 3.623939679547597, "percentage": 72.48, "elapsed_time": "0:19:53", "remaining_time": "0:07:33", "throughput": 7500.51, "total_tokens": 8950816} +{"current_steps": 3850, "total_steps": 5305, "loss": 0.0001, "lr": 1.0623926524601771e-06, "epoch": 3.628652214891612, "percentage": 72.57, "elapsed_time": "0:19:53", "remaining_time": "0:07:31", "throughput": 7507.3, "total_tokens": 8963296} +{"current_steps": 3855, "total_steps": 5305, "loss": 0.0, "lr": 1.0556707388875786e-06, "epoch": 3.6333647502356268, "percentage": 72.67, "elapsed_time": "0:19:54", "remaining_time": "0:07:29", "throughput": 7513.32, "total_tokens": 8974624} +{"current_steps": 3860, "total_steps": 5305, "loss": 0.0596, "lr": 1.048964461857954e-06, "epoch": 3.6380772855796417, "percentage": 72.76, "elapsed_time": "0:19:55", "remaining_time": "0:07:27", "throughput": 7519.36, "total_tokens": 8985952} +{"current_steps": 3865, "total_steps": 5305, "loss": 0.0002, "lr": 1.0422738939745453e-06, "epoch": 3.6427898209236567, "percentage": 72.86, "elapsed_time": "0:19:55", "remaining_time": "0:07:25", "throughput": 7524.57, "total_tokens": 8996064} +{"current_steps": 3870, "total_steps": 5305, "loss": 0.0002, "lr": 1.035599107670529e-06, "epoch": 3.647502356267672, "percentage": 72.95, "elapsed_time": "0:19:56", "remaining_time": "0:07:23", "throughput": 7529.88, "total_tokens": 9006368} +{"current_steps": 3875, "total_steps": 5305, "loss": 0.0001, "lr": 1.0289401752082214e-06, "epoch": 3.652214891611687, "percentage": 73.04, "elapsed_time": "0:19:56", "remaining_time": "0:07:21", "throughput": 7536.25, "total_tokens": 9018272} +{"current_steps": 3880, "total_steps": 5305, "loss": 0.1112, "lr": 1.0222971686783089e-06, "epoch": 3.6569274269557024, "percentage": 73.14, "elapsed_time": "0:19:57", "remaining_time": "0:07:19", "throughput": 7542.14, "total_tokens": 9029472} +{"current_steps": 3885, "total_steps": 5305, "loss": 0.0001, "lr": 1.0156701599990562e-06, "epoch": 3.6616399622997173, "percentage": 73.23, "elapsed_time": "0:19:57", "remaining_time": "0:07:17", "throughput": 7548.81, "total_tokens": 9041824} +{"current_steps": 3890, "total_steps": 5305, "loss": 0.0381, "lr": 1.0090592209155373e-06, "epoch": 3.6663524976437323, "percentage": 73.33, "elapsed_time": "0:19:58", "remaining_time": "0:07:15", "throughput": 7555.81, "total_tokens": 9054752} +{"current_steps": 3895, "total_steps": 5305, "loss": 0.002, "lr": 1.0024644229988484e-06, "epoch": 3.6710650329877472, "percentage": 73.42, "elapsed_time": "0:19:58", "remaining_time": "0:07:14", "throughput": 7561.02, "total_tokens": 9064928} +{"current_steps": 3900, "total_steps": 5305, "loss": 0.0001, "lr": 9.95885837645344e-07, "epoch": 3.6757775683317626, "percentage": 73.52, "elapsed_time": "0:19:59", "remaining_time": "0:07:12", "throughput": 7567.0, "total_tokens": 9076256} +{"current_steps": 3905, "total_steps": 5305, "loss": 0.0954, "lr": 9.893235360758565e-07, "epoch": 3.6804901036757776, "percentage": 73.61, "elapsed_time": "0:19:59", "remaining_time": "0:07:10", "throughput": 7572.35, "total_tokens": 9086624} +{"current_steps": 3910, "total_steps": 5305, "loss": 0.0001, "lr": 9.827775893349273e-07, "epoch": 3.6852026390197925, "percentage": 73.7, "elapsed_time": "0:20:00", "remaining_time": "0:07:08", "throughput": 7578.21, "total_tokens": 9097824} +{"current_steps": 3915, "total_steps": 5305, "loss": 0.0323, "lr": 9.762480682900374e-07, "epoch": 3.689915174363808, "percentage": 73.8, "elapsed_time": "0:20:01", "remaining_time": "0:07:06", "throughput": 7582.95, "total_tokens": 9107296} +{"current_steps": 3920, "total_steps": 5305, "loss": 0.0039, "lr": 9.697350436308428e-07, "epoch": 3.694627709707823, "percentage": 73.89, "elapsed_time": "0:20:01", "remaining_time": "0:07:04", "throughput": 7589.19, "total_tokens": 9119008} +{"current_steps": 3925, "total_steps": 5305, "loss": 0.0001, "lr": 9.63238585868405e-07, "epoch": 3.699340245051838, "percentage": 73.99, "elapsed_time": "0:20:02", "remaining_time": "0:07:02", "throughput": 7595.77, "total_tokens": 9131296} +{"current_steps": 3930, "total_steps": 5305, "loss": 0.0001, "lr": 9.567587653344295e-07, "epoch": 3.7040527803958527, "percentage": 74.08, "elapsed_time": "0:20:02", "remaining_time": "0:07:00", "throughput": 7601.05, "total_tokens": 9141664} +{"current_steps": 3935, "total_steps": 5305, "loss": 0.0001, "lr": 9.502956521805054e-07, "epoch": 3.708765315739868, "percentage": 74.18, "elapsed_time": "0:20:03", "remaining_time": "0:06:58", "throughput": 7605.89, "total_tokens": 9151328} +{"current_steps": 3940, "total_steps": 5305, "loss": 0.0002, "lr": 9.438493163773433e-07, "epoch": 3.713477851083883, "percentage": 74.27, "elapsed_time": "0:20:03", "remaining_time": "0:06:57", "throughput": 7612.82, "total_tokens": 9164192} +{"current_steps": 3945, "total_steps": 5305, "loss": 0.0003, "lr": 9.374198277140237e-07, "epoch": 3.7181903864278985, "percentage": 74.36, "elapsed_time": "0:20:04", "remaining_time": "0:06:55", "throughput": 7619.38, "total_tokens": 9176544} +{"current_steps": 3950, "total_steps": 5305, "loss": 0.0162, "lr": 9.310072557972305e-07, "epoch": 3.7229029217719134, "percentage": 74.46, "elapsed_time": "0:20:04", "remaining_time": "0:06:53", "throughput": 7625.72, "total_tokens": 9188512} +{"current_steps": 3955, "total_steps": 5305, "loss": 0.0001, "lr": 9.246116700505109e-07, "epoch": 3.7276154571159283, "percentage": 74.55, "elapsed_time": "0:20:05", "remaining_time": "0:06:51", "throughput": 7632.36, "total_tokens": 9200992} +{"current_steps": 3960, "total_steps": 5305, "loss": 0.0001, "lr": 9.18233139713513e-07, "epoch": 3.7323279924599433, "percentage": 74.65, "elapsed_time": "0:20:06", "remaining_time": "0:06:49", "throughput": 7637.48, "total_tokens": 9211168} +{"current_steps": 3965, "total_steps": 5305, "loss": 0.0, "lr": 9.118717338412414e-07, "epoch": 3.7370405278039587, "percentage": 74.74, "elapsed_time": "0:20:06", "remaining_time": "0:06:47", "throughput": 7644.0, "total_tokens": 9223456} +{"current_steps": 3970, "total_steps": 5305, "loss": 0.0002, "lr": 9.055275213033077e-07, "epoch": 3.7417530631479736, "percentage": 74.84, "elapsed_time": "0:20:07", "remaining_time": "0:06:45", "throughput": 7649.12, "total_tokens": 9233632} +{"current_steps": 3975, "total_steps": 5305, "loss": 0.0751, "lr": 8.992005707831877e-07, "epoch": 3.7464655984919886, "percentage": 74.93, "elapsed_time": "0:20:07", "remaining_time": "0:06:44", "throughput": 7653.89, "total_tokens": 9243296} +{"current_steps": 3980, "total_steps": 5305, "loss": 0.0002, "lr": 8.928909507774741e-07, "epoch": 3.751178133836004, "percentage": 75.02, "elapsed_time": "0:20:08", "remaining_time": "0:06:42", "throughput": 7662.84, "total_tokens": 9259424} +{"current_steps": 3985, "total_steps": 5305, "loss": 0.0, "lr": 8.86598729595137e-07, "epoch": 3.755890669180019, "percentage": 75.12, "elapsed_time": "0:20:08", "remaining_time": "0:06:40", "throughput": 7669.39, "total_tokens": 9271840} +{"current_steps": 3990, "total_steps": 5305, "loss": 0.0, "lr": 8.80323975356783e-07, "epoch": 3.760603204524034, "percentage": 75.21, "elapsed_time": "0:20:09", "remaining_time": "0:06:38", "throughput": 7677.24, "total_tokens": 9286304} +{"current_steps": 3990, "total_steps": 5305, "eval_loss": 0.5201095938682556, "epoch": 3.760603204524034, "percentage": 75.21, "elapsed_time": "0:20:12", "remaining_time": "0:06:39", "throughput": 7659.55, "total_tokens": 9286304} +{"current_steps": 3995, "total_steps": 5305, "loss": 0.0004, "lr": 8.740667559939217e-07, "epoch": 3.765315739868049, "percentage": 75.31, "elapsed_time": "0:20:36", "remaining_time": "0:06:45", "throughput": 7516.18, "total_tokens": 9297056} +{"current_steps": 4000, "total_steps": 5305, "loss": 0.0, "lr": 8.678271392482243e-07, "epoch": 3.770028275212064, "percentage": 75.4, "elapsed_time": "0:20:37", "remaining_time": "0:06:43", "throughput": 7521.66, "total_tokens": 9307872} +{"current_steps": 4005, "total_steps": 5305, "loss": 0.0, "lr": 8.616051926707941e-07, "epoch": 3.774740810556079, "percentage": 75.49, "elapsed_time": "0:20:38", "remaining_time": "0:06:41", "throughput": 7527.22, "total_tokens": 9318816} +{"current_steps": 4010, "total_steps": 5305, "loss": 0.0308, "lr": 8.554009836214345e-07, "epoch": 3.7794533459000945, "percentage": 75.59, "elapsed_time": "0:20:38", "remaining_time": "0:06:39", "throughput": 7533.74, "total_tokens": 9331232} +{"current_steps": 4015, "total_steps": 5305, "loss": 0.0657, "lr": 8.49214579267921e-07, "epoch": 3.7841658812441095, "percentage": 75.68, "elapsed_time": "0:20:39", "remaining_time": "0:06:38", "throughput": 7539.17, "total_tokens": 9342112} +{"current_steps": 4020, "total_steps": 5305, "loss": 0.0, "lr": 8.430460465852683e-07, "epoch": 3.7888784165881244, "percentage": 75.78, "elapsed_time": "0:20:39", "remaining_time": "0:06:36", "throughput": 7546.5, "total_tokens": 9355872} +{"current_steps": 4025, "total_steps": 5305, "loss": 0.0, "lr": 8.368954523550146e-07, "epoch": 3.7935909519321394, "percentage": 75.87, "elapsed_time": "0:20:40", "remaining_time": "0:06:34", "throughput": 7552.15, "total_tokens": 9367008} +{"current_steps": 4030, "total_steps": 5305, "loss": 0.0001, "lr": 8.307628631644904e-07, "epoch": 3.7983034872761543, "percentage": 75.97, "elapsed_time": "0:20:40", "remaining_time": "0:06:32", "throughput": 7559.01, "total_tokens": 9380000} +{"current_steps": 4035, "total_steps": 5305, "loss": 0.0, "lr": 8.246483454061016e-07, "epoch": 3.8030160226201697, "percentage": 76.06, "elapsed_time": "0:20:41", "remaining_time": "0:06:30", "throughput": 7564.19, "total_tokens": 9390368} +{"current_steps": 4040, "total_steps": 5305, "loss": 0.0829, "lr": 8.185519652766091e-07, "epoch": 3.8077285579641846, "percentage": 76.15, "elapsed_time": "0:20:41", "remaining_time": "0:06:28", "throughput": 7570.09, "total_tokens": 9401952} +{"current_steps": 4045, "total_steps": 5305, "loss": 0.0, "lr": 8.124737887764148e-07, "epoch": 3.8124410933082, "percentage": 76.25, "elapsed_time": "0:20:42", "remaining_time": "0:06:27", "throughput": 7576.02, "total_tokens": 9413536} +{"current_steps": 4050, "total_steps": 5305, "loss": 0.09, "lr": 8.064138817088429e-07, "epoch": 3.817153628652215, "percentage": 76.34, "elapsed_time": "0:20:43", "remaining_time": "0:06:25", "throughput": 7581.77, "total_tokens": 9424864} +{"current_steps": 4055, "total_steps": 5305, "loss": 0.0, "lr": 8.003723096794314e-07, "epoch": 3.82186616399623, "percentage": 76.44, "elapsed_time": "0:20:43", "remaining_time": "0:06:23", "throughput": 7588.12, "total_tokens": 9437152} +{"current_steps": 4060, "total_steps": 5305, "loss": 0.0001, "lr": 7.94349138095219e-07, "epoch": 3.826578699340245, "percentage": 76.53, "elapsed_time": "0:20:44", "remaining_time": "0:06:21", "throughput": 7593.61, "total_tokens": 9448032} +{"current_steps": 4065, "total_steps": 5305, "loss": 0.0001, "lr": 7.883444321640383e-07, "epoch": 3.8312912346842602, "percentage": 76.63, "elapsed_time": "0:20:44", "remaining_time": "0:06:19", "throughput": 7599.38, "total_tokens": 9459424} +{"current_steps": 4070, "total_steps": 5305, "loss": 0.0001, "lr": 7.82358256893812e-07, "epoch": 3.836003770028275, "percentage": 76.72, "elapsed_time": "0:20:45", "remaining_time": "0:06:17", "throughput": 7604.33, "total_tokens": 9469536} +{"current_steps": 4075, "total_steps": 5305, "loss": 0.0, "lr": 7.763906770918428e-07, "epoch": 3.84071630537229, "percentage": 76.81, "elapsed_time": "0:20:45", "remaining_time": "0:06:16", "throughput": 7611.36, "total_tokens": 9482976} +{"current_steps": 4080, "total_steps": 5305, "loss": 0.0001, "lr": 7.704417573641196e-07, "epoch": 3.8454288407163055, "percentage": 76.91, "elapsed_time": "0:20:46", "remaining_time": "0:06:14", "throughput": 7616.08, "total_tokens": 9492704} +{"current_steps": 4085, "total_steps": 5305, "loss": 0.0, "lr": 7.645115621146116e-07, "epoch": 3.8501413760603205, "percentage": 77.0, "elapsed_time": "0:20:46", "remaining_time": "0:06:12", "throughput": 7622.31, "total_tokens": 9504864} +{"current_steps": 4090, "total_steps": 5305, "loss": 0.1079, "lr": 7.586001555445773e-07, "epoch": 3.8548539114043354, "percentage": 77.1, "elapsed_time": "0:20:47", "remaining_time": "0:06:10", "throughput": 7627.54, "total_tokens": 9515424} +{"current_steps": 4095, "total_steps": 5305, "loss": 0.0001, "lr": 7.527076016518603e-07, "epoch": 3.8595664467483504, "percentage": 77.19, "elapsed_time": "0:20:48", "remaining_time": "0:06:08", "throughput": 7632.67, "total_tokens": 9525792} +{"current_steps": 4100, "total_steps": 5305, "loss": 0.0001, "lr": 7.468339642302077e-07, "epoch": 3.8642789820923658, "percentage": 77.29, "elapsed_time": "0:20:48", "remaining_time": "0:06:06", "throughput": 7637.96, "total_tokens": 9536416} +{"current_steps": 4105, "total_steps": 5305, "loss": 0.0722, "lr": 7.409793068685709e-07, "epoch": 3.8689915174363807, "percentage": 77.38, "elapsed_time": "0:20:49", "remaining_time": "0:06:05", "throughput": 7645.63, "total_tokens": 9550880} +{"current_steps": 4110, "total_steps": 5305, "loss": 0.0, "lr": 7.351436929504203e-07, "epoch": 3.873704052780396, "percentage": 77.47, "elapsed_time": "0:20:49", "remaining_time": "0:06:03", "throughput": 7652.94, "total_tokens": 9564768} +{"current_steps": 4115, "total_steps": 5305, "loss": 0.0001, "lr": 7.293271856530585e-07, "epoch": 3.878416588124411, "percentage": 77.57, "elapsed_time": "0:20:50", "remaining_time": "0:06:01", "throughput": 7658.45, "total_tokens": 9575776} +{"current_steps": 4120, "total_steps": 5305, "loss": 0.0323, "lr": 7.235298479469391e-07, "epoch": 3.883129123468426, "percentage": 77.66, "elapsed_time": "0:20:50", "remaining_time": "0:05:59", "throughput": 7664.81, "total_tokens": 9588192} +{"current_steps": 4125, "total_steps": 5305, "loss": 0.0, "lr": 7.177517425949801e-07, "epoch": 3.887841658812441, "percentage": 77.76, "elapsed_time": "0:20:51", "remaining_time": "0:05:57", "throughput": 7669.81, "total_tokens": 9598432} +{"current_steps": 4130, "total_steps": 5305, "loss": 0.0001, "lr": 7.119929321518876e-07, "epoch": 3.8925541941564563, "percentage": 77.85, "elapsed_time": "0:20:52", "remaining_time": "0:05:56", "throughput": 7678.02, "total_tokens": 9613920} +{"current_steps": 4135, "total_steps": 5305, "loss": 0.0001, "lr": 7.062534789634772e-07, "epoch": 3.8972667295004713, "percentage": 77.95, "elapsed_time": "0:20:52", "remaining_time": "0:05:54", "throughput": 7683.41, "total_tokens": 9624864} +{"current_steps": 4140, "total_steps": 5305, "loss": 0.0004, "lr": 7.005334451660034e-07, "epoch": 3.901979264844486, "percentage": 78.04, "elapsed_time": "0:20:53", "remaining_time": "0:05:52", "throughput": 7688.47, "total_tokens": 9635232} +{"current_steps": 4145, "total_steps": 5305, "loss": 0.0, "lr": 6.948328926854767e-07, "epoch": 3.9066918001885016, "percentage": 78.13, "elapsed_time": "0:20:53", "remaining_time": "0:05:50", "throughput": 7695.38, "total_tokens": 9648544} +{"current_steps": 4150, "total_steps": 5305, "loss": 0.0074, "lr": 6.891518832370059e-07, "epoch": 3.9114043355325165, "percentage": 78.23, "elapsed_time": "0:20:54", "remaining_time": "0:05:49", "throughput": 7700.75, "total_tokens": 9659424} +{"current_steps": 4155, "total_steps": 5305, "loss": 0.0, "lr": 6.834904783241198e-07, "epoch": 3.9161168708765315, "percentage": 78.32, "elapsed_time": "0:20:54", "remaining_time": "0:05:47", "throughput": 7705.88, "total_tokens": 9669920} +{"current_steps": 4160, "total_steps": 5305, "loss": 0.0002, "lr": 6.778487392381089e-07, "epoch": 3.9208294062205464, "percentage": 78.42, "elapsed_time": "0:20:55", "remaining_time": "0:05:45", "throughput": 7711.61, "total_tokens": 9681376} +{"current_steps": 4165, "total_steps": 5305, "loss": 0.0, "lr": 6.722267270573529e-07, "epoch": 3.925541941564562, "percentage": 78.51, "elapsed_time": "0:20:55", "remaining_time": "0:05:43", "throughput": 7716.53, "total_tokens": 9691552} +{"current_steps": 4170, "total_steps": 5305, "loss": 0.0001, "lr": 6.666245026466708e-07, "epoch": 3.9302544769085768, "percentage": 78.61, "elapsed_time": "0:20:56", "remaining_time": "0:05:42", "throughput": 7723.09, "total_tokens": 9704288} +{"current_steps": 4175, "total_steps": 5305, "loss": 0.0595, "lr": 6.61042126656652e-07, "epoch": 3.934967012252592, "percentage": 78.7, "elapsed_time": "0:20:57", "remaining_time": "0:05:40", "throughput": 7727.69, "total_tokens": 9713952} +{"current_steps": 4180, "total_steps": 5305, "loss": 0.0642, "lr": 6.554796595230051e-07, "epoch": 3.939679547596607, "percentage": 78.79, "elapsed_time": "0:20:57", "remaining_time": "0:05:38", "throughput": 7732.9, "total_tokens": 9724576} +{"current_steps": 4185, "total_steps": 5305, "loss": 0.0002, "lr": 6.499371614659019e-07, "epoch": 3.944392082940622, "percentage": 78.89, "elapsed_time": "0:20:58", "remaining_time": "0:05:36", "throughput": 7738.24, "total_tokens": 9735392} +{"current_steps": 4190, "total_steps": 5305, "loss": 0.0766, "lr": 6.444146924893252e-07, "epoch": 3.949104618284637, "percentage": 78.98, "elapsed_time": "0:20:58", "remaining_time": "0:05:34", "throughput": 7743.35, "total_tokens": 9745888} +{"current_steps": 4195, "total_steps": 5305, "loss": 0.111, "lr": 6.389123123804217e-07, "epoch": 3.9538171536286524, "percentage": 79.08, "elapsed_time": "0:20:59", "remaining_time": "0:05:33", "throughput": 7747.62, "total_tokens": 9755104} +{"current_steps": 4200, "total_steps": 5305, "loss": 0.0003, "lr": 6.334300807088509e-07, "epoch": 3.9585296889726673, "percentage": 79.17, "elapsed_time": "0:20:59", "remaining_time": "0:05:31", "throughput": 7753.59, "total_tokens": 9766944} +{"current_steps": 4205, "total_steps": 5305, "loss": 0.0782, "lr": 6.279680568261423e-07, "epoch": 3.9632422243166823, "percentage": 79.26, "elapsed_time": "0:21:00", "remaining_time": "0:05:29", "throughput": 7759.23, "total_tokens": 9778336} +{"current_steps": 4210, "total_steps": 5305, "loss": 0.0004, "lr": 6.225262998650525e-07, "epoch": 3.9679547596606977, "percentage": 79.36, "elapsed_time": "0:21:00", "remaining_time": "0:05:27", "throughput": 7764.51, "total_tokens": 9789088} +{"current_steps": 4215, "total_steps": 5305, "loss": 0.0003, "lr": 6.171048687389273e-07, "epoch": 3.9726672950047126, "percentage": 79.45, "elapsed_time": "0:21:01", "remaining_time": "0:05:26", "throughput": 7769.44, "total_tokens": 9799392} +{"current_steps": 4220, "total_steps": 5305, "loss": 0.0, "lr": 6.117038221410568e-07, "epoch": 3.9773798303487276, "percentage": 79.55, "elapsed_time": "0:21:01", "remaining_time": "0:05:24", "throughput": 7775.43, "total_tokens": 9811360} +{"current_steps": 4225, "total_steps": 5305, "loss": 0.1016, "lr": 6.063232185440507e-07, "epoch": 3.9820923656927425, "percentage": 79.64, "elapsed_time": "0:21:02", "remaining_time": "0:05:22", "throughput": 7781.89, "total_tokens": 9824160} +{"current_steps": 4230, "total_steps": 5305, "loss": 0.0007, "lr": 6.009631161991958e-07, "epoch": 3.986804901036758, "percentage": 79.74, "elapsed_time": "0:21:02", "remaining_time": "0:05:20", "throughput": 7787.07, "total_tokens": 9834784} +{"current_steps": 4235, "total_steps": 5305, "loss": 0.0, "lr": 5.956235731358298e-07, "epoch": 3.991517436380773, "percentage": 79.83, "elapsed_time": "0:21:03", "remaining_time": "0:05:19", "throughput": 7792.54, "total_tokens": 9845920} +{"current_steps": 4240, "total_steps": 5305, "loss": 0.0, "lr": 5.903046471607121e-07, "epoch": 3.9962299717247878, "percentage": 79.92, "elapsed_time": "0:21:04", "remaining_time": "0:05:17", "throughput": 7798.71, "total_tokens": 9858208} +{"current_steps": 4245, "total_steps": 5305, "loss": 0.032, "lr": 5.850063958573993e-07, "epoch": 4.000942507068803, "percentage": 80.02, "elapsed_time": "0:21:04", "remaining_time": "0:05:15", "throughput": 7802.8, "total_tokens": 9868192} +{"current_steps": 4250, "total_steps": 5305, "loss": 0.0, "lr": 5.797288765856196e-07, "epoch": 4.005655042412818, "percentage": 80.11, "elapsed_time": "0:21:05", "remaining_time": "0:05:14", "throughput": 7810.28, "total_tokens": 9882784} +{"current_steps": 4255, "total_steps": 5305, "loss": 0.0001, "lr": 5.74472146480653e-07, "epoch": 4.010367577756833, "percentage": 80.21, "elapsed_time": "0:21:05", "remaining_time": "0:05:12", "throughput": 7814.81, "total_tokens": 9892448} +{"current_steps": 4256, "total_steps": 5305, "eval_loss": 0.5145591497421265, "epoch": 4.011310084825636, "percentage": 80.23, "elapsed_time": "0:21:08", "remaining_time": "0:05:12", "throughput": 7799.12, "total_tokens": 9894624} +{"current_steps": 4260, "total_steps": 5305, "loss": 0.0, "lr": 5.692362624527117e-07, "epoch": 4.015080113100848, "percentage": 80.3, "elapsed_time": "0:21:46", "remaining_time": "0:05:20", "throughput": 7581.51, "total_tokens": 9905376} +{"current_steps": 4265, "total_steps": 5305, "loss": 0.0, "lr": 5.640212811863277e-07, "epoch": 4.019792648444863, "percentage": 80.4, "elapsed_time": "0:21:47", "remaining_time": "0:05:18", "throughput": 7586.34, "total_tokens": 9915616} +{"current_steps": 4270, "total_steps": 5305, "loss": 0.0, "lr": 5.588272591397337e-07, "epoch": 4.024505183788879, "percentage": 80.49, "elapsed_time": "0:21:47", "remaining_time": "0:05:16", "throughput": 7592.6, "total_tokens": 9928288} +{"current_steps": 4275, "total_steps": 5305, "loss": 0.0001, "lr": 5.536542525442554e-07, "epoch": 4.029217719132894, "percentage": 80.58, "elapsed_time": "0:21:48", "remaining_time": "0:05:15", "throughput": 7597.81, "total_tokens": 9939232} +{"current_steps": 4280, "total_steps": 5305, "loss": 0.0, "lr": 5.485023174037005e-07, "epoch": 4.033930254476909, "percentage": 80.68, "elapsed_time": "0:21:48", "remaining_time": "0:05:13", "throughput": 7603.34, "total_tokens": 9950688} +{"current_steps": 4285, "total_steps": 5305, "loss": 0.0, "lr": 5.433715094937575e-07, "epoch": 4.038642789820924, "percentage": 80.77, "elapsed_time": "0:21:49", "remaining_time": "0:05:11", "throughput": 7608.7, "total_tokens": 9961824} +{"current_steps": 4290, "total_steps": 5305, "loss": 0.0, "lr": 5.382618843613827e-07, "epoch": 4.043355325164939, "percentage": 80.87, "elapsed_time": "0:21:49", "remaining_time": "0:05:09", "throughput": 7614.99, "total_tokens": 9974560} +{"current_steps": 4295, "total_steps": 5305, "loss": 0.0, "lr": 5.331734973242089e-07, "epoch": 4.0480678605089535, "percentage": 80.96, "elapsed_time": "0:21:50", "remaining_time": "0:05:08", "throughput": 7621.12, "total_tokens": 9987040} +{"current_steps": 4300, "total_steps": 5305, "loss": 0.0, "lr": 5.28106403469939e-07, "epoch": 4.0527803958529685, "percentage": 81.06, "elapsed_time": "0:21:51", "remaining_time": "0:05:06", "throughput": 7628.93, "total_tokens": 10002400} +{"current_steps": 4305, "total_steps": 5305, "loss": 0.0, "lr": 5.23060657655754e-07, "epoch": 4.057492931196984, "percentage": 81.15, "elapsed_time": "0:21:51", "remaining_time": "0:05:04", "throughput": 7633.6, "total_tokens": 10012448} +{"current_steps": 4310, "total_steps": 5305, "loss": 0.0001, "lr": 5.180363145077164e-07, "epoch": 4.062205466540999, "percentage": 81.24, "elapsed_time": "0:21:52", "remaining_time": "0:05:02", "throughput": 7638.79, "total_tokens": 10023392} +{"current_steps": 4315, "total_steps": 5305, "loss": 0.0002, "lr": 5.130334284201799e-07, "epoch": 4.066918001885014, "percentage": 81.34, "elapsed_time": "0:21:52", "remaining_time": "0:05:01", "throughput": 7644.1, "total_tokens": 10034528} +{"current_steps": 4320, "total_steps": 5305, "loss": 0.0, "lr": 5.080520535552028e-07, "epoch": 4.071630537229029, "percentage": 81.43, "elapsed_time": "0:21:53", "remaining_time": "0:04:59", "throughput": 7649.04, "total_tokens": 10045024} +{"current_steps": 4325, "total_steps": 5305, "loss": 0.0, "lr": 5.030922438419569e-07, "epoch": 4.076343072573044, "percentage": 81.53, "elapsed_time": "0:21:53", "remaining_time": "0:04:57", "throughput": 7653.82, "total_tokens": 10055328} +{"current_steps": 4330, "total_steps": 5305, "loss": 0.0, "lr": 4.981540529761473e-07, "epoch": 4.081055607917059, "percentage": 81.62, "elapsed_time": "0:21:54", "remaining_time": "0:04:55", "throughput": 7658.35, "total_tokens": 10065184} +{"current_steps": 4335, "total_steps": 5305, "loss": 0.0, "lr": 4.932375344194285e-07, "epoch": 4.085768143261075, "percentage": 81.72, "elapsed_time": "0:21:54", "remaining_time": "0:04:54", "throughput": 7664.11, "total_tokens": 10077088} +{"current_steps": 4340, "total_steps": 5305, "loss": 0.0, "lr": 4.88342741398831e-07, "epoch": 4.09048067860509, "percentage": 81.81, "elapsed_time": "0:21:55", "remaining_time": "0:04:52", "throughput": 7669.17, "total_tokens": 10087840} +{"current_steps": 4345, "total_steps": 5305, "loss": 0.0, "lr": 4.83469726906175e-07, "epoch": 4.095193213949105, "percentage": 81.9, "elapsed_time": "0:21:55", "remaining_time": "0:04:50", "throughput": 7674.26, "total_tokens": 10098656} +{"current_steps": 4350, "total_steps": 5305, "loss": 0.0, "lr": 4.786185436975085e-07, "epoch": 4.09990574929312, "percentage": 82.0, "elapsed_time": "0:21:56", "remaining_time": "0:04:49", "throughput": 7680.54, "total_tokens": 10111456} +{"current_steps": 4355, "total_steps": 5305, "loss": 0.0, "lr": 4.7378924429252735e-07, "epoch": 4.104618284637135, "percentage": 82.09, "elapsed_time": "0:21:57", "remaining_time": "0:04:47", "throughput": 7686.0, "total_tokens": 10122912} +{"current_steps": 4360, "total_steps": 5305, "loss": 0.0003, "lr": 4.689818809740118e-07, "epoch": 4.10933081998115, "percentage": 82.19, "elapsed_time": "0:21:57", "remaining_time": "0:04:45", "throughput": 7691.87, "total_tokens": 10135072} +{"current_steps": 4365, "total_steps": 5305, "loss": 0.0001, "lr": 4.641965057872552e-07, "epoch": 4.1140433553251645, "percentage": 82.28, "elapsed_time": "0:21:58", "remaining_time": "0:04:43", "throughput": 7696.87, "total_tokens": 10145760} +{"current_steps": 4370, "total_steps": 5305, "loss": 0.0001, "lr": 4.594331705395078e-07, "epoch": 4.11875589066918, "percentage": 82.38, "elapsed_time": "0:21:58", "remaining_time": "0:04:42", "throughput": 7701.61, "total_tokens": 10156000} +{"current_steps": 4375, "total_steps": 5305, "loss": 0.0, "lr": 4.5469192679940905e-07, "epoch": 4.123468426013195, "percentage": 82.47, "elapsed_time": "0:21:59", "remaining_time": "0:04:40", "throughput": 7707.81, "total_tokens": 10168736} +{"current_steps": 4380, "total_steps": 5305, "loss": 0.0, "lr": 4.4997282589643363e-07, "epoch": 4.12818096135721, "percentage": 82.56, "elapsed_time": "0:21:59", "remaining_time": "0:04:38", "throughput": 7713.99, "total_tokens": 10181408} +{"current_steps": 4385, "total_steps": 5305, "loss": 0.0, "lr": 4.4527591892033263e-07, "epoch": 4.132893496701225, "percentage": 82.66, "elapsed_time": "0:22:00", "remaining_time": "0:04:37", "throughput": 7718.85, "total_tokens": 10191904} +{"current_steps": 4390, "total_steps": 5305, "loss": 0.0, "lr": 4.406012567205847e-07, "epoch": 4.13760603204524, "percentage": 82.75, "elapsed_time": "0:22:00", "remaining_time": "0:04:35", "throughput": 7723.53, "total_tokens": 10202080} +{"current_steps": 4395, "total_steps": 5305, "loss": 0.0, "lr": 4.359488899058409e-07, "epoch": 4.142318567389255, "percentage": 82.85, "elapsed_time": "0:22:01", "remaining_time": "0:04:33", "throughput": 7728.09, "total_tokens": 10212064} +{"current_steps": 4400, "total_steps": 5305, "loss": 0.0, "lr": 4.313188688433792e-07, "epoch": 4.147031102733271, "percentage": 82.94, "elapsed_time": "0:22:01", "remaining_time": "0:04:31", "throughput": 7733.26, "total_tokens": 10223136} +{"current_steps": 4405, "total_steps": 5305, "loss": 0.0, "lr": 4.2671124365855853e-07, "epoch": 4.151743638077286, "percentage": 83.03, "elapsed_time": "0:22:02", "remaining_time": "0:04:30", "throughput": 7740.91, "total_tokens": 10238432} +{"current_steps": 4410, "total_steps": 5305, "loss": 0.0252, "lr": 4.2212606423427867e-07, "epoch": 4.156456173421301, "percentage": 83.13, "elapsed_time": "0:22:03", "remaining_time": "0:04:28", "throughput": 7746.82, "total_tokens": 10250784} +{"current_steps": 4415, "total_steps": 5305, "loss": 0.0, "lr": 4.175633802104337e-07, "epoch": 4.161168708765316, "percentage": 83.22, "elapsed_time": "0:22:03", "remaining_time": "0:04:26", "throughput": 7754.14, "total_tokens": 10265440} +{"current_steps": 4420, "total_steps": 5305, "loss": 0.0, "lr": 4.1302324098338315e-07, "epoch": 4.165881244109331, "percentage": 83.32, "elapsed_time": "0:22:04", "remaining_time": "0:04:25", "throughput": 7759.43, "total_tokens": 10276704} +{"current_steps": 4425, "total_steps": 5305, "loss": 0.0, "lr": 4.0850569570541036e-07, "epoch": 4.170593779453346, "percentage": 83.41, "elapsed_time": "0:22:04", "remaining_time": "0:04:23", "throughput": 7763.83, "total_tokens": 10286496} +{"current_steps": 4430, "total_steps": 5305, "loss": 0.0, "lr": 4.0401079328419384e-07, "epoch": 4.175306314797361, "percentage": 83.51, "elapsed_time": "0:22:05", "remaining_time": "0:04:21", "throughput": 7768.93, "total_tokens": 10297376} +{"current_steps": 4435, "total_steps": 5305, "loss": 0.0, "lr": 3.995385823822767e-07, "epoch": 4.180018850141376, "percentage": 83.6, "elapsed_time": "0:22:05", "remaining_time": "0:04:20", "throughput": 7773.21, "total_tokens": 10306976} +{"current_steps": 4440, "total_steps": 5305, "loss": 0.0, "lr": 3.9508911141653896e-07, "epoch": 4.184731385485391, "percentage": 83.69, "elapsed_time": "0:22:06", "remaining_time": "0:04:18", "throughput": 7778.88, "total_tokens": 10318880} +{"current_steps": 4445, "total_steps": 5305, "loss": 0.0001, "lr": 3.906624285576771e-07, "epoch": 4.189443920829406, "percentage": 83.79, "elapsed_time": "0:22:07", "remaining_time": "0:04:16", "throughput": 7784.5, "total_tokens": 10330784} +{"current_steps": 4450, "total_steps": 5305, "loss": 0.0, "lr": 3.862585817296771e-07, "epoch": 4.194156456173421, "percentage": 83.88, "elapsed_time": "0:22:07", "remaining_time": "0:04:15", "throughput": 7789.22, "total_tokens": 10341088} +{"current_steps": 4455, "total_steps": 5305, "loss": 0.0, "lr": 3.8187761860929956e-07, "epoch": 4.198868991517436, "percentage": 83.98, "elapsed_time": "0:22:08", "remaining_time": "0:04:13", "throughput": 7794.33, "total_tokens": 10352096} +{"current_steps": 4460, "total_steps": 5305, "loss": 0.0, "lr": 3.775195866255618e-07, "epoch": 4.203581526861451, "percentage": 84.07, "elapsed_time": "0:22:08", "remaining_time": "0:04:11", "throughput": 7800.24, "total_tokens": 10364448} +{"current_steps": 4465, "total_steps": 5305, "loss": 0.0, "lr": 3.731845329592268e-07, "epoch": 4.208294062205466, "percentage": 84.17, "elapsed_time": "0:22:09", "remaining_time": "0:04:10", "throughput": 7806.17, "total_tokens": 10376928} +{"current_steps": 4470, "total_steps": 5305, "loss": 0.0, "lr": 3.6887250454228666e-07, "epoch": 4.213006597549482, "percentage": 84.26, "elapsed_time": "0:22:09", "remaining_time": "0:04:08", "throughput": 7812.01, "total_tokens": 10389216} +{"current_steps": 4475, "total_steps": 5305, "loss": 0.0, "lr": 3.6458354805746304e-07, "epoch": 4.217719132893497, "percentage": 84.35, "elapsed_time": "0:22:10", "remaining_time": "0:04:06", "throughput": 7820.96, "total_tokens": 10406944} +{"current_steps": 4480, "total_steps": 5305, "loss": 0.0, "lr": 3.603177099376931e-07, "epoch": 4.222431668237512, "percentage": 84.45, "elapsed_time": "0:22:11", "remaining_time": "0:04:05", "throughput": 7825.9, "total_tokens": 10417760} +{"current_steps": 4485, "total_steps": 5305, "loss": 0.0, "lr": 3.5607503636563484e-07, "epoch": 4.227144203581527, "percentage": 84.54, "elapsed_time": "0:22:11", "remaining_time": "0:04:03", "throughput": 7831.24, "total_tokens": 10429216} +{"current_steps": 4490, "total_steps": 5305, "loss": 0.0, "lr": 3.5185557327315797e-07, "epoch": 4.231856738925542, "percentage": 84.64, "elapsed_time": "0:22:12", "remaining_time": "0:04:01", "throughput": 7837.81, "total_tokens": 10442784} +{"current_steps": 4495, "total_steps": 5305, "loss": 0.0, "lr": 3.47659366340857e-07, "epoch": 4.236569274269557, "percentage": 84.73, "elapsed_time": "0:22:12", "remaining_time": "0:04:00", "throughput": 7843.26, "total_tokens": 10454496} +{"current_steps": 4500, "total_steps": 5305, "loss": 0.0, "lr": 3.43486460997548e-07, "epoch": 4.2412818096135725, "percentage": 84.83, "elapsed_time": "0:22:13", "remaining_time": "0:03:58", "throughput": 7848.93, "total_tokens": 10466464} +{"current_steps": 4505, "total_steps": 5305, "loss": 0.0, "lr": 3.393369024197826e-07, "epoch": 4.245994344957587, "percentage": 84.92, "elapsed_time": "0:22:14", "remaining_time": "0:03:56", "throughput": 7853.57, "total_tokens": 10476768} +{"current_steps": 4510, "total_steps": 5305, "loss": 0.0, "lr": 3.352107355313536e-07, "epoch": 4.250706880301602, "percentage": 85.01, "elapsed_time": "0:22:14", "remaining_time": "0:03:55", "throughput": 7858.37, "total_tokens": 10487392} +{"current_steps": 4515, "total_steps": 5305, "loss": 0.0, "lr": 3.311080050028148e-07, "epoch": 4.255419415645617, "percentage": 85.11, "elapsed_time": "0:22:15", "remaining_time": "0:03:53", "throughput": 7863.26, "total_tokens": 10498144} +{"current_steps": 4520, "total_steps": 5305, "loss": 0.0782, "lr": 3.2702875525099235e-07, "epoch": 4.260131950989632, "percentage": 85.2, "elapsed_time": "0:22:15", "remaining_time": "0:03:51", "throughput": 7867.51, "total_tokens": 10507808} +{"current_steps": 4522, "total_steps": 5305, "eval_loss": 0.5548250675201416, "epoch": 4.262016965127239, "percentage": 85.24, "elapsed_time": "0:22:18", "remaining_time": "0:03:51", "throughput": 7853.49, "total_tokens": 10512416} +{"current_steps": 4525, "total_steps": 5305, "loss": 0.0, "lr": 3.2297303043850564e-07, "epoch": 4.264844486333647, "percentage": 85.3, "elapsed_time": "0:22:45", "remaining_time": "0:03:55", "throughput": 7700.29, "total_tokens": 10517408} +{"current_steps": 4530, "total_steps": 5305, "loss": 0.0, "lr": 3.189408744732897e-07, "epoch": 4.269557021677663, "percentage": 85.39, "elapsed_time": "0:22:46", "remaining_time": "0:03:53", "throughput": 7705.3, "total_tokens": 10528416} +{"current_steps": 4535, "total_steps": 5305, "loss": 0.0, "lr": 3.149323310081201e-07, "epoch": 4.274269557021678, "percentage": 85.49, "elapsed_time": "0:22:46", "remaining_time": "0:03:52", "throughput": 7711.3, "total_tokens": 10541216} +{"current_steps": 4540, "total_steps": 5305, "loss": 0.0, "lr": 3.1094744344013855e-07, "epoch": 4.278982092365693, "percentage": 85.58, "elapsed_time": "0:22:47", "remaining_time": "0:03:50", "throughput": 7717.32, "total_tokens": 10554016} +{"current_steps": 4545, "total_steps": 5305, "loss": 0.0, "lr": 3.069862549103841e-07, "epoch": 4.283694627709708, "percentage": 85.67, "elapsed_time": "0:22:48", "remaining_time": "0:03:48", "throughput": 7721.44, "total_tokens": 10563552} +{"current_steps": 4550, "total_steps": 5305, "loss": 0.0, "lr": 3.030488083033273e-07, "epoch": 4.288407163053723, "percentage": 85.77, "elapsed_time": "0:22:48", "remaining_time": "0:03:47", "throughput": 7727.44, "total_tokens": 10576288} +{"current_steps": 4555, "total_steps": 5305, "loss": 0.0, "lr": 2.991351462464037e-07, "epoch": 4.293119698397738, "percentage": 85.86, "elapsed_time": "0:22:49", "remaining_time": "0:03:45", "throughput": 7732.14, "total_tokens": 10586784} +{"current_steps": 4560, "total_steps": 5305, "loss": 0.0, "lr": 2.9524531110955406e-07, "epoch": 4.297832233741753, "percentage": 85.96, "elapsed_time": "0:22:49", "remaining_time": "0:03:43", "throughput": 7737.14, "total_tokens": 10597792} +{"current_steps": 4565, "total_steps": 5305, "loss": 0.0, "lr": 2.913793450047639e-07, "epoch": 4.3025447690857686, "percentage": 86.05, "elapsed_time": "0:22:50", "remaining_time": "0:03:42", "throughput": 7743.27, "total_tokens": 10610720} +{"current_steps": 4570, "total_steps": 5305, "loss": 0.0, "lr": 2.875372897856113e-07, "epoch": 4.3072573044297835, "percentage": 86.15, "elapsed_time": "0:22:50", "remaining_time": "0:03:40", "throughput": 7748.52, "total_tokens": 10622176} +{"current_steps": 4575, "total_steps": 5305, "loss": 0.0, "lr": 2.837191870468084e-07, "epoch": 4.311969839773798, "percentage": 86.24, "elapsed_time": "0:22:51", "remaining_time": "0:03:38", "throughput": 7753.31, "total_tokens": 10632864} +{"current_steps": 4580, "total_steps": 5305, "loss": 0.0039, "lr": 2.7992507812375557e-07, "epoch": 4.316682375117813, "percentage": 86.33, "elapsed_time": "0:22:51", "remaining_time": "0:03:37", "throughput": 7757.65, "total_tokens": 10642784} +{"current_steps": 4585, "total_steps": 5305, "loss": 0.0153, "lr": 2.76155004092091e-07, "epoch": 4.321394910461828, "percentage": 86.43, "elapsed_time": "0:22:52", "remaining_time": "0:03:35", "throughput": 7762.1, "total_tokens": 10652896} +{"current_steps": 4590, "total_steps": 5305, "loss": 0.1078, "lr": 2.7240900576724904e-07, "epoch": 4.326107445805843, "percentage": 86.52, "elapsed_time": "0:22:52", "remaining_time": "0:03:33", "throughput": 7767.85, "total_tokens": 10665248} +{"current_steps": 4595, "total_steps": 5305, "loss": 0.0001, "lr": 2.686871237040151e-07, "epoch": 4.330819981149858, "percentage": 86.62, "elapsed_time": "0:22:53", "remaining_time": "0:03:32", "throughput": 7772.88, "total_tokens": 10676384} +{"current_steps": 4600, "total_steps": 5305, "loss": 0.0, "lr": 2.6498939819608827e-07, "epoch": 4.335532516493874, "percentage": 86.71, "elapsed_time": "0:22:54", "remaining_time": "0:03:30", "throughput": 7778.41, "total_tokens": 10688352} +{"current_steps": 4605, "total_steps": 5305, "loss": 0.0, "lr": 2.613158692756443e-07, "epoch": 4.340245051837889, "percentage": 86.8, "elapsed_time": "0:22:54", "remaining_time": "0:03:28", "throughput": 7782.63, "total_tokens": 10698080} +{"current_steps": 4610, "total_steps": 5305, "loss": 0.0, "lr": 2.576665767129055e-07, "epoch": 4.344957587181904, "percentage": 86.9, "elapsed_time": "0:22:55", "remaining_time": "0:03:27", "throughput": 7788.52, "total_tokens": 10710816} +{"current_steps": 4615, "total_steps": 5305, "loss": 0.0, "lr": 2.5404156001570257e-07, "epoch": 4.349670122525919, "percentage": 86.99, "elapsed_time": "0:22:55", "remaining_time": "0:03:25", "throughput": 7793.88, "total_tokens": 10722592} +{"current_steps": 4620, "total_steps": 5305, "loss": 0.0, "lr": 2.5044085842905686e-07, "epoch": 4.354382657869934, "percentage": 87.09, "elapsed_time": "0:22:56", "remaining_time": "0:03:24", "throughput": 7799.44, "total_tokens": 10734752} +{"current_steps": 4625, "total_steps": 5305, "loss": 0.0001, "lr": 2.4686451093474673e-07, "epoch": 4.359095193213949, "percentage": 87.18, "elapsed_time": "0:22:56", "remaining_time": "0:03:22", "throughput": 7804.77, "total_tokens": 10746464} +{"current_steps": 4630, "total_steps": 5305, "loss": 0.0, "lr": 2.433125562508917e-07, "epoch": 4.363807728557964, "percentage": 87.28, "elapsed_time": "0:22:57", "remaining_time": "0:03:20", "throughput": 7809.7, "total_tokens": 10757472} +{"current_steps": 4635, "total_steps": 5305, "loss": 0.1078, "lr": 2.3978503283152847e-07, "epoch": 4.36852026390198, "percentage": 87.37, "elapsed_time": "0:22:58", "remaining_time": "0:03:19", "throughput": 7814.96, "total_tokens": 10769056} +{"current_steps": 4640, "total_steps": 5305, "loss": 0.0, "lr": 2.3628197886619852e-07, "epoch": 4.3732327992459945, "percentage": 87.46, "elapsed_time": "0:22:58", "remaining_time": "0:03:17", "throughput": 7820.06, "total_tokens": 10780384} +{"current_steps": 4645, "total_steps": 5305, "loss": 0.0, "lr": 2.3280343227953305e-07, "epoch": 4.3779453345900095, "percentage": 87.56, "elapsed_time": "0:22:59", "remaining_time": "0:03:15", "throughput": 7825.81, "total_tokens": 10792928} +{"current_steps": 4650, "total_steps": 5305, "loss": 0.0, "lr": 2.293494307308411e-07, "epoch": 4.382657869934024, "percentage": 87.65, "elapsed_time": "0:22:59", "remaining_time": "0:03:14", "throughput": 7830.65, "total_tokens": 10803808} +{"current_steps": 4655, "total_steps": 5305, "loss": 0.0, "lr": 2.2592001161370392e-07, "epoch": 4.387370405278039, "percentage": 87.75, "elapsed_time": "0:23:00", "remaining_time": "0:03:12", "throughput": 7835.39, "total_tokens": 10814496} +{"current_steps": 4660, "total_steps": 5305, "loss": 0.0, "lr": 2.2251521205557042e-07, "epoch": 4.392082940622054, "percentage": 87.84, "elapsed_time": "0:23:00", "remaining_time": "0:03:11", "throughput": 7841.25, "total_tokens": 10827168} +{"current_steps": 4665, "total_steps": 5305, "loss": 0.0, "lr": 2.1913506891735242e-07, "epoch": 4.39679547596607, "percentage": 87.94, "elapsed_time": "0:23:01", "remaining_time": "0:03:09", "throughput": 7846.85, "total_tokens": 10839392} +{"current_steps": 4670, "total_steps": 5305, "loss": 0.0, "lr": 2.1577961879302807e-07, "epoch": 4.401508011310085, "percentage": 88.03, "elapsed_time": "0:23:01", "remaining_time": "0:03:07", "throughput": 7852.5, "total_tokens": 10851744} +{"current_steps": 4675, "total_steps": 5305, "loss": 0.0, "lr": 2.124488980092454e-07, "epoch": 4.4062205466541, "percentage": 88.12, "elapsed_time": "0:23:02", "remaining_time": "0:03:06", "throughput": 7858.44, "total_tokens": 10864608} +{"current_steps": 4680, "total_steps": 5305, "loss": 0.0, "lr": 2.0914294262492723e-07, "epoch": 4.410933081998115, "percentage": 88.22, "elapsed_time": "0:23:03", "remaining_time": "0:03:04", "throughput": 7864.59, "total_tokens": 10877856} +{"current_steps": 4685, "total_steps": 5305, "loss": 0.0044, "lr": 2.0586178843088473e-07, "epoch": 4.41564561734213, "percentage": 88.31, "elapsed_time": "0:23:03", "remaining_time": "0:03:03", "throughput": 7871.04, "total_tokens": 10891616} +{"current_steps": 4690, "total_steps": 5305, "loss": 0.0, "lr": 2.026054709494235e-07, "epoch": 4.420358152686145, "percentage": 88.41, "elapsed_time": "0:23:04", "remaining_time": "0:03:01", "throughput": 7874.97, "total_tokens": 10901024} +{"current_steps": 4695, "total_steps": 5305, "loss": 0.0, "lr": 1.9937402543396683e-07, "epoch": 4.425070688030161, "percentage": 88.5, "elapsed_time": "0:23:04", "remaining_time": "0:02:59", "throughput": 7879.0, "total_tokens": 10910560} +{"current_steps": 4700, "total_steps": 5305, "loss": 0.0, "lr": 1.961674868686675e-07, "epoch": 4.429783223374176, "percentage": 88.6, "elapsed_time": "0:23:05", "remaining_time": "0:02:58", "throughput": 7884.01, "total_tokens": 10921824} +{"current_steps": 4705, "total_steps": 5305, "loss": 0.0, "lr": 1.929858899680323e-07, "epoch": 4.434495758718191, "percentage": 88.69, "elapsed_time": "0:23:05", "remaining_time": "0:02:56", "throughput": 7890.05, "total_tokens": 10934944} +{"current_steps": 4710, "total_steps": 5305, "loss": 0.0922, "lr": 1.8982926917654575e-07, "epoch": 4.4392082940622055, "percentage": 88.78, "elapsed_time": "0:23:06", "remaining_time": "0:02:55", "throughput": 7895.14, "total_tokens": 10946400} +{"current_steps": 4715, "total_steps": 5305, "loss": 0.0, "lr": 1.8669765866829724e-07, "epoch": 4.4439208294062205, "percentage": 88.88, "elapsed_time": "0:23:07", "remaining_time": "0:02:53", "throughput": 7900.41, "total_tokens": 10958112} +{"current_steps": 4720, "total_steps": 5305, "loss": 0.0, "lr": 1.835910923466097e-07, "epoch": 4.448633364750235, "percentage": 88.97, "elapsed_time": "0:23:07", "remaining_time": "0:02:51", "throughput": 7906.04, "total_tokens": 10970528} +{"current_steps": 4725, "total_steps": 5305, "loss": 0.0, "lr": 1.805096038436749e-07, "epoch": 4.45334590009425, "percentage": 89.07, "elapsed_time": "0:23:08", "remaining_time": "0:02:50", "throughput": 7911.19, "total_tokens": 10982048} +{"current_steps": 4730, "total_steps": 5305, "loss": 0.0, "lr": 1.774532265201867e-07, "epoch": 4.458058435438266, "percentage": 89.16, "elapsed_time": "0:23:08", "remaining_time": "0:02:48", "throughput": 7917.03, "total_tokens": 10994848} +{"current_steps": 4735, "total_steps": 5305, "loss": 0.0001, "lr": 1.7442199346498294e-07, "epoch": 4.462770970782281, "percentage": 89.26, "elapsed_time": "0:23:09", "remaining_time": "0:02:47", "throughput": 7921.36, "total_tokens": 11004896} +{"current_steps": 4740, "total_steps": 5305, "loss": 0.0, "lr": 1.7141593749468361e-07, "epoch": 4.467483506126296, "percentage": 89.35, "elapsed_time": "0:23:09", "remaining_time": "0:02:45", "throughput": 7926.85, "total_tokens": 11017056} +{"current_steps": 4745, "total_steps": 5305, "loss": 0.0, "lr": 1.6843509115333917e-07, "epoch": 4.472196041470311, "percentage": 89.44, "elapsed_time": "0:23:10", "remaining_time": "0:02:44", "throughput": 7931.04, "total_tokens": 11026912} +{"current_steps": 4750, "total_steps": 5305, "loss": 0.0, "lr": 1.6547948671207515e-07, "epoch": 4.476908576814326, "percentage": 89.54, "elapsed_time": "0:23:10", "remaining_time": "0:02:42", "throughput": 7936.04, "total_tokens": 11038176} +{"current_steps": 4755, "total_steps": 5305, "loss": 0.0, "lr": 1.6254915616874645e-07, "epoch": 4.481621112158341, "percentage": 89.63, "elapsed_time": "0:23:11", "remaining_time": "0:02:40", "throughput": 7939.99, "total_tokens": 11047648} +{"current_steps": 4760, "total_steps": 5305, "loss": 0.0441, "lr": 1.5964413124758492e-07, "epoch": 4.486333647502356, "percentage": 89.73, "elapsed_time": "0:23:11", "remaining_time": "0:02:39", "throughput": 7943.8, "total_tokens": 11056864} +{"current_steps": 4765, "total_steps": 5305, "loss": 0.0, "lr": 1.5676444339886327e-07, "epoch": 4.491046182846372, "percentage": 89.82, "elapsed_time": "0:23:12", "remaining_time": "0:02:37", "throughput": 7948.58, "total_tokens": 11067744} +{"current_steps": 4770, "total_steps": 5305, "loss": 0.0, "lr": 1.5391012379854937e-07, "epoch": 4.495758718190387, "percentage": 89.92, "elapsed_time": "0:23:12", "remaining_time": "0:02:36", "throughput": 7952.91, "total_tokens": 11077920} +{"current_steps": 4775, "total_steps": 5305, "loss": 0.0, "lr": 1.5108120334797e-07, "epoch": 4.500471253534402, "percentage": 90.01, "elapsed_time": "0:23:13", "remaining_time": "0:02:34", "throughput": 7957.67, "total_tokens": 11088864} +{"current_steps": 4780, "total_steps": 5305, "loss": 0.0, "lr": 1.4827771267347662e-07, "epoch": 4.5051837888784165, "percentage": 90.1, "elapsed_time": "0:23:13", "remaining_time": "0:02:33", "throughput": 7961.61, "total_tokens": 11098336} +{"current_steps": 4785, "total_steps": 5305, "loss": 0.0, "lr": 1.4549968212611538e-07, "epoch": 4.5098963242224315, "percentage": 90.2, "elapsed_time": "0:23:14", "remaining_time": "0:02:31", "throughput": 7965.47, "total_tokens": 11107680} +{"current_steps": 4788, "total_steps": 5305, "eval_loss": 0.5418137311935425, "epoch": 4.512723845428841, "percentage": 90.25, "elapsed_time": "0:23:17", "remaining_time": "0:02:30", "throughput": 7953.37, "total_tokens": 11115040} +{"current_steps": 4790, "total_steps": 5305, "loss": 0.0, "lr": 1.4274714178129534e-07, "epoch": 4.514608859566446, "percentage": 90.29, "elapsed_time": "0:24:00", "remaining_time": "0:02:34", "throughput": 7718.57, "total_tokens": 11120480} +{"current_steps": 4795, "total_steps": 5305, "loss": 0.0, "lr": 1.4002012143846472e-07, "epoch": 4.519321394910461, "percentage": 90.39, "elapsed_time": "0:24:01", "remaining_time": "0:02:33", "throughput": 7723.76, "total_tokens": 11132320} +{"current_steps": 4800, "total_steps": 5305, "loss": 0.0006, "lr": 1.3731865062078853e-07, "epoch": 4.524033930254477, "percentage": 90.48, "elapsed_time": "0:24:02", "remaining_time": "0:02:31", "throughput": 7731.54, "total_tokens": 11148960} +{"current_steps": 4805, "total_steps": 5305, "loss": 0.0, "lr": 1.3464275857482778e-07, "epoch": 4.528746465598492, "percentage": 90.57, "elapsed_time": "0:24:02", "remaining_time": "0:02:30", "throughput": 7736.27, "total_tokens": 11159968} +{"current_steps": 4810, "total_steps": 5305, "loss": 0.122, "lr": 1.3199247427022528e-07, "epoch": 4.533459000942507, "percentage": 90.67, "elapsed_time": "0:24:03", "remaining_time": "0:02:28", "throughput": 7740.91, "total_tokens": 11170848} +{"current_steps": 4815, "total_steps": 5305, "loss": 0.0, "lr": 1.293678263993872e-07, "epoch": 4.538171536286522, "percentage": 90.76, "elapsed_time": "0:24:03", "remaining_time": "0:02:26", "throughput": 7746.95, "total_tokens": 11184288} +{"current_steps": 4820, "total_steps": 5305, "loss": 0.0, "lr": 1.2676884337717882e-07, "epoch": 4.542884071630537, "percentage": 90.86, "elapsed_time": "0:24:04", "remaining_time": "0:02:25", "throughput": 7753.07, "total_tokens": 11197856} +{"current_steps": 4825, "total_steps": 5305, "loss": 0.0, "lr": 1.241955533406114e-07, "epoch": 4.547596606974552, "percentage": 90.95, "elapsed_time": "0:24:04", "remaining_time": "0:02:23", "throughput": 7758.24, "total_tokens": 11209696} +{"current_steps": 4830, "total_steps": 5305, "loss": 0.0, "lr": 1.2164798414854073e-07, "epoch": 4.552309142318568, "percentage": 91.05, "elapsed_time": "0:24:05", "remaining_time": "0:02:22", "throughput": 7762.6, "total_tokens": 11220064} +{"current_steps": 4835, "total_steps": 5305, "loss": 0.0, "lr": 1.1912616338136396e-07, "epoch": 4.557021677662583, "percentage": 91.14, "elapsed_time": "0:24:05", "remaining_time": "0:02:20", "throughput": 7766.9, "total_tokens": 11230304} +{"current_steps": 4840, "total_steps": 5305, "loss": 0.0, "lr": 1.1663011834072257e-07, "epoch": 4.561734213006598, "percentage": 91.23, "elapsed_time": "0:24:06", "remaining_time": "0:02:18", "throughput": 7770.92, "total_tokens": 11240096} +{"current_steps": 4845, "total_steps": 5305, "loss": 0.0, "lr": 1.1415987604920492e-07, "epoch": 4.566446748350613, "percentage": 91.33, "elapsed_time": "0:24:06", "remaining_time": "0:02:17", "throughput": 7775.63, "total_tokens": 11251104} +{"current_steps": 4850, "total_steps": 5305, "loss": 0.0, "lr": 1.11715463250055e-07, "epoch": 4.5711592836946275, "percentage": 91.42, "elapsed_time": "0:24:07", "remaining_time": "0:02:15", "throughput": 7779.76, "total_tokens": 11261088} +{"current_steps": 4855, "total_steps": 5305, "loss": 0.0072, "lr": 1.0929690640688218e-07, "epoch": 4.5758718190386425, "percentage": 91.52, "elapsed_time": "0:24:08", "remaining_time": "0:02:14", "throughput": 7785.1, "total_tokens": 11273312} +{"current_steps": 4860, "total_steps": 5305, "loss": 0.0003, "lr": 1.0690423170337554e-07, "epoch": 4.580584354382658, "percentage": 91.61, "elapsed_time": "0:24:08", "remaining_time": "0:02:12", "throughput": 7790.09, "total_tokens": 11284896} +{"current_steps": 4865, "total_steps": 5305, "loss": 0.0, "lr": 1.0453746504302003e-07, "epoch": 4.585296889726673, "percentage": 91.71, "elapsed_time": "0:24:09", "remaining_time": "0:02:11", "throughput": 7794.05, "total_tokens": 11294560} +{"current_steps": 4870, "total_steps": 5305, "loss": 0.0813, "lr": 1.021966320488152e-07, "epoch": 4.590009425070688, "percentage": 91.8, "elapsed_time": "0:24:09", "remaining_time": "0:02:09", "throughput": 7800.1, "total_tokens": 11308128} +{"current_steps": 4875, "total_steps": 5305, "loss": 0.0, "lr": 9.988175806299877e-08, "epoch": 4.594721960414703, "percentage": 91.89, "elapsed_time": "0:24:10", "remaining_time": "0:02:07", "throughput": 7805.81, "total_tokens": 11321056} +{"current_steps": 4880, "total_steps": 5305, "loss": 0.0, "lr": 9.759286814677305e-08, "epoch": 4.599434495758718, "percentage": 91.99, "elapsed_time": "0:24:10", "remaining_time": "0:02:06", "throughput": 7811.76, "total_tokens": 11334496} +{"current_steps": 4885, "total_steps": 5305, "loss": 0.0, "lr": 9.532998708003061e-08, "epoch": 4.604147031102733, "percentage": 92.08, "elapsed_time": "0:24:11", "remaining_time": "0:02:04", "throughput": 7816.76, "total_tokens": 11346208} +{"current_steps": 4890, "total_steps": 5305, "loss": 0.0, "lr": 9.309313936108983e-08, "epoch": 4.608859566446748, "percentage": 92.18, "elapsed_time": "0:24:12", "remaining_time": "0:02:03", "throughput": 7821.89, "total_tokens": 11358112} +{"current_steps": 4895, "total_steps": 5305, "loss": 0.0, "lr": 9.088234920642703e-08, "epoch": 4.613572101790764, "percentage": 92.27, "elapsed_time": "0:24:12", "remaining_time": "0:02:01", "throughput": 7826.0, "total_tokens": 11368096} +{"current_steps": 4900, "total_steps": 5305, "loss": 0.0, "lr": 8.869764055041501e-08, "epoch": 4.618284637134779, "percentage": 92.37, "elapsed_time": "0:24:13", "remaining_time": "0:02:00", "throughput": 7830.6, "total_tokens": 11378976} +{"current_steps": 4905, "total_steps": 5305, "loss": 0.0, "lr": 8.653903704506389e-08, "epoch": 4.622997172478794, "percentage": 92.46, "elapsed_time": "0:24:13", "remaining_time": "0:01:58", "throughput": 7835.61, "total_tokens": 11390688} +{"current_steps": 4910, "total_steps": 5305, "loss": 0.0, "lr": 8.440656205976644e-08, "epoch": 4.627709707822809, "percentage": 92.55, "elapsed_time": "0:24:14", "remaining_time": "0:01:56", "throughput": 7840.11, "total_tokens": 11401440} +{"current_steps": 4915, "total_steps": 5305, "loss": 0.0, "lr": 8.230023868104231e-08, "epoch": 4.632422243166824, "percentage": 92.65, "elapsed_time": "0:24:14", "remaining_time": "0:01:55", "throughput": 7844.73, "total_tokens": 11412448} +{"current_steps": 4920, "total_steps": 5305, "loss": 0.0, "lr": 8.022008971229039e-08, "epoch": 4.6371347785108386, "percentage": 92.74, "elapsed_time": "0:24:15", "remaining_time": "0:01:53", "throughput": 7848.85, "total_tokens": 11422496} +{"current_steps": 4925, "total_steps": 5305, "loss": 0.0, "lr": 7.816613767354098e-08, "epoch": 4.6418473138548535, "percentage": 92.84, "elapsed_time": "0:24:15", "remaining_time": "0:01:52", "throughput": 7853.53, "total_tokens": 11433632} +{"current_steps": 4930, "total_steps": 5305, "loss": 0.0, "lr": 7.613840480121176e-08, "epoch": 4.646559849198869, "percentage": 92.93, "elapsed_time": "0:24:16", "remaining_time": "0:01:50", "throughput": 7858.96, "total_tokens": 11446112} +{"current_steps": 4935, "total_steps": 5305, "loss": 0.0, "lr": 7.41369130478689e-08, "epoch": 4.651272384542884, "percentage": 93.03, "elapsed_time": "0:24:17", "remaining_time": "0:01:49", "throughput": 7864.84, "total_tokens": 11459552} +{"current_steps": 4940, "total_steps": 5305, "loss": 0.0, "lr": 7.216168408198554e-08, "epoch": 4.655984919886899, "percentage": 93.12, "elapsed_time": "0:24:17", "remaining_time": "0:01:47", "throughput": 7869.19, "total_tokens": 11469984} +{"current_steps": 4945, "total_steps": 5305, "loss": 0.0, "lr": 7.021273928771221e-08, "epoch": 4.660697455230914, "percentage": 93.21, "elapsed_time": "0:24:18", "remaining_time": "0:01:46", "throughput": 7874.28, "total_tokens": 11481888} +{"current_steps": 4950, "total_steps": 5305, "loss": 0.0579, "lr": 6.829009976464102e-08, "epoch": 4.665409990574929, "percentage": 93.31, "elapsed_time": "0:24:18", "remaining_time": "0:01:44", "throughput": 7879.99, "total_tokens": 11494944} +{"current_steps": 4955, "total_steps": 5305, "loss": 0.0, "lr": 6.639378632757986e-08, "epoch": 4.670122525918944, "percentage": 93.4, "elapsed_time": "0:24:19", "remaining_time": "0:01:43", "throughput": 7884.22, "total_tokens": 11505184} +{"current_steps": 4960, "total_steps": 5305, "loss": 0.0, "lr": 6.452381950632469e-08, "epoch": 4.674835061262959, "percentage": 93.5, "elapsed_time": "0:24:19", "remaining_time": "0:01:41", "throughput": 7889.71, "total_tokens": 11517856} +{"current_steps": 4965, "total_steps": 5305, "loss": 0.0, "lr": 6.268021954544095e-08, "epoch": 4.679547596606975, "percentage": 93.59, "elapsed_time": "0:24:20", "remaining_time": "0:01:40", "throughput": 7894.92, "total_tokens": 11530016} +{"current_steps": 4970, "total_steps": 5305, "loss": 0.0, "lr": 6.08630064040408e-08, "epoch": 4.68426013195099, "percentage": 93.69, "elapsed_time": "0:24:21", "remaining_time": "0:01:38", "throughput": 7901.82, "total_tokens": 11545376} +{"current_steps": 4975, "total_steps": 5305, "loss": 0.0, "lr": 5.9072199755567936e-08, "epoch": 4.688972667295005, "percentage": 93.78, "elapsed_time": "0:24:21", "remaining_time": "0:01:36", "throughput": 7906.43, "total_tokens": 11556448} +{"current_steps": 4980, "total_steps": 5305, "loss": 0.0, "lr": 5.730781898758614e-08, "epoch": 4.69368520263902, "percentage": 93.87, "elapsed_time": "0:24:22", "remaining_time": "0:01:35", "throughput": 7910.42, "total_tokens": 11566304} +{"current_steps": 4985, "total_steps": 5305, "loss": 0.0, "lr": 5.556988320156831e-08, "epoch": 4.698397737983035, "percentage": 93.97, "elapsed_time": "0:24:22", "remaining_time": "0:01:33", "throughput": 7914.89, "total_tokens": 11577056} +{"current_steps": 4990, "total_steps": 5305, "loss": 0.0, "lr": 5.3858411212689146e-08, "epoch": 4.7031102733270505, "percentage": 94.06, "elapsed_time": "0:24:23", "remaining_time": "0:01:32", "throughput": 7920.25, "total_tokens": 11589536} +{"current_steps": 4995, "total_steps": 5305, "loss": 0.0001, "lr": 5.2173421549621685e-08, "epoch": 4.707822808671065, "percentage": 94.16, "elapsed_time": "0:24:23", "remaining_time": "0:01:30", "throughput": 7924.38, "total_tokens": 11599648} +{"current_steps": 5000, "total_steps": 5305, "loss": 0.0, "lr": 5.051493245433775e-08, "epoch": 4.71253534401508, "percentage": 94.25, "elapsed_time": "0:24:24", "remaining_time": "0:01:29", "throughput": 7928.77, "total_tokens": 11610272} +{"current_steps": 5005, "total_steps": 5305, "loss": 0.0, "lr": 4.888296188190977e-08, "epoch": 4.717247879359095, "percentage": 94.34, "elapsed_time": "0:24:24", "remaining_time": "0:01:27", "throughput": 7933.1, "total_tokens": 11620768} +{"current_steps": 5010, "total_steps": 5305, "loss": 0.0, "lr": 4.727752750031511e-08, "epoch": 4.72196041470311, "percentage": 94.44, "elapsed_time": "0:24:25", "remaining_time": "0:01:26", "throughput": 7938.08, "total_tokens": 11632608} +{"current_steps": 5015, "total_steps": 5305, "loss": 0.0, "lr": 4.5698646690247874e-08, "epoch": 4.726672950047125, "percentage": 94.53, "elapsed_time": "0:24:26", "remaining_time": "0:01:24", "throughput": 7943.31, "total_tokens": 11644896} +{"current_steps": 5020, "total_steps": 5305, "loss": 0.0, "lr": 4.414633654492767e-08, "epoch": 4.73138548539114, "percentage": 94.63, "elapsed_time": "0:24:26", "remaining_time": "0:01:23", "throughput": 7950.77, "total_tokens": 11661344} +{"current_steps": 5025, "total_steps": 5305, "loss": 0.0, "lr": 4.2620613869915894e-08, "epoch": 4.736098020735156, "percentage": 94.72, "elapsed_time": "0:24:27", "remaining_time": "0:01:21", "throughput": 7955.29, "total_tokens": 11672288} +{"current_steps": 5030, "total_steps": 5305, "loss": 0.0, "lr": 4.112149518293362e-08, "epoch": 4.740810556079171, "percentage": 94.82, "elapsed_time": "0:24:27", "remaining_time": "0:01:20", "throughput": 7960.77, "total_tokens": 11684960} +{"current_steps": 5035, "total_steps": 5305, "loss": 0.0, "lr": 3.9648996713683715e-08, "epoch": 4.745523091423186, "percentage": 94.91, "elapsed_time": "0:24:28", "remaining_time": "0:01:18", "throughput": 7965.44, "total_tokens": 11696160} +{"current_steps": 5040, "total_steps": 5305, "loss": 0.0, "lr": 3.8203134403672905e-08, "epoch": 4.750235626767201, "percentage": 95.0, "elapsed_time": "0:24:28", "remaining_time": "0:01:17", "throughput": 7969.35, "total_tokens": 11705952} +{"current_steps": 5045, "total_steps": 5305, "loss": 0.0, "lr": 3.678392390604163e-08, "epoch": 4.754948162111216, "percentage": 95.1, "elapsed_time": "0:24:29", "remaining_time": "0:01:15", "throughput": 7973.51, "total_tokens": 11716192} +{"current_steps": 5050, "total_steps": 5305, "loss": 0.0, "lr": 3.539138058539282e-08, "epoch": 4.759660697455231, "percentage": 95.19, "elapsed_time": "0:24:29", "remaining_time": "0:01:14", "throughput": 7978.56, "total_tokens": 11728160} +{"current_steps": 5054, "total_steps": 5305, "eval_loss": 0.54215008020401, "epoch": 4.763430725730443, "percentage": 95.27, "elapsed_time": "0:24:33", "remaining_time": "0:01:13", "throughput": 7966.91, "total_tokens": 11736672} +{"current_steps": 5055, "total_steps": 5305, "loss": 0.0, "lr": 3.4025519517626174e-08, "epoch": 4.764373232799246, "percentage": 95.29, "elapsed_time": "0:24:56", "remaining_time": "0:01:13", "throughput": 7845.84, "total_tokens": 11738720} +{"current_steps": 5060, "total_steps": 5305, "loss": 0.0, "lr": 3.268635548977633e-08, "epoch": 4.7690857681432615, "percentage": 95.38, "elapsed_time": "0:24:56", "remaining_time": "0:01:12", "throughput": 7850.57, "total_tokens": 11750176} +{"current_steps": 5065, "total_steps": 5305, "loss": 0.0, "lr": 3.137390299984888e-08, "epoch": 4.773798303487276, "percentage": 95.48, "elapsed_time": "0:24:57", "remaining_time": "0:01:10", "throughput": 7855.17, "total_tokens": 11761312} +{"current_steps": 5070, "total_steps": 5305, "loss": 0.0, "lr": 3.0088176256668765e-08, "epoch": 4.778510838831291, "percentage": 95.57, "elapsed_time": "0:24:57", "remaining_time": "0:01:09", "throughput": 7860.39, "total_tokens": 11773728} +{"current_steps": 5075, "total_steps": 5305, "loss": 0.0, "lr": 2.8829189179721552e-08, "epoch": 4.783223374175306, "percentage": 95.66, "elapsed_time": "0:24:58", "remaining_time": "0:01:07", "throughput": 7864.87, "total_tokens": 11784672} +{"current_steps": 5080, "total_steps": 5305, "loss": 0.0, "lr": 2.759695539900603e-08, "epoch": 4.787935909519321, "percentage": 95.76, "elapsed_time": "0:24:58", "remaining_time": "0:01:06", "throughput": 7869.82, "total_tokens": 11796512} +{"current_steps": 5085, "total_steps": 5305, "loss": 0.0, "lr": 2.639148825488491e-08, "epoch": 4.792648444863336, "percentage": 95.85, "elapsed_time": "0:24:59", "remaining_time": "0:01:04", "throughput": 7875.84, "total_tokens": 11810464} +{"current_steps": 5090, "total_steps": 5305, "loss": 0.0, "lr": 2.5212800797941582e-08, "epoch": 4.797360980207351, "percentage": 95.95, "elapsed_time": "0:25:00", "remaining_time": "0:01:03", "throughput": 7879.94, "total_tokens": 11820768} +{"current_steps": 5095, "total_steps": 5305, "loss": 0.0, "lr": 2.406090578883691e-08, "epoch": 4.802073515551367, "percentage": 96.04, "elapsed_time": "0:25:00", "remaining_time": "0:01:01", "throughput": 7884.45, "total_tokens": 11831776} +{"current_steps": 5100, "total_steps": 5305, "loss": 0.0, "lr": 2.2935815698174045e-08, "epoch": 4.806786050895382, "percentage": 96.14, "elapsed_time": "0:25:01", "remaining_time": "0:01:00", "throughput": 7889.19, "total_tokens": 11843296} +{"current_steps": 5105, "total_steps": 5305, "loss": 0.0, "lr": 2.1837542706359958e-08, "epoch": 4.811498586239397, "percentage": 96.23, "elapsed_time": "0:25:01", "remaining_time": "0:00:58", "throughput": 7896.57, "total_tokens": 11860000} +{"current_steps": 5110, "total_steps": 5305, "loss": 0.0, "lr": 2.0766098703477178e-08, "epoch": 4.816211121583412, "percentage": 96.32, "elapsed_time": "0:25:02", "remaining_time": "0:00:57", "throughput": 7901.65, "total_tokens": 11872160} +{"current_steps": 5115, "total_steps": 5305, "loss": 0.0, "lr": 1.9721495289152237e-08, "epoch": 4.820923656927427, "percentage": 96.42, "elapsed_time": "0:25:03", "remaining_time": "0:00:55", "throughput": 7906.12, "total_tokens": 11883168} +{"current_steps": 5120, "total_steps": 5305, "loss": 0.0, "lr": 1.8703743772430783e-08, "epoch": 4.825636192271442, "percentage": 96.51, "elapsed_time": "0:25:03", "remaining_time": "0:00:54", "throughput": 7911.31, "total_tokens": 11895584} +{"current_steps": 5125, "total_steps": 5305, "loss": 0.0, "lr": 1.7712855171655996e-08, "epoch": 4.830348727615457, "percentage": 96.61, "elapsed_time": "0:25:04", "remaining_time": "0:00:52", "throughput": 7915.89, "total_tokens": 11906784} +{"current_steps": 5130, "total_steps": 5305, "loss": 0.0, "lr": 1.6748840214348972e-08, "epoch": 4.8350612629594725, "percentage": 96.7, "elapsed_time": "0:25:04", "remaining_time": "0:00:51", "throughput": 7920.26, "total_tokens": 11917600} +{"current_steps": 5135, "total_steps": 5305, "loss": 0.0, "lr": 1.5811709337091862e-08, "epoch": 4.839773798303487, "percentage": 96.8, "elapsed_time": "0:25:05", "remaining_time": "0:00:49", "throughput": 7925.25, "total_tokens": 11929632} +{"current_steps": 5140, "total_steps": 5305, "loss": 0.0, "lr": 1.4901472685415475e-08, "epoch": 4.844486333647502, "percentage": 96.89, "elapsed_time": "0:25:05", "remaining_time": "0:00:48", "throughput": 7928.71, "total_tokens": 11938720} +{"current_steps": 5145, "total_steps": 5305, "loss": 0.0072, "lr": 1.4018140113689904e-08, "epoch": 4.849198868991517, "percentage": 96.98, "elapsed_time": "0:25:06", "remaining_time": "0:00:46", "throughput": 7934.14, "total_tokens": 11951648} +{"current_steps": 5150, "total_steps": 5305, "loss": 0.0, "lr": 1.3161721185016852e-08, "epoch": 4.853911404335532, "percentage": 97.08, "elapsed_time": "0:25:06", "remaining_time": "0:00:45", "throughput": 7938.43, "total_tokens": 11962336} +{"current_steps": 5155, "total_steps": 5305, "loss": 0.0, "lr": 1.2332225171126366e-08, "epoch": 4.858623939679548, "percentage": 97.17, "elapsed_time": "0:25:07", "remaining_time": "0:00:43", "throughput": 7944.21, "total_tokens": 11975904} +{"current_steps": 5160, "total_steps": 5305, "loss": 0.0, "lr": 1.152966105227693e-08, "epoch": 4.863336475023563, "percentage": 97.27, "elapsed_time": "0:25:08", "remaining_time": "0:00:42", "throughput": 7948.3, "total_tokens": 11986208} +{"current_steps": 5165, "total_steps": 5305, "loss": 0.0, "lr": 1.0754037517158312e-08, "epoch": 4.868049010367578, "percentage": 97.36, "elapsed_time": "0:25:08", "remaining_time": "0:00:40", "throughput": 7953.92, "total_tokens": 11999520} +{"current_steps": 5170, "total_steps": 5305, "loss": 0.0, "lr": 1.0005362962796362e-08, "epoch": 4.872761545711593, "percentage": 97.46, "elapsed_time": "0:25:09", "remaining_time": "0:00:39", "throughput": 7958.8, "total_tokens": 12011424} +{"current_steps": 5175, "total_steps": 5305, "loss": 0.0, "lr": 9.283645494463368e-09, "epoch": 4.877474081055608, "percentage": 97.55, "elapsed_time": "0:25:09", "remaining_time": "0:00:37", "throughput": 7964.48, "total_tokens": 12024864} +{"current_steps": 5180, "total_steps": 5305, "loss": 0.0, "lr": 8.588892925590064e-09, "epoch": 4.882186616399623, "percentage": 97.64, "elapsed_time": "0:25:10", "remaining_time": "0:00:36", "throughput": 7968.97, "total_tokens": 12035936} +{"current_steps": 5185, "total_steps": 5305, "loss": 0.0, "lr": 7.92111277768015e-09, "epoch": 4.886899151743638, "percentage": 97.74, "elapsed_time": "0:25:10", "remaining_time": "0:00:34", "throughput": 7972.87, "total_tokens": 12045920} +{"current_steps": 5190, "total_steps": 5305, "loss": 0.0, "lr": 7.280312280230073e-09, "epoch": 4.891611687087654, "percentage": 97.83, "elapsed_time": "0:25:11", "remaining_time": "0:00:33", "throughput": 7977.9, "total_tokens": 12058144} +{"current_steps": 5195, "total_steps": 5305, "loss": 0.0, "lr": 6.666498370650198e-09, "epoch": 4.8963242224316685, "percentage": 97.93, "elapsed_time": "0:25:12", "remaining_time": "0:00:32", "throughput": 7982.66, "total_tokens": 12069792} +{"current_steps": 5200, "total_steps": 5305, "loss": 0.0, "lr": 6.079677694189046e-09, "epoch": 4.9010367577756835, "percentage": 98.02, "elapsed_time": "0:25:12", "remaining_time": "0:00:30", "throughput": 7987.12, "total_tokens": 12080864} +{"current_steps": 5205, "total_steps": 5305, "loss": 0.0, "lr": 5.5198566038627835e-09, "epoch": 4.905749293119698, "percentage": 98.11, "elapsed_time": "0:25:13", "remaining_time": "0:00:29", "throughput": 7991.73, "total_tokens": 12092256} +{"current_steps": 5210, "total_steps": 5305, "loss": 0.0, "lr": 4.987041160385287e-09, "epoch": 4.910461828463713, "percentage": 98.21, "elapsed_time": "0:25:13", "remaining_time": "0:00:27", "throughput": 7997.9, "total_tokens": 12106784} +{"current_steps": 5215, "total_steps": 5305, "loss": 0.0003, "lr": 4.481237132103189e-09, "epoch": 4.915174363807728, "percentage": 98.3, "elapsed_time": "0:25:14", "remaining_time": "0:00:26", "throughput": 8001.97, "total_tokens": 12117088} +{"current_steps": 5220, "total_steps": 5305, "loss": 0.0, "lr": 4.002449994932878e-09, "epoch": 4.919886899151743, "percentage": 98.4, "elapsed_time": "0:25:14", "remaining_time": "0:00:24", "throughput": 8006.71, "total_tokens": 12128736} +{"current_steps": 5225, "total_steps": 5305, "loss": 0.0, "lr": 3.550684932301374e-09, "epoch": 4.924599434495759, "percentage": 98.49, "elapsed_time": "0:25:15", "remaining_time": "0:00:23", "throughput": 8013.93, "total_tokens": 12145376} +{"current_steps": 5230, "total_steps": 5305, "loss": 0.0, "lr": 3.1259468350910982e-09, "epoch": 4.929311969839774, "percentage": 98.59, "elapsed_time": "0:25:16", "remaining_time": "0:00:21", "throughput": 8018.29, "total_tokens": 12156320} +{"current_steps": 5235, "total_steps": 5305, "loss": 0.0, "lr": 2.7282403015849167e-09, "epoch": 4.934024505183789, "percentage": 98.68, "elapsed_time": "0:25:16", "remaining_time": "0:00:20", "throughput": 8023.02, "total_tokens": 12167968} +{"current_steps": 5240, "total_steps": 5305, "loss": 0.0, "lr": 2.3575696374189548e-09, "epoch": 4.938737040527804, "percentage": 98.77, "elapsed_time": "0:25:17", "remaining_time": "0:00:18", "throughput": 8027.81, "total_tokens": 12179744} +{"current_steps": 5245, "total_steps": 5305, "loss": 0.0001, "lr": 2.013938855533748e-09, "epoch": 4.943449575871819, "percentage": 98.87, "elapsed_time": "0:25:17", "remaining_time": "0:00:17", "throughput": 8033.0, "total_tokens": 12192288} +{"current_steps": 5250, "total_steps": 5305, "loss": 0.0, "lr": 1.6973516761317755e-09, "epoch": 4.948162111215834, "percentage": 98.96, "elapsed_time": "0:25:18", "remaining_time": "0:00:15", "throughput": 8037.44, "total_tokens": 12203360} +{"current_steps": 5255, "total_steps": 5305, "loss": 0.0007, "lr": 1.407811526637215e-09, "epoch": 4.952874646559849, "percentage": 99.06, "elapsed_time": "0:25:18", "remaining_time": "0:00:14", "throughput": 8042.33, "total_tokens": 12215392} +{"current_steps": 5260, "total_steps": 5305, "loss": 0.0, "lr": 1.145321541659028e-09, "epoch": 4.957587181903865, "percentage": 99.15, "elapsed_time": "0:25:19", "remaining_time": "0:00:12", "throughput": 8047.34, "total_tokens": 12227680} +{"current_steps": 5265, "total_steps": 5305, "loss": 0.0, "lr": 9.098845629559871e-10, "epoch": 4.9622997172478795, "percentage": 99.25, "elapsed_time": "0:25:20", "remaining_time": "0:00:11", "throughput": 8053.55, "total_tokens": 12242336} +{"current_steps": 5270, "total_steps": 5305, "loss": 0.0, "lr": 7.015031394072557e-10, "epoch": 4.9670122525918945, "percentage": 99.34, "elapsed_time": "0:25:20", "remaining_time": "0:00:10", "throughput": 8057.18, "total_tokens": 12251936} +{"current_steps": 5275, "total_steps": 5305, "loss": 0.0, "lr": 5.201795269837995e-10, "epoch": 4.971724787935909, "percentage": 99.43, "elapsed_time": "0:25:21", "remaining_time": "0:00:08", "throughput": 8061.3, "total_tokens": 12262432} +{"current_steps": 5280, "total_steps": 5305, "loss": 0.0, "lr": 3.6591568872451634e-10, "epoch": 4.976437323279924, "percentage": 99.53, "elapsed_time": "0:25:21", "remaining_time": "0:00:07", "throughput": 8066.9, "total_tokens": 12275872} +{"current_steps": 5285, "total_steps": 5305, "loss": 0.0, "lr": 2.387132947151427e-10, "epoch": 4.981149858623939, "percentage": 99.62, "elapsed_time": "0:25:22", "remaining_time": "0:00:05", "throughput": 8072.29, "total_tokens": 12288928} +{"current_steps": 5290, "total_steps": 5305, "loss": 0.0969, "lr": 1.3857372206882436e-10, "epoch": 4.985862393967954, "percentage": 99.72, "elapsed_time": "0:25:22", "remaining_time": "0:00:04", "throughput": 8077.12, "total_tokens": 12300832} +{"current_steps": 5295, "total_steps": 5305, "loss": 0.0, "lr": 6.549805491307127e-11, "epoch": 4.99057492931197, "percentage": 99.81, "elapsed_time": "0:25:23", "remaining_time": "0:00:02", "throughput": 8081.73, "total_tokens": 12312352} +{"current_steps": 5300, "total_steps": 5305, "loss": 0.0, "lr": 1.948708437726765e-11, "epoch": 4.995287464655985, "percentage": 99.91, "elapsed_time": "0:25:23", "remaining_time": "0:00:01", "throughput": 8085.66, "total_tokens": 12322528} +{"current_steps": 5305, "total_steps": 5305, "loss": 0.0, "lr": 5.413085829575338e-13, "epoch": 5.0, "percentage": 100.0, "elapsed_time": "0:25:24", "remaining_time": "0:00:00", "throughput": 8089.82, "total_tokens": 12333600} +{"current_steps": 5305, "total_steps": 5305, "epoch": 5.0, "percentage": 100.0, "elapsed_time": "0:26:13", "remaining_time": "0:00:00", "throughput": 7836.34, "total_tokens": 12333600} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..c8149fc --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,8703 @@ +{ + "best_global_step": 1064, + "best_metric": 0.18848362565040588, + "best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_boolq_42_1776331558/checkpoint-1064", + "epoch": 5.0, + "eval_steps": 266, + "global_step": 5305, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00471253534401508, + "grad_norm": 318.0024719238281, + "learning_rate": 3.766478342749529e-08, + "loss": 0.8967, + "num_input_tokens_seen": 10752, + "step": 5 + }, + { + "epoch": 0.00942507068803016, + "grad_norm": 311.7052001953125, + "learning_rate": 8.474576271186442e-08, + "loss": 0.865, + "num_input_tokens_seen": 20736, + "step": 10 + }, + { + "epoch": 0.01413760603204524, + "grad_norm": 320.9153747558594, + "learning_rate": 1.3182674199623353e-07, + "loss": 0.8712, + "num_input_tokens_seen": 31232, + "step": 15 + }, + { + "epoch": 0.01885014137606032, + "grad_norm": 198.89523315429688, + "learning_rate": 1.7890772128060264e-07, + "loss": 0.7056, + "num_input_tokens_seen": 40320, + "step": 20 + }, + { + "epoch": 0.0235626767200754, + "grad_norm": 119.11978149414062, + "learning_rate": 2.2598870056497177e-07, + "loss": 0.5698, + "num_input_tokens_seen": 51136, + "step": 25 + }, + { + "epoch": 0.02827521206409048, + "grad_norm": 40.273216247558594, + "learning_rate": 2.730696798493409e-07, + "loss": 0.4948, + "num_input_tokens_seen": 63424, + "step": 30 + }, + { + "epoch": 0.03298774740810556, + "grad_norm": 48.785831451416016, + "learning_rate": 3.2015065913371e-07, + "loss": 0.3516, + "num_input_tokens_seen": 75328, + "step": 35 + }, + { + "epoch": 0.03770028275212064, + "grad_norm": 40.042724609375, + "learning_rate": 3.6723163841807916e-07, + "loss": 0.3321, + "num_input_tokens_seen": 86784, + "step": 40 + }, + { + "epoch": 0.04241281809613572, + "grad_norm": 27.21363639831543, + "learning_rate": 4.1431261770244826e-07, + "loss": 0.3329, + "num_input_tokens_seen": 99136, + "step": 45 + }, + { + "epoch": 0.0471253534401508, + "grad_norm": 165.71754455566406, + "learning_rate": 4.613935969868174e-07, + "loss": 0.4395, + "num_input_tokens_seen": 108672, + "step": 50 + }, + { + "epoch": 0.051837888784165884, + "grad_norm": 41.76438903808594, + "learning_rate": 5.084745762711865e-07, + "loss": 0.4549, + "num_input_tokens_seen": 119808, + "step": 55 + }, + { + "epoch": 0.05655042412818096, + "grad_norm": 67.6883544921875, + "learning_rate": 5.555555555555555e-07, + "loss": 0.3434, + "num_input_tokens_seen": 130048, + "step": 60 + }, + { + "epoch": 0.061262959472196045, + "grad_norm": 72.87140655517578, + "learning_rate": 6.026365348399247e-07, + "loss": 0.3434, + "num_input_tokens_seen": 142464, + "step": 65 + }, + { + "epoch": 0.06597549481621112, + "grad_norm": 34.45458221435547, + "learning_rate": 6.497175141242938e-07, + "loss": 0.3516, + "num_input_tokens_seen": 154048, + "step": 70 + }, + { + "epoch": 0.0706880301602262, + "grad_norm": 33.45477294921875, + "learning_rate": 6.96798493408663e-07, + "loss": 0.3088, + "num_input_tokens_seen": 166720, + "step": 75 + }, + { + "epoch": 0.07540056550424128, + "grad_norm": 39.73252868652344, + "learning_rate": 7.43879472693032e-07, + "loss": 0.3218, + "num_input_tokens_seen": 179200, + "step": 80 + }, + { + "epoch": 0.08011310084825636, + "grad_norm": 39.0076789855957, + "learning_rate": 7.909604519774013e-07, + "loss": 0.3962, + "num_input_tokens_seen": 190464, + "step": 85 + }, + { + "epoch": 0.08482563619227144, + "grad_norm": 23.60104751586914, + "learning_rate": 8.380414312617704e-07, + "loss": 0.3243, + "num_input_tokens_seen": 205376, + "step": 90 + }, + { + "epoch": 0.08953817153628653, + "grad_norm": 43.6406364440918, + "learning_rate": 8.851224105461394e-07, + "loss": 0.383, + "num_input_tokens_seen": 215424, + "step": 95 + }, + { + "epoch": 0.0942507068803016, + "grad_norm": 20.507476806640625, + "learning_rate": 9.322033898305086e-07, + "loss": 0.2987, + "num_input_tokens_seen": 226688, + "step": 100 + }, + { + "epoch": 0.09896324222431668, + "grad_norm": 27.949003219604492, + "learning_rate": 9.792843691148776e-07, + "loss": 0.2859, + "num_input_tokens_seen": 237248, + "step": 105 + }, + { + "epoch": 0.10367577756833177, + "grad_norm": 44.55869674682617, + "learning_rate": 1.0263653483992468e-06, + "loss": 0.3517, + "num_input_tokens_seen": 254144, + "step": 110 + }, + { + "epoch": 0.10838831291234684, + "grad_norm": 65.89197540283203, + "learning_rate": 1.073446327683616e-06, + "loss": 0.296, + "num_input_tokens_seen": 265920, + "step": 115 + }, + { + "epoch": 0.11310084825636192, + "grad_norm": 77.50157165527344, + "learning_rate": 1.120527306967985e-06, + "loss": 0.4375, + "num_input_tokens_seen": 277184, + "step": 120 + }, + { + "epoch": 0.117813383600377, + "grad_norm": 48.51222229003906, + "learning_rate": 1.167608286252354e-06, + "loss": 0.2998, + "num_input_tokens_seen": 289024, + "step": 125 + }, + { + "epoch": 0.12252591894439209, + "grad_norm": 36.21791076660156, + "learning_rate": 1.2146892655367234e-06, + "loss": 0.2883, + "num_input_tokens_seen": 299456, + "step": 130 + }, + { + "epoch": 0.12723845428840716, + "grad_norm": 38.02757263183594, + "learning_rate": 1.2617702448210926e-06, + "loss": 0.3329, + "num_input_tokens_seen": 313728, + "step": 135 + }, + { + "epoch": 0.13195098963242224, + "grad_norm": 26.75225257873535, + "learning_rate": 1.3088512241054615e-06, + "loss": 0.2533, + "num_input_tokens_seen": 326080, + "step": 140 + }, + { + "epoch": 0.13666352497643733, + "grad_norm": 31.08643341064453, + "learning_rate": 1.3559322033898307e-06, + "loss": 0.2705, + "num_input_tokens_seen": 338688, + "step": 145 + }, + { + "epoch": 0.1413760603204524, + "grad_norm": 38.68683624267578, + "learning_rate": 1.4030131826741996e-06, + "loss": 0.3576, + "num_input_tokens_seen": 349632, + "step": 150 + }, + { + "epoch": 0.1460885956644675, + "grad_norm": 36.93141174316406, + "learning_rate": 1.4500941619585688e-06, + "loss": 0.2256, + "num_input_tokens_seen": 363968, + "step": 155 + }, + { + "epoch": 0.15080113100848255, + "grad_norm": 47.699241638183594, + "learning_rate": 1.4971751412429381e-06, + "loss": 0.4483, + "num_input_tokens_seen": 375680, + "step": 160 + }, + { + "epoch": 0.15551366635249764, + "grad_norm": 24.248348236083984, + "learning_rate": 1.544256120527307e-06, + "loss": 0.269, + "num_input_tokens_seen": 386368, + "step": 165 + }, + { + "epoch": 0.16022620169651272, + "grad_norm": 18.501712799072266, + "learning_rate": 1.5913370998116762e-06, + "loss": 0.3019, + "num_input_tokens_seen": 396992, + "step": 170 + }, + { + "epoch": 0.1649387370405278, + "grad_norm": 13.451111793518066, + "learning_rate": 1.6384180790960452e-06, + "loss": 0.3423, + "num_input_tokens_seen": 408960, + "step": 175 + }, + { + "epoch": 0.1696512723845429, + "grad_norm": 24.91837501525879, + "learning_rate": 1.6854990583804145e-06, + "loss": 0.2813, + "num_input_tokens_seen": 419008, + "step": 180 + }, + { + "epoch": 0.17436380772855797, + "grad_norm": 23.55257225036621, + "learning_rate": 1.7325800376647837e-06, + "loss": 0.2445, + "num_input_tokens_seen": 430144, + "step": 185 + }, + { + "epoch": 0.17907634307257306, + "grad_norm": 50.514373779296875, + "learning_rate": 1.7796610169491526e-06, + "loss": 0.2385, + "num_input_tokens_seen": 441216, + "step": 190 + }, + { + "epoch": 0.18378887841658811, + "grad_norm": 63.581478118896484, + "learning_rate": 1.8267419962335218e-06, + "loss": 0.2216, + "num_input_tokens_seen": 451584, + "step": 195 + }, + { + "epoch": 0.1885014137606032, + "grad_norm": 132.5011444091797, + "learning_rate": 1.873822975517891e-06, + "loss": 0.4569, + "num_input_tokens_seen": 464384, + "step": 200 + }, + { + "epoch": 0.19321394910461828, + "grad_norm": 17.86366081237793, + "learning_rate": 1.92090395480226e-06, + "loss": 0.4075, + "num_input_tokens_seen": 481792, + "step": 205 + }, + { + "epoch": 0.19792648444863337, + "grad_norm": 22.02827262878418, + "learning_rate": 1.9679849340866293e-06, + "loss": 0.2703, + "num_input_tokens_seen": 493952, + "step": 210 + }, + { + "epoch": 0.20263901979264845, + "grad_norm": 22.791790008544922, + "learning_rate": 2.015065913370998e-06, + "loss": 0.2604, + "num_input_tokens_seen": 504832, + "step": 215 + }, + { + "epoch": 0.20735155513666353, + "grad_norm": 26.63323974609375, + "learning_rate": 2.062146892655367e-06, + "loss": 0.277, + "num_input_tokens_seen": 514368, + "step": 220 + }, + { + "epoch": 0.21206409048067862, + "grad_norm": 48.75139617919922, + "learning_rate": 2.1092278719397365e-06, + "loss": 0.2405, + "num_input_tokens_seen": 525568, + "step": 225 + }, + { + "epoch": 0.21677662582469368, + "grad_norm": 29.34063148498535, + "learning_rate": 2.1563088512241055e-06, + "loss": 0.259, + "num_input_tokens_seen": 537664, + "step": 230 + }, + { + "epoch": 0.22148916116870876, + "grad_norm": 26.771190643310547, + "learning_rate": 2.203389830508475e-06, + "loss": 0.2561, + "num_input_tokens_seen": 547584, + "step": 235 + }, + { + "epoch": 0.22620169651272384, + "grad_norm": 32.1733512878418, + "learning_rate": 2.2504708097928438e-06, + "loss": 0.3491, + "num_input_tokens_seen": 558144, + "step": 240 + }, + { + "epoch": 0.23091423185673893, + "grad_norm": 17.223844528198242, + "learning_rate": 2.297551789077213e-06, + "loss": 0.2543, + "num_input_tokens_seen": 569024, + "step": 245 + }, + { + "epoch": 0.235626767200754, + "grad_norm": 28.303009033203125, + "learning_rate": 2.344632768361582e-06, + "loss": 0.3033, + "num_input_tokens_seen": 580864, + "step": 250 + }, + { + "epoch": 0.2403393025447691, + "grad_norm": 50.68221664428711, + "learning_rate": 2.391713747645951e-06, + "loss": 0.2747, + "num_input_tokens_seen": 592768, + "step": 255 + }, + { + "epoch": 0.24505183788878418, + "grad_norm": 22.0999698638916, + "learning_rate": 2.4387947269303204e-06, + "loss": 0.336, + "num_input_tokens_seen": 604032, + "step": 260 + }, + { + "epoch": 0.24976437323279924, + "grad_norm": 29.03078269958496, + "learning_rate": 2.4858757062146898e-06, + "loss": 0.2277, + "num_input_tokens_seen": 616256, + "step": 265 + }, + { + "epoch": 0.25070688030160226, + "eval_loss": 0.25048765540122986, + "eval_runtime": 2.7618, + "eval_samples_per_second": 341.444, + "eval_steps_per_second": 42.726, + "num_input_tokens_seen": 618432, + "step": 266 + }, + { + "epoch": 0.2544769085768143, + "grad_norm": 33.82787322998047, + "learning_rate": 2.5329566854990583e-06, + "loss": 0.2331, + "num_input_tokens_seen": 627072, + "step": 270 + }, + { + "epoch": 0.25918944392082943, + "grad_norm": 19.519996643066406, + "learning_rate": 2.5800376647834272e-06, + "loss": 0.157, + "num_input_tokens_seen": 638592, + "step": 275 + }, + { + "epoch": 0.2639019792648445, + "grad_norm": 43.415740966796875, + "learning_rate": 2.627118644067797e-06, + "loss": 0.3209, + "num_input_tokens_seen": 648448, + "step": 280 + }, + { + "epoch": 0.26861451460885954, + "grad_norm": 33.58452606201172, + "learning_rate": 2.674199623352166e-06, + "loss": 0.2578, + "num_input_tokens_seen": 662784, + "step": 285 + }, + { + "epoch": 0.27332704995287466, + "grad_norm": 22.619098663330078, + "learning_rate": 2.7212806026365353e-06, + "loss": 0.3557, + "num_input_tokens_seen": 673856, + "step": 290 + }, + { + "epoch": 0.2780395852968897, + "grad_norm": 19.780139923095703, + "learning_rate": 2.7683615819209043e-06, + "loss": 0.2089, + "num_input_tokens_seen": 683136, + "step": 295 + }, + { + "epoch": 0.2827521206409048, + "grad_norm": 16.88971519470215, + "learning_rate": 2.8154425612052732e-06, + "loss": 0.2989, + "num_input_tokens_seen": 694784, + "step": 300 + }, + { + "epoch": 0.2874646559849199, + "grad_norm": 18.517738342285156, + "learning_rate": 2.862523540489642e-06, + "loss": 0.2632, + "num_input_tokens_seen": 706624, + "step": 305 + }, + { + "epoch": 0.292177191328935, + "grad_norm": 26.673988342285156, + "learning_rate": 2.9096045197740115e-06, + "loss": 0.2979, + "num_input_tokens_seen": 716800, + "step": 310 + }, + { + "epoch": 0.29688972667295005, + "grad_norm": 40.26359176635742, + "learning_rate": 2.9566854990583805e-06, + "loss": 0.3261, + "num_input_tokens_seen": 728704, + "step": 315 + }, + { + "epoch": 0.3016022620169651, + "grad_norm": 20.343751907348633, + "learning_rate": 3.00376647834275e-06, + "loss": 0.1851, + "num_input_tokens_seen": 740352, + "step": 320 + }, + { + "epoch": 0.3063147973609802, + "grad_norm": 30.753236770629883, + "learning_rate": 3.0508474576271192e-06, + "loss": 0.2727, + "num_input_tokens_seen": 751936, + "step": 325 + }, + { + "epoch": 0.3110273327049953, + "grad_norm": 27.787220001220703, + "learning_rate": 3.097928436911488e-06, + "loss": 0.3077, + "num_input_tokens_seen": 763264, + "step": 330 + }, + { + "epoch": 0.3157398680490104, + "grad_norm": 13.1635103225708, + "learning_rate": 3.145009416195857e-06, + "loss": 0.3285, + "num_input_tokens_seen": 772992, + "step": 335 + }, + { + "epoch": 0.32045240339302544, + "grad_norm": 32.643653869628906, + "learning_rate": 3.192090395480226e-06, + "loss": 0.2493, + "num_input_tokens_seen": 787008, + "step": 340 + }, + { + "epoch": 0.32516493873704055, + "grad_norm": 31.31944465637207, + "learning_rate": 3.2391713747645954e-06, + "loss": 0.233, + "num_input_tokens_seen": 798848, + "step": 345 + }, + { + "epoch": 0.3298774740810556, + "grad_norm": 69.64946746826172, + "learning_rate": 3.2862523540489644e-06, + "loss": 0.3409, + "num_input_tokens_seen": 811584, + "step": 350 + }, + { + "epoch": 0.33459000942507067, + "grad_norm": 21.657026290893555, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.1946, + "num_input_tokens_seen": 822208, + "step": 355 + }, + { + "epoch": 0.3393025447690858, + "grad_norm": 48.79526138305664, + "learning_rate": 3.3804143126177023e-06, + "loss": 0.3024, + "num_input_tokens_seen": 833792, + "step": 360 + }, + { + "epoch": 0.34401508011310084, + "grad_norm": 22.62244415283203, + "learning_rate": 3.427495291902072e-06, + "loss": 0.347, + "num_input_tokens_seen": 845568, + "step": 365 + }, + { + "epoch": 0.34872761545711595, + "grad_norm": 23.822986602783203, + "learning_rate": 3.474576271186441e-06, + "loss": 0.3787, + "num_input_tokens_seen": 855040, + "step": 370 + }, + { + "epoch": 0.353440150801131, + "grad_norm": 8.774383544921875, + "learning_rate": 3.5216572504708104e-06, + "loss": 0.3066, + "num_input_tokens_seen": 867712, + "step": 375 + }, + { + "epoch": 0.3581526861451461, + "grad_norm": 8.585469245910645, + "learning_rate": 3.5687382297551793e-06, + "loss": 0.2843, + "num_input_tokens_seen": 880960, + "step": 380 + }, + { + "epoch": 0.36286522148916117, + "grad_norm": 16.818485260009766, + "learning_rate": 3.6158192090395483e-06, + "loss": 0.2422, + "num_input_tokens_seen": 894784, + "step": 385 + }, + { + "epoch": 0.36757775683317623, + "grad_norm": 16.943395614624023, + "learning_rate": 3.662900188323917e-06, + "loss": 0.2664, + "num_input_tokens_seen": 905600, + "step": 390 + }, + { + "epoch": 0.37229029217719134, + "grad_norm": 26.195528030395508, + "learning_rate": 3.7099811676082866e-06, + "loss": 0.3223, + "num_input_tokens_seen": 915072, + "step": 395 + }, + { + "epoch": 0.3770028275212064, + "grad_norm": 37.17734146118164, + "learning_rate": 3.7570621468926555e-06, + "loss": 0.2066, + "num_input_tokens_seen": 927552, + "step": 400 + }, + { + "epoch": 0.3817153628652215, + "grad_norm": 43.06858444213867, + "learning_rate": 3.8041431261770245e-06, + "loss": 0.2964, + "num_input_tokens_seen": 940160, + "step": 405 + }, + { + "epoch": 0.38642789820923656, + "grad_norm": 31.916301727294922, + "learning_rate": 3.851224105461394e-06, + "loss": 0.1989, + "num_input_tokens_seen": 949760, + "step": 410 + }, + { + "epoch": 0.3911404335532517, + "grad_norm": 21.781017303466797, + "learning_rate": 3.898305084745763e-06, + "loss": 0.2152, + "num_input_tokens_seen": 960896, + "step": 415 + }, + { + "epoch": 0.39585296889726673, + "grad_norm": 18.697711944580078, + "learning_rate": 3.945386064030132e-06, + "loss": 0.3237, + "num_input_tokens_seen": 971648, + "step": 420 + }, + { + "epoch": 0.4005655042412818, + "grad_norm": 24.69769287109375, + "learning_rate": 3.992467043314501e-06, + "loss": 0.2182, + "num_input_tokens_seen": 981504, + "step": 425 + }, + { + "epoch": 0.4052780395852969, + "grad_norm": 26.988998413085938, + "learning_rate": 4.03954802259887e-06, + "loss": 0.1811, + "num_input_tokens_seen": 993664, + "step": 430 + }, + { + "epoch": 0.40999057492931196, + "grad_norm": 39.637481689453125, + "learning_rate": 4.08662900188324e-06, + "loss": 0.2151, + "num_input_tokens_seen": 1003008, + "step": 435 + }, + { + "epoch": 0.41470311027332707, + "grad_norm": 36.60890579223633, + "learning_rate": 4.133709981167609e-06, + "loss": 0.2333, + "num_input_tokens_seen": 1013248, + "step": 440 + }, + { + "epoch": 0.4194156456173421, + "grad_norm": 21.840532302856445, + "learning_rate": 4.180790960451978e-06, + "loss": 0.2694, + "num_input_tokens_seen": 1023296, + "step": 445 + }, + { + "epoch": 0.42412818096135724, + "grad_norm": 48.131988525390625, + "learning_rate": 4.2278719397363475e-06, + "loss": 0.3493, + "num_input_tokens_seen": 1033152, + "step": 450 + }, + { + "epoch": 0.4288407163053723, + "grad_norm": 30.9437198638916, + "learning_rate": 4.2749529190207165e-06, + "loss": 0.2147, + "num_input_tokens_seen": 1045248, + "step": 455 + }, + { + "epoch": 0.43355325164938735, + "grad_norm": 38.648189544677734, + "learning_rate": 4.322033898305085e-06, + "loss": 0.3071, + "num_input_tokens_seen": 1058944, + "step": 460 + }, + { + "epoch": 0.43826578699340246, + "grad_norm": 45.260948181152344, + "learning_rate": 4.369114877589454e-06, + "loss": 0.2118, + "num_input_tokens_seen": 1071680, + "step": 465 + }, + { + "epoch": 0.4429783223374175, + "grad_norm": 30.458763122558594, + "learning_rate": 4.416195856873823e-06, + "loss": 0.2319, + "num_input_tokens_seen": 1083328, + "step": 470 + }, + { + "epoch": 0.44769085768143263, + "grad_norm": 44.823246002197266, + "learning_rate": 4.463276836158192e-06, + "loss": 0.4004, + "num_input_tokens_seen": 1097600, + "step": 475 + }, + { + "epoch": 0.4524033930254477, + "grad_norm": 30.685049057006836, + "learning_rate": 4.510357815442561e-06, + "loss": 0.218, + "num_input_tokens_seen": 1109376, + "step": 480 + }, + { + "epoch": 0.4571159283694628, + "grad_norm": 20.631444931030273, + "learning_rate": 4.55743879472693e-06, + "loss": 0.2579, + "num_input_tokens_seen": 1120448, + "step": 485 + }, + { + "epoch": 0.46182846371347785, + "grad_norm": 27.939373016357422, + "learning_rate": 4.6045197740113e-06, + "loss": 0.298, + "num_input_tokens_seen": 1131392, + "step": 490 + }, + { + "epoch": 0.4665409990574929, + "grad_norm": 13.28208065032959, + "learning_rate": 4.651600753295669e-06, + "loss": 0.2203, + "num_input_tokens_seen": 1143552, + "step": 495 + }, + { + "epoch": 0.471253534401508, + "grad_norm": 55.77337646484375, + "learning_rate": 4.698681732580039e-06, + "loss": 0.1785, + "num_input_tokens_seen": 1153088, + "step": 500 + }, + { + "epoch": 0.4759660697455231, + "grad_norm": 32.49051284790039, + "learning_rate": 4.745762711864408e-06, + "loss": 0.2125, + "num_input_tokens_seen": 1166080, + "step": 505 + }, + { + "epoch": 0.4806786050895382, + "grad_norm": 63.475467681884766, + "learning_rate": 4.7928436911487765e-06, + "loss": 0.3882, + "num_input_tokens_seen": 1176896, + "step": 510 + }, + { + "epoch": 0.48539114043355325, + "grad_norm": 49.208858489990234, + "learning_rate": 4.8399246704331455e-06, + "loss": 0.2743, + "num_input_tokens_seen": 1187392, + "step": 515 + }, + { + "epoch": 0.49010367577756836, + "grad_norm": 38.7435417175293, + "learning_rate": 4.8870056497175144e-06, + "loss": 0.3309, + "num_input_tokens_seen": 1196672, + "step": 520 + }, + { + "epoch": 0.4948162111215834, + "grad_norm": 37.79723358154297, + "learning_rate": 4.934086629001883e-06, + "loss": 0.3264, + "num_input_tokens_seen": 1209344, + "step": 525 + }, + { + "epoch": 0.49952874646559847, + "grad_norm": 14.449431419372559, + "learning_rate": 4.981167608286252e-06, + "loss": 0.2193, + "num_input_tokens_seen": 1221504, + "step": 530 + }, + { + "epoch": 0.5014137606032045, + "eval_loss": 0.31662699580192566, + "eval_runtime": 3.716, + "eval_samples_per_second": 253.768, + "eval_steps_per_second": 31.755, + "num_input_tokens_seen": 1225408, + "step": 532 + }, + { + "epoch": 0.5042412818096136, + "grad_norm": 28.057838439941406, + "learning_rate": 4.999995128224159e-06, + "loss": 0.3059, + "num_input_tokens_seen": 1232256, + "step": 535 + }, + { + "epoch": 0.5089538171536286, + "grad_norm": 24.673755645751953, + "learning_rate": 4.999965356329446e-06, + "loss": 0.2494, + "num_input_tokens_seen": 1242880, + "step": 540 + }, + { + "epoch": 0.5136663524976437, + "grad_norm": 24.47283172607422, + "learning_rate": 4.99990851940408e-06, + "loss": 0.2812, + "num_input_tokens_seen": 1253248, + "step": 545 + }, + { + "epoch": 0.5183788878416589, + "grad_norm": 34.05777359008789, + "learning_rate": 4.999824618063384e-06, + "loss": 0.2639, + "num_input_tokens_seen": 1265280, + "step": 550 + }, + { + "epoch": 0.5230914231856739, + "grad_norm": 23.17024803161621, + "learning_rate": 4.99971365321569e-06, + "loss": 0.3403, + "num_input_tokens_seen": 1275328, + "step": 555 + }, + { + "epoch": 0.527803958529689, + "grad_norm": 14.374199867248535, + "learning_rate": 4.9995756260623194e-06, + "loss": 0.2942, + "num_input_tokens_seen": 1286272, + "step": 560 + }, + { + "epoch": 0.532516493873704, + "grad_norm": 23.844751358032227, + "learning_rate": 4.999410538097579e-06, + "loss": 0.2036, + "num_input_tokens_seen": 1298816, + "step": 565 + }, + { + "epoch": 0.5372290292177191, + "grad_norm": 25.96900177001953, + "learning_rate": 4.999218391108735e-06, + "loss": 0.2656, + "num_input_tokens_seen": 1311680, + "step": 570 + }, + { + "epoch": 0.5419415645617343, + "grad_norm": 43.49790954589844, + "learning_rate": 4.9989991871760054e-06, + "loss": 0.2828, + "num_input_tokens_seen": 1324672, + "step": 575 + }, + { + "epoch": 0.5466540999057493, + "grad_norm": 14.262967109680176, + "learning_rate": 4.998752928672525e-06, + "loss": 0.2081, + "num_input_tokens_seen": 1336896, + "step": 580 + }, + { + "epoch": 0.5513666352497644, + "grad_norm": 3.46243953704834, + "learning_rate": 4.9984796182643285e-06, + "loss": 0.1485, + "num_input_tokens_seen": 1348928, + "step": 585 + }, + { + "epoch": 0.5560791705937794, + "grad_norm": 34.33613204956055, + "learning_rate": 4.99817925891032e-06, + "loss": 0.6339, + "num_input_tokens_seen": 1362496, + "step": 590 + }, + { + "epoch": 0.5607917059377945, + "grad_norm": 34.32979965209961, + "learning_rate": 4.997851853862237e-06, + "loss": 0.5079, + "num_input_tokens_seen": 1377152, + "step": 595 + }, + { + "epoch": 0.5655042412818096, + "grad_norm": 37.79439926147461, + "learning_rate": 4.997497406664621e-06, + "loss": 0.2396, + "num_input_tokens_seen": 1390016, + "step": 600 + }, + { + "epoch": 0.5702167766258247, + "grad_norm": 17.340059280395508, + "learning_rate": 4.997115921154774e-06, + "loss": 0.2335, + "num_input_tokens_seen": 1401856, + "step": 605 + }, + { + "epoch": 0.5749293119698398, + "grad_norm": 29.506044387817383, + "learning_rate": 4.9967074014627206e-06, + "loss": 0.1719, + "num_input_tokens_seen": 1412736, + "step": 610 + }, + { + "epoch": 0.5796418473138548, + "grad_norm": 37.41396713256836, + "learning_rate": 4.996271852011161e-06, + "loss": 0.3201, + "num_input_tokens_seen": 1425280, + "step": 615 + }, + { + "epoch": 0.58435438265787, + "grad_norm": 80.07007598876953, + "learning_rate": 4.995809277515424e-06, + "loss": 0.1993, + "num_input_tokens_seen": 1436480, + "step": 620 + }, + { + "epoch": 0.589066918001885, + "grad_norm": 25.41455841064453, + "learning_rate": 4.995319682983417e-06, + "loss": 0.3072, + "num_input_tokens_seen": 1447808, + "step": 625 + }, + { + "epoch": 0.5937794533459001, + "grad_norm": 25.022064208984375, + "learning_rate": 4.99480307371557e-06, + "loss": 0.3263, + "num_input_tokens_seen": 1460352, + "step": 630 + }, + { + "epoch": 0.5984919886899152, + "grad_norm": 15.871000289916992, + "learning_rate": 4.9942594553047775e-06, + "loss": 0.2747, + "num_input_tokens_seen": 1472640, + "step": 635 + }, + { + "epoch": 0.6032045240339302, + "grad_norm": 9.052998542785645, + "learning_rate": 4.993688833636341e-06, + "loss": 0.2984, + "num_input_tokens_seen": 1482688, + "step": 640 + }, + { + "epoch": 0.6079170593779454, + "grad_norm": 11.553794860839844, + "learning_rate": 4.993091214887904e-06, + "loss": 0.2671, + "num_input_tokens_seen": 1494336, + "step": 645 + }, + { + "epoch": 0.6126295947219604, + "grad_norm": 11.756524085998535, + "learning_rate": 4.992466605529384e-06, + "loss": 0.1511, + "num_input_tokens_seen": 1504896, + "step": 650 + }, + { + "epoch": 0.6173421300659755, + "grad_norm": 34.58543395996094, + "learning_rate": 4.991815012322902e-06, + "loss": 0.3427, + "num_input_tokens_seen": 1518592, + "step": 655 + }, + { + "epoch": 0.6220546654099905, + "grad_norm": 26.234251022338867, + "learning_rate": 4.991136442322713e-06, + "loss": 0.2164, + "num_input_tokens_seen": 1531264, + "step": 660 + }, + { + "epoch": 0.6267672007540056, + "grad_norm": 22.865320205688477, + "learning_rate": 4.990430902875125e-06, + "loss": 0.2187, + "num_input_tokens_seen": 1541376, + "step": 665 + }, + { + "epoch": 0.6314797360980208, + "grad_norm": 17.565690994262695, + "learning_rate": 4.989698401618423e-06, + "loss": 0.2911, + "num_input_tokens_seen": 1551424, + "step": 670 + }, + { + "epoch": 0.6361922714420358, + "grad_norm": 31.6148624420166, + "learning_rate": 4.988938946482786e-06, + "loss": 0.1331, + "num_input_tokens_seen": 1562624, + "step": 675 + }, + { + "epoch": 0.6409048067860509, + "grad_norm": 18.770896911621094, + "learning_rate": 4.988152545690197e-06, + "loss": 0.2686, + "num_input_tokens_seen": 1574016, + "step": 680 + }, + { + "epoch": 0.6456173421300659, + "grad_norm": 43.168392181396484, + "learning_rate": 4.987339207754358e-06, + "loss": 0.3307, + "num_input_tokens_seen": 1586688, + "step": 685 + }, + { + "epoch": 0.6503298774740811, + "grad_norm": 11.960392951965332, + "learning_rate": 4.9864989414806e-06, + "loss": 0.247, + "num_input_tokens_seen": 1596992, + "step": 690 + }, + { + "epoch": 0.6550424128180962, + "grad_norm": 13.759454727172852, + "learning_rate": 4.985631755965779e-06, + "loss": 0.3232, + "num_input_tokens_seen": 1609920, + "step": 695 + }, + { + "epoch": 0.6597549481621112, + "grad_norm": 11.490863800048828, + "learning_rate": 4.984737660598187e-06, + "loss": 0.2132, + "num_input_tokens_seen": 1620736, + "step": 700 + }, + { + "epoch": 0.6644674835061263, + "grad_norm": 39.30780029296875, + "learning_rate": 4.983816665057447e-06, + "loss": 0.2797, + "num_input_tokens_seen": 1632512, + "step": 705 + }, + { + "epoch": 0.6691800188501413, + "grad_norm": 10.898017883300781, + "learning_rate": 4.982868779314405e-06, + "loss": 0.3142, + "num_input_tokens_seen": 1643264, + "step": 710 + }, + { + "epoch": 0.6738925541941565, + "grad_norm": 16.296348571777344, + "learning_rate": 4.981894013631026e-06, + "loss": 0.1914, + "num_input_tokens_seen": 1654208, + "step": 715 + }, + { + "epoch": 0.6786050895381716, + "grad_norm": 23.66090202331543, + "learning_rate": 4.980892378560281e-06, + "loss": 0.1985, + "num_input_tokens_seen": 1664640, + "step": 720 + }, + { + "epoch": 0.6833176248821866, + "grad_norm": 27.988893508911133, + "learning_rate": 4.979863884946034e-06, + "loss": 0.2831, + "num_input_tokens_seen": 1676864, + "step": 725 + }, + { + "epoch": 0.6880301602262017, + "grad_norm": 20.06635284423828, + "learning_rate": 4.978808543922925e-06, + "loss": 0.2082, + "num_input_tokens_seen": 1691072, + "step": 730 + }, + { + "epoch": 0.6927426955702167, + "grad_norm": 19.601367950439453, + "learning_rate": 4.9777263669162465e-06, + "loss": 0.1227, + "num_input_tokens_seen": 1702400, + "step": 735 + }, + { + "epoch": 0.6974552309142319, + "grad_norm": 6.3521199226379395, + "learning_rate": 4.976617365641822e-06, + "loss": 0.1471, + "num_input_tokens_seen": 1714944, + "step": 740 + }, + { + "epoch": 0.702167766258247, + "grad_norm": 87.46131134033203, + "learning_rate": 4.97548155210588e-06, + "loss": 0.3082, + "num_input_tokens_seen": 1725376, + "step": 745 + }, + { + "epoch": 0.706880301602262, + "grad_norm": 80.72607421875, + "learning_rate": 4.974318938604921e-06, + "loss": 0.4432, + "num_input_tokens_seen": 1737152, + "step": 750 + }, + { + "epoch": 0.7115928369462771, + "grad_norm": 13.310392379760742, + "learning_rate": 4.9731295377255885e-06, + "loss": 0.1969, + "num_input_tokens_seen": 1749120, + "step": 755 + }, + { + "epoch": 0.7163053722902922, + "grad_norm": 23.76306915283203, + "learning_rate": 4.971913362344529e-06, + "loss": 0.272, + "num_input_tokens_seen": 1760384, + "step": 760 + }, + { + "epoch": 0.7210179076343073, + "grad_norm": 33.018524169921875, + "learning_rate": 4.970670425628255e-06, + "loss": 0.1454, + "num_input_tokens_seen": 1773632, + "step": 765 + }, + { + "epoch": 0.7257304429783223, + "grad_norm": 21.914316177368164, + "learning_rate": 4.969400741032999e-06, + "loss": 0.184, + "num_input_tokens_seen": 1787776, + "step": 770 + }, + { + "epoch": 0.7304429783223374, + "grad_norm": 38.7669792175293, + "learning_rate": 4.968104322304575e-06, + "loss": 0.2148, + "num_input_tokens_seen": 1798336, + "step": 775 + }, + { + "epoch": 0.7351555136663525, + "grad_norm": 41.227027893066406, + "learning_rate": 4.966781183478223e-06, + "loss": 0.2897, + "num_input_tokens_seen": 1809216, + "step": 780 + }, + { + "epoch": 0.7398680490103676, + "grad_norm": 26.2169189453125, + "learning_rate": 4.965431338878456e-06, + "loss": 0.2981, + "num_input_tokens_seen": 1822144, + "step": 785 + }, + { + "epoch": 0.7445805843543827, + "grad_norm": 5.09712553024292, + "learning_rate": 4.9640548031189125e-06, + "loss": 0.2476, + "num_input_tokens_seen": 1833088, + "step": 790 + }, + { + "epoch": 0.7492931196983977, + "grad_norm": 15.851964950561523, + "learning_rate": 4.962651591102191e-06, + "loss": 0.2554, + "num_input_tokens_seen": 1845056, + "step": 795 + }, + { + "epoch": 0.7521206409048068, + "eval_loss": 0.2178538739681244, + "eval_runtime": 2.7742, + "eval_samples_per_second": 339.916, + "eval_steps_per_second": 42.535, + "num_input_tokens_seen": 1851072, + "step": 798 + }, + { + "epoch": 0.7540056550424128, + "grad_norm": 14.348052978515625, + "learning_rate": 4.961221718019695e-06, + "loss": 0.2507, + "num_input_tokens_seen": 1855168, + "step": 800 + }, + { + "epoch": 0.7587181903864278, + "grad_norm": 20.442550659179688, + "learning_rate": 4.9597651993514585e-06, + "loss": 0.3006, + "num_input_tokens_seen": 1867328, + "step": 805 + }, + { + "epoch": 0.763430725730443, + "grad_norm": 18.405014038085938, + "learning_rate": 4.9582820508659924e-06, + "loss": 0.1949, + "num_input_tokens_seen": 1882560, + "step": 810 + }, + { + "epoch": 0.7681432610744581, + "grad_norm": 26.241788864135742, + "learning_rate": 4.956772288620101e-06, + "loss": 0.1866, + "num_input_tokens_seen": 1893376, + "step": 815 + }, + { + "epoch": 0.7728557964184731, + "grad_norm": 4.750776290893555, + "learning_rate": 4.955235928958716e-06, + "loss": 0.1114, + "num_input_tokens_seen": 1906048, + "step": 820 + }, + { + "epoch": 0.7775683317624882, + "grad_norm": 22.653051376342773, + "learning_rate": 4.953672988514716e-06, + "loss": 0.2425, + "num_input_tokens_seen": 1917568, + "step": 825 + }, + { + "epoch": 0.7822808671065034, + "grad_norm": 56.989315032958984, + "learning_rate": 4.95208348420875e-06, + "loss": 0.4121, + "num_input_tokens_seen": 1929216, + "step": 830 + }, + { + "epoch": 0.7869934024505184, + "grad_norm": 21.19652557373047, + "learning_rate": 4.950467433249046e-06, + "loss": 0.1859, + "num_input_tokens_seen": 1940416, + "step": 835 + }, + { + "epoch": 0.7917059377945335, + "grad_norm": 17.347103118896484, + "learning_rate": 4.948824853131237e-06, + "loss": 0.2065, + "num_input_tokens_seen": 1949632, + "step": 840 + }, + { + "epoch": 0.7964184731385485, + "grad_norm": 32.96878433227539, + "learning_rate": 4.94715576163816e-06, + "loss": 0.2102, + "num_input_tokens_seen": 1961920, + "step": 845 + }, + { + "epoch": 0.8011310084825636, + "grad_norm": 4.76591157913208, + "learning_rate": 4.945460176839671e-06, + "loss": 0.2975, + "num_input_tokens_seen": 1973696, + "step": 850 + }, + { + "epoch": 0.8058435438265787, + "grad_norm": 17.566848754882812, + "learning_rate": 4.943738117092447e-06, + "loss": 0.294, + "num_input_tokens_seen": 1985280, + "step": 855 + }, + { + "epoch": 0.8105560791705938, + "grad_norm": 34.71393966674805, + "learning_rate": 4.941989601039785e-06, + "loss": 0.2107, + "num_input_tokens_seen": 1997504, + "step": 860 + }, + { + "epoch": 0.8152686145146089, + "grad_norm": 15.716105461120605, + "learning_rate": 4.940214647611405e-06, + "loss": 0.2815, + "num_input_tokens_seen": 2009600, + "step": 865 + }, + { + "epoch": 0.8199811498586239, + "grad_norm": 5.163911819458008, + "learning_rate": 4.9384132760232395e-06, + "loss": 0.1509, + "num_input_tokens_seen": 2020672, + "step": 870 + }, + { + "epoch": 0.824693685202639, + "grad_norm": 32.56769943237305, + "learning_rate": 4.93658550577723e-06, + "loss": 0.258, + "num_input_tokens_seen": 2033408, + "step": 875 + }, + { + "epoch": 0.8294062205466541, + "grad_norm": 21.050493240356445, + "learning_rate": 4.9347313566611145e-06, + "loss": 0.2403, + "num_input_tokens_seen": 2043328, + "step": 880 + }, + { + "epoch": 0.8341187558906692, + "grad_norm": 13.551421165466309, + "learning_rate": 4.9328508487482115e-06, + "loss": 0.2631, + "num_input_tokens_seen": 2054656, + "step": 885 + }, + { + "epoch": 0.8388312912346843, + "grad_norm": 19.12700080871582, + "learning_rate": 4.930944002397204e-06, + "loss": 0.2302, + "num_input_tokens_seen": 2064128, + "step": 890 + }, + { + "epoch": 0.8435438265786993, + "grad_norm": 29.187570571899414, + "learning_rate": 4.929010838251923e-06, + "loss": 0.2009, + "num_input_tokens_seen": 2076864, + "step": 895 + }, + { + "epoch": 0.8482563619227145, + "grad_norm": 20.132150650024414, + "learning_rate": 4.927051377241115e-06, + "loss": 0.1868, + "num_input_tokens_seen": 2087104, + "step": 900 + }, + { + "epoch": 0.8529688972667295, + "grad_norm": 19.931499481201172, + "learning_rate": 4.9250656405782215e-06, + "loss": 0.3066, + "num_input_tokens_seen": 2097728, + "step": 905 + }, + { + "epoch": 0.8576814326107446, + "grad_norm": 21.25429916381836, + "learning_rate": 4.9230536497611525e-06, + "loss": 0.1685, + "num_input_tokens_seen": 2107904, + "step": 910 + }, + { + "epoch": 0.8623939679547596, + "grad_norm": 35.41661834716797, + "learning_rate": 4.921015426572047e-06, + "loss": 0.3358, + "num_input_tokens_seen": 2120192, + "step": 915 + }, + { + "epoch": 0.8671065032987747, + "grad_norm": 20.501426696777344, + "learning_rate": 4.918950993077039e-06, + "loss": 0.2411, + "num_input_tokens_seen": 2131904, + "step": 920 + }, + { + "epoch": 0.8718190386427899, + "grad_norm": 30.00881576538086, + "learning_rate": 4.91686037162602e-06, + "loss": 0.3069, + "num_input_tokens_seen": 2144640, + "step": 925 + }, + { + "epoch": 0.8765315739868049, + "grad_norm": 30.22358512878418, + "learning_rate": 4.9147435848523975e-06, + "loss": 0.1587, + "num_input_tokens_seen": 2154112, + "step": 930 + }, + { + "epoch": 0.88124410933082, + "grad_norm": 10.572684288024902, + "learning_rate": 4.91260065567285e-06, + "loss": 0.1468, + "num_input_tokens_seen": 2167232, + "step": 935 + }, + { + "epoch": 0.885956644674835, + "grad_norm": 61.71476745605469, + "learning_rate": 4.910431607287075e-06, + "loss": 0.2699, + "num_input_tokens_seen": 2179264, + "step": 940 + }, + { + "epoch": 0.8906691800188501, + "grad_norm": 10.816360473632812, + "learning_rate": 4.908236463177544e-06, + "loss": 0.3797, + "num_input_tokens_seen": 2191488, + "step": 945 + }, + { + "epoch": 0.8953817153628653, + "grad_norm": 39.76873016357422, + "learning_rate": 4.906015247109242e-06, + "loss": 0.1988, + "num_input_tokens_seen": 2201856, + "step": 950 + }, + { + "epoch": 0.9000942507068803, + "grad_norm": 23.409250259399414, + "learning_rate": 4.903767983129414e-06, + "loss": 0.3161, + "num_input_tokens_seen": 2214464, + "step": 955 + }, + { + "epoch": 0.9048067860508954, + "grad_norm": 23.47569465637207, + "learning_rate": 4.901494695567306e-06, + "loss": 0.2565, + "num_input_tokens_seen": 2229184, + "step": 960 + }, + { + "epoch": 0.9095193213949104, + "grad_norm": 12.153125762939453, + "learning_rate": 4.899195409033897e-06, + "loss": 0.2214, + "num_input_tokens_seen": 2239104, + "step": 965 + }, + { + "epoch": 0.9142318567389256, + "grad_norm": 13.904633522033691, + "learning_rate": 4.896870148421637e-06, + "loss": 0.1992, + "num_input_tokens_seen": 2249152, + "step": 970 + }, + { + "epoch": 0.9189443920829407, + "grad_norm": 9.68702507019043, + "learning_rate": 4.894518938904175e-06, + "loss": 0.1527, + "num_input_tokens_seen": 2261312, + "step": 975 + }, + { + "epoch": 0.9236569274269557, + "grad_norm": 35.594173431396484, + "learning_rate": 4.892141805936085e-06, + "loss": 0.1398, + "num_input_tokens_seen": 2275008, + "step": 980 + }, + { + "epoch": 0.9283694627709708, + "grad_norm": 30.414966583251953, + "learning_rate": 4.889738775252596e-06, + "loss": 0.276, + "num_input_tokens_seen": 2287680, + "step": 985 + }, + { + "epoch": 0.9330819981149858, + "grad_norm": 36.264251708984375, + "learning_rate": 4.887309872869308e-06, + "loss": 0.2869, + "num_input_tokens_seen": 2299840, + "step": 990 + }, + { + "epoch": 0.937794533459001, + "grad_norm": 34.444374084472656, + "learning_rate": 4.884855125081912e-06, + "loss": 0.2347, + "num_input_tokens_seen": 2311104, + "step": 995 + }, + { + "epoch": 0.942507068803016, + "grad_norm": 39.005767822265625, + "learning_rate": 4.882374558465906e-06, + "loss": 0.326, + "num_input_tokens_seen": 2322432, + "step": 1000 + }, + { + "epoch": 0.9472196041470311, + "grad_norm": 23.73866081237793, + "learning_rate": 4.8798681998763056e-06, + "loss": 0.2946, + "num_input_tokens_seen": 2333120, + "step": 1005 + }, + { + "epoch": 0.9519321394910462, + "grad_norm": 17.239654541015625, + "learning_rate": 4.877336076447358e-06, + "loss": 0.2846, + "num_input_tokens_seen": 2345472, + "step": 1010 + }, + { + "epoch": 0.9566446748350612, + "grad_norm": 16.902143478393555, + "learning_rate": 4.87477821559224e-06, + "loss": 0.1988, + "num_input_tokens_seen": 2357568, + "step": 1015 + }, + { + "epoch": 0.9613572101790764, + "grad_norm": 20.823362350463867, + "learning_rate": 4.87219464500277e-06, + "loss": 0.2295, + "num_input_tokens_seen": 2368064, + "step": 1020 + }, + { + "epoch": 0.9660697455230914, + "grad_norm": 22.501428604125977, + "learning_rate": 4.869585392649102e-06, + "loss": 0.2166, + "num_input_tokens_seen": 2381184, + "step": 1025 + }, + { + "epoch": 0.9707822808671065, + "grad_norm": 12.077306747436523, + "learning_rate": 4.866950486779425e-06, + "loss": 0.1964, + "num_input_tokens_seen": 2393408, + "step": 1030 + }, + { + "epoch": 0.9754948162111216, + "grad_norm": 24.82265281677246, + "learning_rate": 4.864289955919658e-06, + "loss": 0.2603, + "num_input_tokens_seen": 2406720, + "step": 1035 + }, + { + "epoch": 0.9802073515551367, + "grad_norm": 24.67642593383789, + "learning_rate": 4.8616038288731394e-06, + "loss": 0.3101, + "num_input_tokens_seen": 2420288, + "step": 1040 + }, + { + "epoch": 0.9849198868991518, + "grad_norm": 7.1168532371521, + "learning_rate": 4.8588921347203175e-06, + "loss": 0.1463, + "num_input_tokens_seen": 2431488, + "step": 1045 + }, + { + "epoch": 0.9896324222431668, + "grad_norm": 12.157154083251953, + "learning_rate": 4.8561549028184315e-06, + "loss": 0.2497, + "num_input_tokens_seen": 2444032, + "step": 1050 + }, + { + "epoch": 0.9943449575871819, + "grad_norm": 18.19011688232422, + "learning_rate": 4.8533921628012e-06, + "loss": 0.1574, + "num_input_tokens_seen": 2454912, + "step": 1055 + }, + { + "epoch": 0.9990574929311969, + "grad_norm": 22.441247940063477, + "learning_rate": 4.850603944578494e-06, + "loss": 0.3676, + "num_input_tokens_seen": 2467584, + "step": 1060 + }, + { + "epoch": 1.002827521206409, + "eval_loss": 0.18848362565040588, + "eval_runtime": 2.7314, + "eval_samples_per_second": 345.248, + "eval_steps_per_second": 43.202, + "num_input_tokens_seen": 2475808, + "step": 1064 + }, + { + "epoch": 1.003770028275212, + "grad_norm": 12.251239776611328, + "learning_rate": 4.847790278336017e-06, + "loss": 0.1493, + "num_input_tokens_seen": 2478048, + "step": 1065 + }, + { + "epoch": 1.0084825636192272, + "grad_norm": 23.926055908203125, + "learning_rate": 4.844951194534975e-06, + "loss": 0.1749, + "num_input_tokens_seen": 2492576, + "step": 1070 + }, + { + "epoch": 1.0131950989632421, + "grad_norm": 2.156106472015381, + "learning_rate": 4.842086723911751e-06, + "loss": 0.1307, + "num_input_tokens_seen": 2505440, + "step": 1075 + }, + { + "epoch": 1.0179076343072573, + "grad_norm": 29.823352813720703, + "learning_rate": 4.839196897477569e-06, + "loss": 0.1119, + "num_input_tokens_seen": 2515488, + "step": 1080 + }, + { + "epoch": 1.0226201696512724, + "grad_norm": 7.730029106140137, + "learning_rate": 4.836281746518159e-06, + "loss": 0.1664, + "num_input_tokens_seen": 2529504, + "step": 1085 + }, + { + "epoch": 1.0273327049952874, + "grad_norm": 35.77005386352539, + "learning_rate": 4.833341302593417e-06, + "loss": 0.1393, + "num_input_tokens_seen": 2539872, + "step": 1090 + }, + { + "epoch": 1.0320452403393026, + "grad_norm": 0.4067946672439575, + "learning_rate": 4.830375597537068e-06, + "loss": 0.0376, + "num_input_tokens_seen": 2549856, + "step": 1095 + }, + { + "epoch": 1.0367577756833177, + "grad_norm": 0.01726607233285904, + "learning_rate": 4.827384663456315e-06, + "loss": 0.1836, + "num_input_tokens_seen": 2559328, + "step": 1100 + }, + { + "epoch": 1.0414703110273327, + "grad_norm": 201.8726043701172, + "learning_rate": 4.824368532731496e-06, + "loss": 0.369, + "num_input_tokens_seen": 2569440, + "step": 1105 + }, + { + "epoch": 1.0461828463713478, + "grad_norm": 123.39364624023438, + "learning_rate": 4.821327238015732e-06, + "loss": 0.084, + "num_input_tokens_seen": 2580448, + "step": 1110 + }, + { + "epoch": 1.0508953817153628, + "grad_norm": 93.78629302978516, + "learning_rate": 4.818260812234572e-06, + "loss": 0.4176, + "num_input_tokens_seen": 2590752, + "step": 1115 + }, + { + "epoch": 1.055607917059378, + "grad_norm": 54.99407196044922, + "learning_rate": 4.815169288585641e-06, + "loss": 0.0664, + "num_input_tokens_seen": 2600160, + "step": 1120 + }, + { + "epoch": 1.0603204524033931, + "grad_norm": 33.015010833740234, + "learning_rate": 4.812052700538274e-06, + "loss": 0.1558, + "num_input_tokens_seen": 2611232, + "step": 1125 + }, + { + "epoch": 1.065032987747408, + "grad_norm": 10.432161331176758, + "learning_rate": 4.808911081833161e-06, + "loss": 0.1476, + "num_input_tokens_seen": 2623712, + "step": 1130 + }, + { + "epoch": 1.0697455230914232, + "grad_norm": 21.43227767944336, + "learning_rate": 4.805744466481974e-06, + "loss": 0.0875, + "num_input_tokens_seen": 2635936, + "step": 1135 + }, + { + "epoch": 1.0744580584354382, + "grad_norm": 0.39066728949546814, + "learning_rate": 4.802552888767005e-06, + "loss": 0.1297, + "num_input_tokens_seen": 2645920, + "step": 1140 + }, + { + "epoch": 1.0791705937794533, + "grad_norm": 79.24580383300781, + "learning_rate": 4.799336383240793e-06, + "loss": 0.2563, + "num_input_tokens_seen": 2660768, + "step": 1145 + }, + { + "epoch": 1.0838831291234685, + "grad_norm": 90.48008728027344, + "learning_rate": 4.796094984725749e-06, + "loss": 0.1484, + "num_input_tokens_seen": 2671200, + "step": 1150 + }, + { + "epoch": 1.0885956644674835, + "grad_norm": 0.7040526270866394, + "learning_rate": 4.792828728313778e-06, + "loss": 0.1145, + "num_input_tokens_seen": 2683040, + "step": 1155 + }, + { + "epoch": 1.0933081998114986, + "grad_norm": 80.87930297851562, + "learning_rate": 4.789537649365904e-06, + "loss": 0.0767, + "num_input_tokens_seen": 2694432, + "step": 1160 + }, + { + "epoch": 1.0980207351555136, + "grad_norm": 0.26429542899131775, + "learning_rate": 4.78622178351188e-06, + "loss": 0.0079, + "num_input_tokens_seen": 2707168, + "step": 1165 + }, + { + "epoch": 1.1027332704995287, + "grad_norm": 0.19729509949684143, + "learning_rate": 4.782881166649808e-06, + "loss": 0.1644, + "num_input_tokens_seen": 2717984, + "step": 1170 + }, + { + "epoch": 1.107445805843544, + "grad_norm": 220.86328125, + "learning_rate": 4.77951583494575e-06, + "loss": 0.2543, + "num_input_tokens_seen": 2730784, + "step": 1175 + }, + { + "epoch": 1.1121583411875589, + "grad_norm": 21.004072189331055, + "learning_rate": 4.77612582483333e-06, + "loss": 0.4821, + "num_input_tokens_seen": 2740704, + "step": 1180 + }, + { + "epoch": 1.116870876531574, + "grad_norm": 45.33163833618164, + "learning_rate": 4.772711173013352e-06, + "loss": 0.2498, + "num_input_tokens_seen": 2751968, + "step": 1185 + }, + { + "epoch": 1.121583411875589, + "grad_norm": 4.449422359466553, + "learning_rate": 4.769271916453387e-06, + "loss": 0.1649, + "num_input_tokens_seen": 2763808, + "step": 1190 + }, + { + "epoch": 1.1262959472196041, + "grad_norm": 154.34603881835938, + "learning_rate": 4.765808092387385e-06, + "loss": 0.0735, + "num_input_tokens_seen": 2774624, + "step": 1195 + }, + { + "epoch": 1.1310084825636193, + "grad_norm": 100.58317565917969, + "learning_rate": 4.762319738315269e-06, + "loss": 0.2639, + "num_input_tokens_seen": 2785888, + "step": 1200 + }, + { + "epoch": 1.1357210179076342, + "grad_norm": 45.887027740478516, + "learning_rate": 4.758806892002526e-06, + "loss": 0.3194, + "num_input_tokens_seen": 2797216, + "step": 1205 + }, + { + "epoch": 1.1404335532516494, + "grad_norm": 36.13898849487305, + "learning_rate": 4.7552695914798e-06, + "loss": 0.1395, + "num_input_tokens_seen": 2808032, + "step": 1210 + }, + { + "epoch": 1.1451460885956646, + "grad_norm": 96.65644836425781, + "learning_rate": 4.751707875042481e-06, + "loss": 0.2734, + "num_input_tokens_seen": 2823008, + "step": 1215 + }, + { + "epoch": 1.1498586239396795, + "grad_norm": 2.167825698852539, + "learning_rate": 4.748121781250288e-06, + "loss": 0.0883, + "num_input_tokens_seen": 2835936, + "step": 1220 + }, + { + "epoch": 1.1545711592836947, + "grad_norm": 14.599705696105957, + "learning_rate": 4.744511348926855e-06, + "loss": 0.169, + "num_input_tokens_seen": 2847584, + "step": 1225 + }, + { + "epoch": 1.1592836946277096, + "grad_norm": 68.31897735595703, + "learning_rate": 4.740876617159308e-06, + "loss": 0.1451, + "num_input_tokens_seen": 2857952, + "step": 1230 + }, + { + "epoch": 1.1639962299717248, + "grad_norm": 77.1812515258789, + "learning_rate": 4.737217625297844e-06, + "loss": 0.2114, + "num_input_tokens_seen": 2868192, + "step": 1235 + }, + { + "epoch": 1.1687087653157398, + "grad_norm": 6.400179862976074, + "learning_rate": 4.733534412955301e-06, + "loss": 0.1145, + "num_input_tokens_seen": 2879904, + "step": 1240 + }, + { + "epoch": 1.173421300659755, + "grad_norm": 1.274997353553772, + "learning_rate": 4.729827020006735e-06, + "loss": 0.1768, + "num_input_tokens_seen": 2892384, + "step": 1245 + }, + { + "epoch": 1.17813383600377, + "grad_norm": 32.56444549560547, + "learning_rate": 4.726095486588983e-06, + "loss": 0.1507, + "num_input_tokens_seen": 2905184, + "step": 1250 + }, + { + "epoch": 1.182846371347785, + "grad_norm": 7.450242042541504, + "learning_rate": 4.722339853100232e-06, + "loss": 0.0958, + "num_input_tokens_seen": 2916640, + "step": 1255 + }, + { + "epoch": 1.1875589066918002, + "grad_norm": 4.951867580413818, + "learning_rate": 4.718560160199579e-06, + "loss": 0.1192, + "num_input_tokens_seen": 2927072, + "step": 1260 + }, + { + "epoch": 1.1922714420358154, + "grad_norm": 50.1746940612793, + "learning_rate": 4.714756448806592e-06, + "loss": 0.2693, + "num_input_tokens_seen": 2937888, + "step": 1265 + }, + { + "epoch": 1.1969839773798303, + "grad_norm": 0.2431841641664505, + "learning_rate": 4.71092876010087e-06, + "loss": 0.1689, + "num_input_tokens_seen": 2950752, + "step": 1270 + }, + { + "epoch": 1.2016965127238455, + "grad_norm": 40.15456771850586, + "learning_rate": 4.70707713552159e-06, + "loss": 0.0997, + "num_input_tokens_seen": 2961056, + "step": 1275 + }, + { + "epoch": 1.2064090480678604, + "grad_norm": 154.8431396484375, + "learning_rate": 4.703201616767067e-06, + "loss": 0.1164, + "num_input_tokens_seen": 2971552, + "step": 1280 + }, + { + "epoch": 1.2111215834118756, + "grad_norm": 67.9471206665039, + "learning_rate": 4.699302245794293e-06, + "loss": 0.0178, + "num_input_tokens_seen": 2985120, + "step": 1285 + }, + { + "epoch": 1.2158341187558908, + "grad_norm": 104.93325805664062, + "learning_rate": 4.6953790648184924e-06, + "loss": 0.1821, + "num_input_tokens_seen": 2996128, + "step": 1290 + }, + { + "epoch": 1.2205466540999057, + "grad_norm": 0.03052549809217453, + "learning_rate": 4.691432116312661e-06, + "loss": 0.0199, + "num_input_tokens_seen": 3007072, + "step": 1295 + }, + { + "epoch": 1.2252591894439209, + "grad_norm": 0.9742458462715149, + "learning_rate": 4.687461443007101e-06, + "loss": 0.006, + "num_input_tokens_seen": 3018656, + "step": 1300 + }, + { + "epoch": 1.2299717247879358, + "grad_norm": 0.007309742737561464, + "learning_rate": 4.683467087888967e-06, + "loss": 0.1915, + "num_input_tokens_seen": 3030624, + "step": 1305 + }, + { + "epoch": 1.234684260131951, + "grad_norm": 0.3931090831756592, + "learning_rate": 4.6794490942017955e-06, + "loss": 0.2276, + "num_input_tokens_seen": 3043040, + "step": 1310 + }, + { + "epoch": 1.2393967954759662, + "grad_norm": 8.714564323425293, + "learning_rate": 4.6754075054450385e-06, + "loss": 0.0236, + "num_input_tokens_seen": 3057632, + "step": 1315 + }, + { + "epoch": 1.244109330819981, + "grad_norm": 0.008542931638658047, + "learning_rate": 4.671342365373592e-06, + "loss": 0.1376, + "num_input_tokens_seen": 3069792, + "step": 1320 + }, + { + "epoch": 1.2488218661639963, + "grad_norm": 52.45071792602539, + "learning_rate": 4.667253717997324e-06, + "loss": 0.2062, + "num_input_tokens_seen": 3080608, + "step": 1325 + }, + { + "epoch": 1.2535344015080114, + "grad_norm": 10.894562721252441, + "learning_rate": 4.663141607580589e-06, + "loss": 0.165, + "num_input_tokens_seen": 3091552, + "step": 1330 + }, + { + "epoch": 1.2535344015080114, + "eval_loss": 0.4607957601547241, + "eval_runtime": 2.7224, + "eval_samples_per_second": 346.386, + "eval_steps_per_second": 43.344, + "num_input_tokens_seen": 3091552, + "step": 1330 + }, + { + "epoch": 1.2582469368520264, + "grad_norm": 121.4914321899414, + "learning_rate": 4.659006078641766e-06, + "loss": 0.222, + "num_input_tokens_seen": 3103712, + "step": 1335 + }, + { + "epoch": 1.2629594721960415, + "grad_norm": 2.8751637935638428, + "learning_rate": 4.6548471759527634e-06, + "loss": 0.2312, + "num_input_tokens_seen": 3115104, + "step": 1340 + }, + { + "epoch": 1.2676720075400565, + "grad_norm": 3.6843035221099854, + "learning_rate": 4.6506649445385335e-06, + "loss": 0.011, + "num_input_tokens_seen": 3127648, + "step": 1345 + }, + { + "epoch": 1.2723845428840717, + "grad_norm": 26.937593460083008, + "learning_rate": 4.646459429676594e-06, + "loss": 0.2732, + "num_input_tokens_seen": 3138208, + "step": 1350 + }, + { + "epoch": 1.2770970782280866, + "grad_norm": 41.53554916381836, + "learning_rate": 4.642230676896531e-06, + "loss": 0.148, + "num_input_tokens_seen": 3148256, + "step": 1355 + }, + { + "epoch": 1.2818096135721018, + "grad_norm": 74.98961639404297, + "learning_rate": 4.6379787319795076e-06, + "loss": 0.0901, + "num_input_tokens_seen": 3157856, + "step": 1360 + }, + { + "epoch": 1.286522148916117, + "grad_norm": 1.2443631887435913, + "learning_rate": 4.6337036409577705e-06, + "loss": 0.24, + "num_input_tokens_seen": 3167136, + "step": 1365 + }, + { + "epoch": 1.2912346842601319, + "grad_norm": 0.20186370611190796, + "learning_rate": 4.62940545011415e-06, + "loss": 0.0842, + "num_input_tokens_seen": 3181984, + "step": 1370 + }, + { + "epoch": 1.295947219604147, + "grad_norm": 22.39756965637207, + "learning_rate": 4.625084205981554e-06, + "loss": 0.1368, + "num_input_tokens_seen": 3195744, + "step": 1375 + }, + { + "epoch": 1.3006597549481622, + "grad_norm": 9.254731178283691, + "learning_rate": 4.620739955342476e-06, + "loss": 0.2497, + "num_input_tokens_seen": 3207776, + "step": 1380 + }, + { + "epoch": 1.3053722902921772, + "grad_norm": 0.06419213116168976, + "learning_rate": 4.616372745228477e-06, + "loss": 0.0782, + "num_input_tokens_seen": 3219296, + "step": 1385 + }, + { + "epoch": 1.3100848256361923, + "grad_norm": 56.7759895324707, + "learning_rate": 4.611982622919684e-06, + "loss": 0.3956, + "num_input_tokens_seen": 3230048, + "step": 1390 + }, + { + "epoch": 1.3147973609802073, + "grad_norm": 68.79596710205078, + "learning_rate": 4.607569635944271e-06, + "loss": 0.1166, + "num_input_tokens_seen": 3239200, + "step": 1395 + }, + { + "epoch": 1.3195098963242224, + "grad_norm": 27.92612648010254, + "learning_rate": 4.603133832077953e-06, + "loss": 0.2557, + "num_input_tokens_seen": 3255008, + "step": 1400 + }, + { + "epoch": 1.3242224316682374, + "grad_norm": 13.399755477905273, + "learning_rate": 4.598675259343462e-06, + "loss": 0.2547, + "num_input_tokens_seen": 3267040, + "step": 1405 + }, + { + "epoch": 1.3289349670122526, + "grad_norm": 25.696258544921875, + "learning_rate": 4.594193966010031e-06, + "loss": 0.2374, + "num_input_tokens_seen": 3276960, + "step": 1410 + }, + { + "epoch": 1.3336475023562677, + "grad_norm": 29.0289363861084, + "learning_rate": 4.589690000592868e-06, + "loss": 0.0795, + "num_input_tokens_seen": 3287840, + "step": 1415 + }, + { + "epoch": 1.3383600377002827, + "grad_norm": 30.088584899902344, + "learning_rate": 4.585163411852632e-06, + "loss": 0.2095, + "num_input_tokens_seen": 3300256, + "step": 1420 + }, + { + "epoch": 1.3430725730442978, + "grad_norm": 3.960421562194824, + "learning_rate": 4.58061424879491e-06, + "loss": 0.3144, + "num_input_tokens_seen": 3311712, + "step": 1425 + }, + { + "epoch": 1.347785108388313, + "grad_norm": 75.69437408447266, + "learning_rate": 4.576042560669678e-06, + "loss": 0.1113, + "num_input_tokens_seen": 3322144, + "step": 1430 + }, + { + "epoch": 1.352497643732328, + "grad_norm": 53.89783477783203, + "learning_rate": 4.571448396970773e-06, + "loss": 0.4022, + "num_input_tokens_seen": 3333856, + "step": 1435 + }, + { + "epoch": 1.3572101790763431, + "grad_norm": 17.59637451171875, + "learning_rate": 4.566831807435359e-06, + "loss": 0.1542, + "num_input_tokens_seen": 3345696, + "step": 1440 + }, + { + "epoch": 1.3619227144203583, + "grad_norm": 15.906473159790039, + "learning_rate": 4.562192842043381e-06, + "loss": 0.2594, + "num_input_tokens_seen": 3357024, + "step": 1445 + }, + { + "epoch": 1.3666352497643732, + "grad_norm": 53.453163146972656, + "learning_rate": 4.557531551017034e-06, + "loss": 0.1721, + "num_input_tokens_seen": 3368480, + "step": 1450 + }, + { + "epoch": 1.3713477851083884, + "grad_norm": 10.427976608276367, + "learning_rate": 4.552847984820208e-06, + "loss": 0.1418, + "num_input_tokens_seen": 3378720, + "step": 1455 + }, + { + "epoch": 1.3760603204524033, + "grad_norm": 17.01227569580078, + "learning_rate": 4.548142194157951e-06, + "loss": 0.1344, + "num_input_tokens_seen": 3390688, + "step": 1460 + }, + { + "epoch": 1.3807728557964185, + "grad_norm": 0.41409215331077576, + "learning_rate": 4.54341422997592e-06, + "loss": 0.2518, + "num_input_tokens_seen": 3403488, + "step": 1465 + }, + { + "epoch": 1.3854853911404335, + "grad_norm": 3.571580410003662, + "learning_rate": 4.538664143459819e-06, + "loss": 0.1194, + "num_input_tokens_seen": 3415648, + "step": 1470 + }, + { + "epoch": 1.3901979264844486, + "grad_norm": 39.68430709838867, + "learning_rate": 4.5338919860348565e-06, + "loss": 0.1113, + "num_input_tokens_seen": 3427168, + "step": 1475 + }, + { + "epoch": 1.3949104618284638, + "grad_norm": 0.09742722660303116, + "learning_rate": 4.529097809365184e-06, + "loss": 0.1426, + "num_input_tokens_seen": 3437664, + "step": 1480 + }, + { + "epoch": 1.3996229971724787, + "grad_norm": 80.09423828125, + "learning_rate": 4.524281665353334e-06, + "loss": 0.3136, + "num_input_tokens_seen": 3450144, + "step": 1485 + }, + { + "epoch": 1.404335532516494, + "grad_norm": 38.64655303955078, + "learning_rate": 4.519443606139665e-06, + "loss": 0.1617, + "num_input_tokens_seen": 3461280, + "step": 1490 + }, + { + "epoch": 1.409048067860509, + "grad_norm": 60.909393310546875, + "learning_rate": 4.514583684101792e-06, + "loss": 0.2666, + "num_input_tokens_seen": 3472608, + "step": 1495 + }, + { + "epoch": 1.413760603204524, + "grad_norm": 89.08367919921875, + "learning_rate": 4.509701951854018e-06, + "loss": 0.105, + "num_input_tokens_seen": 3485024, + "step": 1500 + }, + { + "epoch": 1.4184731385485392, + "grad_norm": 73.14676666259766, + "learning_rate": 4.504798462246768e-06, + "loss": 0.2341, + "num_input_tokens_seen": 3496096, + "step": 1505 + }, + { + "epoch": 1.4231856738925541, + "grad_norm": 33.10121154785156, + "learning_rate": 4.499873268366017e-06, + "loss": 0.2829, + "num_input_tokens_seen": 3506848, + "step": 1510 + }, + { + "epoch": 1.4278982092365693, + "grad_norm": 45.99144744873047, + "learning_rate": 4.494926423532715e-06, + "loss": 0.1819, + "num_input_tokens_seen": 3521568, + "step": 1515 + }, + { + "epoch": 1.4326107445805842, + "grad_norm": 3.1161906719207764, + "learning_rate": 4.4899579813022046e-06, + "loss": 0.1103, + "num_input_tokens_seen": 3533856, + "step": 1520 + }, + { + "epoch": 1.4373232799245994, + "grad_norm": 1.9241315126419067, + "learning_rate": 4.484967995463648e-06, + "loss": 0.216, + "num_input_tokens_seen": 3544544, + "step": 1525 + }, + { + "epoch": 1.4420358152686146, + "grad_norm": 26.153079986572266, + "learning_rate": 4.479956520039443e-06, + "loss": 0.303, + "num_input_tokens_seen": 3554336, + "step": 1530 + }, + { + "epoch": 1.4467483506126295, + "grad_norm": 8.090953826904297, + "learning_rate": 4.474923609284635e-06, + "loss": 0.0434, + "num_input_tokens_seen": 3564384, + "step": 1535 + }, + { + "epoch": 1.4514608859566447, + "grad_norm": 0.26238393783569336, + "learning_rate": 4.469869317686332e-06, + "loss": 0.1438, + "num_input_tokens_seen": 3576992, + "step": 1540 + }, + { + "epoch": 1.4561734213006599, + "grad_norm": 92.67262268066406, + "learning_rate": 4.464793699963116e-06, + "loss": 0.1766, + "num_input_tokens_seen": 3587872, + "step": 1545 + }, + { + "epoch": 1.4608859566446748, + "grad_norm": 11.002724647521973, + "learning_rate": 4.4596968110644484e-06, + "loss": 0.0997, + "num_input_tokens_seen": 3598560, + "step": 1550 + }, + { + "epoch": 1.46559849198869, + "grad_norm": 77.25719451904297, + "learning_rate": 4.454578706170075e-06, + "loss": 0.1595, + "num_input_tokens_seen": 3608864, + "step": 1555 + }, + { + "epoch": 1.4703110273327051, + "grad_norm": 1.6689245700836182, + "learning_rate": 4.44943944068943e-06, + "loss": 0.0274, + "num_input_tokens_seen": 3620960, + "step": 1560 + }, + { + "epoch": 1.47502356267672, + "grad_norm": 103.46016693115234, + "learning_rate": 4.444279070261035e-06, + "loss": 0.4584, + "num_input_tokens_seen": 3632096, + "step": 1565 + }, + { + "epoch": 1.479736098020735, + "grad_norm": 57.57553482055664, + "learning_rate": 4.4390976507518994e-06, + "loss": 0.2423, + "num_input_tokens_seen": 3643424, + "step": 1570 + }, + { + "epoch": 1.4844486333647502, + "grad_norm": 0.6700392961502075, + "learning_rate": 4.433895238256909e-06, + "loss": 0.046, + "num_input_tokens_seen": 3654624, + "step": 1575 + }, + { + "epoch": 1.4891611687087654, + "grad_norm": 58.0783576965332, + "learning_rate": 4.4286718890982275e-06, + "loss": 0.0609, + "num_input_tokens_seen": 3665504, + "step": 1580 + }, + { + "epoch": 1.4938737040527803, + "grad_norm": 142.61090087890625, + "learning_rate": 4.423427659824681e-06, + "loss": 0.2488, + "num_input_tokens_seen": 3676448, + "step": 1585 + }, + { + "epoch": 1.4985862393967955, + "grad_norm": 40.1721305847168, + "learning_rate": 4.418162607211146e-06, + "loss": 0.4721, + "num_input_tokens_seen": 3686432, + "step": 1590 + }, + { + "epoch": 1.5032987747408106, + "grad_norm": 25.409154891967773, + "learning_rate": 4.412876788257936e-06, + "loss": 0.2207, + "num_input_tokens_seen": 3697312, + "step": 1595 + }, + { + "epoch": 1.5042412818096136, + "eval_loss": 0.35448023676872253, + "eval_runtime": 2.7456, + "eval_samples_per_second": 343.46, + "eval_steps_per_second": 42.978, + "num_input_tokens_seen": 3699104, + "step": 1596 + }, + { + "epoch": 1.5080113100848256, + "grad_norm": 44.117496490478516, + "learning_rate": 4.407570260190186e-06, + "loss": 0.2648, + "num_input_tokens_seen": 3707808, + "step": 1600 + }, + { + "epoch": 1.5127238454288408, + "grad_norm": 26.070695877075195, + "learning_rate": 4.402243080457229e-06, + "loss": 0.3225, + "num_input_tokens_seen": 3719840, + "step": 1605 + }, + { + "epoch": 1.517436380772856, + "grad_norm": 1.1607394218444824, + "learning_rate": 4.396895306731978e-06, + "loss": 0.2234, + "num_input_tokens_seen": 3731168, + "step": 1610 + }, + { + "epoch": 1.5221489161168709, + "grad_norm": 103.62728881835938, + "learning_rate": 4.391526996910298e-06, + "loss": 0.2199, + "num_input_tokens_seen": 3744160, + "step": 1615 + }, + { + "epoch": 1.5268614514608858, + "grad_norm": 31.115297317504883, + "learning_rate": 4.386138209110385e-06, + "loss": 0.1515, + "num_input_tokens_seen": 3754912, + "step": 1620 + }, + { + "epoch": 1.5315739868049012, + "grad_norm": 1.294524073600769, + "learning_rate": 4.3807290016721265e-06, + "loss": 0.1179, + "num_input_tokens_seen": 3767776, + "step": 1625 + }, + { + "epoch": 1.5362865221489161, + "grad_norm": 92.95679473876953, + "learning_rate": 4.375299433156483e-06, + "loss": 0.1079, + "num_input_tokens_seen": 3779104, + "step": 1630 + }, + { + "epoch": 1.540999057492931, + "grad_norm": 72.8927001953125, + "learning_rate": 4.3698495623448424e-06, + "loss": 0.359, + "num_input_tokens_seen": 3789408, + "step": 1635 + }, + { + "epoch": 1.5457115928369463, + "grad_norm": 31.62137794494629, + "learning_rate": 4.364379448238392e-06, + "loss": 0.1058, + "num_input_tokens_seen": 3799584, + "step": 1640 + }, + { + "epoch": 1.5504241281809614, + "grad_norm": 80.54794311523438, + "learning_rate": 4.358889150057476e-06, + "loss": 0.3319, + "num_input_tokens_seen": 3813344, + "step": 1645 + }, + { + "epoch": 1.5551366635249764, + "grad_norm": 91.38248443603516, + "learning_rate": 4.35337872724095e-06, + "loss": 0.1354, + "num_input_tokens_seen": 3823328, + "step": 1650 + }, + { + "epoch": 1.5598491988689915, + "grad_norm": 86.33023071289062, + "learning_rate": 4.347848239445548e-06, + "loss": 0.1612, + "num_input_tokens_seen": 3835232, + "step": 1655 + }, + { + "epoch": 1.5645617342130067, + "grad_norm": 24.640047073364258, + "learning_rate": 4.342297746545228e-06, + "loss": 0.2858, + "num_input_tokens_seen": 3846368, + "step": 1660 + }, + { + "epoch": 1.5692742695570217, + "grad_norm": 0.5544624924659729, + "learning_rate": 4.336727308630527e-06, + "loss": 0.0313, + "num_input_tokens_seen": 3858656, + "step": 1665 + }, + { + "epoch": 1.5739868049010366, + "grad_norm": 23.30266761779785, + "learning_rate": 4.33113698600791e-06, + "loss": 0.1587, + "num_input_tokens_seen": 3871776, + "step": 1670 + }, + { + "epoch": 1.578699340245052, + "grad_norm": 0.21707068383693695, + "learning_rate": 4.325526839199115e-06, + "loss": 0.0377, + "num_input_tokens_seen": 3884384, + "step": 1675 + }, + { + "epoch": 1.583411875589067, + "grad_norm": 97.02978515625, + "learning_rate": 4.319896928940505e-06, + "loss": 0.2741, + "num_input_tokens_seen": 3896224, + "step": 1680 + }, + { + "epoch": 1.5881244109330819, + "grad_norm": 6.382898807525635, + "learning_rate": 4.3142473161824e-06, + "loss": 0.1037, + "num_input_tokens_seen": 3906528, + "step": 1685 + }, + { + "epoch": 1.592836946277097, + "grad_norm": 36.04171371459961, + "learning_rate": 4.308578062088426e-06, + "loss": 0.1437, + "num_input_tokens_seen": 3917728, + "step": 1690 + }, + { + "epoch": 1.5975494816211122, + "grad_norm": 61.61280822753906, + "learning_rate": 4.302889228034846e-06, + "loss": 0.3957, + "num_input_tokens_seen": 3928032, + "step": 1695 + }, + { + "epoch": 1.6022620169651272, + "grad_norm": 1.8270617723464966, + "learning_rate": 4.297180875609902e-06, + "loss": 0.1641, + "num_input_tokens_seen": 3940384, + "step": 1700 + }, + { + "epoch": 1.6069745523091423, + "grad_norm": 0.7876982092857361, + "learning_rate": 4.2914530666131436e-06, + "loss": 0.0949, + "num_input_tokens_seen": 3951904, + "step": 1705 + }, + { + "epoch": 1.6116870876531575, + "grad_norm": 59.75898742675781, + "learning_rate": 4.285705863054759e-06, + "loss": 0.2799, + "num_input_tokens_seen": 3963360, + "step": 1710 + }, + { + "epoch": 1.6163996229971724, + "grad_norm": 50.44517517089844, + "learning_rate": 4.279939327154909e-06, + "loss": 0.3126, + "num_input_tokens_seen": 3974432, + "step": 1715 + }, + { + "epoch": 1.6211121583411876, + "grad_norm": 22.407121658325195, + "learning_rate": 4.274153521343047e-06, + "loss": 0.2358, + "num_input_tokens_seen": 3984352, + "step": 1720 + }, + { + "epoch": 1.6258246936852028, + "grad_norm": 2.445833206176758, + "learning_rate": 4.268348508257243e-06, + "loss": 0.0892, + "num_input_tokens_seen": 3994016, + "step": 1725 + }, + { + "epoch": 1.6305372290292177, + "grad_norm": 79.69355010986328, + "learning_rate": 4.262524350743512e-06, + "loss": 0.3199, + "num_input_tokens_seen": 4005856, + "step": 1730 + }, + { + "epoch": 1.6352497643732327, + "grad_norm": 27.91238784790039, + "learning_rate": 4.25668111185513e-06, + "loss": 0.1497, + "num_input_tokens_seen": 4017248, + "step": 1735 + }, + { + "epoch": 1.6399622997172478, + "grad_norm": 65.74903106689453, + "learning_rate": 4.250818854851948e-06, + "loss": 0.1124, + "num_input_tokens_seen": 4028128, + "step": 1740 + }, + { + "epoch": 1.644674835061263, + "grad_norm": 16.284719467163086, + "learning_rate": 4.244937643199711e-06, + "loss": 0.1923, + "num_input_tokens_seen": 4044768, + "step": 1745 + }, + { + "epoch": 1.649387370405278, + "grad_norm": 68.08360290527344, + "learning_rate": 4.239037540569373e-06, + "loss": 0.1026, + "num_input_tokens_seen": 4062432, + "step": 1750 + }, + { + "epoch": 1.654099905749293, + "grad_norm": 16.83579444885254, + "learning_rate": 4.233118610836401e-06, + "loss": 0.0699, + "num_input_tokens_seen": 4074016, + "step": 1755 + }, + { + "epoch": 1.6588124410933083, + "grad_norm": 26.799367904663086, + "learning_rate": 4.227180918080089e-06, + "loss": 0.1875, + "num_input_tokens_seen": 4084704, + "step": 1760 + }, + { + "epoch": 1.6635249764373232, + "grad_norm": 10.665923118591309, + "learning_rate": 4.221224526582863e-06, + "loss": 0.0828, + "num_input_tokens_seen": 4095136, + "step": 1765 + }, + { + "epoch": 1.6682375117813384, + "grad_norm": 0.24358469247817993, + "learning_rate": 4.215249500829583e-06, + "loss": 0.1379, + "num_input_tokens_seen": 4107744, + "step": 1770 + }, + { + "epoch": 1.6729500471253536, + "grad_norm": 0.6852381229400635, + "learning_rate": 4.209255905506847e-06, + "loss": 0.2322, + "num_input_tokens_seen": 4118624, + "step": 1775 + }, + { + "epoch": 1.6776625824693685, + "grad_norm": 0.456554651260376, + "learning_rate": 4.2032438055022925e-06, + "loss": 0.1804, + "num_input_tokens_seen": 4129184, + "step": 1780 + }, + { + "epoch": 1.6823751178133834, + "grad_norm": 96.7328872680664, + "learning_rate": 4.197213265903889e-06, + "loss": 0.3414, + "num_input_tokens_seen": 4141024, + "step": 1785 + }, + { + "epoch": 1.6870876531573988, + "grad_norm": 16.629526138305664, + "learning_rate": 4.191164351999236e-06, + "loss": 0.3523, + "num_input_tokens_seen": 4151840, + "step": 1790 + }, + { + "epoch": 1.6918001885014138, + "grad_norm": 23.59195899963379, + "learning_rate": 4.18509712927486e-06, + "loss": 0.2797, + "num_input_tokens_seen": 4164704, + "step": 1795 + }, + { + "epoch": 1.6965127238454287, + "grad_norm": 38.683265686035156, + "learning_rate": 4.179011663415494e-06, + "loss": 0.2943, + "num_input_tokens_seen": 4177184, + "step": 1800 + }, + { + "epoch": 1.701225259189444, + "grad_norm": 20.35943031311035, + "learning_rate": 4.172908020303384e-06, + "loss": 0.0589, + "num_input_tokens_seen": 4188768, + "step": 1805 + }, + { + "epoch": 1.705937794533459, + "grad_norm": 25.21088218688965, + "learning_rate": 4.166786266017557e-06, + "loss": 0.1865, + "num_input_tokens_seen": 4200480, + "step": 1810 + }, + { + "epoch": 1.710650329877474, + "grad_norm": 18.756656646728516, + "learning_rate": 4.160646466833121e-06, + "loss": 0.1045, + "num_input_tokens_seen": 4212064, + "step": 1815 + }, + { + "epoch": 1.7153628652214892, + "grad_norm": 38.346832275390625, + "learning_rate": 4.154488689220536e-06, + "loss": 0.2373, + "num_input_tokens_seen": 4221728, + "step": 1820 + }, + { + "epoch": 1.7200754005655043, + "grad_norm": 61.90775680541992, + "learning_rate": 4.1483129998449035e-06, + "loss": 0.216, + "num_input_tokens_seen": 4233888, + "step": 1825 + }, + { + "epoch": 1.7247879359095193, + "grad_norm": 35.818946838378906, + "learning_rate": 4.142119465565238e-06, + "loss": 0.2308, + "num_input_tokens_seen": 4245344, + "step": 1830 + }, + { + "epoch": 1.7295004712535345, + "grad_norm": 42.63814163208008, + "learning_rate": 4.135908153433748e-06, + "loss": 0.0663, + "num_input_tokens_seen": 4256992, + "step": 1835 + }, + { + "epoch": 1.7342130065975496, + "grad_norm": 1.1722609996795654, + "learning_rate": 4.129679130695105e-06, + "loss": 0.0795, + "num_input_tokens_seen": 4266784, + "step": 1840 + }, + { + "epoch": 1.7389255419415646, + "grad_norm": 73.20691680908203, + "learning_rate": 4.123432464785721e-06, + "loss": 0.0953, + "num_input_tokens_seen": 4281504, + "step": 1845 + }, + { + "epoch": 1.7436380772855795, + "grad_norm": 61.06163024902344, + "learning_rate": 4.117168223333015e-06, + "loss": 0.3657, + "num_input_tokens_seen": 4296032, + "step": 1850 + }, + { + "epoch": 1.7483506126295947, + "grad_norm": 3.197977304458618, + "learning_rate": 4.1108864741546815e-06, + "loss": 0.0417, + "num_input_tokens_seen": 4309280, + "step": 1855 + }, + { + "epoch": 1.7530631479736098, + "grad_norm": 0.4998331665992737, + "learning_rate": 4.1045872852579546e-06, + "loss": 0.1138, + "num_input_tokens_seen": 4319648, + "step": 1860 + }, + { + "epoch": 1.7549481621112157, + "eval_loss": 0.3500010073184967, + "eval_runtime": 2.7501, + "eval_samples_per_second": 342.894, + "eval_steps_per_second": 42.907, + "num_input_tokens_seen": 4324256, + "step": 1862 + }, + { + "epoch": 1.7577756833176248, + "grad_norm": 108.458740234375, + "learning_rate": 4.098270724838879e-06, + "loss": 0.0767, + "num_input_tokens_seen": 4330144, + "step": 1865 + }, + { + "epoch": 1.76248821866164, + "grad_norm": 0.2290242463350296, + "learning_rate": 4.091936861281561e-06, + "loss": 0.0415, + "num_input_tokens_seen": 4343712, + "step": 1870 + }, + { + "epoch": 1.7672007540056551, + "grad_norm": 93.17559814453125, + "learning_rate": 4.085585763157435e-06, + "loss": 0.4214, + "num_input_tokens_seen": 4354144, + "step": 1875 + }, + { + "epoch": 1.77191328934967, + "grad_norm": 10.659987449645996, + "learning_rate": 4.07921749922452e-06, + "loss": 0.013, + "num_input_tokens_seen": 4364896, + "step": 1880 + }, + { + "epoch": 1.7766258246936852, + "grad_norm": 0.5930144786834717, + "learning_rate": 4.0728321384266764e-06, + "loss": 0.1879, + "num_input_tokens_seen": 4377120, + "step": 1885 + }, + { + "epoch": 1.7813383600377004, + "grad_norm": 0.13112248480319977, + "learning_rate": 4.066429749892854e-06, + "loss": 0.1512, + "num_input_tokens_seen": 4388128, + "step": 1890 + }, + { + "epoch": 1.7860508953817154, + "grad_norm": 31.263877868652344, + "learning_rate": 4.060010402936353e-06, + "loss": 0.1946, + "num_input_tokens_seen": 4402272, + "step": 1895 + }, + { + "epoch": 1.7907634307257303, + "grad_norm": 66.94145965576172, + "learning_rate": 4.053574167054063e-06, + "loss": 0.0513, + "num_input_tokens_seen": 4412640, + "step": 1900 + }, + { + "epoch": 1.7954759660697457, + "grad_norm": 30.63470458984375, + "learning_rate": 4.047121111925718e-06, + "loss": 0.2935, + "num_input_tokens_seen": 4424096, + "step": 1905 + }, + { + "epoch": 1.8001885014137606, + "grad_norm": 64.27619171142578, + "learning_rate": 4.040651307413142e-06, + "loss": 0.1499, + "num_input_tokens_seen": 4434144, + "step": 1910 + }, + { + "epoch": 1.8049010367577756, + "grad_norm": 88.78367614746094, + "learning_rate": 4.034164823559487e-06, + "loss": 0.1671, + "num_input_tokens_seen": 4446240, + "step": 1915 + }, + { + "epoch": 1.8096135721017907, + "grad_norm": 47.201698303222656, + "learning_rate": 4.02766173058848e-06, + "loss": 0.183, + "num_input_tokens_seen": 4455712, + "step": 1920 + }, + { + "epoch": 1.814326107445806, + "grad_norm": 17.526779174804688, + "learning_rate": 4.021142098903662e-06, + "loss": 0.2619, + "num_input_tokens_seen": 4466144, + "step": 1925 + }, + { + "epoch": 1.8190386427898209, + "grad_norm": 18.032976150512695, + "learning_rate": 4.014605999087623e-06, + "loss": 0.2168, + "num_input_tokens_seen": 4476064, + "step": 1930 + }, + { + "epoch": 1.823751178133836, + "grad_norm": 4.104875564575195, + "learning_rate": 4.008053501901239e-06, + "loss": 0.1402, + "num_input_tokens_seen": 4487456, + "step": 1935 + }, + { + "epoch": 1.8284637134778512, + "grad_norm": 28.21024513244629, + "learning_rate": 4.001484678282911e-06, + "loss": 0.2318, + "num_input_tokens_seen": 4498400, + "step": 1940 + }, + { + "epoch": 1.8331762488218661, + "grad_norm": 36.88951873779297, + "learning_rate": 3.994899599347787e-06, + "loss": 0.1527, + "num_input_tokens_seen": 4511520, + "step": 1945 + }, + { + "epoch": 1.837888784165881, + "grad_norm": 12.032304763793945, + "learning_rate": 3.9882983363869995e-06, + "loss": 0.151, + "num_input_tokens_seen": 4523232, + "step": 1950 + }, + { + "epoch": 1.8426013195098965, + "grad_norm": 22.562625885009766, + "learning_rate": 3.981680960866896e-06, + "loss": 0.084, + "num_input_tokens_seen": 4536416, + "step": 1955 + }, + { + "epoch": 1.8473138548539114, + "grad_norm": 2.119037389755249, + "learning_rate": 3.9750475444282545e-06, + "loss": 0.1193, + "num_input_tokens_seen": 4546528, + "step": 1960 + }, + { + "epoch": 1.8520263901979264, + "grad_norm": 5.9970574378967285, + "learning_rate": 3.968398158885519e-06, + "loss": 0.0301, + "num_input_tokens_seen": 4559008, + "step": 1965 + }, + { + "epoch": 1.8567389255419415, + "grad_norm": 53.16204071044922, + "learning_rate": 3.961732876226016e-06, + "loss": 0.1272, + "num_input_tokens_seen": 4569824, + "step": 1970 + }, + { + "epoch": 1.8614514608859567, + "grad_norm": 34.37496566772461, + "learning_rate": 3.955051768609179e-06, + "loss": 0.0125, + "num_input_tokens_seen": 4581664, + "step": 1975 + }, + { + "epoch": 1.8661639962299716, + "grad_norm": 5.8095011711120605, + "learning_rate": 3.948354908365762e-06, + "loss": 0.2273, + "num_input_tokens_seen": 4593696, + "step": 1980 + }, + { + "epoch": 1.8708765315739868, + "grad_norm": 82.38545989990234, + "learning_rate": 3.941642367997062e-06, + "loss": 0.3306, + "num_input_tokens_seen": 4604064, + "step": 1985 + }, + { + "epoch": 1.875589066918002, + "grad_norm": 13.79807186126709, + "learning_rate": 3.934914220174128e-06, + "loss": 0.2246, + "num_input_tokens_seen": 4613856, + "step": 1990 + }, + { + "epoch": 1.880301602262017, + "grad_norm": 9.43858528137207, + "learning_rate": 3.9281705377369814e-06, + "loss": 0.262, + "num_input_tokens_seen": 4624480, + "step": 1995 + }, + { + "epoch": 1.885014137606032, + "grad_norm": 0.6858423352241516, + "learning_rate": 3.921411393693823e-06, + "loss": 0.0359, + "num_input_tokens_seen": 4634720, + "step": 2000 + }, + { + "epoch": 1.8897266729500473, + "grad_norm": 12.693150520324707, + "learning_rate": 3.9146368612202425e-06, + "loss": 0.1522, + "num_input_tokens_seen": 4644320, + "step": 2005 + }, + { + "epoch": 1.8944392082940622, + "grad_norm": 0.35528820753097534, + "learning_rate": 3.907847013658429e-06, + "loss": 0.1144, + "num_input_tokens_seen": 4656672, + "step": 2010 + }, + { + "epoch": 1.8991517436380771, + "grad_norm": 0.7190976142883301, + "learning_rate": 3.901041924516372e-06, + "loss": 0.152, + "num_input_tokens_seen": 4668832, + "step": 2015 + }, + { + "epoch": 1.9038642789820923, + "grad_norm": 18.8311767578125, + "learning_rate": 3.894221667467074e-06, + "loss": 0.0683, + "num_input_tokens_seen": 4680096, + "step": 2020 + }, + { + "epoch": 1.9085768143261075, + "grad_norm": 2.0841264724731445, + "learning_rate": 3.887386316347742e-06, + "loss": 0.0966, + "num_input_tokens_seen": 4692320, + "step": 2025 + }, + { + "epoch": 1.9132893496701224, + "grad_norm": 90.08401489257812, + "learning_rate": 3.880535945158997e-06, + "loss": 0.1503, + "num_input_tokens_seen": 4709344, + "step": 2030 + }, + { + "epoch": 1.9180018850141376, + "grad_norm": 0.7957233786582947, + "learning_rate": 3.873670628064071e-06, + "loss": 0.0726, + "num_input_tokens_seen": 4721888, + "step": 2035 + }, + { + "epoch": 1.9227144203581528, + "grad_norm": 115.30460357666016, + "learning_rate": 3.866790439387998e-06, + "loss": 0.117, + "num_input_tokens_seen": 4732384, + "step": 2040 + }, + { + "epoch": 1.9274269557021677, + "grad_norm": 0.2744818925857544, + "learning_rate": 3.85989545361682e-06, + "loss": 0.2188, + "num_input_tokens_seen": 4743264, + "step": 2045 + }, + { + "epoch": 1.9321394910461829, + "grad_norm": 0.26964840292930603, + "learning_rate": 3.85298574539677e-06, + "loss": 0.1091, + "num_input_tokens_seen": 4753248, + "step": 2050 + }, + { + "epoch": 1.936852026390198, + "grad_norm": 151.45645141601562, + "learning_rate": 3.846061389533472e-06, + "loss": 0.0907, + "num_input_tokens_seen": 4764768, + "step": 2055 + }, + { + "epoch": 1.941564561734213, + "grad_norm": 72.78887939453125, + "learning_rate": 3.839122460991124e-06, + "loss": 0.2683, + "num_input_tokens_seen": 4775456, + "step": 2060 + }, + { + "epoch": 1.946277097078228, + "grad_norm": 203.98098754882812, + "learning_rate": 3.832169034891695e-06, + "loss": 0.3549, + "num_input_tokens_seen": 4789152, + "step": 2065 + }, + { + "epoch": 1.9509896324222433, + "grad_norm": 12.131155014038086, + "learning_rate": 3.825201186514103e-06, + "loss": 0.0639, + "num_input_tokens_seen": 4803488, + "step": 2070 + }, + { + "epoch": 1.9557021677662583, + "grad_norm": 8.148255348205566, + "learning_rate": 3.818218991293406e-06, + "loss": 0.2019, + "num_input_tokens_seen": 4813216, + "step": 2075 + }, + { + "epoch": 1.9604147031102732, + "grad_norm": 39.3453369140625, + "learning_rate": 3.811222524819983e-06, + "loss": 0.1943, + "num_input_tokens_seen": 4823584, + "step": 2080 + }, + { + "epoch": 1.9651272384542884, + "grad_norm": 44.195316314697266, + "learning_rate": 3.8042118628387138e-06, + "loss": 0.0531, + "num_input_tokens_seen": 4838624, + "step": 2085 + }, + { + "epoch": 1.9698397737983036, + "grad_norm": 69.47586059570312, + "learning_rate": 3.7971870812481636e-06, + "loss": 0.0121, + "num_input_tokens_seen": 4851552, + "step": 2090 + }, + { + "epoch": 1.9745523091423185, + "grad_norm": 34.5429573059082, + "learning_rate": 3.7901482560997577e-06, + "loss": 0.1929, + "num_input_tokens_seen": 4864352, + "step": 2095 + }, + { + "epoch": 1.9792648444863337, + "grad_norm": 4.417181015014648, + "learning_rate": 3.78309546359696e-06, + "loss": 0.2053, + "num_input_tokens_seen": 4875616, + "step": 2100 + }, + { + "epoch": 1.9839773798303488, + "grad_norm": 43.39990997314453, + "learning_rate": 3.776028780094446e-06, + "loss": 0.0107, + "num_input_tokens_seen": 4886560, + "step": 2105 + }, + { + "epoch": 1.9886899151743638, + "grad_norm": 31.191131591796875, + "learning_rate": 3.7689482820972797e-06, + "loss": 0.2379, + "num_input_tokens_seen": 4898592, + "step": 2110 + }, + { + "epoch": 1.993402450518379, + "grad_norm": 87.375244140625, + "learning_rate": 3.7618540462600792e-06, + "loss": 0.2504, + "num_input_tokens_seen": 4912160, + "step": 2115 + }, + { + "epoch": 1.998114985862394, + "grad_norm": 16.684934616088867, + "learning_rate": 3.7547461493861948e-06, + "loss": 0.1832, + "num_input_tokens_seen": 4923424, + "step": 2120 + }, + { + "epoch": 2.002827521206409, + "grad_norm": 0.0688318982720375, + "learning_rate": 3.7476246684268703e-06, + "loss": 0.0762, + "num_input_tokens_seen": 4932416, + "step": 2125 + }, + { + "epoch": 2.005655042412818, + "eval_loss": 0.33445462584495544, + "eval_runtime": 3.3719, + "eval_samples_per_second": 279.667, + "eval_steps_per_second": 34.996, + "num_input_tokens_seen": 4940992, + "step": 2128 + }, + { + "epoch": 2.007540056550424, + "grad_norm": 2.835258722305298, + "learning_rate": 3.740489680480415e-06, + "loss": 0.0528, + "num_input_tokens_seen": 4948288, + "step": 2130 + }, + { + "epoch": 2.0122525918944394, + "grad_norm": 0.02049732208251953, + "learning_rate": 3.733341262791366e-06, + "loss": 0.0067, + "num_input_tokens_seen": 4960512, + "step": 2135 + }, + { + "epoch": 2.0169651272384543, + "grad_norm": 0.09395653009414673, + "learning_rate": 3.7261794927496535e-06, + "loss": 0.0027, + "num_input_tokens_seen": 4972352, + "step": 2140 + }, + { + "epoch": 2.0216776625824693, + "grad_norm": 159.60162353515625, + "learning_rate": 3.719004447889762e-06, + "loss": 0.0681, + "num_input_tokens_seen": 4982272, + "step": 2145 + }, + { + "epoch": 2.0263901979264842, + "grad_norm": 0.5360152721405029, + "learning_rate": 3.7118162058898915e-06, + "loss": 0.1795, + "num_input_tokens_seen": 4993088, + "step": 2150 + }, + { + "epoch": 2.0311027332704996, + "grad_norm": 0.0288984514772892, + "learning_rate": 3.704614844571117e-06, + "loss": 0.0124, + "num_input_tokens_seen": 5003392, + "step": 2155 + }, + { + "epoch": 2.0358152686145146, + "grad_norm": 0.07737737149000168, + "learning_rate": 3.6974004418965435e-06, + "loss": 0.0007, + "num_input_tokens_seen": 5014592, + "step": 2160 + }, + { + "epoch": 2.0405278039585295, + "grad_norm": 81.7595443725586, + "learning_rate": 3.6901730759704674e-06, + "loss": 0.1943, + "num_input_tokens_seen": 5028160, + "step": 2165 + }, + { + "epoch": 2.045240339302545, + "grad_norm": 0.019410187378525734, + "learning_rate": 3.682932825037523e-06, + "loss": 0.1365, + "num_input_tokens_seen": 5037504, + "step": 2170 + }, + { + "epoch": 2.04995287464656, + "grad_norm": 15.080971717834473, + "learning_rate": 3.675679767481842e-06, + "loss": 0.0894, + "num_input_tokens_seen": 5052288, + "step": 2175 + }, + { + "epoch": 2.054665409990575, + "grad_norm": 10.959814071655273, + "learning_rate": 3.6684139818262045e-06, + "loss": 0.1397, + "num_input_tokens_seen": 5064384, + "step": 2180 + }, + { + "epoch": 2.05937794533459, + "grad_norm": 158.70689392089844, + "learning_rate": 3.6611355467311825e-06, + "loss": 0.0268, + "num_input_tokens_seen": 5074240, + "step": 2185 + }, + { + "epoch": 2.064090480678605, + "grad_norm": 0.12513531744480133, + "learning_rate": 3.653844540994298e-06, + "loss": 0.0081, + "num_input_tokens_seen": 5085312, + "step": 2190 + }, + { + "epoch": 2.06880301602262, + "grad_norm": 0.03574146702885628, + "learning_rate": 3.6465410435491603e-06, + "loss": 0.0006, + "num_input_tokens_seen": 5094592, + "step": 2195 + }, + { + "epoch": 2.0735155513666355, + "grad_norm": 0.017842473462224007, + "learning_rate": 3.6392251334646194e-06, + "loss": 0.0012, + "num_input_tokens_seen": 5108544, + "step": 2200 + }, + { + "epoch": 2.0782280867106504, + "grad_norm": 0.040509432554244995, + "learning_rate": 3.6318968899439042e-06, + "loss": 0.2164, + "num_input_tokens_seen": 5118976, + "step": 2205 + }, + { + "epoch": 2.0829406220546653, + "grad_norm": 0.03663352131843567, + "learning_rate": 3.6245563923237692e-06, + "loss": 0.0004, + "num_input_tokens_seen": 5134272, + "step": 2210 + }, + { + "epoch": 2.0876531573986803, + "grad_norm": 0.11897611618041992, + "learning_rate": 3.617203720073633e-06, + "loss": 0.0463, + "num_input_tokens_seen": 5145408, + "step": 2215 + }, + { + "epoch": 2.0923656927426957, + "grad_norm": 0.11080852895975113, + "learning_rate": 3.6098389527947164e-06, + "loss": 0.1413, + "num_input_tokens_seen": 5157440, + "step": 2220 + }, + { + "epoch": 2.0970782280867106, + "grad_norm": 0.09218670427799225, + "learning_rate": 3.6024621702191876e-06, + "loss": 0.0007, + "num_input_tokens_seen": 5170176, + "step": 2225 + }, + { + "epoch": 2.1017907634307256, + "grad_norm": 1.5784250497817993, + "learning_rate": 3.5950734522092908e-06, + "loss": 0.2877, + "num_input_tokens_seen": 5178944, + "step": 2230 + }, + { + "epoch": 2.106503298774741, + "grad_norm": 0.22626134753227234, + "learning_rate": 3.587672878756487e-06, + "loss": 0.0007, + "num_input_tokens_seen": 5190272, + "step": 2235 + }, + { + "epoch": 2.111215834118756, + "grad_norm": 0.011661054566502571, + "learning_rate": 3.5802605299805843e-06, + "loss": 0.0004, + "num_input_tokens_seen": 5202304, + "step": 2240 + }, + { + "epoch": 2.115928369462771, + "grad_norm": 129.8340301513672, + "learning_rate": 3.5728364861288743e-06, + "loss": 0.1757, + "num_input_tokens_seen": 5215808, + "step": 2245 + }, + { + "epoch": 2.1206409048067862, + "grad_norm": 0.05797062814235687, + "learning_rate": 3.5654008275752607e-06, + "loss": 0.0003, + "num_input_tokens_seen": 5229056, + "step": 2250 + }, + { + "epoch": 2.125353440150801, + "grad_norm": 0.6185352206230164, + "learning_rate": 3.557953634819389e-06, + "loss": 0.0007, + "num_input_tokens_seen": 5239616, + "step": 2255 + }, + { + "epoch": 2.130065975494816, + "grad_norm": 271.65594482421875, + "learning_rate": 3.550494988485777e-06, + "loss": 0.1511, + "num_input_tokens_seen": 5249600, + "step": 2260 + }, + { + "epoch": 2.1347785108388315, + "grad_norm": 0.7488783001899719, + "learning_rate": 3.5430249693229403e-06, + "loss": 0.2004, + "num_input_tokens_seen": 5261888, + "step": 2265 + }, + { + "epoch": 2.1394910461828465, + "grad_norm": 0.022314058616757393, + "learning_rate": 3.5355436582025184e-06, + "loss": 0.0272, + "num_input_tokens_seen": 5272768, + "step": 2270 + }, + { + "epoch": 2.1442035815268614, + "grad_norm": 0.029360728338360786, + "learning_rate": 3.5280511361183995e-06, + "loss": 0.142, + "num_input_tokens_seen": 5283520, + "step": 2275 + }, + { + "epoch": 2.1489161168708764, + "grad_norm": 0.04351954534649849, + "learning_rate": 3.5205474841858444e-06, + "loss": 0.0003, + "num_input_tokens_seen": 5294336, + "step": 2280 + }, + { + "epoch": 2.1536286522148917, + "grad_norm": 0.8838725090026855, + "learning_rate": 3.513032783640605e-06, + "loss": 0.0445, + "num_input_tokens_seen": 5304960, + "step": 2285 + }, + { + "epoch": 2.1583411875589067, + "grad_norm": 0.011690633371472359, + "learning_rate": 3.5055071158380512e-06, + "loss": 0.0002, + "num_input_tokens_seen": 5317184, + "step": 2290 + }, + { + "epoch": 2.1630537229029216, + "grad_norm": 0.16222970187664032, + "learning_rate": 3.497970562252282e-06, + "loss": 0.0003, + "num_input_tokens_seen": 5329152, + "step": 2295 + }, + { + "epoch": 2.167766258246937, + "grad_norm": 128.02944946289062, + "learning_rate": 3.4904232044752507e-06, + "loss": 0.232, + "num_input_tokens_seen": 5342016, + "step": 2300 + }, + { + "epoch": 2.172478793590952, + "grad_norm": 73.5108413696289, + "learning_rate": 3.4828651242158764e-06, + "loss": 0.1157, + "num_input_tokens_seen": 5352768, + "step": 2305 + }, + { + "epoch": 2.177191328934967, + "grad_norm": 0.029827579855918884, + "learning_rate": 3.4752964032991638e-06, + "loss": 0.1506, + "num_input_tokens_seen": 5364160, + "step": 2310 + }, + { + "epoch": 2.181903864278982, + "grad_norm": 0.14194637537002563, + "learning_rate": 3.4677171236653133e-06, + "loss": 0.1442, + "num_input_tokens_seen": 5376448, + "step": 2315 + }, + { + "epoch": 2.1866163996229973, + "grad_norm": 90.07513427734375, + "learning_rate": 3.460127367368836e-06, + "loss": 0.0562, + "num_input_tokens_seen": 5386560, + "step": 2320 + }, + { + "epoch": 2.191328934967012, + "grad_norm": 0.10351494699716568, + "learning_rate": 3.452527216577665e-06, + "loss": 0.1956, + "num_input_tokens_seen": 5399296, + "step": 2325 + }, + { + "epoch": 2.196041470311027, + "grad_norm": 0.17435222864151, + "learning_rate": 3.444916753572267e-06, + "loss": 0.1061, + "num_input_tokens_seen": 5410944, + "step": 2330 + }, + { + "epoch": 2.2007540056550425, + "grad_norm": 0.2240123301744461, + "learning_rate": 3.4372960607447493e-06, + "loss": 0.0012, + "num_input_tokens_seen": 5423168, + "step": 2335 + }, + { + "epoch": 2.2054665409990575, + "grad_norm": 0.03307259455323219, + "learning_rate": 3.429665220597968e-06, + "loss": 0.0111, + "num_input_tokens_seen": 5436544, + "step": 2340 + }, + { + "epoch": 2.2101790763430724, + "grad_norm": 0.026153933256864548, + "learning_rate": 3.4220243157446388e-06, + "loss": 0.0934, + "num_input_tokens_seen": 5448512, + "step": 2345 + }, + { + "epoch": 2.214891611687088, + "grad_norm": 150.3295440673828, + "learning_rate": 3.4143734289064363e-06, + "loss": 0.0139, + "num_input_tokens_seen": 5460032, + "step": 2350 + }, + { + "epoch": 2.2196041470311028, + "grad_norm": 0.09933875501155853, + "learning_rate": 3.4067126429131035e-06, + "loss": 0.0004, + "num_input_tokens_seen": 5472896, + "step": 2355 + }, + { + "epoch": 2.2243166823751177, + "grad_norm": 0.017140116542577744, + "learning_rate": 3.3990420407015534e-06, + "loss": 0.0005, + "num_input_tokens_seen": 5482944, + "step": 2360 + }, + { + "epoch": 2.229029217719133, + "grad_norm": 88.95214080810547, + "learning_rate": 3.3913617053149694e-06, + "loss": 0.0536, + "num_input_tokens_seen": 5494336, + "step": 2365 + }, + { + "epoch": 2.233741753063148, + "grad_norm": 0.016227245330810547, + "learning_rate": 3.3836717199019087e-06, + "loss": 0.0001, + "num_input_tokens_seen": 5505728, + "step": 2370 + }, + { + "epoch": 2.238454288407163, + "grad_norm": 0.01039363257586956, + "learning_rate": 3.3759721677154022e-06, + "loss": 0.0861, + "num_input_tokens_seen": 5515328, + "step": 2375 + }, + { + "epoch": 2.243166823751178, + "grad_norm": 0.014802216552197933, + "learning_rate": 3.3682631321120507e-06, + "loss": 0.0002, + "num_input_tokens_seen": 5525760, + "step": 2380 + }, + { + "epoch": 2.2478793590951933, + "grad_norm": 0.8376834392547607, + "learning_rate": 3.3605446965511256e-06, + "loss": 0.168, + "num_input_tokens_seen": 5537280, + "step": 2385 + }, + { + "epoch": 2.2525918944392083, + "grad_norm": 20.87982749938965, + "learning_rate": 3.3528169445936616e-06, + "loss": 0.0898, + "num_input_tokens_seen": 5548928, + "step": 2390 + }, + { + "epoch": 2.2563619227144205, + "eval_loss": 0.46465176343917847, + "eval_runtime": 2.7461, + "eval_samples_per_second": 343.401, + "eval_steps_per_second": 42.971, + "num_input_tokens_seen": 5558144, + "step": 2394 + }, + { + "epoch": 2.257304429783223, + "grad_norm": 446.71234130859375, + "learning_rate": 3.3450799599015567e-06, + "loss": 0.1847, + "num_input_tokens_seen": 5559872, + "step": 2395 + }, + { + "epoch": 2.2620169651272386, + "grad_norm": 0.04414854571223259, + "learning_rate": 3.3373338262366617e-06, + "loss": 0.0234, + "num_input_tokens_seen": 5571264, + "step": 2400 + }, + { + "epoch": 2.2667295004712535, + "grad_norm": 0.1296156644821167, + "learning_rate": 3.329578627459878e-06, + "loss": 0.0881, + "num_input_tokens_seen": 5581312, + "step": 2405 + }, + { + "epoch": 2.2714420358152685, + "grad_norm": 0.03757103905081749, + "learning_rate": 3.3218144475302444e-06, + "loss": 0.0004, + "num_input_tokens_seen": 5592384, + "step": 2410 + }, + { + "epoch": 2.276154571159284, + "grad_norm": 0.04516446590423584, + "learning_rate": 3.314041370504034e-06, + "loss": 0.1036, + "num_input_tokens_seen": 5603456, + "step": 2415 + }, + { + "epoch": 2.280867106503299, + "grad_norm": 0.09362529218196869, + "learning_rate": 3.30625948053384e-06, + "loss": 0.0579, + "num_input_tokens_seen": 5614464, + "step": 2420 + }, + { + "epoch": 2.2855796418473138, + "grad_norm": 10.87307071685791, + "learning_rate": 3.2984688618676665e-06, + "loss": 0.089, + "num_input_tokens_seen": 5626112, + "step": 2425 + }, + { + "epoch": 2.290292177191329, + "grad_norm": 0.38255080580711365, + "learning_rate": 3.2906695988480144e-06, + "loss": 0.0886, + "num_input_tokens_seen": 5637248, + "step": 2430 + }, + { + "epoch": 2.295004712535344, + "grad_norm": 35.04936599731445, + "learning_rate": 3.2828617759109715e-06, + "loss": 0.0709, + "num_input_tokens_seen": 5647552, + "step": 2435 + }, + { + "epoch": 2.299717247879359, + "grad_norm": 0.16362737119197845, + "learning_rate": 3.2750454775852956e-06, + "loss": 0.0006, + "num_input_tokens_seen": 5662080, + "step": 2440 + }, + { + "epoch": 2.304429783223374, + "grad_norm": 0.023827245458960533, + "learning_rate": 3.2672207884915017e-06, + "loss": 0.0005, + "num_input_tokens_seen": 5673856, + "step": 2445 + }, + { + "epoch": 2.3091423185673894, + "grad_norm": 55.789493560791016, + "learning_rate": 3.2593877933409436e-06, + "loss": 0.107, + "num_input_tokens_seen": 5683904, + "step": 2450 + }, + { + "epoch": 2.3138548539114043, + "grad_norm": 0.02205372042953968, + "learning_rate": 3.251546576934897e-06, + "loss": 0.0003, + "num_input_tokens_seen": 5694400, + "step": 2455 + }, + { + "epoch": 2.3185673892554193, + "grad_norm": 65.09204864501953, + "learning_rate": 3.2436972241636443e-06, + "loss": 0.1635, + "num_input_tokens_seen": 5705664, + "step": 2460 + }, + { + "epoch": 2.3232799245994347, + "grad_norm": 0.022804006934165955, + "learning_rate": 3.2358398200055515e-06, + "loss": 0.0001, + "num_input_tokens_seen": 5718848, + "step": 2465 + }, + { + "epoch": 2.3279924599434496, + "grad_norm": 0.01745908334851265, + "learning_rate": 3.227974449526152e-06, + "loss": 0.0504, + "num_input_tokens_seen": 5732096, + "step": 2470 + }, + { + "epoch": 2.3327049952874646, + "grad_norm": 91.0146713256836, + "learning_rate": 3.2201011978772224e-06, + "loss": 0.09, + "num_input_tokens_seen": 5742144, + "step": 2475 + }, + { + "epoch": 2.3374175306314795, + "grad_norm": 0.06392789632081985, + "learning_rate": 3.2122201502958635e-06, + "loss": 0.0647, + "num_input_tokens_seen": 5754176, + "step": 2480 + }, + { + "epoch": 2.342130065975495, + "grad_norm": 0.008629159070551395, + "learning_rate": 3.2043313921035747e-06, + "loss": 0.0155, + "num_input_tokens_seen": 5767104, + "step": 2485 + }, + { + "epoch": 2.34684260131951, + "grad_norm": 113.13795471191406, + "learning_rate": 3.1964350087053323e-06, + "loss": 0.3015, + "num_input_tokens_seen": 5779520, + "step": 2490 + }, + { + "epoch": 2.3515551366635252, + "grad_norm": 243.08010864257812, + "learning_rate": 3.1885310855886655e-06, + "loss": 0.0284, + "num_input_tokens_seen": 5792640, + "step": 2495 + }, + { + "epoch": 2.35626767200754, + "grad_norm": 0.029916413128376007, + "learning_rate": 3.1806197083227276e-06, + "loss": 0.0001, + "num_input_tokens_seen": 5805696, + "step": 2500 + }, + { + "epoch": 2.360980207351555, + "grad_norm": 0.012451832182705402, + "learning_rate": 3.172700962557373e-06, + "loss": 0.168, + "num_input_tokens_seen": 5819840, + "step": 2505 + }, + { + "epoch": 2.36569274269557, + "grad_norm": 0.06405292451381683, + "learning_rate": 3.1647749340222288e-06, + "loss": 0.1209, + "num_input_tokens_seen": 5830016, + "step": 2510 + }, + { + "epoch": 2.3704052780395855, + "grad_norm": 31.766504287719727, + "learning_rate": 3.1568417085257653e-06, + "loss": 0.0744, + "num_input_tokens_seen": 5840000, + "step": 2515 + }, + { + "epoch": 2.3751178133836004, + "grad_norm": 117.67131805419922, + "learning_rate": 3.1489013719543703e-06, + "loss": 0.0681, + "num_input_tokens_seen": 5849920, + "step": 2520 + }, + { + "epoch": 2.3798303487276153, + "grad_norm": 17.114727020263672, + "learning_rate": 3.140954010271416e-06, + "loss": 0.2567, + "num_input_tokens_seen": 5860480, + "step": 2525 + }, + { + "epoch": 2.3845428840716307, + "grad_norm": 0.0346570685505867, + "learning_rate": 3.132999709516329e-06, + "loss": 0.0055, + "num_input_tokens_seen": 5873408, + "step": 2530 + }, + { + "epoch": 2.3892554194156457, + "grad_norm": 3.354789972305298, + "learning_rate": 3.1250385558036606e-06, + "loss": 0.0887, + "num_input_tokens_seen": 5884608, + "step": 2535 + }, + { + "epoch": 2.3939679547596606, + "grad_norm": 46.3475227355957, + "learning_rate": 3.1170706353221525e-06, + "loss": 0.2362, + "num_input_tokens_seen": 5896064, + "step": 2540 + }, + { + "epoch": 2.3986804901036756, + "grad_norm": 0.14980660378932953, + "learning_rate": 3.109096034333805e-06, + "loss": 0.0014, + "num_input_tokens_seen": 5907776, + "step": 2545 + }, + { + "epoch": 2.403393025447691, + "grad_norm": 63.976871490478516, + "learning_rate": 3.1011148391729434e-06, + "loss": 0.0292, + "num_input_tokens_seen": 5919744, + "step": 2550 + }, + { + "epoch": 2.408105560791706, + "grad_norm": 0.936824381351471, + "learning_rate": 3.0931271362452803e-06, + "loss": 0.18, + "num_input_tokens_seen": 5932224, + "step": 2555 + }, + { + "epoch": 2.412818096135721, + "grad_norm": 0.04161603003740311, + "learning_rate": 3.085133012026985e-06, + "loss": 0.001, + "num_input_tokens_seen": 5943424, + "step": 2560 + }, + { + "epoch": 2.4175306314797362, + "grad_norm": 50.99045181274414, + "learning_rate": 3.0771325530637434e-06, + "loss": 0.1243, + "num_input_tokens_seen": 5955904, + "step": 2565 + }, + { + "epoch": 2.422243166823751, + "grad_norm": 1.0831135511398315, + "learning_rate": 3.0691258459698227e-06, + "loss": 0.0789, + "num_input_tokens_seen": 5967360, + "step": 2570 + }, + { + "epoch": 2.426955702167766, + "grad_norm": 0.2694717049598694, + "learning_rate": 3.0611129774271318e-06, + "loss": 0.1948, + "num_input_tokens_seen": 5980608, + "step": 2575 + }, + { + "epoch": 2.4316682375117815, + "grad_norm": 0.017116645351052284, + "learning_rate": 3.0530940341842883e-06, + "loss": 0.0003, + "num_input_tokens_seen": 5993472, + "step": 2580 + }, + { + "epoch": 2.4363807728557965, + "grad_norm": 0.10097761452198029, + "learning_rate": 3.045069103055672e-06, + "loss": 0.0005, + "num_input_tokens_seen": 6003520, + "step": 2585 + }, + { + "epoch": 2.4410933081998114, + "grad_norm": 2.4190433025360107, + "learning_rate": 3.037038270920489e-06, + "loss": 0.0118, + "num_input_tokens_seen": 6014720, + "step": 2590 + }, + { + "epoch": 2.445805843543827, + "grad_norm": 0.22586967051029205, + "learning_rate": 3.0290016247218323e-06, + "loss": 0.0956, + "num_input_tokens_seen": 6032192, + "step": 2595 + }, + { + "epoch": 2.4505183788878417, + "grad_norm": 0.04388771951198578, + "learning_rate": 3.0209592514657365e-06, + "loss": 0.2412, + "num_input_tokens_seen": 6043328, + "step": 2600 + }, + { + "epoch": 2.4552309142318567, + "grad_norm": 27.575590133666992, + "learning_rate": 3.012911238220241e-06, + "loss": 0.0061, + "num_input_tokens_seen": 6055424, + "step": 2605 + }, + { + "epoch": 2.4599434495758716, + "grad_norm": 0.026035049930214882, + "learning_rate": 3.004857672114443e-06, + "loss": 0.2284, + "num_input_tokens_seen": 6065472, + "step": 2610 + }, + { + "epoch": 2.464655984919887, + "grad_norm": 1.065898060798645, + "learning_rate": 2.996798640337556e-06, + "loss": 0.0007, + "num_input_tokens_seen": 6078016, + "step": 2615 + }, + { + "epoch": 2.469368520263902, + "grad_norm": 17.635618209838867, + "learning_rate": 2.9887342301379653e-06, + "loss": 0.0974, + "num_input_tokens_seen": 6089472, + "step": 2620 + }, + { + "epoch": 2.474081055607917, + "grad_norm": 9.892471313476562, + "learning_rate": 2.9806645288222854e-06, + "loss": 0.1484, + "num_input_tokens_seen": 6100992, + "step": 2625 + }, + { + "epoch": 2.4787935909519323, + "grad_norm": 0.05071339011192322, + "learning_rate": 2.9725896237544115e-06, + "loss": 0.0821, + "num_input_tokens_seen": 6112768, + "step": 2630 + }, + { + "epoch": 2.4835061262959472, + "grad_norm": 0.19504772126674652, + "learning_rate": 2.9645096023545774e-06, + "loss": 0.0017, + "num_input_tokens_seen": 6122752, + "step": 2635 + }, + { + "epoch": 2.488218661639962, + "grad_norm": 0.5724993348121643, + "learning_rate": 2.956424552098405e-06, + "loss": 0.05, + "num_input_tokens_seen": 6136256, + "step": 2640 + }, + { + "epoch": 2.492931196983977, + "grad_norm": 0.17895296216011047, + "learning_rate": 2.94833456051596e-06, + "loss": 0.0714, + "num_input_tokens_seen": 6147264, + "step": 2645 + }, + { + "epoch": 2.4976437323279925, + "grad_norm": 0.440418004989624, + "learning_rate": 2.9402397151908056e-06, + "loss": 0.0012, + "num_input_tokens_seen": 6161088, + "step": 2650 + }, + { + "epoch": 2.5023562676720075, + "grad_norm": 0.04092998430132866, + "learning_rate": 2.93214010375905e-06, + "loss": 0.0567, + "num_input_tokens_seen": 6173568, + "step": 2655 + }, + { + "epoch": 2.507068803016023, + "grad_norm": 0.02762027271091938, + "learning_rate": 2.924035813908402e-06, + "loss": 0.0692, + "num_input_tokens_seen": 6183872, + "step": 2660 + }, + { + "epoch": 2.507068803016023, + "eval_loss": 0.40977799892425537, + "eval_runtime": 2.7366, + "eval_samples_per_second": 344.588, + "eval_steps_per_second": 43.119, + "num_input_tokens_seen": 6183872, + "step": 2660 + }, + { + "epoch": 2.511781338360038, + "grad_norm": 0.12844966351985931, + "learning_rate": 2.9159269333772173e-06, + "loss": 0.0693, + "num_input_tokens_seen": 6195648, + "step": 2665 + }, + { + "epoch": 2.5164938737040528, + "grad_norm": 0.043845776468515396, + "learning_rate": 2.9078135499535535e-06, + "loss": 0.0003, + "num_input_tokens_seen": 6205696, + "step": 2670 + }, + { + "epoch": 2.5212064090480677, + "grad_norm": 0.1523384004831314, + "learning_rate": 2.8996957514742164e-06, + "loss": 0.0993, + "num_input_tokens_seen": 6219648, + "step": 2675 + }, + { + "epoch": 2.525918944392083, + "grad_norm": 0.03311268240213394, + "learning_rate": 2.891573625823808e-06, + "loss": 0.0016, + "num_input_tokens_seen": 6233664, + "step": 2680 + }, + { + "epoch": 2.530631479736098, + "grad_norm": 0.013956856913864613, + "learning_rate": 2.883447260933781e-06, + "loss": 0.0002, + "num_input_tokens_seen": 6246400, + "step": 2685 + }, + { + "epoch": 2.535344015080113, + "grad_norm": 0.00951316673308611, + "learning_rate": 2.875316744781479e-06, + "loss": 0.0776, + "num_input_tokens_seen": 6256576, + "step": 2690 + }, + { + "epoch": 2.5400565504241284, + "grad_norm": 18.4660587310791, + "learning_rate": 2.8671821653891903e-06, + "loss": 0.0909, + "num_input_tokens_seen": 6266240, + "step": 2695 + }, + { + "epoch": 2.5447690857681433, + "grad_norm": 14.168307304382324, + "learning_rate": 2.85904361082319e-06, + "loss": 0.1384, + "num_input_tokens_seen": 6279872, + "step": 2700 + }, + { + "epoch": 2.5494816211121583, + "grad_norm": 0.031402263790369034, + "learning_rate": 2.8509011691927923e-06, + "loss": 0.0001, + "num_input_tokens_seen": 6290048, + "step": 2705 + }, + { + "epoch": 2.554194156456173, + "grad_norm": 0.06130323186516762, + "learning_rate": 2.8427549286493906e-06, + "loss": 0.0368, + "num_input_tokens_seen": 6301120, + "step": 2710 + }, + { + "epoch": 2.5589066918001886, + "grad_norm": 0.13699810206890106, + "learning_rate": 2.8346049773855077e-06, + "loss": 0.1002, + "num_input_tokens_seen": 6312512, + "step": 2715 + }, + { + "epoch": 2.5636192271442035, + "grad_norm": 0.026166977360844612, + "learning_rate": 2.8264514036338385e-06, + "loss": 0.0002, + "num_input_tokens_seen": 6323776, + "step": 2720 + }, + { + "epoch": 2.568331762488219, + "grad_norm": 0.05908394604921341, + "learning_rate": 2.818294295666295e-06, + "loss": 0.0003, + "num_input_tokens_seen": 6334208, + "step": 2725 + }, + { + "epoch": 2.573044297832234, + "grad_norm": 1.3038756847381592, + "learning_rate": 2.8101337417930523e-06, + "loss": 0.0952, + "num_input_tokens_seen": 6345216, + "step": 2730 + }, + { + "epoch": 2.577756833176249, + "grad_norm": 39.35552215576172, + "learning_rate": 2.8019698303615912e-06, + "loss": 0.2239, + "num_input_tokens_seen": 6354304, + "step": 2735 + }, + { + "epoch": 2.5824693685202638, + "grad_norm": 0.07452750205993652, + "learning_rate": 2.7938026497557414e-06, + "loss": 0.0628, + "num_input_tokens_seen": 6368192, + "step": 2740 + }, + { + "epoch": 2.5871819038642787, + "grad_norm": 0.034812554717063904, + "learning_rate": 2.7856322883947253e-06, + "loss": 0.0454, + "num_input_tokens_seen": 6382400, + "step": 2745 + }, + { + "epoch": 2.591894439208294, + "grad_norm": 0.02569451369345188, + "learning_rate": 2.7774588347322016e-06, + "loss": 0.0836, + "num_input_tokens_seen": 6395584, + "step": 2750 + }, + { + "epoch": 2.596606974552309, + "grad_norm": 479.6157531738281, + "learning_rate": 2.7692823772553057e-06, + "loss": 0.1468, + "num_input_tokens_seen": 6406720, + "step": 2755 + }, + { + "epoch": 2.6013195098963244, + "grad_norm": 371.5435791015625, + "learning_rate": 2.7611030044836927e-06, + "loss": 0.1705, + "num_input_tokens_seen": 6418112, + "step": 2760 + }, + { + "epoch": 2.6060320452403394, + "grad_norm": 58.181087493896484, + "learning_rate": 2.752920804968581e-06, + "loss": 0.0602, + "num_input_tokens_seen": 6431104, + "step": 2765 + }, + { + "epoch": 2.6107445805843543, + "grad_norm": 3.1500463485717773, + "learning_rate": 2.744735867291789e-06, + "loss": 0.0038, + "num_input_tokens_seen": 6441792, + "step": 2770 + }, + { + "epoch": 2.6154571159283693, + "grad_norm": 0.12725692987442017, + "learning_rate": 2.736548280064781e-06, + "loss": 0.167, + "num_input_tokens_seen": 6452672, + "step": 2775 + }, + { + "epoch": 2.6201696512723847, + "grad_norm": 0.05792888626456261, + "learning_rate": 2.728358131927704e-06, + "loss": 0.1083, + "num_input_tokens_seen": 6465600, + "step": 2780 + }, + { + "epoch": 2.6248821866163996, + "grad_norm": 2.1484336853027344, + "learning_rate": 2.720165511548433e-06, + "loss": 0.0731, + "num_input_tokens_seen": 6477312, + "step": 2785 + }, + { + "epoch": 2.6295947219604145, + "grad_norm": 13.481400489807129, + "learning_rate": 2.711970507621603e-06, + "loss": 0.179, + "num_input_tokens_seen": 6486592, + "step": 2790 + }, + { + "epoch": 2.63430725730443, + "grad_norm": 0.2009446918964386, + "learning_rate": 2.7037732088676583e-06, + "loss": 0.0011, + "num_input_tokens_seen": 6497088, + "step": 2795 + }, + { + "epoch": 2.639019792648445, + "grad_norm": 0.11857722699642181, + "learning_rate": 2.6955737040318853e-06, + "loss": 0.0035, + "num_input_tokens_seen": 6505984, + "step": 2800 + }, + { + "epoch": 2.64373232799246, + "grad_norm": 0.26333293318748474, + "learning_rate": 2.687372081883454e-06, + "loss": 0.0009, + "num_input_tokens_seen": 6516928, + "step": 2805 + }, + { + "epoch": 2.6484448633364748, + "grad_norm": 0.025169173255562782, + "learning_rate": 2.6791684312144565e-06, + "loss": 0.0096, + "num_input_tokens_seen": 6527424, + "step": 2810 + }, + { + "epoch": 2.65315739868049, + "grad_norm": 0.04603775218129158, + "learning_rate": 2.670962840838946e-06, + "loss": 0.0955, + "num_input_tokens_seen": 6538432, + "step": 2815 + }, + { + "epoch": 2.657869934024505, + "grad_norm": 116.4062271118164, + "learning_rate": 2.6627553995919763e-06, + "loss": 0.0341, + "num_input_tokens_seen": 6551552, + "step": 2820 + }, + { + "epoch": 2.6625824693685205, + "grad_norm": 0.07408913224935532, + "learning_rate": 2.6545461963286374e-06, + "loss": 0.0005, + "num_input_tokens_seen": 6566208, + "step": 2825 + }, + { + "epoch": 2.6672950047125354, + "grad_norm": 67.10398864746094, + "learning_rate": 2.646335319923097e-06, + "loss": 0.1887, + "num_input_tokens_seen": 6577472, + "step": 2830 + }, + { + "epoch": 2.6720075400565504, + "grad_norm": 0.03928399085998535, + "learning_rate": 2.6381228592676343e-06, + "loss": 0.1243, + "num_input_tokens_seen": 6588608, + "step": 2835 + }, + { + "epoch": 2.6767200754005653, + "grad_norm": 0.28905507922172546, + "learning_rate": 2.629908903271683e-06, + "loss": 0.1048, + "num_input_tokens_seen": 6601088, + "step": 2840 + }, + { + "epoch": 2.6814326107445807, + "grad_norm": 0.06444710493087769, + "learning_rate": 2.6216935408608617e-06, + "loss": 0.0005, + "num_input_tokens_seen": 6611392, + "step": 2845 + }, + { + "epoch": 2.6861451460885957, + "grad_norm": 14.772208213806152, + "learning_rate": 2.6134768609760187e-06, + "loss": 0.001, + "num_input_tokens_seen": 6622656, + "step": 2850 + }, + { + "epoch": 2.6908576814326106, + "grad_norm": 222.56637573242188, + "learning_rate": 2.605258952572263e-06, + "loss": 0.0916, + "num_input_tokens_seen": 6635264, + "step": 2855 + }, + { + "epoch": 2.695570216776626, + "grad_norm": 38.73012924194336, + "learning_rate": 2.5970399046180043e-06, + "loss": 0.0028, + "num_input_tokens_seen": 6647680, + "step": 2860 + }, + { + "epoch": 2.700282752120641, + "grad_norm": 0.014610473066568375, + "learning_rate": 2.588819806093991e-06, + "loss": 0.0001, + "num_input_tokens_seen": 6662016, + "step": 2865 + }, + { + "epoch": 2.704995287464656, + "grad_norm": 0.013276712968945503, + "learning_rate": 2.580598745992342e-06, + "loss": 0.1805, + "num_input_tokens_seen": 6673024, + "step": 2870 + }, + { + "epoch": 2.709707822808671, + "grad_norm": 0.011643171310424805, + "learning_rate": 2.5723768133155894e-06, + "loss": 0.0001, + "num_input_tokens_seen": 6684416, + "step": 2875 + }, + { + "epoch": 2.7144203581526862, + "grad_norm": 0.04866794869303703, + "learning_rate": 2.5641540970757105e-06, + "loss": 0.0783, + "num_input_tokens_seen": 6696448, + "step": 2880 + }, + { + "epoch": 2.719132893496701, + "grad_norm": 0.007833253592252731, + "learning_rate": 2.555930686293165e-06, + "loss": 0.0002, + "num_input_tokens_seen": 6710528, + "step": 2885 + }, + { + "epoch": 2.7238454288407166, + "grad_norm": 0.03470620512962341, + "learning_rate": 2.547706669995933e-06, + "loss": 0.0004, + "num_input_tokens_seen": 6722176, + "step": 2890 + }, + { + "epoch": 2.7285579641847315, + "grad_norm": 31.088220596313477, + "learning_rate": 2.53948213721855e-06, + "loss": 0.1775, + "num_input_tokens_seen": 6732416, + "step": 2895 + }, + { + "epoch": 2.7332704995287465, + "grad_norm": 0.12457949668169022, + "learning_rate": 2.531257177001141e-06, + "loss": 0.1137, + "num_input_tokens_seen": 6745728, + "step": 2900 + }, + { + "epoch": 2.7379830348727614, + "grad_norm": 11.16380786895752, + "learning_rate": 2.523031878388463e-06, + "loss": 0.0956, + "num_input_tokens_seen": 6756096, + "step": 2905 + }, + { + "epoch": 2.742695570216777, + "grad_norm": 1.0219601392745972, + "learning_rate": 2.5148063304289306e-06, + "loss": 0.063, + "num_input_tokens_seen": 6766976, + "step": 2910 + }, + { + "epoch": 2.7474081055607917, + "grad_norm": 0.03135580196976662, + "learning_rate": 2.5065806221736617e-06, + "loss": 0.1039, + "num_input_tokens_seen": 6777792, + "step": 2915 + }, + { + "epoch": 2.7521206409048067, + "grad_norm": 0.06795566529035568, + "learning_rate": 2.4983548426755104e-06, + "loss": 0.0003, + "num_input_tokens_seen": 6789568, + "step": 2920 + }, + { + "epoch": 2.756833176248822, + "grad_norm": 0.024466486647725105, + "learning_rate": 2.4901290809880984e-06, + "loss": 0.227, + "num_input_tokens_seen": 6803392, + "step": 2925 + }, + { + "epoch": 2.757775683317625, + "eval_loss": 0.43027257919311523, + "eval_runtime": 2.7751, + "eval_samples_per_second": 339.806, + "eval_steps_per_second": 42.521, + "num_input_tokens_seen": 6806208, + "step": 2926 + }, + { + "epoch": 2.761545711592837, + "grad_norm": 0.08801653981208801, + "learning_rate": 2.4819034261648574e-06, + "loss": 0.0645, + "num_input_tokens_seen": 6821760, + "step": 2930 + }, + { + "epoch": 2.766258246936852, + "grad_norm": 0.02651871182024479, + "learning_rate": 2.4736779672580625e-06, + "loss": 0.2084, + "num_input_tokens_seen": 6834688, + "step": 2935 + }, + { + "epoch": 2.770970782280867, + "grad_norm": 0.9018564224243164, + "learning_rate": 2.465452793317865e-06, + "loss": 0.0731, + "num_input_tokens_seen": 6846784, + "step": 2940 + }, + { + "epoch": 2.7756833176248823, + "grad_norm": 0.10618780553340912, + "learning_rate": 2.457227993391333e-06, + "loss": 0.0866, + "num_input_tokens_seen": 6859520, + "step": 2945 + }, + { + "epoch": 2.7803958529688972, + "grad_norm": 0.10207852721214294, + "learning_rate": 2.4490036565214876e-06, + "loss": 0.0008, + "num_input_tokens_seen": 6871296, + "step": 2950 + }, + { + "epoch": 2.785108388312912, + "grad_norm": 125.38693237304688, + "learning_rate": 2.440779871746331e-06, + "loss": 0.0151, + "num_input_tokens_seen": 6882496, + "step": 2955 + }, + { + "epoch": 2.7898209236569276, + "grad_norm": 0.05238529294729233, + "learning_rate": 2.4325567280978937e-06, + "loss": 0.0708, + "num_input_tokens_seen": 6894528, + "step": 2960 + }, + { + "epoch": 2.7945334590009425, + "grad_norm": 56.63581848144531, + "learning_rate": 2.424334314601263e-06, + "loss": 0.1738, + "num_input_tokens_seen": 6904960, + "step": 2965 + }, + { + "epoch": 2.7992459943449575, + "grad_norm": 0.020289117470383644, + "learning_rate": 2.416112720273623e-06, + "loss": 0.155, + "num_input_tokens_seen": 6914944, + "step": 2970 + }, + { + "epoch": 2.8039585296889724, + "grad_norm": 0.2439601868391037, + "learning_rate": 2.4078920341232856e-06, + "loss": 0.0006, + "num_input_tokens_seen": 6926080, + "step": 2975 + }, + { + "epoch": 2.808671065032988, + "grad_norm": 22.042722702026367, + "learning_rate": 2.3996723451487344e-06, + "loss": 0.0028, + "num_input_tokens_seen": 6936832, + "step": 2980 + }, + { + "epoch": 2.8133836003770027, + "grad_norm": 0.06483375281095505, + "learning_rate": 2.391453742337657e-06, + "loss": 0.2284, + "num_input_tokens_seen": 6948160, + "step": 2985 + }, + { + "epoch": 2.818096135721018, + "grad_norm": 0.019181005656719208, + "learning_rate": 2.3832363146659806e-06, + "loss": 0.0003, + "num_input_tokens_seen": 6958848, + "step": 2990 + }, + { + "epoch": 2.822808671065033, + "grad_norm": 265.2681579589844, + "learning_rate": 2.37502015109691e-06, + "loss": 0.1133, + "num_input_tokens_seen": 6970432, + "step": 2995 + }, + { + "epoch": 2.827521206409048, + "grad_norm": 0.018625818192958832, + "learning_rate": 2.3668053405799667e-06, + "loss": 0.0691, + "num_input_tokens_seen": 6980480, + "step": 3000 + }, + { + "epoch": 2.832233741753063, + "grad_norm": 241.9073486328125, + "learning_rate": 2.3585919720500214e-06, + "loss": 0.0368, + "num_input_tokens_seen": 6989760, + "step": 3005 + }, + { + "epoch": 2.8369462770970784, + "grad_norm": 0.006676084361970425, + "learning_rate": 2.3503801344263347e-06, + "loss": 0.093, + "num_input_tokens_seen": 6999232, + "step": 3010 + }, + { + "epoch": 2.8416588124410933, + "grad_norm": 97.66014862060547, + "learning_rate": 2.3421699166115946e-06, + "loss": 0.2148, + "num_input_tokens_seen": 7010944, + "step": 3015 + }, + { + "epoch": 2.8463713477851083, + "grad_norm": 0.0063159572891891, + "learning_rate": 2.3339614074909495e-06, + "loss": 0.1475, + "num_input_tokens_seen": 7021824, + "step": 3020 + }, + { + "epoch": 2.8510838831291236, + "grad_norm": 0.022584695369005203, + "learning_rate": 2.325754695931054e-06, + "loss": 0.1085, + "num_input_tokens_seen": 7031488, + "step": 3025 + }, + { + "epoch": 2.8557964184731386, + "grad_norm": 34.398460388183594, + "learning_rate": 2.3175498707790964e-06, + "loss": 0.0536, + "num_input_tokens_seen": 7041088, + "step": 3030 + }, + { + "epoch": 2.8605089538171535, + "grad_norm": 48.400482177734375, + "learning_rate": 2.3093470208618467e-06, + "loss": 0.1759, + "num_input_tokens_seen": 7051840, + "step": 3035 + }, + { + "epoch": 2.8652214891611685, + "grad_norm": 0.9638648629188538, + "learning_rate": 2.3011462349846907e-06, + "loss": 0.0005, + "num_input_tokens_seen": 7062848, + "step": 3040 + }, + { + "epoch": 2.869934024505184, + "grad_norm": 2.5327022075653076, + "learning_rate": 2.292947601930664e-06, + "loss": 0.0006, + "num_input_tokens_seen": 7079296, + "step": 3045 + }, + { + "epoch": 2.874646559849199, + "grad_norm": 27.3743896484375, + "learning_rate": 2.2847512104595005e-06, + "loss": 0.1614, + "num_input_tokens_seen": 7090752, + "step": 3050 + }, + { + "epoch": 2.879359095193214, + "grad_norm": 0.02387884445488453, + "learning_rate": 2.2765571493066647e-06, + "loss": 0.0003, + "num_input_tokens_seen": 7102464, + "step": 3055 + }, + { + "epoch": 2.884071630537229, + "grad_norm": 0.011423971503973007, + "learning_rate": 2.2683655071823925e-06, + "loss": 0.038, + "num_input_tokens_seen": 7117376, + "step": 3060 + }, + { + "epoch": 2.888784165881244, + "grad_norm": 0.048735495656728745, + "learning_rate": 2.2601763727707295e-06, + "loss": 0.0809, + "num_input_tokens_seen": 7131584, + "step": 3065 + }, + { + "epoch": 2.893496701225259, + "grad_norm": 0.3432343602180481, + "learning_rate": 2.2519898347285745e-06, + "loss": 0.1831, + "num_input_tokens_seen": 7142720, + "step": 3070 + }, + { + "epoch": 2.8982092365692744, + "grad_norm": 48.5273323059082, + "learning_rate": 2.2438059816847165e-06, + "loss": 0.1239, + "num_input_tokens_seen": 7155520, + "step": 3075 + }, + { + "epoch": 2.9029217719132894, + "grad_norm": 0.019747210666537285, + "learning_rate": 2.235624902238879e-06, + "loss": 0.0753, + "num_input_tokens_seen": 7165504, + "step": 3080 + }, + { + "epoch": 2.9076343072573043, + "grad_norm": 87.9238052368164, + "learning_rate": 2.2274466849607526e-06, + "loss": 0.118, + "num_input_tokens_seen": 7176384, + "step": 3085 + }, + { + "epoch": 2.9123468426013197, + "grad_norm": 0.024710891768336296, + "learning_rate": 2.219271418389046e-06, + "loss": 0.0012, + "num_input_tokens_seen": 7188288, + "step": 3090 + }, + { + "epoch": 2.9170593779453347, + "grad_norm": 48.7574577331543, + "learning_rate": 2.2110991910305233e-06, + "loss": 0.1523, + "num_input_tokens_seen": 7199680, + "step": 3095 + }, + { + "epoch": 2.9217719132893496, + "grad_norm": 24.760589599609375, + "learning_rate": 2.2029300913590413e-06, + "loss": 0.0548, + "num_input_tokens_seen": 7211520, + "step": 3100 + }, + { + "epoch": 2.9264844486333645, + "grad_norm": 0.13239729404449463, + "learning_rate": 2.1947642078146005e-06, + "loss": 0.0932, + "num_input_tokens_seen": 7221440, + "step": 3105 + }, + { + "epoch": 2.93119698397738, + "grad_norm": 0.09687553346157074, + "learning_rate": 2.1866016288023815e-06, + "loss": 0.0528, + "num_input_tokens_seen": 7232128, + "step": 3110 + }, + { + "epoch": 2.935909519321395, + "grad_norm": 23.3001651763916, + "learning_rate": 2.178442442691789e-06, + "loss": 0.1414, + "num_input_tokens_seen": 7241984, + "step": 3115 + }, + { + "epoch": 2.9406220546654103, + "grad_norm": 0.11602967977523804, + "learning_rate": 2.170286737815495e-06, + "loss": 0.0745, + "num_input_tokens_seen": 7252672, + "step": 3120 + }, + { + "epoch": 2.945334590009425, + "grad_norm": 0.5076259970664978, + "learning_rate": 2.1621346024684854e-06, + "loss": 0.0453, + "num_input_tokens_seen": 7264064, + "step": 3125 + }, + { + "epoch": 2.95004712535344, + "grad_norm": 2.5609045028686523, + "learning_rate": 2.1539861249071004e-06, + "loss": 0.0268, + "num_input_tokens_seen": 7275776, + "step": 3130 + }, + { + "epoch": 2.954759660697455, + "grad_norm": 1.9134023189544678, + "learning_rate": 2.145841393348079e-06, + "loss": 0.0361, + "num_input_tokens_seen": 7287680, + "step": 3135 + }, + { + "epoch": 2.95947219604147, + "grad_norm": 0.09775304049253464, + "learning_rate": 2.1377004959676086e-06, + "loss": 0.001, + "num_input_tokens_seen": 7300032, + "step": 3140 + }, + { + "epoch": 2.9641847313854854, + "grad_norm": 1.4209673404693604, + "learning_rate": 2.129563520900364e-06, + "loss": 0.0632, + "num_input_tokens_seen": 7311616, + "step": 3145 + }, + { + "epoch": 2.9688972667295004, + "grad_norm": 0.10756014287471771, + "learning_rate": 2.1214305562385592e-06, + "loss": 0.1604, + "num_input_tokens_seen": 7321600, + "step": 3150 + }, + { + "epoch": 2.9736098020735158, + "grad_norm": 0.026215003803372383, + "learning_rate": 2.1133016900309876e-06, + "loss": 0.0003, + "num_input_tokens_seen": 7333376, + "step": 3155 + }, + { + "epoch": 2.9783223374175307, + "grad_norm": 0.01626676134765148, + "learning_rate": 2.1051770102820755e-06, + "loss": 0.0002, + "num_input_tokens_seen": 7344384, + "step": 3160 + }, + { + "epoch": 2.9830348727615457, + "grad_norm": 0.2678651511669159, + "learning_rate": 2.0970566049509236e-06, + "loss": 0.0799, + "num_input_tokens_seen": 7355840, + "step": 3165 + }, + { + "epoch": 2.9877474081055606, + "grad_norm": 0.012307991273701191, + "learning_rate": 2.088940561950359e-06, + "loss": 0.0002, + "num_input_tokens_seen": 7368128, + "step": 3170 + }, + { + "epoch": 2.992459943449576, + "grad_norm": 0.03959092125296593, + "learning_rate": 2.080828969145979e-06, + "loss": 0.1426, + "num_input_tokens_seen": 7381056, + "step": 3175 + }, + { + "epoch": 2.997172478793591, + "grad_norm": 16.04501724243164, + "learning_rate": 2.0727219143552034e-06, + "loss": 0.094, + "num_input_tokens_seen": 7393536, + "step": 3180 + }, + { + "epoch": 3.001885014137606, + "grad_norm": 0.013006187044084072, + "learning_rate": 2.0646194853463255e-06, + "loss": 0.0923, + "num_input_tokens_seen": 7402656, + "step": 3185 + }, + { + "epoch": 3.0065975494816213, + "grad_norm": 0.05889980494976044, + "learning_rate": 2.056521769837553e-06, + "loss": 0.0004, + "num_input_tokens_seen": 7416480, + "step": 3190 + }, + { + "epoch": 3.008482563619227, + "eval_loss": 0.3936729431152344, + "eval_runtime": 2.7505, + "eval_samples_per_second": 342.849, + "eval_steps_per_second": 42.902, + "num_input_tokens_seen": 7421856, + "step": 3192 + }, + { + "epoch": 3.0113100848256362, + "grad_norm": 0.11007480323314667, + "learning_rate": 2.0484288554960707e-06, + "loss": 0.0003, + "num_input_tokens_seen": 7430304, + "step": 3195 + }, + { + "epoch": 3.016022620169651, + "grad_norm": 0.023877175524830818, + "learning_rate": 2.040340829937082e-06, + "loss": 0.052, + "num_input_tokens_seen": 7441568, + "step": 3200 + }, + { + "epoch": 3.0207351555136666, + "grad_norm": 0.016011981293559074, + "learning_rate": 2.032257780722865e-06, + "loss": 0.0003, + "num_input_tokens_seen": 7451744, + "step": 3205 + }, + { + "epoch": 3.0254476908576815, + "grad_norm": 0.02110173925757408, + "learning_rate": 2.0241797953618204e-06, + "loss": 0.0002, + "num_input_tokens_seen": 7463008, + "step": 3210 + }, + { + "epoch": 3.0301602262016964, + "grad_norm": 0.018056534230709076, + "learning_rate": 2.0161069613075295e-06, + "loss": 0.0001, + "num_input_tokens_seen": 7475424, + "step": 3215 + }, + { + "epoch": 3.0348727615457114, + "grad_norm": 0.018598072230815887, + "learning_rate": 2.008039365957804e-06, + "loss": 0.0002, + "num_input_tokens_seen": 7486368, + "step": 3220 + }, + { + "epoch": 3.039585296889727, + "grad_norm": 2.553637981414795, + "learning_rate": 1.9999770966537416e-06, + "loss": 0.0005, + "num_input_tokens_seen": 7497312, + "step": 3225 + }, + { + "epoch": 3.0442978322337417, + "grad_norm": 81.77398681640625, + "learning_rate": 1.991920240678776e-06, + "loss": 0.0457, + "num_input_tokens_seen": 7507552, + "step": 3230 + }, + { + "epoch": 3.0490103675777567, + "grad_norm": 0.0028707189485430717, + "learning_rate": 1.983868885257739e-06, + "loss": 0.0001, + "num_input_tokens_seen": 7519008, + "step": 3235 + }, + { + "epoch": 3.053722902921772, + "grad_norm": 25.919538497924805, + "learning_rate": 1.97582311755591e-06, + "loss": 0.0908, + "num_input_tokens_seen": 7530400, + "step": 3240 + }, + { + "epoch": 3.058435438265787, + "grad_norm": 0.009589405730366707, + "learning_rate": 1.9677830246780764e-06, + "loss": 0.0002, + "num_input_tokens_seen": 7544096, + "step": 3245 + }, + { + "epoch": 3.063147973609802, + "grad_norm": 0.04477664828300476, + "learning_rate": 1.9597486936675886e-06, + "loss": 0.0044, + "num_input_tokens_seen": 7554784, + "step": 3250 + }, + { + "epoch": 3.0678605089538173, + "grad_norm": 0.0675143375992775, + "learning_rate": 1.9517202115054174e-06, + "loss": 0.0001, + "num_input_tokens_seen": 7567392, + "step": 3255 + }, + { + "epoch": 3.0725730442978323, + "grad_norm": 0.010941299609839916, + "learning_rate": 1.9436976651092143e-06, + "loss": 0.0001, + "num_input_tokens_seen": 7578016, + "step": 3260 + }, + { + "epoch": 3.0772855796418472, + "grad_norm": 0.00475684879347682, + "learning_rate": 1.9356811413323686e-06, + "loss": 0.0689, + "num_input_tokens_seen": 7589728, + "step": 3265 + }, + { + "epoch": 3.081998114985862, + "grad_norm": 0.009347557090222836, + "learning_rate": 1.9276707269630664e-06, + "loss": 0.0006, + "num_input_tokens_seen": 7601184, + "step": 3270 + }, + { + "epoch": 3.0867106503298776, + "grad_norm": 0.02251746505498886, + "learning_rate": 1.9196665087233548e-06, + "loss": 0.0001, + "num_input_tokens_seen": 7612128, + "step": 3275 + }, + { + "epoch": 3.0914231856738925, + "grad_norm": 0.006476939655840397, + "learning_rate": 1.9116685732681995e-06, + "loss": 0.0004, + "num_input_tokens_seen": 7623776, + "step": 3280 + }, + { + "epoch": 3.0961357210179075, + "grad_norm": 0.008666305802762508, + "learning_rate": 1.9036770071845467e-06, + "loss": 0.0001, + "num_input_tokens_seen": 7636128, + "step": 3285 + }, + { + "epoch": 3.100848256361923, + "grad_norm": 0.07765082269906998, + "learning_rate": 1.8956918969903881e-06, + "loss": 0.0002, + "num_input_tokens_seen": 7646432, + "step": 3290 + }, + { + "epoch": 3.105560791705938, + "grad_norm": 0.006188265047967434, + "learning_rate": 1.887713329133824e-06, + "loss": 0.0, + "num_input_tokens_seen": 7657824, + "step": 3295 + }, + { + "epoch": 3.1102733270499527, + "grad_norm": 28.873966217041016, + "learning_rate": 1.8797413899921224e-06, + "loss": 0.0829, + "num_input_tokens_seen": 7669920, + "step": 3300 + }, + { + "epoch": 3.114985862393968, + "grad_norm": 34.75840759277344, + "learning_rate": 1.8717761658707916e-06, + "loss": 0.0054, + "num_input_tokens_seen": 7681952, + "step": 3305 + }, + { + "epoch": 3.119698397737983, + "grad_norm": 0.005098584573715925, + "learning_rate": 1.86381774300264e-06, + "loss": 0.0, + "num_input_tokens_seen": 7692832, + "step": 3310 + }, + { + "epoch": 3.124410933081998, + "grad_norm": 0.012214220128953457, + "learning_rate": 1.8558662075468468e-06, + "loss": 0.1029, + "num_input_tokens_seen": 7703072, + "step": 3315 + }, + { + "epoch": 3.1291234684260134, + "grad_norm": 0.05773286893963814, + "learning_rate": 1.8479216455880225e-06, + "loss": 0.0, + "num_input_tokens_seen": 7714016, + "step": 3320 + }, + { + "epoch": 3.1338360037700284, + "grad_norm": 0.0058107743971049786, + "learning_rate": 1.8399841431352855e-06, + "loss": 0.0002, + "num_input_tokens_seen": 7726688, + "step": 3325 + }, + { + "epoch": 3.1385485391140433, + "grad_norm": 0.11002416163682938, + "learning_rate": 1.8320537861213267e-06, + "loss": 0.0001, + "num_input_tokens_seen": 7739680, + "step": 3330 + }, + { + "epoch": 3.1432610744580582, + "grad_norm": 0.04805764928460121, + "learning_rate": 1.8241306604014761e-06, + "loss": 0.0001, + "num_input_tokens_seen": 7749024, + "step": 3335 + }, + { + "epoch": 3.1479736098020736, + "grad_norm": 0.001311355852521956, + "learning_rate": 1.816214851752779e-06, + "loss": 0.0008, + "num_input_tokens_seen": 7761568, + "step": 3340 + }, + { + "epoch": 3.1526861451460886, + "grad_norm": 0.013387499377131462, + "learning_rate": 1.8083064458730651e-06, + "loss": 0.0001, + "num_input_tokens_seen": 7772640, + "step": 3345 + }, + { + "epoch": 3.1573986804901035, + "grad_norm": 0.017248639836907387, + "learning_rate": 1.8004055283800204e-06, + "loss": 0.0004, + "num_input_tokens_seen": 7784672, + "step": 3350 + }, + { + "epoch": 3.162111215834119, + "grad_norm": 0.0028156836051493883, + "learning_rate": 1.7925121848102583e-06, + "loss": 0.0, + "num_input_tokens_seen": 7795872, + "step": 3355 + }, + { + "epoch": 3.166823751178134, + "grad_norm": 0.005071389954537153, + "learning_rate": 1.7846265006183976e-06, + "loss": 0.0, + "num_input_tokens_seen": 7808416, + "step": 3360 + }, + { + "epoch": 3.171536286522149, + "grad_norm": 0.013269501738250256, + "learning_rate": 1.776748561176137e-06, + "loss": 0.0, + "num_input_tokens_seen": 7820640, + "step": 3365 + }, + { + "epoch": 3.176248821866164, + "grad_norm": 0.010563570074737072, + "learning_rate": 1.7688784517713247e-06, + "loss": 0.0, + "num_input_tokens_seen": 7831072, + "step": 3370 + }, + { + "epoch": 3.180961357210179, + "grad_norm": 29.668603897094727, + "learning_rate": 1.761016257607044e-06, + "loss": 0.0969, + "num_input_tokens_seen": 7841888, + "step": 3375 + }, + { + "epoch": 3.185673892554194, + "grad_norm": 0.07287805527448654, + "learning_rate": 1.7531620638006834e-06, + "loss": 0.0488, + "num_input_tokens_seen": 7852896, + "step": 3380 + }, + { + "epoch": 3.190386427898209, + "grad_norm": 27.53697395324707, + "learning_rate": 1.7453159553830217e-06, + "loss": 0.0013, + "num_input_tokens_seen": 7868384, + "step": 3385 + }, + { + "epoch": 3.1950989632422244, + "grad_norm": 0.006890235003083944, + "learning_rate": 1.7374780172973004e-06, + "loss": 0.0001, + "num_input_tokens_seen": 7881312, + "step": 3390 + }, + { + "epoch": 3.1998114985862394, + "grad_norm": 0.0019902060739696026, + "learning_rate": 1.7296483343983095e-06, + "loss": 0.0564, + "num_input_tokens_seen": 7892128, + "step": 3395 + }, + { + "epoch": 3.2045240339302543, + "grad_norm": 0.020912524312734604, + "learning_rate": 1.7218269914514668e-06, + "loss": 0.0002, + "num_input_tokens_seen": 7902624, + "step": 3400 + }, + { + "epoch": 3.2092365692742697, + "grad_norm": 0.007507129572331905, + "learning_rate": 1.714014073131901e-06, + "loss": 0.0001, + "num_input_tokens_seen": 7915168, + "step": 3405 + }, + { + "epoch": 3.2139491046182846, + "grad_norm": 0.053086619824171066, + "learning_rate": 1.7062096640235327e-06, + "loss": 0.0002, + "num_input_tokens_seen": 7925472, + "step": 3410 + }, + { + "epoch": 3.2186616399622996, + "grad_norm": 0.1808883100748062, + "learning_rate": 1.6984138486181612e-06, + "loss": 0.0001, + "num_input_tokens_seen": 7940576, + "step": 3415 + }, + { + "epoch": 3.223374175306315, + "grad_norm": 0.015256045386195183, + "learning_rate": 1.6906267113145514e-06, + "loss": 0.0323, + "num_input_tokens_seen": 7956064, + "step": 3420 + }, + { + "epoch": 3.22808671065033, + "grad_norm": 0.022556964308023453, + "learning_rate": 1.6828483364175127e-06, + "loss": 0.0, + "num_input_tokens_seen": 7967264, + "step": 3425 + }, + { + "epoch": 3.232799245994345, + "grad_norm": 1.4760725498199463, + "learning_rate": 1.6750788081369951e-06, + "loss": 0.0003, + "num_input_tokens_seen": 7978144, + "step": 3430 + }, + { + "epoch": 3.23751178133836, + "grad_norm": 18.443159103393555, + "learning_rate": 1.6673182105871733e-06, + "loss": 0.0443, + "num_input_tokens_seen": 7989152, + "step": 3435 + }, + { + "epoch": 3.242224316682375, + "grad_norm": 0.005763629917055368, + "learning_rate": 1.659566627785536e-06, + "loss": 0.0, + "num_input_tokens_seen": 8000800, + "step": 3440 + }, + { + "epoch": 3.24693685202639, + "grad_norm": 0.0024256331380456686, + "learning_rate": 1.651824143651975e-06, + "loss": 0.0004, + "num_input_tokens_seen": 8014816, + "step": 3445 + }, + { + "epoch": 3.251649387370405, + "grad_norm": 0.003130377735942602, + "learning_rate": 1.644090842007881e-06, + "loss": 0.0, + "num_input_tokens_seen": 8025120, + "step": 3450 + }, + { + "epoch": 3.2563619227144205, + "grad_norm": 0.02095922827720642, + "learning_rate": 1.6363668065752336e-06, + "loss": 0.0, + "num_input_tokens_seen": 8037344, + "step": 3455 + }, + { + "epoch": 3.2591894439208295, + "eval_loss": 0.5191035270690918, + "eval_runtime": 2.7496, + "eval_samples_per_second": 342.955, + "eval_steps_per_second": 42.915, + "num_input_tokens_seen": 8043744, + "step": 3458 + }, + { + "epoch": 3.2610744580584354, + "grad_norm": 0.0036951308138668537, + "learning_rate": 1.6286521209756917e-06, + "loss": 0.0875, + "num_input_tokens_seen": 8048096, + "step": 3460 + }, + { + "epoch": 3.2657869934024504, + "grad_norm": 0.004653128329664469, + "learning_rate": 1.6209468687296947e-06, + "loss": 0.0, + "num_input_tokens_seen": 8061344, + "step": 3465 + }, + { + "epoch": 3.2704995287464658, + "grad_norm": 0.004204562399536371, + "learning_rate": 1.613251133255554e-06, + "loss": 0.0, + "num_input_tokens_seen": 8073184, + "step": 3470 + }, + { + "epoch": 3.2752120640904807, + "grad_norm": 0.005397483240813017, + "learning_rate": 1.6055649978685517e-06, + "loss": 0.0, + "num_input_tokens_seen": 8082976, + "step": 3475 + }, + { + "epoch": 3.2799245994344957, + "grad_norm": 0.01232027355581522, + "learning_rate": 1.5978885457800348e-06, + "loss": 0.0, + "num_input_tokens_seen": 8094624, + "step": 3480 + }, + { + "epoch": 3.284637134778511, + "grad_norm": 57.04811096191406, + "learning_rate": 1.59022186009652e-06, + "loss": 0.0843, + "num_input_tokens_seen": 8104928, + "step": 3485 + }, + { + "epoch": 3.289349670122526, + "grad_norm": 0.009285739623010159, + "learning_rate": 1.5825650238187918e-06, + "loss": 0.0, + "num_input_tokens_seen": 8116896, + "step": 3490 + }, + { + "epoch": 3.294062205466541, + "grad_norm": 0.009216835722327232, + "learning_rate": 1.5749181198410014e-06, + "loss": 0.0875, + "num_input_tokens_seen": 8127968, + "step": 3495 + }, + { + "epoch": 3.298774740810556, + "grad_norm": 0.00624003866687417, + "learning_rate": 1.5672812309497722e-06, + "loss": 0.0326, + "num_input_tokens_seen": 8139936, + "step": 3500 + }, + { + "epoch": 3.3034872761545713, + "grad_norm": 0.014653928577899933, + "learning_rate": 1.5596544398233028e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8151392, + "step": 3505 + }, + { + "epoch": 3.308199811498586, + "grad_norm": 0.00760252121835947, + "learning_rate": 1.5520378290304723e-06, + "loss": 0.0, + "num_input_tokens_seen": 8165280, + "step": 3510 + }, + { + "epoch": 3.312912346842601, + "grad_norm": 0.037401266396045685, + "learning_rate": 1.544431481029944e-06, + "loss": 0.0, + "num_input_tokens_seen": 8177696, + "step": 3515 + }, + { + "epoch": 3.3176248821866166, + "grad_norm": 0.004210233688354492, + "learning_rate": 1.5368354781692764e-06, + "loss": 0.0, + "num_input_tokens_seen": 8189280, + "step": 3520 + }, + { + "epoch": 3.3223374175306315, + "grad_norm": 0.08442122489213943, + "learning_rate": 1.5292499026840292e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8202784, + "step": 3525 + }, + { + "epoch": 3.3270499528746464, + "grad_norm": 0.0204016100615263, + "learning_rate": 1.5216748366968743e-06, + "loss": 0.1032, + "num_input_tokens_seen": 8216032, + "step": 3530 + }, + { + "epoch": 3.331762488218662, + "grad_norm": 0.002625198569148779, + "learning_rate": 1.5141103622167042e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8228320, + "step": 3535 + }, + { + "epoch": 3.336475023562677, + "grad_norm": 0.00422442564740777, + "learning_rate": 1.5065565611377472e-06, + "loss": 0.0487, + "num_input_tokens_seen": 8240416, + "step": 3540 + }, + { + "epoch": 3.3411875589066917, + "grad_norm": 0.001890690764412284, + "learning_rate": 1.4990135152386814e-06, + "loss": 0.0, + "num_input_tokens_seen": 8252640, + "step": 3545 + }, + { + "epoch": 3.345900094250707, + "grad_norm": 0.04529090225696564, + "learning_rate": 1.4914813061817434e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8261984, + "step": 3550 + }, + { + "epoch": 3.350612629594722, + "grad_norm": 0.004377726465463638, + "learning_rate": 1.4839600155118525e-06, + "loss": 0.0036, + "num_input_tokens_seen": 8273568, + "step": 3555 + }, + { + "epoch": 3.355325164938737, + "grad_norm": 0.003906300291419029, + "learning_rate": 1.4764497246557214e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8285472, + "step": 3560 + }, + { + "epoch": 3.360037700282752, + "grad_norm": 5.829066753387451, + "learning_rate": 1.4689505149209788e-06, + "loss": 0.0008, + "num_input_tokens_seen": 8294816, + "step": 3565 + }, + { + "epoch": 3.3647502356267673, + "grad_norm": 0.004101856611669064, + "learning_rate": 1.4614624674952843e-06, + "loss": 0.0, + "num_input_tokens_seen": 8305504, + "step": 3570 + }, + { + "epoch": 3.3694627709707823, + "grad_norm": 0.002960493555292487, + "learning_rate": 1.4539856634454558e-06, + "loss": 0.0518, + "num_input_tokens_seen": 8316320, + "step": 3575 + }, + { + "epoch": 3.3741753063147972, + "grad_norm": 0.004375193268060684, + "learning_rate": 1.4465201837165876e-06, + "loss": 0.0384, + "num_input_tokens_seen": 8327200, + "step": 3580 + }, + { + "epoch": 3.3788878416588126, + "grad_norm": 0.0019584076944738626, + "learning_rate": 1.4390661091311742e-06, + "loss": 0.0, + "num_input_tokens_seen": 8339488, + "step": 3585 + }, + { + "epoch": 3.3836003770028276, + "grad_norm": 0.12523356080055237, + "learning_rate": 1.4316235203882373e-06, + "loss": 0.0642, + "num_input_tokens_seen": 8353120, + "step": 3590 + }, + { + "epoch": 3.3883129123468425, + "grad_norm": 0.002279081614688039, + "learning_rate": 1.4241924980624485e-06, + "loss": 0.0, + "num_input_tokens_seen": 8364768, + "step": 3595 + }, + { + "epoch": 3.3930254476908575, + "grad_norm": 7.863749027252197, + "learning_rate": 1.4167731226032656e-06, + "loss": 0.0029, + "num_input_tokens_seen": 8376480, + "step": 3600 + }, + { + "epoch": 3.397737983034873, + "grad_norm": 0.0011470771860331297, + "learning_rate": 1.4093654743340462e-06, + "loss": 0.0122, + "num_input_tokens_seen": 8386784, + "step": 3605 + }, + { + "epoch": 3.402450518378888, + "grad_norm": 0.001517343451268971, + "learning_rate": 1.4019696334511962e-06, + "loss": 0.0, + "num_input_tokens_seen": 8397984, + "step": 3610 + }, + { + "epoch": 3.4071630537229027, + "grad_norm": 0.005868109408766031, + "learning_rate": 1.3945856800232874e-06, + "loss": 0.0, + "num_input_tokens_seen": 8408544, + "step": 3615 + }, + { + "epoch": 3.411875589066918, + "grad_norm": 0.0029909429140388966, + "learning_rate": 1.3872136939902004e-06, + "loss": 0.0, + "num_input_tokens_seen": 8419552, + "step": 3620 + }, + { + "epoch": 3.416588124410933, + "grad_norm": 0.4216376841068268, + "learning_rate": 1.379853755162249e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8429664, + "step": 3625 + }, + { + "epoch": 3.421300659754948, + "grad_norm": 0.0006833241204731166, + "learning_rate": 1.3725059432193278e-06, + "loss": 0.0, + "num_input_tokens_seen": 8441376, + "step": 3630 + }, + { + "epoch": 3.4260131950989634, + "grad_norm": 0.0014978962717577815, + "learning_rate": 1.3651703377100406e-06, + "loss": 0.0, + "num_input_tokens_seen": 8452896, + "step": 3635 + }, + { + "epoch": 3.4307257304429783, + "grad_norm": 0.0020364606752991676, + "learning_rate": 1.3578470180508432e-06, + "loss": 0.0, + "num_input_tokens_seen": 8463328, + "step": 3640 + }, + { + "epoch": 3.4354382657869933, + "grad_norm": 0.003118757624179125, + "learning_rate": 1.3505360635251813e-06, + "loss": 0.0, + "num_input_tokens_seen": 8475808, + "step": 3645 + }, + { + "epoch": 3.4401508011310087, + "grad_norm": 0.002336150733754039, + "learning_rate": 1.3432375532826374e-06, + "loss": 0.0122, + "num_input_tokens_seen": 8487456, + "step": 3650 + }, + { + "epoch": 3.4448633364750236, + "grad_norm": 0.04974250867962837, + "learning_rate": 1.3359515663380668e-06, + "loss": 0.0, + "num_input_tokens_seen": 8503712, + "step": 3655 + }, + { + "epoch": 3.4495758718190386, + "grad_norm": 0.069328673183918, + "learning_rate": 1.3286781815707465e-06, + "loss": 0.2188, + "num_input_tokens_seen": 8514848, + "step": 3660 + }, + { + "epoch": 3.4542884071630535, + "grad_norm": 0.0022926589008420706, + "learning_rate": 1.3214174777235192e-06, + "loss": 0.0985, + "num_input_tokens_seen": 8524960, + "step": 3665 + }, + { + "epoch": 3.459000942507069, + "grad_norm": 0.03560859337449074, + "learning_rate": 1.3141695334019453e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8535520, + "step": 3670 + }, + { + "epoch": 3.463713477851084, + "grad_norm": 0.013610146008431911, + "learning_rate": 1.3069344270734452e-06, + "loss": 0.0023, + "num_input_tokens_seen": 8544864, + "step": 3675 + }, + { + "epoch": 3.468426013195099, + "grad_norm": 0.0055809905752539635, + "learning_rate": 1.2997122370664538e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8556960, + "step": 3680 + }, + { + "epoch": 3.473138548539114, + "grad_norm": 0.011014264076948166, + "learning_rate": 1.2925030415695727e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8567968, + "step": 3685 + }, + { + "epoch": 3.477851083883129, + "grad_norm": 30.655986785888672, + "learning_rate": 1.285306918630722e-06, + "loss": 0.0595, + "num_input_tokens_seen": 8581920, + "step": 3690 + }, + { + "epoch": 3.482563619227144, + "grad_norm": 0.047711387276649475, + "learning_rate": 1.2781239461562966e-06, + "loss": 0.0442, + "num_input_tokens_seen": 8594720, + "step": 3695 + }, + { + "epoch": 3.4872761545711595, + "grad_norm": 0.00917474739253521, + "learning_rate": 1.2709542019103211e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8606560, + "step": 3700 + }, + { + "epoch": 3.4919886899151744, + "grad_norm": 0.016317633911967278, + "learning_rate": 1.2637977635136123e-06, + "loss": 0.0017, + "num_input_tokens_seen": 8618208, + "step": 3705 + }, + { + "epoch": 3.4967012252591894, + "grad_norm": 0.028710726648569107, + "learning_rate": 1.2566547084429326e-06, + "loss": 0.0089, + "num_input_tokens_seen": 8631584, + "step": 3710 + }, + { + "epoch": 3.5014137606032048, + "grad_norm": 0.0347968190908432, + "learning_rate": 1.2495251140301553e-06, + "loss": 0.0338, + "num_input_tokens_seen": 8642912, + "step": 3715 + }, + { + "epoch": 3.5061262959472197, + "grad_norm": 0.01212374772876501, + "learning_rate": 1.2424090574614262e-06, + "loss": 0.0002, + "num_input_tokens_seen": 8652384, + "step": 3720 + }, + { + "epoch": 3.5098963242224315, + "eval_loss": 0.4635506868362427, + "eval_runtime": 2.7946, + "eval_samples_per_second": 337.431, + "eval_steps_per_second": 42.224, + "num_input_tokens_seen": 8660768, + "step": 3724 + }, + { + "epoch": 3.5108388312912346, + "grad_norm": 0.008465130813419819, + "learning_rate": 1.2353066157763305e-06, + "loss": 0.0008, + "num_input_tokens_seen": 8662624, + "step": 3725 + }, + { + "epoch": 3.5155513666352496, + "grad_norm": 0.02242710441350937, + "learning_rate": 1.2282178658670514e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8672864, + "step": 3730 + }, + { + "epoch": 3.520263901979265, + "grad_norm": 0.004590487107634544, + "learning_rate": 1.221142884477548e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8684448, + "step": 3735 + }, + { + "epoch": 3.52497643732328, + "grad_norm": 0.0026052999310195446, + "learning_rate": 1.2140817482027155e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8698336, + "step": 3740 + }, + { + "epoch": 3.529688972667295, + "grad_norm": 0.002777635119855404, + "learning_rate": 1.207034533487564e-06, + "loss": 0.0, + "num_input_tokens_seen": 8711072, + "step": 3745 + }, + { + "epoch": 3.5344015080113103, + "grad_norm": 0.004557525273412466, + "learning_rate": 1.2000013166263803e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8723872, + "step": 3750 + }, + { + "epoch": 3.539114043355325, + "grad_norm": 10.647988319396973, + "learning_rate": 1.1929821737619132e-06, + "loss": 0.0013, + "num_input_tokens_seen": 8735776, + "step": 3755 + }, + { + "epoch": 3.54382657869934, + "grad_norm": 0.0055831498466432095, + "learning_rate": 1.1859771808845417e-06, + "loss": 0.0, + "num_input_tokens_seen": 8752736, + "step": 3760 + }, + { + "epoch": 3.548539114043355, + "grad_norm": 0.007252705283463001, + "learning_rate": 1.1789864138314577e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8766688, + "step": 3765 + }, + { + "epoch": 3.5532516493873705, + "grad_norm": 0.031091537326574326, + "learning_rate": 1.1720099482858364e-06, + "loss": 0.0, + "num_input_tokens_seen": 8781536, + "step": 3770 + }, + { + "epoch": 3.5579641847313854, + "grad_norm": 0.0167153999209404, + "learning_rate": 1.1650478597760284e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8792224, + "step": 3775 + }, + { + "epoch": 3.562676720075401, + "grad_norm": 0.002952584996819496, + "learning_rate": 1.158100223674733e-06, + "loss": 0.0704, + "num_input_tokens_seen": 8803168, + "step": 3780 + }, + { + "epoch": 3.5673892554194158, + "grad_norm": 0.003907319158315659, + "learning_rate": 1.1511671151981861e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8813536, + "step": 3785 + }, + { + "epoch": 3.5721017907634307, + "grad_norm": 0.004613762255758047, + "learning_rate": 1.1442486094053445e-06, + "loss": 0.0, + "num_input_tokens_seen": 8823840, + "step": 3790 + }, + { + "epoch": 3.5768143261074457, + "grad_norm": 0.008234544657170773, + "learning_rate": 1.1373447811970762e-06, + "loss": 0.0, + "num_input_tokens_seen": 8836576, + "step": 3795 + }, + { + "epoch": 3.581526861451461, + "grad_norm": 0.004143028054386377, + "learning_rate": 1.130455705315345e-06, + "loss": 0.0, + "num_input_tokens_seen": 8849824, + "step": 3800 + }, + { + "epoch": 3.586239396795476, + "grad_norm": 0.010429292917251587, + "learning_rate": 1.1235814563424046e-06, + "loss": 0.1829, + "num_input_tokens_seen": 8860448, + "step": 3805 + }, + { + "epoch": 3.590951932139491, + "grad_norm": 0.005767806898802519, + "learning_rate": 1.1167221086999897e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8871776, + "step": 3810 + }, + { + "epoch": 3.5956644674835063, + "grad_norm": 4.553272724151611, + "learning_rate": 1.10987773664851e-06, + "loss": 0.0006, + "num_input_tokens_seen": 8885728, + "step": 3815 + }, + { + "epoch": 3.6003770028275213, + "grad_norm": 0.002544153481721878, + "learning_rate": 1.1030484142862511e-06, + "loss": 0.0, + "num_input_tokens_seen": 8895904, + "step": 3820 + }, + { + "epoch": 3.605089538171536, + "grad_norm": 0.5232880711555481, + "learning_rate": 1.0962342155485613e-06, + "loss": 0.0006, + "num_input_tokens_seen": 8907808, + "step": 3825 + }, + { + "epoch": 3.609802073515551, + "grad_norm": 0.0035159303806722164, + "learning_rate": 1.0894352142070652e-06, + "loss": 0.0, + "num_input_tokens_seen": 8918432, + "step": 3830 + }, + { + "epoch": 3.6145146088595665, + "grad_norm": 21.874401092529297, + "learning_rate": 1.0826514838688533e-06, + "loss": 0.072, + "num_input_tokens_seen": 8929248, + "step": 3835 + }, + { + "epoch": 3.6192271442035815, + "grad_norm": 0.03307318687438965, + "learning_rate": 1.075883097975691e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8940384, + "step": 3840 + }, + { + "epoch": 3.623939679547597, + "grad_norm": 0.0033383166883140802, + "learning_rate": 1.0691301298032218e-06, + "loss": 0.0, + "num_input_tokens_seen": 8950816, + "step": 3845 + }, + { + "epoch": 3.628652214891612, + "grad_norm": 0.004945589695125818, + "learning_rate": 1.0623926524601771e-06, + "loss": 0.0001, + "num_input_tokens_seen": 8963296, + "step": 3850 + }, + { + "epoch": 3.6333647502356268, + "grad_norm": 0.014219646342098713, + "learning_rate": 1.0556707388875786e-06, + "loss": 0.0, + "num_input_tokens_seen": 8974624, + "step": 3855 + }, + { + "epoch": 3.6380772855796417, + "grad_norm": 13.508224487304688, + "learning_rate": 1.048964461857954e-06, + "loss": 0.0596, + "num_input_tokens_seen": 8985952, + "step": 3860 + }, + { + "epoch": 3.6427898209236567, + "grad_norm": 0.022754942998290062, + "learning_rate": 1.0422738939745453e-06, + "loss": 0.0002, + "num_input_tokens_seen": 8996064, + "step": 3865 + }, + { + "epoch": 3.647502356267672, + "grad_norm": 0.023511681705713272, + "learning_rate": 1.035599107670529e-06, + "loss": 0.0002, + "num_input_tokens_seen": 9006368, + "step": 3870 + }, + { + "epoch": 3.652214891611687, + "grad_norm": 0.01913359761238098, + "learning_rate": 1.0289401752082214e-06, + "loss": 0.0001, + "num_input_tokens_seen": 9018272, + "step": 3875 + }, + { + "epoch": 3.6569274269557024, + "grad_norm": 0.03516068682074547, + "learning_rate": 1.0222971686783089e-06, + "loss": 0.1112, + "num_input_tokens_seen": 9029472, + "step": 3880 + }, + { + "epoch": 3.6616399622997173, + "grad_norm": 0.10616306960582733, + "learning_rate": 1.0156701599990562e-06, + "loss": 0.0001, + "num_input_tokens_seen": 9041824, + "step": 3885 + }, + { + "epoch": 3.6663524976437323, + "grad_norm": 0.007794396486133337, + "learning_rate": 1.0090592209155373e-06, + "loss": 0.0381, + "num_input_tokens_seen": 9054752, + "step": 3890 + }, + { + "epoch": 3.6710650329877472, + "grad_norm": 2.6590635776519775, + "learning_rate": 1.0024644229988484e-06, + "loss": 0.002, + "num_input_tokens_seen": 9064928, + "step": 3895 + }, + { + "epoch": 3.6757775683317626, + "grad_norm": 0.02115059085190296, + "learning_rate": 9.95885837645344e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9076256, + "step": 3900 + }, + { + "epoch": 3.6804901036757776, + "grad_norm": 0.035742953419685364, + "learning_rate": 9.893235360758565e-07, + "loss": 0.0954, + "num_input_tokens_seen": 9086624, + "step": 3905 + }, + { + "epoch": 3.6852026390197925, + "grad_norm": 0.007046286016702652, + "learning_rate": 9.827775893349273e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9097824, + "step": 3910 + }, + { + "epoch": 3.689915174363808, + "grad_norm": 379.1387939453125, + "learning_rate": 9.762480682900374e-07, + "loss": 0.0323, + "num_input_tokens_seen": 9107296, + "step": 3915 + }, + { + "epoch": 3.694627709707823, + "grad_norm": 18.87242889404297, + "learning_rate": 9.697350436308428e-07, + "loss": 0.0039, + "num_input_tokens_seen": 9119008, + "step": 3920 + }, + { + "epoch": 3.699340245051838, + "grad_norm": 0.0049491687677800655, + "learning_rate": 9.63238585868405e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9131296, + "step": 3925 + }, + { + "epoch": 3.7040527803958527, + "grad_norm": 0.01330646499991417, + "learning_rate": 9.567587653344295e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9141664, + "step": 3930 + }, + { + "epoch": 3.708765315739868, + "grad_norm": 0.01977146603167057, + "learning_rate": 9.502956521805054e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9151328, + "step": 3935 + }, + { + "epoch": 3.713477851083883, + "grad_norm": 0.08915020525455475, + "learning_rate": 9.438493163773433e-07, + "loss": 0.0002, + "num_input_tokens_seen": 9164192, + "step": 3940 + }, + { + "epoch": 3.7181903864278985, + "grad_norm": 0.004182067699730396, + "learning_rate": 9.374198277140237e-07, + "loss": 0.0003, + "num_input_tokens_seen": 9176544, + "step": 3945 + }, + { + "epoch": 3.7229029217719134, + "grad_norm": 0.028023963794112206, + "learning_rate": 9.310072557972305e-07, + "loss": 0.0162, + "num_input_tokens_seen": 9188512, + "step": 3950 + }, + { + "epoch": 3.7276154571159283, + "grad_norm": 0.003678038949146867, + "learning_rate": 9.246116700505109e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9200992, + "step": 3955 + }, + { + "epoch": 3.7323279924599433, + "grad_norm": 0.004443558864295483, + "learning_rate": 9.18233139713513e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9211168, + "step": 3960 + }, + { + "epoch": 3.7370405278039587, + "grad_norm": 0.012858624570071697, + "learning_rate": 9.118717338412414e-07, + "loss": 0.0, + "num_input_tokens_seen": 9223456, + "step": 3965 + }, + { + "epoch": 3.7417530631479736, + "grad_norm": 0.037973713129758835, + "learning_rate": 9.055275213033077e-07, + "loss": 0.0002, + "num_input_tokens_seen": 9233632, + "step": 3970 + }, + { + "epoch": 3.7464655984919886, + "grad_norm": 49.70805358886719, + "learning_rate": 8.992005707831877e-07, + "loss": 0.0751, + "num_input_tokens_seen": 9243296, + "step": 3975 + }, + { + "epoch": 3.751178133836004, + "grad_norm": 0.0022637660149484873, + "learning_rate": 8.928909507774741e-07, + "loss": 0.0002, + "num_input_tokens_seen": 9259424, + "step": 3980 + }, + { + "epoch": 3.755890669180019, + "grad_norm": 0.009885657578706741, + "learning_rate": 8.86598729595137e-07, + "loss": 0.0, + "num_input_tokens_seen": 9271840, + "step": 3985 + }, + { + "epoch": 3.760603204524034, + "grad_norm": 0.010455531068146229, + "learning_rate": 8.80323975356783e-07, + "loss": 0.0, + "num_input_tokens_seen": 9286304, + "step": 3990 + }, + { + "epoch": 3.760603204524034, + "eval_loss": 0.5201095938682556, + "eval_runtime": 2.7914, + "eval_samples_per_second": 337.823, + "eval_steps_per_second": 42.273, + "num_input_tokens_seen": 9286304, + "step": 3990 + }, + { + "epoch": 3.765315739868049, + "grad_norm": 3.8269801139831543, + "learning_rate": 8.740667559939217e-07, + "loss": 0.0004, + "num_input_tokens_seen": 9297056, + "step": 3995 + }, + { + "epoch": 3.770028275212064, + "grad_norm": 0.010600591078400612, + "learning_rate": 8.678271392482243e-07, + "loss": 0.0, + "num_input_tokens_seen": 9307872, + "step": 4000 + }, + { + "epoch": 3.774740810556079, + "grad_norm": 0.00742421904578805, + "learning_rate": 8.616051926707941e-07, + "loss": 0.0, + "num_input_tokens_seen": 9318816, + "step": 4005 + }, + { + "epoch": 3.7794533459000945, + "grad_norm": 0.005168728996068239, + "learning_rate": 8.554009836214345e-07, + "loss": 0.0308, + "num_input_tokens_seen": 9331232, + "step": 4010 + }, + { + "epoch": 3.7841658812441095, + "grad_norm": 0.002414106857031584, + "learning_rate": 8.49214579267921e-07, + "loss": 0.0657, + "num_input_tokens_seen": 9342112, + "step": 4015 + }, + { + "epoch": 3.7888784165881244, + "grad_norm": 0.05582628399133682, + "learning_rate": 8.430460465852683e-07, + "loss": 0.0, + "num_input_tokens_seen": 9355872, + "step": 4020 + }, + { + "epoch": 3.7935909519321394, + "grad_norm": 0.0030152511317282915, + "learning_rate": 8.368954523550146e-07, + "loss": 0.0, + "num_input_tokens_seen": 9367008, + "step": 4025 + }, + { + "epoch": 3.7983034872761543, + "grad_norm": 0.008660019375383854, + "learning_rate": 8.307628631644904e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9380000, + "step": 4030 + }, + { + "epoch": 3.8030160226201697, + "grad_norm": 0.012062069959938526, + "learning_rate": 8.246483454061016e-07, + "loss": 0.0, + "num_input_tokens_seen": 9390368, + "step": 4035 + }, + { + "epoch": 3.8077285579641846, + "grad_norm": 0.1445380598306656, + "learning_rate": 8.185519652766091e-07, + "loss": 0.0829, + "num_input_tokens_seen": 9401952, + "step": 4040 + }, + { + "epoch": 3.8124410933082, + "grad_norm": 0.017705656588077545, + "learning_rate": 8.124737887764148e-07, + "loss": 0.0, + "num_input_tokens_seen": 9413536, + "step": 4045 + }, + { + "epoch": 3.817153628652215, + "grad_norm": 13.497117042541504, + "learning_rate": 8.064138817088429e-07, + "loss": 0.09, + "num_input_tokens_seen": 9424864, + "step": 4050 + }, + { + "epoch": 3.82186616399623, + "grad_norm": 0.0018770827446132898, + "learning_rate": 8.003723096794314e-07, + "loss": 0.0, + "num_input_tokens_seen": 9437152, + "step": 4055 + }, + { + "epoch": 3.826578699340245, + "grad_norm": 0.005844899918884039, + "learning_rate": 7.94349138095219e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9448032, + "step": 4060 + }, + { + "epoch": 3.8312912346842602, + "grad_norm": 0.07622718065977097, + "learning_rate": 7.883444321640383e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9459424, + "step": 4065 + }, + { + "epoch": 3.836003770028275, + "grad_norm": 0.00536764832213521, + "learning_rate": 7.82358256893812e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9469536, + "step": 4070 + }, + { + "epoch": 3.84071630537229, + "grad_norm": 0.0012705104891210794, + "learning_rate": 7.763906770918428e-07, + "loss": 0.0, + "num_input_tokens_seen": 9482976, + "step": 4075 + }, + { + "epoch": 3.8454288407163055, + "grad_norm": 0.008968825452029705, + "learning_rate": 7.704417573641196e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9492704, + "step": 4080 + }, + { + "epoch": 3.8501413760603205, + "grad_norm": 0.00470514502376318, + "learning_rate": 7.645115621146116e-07, + "loss": 0.0, + "num_input_tokens_seen": 9504864, + "step": 4085 + }, + { + "epoch": 3.8548539114043354, + "grad_norm": 0.04696199670433998, + "learning_rate": 7.586001555445773e-07, + "loss": 0.1079, + "num_input_tokens_seen": 9515424, + "step": 4090 + }, + { + "epoch": 3.8595664467483504, + "grad_norm": 0.0076852161437273026, + "learning_rate": 7.527076016518603e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9525792, + "step": 4095 + }, + { + "epoch": 3.8642789820923658, + "grad_norm": 0.04222777113318443, + "learning_rate": 7.468339642302077e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9536416, + "step": 4100 + }, + { + "epoch": 3.8689915174363807, + "grad_norm": 0.024708032608032227, + "learning_rate": 7.409793068685709e-07, + "loss": 0.0722, + "num_input_tokens_seen": 9550880, + "step": 4105 + }, + { + "epoch": 3.873704052780396, + "grad_norm": 0.007110555190593004, + "learning_rate": 7.351436929504203e-07, + "loss": 0.0, + "num_input_tokens_seen": 9564768, + "step": 4110 + }, + { + "epoch": 3.878416588124411, + "grad_norm": 0.013784021139144897, + "learning_rate": 7.293271856530585e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9575776, + "step": 4115 + }, + { + "epoch": 3.883129123468426, + "grad_norm": 0.0009743549744598567, + "learning_rate": 7.235298479469391e-07, + "loss": 0.0323, + "num_input_tokens_seen": 9588192, + "step": 4120 + }, + { + "epoch": 3.887841658812441, + "grad_norm": 0.006835806183516979, + "learning_rate": 7.177517425949801e-07, + "loss": 0.0, + "num_input_tokens_seen": 9598432, + "step": 4125 + }, + { + "epoch": 3.8925541941564563, + "grad_norm": 0.07676160335540771, + "learning_rate": 7.119929321518876e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9613920, + "step": 4130 + }, + { + "epoch": 3.8972667295004713, + "grad_norm": 0.0023403996601700783, + "learning_rate": 7.062534789634772e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9624864, + "step": 4135 + }, + { + "epoch": 3.901979264844486, + "grad_norm": 0.002742171287536621, + "learning_rate": 7.005334451660034e-07, + "loss": 0.0004, + "num_input_tokens_seen": 9635232, + "step": 4140 + }, + { + "epoch": 3.9066918001885016, + "grad_norm": 0.012737921439111233, + "learning_rate": 6.948328926854767e-07, + "loss": 0.0, + "num_input_tokens_seen": 9648544, + "step": 4145 + }, + { + "epoch": 3.9114043355325165, + "grad_norm": 0.05681464448571205, + "learning_rate": 6.891518832370059e-07, + "loss": 0.0074, + "num_input_tokens_seen": 9659424, + "step": 4150 + }, + { + "epoch": 3.9161168708765315, + "grad_norm": 0.034557852894067764, + "learning_rate": 6.834904783241198e-07, + "loss": 0.0, + "num_input_tokens_seen": 9669920, + "step": 4155 + }, + { + "epoch": 3.9208294062205464, + "grad_norm": 0.608920693397522, + "learning_rate": 6.778487392381089e-07, + "loss": 0.0002, + "num_input_tokens_seen": 9681376, + "step": 4160 + }, + { + "epoch": 3.925541941564562, + "grad_norm": 0.0012145772343501449, + "learning_rate": 6.722267270573529e-07, + "loss": 0.0, + "num_input_tokens_seen": 9691552, + "step": 4165 + }, + { + "epoch": 3.9302544769085768, + "grad_norm": 0.017469940707087517, + "learning_rate": 6.666245026466708e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9704288, + "step": 4170 + }, + { + "epoch": 3.934967012252592, + "grad_norm": 0.0016305310418829322, + "learning_rate": 6.61042126656652e-07, + "loss": 0.0595, + "num_input_tokens_seen": 9713952, + "step": 4175 + }, + { + "epoch": 3.939679547596607, + "grad_norm": 26.813234329223633, + "learning_rate": 6.554796595230051e-07, + "loss": 0.0642, + "num_input_tokens_seen": 9724576, + "step": 4180 + }, + { + "epoch": 3.944392082940622, + "grad_norm": 0.0023128024768084288, + "learning_rate": 6.499371614659019e-07, + "loss": 0.0002, + "num_input_tokens_seen": 9735392, + "step": 4185 + }, + { + "epoch": 3.949104618284637, + "grad_norm": 0.00788215734064579, + "learning_rate": 6.444146924893252e-07, + "loss": 0.0766, + "num_input_tokens_seen": 9745888, + "step": 4190 + }, + { + "epoch": 3.9538171536286524, + "grad_norm": 0.0028776442632079124, + "learning_rate": 6.389123123804217e-07, + "loss": 0.111, + "num_input_tokens_seen": 9755104, + "step": 4195 + }, + { + "epoch": 3.9585296889726673, + "grad_norm": 0.0523090660572052, + "learning_rate": 6.334300807088509e-07, + "loss": 0.0003, + "num_input_tokens_seen": 9766944, + "step": 4200 + }, + { + "epoch": 3.9632422243166823, + "grad_norm": 0.008871479891240597, + "learning_rate": 6.279680568261423e-07, + "loss": 0.0782, + "num_input_tokens_seen": 9778336, + "step": 4205 + }, + { + "epoch": 3.9679547596606977, + "grad_norm": 0.22362910211086273, + "learning_rate": 6.225262998650525e-07, + "loss": 0.0004, + "num_input_tokens_seen": 9789088, + "step": 4210 + }, + { + "epoch": 3.9726672950047126, + "grad_norm": 1.362607479095459, + "learning_rate": 6.171048687389273e-07, + "loss": 0.0003, + "num_input_tokens_seen": 9799392, + "step": 4215 + }, + { + "epoch": 3.9773798303487276, + "grad_norm": 0.0025137532502412796, + "learning_rate": 6.117038221410568e-07, + "loss": 0.0, + "num_input_tokens_seen": 9811360, + "step": 4220 + }, + { + "epoch": 3.9820923656927425, + "grad_norm": 0.11764784902334213, + "learning_rate": 6.063232185440507e-07, + "loss": 0.1016, + "num_input_tokens_seen": 9824160, + "step": 4225 + }, + { + "epoch": 3.986804901036758, + "grad_norm": 0.011414180509746075, + "learning_rate": 6.009631161991958e-07, + "loss": 0.0007, + "num_input_tokens_seen": 9834784, + "step": 4230 + }, + { + "epoch": 3.991517436380773, + "grad_norm": 0.004349572584033012, + "learning_rate": 5.956235731358298e-07, + "loss": 0.0, + "num_input_tokens_seen": 9845920, + "step": 4235 + }, + { + "epoch": 3.9962299717247878, + "grad_norm": 0.0038933674804866314, + "learning_rate": 5.903046471607121e-07, + "loss": 0.0, + "num_input_tokens_seen": 9858208, + "step": 4240 + }, + { + "epoch": 4.000942507068803, + "grad_norm": 0.014259828254580498, + "learning_rate": 5.850063958573993e-07, + "loss": 0.032, + "num_input_tokens_seen": 9868192, + "step": 4245 + }, + { + "epoch": 4.005655042412818, + "grad_norm": 0.002472294494509697, + "learning_rate": 5.797288765856196e-07, + "loss": 0.0, + "num_input_tokens_seen": 9882784, + "step": 4250 + }, + { + "epoch": 4.010367577756833, + "grad_norm": 0.17401158809661865, + "learning_rate": 5.74472146480653e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9892448, + "step": 4255 + }, + { + "epoch": 4.011310084825636, + "eval_loss": 0.5145591497421265, + "eval_runtime": 2.7443, + "eval_samples_per_second": 343.621, + "eval_steps_per_second": 42.998, + "num_input_tokens_seen": 9894624, + "step": 4256 + }, + { + "epoch": 4.015080113100848, + "grad_norm": 0.0075249760411679745, + "learning_rate": 5.692362624527117e-07, + "loss": 0.0, + "num_input_tokens_seen": 9905376, + "step": 4260 + }, + { + "epoch": 4.019792648444863, + "grad_norm": 0.07316409796476364, + "learning_rate": 5.640212811863277e-07, + "loss": 0.0, + "num_input_tokens_seen": 9915616, + "step": 4265 + }, + { + "epoch": 4.024505183788879, + "grad_norm": 0.007110144477337599, + "learning_rate": 5.588272591397337e-07, + "loss": 0.0, + "num_input_tokens_seen": 9928288, + "step": 4270 + }, + { + "epoch": 4.029217719132894, + "grad_norm": 0.002868575043976307, + "learning_rate": 5.536542525442554e-07, + "loss": 0.0001, + "num_input_tokens_seen": 9939232, + "step": 4275 + }, + { + "epoch": 4.033930254476909, + "grad_norm": 0.004913229029625654, + "learning_rate": 5.485023174037005e-07, + "loss": 0.0, + "num_input_tokens_seen": 9950688, + "step": 4280 + }, + { + "epoch": 4.038642789820924, + "grad_norm": 0.0007955088512971997, + "learning_rate": 5.433715094937575e-07, + "loss": 0.0, + "num_input_tokens_seen": 9961824, + "step": 4285 + }, + { + "epoch": 4.043355325164939, + "grad_norm": 0.002959765959531069, + "learning_rate": 5.382618843613827e-07, + "loss": 0.0, + "num_input_tokens_seen": 9974560, + "step": 4290 + }, + { + "epoch": 4.0480678605089535, + "grad_norm": 0.0037353690713644028, + "learning_rate": 5.331734973242089e-07, + "loss": 0.0, + "num_input_tokens_seen": 9987040, + "step": 4295 + }, + { + "epoch": 4.0527803958529685, + "grad_norm": 0.001677420805208385, + "learning_rate": 5.28106403469939e-07, + "loss": 0.0, + "num_input_tokens_seen": 10002400, + "step": 4300 + }, + { + "epoch": 4.057492931196984, + "grad_norm": 0.024959493428468704, + "learning_rate": 5.23060657655754e-07, + "loss": 0.0, + "num_input_tokens_seen": 10012448, + "step": 4305 + }, + { + "epoch": 4.062205466540999, + "grad_norm": 0.002381374826654792, + "learning_rate": 5.180363145077164e-07, + "loss": 0.0001, + "num_input_tokens_seen": 10023392, + "step": 4310 + }, + { + "epoch": 4.066918001885014, + "grad_norm": 0.0020008094143122435, + "learning_rate": 5.130334284201799e-07, + "loss": 0.0002, + "num_input_tokens_seen": 10034528, + "step": 4315 + }, + { + "epoch": 4.071630537229029, + "grad_norm": 0.0024472337681800127, + "learning_rate": 5.080520535552028e-07, + "loss": 0.0, + "num_input_tokens_seen": 10045024, + "step": 4320 + }, + { + "epoch": 4.076343072573044, + "grad_norm": 0.05074908956885338, + "learning_rate": 5.030922438419569e-07, + "loss": 0.0, + "num_input_tokens_seen": 10055328, + "step": 4325 + }, + { + "epoch": 4.081055607917059, + "grad_norm": 0.008851269260048866, + "learning_rate": 4.981540529761473e-07, + "loss": 0.0, + "num_input_tokens_seen": 10065184, + "step": 4330 + }, + { + "epoch": 4.085768143261075, + "grad_norm": 0.009758401662111282, + "learning_rate": 4.932375344194285e-07, + "loss": 0.0, + "num_input_tokens_seen": 10077088, + "step": 4335 + }, + { + "epoch": 4.09048067860509, + "grad_norm": 0.005179739557206631, + "learning_rate": 4.88342741398831e-07, + "loss": 0.0, + "num_input_tokens_seen": 10087840, + "step": 4340 + }, + { + "epoch": 4.095193213949105, + "grad_norm": 0.020409001037478447, + "learning_rate": 4.83469726906175e-07, + "loss": 0.0, + "num_input_tokens_seen": 10098656, + "step": 4345 + }, + { + "epoch": 4.09990574929312, + "grad_norm": 0.0022380428854376078, + "learning_rate": 4.786185436975085e-07, + "loss": 0.0, + "num_input_tokens_seen": 10111456, + "step": 4350 + }, + { + "epoch": 4.104618284637135, + "grad_norm": 0.0027253238949924707, + "learning_rate": 4.7378924429252735e-07, + "loss": 0.0, + "num_input_tokens_seen": 10122912, + "step": 4355 + }, + { + "epoch": 4.10933081998115, + "grad_norm": 0.22917087376117706, + "learning_rate": 4.689818809740118e-07, + "loss": 0.0003, + "num_input_tokens_seen": 10135072, + "step": 4360 + }, + { + "epoch": 4.1140433553251645, + "grad_norm": 0.04845559597015381, + "learning_rate": 4.641965057872552e-07, + "loss": 0.0001, + "num_input_tokens_seen": 10145760, + "step": 4365 + }, + { + "epoch": 4.11875589066918, + "grad_norm": 0.004377515520900488, + "learning_rate": 4.594331705395078e-07, + "loss": 0.0001, + "num_input_tokens_seen": 10156000, + "step": 4370 + }, + { + "epoch": 4.123468426013195, + "grad_norm": 0.002658847952261567, + "learning_rate": 4.5469192679940905e-07, + "loss": 0.0, + "num_input_tokens_seen": 10168736, + "step": 4375 + }, + { + "epoch": 4.12818096135721, + "grad_norm": 0.022708803415298462, + "learning_rate": 4.4997282589643363e-07, + "loss": 0.0, + "num_input_tokens_seen": 10181408, + "step": 4380 + }, + { + "epoch": 4.132893496701225, + "grad_norm": 0.004133255686610937, + "learning_rate": 4.4527591892033263e-07, + "loss": 0.0, + "num_input_tokens_seen": 10191904, + "step": 4385 + }, + { + "epoch": 4.13760603204524, + "grad_norm": 0.0023588186595588923, + "learning_rate": 4.406012567205847e-07, + "loss": 0.0, + "num_input_tokens_seen": 10202080, + "step": 4390 + }, + { + "epoch": 4.142318567389255, + "grad_norm": 0.0017039207741618156, + "learning_rate": 4.359488899058409e-07, + "loss": 0.0, + "num_input_tokens_seen": 10212064, + "step": 4395 + }, + { + "epoch": 4.147031102733271, + "grad_norm": 0.004048160742968321, + "learning_rate": 4.313188688433792e-07, + "loss": 0.0, + "num_input_tokens_seen": 10223136, + "step": 4400 + }, + { + "epoch": 4.151743638077286, + "grad_norm": 0.0019121092045679688, + "learning_rate": 4.2671124365855853e-07, + "loss": 0.0, + "num_input_tokens_seen": 10238432, + "step": 4405 + }, + { + "epoch": 4.156456173421301, + "grad_norm": 0.004715082701295614, + "learning_rate": 4.2212606423427867e-07, + "loss": 0.0252, + "num_input_tokens_seen": 10250784, + "step": 4410 + }, + { + "epoch": 4.161168708765316, + "grad_norm": 0.04374701902270317, + "learning_rate": 4.175633802104337e-07, + "loss": 0.0, + "num_input_tokens_seen": 10265440, + "step": 4415 + }, + { + "epoch": 4.165881244109331, + "grad_norm": 0.014298969879746437, + "learning_rate": 4.1302324098338315e-07, + "loss": 0.0, + "num_input_tokens_seen": 10276704, + "step": 4420 + }, + { + "epoch": 4.170593779453346, + "grad_norm": 0.004060305189341307, + "learning_rate": 4.0850569570541036e-07, + "loss": 0.0, + "num_input_tokens_seen": 10286496, + "step": 4425 + }, + { + "epoch": 4.175306314797361, + "grad_norm": 0.0036463961005210876, + "learning_rate": 4.0401079328419384e-07, + "loss": 0.0, + "num_input_tokens_seen": 10297376, + "step": 4430 + }, + { + "epoch": 4.180018850141376, + "grad_norm": 0.007656124886125326, + "learning_rate": 3.995385823822767e-07, + "loss": 0.0, + "num_input_tokens_seen": 10306976, + "step": 4435 + }, + { + "epoch": 4.184731385485391, + "grad_norm": 0.009232879616320133, + "learning_rate": 3.9508911141653896e-07, + "loss": 0.0, + "num_input_tokens_seen": 10318880, + "step": 4440 + }, + { + "epoch": 4.189443920829406, + "grad_norm": 0.5042584538459778, + "learning_rate": 3.906624285576771e-07, + "loss": 0.0001, + "num_input_tokens_seen": 10330784, + "step": 4445 + }, + { + "epoch": 4.194156456173421, + "grad_norm": 0.009172679856419563, + "learning_rate": 3.862585817296771e-07, + "loss": 0.0, + "num_input_tokens_seen": 10341088, + "step": 4450 + }, + { + "epoch": 4.198868991517436, + "grad_norm": 0.00793259497731924, + "learning_rate": 3.8187761860929956e-07, + "loss": 0.0, + "num_input_tokens_seen": 10352096, + "step": 4455 + }, + { + "epoch": 4.203581526861451, + "grad_norm": 0.010999282822012901, + "learning_rate": 3.775195866255618e-07, + "loss": 0.0, + "num_input_tokens_seen": 10364448, + "step": 4460 + }, + { + "epoch": 4.208294062205466, + "grad_norm": 0.0006450503133237362, + "learning_rate": 3.731845329592268e-07, + "loss": 0.0, + "num_input_tokens_seen": 10376928, + "step": 4465 + }, + { + "epoch": 4.213006597549482, + "grad_norm": 0.0008582579903304577, + "learning_rate": 3.6887250454228666e-07, + "loss": 0.0, + "num_input_tokens_seen": 10389216, + "step": 4470 + }, + { + "epoch": 4.217719132893497, + "grad_norm": 0.002186185447499156, + "learning_rate": 3.6458354805746304e-07, + "loss": 0.0, + "num_input_tokens_seen": 10406944, + "step": 4475 + }, + { + "epoch": 4.222431668237512, + "grad_norm": 0.0032066500280052423, + "learning_rate": 3.603177099376931e-07, + "loss": 0.0, + "num_input_tokens_seen": 10417760, + "step": 4480 + }, + { + "epoch": 4.227144203581527, + "grad_norm": 0.0038139999378472567, + "learning_rate": 3.5607503636563484e-07, + "loss": 0.0, + "num_input_tokens_seen": 10429216, + "step": 4485 + }, + { + "epoch": 4.231856738925542, + "grad_norm": 0.004827831871807575, + "learning_rate": 3.5185557327315797e-07, + "loss": 0.0, + "num_input_tokens_seen": 10442784, + "step": 4490 + }, + { + "epoch": 4.236569274269557, + "grad_norm": 0.0007723537273705006, + "learning_rate": 3.47659366340857e-07, + "loss": 0.0, + "num_input_tokens_seen": 10454496, + "step": 4495 + }, + { + "epoch": 4.2412818096135725, + "grad_norm": 0.00883992575109005, + "learning_rate": 3.43486460997548e-07, + "loss": 0.0, + "num_input_tokens_seen": 10466464, + "step": 4500 + }, + { + "epoch": 4.245994344957587, + "grad_norm": 0.009929073974490166, + "learning_rate": 3.393369024197826e-07, + "loss": 0.0, + "num_input_tokens_seen": 10476768, + "step": 4505 + }, + { + "epoch": 4.250706880301602, + "grad_norm": 0.003360740141943097, + "learning_rate": 3.352107355313536e-07, + "loss": 0.0, + "num_input_tokens_seen": 10487392, + "step": 4510 + }, + { + "epoch": 4.255419415645617, + "grad_norm": 0.004852129612118006, + "learning_rate": 3.311080050028148e-07, + "loss": 0.0, + "num_input_tokens_seen": 10498144, + "step": 4515 + }, + { + "epoch": 4.260131950989632, + "grad_norm": 0.0017156790709123015, + "learning_rate": 3.2702875525099235e-07, + "loss": 0.0782, + "num_input_tokens_seen": 10507808, + "step": 4520 + }, + { + "epoch": 4.262016965127239, + "eval_loss": 0.5548250675201416, + "eval_runtime": 2.7767, + "eval_samples_per_second": 339.618, + "eval_steps_per_second": 42.497, + "num_input_tokens_seen": 10512416, + "step": 4522 + }, + { + "epoch": 4.264844486333647, + "grad_norm": 0.014055570587515831, + "learning_rate": 3.2297303043850564e-07, + "loss": 0.0, + "num_input_tokens_seen": 10517408, + "step": 4525 + }, + { + "epoch": 4.269557021677663, + "grad_norm": 0.002077508484944701, + "learning_rate": 3.189408744732897e-07, + "loss": 0.0, + "num_input_tokens_seen": 10528416, + "step": 4530 + }, + { + "epoch": 4.274269557021678, + "grad_norm": 0.036649417132139206, + "learning_rate": 3.149323310081201e-07, + "loss": 0.0, + "num_input_tokens_seen": 10541216, + "step": 4535 + }, + { + "epoch": 4.278982092365693, + "grad_norm": 0.00107799272518605, + "learning_rate": 3.1094744344013855e-07, + "loss": 0.0, + "num_input_tokens_seen": 10554016, + "step": 4540 + }, + { + "epoch": 4.283694627709708, + "grad_norm": 0.011105876415967941, + "learning_rate": 3.069862549103841e-07, + "loss": 0.0, + "num_input_tokens_seen": 10563552, + "step": 4545 + }, + { + "epoch": 4.288407163053723, + "grad_norm": 0.005242721643298864, + "learning_rate": 3.030488083033273e-07, + "loss": 0.0, + "num_input_tokens_seen": 10576288, + "step": 4550 + }, + { + "epoch": 4.293119698397738, + "grad_norm": 0.013468295335769653, + "learning_rate": 2.991351462464037e-07, + "loss": 0.0, + "num_input_tokens_seen": 10586784, + "step": 4555 + }, + { + "epoch": 4.297832233741753, + "grad_norm": 0.005555103067308664, + "learning_rate": 2.9524531110955406e-07, + "loss": 0.0, + "num_input_tokens_seen": 10597792, + "step": 4560 + }, + { + "epoch": 4.3025447690857686, + "grad_norm": 0.009637890383601189, + "learning_rate": 2.913793450047639e-07, + "loss": 0.0, + "num_input_tokens_seen": 10610720, + "step": 4565 + }, + { + "epoch": 4.3072573044297835, + "grad_norm": 0.002298228908330202, + "learning_rate": 2.875372897856113e-07, + "loss": 0.0, + "num_input_tokens_seen": 10622176, + "step": 4570 + }, + { + "epoch": 4.311969839773798, + "grad_norm": 0.025600271299481392, + "learning_rate": 2.837191870468084e-07, + "loss": 0.0, + "num_input_tokens_seen": 10632864, + "step": 4575 + }, + { + "epoch": 4.316682375117813, + "grad_norm": 0.000993955647572875, + "learning_rate": 2.7992507812375557e-07, + "loss": 0.0039, + "num_input_tokens_seen": 10642784, + "step": 4580 + }, + { + "epoch": 4.321394910461828, + "grad_norm": 0.01237708143889904, + "learning_rate": 2.76155004092091e-07, + "loss": 0.0153, + "num_input_tokens_seen": 10652896, + "step": 4585 + }, + { + "epoch": 4.326107445805843, + "grad_norm": 0.002820044755935669, + "learning_rate": 2.7240900576724904e-07, + "loss": 0.1078, + "num_input_tokens_seen": 10665248, + "step": 4590 + }, + { + "epoch": 4.330819981149858, + "grad_norm": 0.0086582712829113, + "learning_rate": 2.686871237040151e-07, + "loss": 0.0001, + "num_input_tokens_seen": 10676384, + "step": 4595 + }, + { + "epoch": 4.335532516493874, + "grad_norm": 0.0017868748400360346, + "learning_rate": 2.6498939819608827e-07, + "loss": 0.0, + "num_input_tokens_seen": 10688352, + "step": 4600 + }, + { + "epoch": 4.340245051837889, + "grad_norm": 0.026829157024621964, + "learning_rate": 2.613158692756443e-07, + "loss": 0.0, + "num_input_tokens_seen": 10698080, + "step": 4605 + }, + { + "epoch": 4.344957587181904, + "grad_norm": 0.004218968562781811, + "learning_rate": 2.576665767129055e-07, + "loss": 0.0, + "num_input_tokens_seen": 10710816, + "step": 4610 + }, + { + "epoch": 4.349670122525919, + "grad_norm": 0.0005848800064995885, + "learning_rate": 2.5404156001570257e-07, + "loss": 0.0, + "num_input_tokens_seen": 10722592, + "step": 4615 + }, + { + "epoch": 4.354382657869934, + "grad_norm": 0.0013897489989176393, + "learning_rate": 2.5044085842905686e-07, + "loss": 0.0, + "num_input_tokens_seen": 10734752, + "step": 4620 + }, + { + "epoch": 4.359095193213949, + "grad_norm": 0.003906297497451305, + "learning_rate": 2.4686451093474673e-07, + "loss": 0.0001, + "num_input_tokens_seen": 10746464, + "step": 4625 + }, + { + "epoch": 4.363807728557964, + "grad_norm": 0.010411670431494713, + "learning_rate": 2.433125562508917e-07, + "loss": 0.0, + "num_input_tokens_seen": 10757472, + "step": 4630 + }, + { + "epoch": 4.36852026390198, + "grad_norm": 24.992229461669922, + "learning_rate": 2.3978503283152847e-07, + "loss": 0.1078, + "num_input_tokens_seen": 10769056, + "step": 4635 + }, + { + "epoch": 4.3732327992459945, + "grad_norm": 0.007977725006639957, + "learning_rate": 2.3628197886619852e-07, + "loss": 0.0, + "num_input_tokens_seen": 10780384, + "step": 4640 + }, + { + "epoch": 4.3779453345900095, + "grad_norm": 0.0031011472456157207, + "learning_rate": 2.3280343227953305e-07, + "loss": 0.0, + "num_input_tokens_seen": 10792928, + "step": 4645 + }, + { + "epoch": 4.382657869934024, + "grad_norm": 0.0017057248624041677, + "learning_rate": 2.293494307308411e-07, + "loss": 0.0, + "num_input_tokens_seen": 10803808, + "step": 4650 + }, + { + "epoch": 4.387370405278039, + "grad_norm": 0.002497282810509205, + "learning_rate": 2.2592001161370392e-07, + "loss": 0.0, + "num_input_tokens_seen": 10814496, + "step": 4655 + }, + { + "epoch": 4.392082940622054, + "grad_norm": 0.005025547929108143, + "learning_rate": 2.2251521205557042e-07, + "loss": 0.0, + "num_input_tokens_seen": 10827168, + "step": 4660 + }, + { + "epoch": 4.39679547596607, + "grad_norm": 0.014085683971643448, + "learning_rate": 2.1913506891735242e-07, + "loss": 0.0, + "num_input_tokens_seen": 10839392, + "step": 4665 + }, + { + "epoch": 4.401508011310085, + "grad_norm": 0.0015956445131450891, + "learning_rate": 2.1577961879302807e-07, + "loss": 0.0, + "num_input_tokens_seen": 10851744, + "step": 4670 + }, + { + "epoch": 4.4062205466541, + "grad_norm": 0.0027405947912484407, + "learning_rate": 2.124488980092454e-07, + "loss": 0.0, + "num_input_tokens_seen": 10864608, + "step": 4675 + }, + { + "epoch": 4.410933081998115, + "grad_norm": 0.0018958933651447296, + "learning_rate": 2.0914294262492723e-07, + "loss": 0.0, + "num_input_tokens_seen": 10877856, + "step": 4680 + }, + { + "epoch": 4.41564561734213, + "grad_norm": 0.0035127715673297644, + "learning_rate": 2.0586178843088473e-07, + "loss": 0.0044, + "num_input_tokens_seen": 10891616, + "step": 4685 + }, + { + "epoch": 4.420358152686145, + "grad_norm": 0.003918149974197149, + "learning_rate": 2.026054709494235e-07, + "loss": 0.0, + "num_input_tokens_seen": 10901024, + "step": 4690 + }, + { + "epoch": 4.425070688030161, + "grad_norm": 0.004163654521107674, + "learning_rate": 1.9937402543396683e-07, + "loss": 0.0, + "num_input_tokens_seen": 10910560, + "step": 4695 + }, + { + "epoch": 4.429783223374176, + "grad_norm": 0.0017460703384131193, + "learning_rate": 1.961674868686675e-07, + "loss": 0.0, + "num_input_tokens_seen": 10921824, + "step": 4700 + }, + { + "epoch": 4.434495758718191, + "grad_norm": 0.0011986172758042812, + "learning_rate": 1.929858899680323e-07, + "loss": 0.0, + "num_input_tokens_seen": 10934944, + "step": 4705 + }, + { + "epoch": 4.4392082940622055, + "grad_norm": 0.0017532928613945842, + "learning_rate": 1.8982926917654575e-07, + "loss": 0.0922, + "num_input_tokens_seen": 10946400, + "step": 4710 + }, + { + "epoch": 4.4439208294062205, + "grad_norm": 0.0029414647724479437, + "learning_rate": 1.8669765866829724e-07, + "loss": 0.0, + "num_input_tokens_seen": 10958112, + "step": 4715 + }, + { + "epoch": 4.448633364750235, + "grad_norm": 0.0008323266520164907, + "learning_rate": 1.835910923466097e-07, + "loss": 0.0, + "num_input_tokens_seen": 10970528, + "step": 4720 + }, + { + "epoch": 4.45334590009425, + "grad_norm": 0.002249309793114662, + "learning_rate": 1.805096038436749e-07, + "loss": 0.0, + "num_input_tokens_seen": 10982048, + "step": 4725 + }, + { + "epoch": 4.458058435438266, + "grad_norm": 0.001825433922931552, + "learning_rate": 1.774532265201867e-07, + "loss": 0.0, + "num_input_tokens_seen": 10994848, + "step": 4730 + }, + { + "epoch": 4.462770970782281, + "grad_norm": 0.04776029288768768, + "learning_rate": 1.7442199346498294e-07, + "loss": 0.0001, + "num_input_tokens_seen": 11004896, + "step": 4735 + }, + { + "epoch": 4.467483506126296, + "grad_norm": 0.01915101520717144, + "learning_rate": 1.7141593749468361e-07, + "loss": 0.0, + "num_input_tokens_seen": 11017056, + "step": 4740 + }, + { + "epoch": 4.472196041470311, + "grad_norm": 0.0076557123102247715, + "learning_rate": 1.6843509115333917e-07, + "loss": 0.0, + "num_input_tokens_seen": 11026912, + "step": 4745 + }, + { + "epoch": 4.476908576814326, + "grad_norm": 0.010826838202774525, + "learning_rate": 1.6547948671207515e-07, + "loss": 0.0, + "num_input_tokens_seen": 11038176, + "step": 4750 + }, + { + "epoch": 4.481621112158341, + "grad_norm": 0.0004848201060667634, + "learning_rate": 1.6254915616874645e-07, + "loss": 0.0, + "num_input_tokens_seen": 11047648, + "step": 4755 + }, + { + "epoch": 4.486333647502356, + "grad_norm": 0.004097946919500828, + "learning_rate": 1.5964413124758492e-07, + "loss": 0.0441, + "num_input_tokens_seen": 11056864, + "step": 4760 + }, + { + "epoch": 4.491046182846372, + "grad_norm": 0.0032770747784525156, + "learning_rate": 1.5676444339886327e-07, + "loss": 0.0, + "num_input_tokens_seen": 11067744, + "step": 4765 + }, + { + "epoch": 4.495758718190387, + "grad_norm": 0.0030054976232349873, + "learning_rate": 1.5391012379854937e-07, + "loss": 0.0, + "num_input_tokens_seen": 11077920, + "step": 4770 + }, + { + "epoch": 4.500471253534402, + "grad_norm": 0.0032186508178710938, + "learning_rate": 1.5108120334797e-07, + "loss": 0.0, + "num_input_tokens_seen": 11088864, + "step": 4775 + }, + { + "epoch": 4.5051837888784165, + "grad_norm": 0.006772062741219997, + "learning_rate": 1.4827771267347662e-07, + "loss": 0.0, + "num_input_tokens_seen": 11098336, + "step": 4780 + }, + { + "epoch": 4.5098963242224315, + "grad_norm": 0.003302761586382985, + "learning_rate": 1.4549968212611538e-07, + "loss": 0.0, + "num_input_tokens_seen": 11107680, + "step": 4785 + }, + { + "epoch": 4.512723845428841, + "eval_loss": 0.5418137311935425, + "eval_runtime": 2.7286, + "eval_samples_per_second": 345.594, + "eval_steps_per_second": 43.245, + "num_input_tokens_seen": 11115040, + "step": 4788 + }, + { + "epoch": 4.514608859566446, + "grad_norm": 0.009535914286971092, + "learning_rate": 1.4274714178129534e-07, + "loss": 0.0, + "num_input_tokens_seen": 11120480, + "step": 4790 + }, + { + "epoch": 4.519321394910461, + "grad_norm": 0.0017795681487768888, + "learning_rate": 1.4002012143846472e-07, + "loss": 0.0, + "num_input_tokens_seen": 11132320, + "step": 4795 + }, + { + "epoch": 4.524033930254477, + "grad_norm": 0.004151622299104929, + "learning_rate": 1.3731865062078853e-07, + "loss": 0.0006, + "num_input_tokens_seen": 11148960, + "step": 4800 + }, + { + "epoch": 4.528746465598492, + "grad_norm": 0.009143157862126827, + "learning_rate": 1.3464275857482778e-07, + "loss": 0.0, + "num_input_tokens_seen": 11159968, + "step": 4805 + }, + { + "epoch": 4.533459000942507, + "grad_norm": 0.00789305754005909, + "learning_rate": 1.3199247427022528e-07, + "loss": 0.122, + "num_input_tokens_seen": 11170848, + "step": 4810 + }, + { + "epoch": 4.538171536286522, + "grad_norm": 0.0016827658982947469, + "learning_rate": 1.293678263993872e-07, + "loss": 0.0, + "num_input_tokens_seen": 11184288, + "step": 4815 + }, + { + "epoch": 4.542884071630537, + "grad_norm": 0.004033006262034178, + "learning_rate": 1.2676884337717882e-07, + "loss": 0.0, + "num_input_tokens_seen": 11197856, + "step": 4820 + }, + { + "epoch": 4.547596606974552, + "grad_norm": 0.0016169328009709716, + "learning_rate": 1.241955533406114e-07, + "loss": 0.0, + "num_input_tokens_seen": 11209696, + "step": 4825 + }, + { + "epoch": 4.552309142318568, + "grad_norm": 0.0008929000468924642, + "learning_rate": 1.2164798414854073e-07, + "loss": 0.0, + "num_input_tokens_seen": 11220064, + "step": 4830 + }, + { + "epoch": 4.557021677662583, + "grad_norm": 0.002499083988368511, + "learning_rate": 1.1912616338136396e-07, + "loss": 0.0, + "num_input_tokens_seen": 11230304, + "step": 4835 + }, + { + "epoch": 4.561734213006598, + "grad_norm": 0.0028031610418111086, + "learning_rate": 1.1663011834072257e-07, + "loss": 0.0, + "num_input_tokens_seen": 11240096, + "step": 4840 + }, + { + "epoch": 4.566446748350613, + "grad_norm": 0.002892805030569434, + "learning_rate": 1.1415987604920492e-07, + "loss": 0.0, + "num_input_tokens_seen": 11251104, + "step": 4845 + }, + { + "epoch": 4.5711592836946275, + "grad_norm": 0.002147044287994504, + "learning_rate": 1.11715463250055e-07, + "loss": 0.0, + "num_input_tokens_seen": 11261088, + "step": 4850 + }, + { + "epoch": 4.5758718190386425, + "grad_norm": 69.84650421142578, + "learning_rate": 1.0929690640688218e-07, + "loss": 0.0072, + "num_input_tokens_seen": 11273312, + "step": 4855 + }, + { + "epoch": 4.580584354382658, + "grad_norm": 0.00039674167055636644, + "learning_rate": 1.0690423170337554e-07, + "loss": 0.0003, + "num_input_tokens_seen": 11284896, + "step": 4860 + }, + { + "epoch": 4.585296889726673, + "grad_norm": 0.002157399896532297, + "learning_rate": 1.0453746504302003e-07, + "loss": 0.0, + "num_input_tokens_seen": 11294560, + "step": 4865 + }, + { + "epoch": 4.590009425070688, + "grad_norm": 0.012543817050755024, + "learning_rate": 1.021966320488152e-07, + "loss": 0.0813, + "num_input_tokens_seen": 11308128, + "step": 4870 + }, + { + "epoch": 4.594721960414703, + "grad_norm": 0.004514993634074926, + "learning_rate": 9.988175806299877e-08, + "loss": 0.0, + "num_input_tokens_seen": 11321056, + "step": 4875 + }, + { + "epoch": 4.599434495758718, + "grad_norm": 0.06353511661291122, + "learning_rate": 9.759286814677305e-08, + "loss": 0.0, + "num_input_tokens_seen": 11334496, + "step": 4880 + }, + { + "epoch": 4.604147031102733, + "grad_norm": 0.00232234806753695, + "learning_rate": 9.532998708003061e-08, + "loss": 0.0, + "num_input_tokens_seen": 11346208, + "step": 4885 + }, + { + "epoch": 4.608859566446748, + "grad_norm": 0.0016301957657560706, + "learning_rate": 9.309313936108983e-08, + "loss": 0.0, + "num_input_tokens_seen": 11358112, + "step": 4890 + }, + { + "epoch": 4.613572101790764, + "grad_norm": 0.0005986247560940683, + "learning_rate": 9.088234920642703e-08, + "loss": 0.0, + "num_input_tokens_seen": 11368096, + "step": 4895 + }, + { + "epoch": 4.618284637134779, + "grad_norm": 0.004071133676916361, + "learning_rate": 8.869764055041501e-08, + "loss": 0.0, + "num_input_tokens_seen": 11378976, + "step": 4900 + }, + { + "epoch": 4.622997172478794, + "grad_norm": 0.0028420898597687483, + "learning_rate": 8.653903704506389e-08, + "loss": 0.0, + "num_input_tokens_seen": 11390688, + "step": 4905 + }, + { + "epoch": 4.627709707822809, + "grad_norm": 0.011091876775026321, + "learning_rate": 8.440656205976644e-08, + "loss": 0.0, + "num_input_tokens_seen": 11401440, + "step": 4910 + }, + { + "epoch": 4.632422243166824, + "grad_norm": 0.001331353560090065, + "learning_rate": 8.230023868104231e-08, + "loss": 0.0, + "num_input_tokens_seen": 11412448, + "step": 4915 + }, + { + "epoch": 4.6371347785108386, + "grad_norm": 0.01889793761074543, + "learning_rate": 8.022008971229039e-08, + "loss": 0.0, + "num_input_tokens_seen": 11422496, + "step": 4920 + }, + { + "epoch": 4.6418473138548535, + "grad_norm": 0.002929375506937504, + "learning_rate": 7.816613767354098e-08, + "loss": 0.0, + "num_input_tokens_seen": 11433632, + "step": 4925 + }, + { + "epoch": 4.646559849198869, + "grad_norm": 0.002046855865046382, + "learning_rate": 7.613840480121176e-08, + "loss": 0.0, + "num_input_tokens_seen": 11446112, + "step": 4930 + }, + { + "epoch": 4.651272384542884, + "grad_norm": 0.0020440209191292524, + "learning_rate": 7.41369130478689e-08, + "loss": 0.0, + "num_input_tokens_seen": 11459552, + "step": 4935 + }, + { + "epoch": 4.655984919886899, + "grad_norm": 0.01186713483184576, + "learning_rate": 7.216168408198554e-08, + "loss": 0.0, + "num_input_tokens_seen": 11469984, + "step": 4940 + }, + { + "epoch": 4.660697455230914, + "grad_norm": 0.005305349826812744, + "learning_rate": 7.021273928771221e-08, + "loss": 0.0, + "num_input_tokens_seen": 11481888, + "step": 4945 + }, + { + "epoch": 4.665409990574929, + "grad_norm": 0.0028958169277757406, + "learning_rate": 6.829009976464102e-08, + "loss": 0.0579, + "num_input_tokens_seen": 11494944, + "step": 4950 + }, + { + "epoch": 4.670122525918944, + "grad_norm": 0.0005974304513074458, + "learning_rate": 6.639378632757986e-08, + "loss": 0.0, + "num_input_tokens_seen": 11505184, + "step": 4955 + }, + { + "epoch": 4.674835061262959, + "grad_norm": 0.0025155956391245127, + "learning_rate": 6.452381950632469e-08, + "loss": 0.0, + "num_input_tokens_seen": 11517856, + "step": 4960 + }, + { + "epoch": 4.679547596606975, + "grad_norm": 0.0031720330007374287, + "learning_rate": 6.268021954544095e-08, + "loss": 0.0, + "num_input_tokens_seen": 11530016, + "step": 4965 + }, + { + "epoch": 4.68426013195099, + "grad_norm": 0.004775336477905512, + "learning_rate": 6.08630064040408e-08, + "loss": 0.0, + "num_input_tokens_seen": 11545376, + "step": 4970 + }, + { + "epoch": 4.688972667295005, + "grad_norm": 0.001557852141559124, + "learning_rate": 5.9072199755567936e-08, + "loss": 0.0, + "num_input_tokens_seen": 11556448, + "step": 4975 + }, + { + "epoch": 4.69368520263902, + "grad_norm": 0.0037680910900235176, + "learning_rate": 5.730781898758614e-08, + "loss": 0.0, + "num_input_tokens_seen": 11566304, + "step": 4980 + }, + { + "epoch": 4.698397737983035, + "grad_norm": 0.0020881840027868748, + "learning_rate": 5.556988320156831e-08, + "loss": 0.0, + "num_input_tokens_seen": 11577056, + "step": 4985 + }, + { + "epoch": 4.7031102733270505, + "grad_norm": 0.0009613709407858551, + "learning_rate": 5.3858411212689146e-08, + "loss": 0.0, + "num_input_tokens_seen": 11589536, + "step": 4990 + }, + { + "epoch": 4.707822808671065, + "grad_norm": 0.11669395118951797, + "learning_rate": 5.2173421549621685e-08, + "loss": 0.0001, + "num_input_tokens_seen": 11599648, + "step": 4995 + }, + { + "epoch": 4.71253534401508, + "grad_norm": 0.0027235562447458506, + "learning_rate": 5.051493245433775e-08, + "loss": 0.0, + "num_input_tokens_seen": 11610272, + "step": 5000 + }, + { + "epoch": 4.717247879359095, + "grad_norm": 0.06230725720524788, + "learning_rate": 4.888296188190977e-08, + "loss": 0.0, + "num_input_tokens_seen": 11620768, + "step": 5005 + }, + { + "epoch": 4.72196041470311, + "grad_norm": 0.0018570433603599668, + "learning_rate": 4.727752750031511e-08, + "loss": 0.0, + "num_input_tokens_seen": 11632608, + "step": 5010 + }, + { + "epoch": 4.726672950047125, + "grad_norm": 0.01290284376591444, + "learning_rate": 4.5698646690247874e-08, + "loss": 0.0, + "num_input_tokens_seen": 11644896, + "step": 5015 + }, + { + "epoch": 4.73138548539114, + "grad_norm": 0.005762244574725628, + "learning_rate": 4.414633654492767e-08, + "loss": 0.0, + "num_input_tokens_seen": 11661344, + "step": 5020 + }, + { + "epoch": 4.736098020735156, + "grad_norm": 0.0015709196450188756, + "learning_rate": 4.2620613869915894e-08, + "loss": 0.0, + "num_input_tokens_seen": 11672288, + "step": 5025 + }, + { + "epoch": 4.740810556079171, + "grad_norm": 0.0015942390309646726, + "learning_rate": 4.112149518293362e-08, + "loss": 0.0, + "num_input_tokens_seen": 11684960, + "step": 5030 + }, + { + "epoch": 4.745523091423186, + "grad_norm": 0.013269704766571522, + "learning_rate": 3.9648996713683715e-08, + "loss": 0.0, + "num_input_tokens_seen": 11696160, + "step": 5035 + }, + { + "epoch": 4.750235626767201, + "grad_norm": 0.004723825957626104, + "learning_rate": 3.8203134403672905e-08, + "loss": 0.0, + "num_input_tokens_seen": 11705952, + "step": 5040 + }, + { + "epoch": 4.754948162111216, + "grad_norm": 0.006132619455456734, + "learning_rate": 3.678392390604163e-08, + "loss": 0.0, + "num_input_tokens_seen": 11716192, + "step": 5045 + }, + { + "epoch": 4.759660697455231, + "grad_norm": 0.003462289460003376, + "learning_rate": 3.539138058539282e-08, + "loss": 0.0, + "num_input_tokens_seen": 11728160, + "step": 5050 + }, + { + "epoch": 4.763430725730443, + "eval_loss": 0.54215008020401, + "eval_runtime": 2.8215, + "eval_samples_per_second": 334.225, + "eval_steps_per_second": 41.822, + "num_input_tokens_seen": 11736672, + "step": 5054 + }, + { + "epoch": 4.764373232799246, + "grad_norm": 0.0019681937992572784, + "learning_rate": 3.4025519517626174e-08, + "loss": 0.0, + "num_input_tokens_seen": 11738720, + "step": 5055 + }, + { + "epoch": 4.7690857681432615, + "grad_norm": 0.0026169854681938887, + "learning_rate": 3.268635548977633e-08, + "loss": 0.0, + "num_input_tokens_seen": 11750176, + "step": 5060 + }, + { + "epoch": 4.773798303487276, + "grad_norm": 0.0066421544179320335, + "learning_rate": 3.137390299984888e-08, + "loss": 0.0, + "num_input_tokens_seen": 11761312, + "step": 5065 + }, + { + "epoch": 4.778510838831291, + "grad_norm": 0.0028810338117182255, + "learning_rate": 3.0088176256668765e-08, + "loss": 0.0, + "num_input_tokens_seen": 11773728, + "step": 5070 + }, + { + "epoch": 4.783223374175306, + "grad_norm": 0.001111470046453178, + "learning_rate": 2.8829189179721552e-08, + "loss": 0.0, + "num_input_tokens_seen": 11784672, + "step": 5075 + }, + { + "epoch": 4.787935909519321, + "grad_norm": 0.009747396223247051, + "learning_rate": 2.759695539900603e-08, + "loss": 0.0, + "num_input_tokens_seen": 11796512, + "step": 5080 + }, + { + "epoch": 4.792648444863336, + "grad_norm": 0.0038332142867147923, + "learning_rate": 2.639148825488491e-08, + "loss": 0.0, + "num_input_tokens_seen": 11810464, + "step": 5085 + }, + { + "epoch": 4.797360980207351, + "grad_norm": 0.003033042885363102, + "learning_rate": 2.5212800797941582e-08, + "loss": 0.0, + "num_input_tokens_seen": 11820768, + "step": 5090 + }, + { + "epoch": 4.802073515551367, + "grad_norm": 0.007614613976329565, + "learning_rate": 2.406090578883691e-08, + "loss": 0.0, + "num_input_tokens_seen": 11831776, + "step": 5095 + }, + { + "epoch": 4.806786050895382, + "grad_norm": 0.0031820612493902445, + "learning_rate": 2.2935815698174045e-08, + "loss": 0.0, + "num_input_tokens_seen": 11843296, + "step": 5100 + }, + { + "epoch": 4.811498586239397, + "grad_norm": 0.00583045044913888, + "learning_rate": 2.1837542706359958e-08, + "loss": 0.0, + "num_input_tokens_seen": 11860000, + "step": 5105 + }, + { + "epoch": 4.816211121583412, + "grad_norm": 0.016117779538035393, + "learning_rate": 2.0766098703477178e-08, + "loss": 0.0, + "num_input_tokens_seen": 11872160, + "step": 5110 + }, + { + "epoch": 4.820923656927427, + "grad_norm": 0.006356321275234222, + "learning_rate": 1.9721495289152237e-08, + "loss": 0.0, + "num_input_tokens_seen": 11883168, + "step": 5115 + }, + { + "epoch": 4.825636192271442, + "grad_norm": 0.0020537914242595434, + "learning_rate": 1.8703743772430783e-08, + "loss": 0.0, + "num_input_tokens_seen": 11895584, + "step": 5120 + }, + { + "epoch": 4.830348727615457, + "grad_norm": 0.006692873314023018, + "learning_rate": 1.7712855171655996e-08, + "loss": 0.0, + "num_input_tokens_seen": 11906784, + "step": 5125 + }, + { + "epoch": 4.8350612629594725, + "grad_norm": 0.0012435702374204993, + "learning_rate": 1.6748840214348972e-08, + "loss": 0.0, + "num_input_tokens_seen": 11917600, + "step": 5130 + }, + { + "epoch": 4.839773798303487, + "grad_norm": 0.00899266917258501, + "learning_rate": 1.5811709337091862e-08, + "loss": 0.0, + "num_input_tokens_seen": 11929632, + "step": 5135 + }, + { + "epoch": 4.844486333647502, + "grad_norm": 0.0016645839205011725, + "learning_rate": 1.4901472685415475e-08, + "loss": 0.0, + "num_input_tokens_seen": 11938720, + "step": 5140 + }, + { + "epoch": 4.849198868991517, + "grad_norm": 0.00441192090511322, + "learning_rate": 1.4018140113689904e-08, + "loss": 0.0072, + "num_input_tokens_seen": 11951648, + "step": 5145 + }, + { + "epoch": 4.853911404335532, + "grad_norm": 0.001967532094568014, + "learning_rate": 1.3161721185016852e-08, + "loss": 0.0, + "num_input_tokens_seen": 11962336, + "step": 5150 + }, + { + "epoch": 4.858623939679548, + "grad_norm": 0.005055475980043411, + "learning_rate": 1.2332225171126366e-08, + "loss": 0.0, + "num_input_tokens_seen": 11975904, + "step": 5155 + }, + { + "epoch": 4.863336475023563, + "grad_norm": 0.014400842599570751, + "learning_rate": 1.152966105227693e-08, + "loss": 0.0, + "num_input_tokens_seen": 11986208, + "step": 5160 + }, + { + "epoch": 4.868049010367578, + "grad_norm": 0.0008923859568312764, + "learning_rate": 1.0754037517158312e-08, + "loss": 0.0, + "num_input_tokens_seen": 11999520, + "step": 5165 + }, + { + "epoch": 4.872761545711593, + "grad_norm": 0.0033309967257082462, + "learning_rate": 1.0005362962796362e-08, + "loss": 0.0, + "num_input_tokens_seen": 12011424, + "step": 5170 + }, + { + "epoch": 4.877474081055608, + "grad_norm": 0.005915912799537182, + "learning_rate": 9.283645494463368e-09, + "loss": 0.0, + "num_input_tokens_seen": 12024864, + "step": 5175 + }, + { + "epoch": 4.882186616399623, + "grad_norm": 0.0010997394565492868, + "learning_rate": 8.588892925590064e-09, + "loss": 0.0, + "num_input_tokens_seen": 12035936, + "step": 5180 + }, + { + "epoch": 4.886899151743638, + "grad_norm": 0.005926317069679499, + "learning_rate": 7.92111277768015e-09, + "loss": 0.0, + "num_input_tokens_seen": 12045920, + "step": 5185 + }, + { + "epoch": 4.891611687087654, + "grad_norm": 0.011491699144244194, + "learning_rate": 7.280312280230073e-09, + "loss": 0.0, + "num_input_tokens_seen": 12058144, + "step": 5190 + }, + { + "epoch": 4.8963242224316685, + "grad_norm": 0.0012660275679081678, + "learning_rate": 6.666498370650198e-09, + "loss": 0.0, + "num_input_tokens_seen": 12069792, + "step": 5195 + }, + { + "epoch": 4.9010367577756835, + "grad_norm": 0.03111676499247551, + "learning_rate": 6.079677694189046e-09, + "loss": 0.0, + "num_input_tokens_seen": 12080864, + "step": 5200 + }, + { + "epoch": 4.905749293119698, + "grad_norm": 0.0013736054534092546, + "learning_rate": 5.5198566038627835e-09, + "loss": 0.0, + "num_input_tokens_seen": 12092256, + "step": 5205 + }, + { + "epoch": 4.910461828463713, + "grad_norm": 0.008520975708961487, + "learning_rate": 4.987041160385287e-09, + "loss": 0.0, + "num_input_tokens_seen": 12106784, + "step": 5210 + }, + { + "epoch": 4.915174363807728, + "grad_norm": 0.038982491940259933, + "learning_rate": 4.481237132103189e-09, + "loss": 0.0003, + "num_input_tokens_seen": 12117088, + "step": 5215 + }, + { + "epoch": 4.919886899151743, + "grad_norm": 0.0053964899852871895, + "learning_rate": 4.002449994932878e-09, + "loss": 0.0, + "num_input_tokens_seen": 12128736, + "step": 5220 + }, + { + "epoch": 4.924599434495759, + "grad_norm": 0.0056511214934289455, + "learning_rate": 3.550684932301374e-09, + "loss": 0.0, + "num_input_tokens_seen": 12145376, + "step": 5225 + }, + { + "epoch": 4.929311969839774, + "grad_norm": 0.004293152131140232, + "learning_rate": 3.1259468350910982e-09, + "loss": 0.0, + "num_input_tokens_seen": 12156320, + "step": 5230 + }, + { + "epoch": 4.934024505183789, + "grad_norm": 0.002154400572180748, + "learning_rate": 2.7282403015849167e-09, + "loss": 0.0, + "num_input_tokens_seen": 12167968, + "step": 5235 + }, + { + "epoch": 4.938737040527804, + "grad_norm": 0.0020166414324194193, + "learning_rate": 2.3575696374189548e-09, + "loss": 0.0, + "num_input_tokens_seen": 12179744, + "step": 5240 + }, + { + "epoch": 4.943449575871819, + "grad_norm": 0.0013167713768780231, + "learning_rate": 2.013938855533748e-09, + "loss": 0.0001, + "num_input_tokens_seen": 12192288, + "step": 5245 + }, + { + "epoch": 4.948162111215834, + "grad_norm": 0.00897101778537035, + "learning_rate": 1.6973516761317755e-09, + "loss": 0.0, + "num_input_tokens_seen": 12203360, + "step": 5250 + }, + { + "epoch": 4.952874646559849, + "grad_norm": 0.7944397330284119, + "learning_rate": 1.407811526637215e-09, + "loss": 0.0007, + "num_input_tokens_seen": 12215392, + "step": 5255 + }, + { + "epoch": 4.957587181903865, + "grad_norm": 0.004428845830261707, + "learning_rate": 1.145321541659028e-09, + "loss": 0.0, + "num_input_tokens_seen": 12227680, + "step": 5260 + }, + { + "epoch": 4.9622997172478795, + "grad_norm": 0.005368073936551809, + "learning_rate": 9.098845629559871e-10, + "loss": 0.0, + "num_input_tokens_seen": 12242336, + "step": 5265 + }, + { + "epoch": 4.9670122525918945, + "grad_norm": 0.055548761039972305, + "learning_rate": 7.015031394072557e-10, + "loss": 0.0, + "num_input_tokens_seen": 12251936, + "step": 5270 + }, + { + "epoch": 4.971724787935909, + "grad_norm": 0.006647925358265638, + "learning_rate": 5.201795269837995e-10, + "loss": 0.0, + "num_input_tokens_seen": 12262432, + "step": 5275 + }, + { + "epoch": 4.976437323279924, + "grad_norm": 0.0016112312441691756, + "learning_rate": 3.6591568872451634e-10, + "loss": 0.0, + "num_input_tokens_seen": 12275872, + "step": 5280 + }, + { + "epoch": 4.981149858623939, + "grad_norm": 0.003358457935974002, + "learning_rate": 2.387132947151427e-10, + "loss": 0.0, + "num_input_tokens_seen": 12288928, + "step": 5285 + }, + { + "epoch": 4.985862393967954, + "grad_norm": 0.011470218189060688, + "learning_rate": 1.3857372206882436e-10, + "loss": 0.0969, + "num_input_tokens_seen": 12300832, + "step": 5290 + }, + { + "epoch": 4.99057492931197, + "grad_norm": 0.018770242109894753, + "learning_rate": 6.549805491307127e-11, + "loss": 0.0, + "num_input_tokens_seen": 12312352, + "step": 5295 + }, + { + "epoch": 4.995287464655985, + "grad_norm": 0.002519431058317423, + "learning_rate": 1.948708437726765e-11, + "loss": 0.0, + "num_input_tokens_seen": 12322528, + "step": 5300 + }, + { + "epoch": 5.0, + "grad_norm": 0.0025235991925001144, + "learning_rate": 5.413085829575338e-13, + "loss": 0.0, + "num_input_tokens_seen": 12333600, + "step": 5305 + }, + { + "epoch": 5.0, + "num_input_tokens_seen": 12333600, + "step": 5305, + "total_flos": 7.20143693217792e+16, + "train_loss": 0.11108403178044919, + "train_runtime": 1575.5047, + "train_samples_per_second": 26.925, + "train_steps_per_second": 3.367 + } + ], + "logging_steps": 5, + "max_steps": 5305, + "num_input_tokens_seen": 12333600, + "num_train_epochs": 5, + "save_steps": 266, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.20143693217792e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..987ed1f --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c48857d0c3d47661e9acd14ef454a027ceefd076c60dfa85538fe6e7c8a6f3c +size 6289 diff --git a/training_eval_loss.png b/training_eval_loss.png new file mode 100644 index 0000000..c9ca510 Binary files /dev/null and b/training_eval_loss.png differ diff --git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000..f08b718 Binary files /dev/null and b/training_loss.png differ