commit effa9b2bb06a19683c4c293015fd4a7340e02e9c Author: ModelHub XC Date: Wed Jun 10 23:53:51 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: rbelanec/train_sst2_42_1779354537 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..582e9e0 --- /dev/null +++ b/README.md @@ -0,0 +1,81 @@ +--- +library_name: transformers +license: llama3.2 +base_model: meta-llama/Llama-3.2-1B-Instruct +tags: +- peft-factory +- freeze +- llama-factory +- generated_from_trainer +model-index: +- name: train_sst2_42_1779354537 + results: [] +--- + + + +# train_sst2_42_1779354537 + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on the sst2 dataset. +It achieves the following results on the evaluation set: +- Loss: 0.0908 +- Num Input Tokens Seen: 3725120 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 2e-06 +- train_batch_size: 8 +- eval_batch_size: 8 +- seed: 42 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 1 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | Input Tokens Seen | +|:-------------:|:------:|:----:|:---------------:|:-----------------:| +| 0.0584 | 0.0500 | 379 | 0.1753 | 187072 | +| 0.1154 | 0.1000 | 758 | 0.1295 | 373504 | +| 0.0745 | 0.1501 | 1137 | 0.1949 | 557824 | +| 0.1712 | 0.2001 | 1516 | 0.1069 | 743424 | +| 0.2865 | 0.2501 | 1895 | 0.1277 | 930944 | +| 0.1225 | 0.3001 | 2274 | 0.1098 | 1116800 | +| 0.1152 | 0.3501 | 2653 | 0.1235 | 1303872 | +| 0.1615 | 0.4002 | 3032 | 0.1323 | 1490688 | +| 0.0698 | 0.4502 | 3411 | 0.1182 | 1678208 | +| 0.3465 | 0.5002 | 3790 | 0.1325 | 1864128 | +| 0.1538 | 0.5502 | 4169 | 0.0976 | 2047552 | +| 0.1911 | 0.6002 | 4548 | 0.1150 | 2232448 | +| 0.1499 | 0.6503 | 4927 | 0.0984 | 2420096 | +| 0.2014 | 0.7003 | 5306 | 0.0908 | 2605504 | +| 0.0014 | 0.7503 | 5685 | 0.0957 | 2790656 | +| 0.1294 | 0.8003 | 6064 | 0.0955 | 2979456 | +| 0.1202 | 0.8503 | 6443 | 0.0970 | 3167488 | +| 0.0013 | 0.9004 | 6822 | 0.0957 | 3355520 | +| 0.05 | 0.9504 | 7201 | 0.0956 | 3541632 | + + +### Framework versions + +- Transformers 4.51.3 +- Pytorch 2.10.0+cu128 +- Datasets 4.0.0 +- Tokenizers 0.21.4 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..da2e63d --- /dev/null +++ b/all_results.json @@ -0,0 +1,13 @@ +{ + "epoch": 1.0, + "eval_loss": 0.09084735810756683, + "eval_runtime": 7.6282, + "eval_samples_per_second": 882.909, + "eval_steps_per_second": 110.38, + "num_input_tokens_seen": 3725120, + "total_flos": 2.175051626840064e+16, + "train_loss": 0.12470523826255549, + "train_runtime": 1215.5483, + "train_samples_per_second": 49.866, + "train_steps_per_second": 6.233 +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..08bd85b --- /dev/null +++ b/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/eval_results.json b/eval_results.json new file mode 100644 index 0000000..178febf --- /dev/null +++ b/eval_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 1.0, + "eval_loss": 0.09084735810756683, + "eval_runtime": 7.6282, + "eval_samples_per_second": 882.909, + "eval_steps_per_second": 110.38, + "num_input_tokens_seen": 3725120 +} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..2b8ae57 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.3" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..3a2e2cd --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58ceb3ca5faec61a359c8f422f68c237c099b498be223e4823117c20a19a3a5c +size 4417933576 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..14daf45 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..ddc3ce0 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2069 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/train.yaml b/train.yaml new file mode 100644 index 0000000..87c432c --- /dev/null +++ b/train.yaml @@ -0,0 +1,65 @@ +seed: 42 + +### model +model_name_or_path: meta-llama/Llama-3.2-1B-Instruct +trust_remote_code: true +flash_attn: auto +use_cache: false + +### method +# Full fine-tune of every decoder block, but with the (tied) embeddings frozen. +# `finetuning_type: freeze` only trains modules whose name matches a trainable layer; +# embed_tokens / lm_head / final model.norm are "extra" modules and stay frozen unless +# listed in freeze_extra_modules. Setting freeze_trainable_layers = num_hidden_layers (16 +# for Llama-3.2-1B) makes ALL decoder blocks trainable, so this == "full FT minus +# embeddings". Because tie_word_embeddings=true, freezing embed_tokens also freezes lm_head. +# This is lever B of the embedding-amplification fix (see figures/amplification/README.md). +stage: sft +do_train: true +finetuning_type: freeze +freeze_trainable_layers: 16 +freeze_trainable_modules: all +# freeze_extra_modules: left unset -> embed_tokens, lm_head (tied), final norm stay frozen + +### dataset +dataset: sst2 +template: llama3 +cutoff_len: 2048 +overwrite_cache: true +preprocessing_num_workers: 4 +dataloader_num_workers: 4 +packing: false + +### output +output_dir: saves_bts_preliminary/freeze/llama-3.2-1b-instruct/train_sst2_42_1779354537 +logging_steps: 5 +save_steps: 0.05 +overwrite_output_dir: true +save_only_model: false +plot_loss: true +include_num_input_tokens_seen: true +push_to_hub: true +push_to_hub_organization: rbelanec +load_best_model_at_end: true +save_total_limit: 1 + +### train +per_device_train_batch_size: 8 +learning_rate: 2.0e-6 +num_train_epochs: 1 +weight_decay: 1.0e-2 +lr_scheduler_type: cosine +bf16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null +warmup_ratio: 0.1 +optim: adamw_torch +report_to: +- wandb +run_name: freeze_llama-3.2-1b-instruct_train_sst2_42_1779354537 + +### eval +per_device_eval_batch_size: 8 +eval_strategy: steps +eval_steps: 0.05 +val_size: 0.1 diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..03dde30 --- /dev/null +++ b/train_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 1.0, + "num_input_tokens_seen": 3725120, + "total_flos": 2.175051626840064e+16, + "train_loss": 0.12470523826255549, + "train_runtime": 1215.5483, + "train_samples_per_second": 49.866, + "train_steps_per_second": 6.233 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000..f824ac1 --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,1535 @@ +{"current_steps": 5, "total_steps": 7577, "loss": 1.413, "lr": 1.0554089709762531e-08, "epoch": 0.0006598917777484492, "percentage": 0.07, "elapsed_time": "0:00:00", "remaining_time": "0:16:44", "throughput": 3378.34, "total_tokens": 2240} +{"current_steps": 10, "total_steps": 7577, "loss": 1.5134, "lr": 2.3746701846965696e-08, "epoch": 0.0013197835554968984, "percentage": 0.13, "elapsed_time": "0:00:00", "remaining_time": "0:12:14", "throughput": 4814.36, "total_tokens": 4672} +{"current_steps": 15, "total_steps": 7577, "loss": 1.3995, "lr": 3.6939313984168866e-08, "epoch": 0.0019796753332453477, "percentage": 0.2, "elapsed_time": "0:00:01", "remaining_time": "0:10:37", "throughput": 5567.19, "total_tokens": 7040} +{"current_steps": 20, "total_steps": 7577, "loss": 1.422, "lr": 5.013192612137203e-08, "epoch": 0.002639567110993797, "percentage": 0.26, "elapsed_time": "0:00:01", "remaining_time": "0:09:47", "throughput": 6173.8, "total_tokens": 9600} +{"current_steps": 25, "total_steps": 7577, "loss": 1.2976, "lr": 6.33245382585752e-08, "epoch": 0.0032994588887422464, "percentage": 0.33, "elapsed_time": "0:00:01", "remaining_time": "0:09:16", "throughput": 6596.06, "total_tokens": 12160} +{"current_steps": 30, "total_steps": 7577, "loss": 1.3834, "lr": 7.651715039577835e-08, "epoch": 0.0039593506664906955, "percentage": 0.4, "elapsed_time": "0:00:02", "remaining_time": "0:08:57", "throughput": 6798.01, "total_tokens": 14528} +{"current_steps": 35, "total_steps": 7577, "loss": 1.1343, "lr": 8.970976253298153e-08, "epoch": 0.004619242444239145, "percentage": 0.46, "elapsed_time": "0:00:02", "remaining_time": "0:08:42", "throughput": 6909.34, "total_tokens": 16768} +{"current_steps": 40, "total_steps": 7577, "loss": 1.1513, "lr": 1.0290237467018468e-07, "epoch": 0.005279134221987594, "percentage": 0.53, "elapsed_time": "0:00:02", "remaining_time": "0:08:32", "throughput": 7087.75, "total_tokens": 19264} +{"current_steps": 45, "total_steps": 7577, "loss": 0.8287, "lr": 1.1609498680738786e-07, "epoch": 0.005939025999736044, "percentage": 0.59, "elapsed_time": "0:00:03", "remaining_time": "0:08:25", "throughput": 7158.49, "total_tokens": 21632} +{"current_steps": 50, "total_steps": 7577, "loss": 0.7425, "lr": 1.29287598944591e-07, "epoch": 0.006598917777484493, "percentage": 0.66, "elapsed_time": "0:00:03", "remaining_time": "0:08:18", "throughput": 7247.06, "total_tokens": 24000} +{"current_steps": 55, "total_steps": 7577, "loss": 0.7064, "lr": 1.424802110817942e-07, "epoch": 0.007258809555232942, "percentage": 0.73, "elapsed_time": "0:00:03", "remaining_time": "0:08:12", "throughput": 7354.12, "total_tokens": 26496} +{"current_steps": 60, "total_steps": 7577, "loss": 0.3853, "lr": 1.5567282321899736e-07, "epoch": 0.007918701332981391, "percentage": 0.79, "elapsed_time": "0:00:03", "remaining_time": "0:08:07", "throughput": 7476.62, "total_tokens": 29120} +{"current_steps": 65, "total_steps": 7577, "loss": 0.3076, "lr": 1.688654353562005e-07, "epoch": 0.008578593110729841, "percentage": 0.86, "elapsed_time": "0:00:04", "remaining_time": "0:08:03", "throughput": 7581.71, "total_tokens": 31744} +{"current_steps": 70, "total_steps": 7577, "loss": 0.2971, "lr": 1.820580474934037e-07, "epoch": 0.00923848488847829, "percentage": 0.92, "elapsed_time": "0:00:04", "remaining_time": "0:07:59", "throughput": 7636.13, "total_tokens": 34176} +{"current_steps": 75, "total_steps": 7577, "loss": 0.3004, "lr": 1.9525065963060686e-07, "epoch": 0.009898376666226739, "percentage": 0.99, "elapsed_time": "0:00:04", "remaining_time": "0:07:56", "throughput": 7731.09, "total_tokens": 36864} +{"current_steps": 80, "total_steps": 7577, "loss": 0.2532, "lr": 2.0844327176781002e-07, "epoch": 0.010558268443975187, "percentage": 1.06, "elapsed_time": "0:00:05", "remaining_time": "0:07:54", "throughput": 7793.61, "total_tokens": 39424} +{"current_steps": 85, "total_steps": 7577, "loss": 0.2616, "lr": 2.2163588390501316e-07, "epoch": 0.011218160221723637, "percentage": 1.12, "elapsed_time": "0:00:05", "remaining_time": "0:07:51", "throughput": 7871.99, "total_tokens": 42112} +{"current_steps": 90, "total_steps": 7577, "loss": 0.2528, "lr": 2.3482849604221635e-07, "epoch": 0.011878051999472087, "percentage": 1.19, "elapsed_time": "0:00:05", "remaining_time": "0:07:49", "throughput": 7900.45, "total_tokens": 44544} +{"current_steps": 95, "total_steps": 7577, "loss": 0.1904, "lr": 2.480211081794195e-07, "epoch": 0.012537943777220536, "percentage": 1.25, "elapsed_time": "0:00:05", "remaining_time": "0:07:48", "throughput": 7925.15, "total_tokens": 47104} +{"current_steps": 100, "total_steps": 7577, "loss": 0.1653, "lr": 2.612137203166227e-07, "epoch": 0.013197835554968985, "percentage": 1.32, "elapsed_time": "0:00:06", "remaining_time": "0:07:46", "throughput": 7966.86, "total_tokens": 49664} +{"current_steps": 105, "total_steps": 7577, "loss": 0.137, "lr": 2.744063324538258e-07, "epoch": 0.013857727332717434, "percentage": 1.39, "elapsed_time": "0:00:06", "remaining_time": "0:07:44", "throughput": 8025.35, "total_tokens": 52352} +{"current_steps": 110, "total_steps": 7577, "loss": 0.102, "lr": 2.8759894459102903e-07, "epoch": 0.014517619110465884, "percentage": 1.45, "elapsed_time": "0:00:06", "remaining_time": "0:07:42", "throughput": 8031.81, "total_tokens": 54720} +{"current_steps": 115, "total_steps": 7577, "loss": 0.1477, "lr": 3.007915567282322e-07, "epoch": 0.015177510888214334, "percentage": 1.52, "elapsed_time": "0:00:07", "remaining_time": "0:07:40", "throughput": 8044.62, "total_tokens": 57152} +{"current_steps": 120, "total_steps": 7577, "loss": 0.2005, "lr": 3.139841688654353e-07, "epoch": 0.015837402665962782, "percentage": 1.58, "elapsed_time": "0:00:07", "remaining_time": "0:07:39", "throughput": 8082.38, "total_tokens": 59776} +{"current_steps": 125, "total_steps": 7577, "loss": 0.1417, "lr": 3.271767810026385e-07, "epoch": 0.01649729444371123, "percentage": 1.65, "elapsed_time": "0:00:07", "remaining_time": "0:07:38", "throughput": 8123.25, "total_tokens": 62464} +{"current_steps": 130, "total_steps": 7577, "loss": 0.1226, "lr": 3.403693931398417e-07, "epoch": 0.017157186221459682, "percentage": 1.72, "elapsed_time": "0:00:07", "remaining_time": "0:07:37", "throughput": 8150.83, "total_tokens": 65088} +{"current_steps": 135, "total_steps": 7577, "loss": 0.2123, "lr": 3.5356200527704485e-07, "epoch": 0.01781707799920813, "percentage": 1.78, "elapsed_time": "0:00:08", "remaining_time": "0:07:36", "throughput": 8189.79, "total_tokens": 67776} +{"current_steps": 140, "total_steps": 7577, "loss": 0.2606, "lr": 3.66754617414248e-07, "epoch": 0.01847696977695658, "percentage": 1.85, "elapsed_time": "0:00:08", "remaining_time": "0:07:35", "throughput": 8218.06, "total_tokens": 70400} +{"current_steps": 145, "total_steps": 7577, "loss": 0.1463, "lr": 3.7994722955145113e-07, "epoch": 0.01913686155470503, "percentage": 1.91, "elapsed_time": "0:00:08", "remaining_time": "0:07:34", "throughput": 8201.25, "total_tokens": 72704} +{"current_steps": 150, "total_steps": 7577, "loss": 0.346, "lr": 3.9313984168865435e-07, "epoch": 0.019796753332453478, "percentage": 1.98, "elapsed_time": "0:00:09", "remaining_time": "0:07:33", "throughput": 8209.22, "total_tokens": 75136} +{"current_steps": 155, "total_steps": 7577, "loss": 0.0609, "lr": 4.063324538258575e-07, "epoch": 0.020456645110201926, "percentage": 2.05, "elapsed_time": "0:00:09", "remaining_time": "0:07:32", "throughput": 8222.03, "total_tokens": 77632} +{"current_steps": 160, "total_steps": 7577, "loss": 0.2492, "lr": 4.195250659630606e-07, "epoch": 0.021116536887950375, "percentage": 2.11, "elapsed_time": "0:00:09", "remaining_time": "0:07:31", "throughput": 8246.13, "total_tokens": 80320} +{"current_steps": 165, "total_steps": 7577, "loss": 0.1542, "lr": 4.3271767810026384e-07, "epoch": 0.021776428665698826, "percentage": 2.18, "elapsed_time": "0:00:10", "remaining_time": "0:07:30", "throughput": 8249.8, "total_tokens": 82752} +{"current_steps": 170, "total_steps": 7577, "loss": 0.3095, "lr": 4.45910290237467e-07, "epoch": 0.022436320443447275, "percentage": 2.24, "elapsed_time": "0:00:10", "remaining_time": "0:07:29", "throughput": 8260.36, "total_tokens": 85248} +{"current_steps": 175, "total_steps": 7577, "loss": 0.2917, "lr": 4.5910290237467017e-07, "epoch": 0.023096212221195723, "percentage": 2.31, "elapsed_time": "0:00:10", "remaining_time": "0:07:28", "throughput": 8280.74, "total_tokens": 87872} +{"current_steps": 180, "total_steps": 7577, "loss": 0.1369, "lr": 4.7229551451187333e-07, "epoch": 0.023756103998944175, "percentage": 2.38, "elapsed_time": "0:00:10", "remaining_time": "0:07:27", "throughput": 8290.65, "total_tokens": 90368} +{"current_steps": 185, "total_steps": 7577, "loss": 0.0624, "lr": 4.854881266490765e-07, "epoch": 0.024415995776692623, "percentage": 2.44, "elapsed_time": "0:00:11", "remaining_time": "0:07:27", "throughput": 8305.03, "total_tokens": 92928} +{"current_steps": 190, "total_steps": 7577, "loss": 0.1617, "lr": 4.986807387862796e-07, "epoch": 0.02507588755444107, "percentage": 2.51, "elapsed_time": "0:00:11", "remaining_time": "0:07:26", "throughput": 8302.6, "total_tokens": 95296} +{"current_steps": 195, "total_steps": 7577, "loss": 0.1049, "lr": 5.118733509234829e-07, "epoch": 0.02573577933218952, "percentage": 2.57, "elapsed_time": "0:00:11", "remaining_time": "0:07:25", "throughput": 8325.17, "total_tokens": 97984} +{"current_steps": 200, "total_steps": 7577, "loss": 0.1992, "lr": 5.250659630606859e-07, "epoch": 0.02639567110993797, "percentage": 2.64, "elapsed_time": "0:00:12", "remaining_time": "0:07:24", "throughput": 8323.31, "total_tokens": 100352} +{"current_steps": 205, "total_steps": 7577, "loss": 0.1434, "lr": 5.382585751978892e-07, "epoch": 0.02705556288768642, "percentage": 2.71, "elapsed_time": "0:00:12", "remaining_time": "0:07:23", "throughput": 8300.48, "total_tokens": 102464} +{"current_steps": 210, "total_steps": 7577, "loss": 0.1237, "lr": 5.514511873350924e-07, "epoch": 0.027715454665434867, "percentage": 2.77, "elapsed_time": "0:00:12", "remaining_time": "0:07:23", "throughput": 8317.85, "total_tokens": 105088} +{"current_steps": 215, "total_steps": 7577, "loss": 0.3024, "lr": 5.646437994722954e-07, "epoch": 0.02837534644318332, "percentage": 2.84, "elapsed_time": "0:00:12", "remaining_time": "0:07:22", "throughput": 8329.02, "total_tokens": 107648} +{"current_steps": 220, "total_steps": 7577, "loss": 0.2383, "lr": 5.778364116094987e-07, "epoch": 0.029035238220931767, "percentage": 2.9, "elapsed_time": "0:00:13", "remaining_time": "0:07:21", "throughput": 8336.34, "total_tokens": 110144} +{"current_steps": 225, "total_steps": 7577, "loss": 0.1367, "lr": 5.910290237467019e-07, "epoch": 0.029695129998680216, "percentage": 2.97, "elapsed_time": "0:00:13", "remaining_time": "0:07:21", "throughput": 8338.23, "total_tokens": 112576} +{"current_steps": 230, "total_steps": 7577, "loss": 0.2351, "lr": 6.042216358839049e-07, "epoch": 0.030355021776428667, "percentage": 3.04, "elapsed_time": "0:00:13", "remaining_time": "0:07:20", "throughput": 8356.87, "total_tokens": 115264} +{"current_steps": 235, "total_steps": 7577, "loss": 0.0839, "lr": 6.174142480211082e-07, "epoch": 0.031014913554177116, "percentage": 3.1, "elapsed_time": "0:00:14", "remaining_time": "0:07:19", "throughput": 8370.98, "total_tokens": 117888} +{"current_steps": 240, "total_steps": 7577, "loss": 0.0189, "lr": 6.306068601583114e-07, "epoch": 0.031674805331925564, "percentage": 3.17, "elapsed_time": "0:00:14", "remaining_time": "0:07:19", "throughput": 8372.82, "total_tokens": 120320} +{"current_steps": 245, "total_steps": 7577, "loss": 0.1832, "lr": 6.437994722955144e-07, "epoch": 0.032334697109674015, "percentage": 3.23, "elapsed_time": "0:00:14", "remaining_time": "0:07:18", "throughput": 8370.25, "total_tokens": 122688} +{"current_steps": 250, "total_steps": 7577, "loss": 0.2718, "lr": 6.569920844327177e-07, "epoch": 0.03299458888742246, "percentage": 3.3, "elapsed_time": "0:00:14", "remaining_time": "0:07:18", "throughput": 8380.57, "total_tokens": 125248} +{"current_steps": 255, "total_steps": 7577, "loss": 0.2037, "lr": 6.701846965699208e-07, "epoch": 0.03365448066517091, "percentage": 3.37, "elapsed_time": "0:00:15", "remaining_time": "0:07:17", "throughput": 8382.24, "total_tokens": 127680} +{"current_steps": 260, "total_steps": 7577, "loss": 0.0833, "lr": 6.833773087071239e-07, "epoch": 0.034314372442919364, "percentage": 3.43, "elapsed_time": "0:00:15", "remaining_time": "0:07:16", "throughput": 8404.92, "total_tokens": 130496} +{"current_steps": 265, "total_steps": 7577, "loss": 0.19, "lr": 6.965699208443272e-07, "epoch": 0.03497426422066781, "percentage": 3.5, "elapsed_time": "0:00:15", "remaining_time": "0:07:16", "throughput": 8410.63, "total_tokens": 132992} +{"current_steps": 270, "total_steps": 7577, "loss": 0.1429, "lr": 7.097625329815303e-07, "epoch": 0.03563415599841626, "percentage": 3.56, "elapsed_time": "0:00:16", "remaining_time": "0:07:15", "throughput": 8388.11, "total_tokens": 135040} +{"current_steps": 275, "total_steps": 7577, "loss": 0.2488, "lr": 7.229551451187335e-07, "epoch": 0.03629404777616471, "percentage": 3.63, "elapsed_time": "0:00:16", "remaining_time": "0:07:15", "throughput": 8397.14, "total_tokens": 137600} +{"current_steps": 280, "total_steps": 7577, "loss": 0.1125, "lr": 7.361477572559367e-07, "epoch": 0.03695393955391316, "percentage": 3.7, "elapsed_time": "0:00:16", "remaining_time": "0:07:14", "throughput": 8390.5, "total_tokens": 139904} +{"current_steps": 285, "total_steps": 7577, "loss": 0.1465, "lr": 7.493403693931398e-07, "epoch": 0.03761383133166161, "percentage": 3.76, "elapsed_time": "0:00:16", "remaining_time": "0:07:13", "throughput": 8373.43, "total_tokens": 142016} +{"current_steps": 290, "total_steps": 7577, "loss": 0.0092, "lr": 7.62532981530343e-07, "epoch": 0.03827372310941006, "percentage": 3.83, "elapsed_time": "0:00:17", "remaining_time": "0:07:13", "throughput": 8381.89, "total_tokens": 144576} +{"current_steps": 295, "total_steps": 7577, "loss": 0.0894, "lr": 7.757255936675461e-07, "epoch": 0.038933614887158505, "percentage": 3.89, "elapsed_time": "0:00:17", "remaining_time": "0:07:12", "throughput": 8376.37, "total_tokens": 146880} +{"current_steps": 300, "total_steps": 7577, "loss": 0.3917, "lr": 7.889182058047493e-07, "epoch": 0.039593506664906956, "percentage": 3.96, "elapsed_time": "0:00:17", "remaining_time": "0:07:12", "throughput": 8370.95, "total_tokens": 149184} +{"current_steps": 305, "total_steps": 7577, "loss": 0.2216, "lr": 8.021108179419525e-07, "epoch": 0.0402533984426554, "percentage": 4.03, "elapsed_time": "0:00:18", "remaining_time": "0:07:11", "throughput": 8354.34, "total_tokens": 151296} +{"current_steps": 310, "total_steps": 7577, "loss": 0.2024, "lr": 8.153034300791555e-07, "epoch": 0.04091329022040385, "percentage": 4.09, "elapsed_time": "0:00:18", "remaining_time": "0:07:11", "throughput": 8352.81, "total_tokens": 153664} +{"current_steps": 315, "total_steps": 7577, "loss": 0.2624, "lr": 8.284960422163588e-07, "epoch": 0.041573181998152305, "percentage": 4.16, "elapsed_time": "0:00:18", "remaining_time": "0:07:10", "throughput": 8350.7, "total_tokens": 156032} +{"current_steps": 320, "total_steps": 7577, "loss": 0.2276, "lr": 8.41688654353562e-07, "epoch": 0.04223307377590075, "percentage": 4.22, "elapsed_time": "0:00:18", "remaining_time": "0:07:10", "throughput": 8355.37, "total_tokens": 158528} +{"current_steps": 325, "total_steps": 7577, "loss": 0.1227, "lr": 8.54881266490765e-07, "epoch": 0.0428929655536492, "percentage": 4.29, "elapsed_time": "0:00:19", "remaining_time": "0:07:09", "throughput": 8342.78, "total_tokens": 160704} +{"current_steps": 330, "total_steps": 7577, "loss": 0.1889, "lr": 8.680738786279683e-07, "epoch": 0.04355285733139765, "percentage": 4.36, "elapsed_time": "0:00:19", "remaining_time": "0:07:09", "throughput": 8341.64, "total_tokens": 163072} +{"current_steps": 335, "total_steps": 7577, "loss": 0.1251, "lr": 8.812664907651715e-07, "epoch": 0.0442127491091461, "percentage": 4.42, "elapsed_time": "0:00:19", "remaining_time": "0:07:08", "throughput": 8346.27, "total_tokens": 165568} +{"current_steps": 340, "total_steps": 7577, "loss": 0.2737, "lr": 8.944591029023745e-07, "epoch": 0.04487264088689455, "percentage": 4.49, "elapsed_time": "0:00:20", "remaining_time": "0:07:08", "throughput": 8345.09, "total_tokens": 167936} +{"current_steps": 345, "total_steps": 7577, "loss": 0.0704, "lr": 9.076517150395778e-07, "epoch": 0.045532532664643, "percentage": 4.55, "elapsed_time": "0:00:20", "remaining_time": "0:07:07", "throughput": 8336.92, "total_tokens": 170176} +{"current_steps": 350, "total_steps": 7577, "loss": 0.1521, "lr": 9.20844327176781e-07, "epoch": 0.046192424442391446, "percentage": 4.62, "elapsed_time": "0:00:20", "remaining_time": "0:07:07", "throughput": 8326.64, "total_tokens": 172352} +{"current_steps": 355, "total_steps": 7577, "loss": 0.2593, "lr": 9.340369393139841e-07, "epoch": 0.0468523162201399, "percentage": 4.69, "elapsed_time": "0:00:20", "remaining_time": "0:07:07", "throughput": 8344.55, "total_tokens": 175168} +{"current_steps": 360, "total_steps": 7577, "loss": 0.1364, "lr": 9.472295514511873e-07, "epoch": 0.04751220799788835, "percentage": 4.75, "elapsed_time": "0:00:21", "remaining_time": "0:07:06", "throughput": 8356.9, "total_tokens": 177856} +{"current_steps": 365, "total_steps": 7577, "loss": 0.3046, "lr": 9.604221635883904e-07, "epoch": 0.048172099775636794, "percentage": 4.82, "elapsed_time": "0:00:21", "remaining_time": "0:07:06", "throughput": 8352.28, "total_tokens": 180160} +{"current_steps": 370, "total_steps": 7577, "loss": 0.1873, "lr": 9.736147757255936e-07, "epoch": 0.048831991553385246, "percentage": 4.88, "elapsed_time": "0:00:21", "remaining_time": "0:07:05", "throughput": 8361.41, "total_tokens": 182784} +{"current_steps": 375, "total_steps": 7577, "loss": 0.0584, "lr": 9.86807387862797e-07, "epoch": 0.0494918833311337, "percentage": 4.95, "elapsed_time": "0:00:22", "remaining_time": "0:07:05", "throughput": 8353.5, "total_tokens": 185024} +{"current_steps": 379, "total_steps": 7577, "eval_loss": 0.17531457543373108, "epoch": 0.05001979675333245, "percentage": 5.0, "elapsed_time": "0:00:29", "remaining_time": "0:09:26", "throughput": 6274.39, "total_tokens": 187072} +{"current_steps": 380, "total_steps": 7577, "loss": 0.1671, "lr": 1e-06, "epoch": 0.05015177510888214, "percentage": 5.02, "elapsed_time": "0:01:10", "remaining_time": "0:22:06", "throughput": 2680.47, "total_tokens": 187712} +{"current_steps": 385, "total_steps": 7577, "loss": 0.2583, "lr": 1.0131926121372032e-06, "epoch": 0.050811666886630594, "percentage": 5.08, "elapsed_time": "0:01:10", "remaining_time": "0:21:53", "throughput": 2706.91, "total_tokens": 190400} +{"current_steps": 390, "total_steps": 7577, "loss": 0.1182, "lr": 1.0263852242744063e-06, "epoch": 0.05147155866437904, "percentage": 5.15, "elapsed_time": "0:01:10", "remaining_time": "0:21:41", "throughput": 2736.48, "total_tokens": 193280} +{"current_steps": 395, "total_steps": 7577, "loss": 0.1517, "lr": 1.0395778364116096e-06, "epoch": 0.05213145044212749, "percentage": 5.21, "elapsed_time": "0:01:10", "remaining_time": "0:21:29", "throughput": 2757.81, "total_tokens": 195584} +{"current_steps": 400, "total_steps": 7577, "loss": 0.0928, "lr": 1.0527704485488126e-06, "epoch": 0.05279134221987594, "percentage": 5.28, "elapsed_time": "0:01:11", "remaining_time": "0:21:17", "throughput": 2783.34, "total_tokens": 198208} +{"current_steps": 405, "total_steps": 7577, "loss": 0.2377, "lr": 1.0659630606860157e-06, "epoch": 0.05345123399762439, "percentage": 5.35, "elapsed_time": "0:01:11", "remaining_time": "0:21:06", "throughput": 2807.02, "total_tokens": 200704} +{"current_steps": 410, "total_steps": 7577, "loss": 0.2115, "lr": 1.079155672823219e-06, "epoch": 0.05411112577537284, "percentage": 5.41, "elapsed_time": "0:01:11", "remaining_time": "0:20:54", "throughput": 2829.72, "total_tokens": 203136} +{"current_steps": 415, "total_steps": 7577, "loss": 0.1501, "lr": 1.0923482849604222e-06, "epoch": 0.05477101755312129, "percentage": 5.48, "elapsed_time": "0:01:12", "remaining_time": "0:20:43", "throughput": 2851.16, "total_tokens": 205504} +{"current_steps": 420, "total_steps": 7577, "loss": 0.2442, "lr": 1.1055408970976253e-06, "epoch": 0.055430909330869735, "percentage": 5.54, "elapsed_time": "0:01:12", "remaining_time": "0:20:33", "throughput": 2874.21, "total_tokens": 208000} +{"current_steps": 425, "total_steps": 7577, "loss": 0.0951, "lr": 1.1187335092348285e-06, "epoch": 0.056090801108618187, "percentage": 5.61, "elapsed_time": "0:01:12", "remaining_time": "0:20:22", "throughput": 2897.98, "total_tokens": 210560} +{"current_steps": 430, "total_steps": 7577, "loss": 0.1749, "lr": 1.1319261213720316e-06, "epoch": 0.05675069288636664, "percentage": 5.68, "elapsed_time": "0:01:12", "remaining_time": "0:20:12", "throughput": 2920.67, "total_tokens": 213056} +{"current_steps": 435, "total_steps": 7577, "loss": 0.1071, "lr": 1.1451187335092347e-06, "epoch": 0.05741058466411508, "percentage": 5.74, "elapsed_time": "0:01:13", "remaining_time": "0:20:02", "throughput": 2939.86, "total_tokens": 215296} +{"current_steps": 440, "total_steps": 7577, "loss": 0.0089, "lr": 1.158311345646438e-06, "epoch": 0.058070476441863535, "percentage": 5.81, "elapsed_time": "0:01:13", "remaining_time": "0:19:52", "throughput": 2957.97, "total_tokens": 217472} +{"current_steps": 445, "total_steps": 7577, "loss": 0.0408, "lr": 1.1715039577836412e-06, "epoch": 0.058730368219611986, "percentage": 5.87, "elapsed_time": "0:01:13", "remaining_time": "0:19:42", "throughput": 2980.24, "total_tokens": 219968} +{"current_steps": 450, "total_steps": 7577, "loss": 0.0381, "lr": 1.1846965699208443e-06, "epoch": 0.05939025999736043, "percentage": 5.94, "elapsed_time": "0:01:14", "remaining_time": "0:19:33", "throughput": 3003.96, "total_tokens": 222592} +{"current_steps": 455, "total_steps": 7577, "loss": 0.5112, "lr": 1.1978891820580475e-06, "epoch": 0.06005015177510888, "percentage": 6.01, "elapsed_time": "0:01:14", "remaining_time": "0:19:24", "throughput": 3021.61, "total_tokens": 224768} +{"current_steps": 460, "total_steps": 7577, "loss": 0.2367, "lr": 1.2110817941952508e-06, "epoch": 0.060710043552857335, "percentage": 6.07, "elapsed_time": "0:01:14", "remaining_time": "0:19:15", "throughput": 3043.21, "total_tokens": 227264} +{"current_steps": 465, "total_steps": 7577, "loss": 0.2476, "lr": 1.2242744063324536e-06, "epoch": 0.06136993533060578, "percentage": 6.14, "elapsed_time": "0:01:14", "remaining_time": "0:19:06", "throughput": 3064.79, "total_tokens": 229760} +{"current_steps": 470, "total_steps": 7577, "loss": 0.1846, "lr": 1.237467018469657e-06, "epoch": 0.06202982710835423, "percentage": 6.2, "elapsed_time": "0:01:15", "remaining_time": "0:18:57", "throughput": 3082.86, "total_tokens": 232000} +{"current_steps": 475, "total_steps": 7577, "loss": 0.1636, "lr": 1.2506596306068602e-06, "epoch": 0.06268971888610268, "percentage": 6.27, "elapsed_time": "0:01:15", "remaining_time": "0:18:49", "throughput": 3099.98, "total_tokens": 234176} +{"current_steps": 480, "total_steps": 7577, "loss": 0.1056, "lr": 1.2638522427440632e-06, "epoch": 0.06334961066385113, "percentage": 6.33, "elapsed_time": "0:01:15", "remaining_time": "0:18:41", "throughput": 3121.97, "total_tokens": 236736} +{"current_steps": 485, "total_steps": 7577, "loss": 0.0032, "lr": 1.2770448548812665e-06, "epoch": 0.06400950244159957, "percentage": 6.4, "elapsed_time": "0:01:16", "remaining_time": "0:18:33", "throughput": 3141.35, "total_tokens": 239104} +{"current_steps": 490, "total_steps": 7577, "loss": 0.4157, "lr": 1.2902374670184698e-06, "epoch": 0.06466939421934803, "percentage": 6.47, "elapsed_time": "0:01:16", "remaining_time": "0:18:25", "throughput": 3162.22, "total_tokens": 241600} +{"current_steps": 495, "total_steps": 7577, "loss": 0.3641, "lr": 1.3034300791556726e-06, "epoch": 0.06532928599709648, "percentage": 6.53, "elapsed_time": "0:01:16", "remaining_time": "0:18:17", "throughput": 3183.73, "total_tokens": 244160} +{"current_steps": 500, "total_steps": 7577, "loss": 0.175, "lr": 1.316622691292876e-06, "epoch": 0.06598917777484492, "percentage": 6.6, "elapsed_time": "0:01:16", "remaining_time": "0:18:09", "throughput": 3201.8, "total_tokens": 246464} +{"current_steps": 505, "total_steps": 7577, "loss": 0.0893, "lr": 1.3298153034300792e-06, "epoch": 0.06664906955259338, "percentage": 6.66, "elapsed_time": "0:01:17", "remaining_time": "0:18:02", "throughput": 3222.1, "total_tokens": 248960} +{"current_steps": 510, "total_steps": 7577, "loss": 0.1824, "lr": 1.3430079155672822e-06, "epoch": 0.06730896133034182, "percentage": 6.73, "elapsed_time": "0:01:17", "remaining_time": "0:17:54", "throughput": 3241.56, "total_tokens": 251392} +{"current_steps": 515, "total_steps": 7577, "loss": 0.2085, "lr": 1.3562005277044855e-06, "epoch": 0.06796885310809027, "percentage": 6.8, "elapsed_time": "0:01:17", "remaining_time": "0:17:47", "throughput": 3261.65, "total_tokens": 253888} +{"current_steps": 520, "total_steps": 7577, "loss": 0.1502, "lr": 1.3693931398416888e-06, "epoch": 0.06862874488583873, "percentage": 6.86, "elapsed_time": "0:01:18", "remaining_time": "0:17:40", "throughput": 3281.54, "total_tokens": 256384} +{"current_steps": 525, "total_steps": 7577, "loss": 0.1721, "lr": 1.3825857519788916e-06, "epoch": 0.06928863666358717, "percentage": 6.93, "elapsed_time": "0:01:18", "remaining_time": "0:17:33", "throughput": 3296.33, "total_tokens": 258496} +{"current_steps": 530, "total_steps": 7577, "loss": 0.0959, "lr": 1.3957783641160949e-06, "epoch": 0.06994852844133562, "percentage": 6.99, "elapsed_time": "0:01:18", "remaining_time": "0:17:26", "throughput": 3314.38, "total_tokens": 260864} +{"current_steps": 535, "total_steps": 7577, "loss": 0.0484, "lr": 1.4089709762532982e-06, "epoch": 0.07060842021908408, "percentage": 7.06, "elapsed_time": "0:01:18", "remaining_time": "0:17:19", "throughput": 3333.82, "total_tokens": 263360} +{"current_steps": 540, "total_steps": 7577, "loss": 0.306, "lr": 1.4221635883905012e-06, "epoch": 0.07126831199683252, "percentage": 7.13, "elapsed_time": "0:01:19", "remaining_time": "0:17:13", "throughput": 3356.23, "total_tokens": 266112} +{"current_steps": 545, "total_steps": 7577, "loss": 0.2425, "lr": 1.4353562005277045e-06, "epoch": 0.07192820377458096, "percentage": 7.19, "elapsed_time": "0:01:19", "remaining_time": "0:17:06", "throughput": 3374.61, "total_tokens": 268544} +{"current_steps": 550, "total_steps": 7577, "loss": 0.4216, "lr": 1.4485488126649078e-06, "epoch": 0.07258809555232942, "percentage": 7.26, "elapsed_time": "0:01:19", "remaining_time": "0:17:00", "throughput": 3392.13, "total_tokens": 270912} +{"current_steps": 555, "total_steps": 7577, "loss": 0.0023, "lr": 1.4617414248021108e-06, "epoch": 0.07324798733007787, "percentage": 7.32, "elapsed_time": "0:01:20", "remaining_time": "0:16:54", "throughput": 3414.03, "total_tokens": 273664} +{"current_steps": 560, "total_steps": 7577, "loss": 0.1737, "lr": 1.4749340369393139e-06, "epoch": 0.07390787910782631, "percentage": 7.39, "elapsed_time": "0:01:20", "remaining_time": "0:16:48", "throughput": 3432.82, "total_tokens": 276160} +{"current_steps": 565, "total_steps": 7577, "loss": 0.0057, "lr": 1.4881266490765171e-06, "epoch": 0.07456777088557477, "percentage": 7.46, "elapsed_time": "0:01:20", "remaining_time": "0:16:41", "throughput": 3453.02, "total_tokens": 278784} +{"current_steps": 570, "total_steps": 7577, "loss": 0.2339, "lr": 1.5013192612137202e-06, "epoch": 0.07522766266332322, "percentage": 7.52, "elapsed_time": "0:01:21", "remaining_time": "0:16:36", "throughput": 3470.07, "total_tokens": 281152} +{"current_steps": 575, "total_steps": 7577, "loss": 0.2223, "lr": 1.5145118733509235e-06, "epoch": 0.07588755444107166, "percentage": 7.59, "elapsed_time": "0:01:21", "remaining_time": "0:16:30", "throughput": 3486.04, "total_tokens": 283456} +{"current_steps": 580, "total_steps": 7577, "loss": 0.0743, "lr": 1.5277044854881265e-06, "epoch": 0.07654744621882012, "percentage": 7.65, "elapsed_time": "0:01:21", "remaining_time": "0:16:24", "throughput": 3504.71, "total_tokens": 286016} +{"current_steps": 585, "total_steps": 7577, "loss": 0.1493, "lr": 1.5408970976253298e-06, "epoch": 0.07720733799656856, "percentage": 7.72, "elapsed_time": "0:01:21", "remaining_time": "0:16:18", "throughput": 3521.55, "total_tokens": 288448} +{"current_steps": 590, "total_steps": 7577, "loss": 0.1879, "lr": 1.5540897097625329e-06, "epoch": 0.07786722977431701, "percentage": 7.79, "elapsed_time": "0:01:22", "remaining_time": "0:16:13", "throughput": 3537.83, "total_tokens": 290816} +{"current_steps": 595, "total_steps": 7577, "loss": 0.2187, "lr": 1.567282321899736e-06, "epoch": 0.07852712155206547, "percentage": 7.85, "elapsed_time": "0:01:22", "remaining_time": "0:16:08", "throughput": 3557.83, "total_tokens": 293504} +{"current_steps": 600, "total_steps": 7577, "loss": 0.1335, "lr": 1.5804749340369392e-06, "epoch": 0.07918701332981391, "percentage": 7.92, "elapsed_time": "0:01:22", "remaining_time": "0:16:02", "throughput": 3572.58, "total_tokens": 295744} +{"current_steps": 605, "total_steps": 7577, "loss": 0.0816, "lr": 1.5936675461741425e-06, "epoch": 0.07984690510756236, "percentage": 7.98, "elapsed_time": "0:01:23", "remaining_time": "0:15:57", "throughput": 3588.58, "total_tokens": 298112} +{"current_steps": 610, "total_steps": 7577, "loss": 0.1134, "lr": 1.6068601583113455e-06, "epoch": 0.0805067968853108, "percentage": 8.05, "elapsed_time": "0:01:23", "remaining_time": "0:15:52", "throughput": 3605.97, "total_tokens": 300608} +{"current_steps": 615, "total_steps": 7577, "loss": 0.0047, "lr": 1.6200527704485488e-06, "epoch": 0.08116668866305926, "percentage": 8.12, "elapsed_time": "0:01:23", "remaining_time": "0:15:47", "throughput": 3626.14, "total_tokens": 303360} +{"current_steps": 620, "total_steps": 7577, "loss": 0.0712, "lr": 1.633245382585752e-06, "epoch": 0.0818265804408077, "percentage": 8.18, "elapsed_time": "0:01:23", "remaining_time": "0:15:41", "throughput": 3644.23, "total_tokens": 305920} +{"current_steps": 625, "total_steps": 7577, "loss": 0.0795, "lr": 1.646437994722955e-06, "epoch": 0.08248647221855615, "percentage": 8.25, "elapsed_time": "0:01:24", "remaining_time": "0:15:36", "throughput": 3661.38, "total_tokens": 308416} +{"current_steps": 630, "total_steps": 7577, "loss": 0.1324, "lr": 1.6596306068601582e-06, "epoch": 0.08314636399630461, "percentage": 8.31, "elapsed_time": "0:01:24", "remaining_time": "0:15:32", "throughput": 3677.75, "total_tokens": 310848} +{"current_steps": 635, "total_steps": 7577, "loss": 0.2123, "lr": 1.6728232189973614e-06, "epoch": 0.08380625577405305, "percentage": 8.38, "elapsed_time": "0:01:24", "remaining_time": "0:15:27", "throughput": 3695.38, "total_tokens": 313408} +{"current_steps": 640, "total_steps": 7577, "loss": 0.1099, "lr": 1.6860158311345645e-06, "epoch": 0.0844661475518015, "percentage": 8.45, "elapsed_time": "0:01:25", "remaining_time": "0:15:22", "throughput": 3712.15, "total_tokens": 315904} +{"current_steps": 645, "total_steps": 7577, "loss": 0.2301, "lr": 1.6992084432717678e-06, "epoch": 0.08512603932954996, "percentage": 8.51, "elapsed_time": "0:01:25", "remaining_time": "0:15:17", "throughput": 3725.17, "total_tokens": 318080} +{"current_steps": 650, "total_steps": 7577, "loss": 0.0621, "lr": 1.712401055408971e-06, "epoch": 0.0857859311072984, "percentage": 8.58, "elapsed_time": "0:01:25", "remaining_time": "0:15:13", "throughput": 3738.07, "total_tokens": 320256} +{"current_steps": 655, "total_steps": 7577, "loss": 0.0199, "lr": 1.7255936675461739e-06, "epoch": 0.08644582288504685, "percentage": 8.64, "elapsed_time": "0:01:25", "remaining_time": "0:15:08", "throughput": 3751.62, "total_tokens": 322496} +{"current_steps": 660, "total_steps": 7577, "loss": 0.1723, "lr": 1.7387862796833772e-06, "epoch": 0.0871057146627953, "percentage": 8.71, "elapsed_time": "0:01:26", "remaining_time": "0:15:03", "throughput": 3769.33, "total_tokens": 325120} +{"current_steps": 665, "total_steps": 7577, "loss": 0.0485, "lr": 1.7519788918205804e-06, "epoch": 0.08776560644054375, "percentage": 8.78, "elapsed_time": "0:01:26", "remaining_time": "0:14:59", "throughput": 3781.96, "total_tokens": 327296} +{"current_steps": 670, "total_steps": 7577, "loss": 0.4327, "lr": 1.7651715039577835e-06, "epoch": 0.0884254982182922, "percentage": 8.84, "elapsed_time": "0:01:26", "remaining_time": "0:14:55", "throughput": 3796.81, "total_tokens": 329664} +{"current_steps": 675, "total_steps": 7577, "loss": 0.0918, "lr": 1.7783641160949868e-06, "epoch": 0.08908538999604065, "percentage": 8.91, "elapsed_time": "0:01:27", "remaining_time": "0:14:50", "throughput": 3815.58, "total_tokens": 332416} +{"current_steps": 680, "total_steps": 7577, "loss": 0.0255, "lr": 1.79155672823219e-06, "epoch": 0.0897452817737891, "percentage": 8.97, "elapsed_time": "0:01:27", "remaining_time": "0:14:46", "throughput": 3832.32, "total_tokens": 334976} +{"current_steps": 685, "total_steps": 7577, "loss": 0.0856, "lr": 1.8047493403693929e-06, "epoch": 0.09040517355153754, "percentage": 9.04, "elapsed_time": "0:01:27", "remaining_time": "0:14:42", "throughput": 3848.26, "total_tokens": 337472} +{"current_steps": 690, "total_steps": 7577, "loss": 0.1861, "lr": 1.8179419525065961e-06, "epoch": 0.091065065329286, "percentage": 9.11, "elapsed_time": "0:01:27", "remaining_time": "0:14:38", "throughput": 3863.35, "total_tokens": 339904} +{"current_steps": 695, "total_steps": 7577, "loss": 0.1639, "lr": 1.8311345646437994e-06, "epoch": 0.09172495710703445, "percentage": 9.17, "elapsed_time": "0:01:28", "remaining_time": "0:14:34", "throughput": 3877.39, "total_tokens": 342272} +{"current_steps": 700, "total_steps": 7577, "loss": 0.1908, "lr": 1.8443271767810025e-06, "epoch": 0.09238484888478289, "percentage": 9.24, "elapsed_time": "0:01:28", "remaining_time": "0:14:30", "throughput": 3891.33, "total_tokens": 344640} +{"current_steps": 705, "total_steps": 7577, "loss": 0.0427, "lr": 1.8575197889182057e-06, "epoch": 0.09304474066253135, "percentage": 9.3, "elapsed_time": "0:01:28", "remaining_time": "0:14:26", "throughput": 3906.06, "total_tokens": 347072} +{"current_steps": 710, "total_steps": 7577, "loss": 0.194, "lr": 1.870712401055409e-06, "epoch": 0.0937046324402798, "percentage": 9.37, "elapsed_time": "0:01:29", "remaining_time": "0:14:22", "throughput": 3922.79, "total_tokens": 349696} +{"current_steps": 715, "total_steps": 7577, "loss": 0.0821, "lr": 1.883905013192612e-06, "epoch": 0.09436452421802824, "percentage": 9.44, "elapsed_time": "0:01:29", "remaining_time": "0:14:18", "throughput": 3938.51, "total_tokens": 352256} +{"current_steps": 720, "total_steps": 7577, "loss": 0.1312, "lr": 1.8970976253298151e-06, "epoch": 0.0950244159957767, "percentage": 9.5, "elapsed_time": "0:01:29", "remaining_time": "0:14:14", "throughput": 3956.23, "total_tokens": 355008} +{"current_steps": 725, "total_steps": 7577, "loss": 0.2885, "lr": 1.9102902374670186e-06, "epoch": 0.09568430777352514, "percentage": 9.57, "elapsed_time": "0:01:30", "remaining_time": "0:14:10", "throughput": 3969.72, "total_tokens": 357376} +{"current_steps": 730, "total_steps": 7577, "loss": 0.1712, "lr": 1.9234828496042215e-06, "epoch": 0.09634419955127359, "percentage": 9.63, "elapsed_time": "0:01:30", "remaining_time": "0:14:07", "throughput": 3982.46, "total_tokens": 359680} +{"current_steps": 735, "total_steps": 7577, "loss": 0.1537, "lr": 1.9366754617414247e-06, "epoch": 0.09700409132902205, "percentage": 9.7, "elapsed_time": "0:01:30", "remaining_time": "0:14:03", "throughput": 3997.21, "total_tokens": 362176} +{"current_steps": 740, "total_steps": 7577, "loss": 0.2028, "lr": 1.949868073878628e-06, "epoch": 0.09766398310677049, "percentage": 9.77, "elapsed_time": "0:01:30", "remaining_time": "0:13:59", "throughput": 4015.76, "total_tokens": 365056} +{"current_steps": 745, "total_steps": 7577, "loss": 0.2106, "lr": 1.963060686015831e-06, "epoch": 0.09832387488451894, "percentage": 9.83, "elapsed_time": "0:01:31", "remaining_time": "0:13:56", "throughput": 4029.67, "total_tokens": 367488} +{"current_steps": 750, "total_steps": 7577, "loss": 0.2852, "lr": 1.976253298153034e-06, "epoch": 0.0989837666622674, "percentage": 9.9, "elapsed_time": "0:01:31", "remaining_time": "0:13:52", "throughput": 4041.98, "total_tokens": 369792} +{"current_steps": 755, "total_steps": 7577, "loss": 0.1154, "lr": 1.9894459102902374e-06, "epoch": 0.09964365844001584, "percentage": 9.96, "elapsed_time": "0:01:31", "remaining_time": "0:13:49", "throughput": 4054.89, "total_tokens": 372160} +{"current_steps": 758, "total_steps": 7577, "eval_loss": 0.129482701420784, "epoch": 0.1000395935066649, "percentage": 10.0, "elapsed_time": "0:01:39", "remaining_time": "0:14:56", "throughput": 3748.07, "total_tokens": 373504} +{"current_steps": 760, "total_steps": 7577, "loss": 0.0874, "lr": 1.9999998938723955e-06, "epoch": 0.10030355021776428, "percentage": 10.03, "elapsed_time": "0:02:02", "remaining_time": "0:18:21", "throughput": 3047.59, "total_tokens": 374272} +{"current_steps": 765, "total_steps": 7577, "loss": 0.0774, "lr": 1.9999961794086063e-06, "epoch": 0.10096344199551274, "percentage": 10.1, "elapsed_time": "0:02:03", "remaining_time": "0:18:16", "throughput": 3060.18, "total_tokens": 376704} +{"current_steps": 770, "total_steps": 7577, "loss": 0.2165, "lr": 1.999987158587122e-06, "epoch": 0.10162333377326119, "percentage": 10.16, "elapsed_time": "0:02:03", "remaining_time": "0:18:10", "throughput": 3072.69, "total_tokens": 379136} +{"current_steps": 775, "total_steps": 7577, "loss": 0.1505, "lr": 1.9999728314558114e-06, "epoch": 0.10228322555100963, "percentage": 10.23, "elapsed_time": "0:02:03", "remaining_time": "0:18:05", "throughput": 3085.18, "total_tokens": 381568} +{"current_steps": 780, "total_steps": 7577, "loss": 0.2297, "lr": 1.9999531980906988e-06, "epoch": 0.10294311732875808, "percentage": 10.29, "elapsed_time": "0:02:03", "remaining_time": "0:18:00", "throughput": 3098.6, "total_tokens": 384128} +{"current_steps": 785, "total_steps": 7577, "loss": 0.0893, "lr": 1.999928258595967e-06, "epoch": 0.10360300910650654, "percentage": 10.36, "elapsed_time": "0:02:04", "remaining_time": "0:17:55", "throughput": 3108.9, "total_tokens": 386304} +{"current_steps": 790, "total_steps": 7577, "loss": 0.2538, "lr": 1.9998980131039534e-06, "epoch": 0.10426290088425498, "percentage": 10.43, "elapsed_time": "0:02:04", "remaining_time": "0:17:49", "throughput": 3122.25, "total_tokens": 388864} +{"current_steps": 795, "total_steps": 7577, "loss": 0.0914, "lr": 1.999862461775153e-06, "epoch": 0.10492279266200343, "percentage": 10.49, "elapsed_time": "0:02:04", "remaining_time": "0:17:44", "throughput": 3132.99, "total_tokens": 391104} +{"current_steps": 800, "total_steps": 7577, "loss": 0.1431, "lr": 1.999821604798214e-06, "epoch": 0.10558268443975188, "percentage": 10.56, "elapsed_time": "0:02:05", "remaining_time": "0:17:39", "throughput": 3147.66, "total_tokens": 393856} +{"current_steps": 805, "total_steps": 7577, "loss": 0.3214, "lr": 1.999775442389939e-06, "epoch": 0.10624257621750033, "percentage": 10.62, "elapsed_time": "0:02:05", "remaining_time": "0:17:35", "throughput": 3160.27, "total_tokens": 396352} +{"current_steps": 810, "total_steps": 7577, "loss": 0.1422, "lr": 1.9997239747952843e-06, "epoch": 0.10690246799524877, "percentage": 10.69, "elapsed_time": "0:02:05", "remaining_time": "0:17:30", "throughput": 3170.81, "total_tokens": 398592} +{"current_steps": 815, "total_steps": 7577, "loss": 0.0609, "lr": 1.9996672022873546e-06, "epoch": 0.10756235977299723, "percentage": 10.76, "elapsed_time": "0:02:05", "remaining_time": "0:17:25", "throughput": 3183.34, "total_tokens": 401088} +{"current_steps": 820, "total_steps": 7577, "loss": 0.0726, "lr": 1.9996051251674073e-06, "epoch": 0.10822225155074568, "percentage": 10.82, "elapsed_time": "0:02:06", "remaining_time": "0:17:20", "throughput": 3194.84, "total_tokens": 403456} +{"current_steps": 825, "total_steps": 7577, "loss": 0.1602, "lr": 1.999537743764847e-06, "epoch": 0.10888214332849412, "percentage": 10.89, "elapsed_time": "0:02:06", "remaining_time": "0:17:15", "throughput": 3205.31, "total_tokens": 405696} +{"current_steps": 830, "total_steps": 7577, "loss": 0.4649, "lr": 1.999465058437225e-06, "epoch": 0.10954203510624258, "percentage": 10.95, "elapsed_time": "0:02:06", "remaining_time": "0:17:11", "throughput": 3217.18, "total_tokens": 408128} +{"current_steps": 835, "total_steps": 7577, "loss": 0.0112, "lr": 1.9993870695702364e-06, "epoch": 0.11020192688399102, "percentage": 11.02, "elapsed_time": "0:02:07", "remaining_time": "0:17:06", "throughput": 3232.37, "total_tokens": 411008} +{"current_steps": 840, "total_steps": 7577, "loss": 0.3035, "lr": 1.9993037775777206e-06, "epoch": 0.11086181866173947, "percentage": 11.09, "elapsed_time": "0:02:07", "remaining_time": "0:17:02", "throughput": 3243.18, "total_tokens": 413312} +{"current_steps": 845, "total_steps": 7577, "loss": 0.1141, "lr": 1.999215182901656e-06, "epoch": 0.11152171043948793, "percentage": 11.15, "elapsed_time": "0:02:07", "remaining_time": "0:16:57", "throughput": 3253.95, "total_tokens": 415616} +{"current_steps": 850, "total_steps": 7577, "loss": 0.1391, "lr": 1.9991212860121587e-06, "epoch": 0.11218160221723637, "percentage": 11.22, "elapsed_time": "0:02:08", "remaining_time": "0:16:53", "throughput": 3267.99, "total_tokens": 418368} +{"current_steps": 855, "total_steps": 7577, "loss": 0.0502, "lr": 1.999022087407482e-06, "epoch": 0.11284149399498482, "percentage": 11.28, "elapsed_time": "0:02:08", "remaining_time": "0:16:48", "throughput": 3280.09, "total_tokens": 420864} +{"current_steps": 860, "total_steps": 7577, "loss": 0.3102, "lr": 1.998917587614011e-06, "epoch": 0.11350138577273328, "percentage": 11.35, "elapsed_time": "0:02:08", "remaining_time": "0:16:44", "throughput": 3289.71, "total_tokens": 423040} +{"current_steps": 865, "total_steps": 7577, "loss": 0.3563, "lr": 1.9988077871862615e-06, "epoch": 0.11416127755048172, "percentage": 11.42, "elapsed_time": "0:02:08", "remaining_time": "0:16:40", "throughput": 3300.25, "total_tokens": 425344} +{"current_steps": 870, "total_steps": 7577, "loss": 0.0052, "lr": 1.9986926867068752e-06, "epoch": 0.11482116932823017, "percentage": 11.48, "elapsed_time": "0:02:09", "remaining_time": "0:16:35", "throughput": 3313.18, "total_tokens": 427968} +{"current_steps": 875, "total_steps": 7577, "loss": 0.2265, "lr": 1.998572286786619e-06, "epoch": 0.11548106110597862, "percentage": 11.55, "elapsed_time": "0:02:09", "remaining_time": "0:16:31", "throughput": 3326.0, "total_tokens": 430592} +{"current_steps": 880, "total_steps": 7577, "loss": 0.295, "lr": 1.9984465880643807e-06, "epoch": 0.11614095288372707, "percentage": 11.61, "elapsed_time": "0:02:09", "remaining_time": "0:16:27", "throughput": 3338.34, "total_tokens": 433152} +{"current_steps": 885, "total_steps": 7577, "loss": 0.0961, "lr": 1.998315591207165e-06, "epoch": 0.11680084466147551, "percentage": 11.68, "elapsed_time": "0:02:10", "remaining_time": "0:16:23", "throughput": 3348.71, "total_tokens": 435456} +{"current_steps": 890, "total_steps": 7577, "loss": 0.1703, "lr": 1.9981792969100912e-06, "epoch": 0.11746073643922397, "percentage": 11.75, "elapsed_time": "0:02:10", "remaining_time": "0:16:19", "throughput": 3361.35, "total_tokens": 438080} +{"current_steps": 895, "total_steps": 7577, "loss": 0.2036, "lr": 1.9980377058963875e-06, "epoch": 0.11812062821697242, "percentage": 11.81, "elapsed_time": "0:02:10", "remaining_time": "0:16:15", "throughput": 3373.49, "total_tokens": 440640} +{"current_steps": 900, "total_steps": 7577, "loss": 0.0225, "lr": 1.99789081891739e-06, "epoch": 0.11878051999472086, "percentage": 11.88, "elapsed_time": "0:02:10", "remaining_time": "0:16:11", "throughput": 3384.18, "total_tokens": 443008} +{"current_steps": 905, "total_steps": 7577, "loss": 0.2203, "lr": 1.997738636752536e-06, "epoch": 0.11944041177246932, "percentage": 11.94, "elapsed_time": "0:02:11", "remaining_time": "0:16:07", "throughput": 3394.3, "total_tokens": 445312} +{"current_steps": 910, "total_steps": 7577, "loss": 0.0016, "lr": 1.9975811602093624e-06, "epoch": 0.12010030355021777, "percentage": 12.01, "elapsed_time": "0:02:11", "remaining_time": "0:16:03", "throughput": 3404.92, "total_tokens": 447680} +{"current_steps": 915, "total_steps": 7577, "loss": 0.3289, "lr": 1.9974183901234984e-06, "epoch": 0.12076019532796621, "percentage": 12.08, "elapsed_time": "0:02:11", "remaining_time": "0:15:59", "throughput": 3417.79, "total_tokens": 450368} +{"current_steps": 920, "total_steps": 7577, "loss": 0.3008, "lr": 1.997250327358664e-06, "epoch": 0.12142008710571467, "percentage": 12.14, "elapsed_time": "0:02:12", "remaining_time": "0:15:55", "throughput": 3428.76, "total_tokens": 452800} +{"current_steps": 925, "total_steps": 7577, "loss": 0.1352, "lr": 1.997076972806664e-06, "epoch": 0.12207997888346311, "percentage": 12.21, "elapsed_time": "0:02:12", "remaining_time": "0:15:51", "throughput": 3443.36, "total_tokens": 455744} +{"current_steps": 930, "total_steps": 7577, "loss": 0.2869, "lr": 1.9968983273873827e-06, "epoch": 0.12273987066121156, "percentage": 12.27, "elapsed_time": "0:02:12", "remaining_time": "0:15:48", "throughput": 3454.22, "total_tokens": 458176} +{"current_steps": 935, "total_steps": 7577, "loss": 0.1694, "lr": 1.99671439204878e-06, "epoch": 0.12339976243896002, "percentage": 12.34, "elapsed_time": "0:02:12", "remaining_time": "0:15:44", "throughput": 3464.06, "total_tokens": 460480} +{"current_steps": 940, "total_steps": 7577, "loss": 0.1448, "lr": 1.9965251677668873e-06, "epoch": 0.12405965421670846, "percentage": 12.41, "elapsed_time": "0:02:13", "remaining_time": "0:15:40", "throughput": 3472.89, "total_tokens": 462656} +{"current_steps": 945, "total_steps": 7577, "loss": 0.2976, "lr": 1.9963306555458e-06, "epoch": 0.1247195459944569, "percentage": 12.47, "elapsed_time": "0:02:13", "remaining_time": "0:15:36", "throughput": 3485.45, "total_tokens": 465344} +{"current_steps": 950, "total_steps": 7577, "loss": 0.3325, "lr": 1.9961308564176723e-06, "epoch": 0.12537943777220537, "percentage": 12.54, "elapsed_time": "0:02:13", "remaining_time": "0:15:33", "throughput": 3495.64, "total_tokens": 467712} +{"current_steps": 955, "total_steps": 7577, "loss": 0.1471, "lr": 1.9959257714427147e-06, "epoch": 0.1260393295499538, "percentage": 12.6, "elapsed_time": "0:02:14", "remaining_time": "0:15:29", "throughput": 3505.8, "total_tokens": 470080} +{"current_steps": 960, "total_steps": 7577, "loss": 0.1476, "lr": 1.995715401709186e-06, "epoch": 0.12669922132770225, "percentage": 12.67, "elapsed_time": "0:02:14", "remaining_time": "0:15:26", "throughput": 3516.29, "total_tokens": 472512} +{"current_steps": 965, "total_steps": 7577, "loss": 0.1639, "lr": 1.995499748333387e-06, "epoch": 0.1273591131054507, "percentage": 12.74, "elapsed_time": "0:02:14", "remaining_time": "0:15:22", "throughput": 3525.42, "total_tokens": 474752} +{"current_steps": 970, "total_steps": 7577, "loss": 0.1539, "lr": 1.9952788124596555e-06, "epoch": 0.12801900488319914, "percentage": 12.8, "elapsed_time": "0:02:14", "remaining_time": "0:15:19", "throughput": 3537.71, "total_tokens": 477440} +{"current_steps": 975, "total_steps": 7577, "loss": 0.1507, "lr": 1.9950525952603617e-06, "epoch": 0.12867889666094762, "percentage": 12.87, "elapsed_time": "0:02:15", "remaining_time": "0:15:15", "throughput": 3548.98, "total_tokens": 480000} +{"current_steps": 980, "total_steps": 7577, "loss": 0.1434, "lr": 1.994821097935899e-06, "epoch": 0.12933878843869606, "percentage": 12.93, "elapsed_time": "0:02:15", "remaining_time": "0:15:12", "throughput": 3558.85, "total_tokens": 482368} +{"current_steps": 985, "total_steps": 7577, "loss": 0.0706, "lr": 1.9945843217146804e-06, "epoch": 0.1299986802164445, "percentage": 13.0, "elapsed_time": "0:02:15", "remaining_time": "0:15:09", "throughput": 3567.23, "total_tokens": 484544} +{"current_steps": 990, "total_steps": 7577, "loss": 0.1142, "lr": 1.9943422678531293e-06, "epoch": 0.13065857199419295, "percentage": 13.07, "elapsed_time": "0:02:16", "remaining_time": "0:15:05", "throughput": 3575.63, "total_tokens": 486720} +{"current_steps": 995, "total_steps": 7577, "loss": 0.0692, "lr": 1.994094937635675e-06, "epoch": 0.1313184637719414, "percentage": 13.13, "elapsed_time": "0:02:16", "remaining_time": "0:15:02", "throughput": 3587.18, "total_tokens": 489344} +{"current_steps": 1000, "total_steps": 7577, "loss": 0.0421, "lr": 1.9938423323747457e-06, "epoch": 0.13197835554968984, "percentage": 13.2, "elapsed_time": "0:02:16", "remaining_time": "0:14:59", "throughput": 3597.34, "total_tokens": 491776} +{"current_steps": 1005, "total_steps": 7577, "loss": 0.1827, "lr": 1.99358445341076e-06, "epoch": 0.1326382473274383, "percentage": 13.26, "elapsed_time": "0:02:16", "remaining_time": "0:14:55", "throughput": 3605.63, "total_tokens": 493952} +{"current_steps": 1010, "total_steps": 7577, "loss": 0.2152, "lr": 1.993321302112121e-06, "epoch": 0.13329813910518676, "percentage": 13.33, "elapsed_time": "0:02:17", "remaining_time": "0:14:52", "throughput": 3615.23, "total_tokens": 496320} +{"current_steps": 1015, "total_steps": 7577, "loss": 0.0299, "lr": 1.993052879875209e-06, "epoch": 0.1339580308829352, "percentage": 13.4, "elapsed_time": "0:02:17", "remaining_time": "0:14:49", "throughput": 3623.41, "total_tokens": 498496} +{"current_steps": 1020, "total_steps": 7577, "loss": 0.1351, "lr": 1.992779188124374e-06, "epoch": 0.13461792266068365, "percentage": 13.46, "elapsed_time": "0:02:17", "remaining_time": "0:14:46", "throughput": 3634.31, "total_tokens": 501056} +{"current_steps": 1025, "total_steps": 7577, "loss": 0.0501, "lr": 1.992500228311928e-06, "epoch": 0.1352778144384321, "percentage": 13.53, "elapsed_time": "0:02:18", "remaining_time": "0:14:43", "throughput": 3642.94, "total_tokens": 503296} +{"current_steps": 1030, "total_steps": 7577, "loss": 0.3259, "lr": 1.9922160019181372e-06, "epoch": 0.13593770621618054, "percentage": 13.59, "elapsed_time": "0:02:18", "remaining_time": "0:14:40", "throughput": 3653.81, "total_tokens": 505856} +{"current_steps": 1035, "total_steps": 7577, "loss": 0.1532, "lr": 1.9919265104512138e-06, "epoch": 0.13659759799392898, "percentage": 13.66, "elapsed_time": "0:02:18", "remaining_time": "0:14:36", "throughput": 3664.57, "total_tokens": 508416} +{"current_steps": 1040, "total_steps": 7577, "loss": 0.2708, "lr": 1.9916317554473094e-06, "epoch": 0.13725748977167745, "percentage": 13.73, "elapsed_time": "0:02:19", "remaining_time": "0:14:33", "throughput": 3675.74, "total_tokens": 511040} +{"current_steps": 1045, "total_steps": 7577, "loss": 0.188, "lr": 1.9913317384705052e-06, "epoch": 0.1379173815494259, "percentage": 13.79, "elapsed_time": "0:02:19", "remaining_time": "0:14:30", "throughput": 3683.77, "total_tokens": 513216} +{"current_steps": 1050, "total_steps": 7577, "loss": 0.1146, "lr": 1.991026461112805e-06, "epoch": 0.13857727332717434, "percentage": 13.86, "elapsed_time": "0:02:19", "remaining_time": "0:14:27", "throughput": 3692.18, "total_tokens": 515456} +{"current_steps": 1055, "total_steps": 7577, "loss": 0.1353, "lr": 1.9907159249941257e-06, "epoch": 0.1392371651049228, "percentage": 13.92, "elapsed_time": "0:02:19", "remaining_time": "0:14:24", "throughput": 3701.45, "total_tokens": 517824} +{"current_steps": 1060, "total_steps": 7577, "loss": 0.112, "lr": 1.990400131762289e-06, "epoch": 0.13989705688267123, "percentage": 13.99, "elapsed_time": "0:02:20", "remaining_time": "0:14:21", "throughput": 3711.61, "total_tokens": 520320} +{"current_steps": 1065, "total_steps": 7577, "loss": 0.0702, "lr": 1.9900790830930134e-06, "epoch": 0.14055694866041968, "percentage": 14.06, "elapsed_time": "0:02:20", "remaining_time": "0:14:18", "throughput": 3721.08, "total_tokens": 522752} +{"current_steps": 1070, "total_steps": 7577, "loss": 0.1085, "lr": 1.9897527806899047e-06, "epoch": 0.14121684043816815, "percentage": 14.12, "elapsed_time": "0:02:20", "remaining_time": "0:14:16", "throughput": 3731.92, "total_tokens": 525376} +{"current_steps": 1075, "total_steps": 7577, "loss": 0.2922, "lr": 1.9894212262844465e-06, "epoch": 0.1418767322159166, "percentage": 14.19, "elapsed_time": "0:02:21", "remaining_time": "0:14:13", "throughput": 3741.53, "total_tokens": 527808} +{"current_steps": 1080, "total_steps": 7577, "loss": 0.1607, "lr": 1.989084421635992e-06, "epoch": 0.14253662399366504, "percentage": 14.25, "elapsed_time": "0:02:21", "remaining_time": "0:14:10", "throughput": 3751.54, "total_tokens": 530304} +{"current_steps": 1085, "total_steps": 7577, "loss": 0.2576, "lr": 1.988742368531754e-06, "epoch": 0.14319651577141349, "percentage": 14.32, "elapsed_time": "0:02:21", "remaining_time": "0:14:07", "throughput": 3759.24, "total_tokens": 532480} +{"current_steps": 1090, "total_steps": 7577, "loss": 0.0676, "lr": 1.9883950687867947e-06, "epoch": 0.14385640754916193, "percentage": 14.39, "elapsed_time": "0:02:21", "remaining_time": "0:14:04", "throughput": 3770.53, "total_tokens": 535168} +{"current_steps": 1095, "total_steps": 7577, "loss": 0.1067, "lr": 1.9880425242440187e-06, "epoch": 0.14451629932691037, "percentage": 14.45, "elapsed_time": "0:02:22", "remaining_time": "0:14:01", "throughput": 3779.92, "total_tokens": 537600} +{"current_steps": 1100, "total_steps": 7577, "loss": 0.1435, "lr": 1.9876847367741607e-06, "epoch": 0.14517619110465885, "percentage": 14.52, "elapsed_time": "0:02:22", "remaining_time": "0:13:59", "throughput": 3789.8, "total_tokens": 540096} +{"current_steps": 1105, "total_steps": 7577, "loss": 0.1568, "lr": 1.987321708275776e-06, "epoch": 0.1458360828824073, "percentage": 14.58, "elapsed_time": "0:02:22", "remaining_time": "0:13:56", "throughput": 3799.61, "total_tokens": 542592} +{"current_steps": 1110, "total_steps": 7577, "loss": 0.0017, "lr": 1.986953440675231e-06, "epoch": 0.14649597466015574, "percentage": 14.65, "elapsed_time": "0:02:23", "remaining_time": "0:13:53", "throughput": 3808.43, "total_tokens": 544960} +{"current_steps": 1115, "total_steps": 7577, "loss": 0.0812, "lr": 1.9865799359266925e-06, "epoch": 0.14715586643790418, "percentage": 14.72, "elapsed_time": "0:02:23", "remaining_time": "0:13:50", "throughput": 3815.97, "total_tokens": 547136} +{"current_steps": 1120, "total_steps": 7577, "loss": 0.0878, "lr": 1.986201196012118e-06, "epoch": 0.14781575821565263, "percentage": 14.78, "elapsed_time": "0:02:23", "remaining_time": "0:13:48", "throughput": 3824.3, "total_tokens": 549440} +{"current_steps": 1125, "total_steps": 7577, "loss": 0.2476, "lr": 1.985817222941245e-06, "epoch": 0.14847564999340107, "percentage": 14.85, "elapsed_time": "0:02:23", "remaining_time": "0:13:45", "throughput": 3834.81, "total_tokens": 552064} +{"current_steps": 1130, "total_steps": 7577, "loss": 0.082, "lr": 1.9854280187515794e-06, "epoch": 0.14913554177114954, "percentage": 14.91, "elapsed_time": "0:02:24", "remaining_time": "0:13:42", "throughput": 3843.57, "total_tokens": 554432} +{"current_steps": 1135, "total_steps": 7577, "loss": 0.0745, "lr": 1.985033585508386e-06, "epoch": 0.149795433548898, "percentage": 14.98, "elapsed_time": "0:02:24", "remaining_time": "0:13:40", "throughput": 3852.27, "total_tokens": 556800} +{"current_steps": 1137, "total_steps": 7577, "eval_loss": 0.19488762319087982, "epoch": 0.15005939025999737, "percentage": 15.01, "elapsed_time": "0:02:32", "remaining_time": "0:14:22", "throughput": 3665.25, "total_tokens": 557824} +{"current_steps": 1140, "total_steps": 7577, "loss": 0.5451, "lr": 1.9846339253046766e-06, "epoch": 0.15045532532664643, "percentage": 15.05, "elapsed_time": "0:03:34", "remaining_time": "0:20:09", "throughput": 2611.06, "total_tokens": 559296} +{"current_steps": 1145, "total_steps": 7577, "loss": 0.1735, "lr": 1.984229040261199e-06, "epoch": 0.15111521710439488, "percentage": 15.11, "elapsed_time": "0:03:34", "remaining_time": "0:20:04", "throughput": 2620.66, "total_tokens": 562112} +{"current_steps": 1150, "total_steps": 7577, "loss": 0.2349, "lr": 1.9838189325264263e-06, "epoch": 0.15177510888214332, "percentage": 15.18, "elapsed_time": "0:03:34", "remaining_time": "0:20:00", "throughput": 2627.33, "total_tokens": 564288} +{"current_steps": 1155, "total_steps": 7577, "loss": 0.0845, "lr": 1.983403604276546e-06, "epoch": 0.15243500065989177, "percentage": 15.24, "elapsed_time": "0:03:35", "remaining_time": "0:19:55", "throughput": 2635.77, "total_tokens": 566848} +{"current_steps": 1160, "total_steps": 7577, "loss": 0.394, "lr": 1.9829830577154457e-06, "epoch": 0.15309489243764024, "percentage": 15.31, "elapsed_time": "0:03:35", "remaining_time": "0:19:51", "throughput": 2643.0, "total_tokens": 569152} +{"current_steps": 1165, "total_steps": 7577, "loss": 0.0604, "lr": 1.982557295074705e-06, "epoch": 0.15375478421538868, "percentage": 15.38, "elapsed_time": "0:03:35", "remaining_time": "0:19:46", "throughput": 2650.22, "total_tokens": 571456} +{"current_steps": 1170, "total_steps": 7577, "loss": 0.1545, "lr": 1.982126318613581e-06, "epoch": 0.15441467599313713, "percentage": 15.44, "elapsed_time": "0:03:35", "remaining_time": "0:19:42", "throughput": 2657.74, "total_tokens": 573824} +{"current_steps": 1175, "total_steps": 7577, "loss": 0.0016, "lr": 1.9816901306189977e-06, "epoch": 0.15507456777088557, "percentage": 15.51, "elapsed_time": "0:03:36", "remaining_time": "0:19:37", "throughput": 2664.89, "total_tokens": 576128} +{"current_steps": 1180, "total_steps": 7577, "loss": 0.139, "lr": 1.9812487334055342e-06, "epoch": 0.15573445954863402, "percentage": 15.57, "elapsed_time": "0:03:36", "remaining_time": "0:19:33", "throughput": 2672.06, "total_tokens": 578432} +{"current_steps": 1185, "total_steps": 7577, "loss": 0.1618, "lr": 1.98080212931541e-06, "epoch": 0.15639435132638246, "percentage": 15.64, "elapsed_time": "0:03:36", "remaining_time": "0:19:29", "throughput": 2679.18, "total_tokens": 580736} +{"current_steps": 1190, "total_steps": 7577, "loss": 0.0846, "lr": 1.980350320718476e-06, "epoch": 0.15705424310413094, "percentage": 15.71, "elapsed_time": "0:03:37", "remaining_time": "0:19:24", "throughput": 2686.32, "total_tokens": 583040} +{"current_steps": 1195, "total_steps": 7577, "loss": 0.0073, "lr": 1.9798933100121985e-06, "epoch": 0.15771413488187938, "percentage": 15.77, "elapsed_time": "0:03:37", "remaining_time": "0:19:20", "throughput": 2693.4, "total_tokens": 585344} +{"current_steps": 1200, "total_steps": 7577, "loss": 0.0793, "lr": 1.97943109962165e-06, "epoch": 0.15837402665962783, "percentage": 15.84, "elapsed_time": "0:03:37", "remaining_time": "0:19:16", "throughput": 2701.62, "total_tokens": 587904} +{"current_steps": 1205, "total_steps": 7577, "loss": 0.1511, "lr": 1.978963691999493e-06, "epoch": 0.15903391843737627, "percentage": 15.9, "elapsed_time": "0:03:37", "remaining_time": "0:19:12", "throughput": 2708.67, "total_tokens": 590208} +{"current_steps": 1210, "total_steps": 7577, "loss": 0.0853, "lr": 1.978491089625969e-06, "epoch": 0.15969381021512472, "percentage": 15.97, "elapsed_time": "0:03:38", "remaining_time": "0:19:08", "throughput": 2715.74, "total_tokens": 592512} +{"current_steps": 1215, "total_steps": 7577, "loss": 0.1785, "lr": 1.9780132950088854e-06, "epoch": 0.16035370199287316, "percentage": 16.04, "elapsed_time": "0:03:38", "remaining_time": "0:19:03", "throughput": 2723.85, "total_tokens": 595072} +{"current_steps": 1220, "total_steps": 7577, "loss": 0.2842, "lr": 1.9775303106836e-06, "epoch": 0.1610135937706216, "percentage": 16.1, "elapsed_time": "0:03:38", "remaining_time": "0:18:59", "throughput": 2731.94, "total_tokens": 597632} +{"current_steps": 1225, "total_steps": 7577, "loss": 0.0847, "lr": 1.977042139213011e-06, "epoch": 0.16167348554837008, "percentage": 16.17, "elapsed_time": "0:03:39", "remaining_time": "0:18:55", "throughput": 2740.06, "total_tokens": 600192} +{"current_steps": 1230, "total_steps": 7577, "loss": 0.0931, "lr": 1.9765487831875404e-06, "epoch": 0.16233337732611852, "percentage": 16.23, "elapsed_time": "0:03:39", "remaining_time": "0:18:51", "throughput": 2746.17, "total_tokens": 602304} +{"current_steps": 1235, "total_steps": 7577, "loss": 0.1418, "lr": 1.9760502452251217e-06, "epoch": 0.16299326910386697, "percentage": 16.3, "elapsed_time": "0:03:39", "remaining_time": "0:18:47", "throughput": 2753.13, "total_tokens": 604608} +{"current_steps": 1240, "total_steps": 7577, "loss": 0.1102, "lr": 1.975546527971186e-06, "epoch": 0.1636531608816154, "percentage": 16.37, "elapsed_time": "0:03:39", "remaining_time": "0:18:43", "throughput": 2760.3, "total_tokens": 606976} +{"current_steps": 1245, "total_steps": 7577, "loss": 0.0447, "lr": 1.9750376340986472e-06, "epoch": 0.16431305265936386, "percentage": 16.43, "elapsed_time": "0:03:40", "remaining_time": "0:18:39", "throughput": 2768.52, "total_tokens": 609600} +{"current_steps": 1250, "total_steps": 7577, "loss": 0.1681, "lr": 1.974523566307889e-06, "epoch": 0.1649729444371123, "percentage": 16.5, "elapsed_time": "0:03:40", "remaining_time": "0:18:35", "throughput": 2775.15, "total_tokens": 611840} +{"current_steps": 1255, "total_steps": 7577, "loss": 0.1085, "lr": 1.9740043273267487e-06, "epoch": 0.16563283621486077, "percentage": 16.56, "elapsed_time": "0:03:40", "remaining_time": "0:18:32", "throughput": 2783.72, "total_tokens": 614528} +{"current_steps": 1260, "total_steps": 7577, "loss": 0.0217, "lr": 1.973479919910505e-06, "epoch": 0.16629272799260922, "percentage": 16.63, "elapsed_time": "0:03:41", "remaining_time": "0:18:28", "throughput": 2791.42, "total_tokens": 617024} +{"current_steps": 1265, "total_steps": 7577, "loss": 0.1141, "lr": 1.972950346841862e-06, "epoch": 0.16695261977035766, "percentage": 16.7, "elapsed_time": "0:03:41", "remaining_time": "0:18:24", "throughput": 2798.54, "total_tokens": 619392} +{"current_steps": 1270, "total_steps": 7577, "loss": 0.0049, "lr": 1.972415610930934e-06, "epoch": 0.1676125115481061, "percentage": 16.76, "elapsed_time": "0:03:41", "remaining_time": "0:18:20", "throughput": 2806.17, "total_tokens": 621888} +{"current_steps": 1275, "total_steps": 7577, "loss": 0.2469, "lr": 1.9718757150152324e-06, "epoch": 0.16827240332585455, "percentage": 16.83, "elapsed_time": "0:03:41", "remaining_time": "0:18:16", "throughput": 2812.98, "total_tokens": 624192} +{"current_steps": 1280, "total_steps": 7577, "loss": 0.0511, "lr": 1.9713306619596488e-06, "epoch": 0.168932295103603, "percentage": 16.89, "elapsed_time": "0:03:42", "remaining_time": "0:18:13", "throughput": 2820.3, "total_tokens": 626624} +{"current_steps": 1285, "total_steps": 7577, "loss": 0.0686, "lr": 1.9707804546564407e-06, "epoch": 0.16959218688135147, "percentage": 16.96, "elapsed_time": "0:03:42", "remaining_time": "0:18:09", "throughput": 2827.06, "total_tokens": 628928} +{"current_steps": 1290, "total_steps": 7577, "loss": 0.0234, "lr": 1.9702250960252164e-06, "epoch": 0.17025207865909991, "percentage": 17.03, "elapsed_time": "0:03:42", "remaining_time": "0:18:05", "throughput": 2835.5, "total_tokens": 631616} +{"current_steps": 1295, "total_steps": 7577, "loss": 0.0015, "lr": 1.969664589012918e-06, "epoch": 0.17091197043684836, "percentage": 17.09, "elapsed_time": "0:03:43", "remaining_time": "0:18:01", "throughput": 2843.05, "total_tokens": 634112} +{"current_steps": 1300, "total_steps": 7577, "loss": 0.3855, "lr": 1.9690989365938077e-06, "epoch": 0.1715718622145968, "percentage": 17.16, "elapsed_time": "0:03:43", "remaining_time": "0:17:58", "throughput": 2849.78, "total_tokens": 636416} +{"current_steps": 1305, "total_steps": 7577, "loss": 0.0051, "lr": 1.9685281417694513e-06, "epoch": 0.17223175399234525, "percentage": 17.22, "elapsed_time": "0:03:43", "remaining_time": "0:17:54", "throughput": 2857.07, "total_tokens": 638848} +{"current_steps": 1310, "total_steps": 7577, "loss": 0.1125, "lr": 1.967952207568702e-06, "epoch": 0.1728916457700937, "percentage": 17.29, "elapsed_time": "0:03:43", "remaining_time": "0:17:51", "throughput": 2864.05, "total_tokens": 641216} +{"current_steps": 1315, "total_steps": 7577, "loss": 0.0011, "lr": 1.967371137047685e-06, "epoch": 0.17355153754784217, "percentage": 17.36, "elapsed_time": "0:03:44", "remaining_time": "0:17:47", "throughput": 2872.92, "total_tokens": 644032} +{"current_steps": 1320, "total_steps": 7577, "loss": 0.1494, "lr": 1.966784933289778e-06, "epoch": 0.1742114293255906, "percentage": 17.42, "elapsed_time": "0:03:44", "remaining_time": "0:17:43", "throughput": 2880.37, "total_tokens": 646528} +{"current_steps": 1325, "total_steps": 7577, "loss": 0.1951, "lr": 1.9661935994056014e-06, "epoch": 0.17487132110333906, "percentage": 17.49, "elapsed_time": "0:03:44", "remaining_time": "0:17:40", "throughput": 2888.08, "total_tokens": 649088} +{"current_steps": 1330, "total_steps": 7577, "loss": 0.0093, "lr": 1.965597138532996e-06, "epoch": 0.1755312128810875, "percentage": 17.55, "elapsed_time": "0:03:45", "remaining_time": "0:17:36", "throughput": 2895.24, "total_tokens": 651520} +{"current_steps": 1335, "total_steps": 7577, "loss": 0.0409, "lr": 1.964995553837009e-06, "epoch": 0.17619110465883595, "percentage": 17.62, "elapsed_time": "0:03:45", "remaining_time": "0:17:33", "throughput": 2902.65, "total_tokens": 654016} +{"current_steps": 1340, "total_steps": 7577, "loss": 0.1143, "lr": 1.964388848509875e-06, "epoch": 0.1768509964365844, "percentage": 17.69, "elapsed_time": "0:03:45", "remaining_time": "0:17:30", "throughput": 2909.23, "total_tokens": 656320} +{"current_steps": 1345, "total_steps": 7577, "loss": 0.1683, "lr": 1.9637770257710026e-06, "epoch": 0.17751088821433286, "percentage": 17.75, "elapsed_time": "0:03:45", "remaining_time": "0:17:26", "throughput": 2916.9, "total_tokens": 658880} +{"current_steps": 1350, "total_steps": 7577, "loss": 0.0205, "lr": 1.9631600888669545e-06, "epoch": 0.1781707799920813, "percentage": 17.82, "elapsed_time": "0:03:46", "remaining_time": "0:17:23", "throughput": 2923.42, "total_tokens": 661184} +{"current_steps": 1355, "total_steps": 7577, "loss": 0.0664, "lr": 1.962538041071431e-06, "epoch": 0.17883067176982975, "percentage": 17.88, "elapsed_time": "0:03:46", "remaining_time": "0:17:19", "throughput": 2930.75, "total_tokens": 663680} +{"current_steps": 1360, "total_steps": 7577, "loss": 0.0688, "lr": 1.961910885685253e-06, "epoch": 0.1794905635475782, "percentage": 17.95, "elapsed_time": "0:03:46", "remaining_time": "0:17:16", "throughput": 2937.55, "total_tokens": 666048} +{"current_steps": 1365, "total_steps": 7577, "loss": 0.2636, "lr": 1.9612786260363436e-06, "epoch": 0.18015045532532664, "percentage": 18.02, "elapsed_time": "0:03:47", "remaining_time": "0:17:13", "throughput": 2944.59, "total_tokens": 668480} +{"current_steps": 1370, "total_steps": 7577, "loss": 0.1108, "lr": 1.9606412654797116e-06, "epoch": 0.1808103471030751, "percentage": 18.08, "elapsed_time": "0:03:47", "remaining_time": "0:17:09", "throughput": 2954.04, "total_tokens": 671488} +{"current_steps": 1375, "total_steps": 7577, "loss": 0.1088, "lr": 1.9599988073974332e-06, "epoch": 0.18147023888082353, "percentage": 18.15, "elapsed_time": "0:03:47", "remaining_time": "0:17:06", "throughput": 2961.06, "total_tokens": 673920} +{"current_steps": 1380, "total_steps": 7577, "loss": 0.1413, "lr": 1.959351255198634e-06, "epoch": 0.182130130658572, "percentage": 18.21, "elapsed_time": "0:03:47", "remaining_time": "0:17:03", "throughput": 2968.3, "total_tokens": 676416} +{"current_steps": 1385, "total_steps": 7577, "loss": 0.0008, "lr": 1.9586986123194704e-06, "epoch": 0.18279002243632045, "percentage": 18.28, "elapsed_time": "0:03:48", "remaining_time": "0:17:00", "throughput": 2976.08, "total_tokens": 679040} +{"current_steps": 1390, "total_steps": 7577, "loss": 0.1041, "lr": 1.958040882223112e-06, "epoch": 0.1834499142140689, "percentage": 18.34, "elapsed_time": "0:03:48", "remaining_time": "0:16:56", "throughput": 2984.86, "total_tokens": 681920} +{"current_steps": 1395, "total_steps": 7577, "loss": 0.04, "lr": 1.9573780683997235e-06, "epoch": 0.18410980599181734, "percentage": 18.41, "elapsed_time": "0:03:48", "remaining_time": "0:16:53", "throughput": 2992.05, "total_tokens": 684416} +{"current_steps": 1400, "total_steps": 7577, "loss": 0.3574, "lr": 1.956710174366445e-06, "epoch": 0.18476969776956578, "percentage": 18.48, "elapsed_time": "0:03:49", "remaining_time": "0:16:50", "throughput": 2999.49, "total_tokens": 686976} +{"current_steps": 1405, "total_steps": 7577, "loss": 0.2731, "lr": 1.9560372036673764e-06, "epoch": 0.18542958954731423, "percentage": 18.54, "elapsed_time": "0:03:49", "remaining_time": "0:16:47", "throughput": 3006.43, "total_tokens": 689408} +{"current_steps": 1410, "total_steps": 7577, "loss": 0.0238, "lr": 1.955359159873553e-06, "epoch": 0.1860894813250627, "percentage": 18.61, "elapsed_time": "0:03:49", "remaining_time": "0:16:44", "throughput": 3012.79, "total_tokens": 691712} +{"current_steps": 1415, "total_steps": 7577, "loss": 0.1341, "lr": 1.954676046582932e-06, "epoch": 0.18674937310281114, "percentage": 18.67, "elapsed_time": "0:03:49", "remaining_time": "0:16:41", "throughput": 3019.43, "total_tokens": 694080} +{"current_steps": 1420, "total_steps": 7577, "loss": 0.2135, "lr": 1.9539878674203706e-06, "epoch": 0.1874092648805596, "percentage": 18.74, "elapsed_time": "0:03:50", "remaining_time": "0:16:37", "throughput": 3026.81, "total_tokens": 696640} +{"current_steps": 1425, "total_steps": 7577, "loss": 0.0011, "lr": 1.9532946260376076e-06, "epoch": 0.18806915665830803, "percentage": 18.81, "elapsed_time": "0:03:50", "remaining_time": "0:16:34", "throughput": 3033.89, "total_tokens": 699136} +{"current_steps": 1430, "total_steps": 7577, "loss": 0.3109, "lr": 1.952596326113244e-06, "epoch": 0.18872904843605648, "percentage": 18.87, "elapsed_time": "0:03:50", "remaining_time": "0:16:31", "throughput": 3041.21, "total_tokens": 701696} +{"current_steps": 1435, "total_steps": 7577, "loss": 0.1812, "lr": 1.9518929713527226e-06, "epoch": 0.18938894021380492, "percentage": 18.94, "elapsed_time": "0:03:51", "remaining_time": "0:16:28", "throughput": 3049.05, "total_tokens": 704384} +{"current_steps": 1440, "total_steps": 7577, "loss": 0.0066, "lr": 1.9511845654883097e-06, "epoch": 0.1900488319915534, "percentage": 19.0, "elapsed_time": "0:03:51", "remaining_time": "0:16:25", "throughput": 3054.76, "total_tokens": 706560} +{"current_steps": 1445, "total_steps": 7577, "loss": 0.0755, "lr": 1.9504711122790754e-06, "epoch": 0.19070872376930184, "percentage": 19.07, "elapsed_time": "0:03:51", "remaining_time": "0:16:22", "throughput": 3062.59, "total_tokens": 709248} +{"current_steps": 1450, "total_steps": 7577, "loss": 0.2258, "lr": 1.949752615510871e-06, "epoch": 0.19136861554705029, "percentage": 19.14, "elapsed_time": "0:03:51", "remaining_time": "0:16:19", "throughput": 3067.72, "total_tokens": 711296} +{"current_steps": 1455, "total_steps": 7577, "loss": 0.0457, "lr": 1.949029078996313e-06, "epoch": 0.19202850732479873, "percentage": 19.2, "elapsed_time": "0:03:52", "remaining_time": "0:16:16", "throughput": 3074.48, "total_tokens": 713728} +{"current_steps": 1460, "total_steps": 7577, "loss": 0.1224, "lr": 1.9483005065747584e-06, "epoch": 0.19268839910254718, "percentage": 19.27, "elapsed_time": "0:03:52", "remaining_time": "0:16:13", "throughput": 3081.47, "total_tokens": 716224} +{"current_steps": 1465, "total_steps": 7577, "loss": 0.3816, "lr": 1.947566902112289e-06, "epoch": 0.19334829088029562, "percentage": 19.33, "elapsed_time": "0:03:52", "remaining_time": "0:16:10", "throughput": 3087.62, "total_tokens": 718528} +{"current_steps": 1470, "total_steps": 7577, "loss": 0.1841, "lr": 1.9468282695016863e-06, "epoch": 0.1940081826580441, "percentage": 19.4, "elapsed_time": "0:03:52", "remaining_time": "0:16:07", "throughput": 3094.28, "total_tokens": 720960} +{"current_steps": 1475, "total_steps": 7577, "loss": 0.1318, "lr": 1.946084612662415e-06, "epoch": 0.19466807443579254, "percentage": 19.47, "elapsed_time": "0:03:53", "remaining_time": "0:16:05", "throughput": 3100.12, "total_tokens": 723200} +{"current_steps": 1480, "total_steps": 7577, "loss": 0.1708, "lr": 1.9453359355405987e-06, "epoch": 0.19532796621354098, "percentage": 19.53, "elapsed_time": "0:03:53", "remaining_time": "0:16:02", "throughput": 3107.82, "total_tokens": 725888} +{"current_steps": 1485, "total_steps": 7577, "loss": 0.0194, "lr": 1.944582242109002e-06, "epoch": 0.19598785799128943, "percentage": 19.6, "elapsed_time": "0:03:53", "remaining_time": "0:15:59", "throughput": 3114.18, "total_tokens": 728256} +{"current_steps": 1490, "total_steps": 7577, "loss": 0.1454, "lr": 1.943823536367006e-06, "epoch": 0.19664774976903787, "percentage": 19.66, "elapsed_time": "0:03:54", "remaining_time": "0:15:56", "throughput": 3120.82, "total_tokens": 730688} +{"current_steps": 1495, "total_steps": 7577, "loss": 0.1624, "lr": 1.9430598223405913e-06, "epoch": 0.19730764154678632, "percentage": 19.73, "elapsed_time": "0:03:54", "remaining_time": "0:15:53", "throughput": 3126.8, "total_tokens": 732992} +{"current_steps": 1500, "total_steps": 7577, "loss": 0.1476, "lr": 1.9422911040823125e-06, "epoch": 0.1979675333245348, "percentage": 19.8, "elapsed_time": "0:03:54", "remaining_time": "0:15:50", "throughput": 3133.31, "total_tokens": 735424} +{"current_steps": 1505, "total_steps": 7577, "loss": 0.3263, "lr": 1.941517385671279e-06, "epoch": 0.19862742510228323, "percentage": 19.86, "elapsed_time": "0:03:54", "remaining_time": "0:15:48", "throughput": 3139.02, "total_tokens": 737664} +{"current_steps": 1510, "total_steps": 7577, "loss": 0.0942, "lr": 1.940738671213134e-06, "epoch": 0.19928731688003168, "percentage": 19.93, "elapsed_time": "0:03:55", "remaining_time": "0:15:45", "throughput": 3145.52, "total_tokens": 740096} +{"current_steps": 1515, "total_steps": 7577, "loss": 0.1712, "lr": 1.93995496484003e-06, "epoch": 0.19994720865778012, "percentage": 19.99, "elapsed_time": "0:03:55", "remaining_time": "0:15:42", "throughput": 3153.53, "total_tokens": 742912} +{"current_steps": 1516, "total_steps": 7577, "eval_loss": 0.1068890318274498, "epoch": 0.2000791870133298, "percentage": 20.01, "elapsed_time": "0:04:03", "remaining_time": "0:16:12", "throughput": 3055.51, "total_tokens": 743424} +{"current_steps": 1520, "total_steps": 7577, "loss": 0.1021, "lr": 1.9391662707106092e-06, "epoch": 0.20060710043552857, "percentage": 20.06, "elapsed_time": "0:04:21", "remaining_time": "0:17:21", "throughput": 2853.07, "total_tokens": 745536} +{"current_steps": 1525, "total_steps": 7577, "loss": 0.0031, "lr": 1.9383725930099814e-06, "epoch": 0.201266992213277, "percentage": 20.13, "elapsed_time": "0:04:21", "remaining_time": "0:17:18", "throughput": 2859.13, "total_tokens": 747968} +{"current_steps": 1530, "total_steps": 7577, "loss": 0.1222, "lr": 1.9375739359497e-06, "epoch": 0.20192688399102549, "percentage": 20.19, "elapsed_time": "0:04:21", "remaining_time": "0:17:15", "throughput": 2865.42, "total_tokens": 750464} +{"current_steps": 1535, "total_steps": 7577, "loss": 0.2416, "lr": 1.936770303767741e-06, "epoch": 0.20258677576877393, "percentage": 20.26, "elapsed_time": "0:04:22", "remaining_time": "0:17:12", "throughput": 2871.46, "total_tokens": 752896} +{"current_steps": 1540, "total_steps": 7577, "loss": 0.1974, "lr": 1.9359617007284815e-06, "epoch": 0.20324666754652237, "percentage": 20.32, "elapsed_time": "0:04:22", "remaining_time": "0:17:09", "throughput": 2878.74, "total_tokens": 755648} +{"current_steps": 1545, "total_steps": 7577, "loss": 0.2312, "lr": 1.9351481311226738e-06, "epoch": 0.20390655932427082, "percentage": 20.39, "elapsed_time": "0:04:22", "remaining_time": "0:17:05", "throughput": 2885.03, "total_tokens": 758144} +{"current_steps": 1550, "total_steps": 7577, "loss": 0.1313, "lr": 1.934329599267426e-06, "epoch": 0.20456645110201926, "percentage": 20.46, "elapsed_time": "0:04:23", "remaining_time": "0:17:02", "throughput": 2891.57, "total_tokens": 760704} +{"current_steps": 1555, "total_steps": 7577, "loss": 0.0468, "lr": 1.933506109506178e-06, "epoch": 0.2052263428797677, "percentage": 20.52, "elapsed_time": "0:04:23", "remaining_time": "0:16:59", "throughput": 2897.59, "total_tokens": 763136} +{"current_steps": 1560, "total_steps": 7577, "loss": 0.1132, "lr": 1.9326776662086765e-06, "epoch": 0.20588623465751615, "percentage": 20.59, "elapsed_time": "0:04:23", "remaining_time": "0:16:56", "throughput": 2905.22, "total_tokens": 766016} +{"current_steps": 1565, "total_steps": 7577, "loss": 0.3367, "lr": 1.9318442737709565e-06, "epoch": 0.20654612643526463, "percentage": 20.65, "elapsed_time": "0:04:23", "remaining_time": "0:16:54", "throughput": 2911.47, "total_tokens": 768512} +{"current_steps": 1570, "total_steps": 7577, "loss": 0.2047, "lr": 1.9310059366153116e-06, "epoch": 0.20720601821301307, "percentage": 20.72, "elapsed_time": "0:04:24", "remaining_time": "0:16:51", "throughput": 2916.99, "total_tokens": 770816} +{"current_steps": 1575, "total_steps": 7577, "loss": 0.2302, "lr": 1.930162659190277e-06, "epoch": 0.20786590999076152, "percentage": 20.79, "elapsed_time": "0:04:24", "remaining_time": "0:16:48", "throughput": 2923.17, "total_tokens": 773312} +{"current_steps": 1580, "total_steps": 7577, "loss": 0.0029, "lr": 1.9293144459706007e-06, "epoch": 0.20852580176850996, "percentage": 20.85, "elapsed_time": "0:04:24", "remaining_time": "0:16:45", "throughput": 2928.91, "total_tokens": 775680} +{"current_steps": 1585, "total_steps": 7577, "loss": 0.1877, "lr": 1.928461301457223e-06, "epoch": 0.2091856935462584, "percentage": 20.92, "elapsed_time": "0:04:25", "remaining_time": "0:16:42", "throughput": 2934.62, "total_tokens": 778048} +{"current_steps": 1590, "total_steps": 7577, "loss": 0.3027, "lr": 1.92760323017725e-06, "epoch": 0.20984558532400685, "percentage": 20.98, "elapsed_time": "0:04:25", "remaining_time": "0:16:39", "throughput": 2941.27, "total_tokens": 780672} +{"current_steps": 1595, "total_steps": 7577, "loss": 0.216, "lr": 1.9267402366839338e-06, "epoch": 0.21050547710175532, "percentage": 21.05, "elapsed_time": "0:04:25", "remaining_time": "0:16:36", "throughput": 2948.12, "total_tokens": 783360} +{"current_steps": 1600, "total_steps": 7577, "loss": 0.1268, "lr": 1.9258723255566433e-06, "epoch": 0.21116536887950377, "percentage": 21.12, "elapsed_time": "0:04:26", "remaining_time": "0:16:33", "throughput": 2954.24, "total_tokens": 785856} +{"current_steps": 1605, "total_steps": 7577, "loss": 0.1832, "lr": 1.924999501400843e-06, "epoch": 0.2118252606572522, "percentage": 21.18, "elapsed_time": "0:04:26", "remaining_time": "0:16:30", "throughput": 2960.88, "total_tokens": 788480} +{"current_steps": 1610, "total_steps": 7577, "loss": 0.1511, "lr": 1.924121768848068e-06, "epoch": 0.21248515243500066, "percentage": 21.25, "elapsed_time": "0:04:26", "remaining_time": "0:16:28", "throughput": 2967.29, "total_tokens": 791040} +{"current_steps": 1615, "total_steps": 7577, "loss": 0.1088, "lr": 1.923239132555899e-06, "epoch": 0.2131450442127491, "percentage": 21.31, "elapsed_time": "0:04:26", "remaining_time": "0:16:25", "throughput": 2973.66, "total_tokens": 793600} +{"current_steps": 1620, "total_steps": 7577, "loss": 0.1302, "lr": 1.9223515972079378e-06, "epoch": 0.21380493599049755, "percentage": 21.38, "elapsed_time": "0:04:27", "remaining_time": "0:16:22", "throughput": 2979.33, "total_tokens": 795968} +{"current_steps": 1625, "total_steps": 7577, "loss": 0.049, "lr": 1.9214591675137813e-06, "epoch": 0.21446482776824602, "percentage": 21.45, "elapsed_time": "0:04:27", "remaining_time": "0:16:19", "throughput": 2984.75, "total_tokens": 798272} +{"current_steps": 1630, "total_steps": 7577, "loss": 0.144, "lr": 1.9205618482090003e-06, "epoch": 0.21512471954599446, "percentage": 21.51, "elapsed_time": "0:04:27", "remaining_time": "0:16:16", "throughput": 2991.77, "total_tokens": 801024} +{"current_steps": 1635, "total_steps": 7577, "loss": 0.1374, "lr": 1.91965964405511e-06, "epoch": 0.2157846113237429, "percentage": 21.58, "elapsed_time": "0:04:28", "remaining_time": "0:16:14", "throughput": 2998.05, "total_tokens": 803584} +{"current_steps": 1640, "total_steps": 7577, "loss": 0.0117, "lr": 1.9187525598395457e-06, "epoch": 0.21644450310149135, "percentage": 21.64, "elapsed_time": "0:04:28", "remaining_time": "0:16:11", "throughput": 3003.64, "total_tokens": 805952} +{"current_steps": 1645, "total_steps": 7577, "loss": 0.1249, "lr": 1.9178406003756396e-06, "epoch": 0.2171043948792398, "percentage": 21.71, "elapsed_time": "0:04:28", "remaining_time": "0:16:08", "throughput": 3009.91, "total_tokens": 808512} +{"current_steps": 1650, "total_steps": 7577, "loss": 0.0819, "lr": 1.9169237705025936e-06, "epoch": 0.21776428665698824, "percentage": 21.78, "elapsed_time": "0:04:28", "remaining_time": "0:16:05", "throughput": 3016.38, "total_tokens": 811136} +{"current_steps": 1655, "total_steps": 7577, "loss": 0.0183, "lr": 1.9160020750854533e-06, "epoch": 0.21842417843473672, "percentage": 21.84, "elapsed_time": "0:04:29", "remaining_time": "0:16:03", "throughput": 3021.45, "total_tokens": 813376} +{"current_steps": 1660, "total_steps": 7577, "loss": 0.199, "lr": 1.915075519015083e-06, "epoch": 0.21908407021248516, "percentage": 21.91, "elapsed_time": "0:04:29", "remaining_time": "0:16:00", "throughput": 3027.43, "total_tokens": 815872} +{"current_steps": 1665, "total_steps": 7577, "loss": 0.0725, "lr": 1.914144107208139e-06, "epoch": 0.2197439619902336, "percentage": 21.97, "elapsed_time": "0:04:29", "remaining_time": "0:15:57", "throughput": 3032.95, "total_tokens": 818240} +{"current_steps": 1670, "total_steps": 7577, "loss": 0.0539, "lr": 1.913207844607045e-06, "epoch": 0.22040385376798205, "percentage": 22.04, "elapsed_time": "0:04:30", "remaining_time": "0:15:55", "throughput": 3038.96, "total_tokens": 820736} +{"current_steps": 1675, "total_steps": 7577, "loss": 0.2528, "lr": 1.912266736179964e-06, "epoch": 0.2210637455457305, "percentage": 22.11, "elapsed_time": "0:04:30", "remaining_time": "0:15:52", "throughput": 3046.29, "total_tokens": 823616} +{"current_steps": 1680, "total_steps": 7577, "loss": 0.1707, "lr": 1.9113207869207727e-06, "epoch": 0.22172363732347894, "percentage": 22.17, "elapsed_time": "0:04:30", "remaining_time": "0:15:50", "throughput": 3052.26, "total_tokens": 826112} +{"current_steps": 1685, "total_steps": 7577, "loss": 0.1356, "lr": 1.9103700018490365e-06, "epoch": 0.2223835291012274, "percentage": 22.24, "elapsed_time": "0:04:30", "remaining_time": "0:15:47", "throughput": 3058.46, "total_tokens": 828672} +{"current_steps": 1690, "total_steps": 7577, "loss": 0.1711, "lr": 1.9094143860099787e-06, "epoch": 0.22304342087897586, "percentage": 22.3, "elapsed_time": "0:04:31", "remaining_time": "0:15:44", "throughput": 3064.89, "total_tokens": 831296} +{"current_steps": 1695, "total_steps": 7577, "loss": 0.0895, "lr": 1.9084539444744594e-06, "epoch": 0.2237033126567243, "percentage": 22.37, "elapsed_time": "0:04:31", "remaining_time": "0:15:42", "throughput": 3071.06, "total_tokens": 833856} +{"current_steps": 1700, "total_steps": 7577, "loss": 0.1324, "lr": 1.907488682338944e-06, "epoch": 0.22436320443447275, "percentage": 22.44, "elapsed_time": "0:04:31", "remaining_time": "0:15:39", "throughput": 3077.44, "total_tokens": 836480} +{"current_steps": 1705, "total_steps": 7577, "loss": 0.0553, "lr": 1.9065186047254782e-06, "epoch": 0.2250230962122212, "percentage": 22.5, "elapsed_time": "0:04:32", "remaining_time": "0:15:37", "throughput": 3083.29, "total_tokens": 838976} +{"current_steps": 1710, "total_steps": 7577, "loss": 0.2205, "lr": 1.9055437167816604e-06, "epoch": 0.22568298798996964, "percentage": 22.57, "elapsed_time": "0:04:32", "remaining_time": "0:15:34", "throughput": 3090.1, "total_tokens": 841728} +{"current_steps": 1715, "total_steps": 7577, "loss": 0.0143, "lr": 1.9045640236806149e-06, "epoch": 0.22634287976771808, "percentage": 22.63, "elapsed_time": "0:04:32", "remaining_time": "0:15:32", "throughput": 3095.07, "total_tokens": 843968} +{"current_steps": 1720, "total_steps": 7577, "loss": 0.3401, "lr": 1.903579530620963e-06, "epoch": 0.22700277154546655, "percentage": 22.7, "elapsed_time": "0:04:32", "remaining_time": "0:15:29", "throughput": 3100.9, "total_tokens": 846464} +{"current_steps": 1725, "total_steps": 7577, "loss": 0.0967, "lr": 1.9025902428267975e-06, "epoch": 0.227662663323215, "percentage": 22.77, "elapsed_time": "0:04:33", "remaining_time": "0:15:27", "throughput": 3107.21, "total_tokens": 849088} +{"current_steps": 1730, "total_steps": 7577, "loss": 0.2082, "lr": 1.901596165547653e-06, "epoch": 0.22832255510096344, "percentage": 22.83, "elapsed_time": "0:04:33", "remaining_time": "0:15:24", "throughput": 3113.48, "total_tokens": 851712} +{"current_steps": 1735, "total_steps": 7577, "loss": 0.102, "lr": 1.9005973040584796e-06, "epoch": 0.2289824468787119, "percentage": 22.9, "elapsed_time": "0:04:33", "remaining_time": "0:15:22", "throughput": 3119.22, "total_tokens": 854208} +{"current_steps": 1740, "total_steps": 7577, "loss": 0.088, "lr": 1.8995936636596138e-06, "epoch": 0.22964233865646033, "percentage": 22.96, "elapsed_time": "0:04:34", "remaining_time": "0:15:19", "throughput": 3124.55, "total_tokens": 856576} +{"current_steps": 1745, "total_steps": 7577, "loss": 0.1348, "lr": 1.8985852496767504e-06, "epoch": 0.23030223043420878, "percentage": 23.03, "elapsed_time": "0:04:34", "remaining_time": "0:15:17", "throughput": 3130.18, "total_tokens": 859008} +{"current_steps": 1750, "total_steps": 7577, "loss": 0.1643, "lr": 1.897572067460916e-06, "epoch": 0.23096212221195725, "percentage": 23.1, "elapsed_time": "0:04:34", "remaining_time": "0:15:14", "throughput": 3135.71, "total_tokens": 861440} +{"current_steps": 1755, "total_steps": 7577, "loss": 0.0848, "lr": 1.8965541223884377e-06, "epoch": 0.2316220139897057, "percentage": 23.16, "elapsed_time": "0:04:35", "remaining_time": "0:15:12", "throughput": 3141.47, "total_tokens": 863936} +{"current_steps": 1760, "total_steps": 7577, "loss": 0.1238, "lr": 1.8955314198609171e-06, "epoch": 0.23228190576745414, "percentage": 23.23, "elapsed_time": "0:04:35", "remaining_time": "0:15:09", "throughput": 3146.34, "total_tokens": 866176} +{"current_steps": 1765, "total_steps": 7577, "loss": 0.0977, "lr": 1.8945039653052005e-06, "epoch": 0.23294179754520258, "percentage": 23.29, "elapsed_time": "0:04:35", "remaining_time": "0:15:07", "throughput": 3151.41, "total_tokens": 868480} +{"current_steps": 1770, "total_steps": 7577, "loss": 0.0877, "lr": 1.8934717641733498e-06, "epoch": 0.23360168932295103, "percentage": 23.36, "elapsed_time": "0:04:35", "remaining_time": "0:15:05", "throughput": 3157.14, "total_tokens": 870976} +{"current_steps": 1775, "total_steps": 7577, "loss": 0.2471, "lr": 1.8924348219426143e-06, "epoch": 0.23426158110069947, "percentage": 23.43, "elapsed_time": "0:04:36", "remaining_time": "0:15:02", "throughput": 3161.46, "total_tokens": 873088} +{"current_steps": 1780, "total_steps": 7577, "loss": 0.2694, "lr": 1.8913931441154016e-06, "epoch": 0.23492147287844795, "percentage": 23.49, "elapsed_time": "0:04:36", "remaining_time": "0:15:00", "throughput": 3166.92, "total_tokens": 875520} +{"current_steps": 1785, "total_steps": 7577, "loss": 0.0401, "lr": 1.8903467362192482e-06, "epoch": 0.2355813646561964, "percentage": 23.56, "elapsed_time": "0:04:36", "remaining_time": "0:14:57", "throughput": 3171.25, "total_tokens": 877632} +{"current_steps": 1790, "total_steps": 7577, "loss": 0.0696, "lr": 1.8892956038067895e-06, "epoch": 0.23624125643394484, "percentage": 23.62, "elapsed_time": "0:04:37", "remaining_time": "0:14:55", "throughput": 3176.49, "total_tokens": 880000} +{"current_steps": 1795, "total_steps": 7577, "loss": 0.0238, "lr": 1.8882397524557317e-06, "epoch": 0.23690114821169328, "percentage": 23.69, "elapsed_time": "0:04:37", "remaining_time": "0:14:53", "throughput": 3181.07, "total_tokens": 882176} +{"current_steps": 1800, "total_steps": 7577, "loss": 0.0642, "lr": 1.8871791877688208e-06, "epoch": 0.23756103998944172, "percentage": 23.76, "elapsed_time": "0:04:37", "remaining_time": "0:14:50", "throughput": 3187.22, "total_tokens": 884800} +{"current_steps": 1805, "total_steps": 7577, "loss": 0.0068, "lr": 1.8861139153738143e-06, "epoch": 0.23822093176719017, "percentage": 23.82, "elapsed_time": "0:04:37", "remaining_time": "0:14:48", "throughput": 3192.19, "total_tokens": 887104} +{"current_steps": 1810, "total_steps": 7577, "loss": 0.0012, "lr": 1.8850439409234498e-06, "epoch": 0.23888082354493864, "percentage": 23.89, "elapsed_time": "0:04:38", "remaining_time": "0:14:46", "throughput": 3197.19, "total_tokens": 889408} +{"current_steps": 1815, "total_steps": 7577, "loss": 0.1943, "lr": 1.8839692700954161e-06, "epoch": 0.2395407153226871, "percentage": 23.95, "elapsed_time": "0:04:38", "remaining_time": "0:14:44", "throughput": 3201.97, "total_tokens": 891648} +{"current_steps": 1820, "total_steps": 7577, "loss": 0.3211, "lr": 1.8828899085923234e-06, "epoch": 0.24020060710043553, "percentage": 24.02, "elapsed_time": "0:04:38", "remaining_time": "0:14:41", "throughput": 3207.85, "total_tokens": 894208} +{"current_steps": 1825, "total_steps": 7577, "loss": 0.2085, "lr": 1.881805862141671e-06, "epoch": 0.24086049887818398, "percentage": 24.09, "elapsed_time": "0:04:39", "remaining_time": "0:14:39", "throughput": 3213.5, "total_tokens": 896704} +{"current_steps": 1830, "total_steps": 7577, "loss": 0.082, "lr": 1.8807171364958196e-06, "epoch": 0.24152039065593242, "percentage": 24.15, "elapsed_time": "0:04:39", "remaining_time": "0:14:37", "throughput": 3219.35, "total_tokens": 899264} +{"current_steps": 1835, "total_steps": 7577, "loss": 0.0206, "lr": 1.879623737431959e-06, "epoch": 0.24218028243368087, "percentage": 24.22, "elapsed_time": "0:04:39", "remaining_time": "0:14:34", "throughput": 3224.97, "total_tokens": 901760} +{"current_steps": 1840, "total_steps": 7577, "loss": 0.3077, "lr": 1.8785256707520778e-06, "epoch": 0.24284017421142934, "percentage": 24.28, "elapsed_time": "0:04:39", "remaining_time": "0:14:32", "throughput": 3229.23, "total_tokens": 903872} +{"current_steps": 1845, "total_steps": 7577, "loss": 0.0012, "lr": 1.8774229422829325e-06, "epoch": 0.24350006598917778, "percentage": 24.35, "elapsed_time": "0:04:40", "remaining_time": "0:14:30", "throughput": 3234.72, "total_tokens": 906368} +{"current_steps": 1850, "total_steps": 7577, "loss": 0.0491, "lr": 1.8763155578760181e-06, "epoch": 0.24415995776692623, "percentage": 24.42, "elapsed_time": "0:04:40", "remaining_time": "0:14:28", "throughput": 3240.32, "total_tokens": 908864} +{"current_steps": 1855, "total_steps": 7577, "loss": 0.0892, "lr": 1.8752035234075336e-06, "epoch": 0.24481984954467467, "percentage": 24.48, "elapsed_time": "0:04:40", "remaining_time": "0:14:26", "throughput": 3244.8, "total_tokens": 911040} +{"current_steps": 1860, "total_steps": 7577, "loss": 0.1932, "lr": 1.8740868447783554e-06, "epoch": 0.24547974132242312, "percentage": 24.55, "elapsed_time": "0:04:41", "remaining_time": "0:14:23", "throughput": 3249.92, "total_tokens": 913408} +{"current_steps": 1865, "total_steps": 7577, "loss": 0.2285, "lr": 1.8729655279140012e-06, "epoch": 0.24613963310017156, "percentage": 24.61, "elapsed_time": "0:04:41", "remaining_time": "0:14:21", "throughput": 3255.66, "total_tokens": 915968} +{"current_steps": 1870, "total_steps": 7577, "loss": 0.1745, "lr": 1.8718395787646029e-06, "epoch": 0.24679952487792003, "percentage": 24.68, "elapsed_time": "0:04:41", "remaining_time": "0:14:19", "throughput": 3261.46, "total_tokens": 918528} +{"current_steps": 1875, "total_steps": 7577, "loss": 0.0009, "lr": 1.870709003304872e-06, "epoch": 0.24745941665566848, "percentage": 24.75, "elapsed_time": "0:04:41", "remaining_time": "0:14:17", "throughput": 3267.43, "total_tokens": 921152} +{"current_steps": 1880, "total_steps": 7577, "loss": 0.0005, "lr": 1.8695738075340693e-06, "epoch": 0.24811930843341692, "percentage": 24.81, "elapsed_time": "0:04:42", "remaining_time": "0:14:15", "throughput": 3272.45, "total_tokens": 923520} +{"current_steps": 1885, "total_steps": 7577, "loss": 0.1696, "lr": 1.8684339974759723e-06, "epoch": 0.24877920021116537, "percentage": 24.88, "elapsed_time": "0:04:42", "remaining_time": "0:14:13", "throughput": 3277.49, "total_tokens": 925888} +{"current_steps": 1890, "total_steps": 7577, "loss": 0.0881, "lr": 1.8672895791788445e-06, "epoch": 0.2494390919889138, "percentage": 24.94, "elapsed_time": "0:04:42", "remaining_time": "0:14:10", "throughput": 3284.06, "total_tokens": 928704} +{"current_steps": 1895, "total_steps": 7577, "loss": 0.2865, "lr": 1.8661405587154017e-06, "epoch": 0.2500989837666623, "percentage": 25.01, "elapsed_time": "0:04:43", "remaining_time": "0:14:08", "throughput": 3288.6, "total_tokens": 930944} +{"current_steps": 1895, "total_steps": 7577, "eval_loss": 0.12773367762565613, "epoch": 0.2500989837666623, "percentage": 25.01, "elapsed_time": "0:04:50", "remaining_time": "0:14:31", "throughput": 3202.19, "total_tokens": 930944} +{"current_steps": 1900, "total_steps": 7577, "loss": 0.2389, "lr": 1.8649869421827808e-06, "epoch": 0.25075887554441073, "percentage": 25.08, "elapsed_time": "0:05:25", "remaining_time": "0:16:11", "throughput": 2870.44, "total_tokens": 933376} +{"current_steps": 1905, "total_steps": 7577, "loss": 0.0517, "lr": 1.863828735702507e-06, "epoch": 0.2514187673221592, "percentage": 25.14, "elapsed_time": "0:05:25", "remaining_time": "0:16:09", "throughput": 2875.96, "total_tokens": 936000} +{"current_steps": 1910, "total_steps": 7577, "loss": 0.0611, "lr": 1.862665945420462e-06, "epoch": 0.2520786590999076, "percentage": 25.21, "elapsed_time": "0:05:25", "remaining_time": "0:16:06", "throughput": 2880.89, "total_tokens": 938432} +{"current_steps": 1915, "total_steps": 7577, "loss": 0.0838, "lr": 1.8614985775068498e-06, "epoch": 0.25273855087765607, "percentage": 25.27, "elapsed_time": "0:05:26", "remaining_time": "0:16:03", "throughput": 2887.11, "total_tokens": 941312} +{"current_steps": 1920, "total_steps": 7577, "loss": 0.0099, "lr": 1.860326638156167e-06, "epoch": 0.2533984426554045, "percentage": 25.34, "elapsed_time": "0:05:26", "remaining_time": "0:16:01", "throughput": 2891.24, "total_tokens": 943488} +{"current_steps": 1925, "total_steps": 7577, "loss": 0.1064, "lr": 1.8591501335871653e-06, "epoch": 0.25405833443315295, "percentage": 25.41, "elapsed_time": "0:05:26", "remaining_time": "0:15:58", "throughput": 2895.96, "total_tokens": 945856} +{"current_steps": 1930, "total_steps": 7577, "loss": 0.2861, "lr": 1.857969070042824e-06, "epoch": 0.2547182262109014, "percentage": 25.47, "elapsed_time": "0:05:26", "remaining_time": "0:15:56", "throughput": 2901.02, "total_tokens": 948352} +{"current_steps": 1935, "total_steps": 7577, "loss": 0.0541, "lr": 1.8567834537903116e-06, "epoch": 0.25537811798864984, "percentage": 25.54, "elapsed_time": "0:05:27", "remaining_time": "0:15:54", "throughput": 2906.47, "total_tokens": 950976} +{"current_steps": 1940, "total_steps": 7577, "loss": 0.1499, "lr": 1.8555932911209565e-06, "epoch": 0.2560380097663983, "percentage": 25.6, "elapsed_time": "0:05:27", "remaining_time": "0:15:51", "throughput": 2910.76, "total_tokens": 953216} +{"current_steps": 1945, "total_steps": 7577, "loss": 0.0338, "lr": 1.8543985883502119e-06, "epoch": 0.25669790154414673, "percentage": 25.67, "elapsed_time": "0:05:27", "remaining_time": "0:15:49", "throughput": 2915.64, "total_tokens": 955648} +{"current_steps": 1950, "total_steps": 7577, "loss": 0.0462, "lr": 1.8531993518176216e-06, "epoch": 0.25735779332189523, "percentage": 25.74, "elapsed_time": "0:05:28", "remaining_time": "0:15:46", "throughput": 2919.93, "total_tokens": 957888} +{"current_steps": 1955, "total_steps": 7577, "loss": 0.1275, "lr": 1.8519955878867889e-06, "epoch": 0.2580176850996437, "percentage": 25.8, "elapsed_time": "0:05:28", "remaining_time": "0:15:44", "throughput": 2924.2, "total_tokens": 960128} +{"current_steps": 1960, "total_steps": 7577, "loss": 0.1778, "lr": 1.8507873029453392e-06, "epoch": 0.2586775768773921, "percentage": 25.87, "elapsed_time": "0:05:28", "remaining_time": "0:15:41", "throughput": 2928.85, "total_tokens": 962496} +{"current_steps": 1965, "total_steps": 7577, "loss": 0.2342, "lr": 1.8495745034048896e-06, "epoch": 0.25933746865514057, "percentage": 25.93, "elapsed_time": "0:05:28", "remaining_time": "0:15:39", "throughput": 2934.24, "total_tokens": 965120} +{"current_steps": 1970, "total_steps": 7577, "loss": 0.0074, "lr": 1.8483571957010127e-06, "epoch": 0.259997360432889, "percentage": 26.0, "elapsed_time": "0:05:29", "remaining_time": "0:15:36", "throughput": 2939.24, "total_tokens": 967616} +{"current_steps": 1975, "total_steps": 7577, "loss": 0.0688, "lr": 1.8471353862932035e-06, "epoch": 0.26065725221063746, "percentage": 26.07, "elapsed_time": "0:05:29", "remaining_time": "0:15:34", "throughput": 2944.6, "total_tokens": 970240} +{"current_steps": 1980, "total_steps": 7577, "loss": 0.1752, "lr": 1.8459090816648444e-06, "epoch": 0.2613171439883859, "percentage": 26.13, "elapsed_time": "0:05:29", "remaining_time": "0:15:32", "throughput": 2949.02, "total_tokens": 972544} +{"current_steps": 1985, "total_steps": 7577, "loss": 0.2913, "lr": 1.8446782883231713e-06, "epoch": 0.26197703576613435, "percentage": 26.2, "elapsed_time": "0:05:30", "remaining_time": "0:15:29", "throughput": 2953.64, "total_tokens": 974912} +{"current_steps": 1990, "total_steps": 7577, "loss": 0.3162, "lr": 1.8434430127992387e-06, "epoch": 0.2626369275438828, "percentage": 26.26, "elapsed_time": "0:05:30", "remaining_time": "0:15:27", "throughput": 2957.66, "total_tokens": 977088} +{"current_steps": 1995, "total_steps": 7577, "loss": 0.1709, "lr": 1.8422032616478857e-06, "epoch": 0.26329681932163124, "percentage": 26.33, "elapsed_time": "0:05:30", "remaining_time": "0:15:25", "throughput": 2962.82, "total_tokens": 979648} +{"current_steps": 2000, "total_steps": 7577, "loss": 0.1159, "lr": 1.8409590414477001e-06, "epoch": 0.2639567110993797, "percentage": 26.4, "elapsed_time": "0:05:30", "remaining_time": "0:15:22", "throughput": 2968.31, "total_tokens": 982336} +{"current_steps": 2005, "total_steps": 7577, "loss": 0.0056, "lr": 1.839710358800985e-06, "epoch": 0.2646166028771281, "percentage": 26.46, "elapsed_time": "0:05:31", "remaining_time": "0:15:20", "throughput": 2973.06, "total_tokens": 984768} +{"current_steps": 2010, "total_steps": 7577, "loss": 0.0349, "lr": 1.8384572203337224e-06, "epoch": 0.2652764946548766, "percentage": 26.53, "elapsed_time": "0:05:31", "remaining_time": "0:15:18", "throughput": 2977.63, "total_tokens": 987136} +{"current_steps": 2015, "total_steps": 7577, "loss": 0.1309, "lr": 1.837199632695538e-06, "epoch": 0.26593638643262507, "percentage": 26.59, "elapsed_time": "0:05:31", "remaining_time": "0:15:15", "throughput": 2983.11, "total_tokens": 989824} +{"current_steps": 2020, "total_steps": 7577, "loss": 0.3374, "lr": 1.8359376025596682e-06, "epoch": 0.2665962782103735, "percentage": 26.66, "elapsed_time": "0:05:32", "remaining_time": "0:15:13", "throughput": 2987.28, "total_tokens": 992064} +{"current_steps": 2025, "total_steps": 7577, "loss": 0.1366, "lr": 1.8346711366229215e-06, "epoch": 0.26725616998812196, "percentage": 26.73, "elapsed_time": "0:05:32", "remaining_time": "0:15:11", "throughput": 2991.63, "total_tokens": 994368} +{"current_steps": 2030, "total_steps": 7577, "loss": 0.215, "lr": 1.8334002416056442e-06, "epoch": 0.2679160617658704, "percentage": 26.79, "elapsed_time": "0:05:32", "remaining_time": "0:15:09", "throughput": 2996.55, "total_tokens": 996864} +{"current_steps": 2035, "total_steps": 7577, "loss": 0.2084, "lr": 1.8321249242516865e-06, "epoch": 0.26857595354361885, "percentage": 26.86, "elapsed_time": "0:05:32", "remaining_time": "0:15:06", "throughput": 3001.47, "total_tokens": 999360} +{"current_steps": 2040, "total_steps": 7577, "loss": 0.0868, "lr": 1.8308451913283638e-06, "epoch": 0.2692358453213673, "percentage": 26.92, "elapsed_time": "0:05:33", "remaining_time": "0:15:04", "throughput": 3006.55, "total_tokens": 1001920} +{"current_steps": 2045, "total_steps": 7577, "loss": 0.0602, "lr": 1.8295610496264229e-06, "epoch": 0.26989573709911574, "percentage": 26.99, "elapsed_time": "0:05:33", "remaining_time": "0:15:02", "throughput": 3010.82, "total_tokens": 1004224} +{"current_steps": 2050, "total_steps": 7577, "loss": 0.0027, "lr": 1.828272505960005e-06, "epoch": 0.2705556288768642, "percentage": 27.06, "elapsed_time": "0:05:33", "remaining_time": "0:15:00", "throughput": 3015.13, "total_tokens": 1006528} +{"current_steps": 2055, "total_steps": 7577, "loss": 0.1856, "lr": 1.8269795671666098e-06, "epoch": 0.27121552065461263, "percentage": 27.12, "elapsed_time": "0:05:34", "remaining_time": "0:14:57", "throughput": 3019.63, "total_tokens": 1008896} +{"current_steps": 2060, "total_steps": 7577, "loss": 0.1347, "lr": 1.8256822401070591e-06, "epoch": 0.2718754124323611, "percentage": 27.19, "elapsed_time": "0:05:34", "remaining_time": "0:14:55", "throughput": 3025.22, "total_tokens": 1011648} +{"current_steps": 2065, "total_steps": 7577, "loss": 0.0254, "lr": 1.8243805316654611e-06, "epoch": 0.2725353042101095, "percentage": 27.25, "elapsed_time": "0:05:34", "remaining_time": "0:14:53", "throughput": 3030.27, "total_tokens": 1014208} +{"current_steps": 2070, "total_steps": 7577, "loss": 0.2187, "lr": 1.823074448749172e-06, "epoch": 0.27319519598785796, "percentage": 27.32, "elapsed_time": "0:05:34", "remaining_time": "0:14:51", "throughput": 3034.89, "total_tokens": 1016640} +{"current_steps": 2075, "total_steps": 7577, "loss": 0.0403, "lr": 1.8217639982887623e-06, "epoch": 0.27385508776560646, "percentage": 27.39, "elapsed_time": "0:05:35", "remaining_time": "0:14:49", "throughput": 3040.28, "total_tokens": 1019328} +{"current_steps": 2080, "total_steps": 7577, "loss": 0.0603, "lr": 1.8204491872379769e-06, "epoch": 0.2745149795433549, "percentage": 27.45, "elapsed_time": "0:05:35", "remaining_time": "0:14:46", "throughput": 3044.72, "total_tokens": 1021696} +{"current_steps": 2085, "total_steps": 7577, "loss": 0.0996, "lr": 1.8191300225737e-06, "epoch": 0.27517487132110335, "percentage": 27.52, "elapsed_time": "0:05:35", "remaining_time": "0:14:44", "throughput": 3049.71, "total_tokens": 1024256} +{"current_steps": 2090, "total_steps": 7577, "loss": 0.2261, "lr": 1.8178065112959184e-06, "epoch": 0.2758347630988518, "percentage": 27.58, "elapsed_time": "0:05:36", "remaining_time": "0:14:42", "throughput": 3053.94, "total_tokens": 1026560} +{"current_steps": 2095, "total_steps": 7577, "loss": 0.3078, "lr": 1.8164786604276832e-06, "epoch": 0.27649465487660024, "percentage": 27.65, "elapsed_time": "0:05:36", "remaining_time": "0:14:40", "throughput": 3059.11, "total_tokens": 1029184} +{"current_steps": 2100, "total_steps": 7577, "loss": 0.1119, "lr": 1.8151464770150727e-06, "epoch": 0.2771545466543487, "percentage": 27.72, "elapsed_time": "0:05:36", "remaining_time": "0:14:38", "throughput": 3064.09, "total_tokens": 1031744} +{"current_steps": 2105, "total_steps": 7577, "loss": 0.2357, "lr": 1.8138099681271558e-06, "epoch": 0.27781443843209713, "percentage": 27.78, "elapsed_time": "0:05:37", "remaining_time": "0:14:36", "throughput": 3068.33, "total_tokens": 1034048} +{"current_steps": 2110, "total_steps": 7577, "loss": 0.1489, "lr": 1.8124691408559536e-06, "epoch": 0.2784743302098456, "percentage": 27.85, "elapsed_time": "0:05:37", "remaining_time": "0:14:33", "throughput": 3073.13, "total_tokens": 1036544} +{"current_steps": 2115, "total_steps": 7577, "loss": 0.1057, "lr": 1.8111240023164023e-06, "epoch": 0.279134221987594, "percentage": 27.91, "elapsed_time": "0:05:37", "remaining_time": "0:14:31", "throughput": 3077.34, "total_tokens": 1038848} +{"current_steps": 2120, "total_steps": 7577, "loss": 0.0049, "lr": 1.809774559646316e-06, "epoch": 0.27979411376534247, "percentage": 27.98, "elapsed_time": "0:05:37", "remaining_time": "0:14:29", "throughput": 3081.57, "total_tokens": 1041152} +{"current_steps": 2125, "total_steps": 7577, "loss": 0.1192, "lr": 1.8084208200063469e-06, "epoch": 0.2804540055430909, "percentage": 28.05, "elapsed_time": "0:05:38", "remaining_time": "0:14:27", "throughput": 3087.23, "total_tokens": 1043968} +{"current_steps": 2130, "total_steps": 7577, "loss": 0.2678, "lr": 1.8070627905799496e-06, "epoch": 0.28111389732083936, "percentage": 28.11, "elapsed_time": "0:05:38", "remaining_time": "0:14:25", "throughput": 3091.41, "total_tokens": 1046272} +{"current_steps": 2135, "total_steps": 7577, "loss": 0.0892, "lr": 1.8057004785733413e-06, "epoch": 0.28177378909858786, "percentage": 28.18, "elapsed_time": "0:05:38", "remaining_time": "0:14:23", "throughput": 3095.21, "total_tokens": 1048448} +{"current_steps": 2140, "total_steps": 7577, "loss": 0.171, "lr": 1.8043338912154647e-06, "epoch": 0.2824336808763363, "percentage": 28.24, "elapsed_time": "0:05:39", "remaining_time": "0:14:21", "throughput": 3100.3, "total_tokens": 1051072} +{"current_steps": 2145, "total_steps": 7577, "loss": 0.0537, "lr": 1.8029630357579486e-06, "epoch": 0.28309357265408475, "percentage": 28.31, "elapsed_time": "0:05:39", "remaining_time": "0:14:19", "throughput": 3104.28, "total_tokens": 1053312} +{"current_steps": 2150, "total_steps": 7577, "loss": 0.0727, "lr": 1.8015879194750702e-06, "epoch": 0.2837534644318332, "percentage": 28.38, "elapsed_time": "0:05:39", "remaining_time": "0:14:17", "throughput": 3108.61, "total_tokens": 1055680} +{"current_steps": 2155, "total_steps": 7577, "loss": 0.1279, "lr": 1.8002085496637165e-06, "epoch": 0.28441335620958164, "percentage": 28.44, "elapsed_time": "0:05:39", "remaining_time": "0:14:15", "throughput": 3112.76, "total_tokens": 1057984} +{"current_steps": 2160, "total_steps": 7577, "loss": 0.1195, "lr": 1.7988249336433448e-06, "epoch": 0.2850732479873301, "percentage": 28.51, "elapsed_time": "0:05:40", "remaining_time": "0:14:13", "throughput": 3118.19, "total_tokens": 1060736} +{"current_steps": 2165, "total_steps": 7577, "loss": 0.1191, "lr": 1.7974370787559447e-06, "epoch": 0.2857331397650785, "percentage": 28.57, "elapsed_time": "0:05:40", "remaining_time": "0:14:11", "throughput": 3123.4, "total_tokens": 1063424} +{"current_steps": 2170, "total_steps": 7577, "loss": 0.0407, "lr": 1.796044992365999e-06, "epoch": 0.28639303154282697, "percentage": 28.64, "elapsed_time": "0:05:40", "remaining_time": "0:14:09", "throughput": 3127.53, "total_tokens": 1065728} +{"current_steps": 2175, "total_steps": 7577, "loss": 0.0343, "lr": 1.794648681860444e-06, "epoch": 0.2870529233205754, "percentage": 28.71, "elapsed_time": "0:05:41", "remaining_time": "0:14:07", "throughput": 3132.02, "total_tokens": 1068160} +{"current_steps": 2180, "total_steps": 7577, "loss": 0.2582, "lr": 1.7932481546486312e-06, "epoch": 0.28771281509832386, "percentage": 28.77, "elapsed_time": "0:05:41", "remaining_time": "0:14:05", "throughput": 3136.51, "total_tokens": 1070592} +{"current_steps": 2185, "total_steps": 7577, "loss": 0.161, "lr": 1.791843418162287e-06, "epoch": 0.2883727068760723, "percentage": 28.84, "elapsed_time": "0:05:41", "remaining_time": "0:14:03", "throughput": 3141.69, "total_tokens": 1073280} +{"current_steps": 2190, "total_steps": 7577, "loss": 0.0127, "lr": 1.7904344798554748e-06, "epoch": 0.28903259865382075, "percentage": 28.9, "elapsed_time": "0:05:41", "remaining_time": "0:14:01", "throughput": 3145.79, "total_tokens": 1075584} +{"current_steps": 2195, "total_steps": 7577, "loss": 0.0962, "lr": 1.789021347204553e-06, "epoch": 0.28969249043156925, "percentage": 28.97, "elapsed_time": "0:05:42", "remaining_time": "0:13:59", "throughput": 3150.28, "total_tokens": 1078016} +{"current_steps": 2200, "total_steps": 7577, "loss": 0.1665, "lr": 1.7876040277081381e-06, "epoch": 0.2903523822093177, "percentage": 29.04, "elapsed_time": "0:05:42", "remaining_time": "0:13:57", "throughput": 3154.99, "total_tokens": 1080512} +{"current_steps": 2205, "total_steps": 7577, "loss": 0.1979, "lr": 1.7861825288870632e-06, "epoch": 0.29101227398706614, "percentage": 29.1, "elapsed_time": "0:05:42", "remaining_time": "0:13:55", "throughput": 3158.94, "total_tokens": 1082752} +{"current_steps": 2210, "total_steps": 7577, "loss": 0.3436, "lr": 1.7847568582843376e-06, "epoch": 0.2916721657648146, "percentage": 29.17, "elapsed_time": "0:05:43", "remaining_time": "0:13:53", "throughput": 3163.4, "total_tokens": 1085184} +{"current_steps": 2215, "total_steps": 7577, "loss": 0.1458, "lr": 1.7833270234651088e-06, "epoch": 0.29233205754256303, "percentage": 29.23, "elapsed_time": "0:05:43", "remaining_time": "0:13:51", "throughput": 3167.12, "total_tokens": 1087360} +{"current_steps": 2220, "total_steps": 7577, "loss": 0.0619, "lr": 1.781893032016621e-06, "epoch": 0.2929919493203115, "percentage": 29.3, "elapsed_time": "0:05:43", "remaining_time": "0:13:49", "throughput": 3172.04, "total_tokens": 1089984} +{"current_steps": 2225, "total_steps": 7577, "loss": 0.0185, "lr": 1.7804548915481746e-06, "epoch": 0.2936518410980599, "percentage": 29.37, "elapsed_time": "0:05:43", "remaining_time": "0:13:47", "throughput": 3177.0, "total_tokens": 1092608} +{"current_steps": 2230, "total_steps": 7577, "loss": 0.1235, "lr": 1.7790126096910865e-06, "epoch": 0.29431173287580836, "percentage": 29.43, "elapsed_time": "0:05:44", "remaining_time": "0:13:45", "throughput": 3181.41, "total_tokens": 1095040} +{"current_steps": 2235, "total_steps": 7577, "loss": 0.064, "lr": 1.7775661940986492e-06, "epoch": 0.2949716246535568, "percentage": 29.5, "elapsed_time": "0:05:44", "remaining_time": "0:13:43", "throughput": 3186.53, "total_tokens": 1097728} +{"current_steps": 2240, "total_steps": 7577, "loss": 0.2202, "lr": 1.776115652446091e-06, "epoch": 0.29563151643130525, "percentage": 29.56, "elapsed_time": "0:05:44", "remaining_time": "0:13:41", "throughput": 3190.74, "total_tokens": 1100096} +{"current_steps": 2245, "total_steps": 7577, "loss": 0.1252, "lr": 1.7746609924305336e-06, "epoch": 0.2962914082090537, "percentage": 29.63, "elapsed_time": "0:05:45", "remaining_time": "0:13:39", "throughput": 3194.75, "total_tokens": 1102400} +{"current_steps": 2250, "total_steps": 7577, "loss": 0.1016, "lr": 1.7732022217709534e-06, "epoch": 0.29695129998680214, "percentage": 29.7, "elapsed_time": "0:05:45", "remaining_time": "0:13:37", "throughput": 3199.45, "total_tokens": 1104960} +{"current_steps": 2255, "total_steps": 7577, "loss": 0.0905, "lr": 1.7717393482081384e-06, "epoch": 0.2976111917645506, "percentage": 29.76, "elapsed_time": "0:05:45", "remaining_time": "0:13:35", "throughput": 3204.15, "total_tokens": 1107520} +{"current_steps": 2260, "total_steps": 7577, "loss": 0.1454, "lr": 1.7702723795046492e-06, "epoch": 0.2982710835422991, "percentage": 29.83, "elapsed_time": "0:05:45", "remaining_time": "0:13:33", "throughput": 3208.49, "total_tokens": 1109952} +{"current_steps": 2265, "total_steps": 7577, "loss": 0.0226, "lr": 1.7688013234447757e-06, "epoch": 0.29893097532004753, "percentage": 29.89, "elapsed_time": "0:05:46", "remaining_time": "0:13:32", "throughput": 3212.03, "total_tokens": 1112128} +{"current_steps": 2270, "total_steps": 7577, "loss": 0.1225, "lr": 1.7673261878344973e-06, "epoch": 0.299590867097796, "percentage": 29.96, "elapsed_time": "0:05:46", "remaining_time": "0:13:30", "throughput": 3216.71, "total_tokens": 1114688} +{"current_steps": 2274, "total_steps": 7577, "eval_loss": 0.10979828983545303, "epoch": 0.30011878051999474, "percentage": 30.01, "elapsed_time": "0:05:54", "remaining_time": "0:13:46", "throughput": 3152.34, "total_tokens": 1116800} +{"current_steps": 2275, "total_steps": 7577, "loss": 0.1963, "lr": 1.7658469805014414e-06, "epoch": 0.3002507588755444, "percentage": 30.03, "elapsed_time": "0:06:14", "remaining_time": "0:14:32", "throughput": 2983.5, "total_tokens": 1117248} +{"current_steps": 2280, "total_steps": 7577, "loss": 0.1312, "lr": 1.7643637092948415e-06, "epoch": 0.30091065065329287, "percentage": 30.09, "elapsed_time": "0:06:14", "remaining_time": "0:14:30", "throughput": 2987.92, "total_tokens": 1119808} +{"current_steps": 2285, "total_steps": 7577, "loss": 0.2181, "lr": 1.7628763820854948e-06, "epoch": 0.3015705424310413, "percentage": 30.16, "elapsed_time": "0:06:15", "remaining_time": "0:14:28", "throughput": 2991.7, "total_tokens": 1122112} +{"current_steps": 2290, "total_steps": 7577, "loss": 0.0905, "lr": 1.7613850067657216e-06, "epoch": 0.30223043420878976, "percentage": 30.22, "elapsed_time": "0:06:15", "remaining_time": "0:14:26", "throughput": 2995.86, "total_tokens": 1124544} +{"current_steps": 2295, "total_steps": 7577, "loss": 0.0688, "lr": 1.7598895912493232e-06, "epoch": 0.3028903259865382, "percentage": 30.29, "elapsed_time": "0:06:15", "remaining_time": "0:14:24", "throughput": 3000.36, "total_tokens": 1127104} +{"current_steps": 2300, "total_steps": 7577, "loss": 0.0735, "lr": 1.7583901434715397e-06, "epoch": 0.30355021776428665, "percentage": 30.36, "elapsed_time": "0:06:15", "remaining_time": "0:14:22", "throughput": 3004.51, "total_tokens": 1129536} +{"current_steps": 2305, "total_steps": 7577, "loss": 0.0694, "lr": 1.7568866713890074e-06, "epoch": 0.3042101095420351, "percentage": 30.42, "elapsed_time": "0:06:16", "remaining_time": "0:14:20", "throughput": 3008.33, "total_tokens": 1131840} +{"current_steps": 2310, "total_steps": 7577, "loss": 0.1669, "lr": 1.7553791829797175e-06, "epoch": 0.30487000131978353, "percentage": 30.49, "elapsed_time": "0:06:16", "remaining_time": "0:14:18", "throughput": 3012.65, "total_tokens": 1134336} +{"current_steps": 2315, "total_steps": 7577, "loss": 0.2863, "lr": 1.7538676862429737e-06, "epoch": 0.305529893097532, "percentage": 30.55, "elapsed_time": "0:06:16", "remaining_time": "0:14:16", "throughput": 3016.47, "total_tokens": 1136640} +{"current_steps": 2320, "total_steps": 7577, "loss": 0.1177, "lr": 1.7523521891993486e-06, "epoch": 0.3061897848752805, "percentage": 30.62, "elapsed_time": "0:06:17", "remaining_time": "0:14:14", "throughput": 3020.78, "total_tokens": 1139136} +{"current_steps": 2325, "total_steps": 7577, "loss": 0.0919, "lr": 1.7508326998906422e-06, "epoch": 0.3068496766530289, "percentage": 30.68, "elapsed_time": "0:06:17", "remaining_time": "0:14:12", "throughput": 3024.92, "total_tokens": 1141568} +{"current_steps": 2330, "total_steps": 7577, "loss": 0.004, "lr": 1.7493092263798394e-06, "epoch": 0.30750956843077737, "percentage": 30.75, "elapsed_time": "0:06:17", "remaining_time": "0:14:10", "throughput": 3028.87, "total_tokens": 1143936} +{"current_steps": 2335, "total_steps": 7577, "loss": 0.037, "lr": 1.7477817767510664e-06, "epoch": 0.3081694602085258, "percentage": 30.82, "elapsed_time": "0:06:17", "remaining_time": "0:14:08", "throughput": 3033.62, "total_tokens": 1146624} +{"current_steps": 2340, "total_steps": 7577, "loss": 0.0055, "lr": 1.7462503591095484e-06, "epoch": 0.30882935198627426, "percentage": 30.88, "elapsed_time": "0:06:18", "remaining_time": "0:14:06", "throughput": 3037.92, "total_tokens": 1149120} +{"current_steps": 2345, "total_steps": 7577, "loss": 0.0421, "lr": 1.7447149815815659e-06, "epoch": 0.3094892437640227, "percentage": 30.95, "elapsed_time": "0:06:18", "remaining_time": "0:14:04", "throughput": 3041.84, "total_tokens": 1151488} +{"current_steps": 2350, "total_steps": 7577, "loss": 0.1083, "lr": 1.7431756523144126e-06, "epoch": 0.31014913554177115, "percentage": 31.01, "elapsed_time": "0:06:18", "remaining_time": "0:14:02", "throughput": 3045.1, "total_tokens": 1153600} +{"current_steps": 2355, "total_steps": 7577, "loss": 0.0665, "lr": 1.7416323794763512e-06, "epoch": 0.3108090273195196, "percentage": 31.08, "elapsed_time": "0:06:19", "remaining_time": "0:14:00", "throughput": 3049.66, "total_tokens": 1156224} +{"current_steps": 2360, "total_steps": 7577, "loss": 0.2148, "lr": 1.7400851712565707e-06, "epoch": 0.31146891909726804, "percentage": 31.15, "elapsed_time": "0:06:19", "remaining_time": "0:13:58", "throughput": 3053.74, "total_tokens": 1158656} +{"current_steps": 2365, "total_steps": 7577, "loss": 0.2065, "lr": 1.7385340358651432e-06, "epoch": 0.3121288108750165, "percentage": 31.21, "elapsed_time": "0:06:19", "remaining_time": "0:13:56", "throughput": 3058.63, "total_tokens": 1161408} +{"current_steps": 2370, "total_steps": 7577, "loss": 0.0283, "lr": 1.736978981532979e-06, "epoch": 0.3127887026527649, "percentage": 31.28, "elapsed_time": "0:06:20", "remaining_time": "0:13:54", "throughput": 3062.88, "total_tokens": 1163904} +{"current_steps": 2375, "total_steps": 7577, "loss": 0.2238, "lr": 1.7354200165117838e-06, "epoch": 0.31344859443051337, "percentage": 31.34, "elapsed_time": "0:06:20", "remaining_time": "0:13:52", "throughput": 3066.62, "total_tokens": 1166208} +{"current_steps": 2380, "total_steps": 7577, "loss": 0.2442, "lr": 1.733857149074016e-06, "epoch": 0.3141084862082619, "percentage": 31.41, "elapsed_time": "0:06:20", "remaining_time": "0:13:51", "throughput": 3070.35, "total_tokens": 1168512} +{"current_steps": 2385, "total_steps": 7577, "loss": 0.1859, "lr": 1.7322903875128402e-06, "epoch": 0.3147683779860103, "percentage": 31.48, "elapsed_time": "0:06:20", "remaining_time": "0:13:49", "throughput": 3074.74, "total_tokens": 1171072} +{"current_steps": 2390, "total_steps": 7577, "loss": 0.0042, "lr": 1.7307197401420858e-06, "epoch": 0.31542826976375876, "percentage": 31.54, "elapsed_time": "0:06:21", "remaining_time": "0:13:47", "throughput": 3078.27, "total_tokens": 1173312} +{"current_steps": 2395, "total_steps": 7577, "loss": 0.0649, "lr": 1.7291452152962018e-06, "epoch": 0.3160881615415072, "percentage": 31.61, "elapsed_time": "0:06:21", "remaining_time": "0:13:45", "throughput": 3082.32, "total_tokens": 1175744} +{"current_steps": 2400, "total_steps": 7577, "loss": 0.1831, "lr": 1.7275668213302116e-06, "epoch": 0.31674805331925565, "percentage": 31.67, "elapsed_time": "0:06:21", "remaining_time": "0:13:43", "throughput": 3086.21, "total_tokens": 1178112} +{"current_steps": 2405, "total_steps": 7577, "loss": 0.0443, "lr": 1.72598456661967e-06, "epoch": 0.3174079450970041, "percentage": 31.74, "elapsed_time": "0:06:22", "remaining_time": "0:13:41", "throughput": 3089.75, "total_tokens": 1180352} +{"current_steps": 2410, "total_steps": 7577, "loss": 0.1393, "lr": 1.7243984595606191e-06, "epoch": 0.31806783687475254, "percentage": 31.81, "elapsed_time": "0:06:22", "remaining_time": "0:13:39", "throughput": 3093.13, "total_tokens": 1182528} +{"current_steps": 2415, "total_steps": 7577, "loss": 0.0891, "lr": 1.722808508569542e-06, "epoch": 0.318727728652501, "percentage": 31.87, "elapsed_time": "0:06:22", "remaining_time": "0:13:37", "throughput": 3098.0, "total_tokens": 1185280} +{"current_steps": 2420, "total_steps": 7577, "loss": 0.0768, "lr": 1.72121472208332e-06, "epoch": 0.31938762043024943, "percentage": 31.94, "elapsed_time": "0:06:22", "remaining_time": "0:13:35", "throughput": 3102.83, "total_tokens": 1188032} +{"current_steps": 2425, "total_steps": 7577, "loss": 0.2321, "lr": 1.7196171085591864e-06, "epoch": 0.3200475122079979, "percentage": 32.0, "elapsed_time": "0:06:23", "remaining_time": "0:13:34", "throughput": 3106.86, "total_tokens": 1190464} +{"current_steps": 2430, "total_steps": 7577, "loss": 0.2085, "lr": 1.7180156764746824e-06, "epoch": 0.3207074039857463, "percentage": 32.07, "elapsed_time": "0:06:23", "remaining_time": "0:13:32", "throughput": 3111.06, "total_tokens": 1192960} +{"current_steps": 2435, "total_steps": 7577, "loss": 0.0272, "lr": 1.7164104343276113e-06, "epoch": 0.32136729576349476, "percentage": 32.14, "elapsed_time": "0:06:23", "remaining_time": "0:13:30", "throughput": 3114.27, "total_tokens": 1195072} +{"current_steps": 2440, "total_steps": 7577, "loss": 0.0063, "lr": 1.714801390635996e-06, "epoch": 0.3220271875412432, "percentage": 32.2, "elapsed_time": "0:06:24", "remaining_time": "0:13:28", "throughput": 3117.96, "total_tokens": 1197376} +{"current_steps": 2445, "total_steps": 7577, "loss": 0.038, "lr": 1.7131885539380297e-06, "epoch": 0.3226870793189917, "percentage": 32.27, "elapsed_time": "0:06:24", "remaining_time": "0:13:26", "throughput": 3122.27, "total_tokens": 1199936} +{"current_steps": 2450, "total_steps": 7577, "loss": 0.1487, "lr": 1.7115719327920335e-06, "epoch": 0.32334697109674015, "percentage": 32.33, "elapsed_time": "0:06:24", "remaining_time": "0:13:24", "throughput": 3126.26, "total_tokens": 1202368} +{"current_steps": 2455, "total_steps": 7577, "loss": 0.0011, "lr": 1.70995153577641e-06, "epoch": 0.3240068628744886, "percentage": 32.4, "elapsed_time": "0:06:24", "remaining_time": "0:13:23", "throughput": 3130.26, "total_tokens": 1204800} +{"current_steps": 2460, "total_steps": 7577, "loss": 0.0641, "lr": 1.7083273714895991e-06, "epoch": 0.32466675465223704, "percentage": 32.47, "elapsed_time": "0:06:25", "remaining_time": "0:13:21", "throughput": 3135.03, "total_tokens": 1207552} +{"current_steps": 2465, "total_steps": 7577, "loss": 0.2123, "lr": 1.7066994485500298e-06, "epoch": 0.3253266464299855, "percentage": 32.53, "elapsed_time": "0:06:25", "remaining_time": "0:13:19", "throughput": 3138.68, "total_tokens": 1209856} +{"current_steps": 2470, "total_steps": 7577, "loss": 0.0982, "lr": 1.7050677755960762e-06, "epoch": 0.32598653820773393, "percentage": 32.6, "elapsed_time": "0:06:25", "remaining_time": "0:13:17", "throughput": 3142.79, "total_tokens": 1212352} +{"current_steps": 2475, "total_steps": 7577, "loss": 0.1048, "lr": 1.7034323612860124e-06, "epoch": 0.3266464299854824, "percentage": 32.66, "elapsed_time": "0:06:26", "remaining_time": "0:13:15", "throughput": 3147.07, "total_tokens": 1214912} +{"current_steps": 2480, "total_steps": 7577, "loss": 0.0354, "lr": 1.7017932142979645e-06, "epoch": 0.3273063217632308, "percentage": 32.73, "elapsed_time": "0:06:26", "remaining_time": "0:13:14", "throughput": 3150.36, "total_tokens": 1217088} +{"current_steps": 2485, "total_steps": 7577, "loss": 0.2006, "lr": 1.700150343329866e-06, "epoch": 0.32796621354097927, "percentage": 32.8, "elapsed_time": "0:06:26", "remaining_time": "0:13:12", "throughput": 3154.49, "total_tokens": 1219584} +{"current_steps": 2490, "total_steps": 7577, "loss": 0.1335, "lr": 1.6985037570994113e-06, "epoch": 0.3286261053187277, "percentage": 32.86, "elapsed_time": "0:06:26", "remaining_time": "0:13:10", "throughput": 3159.25, "total_tokens": 1222336} +{"current_steps": 2495, "total_steps": 7577, "loss": 0.0688, "lr": 1.6968534643440088e-06, "epoch": 0.32928599709647616, "percentage": 32.93, "elapsed_time": "0:06:27", "remaining_time": "0:13:08", "throughput": 3163.33, "total_tokens": 1224832} +{"current_steps": 2500, "total_steps": 7577, "loss": 0.1821, "lr": 1.6951994738207364e-06, "epoch": 0.3299458888742246, "percentage": 32.99, "elapsed_time": "0:06:27", "remaining_time": "0:13:06", "throughput": 3167.58, "total_tokens": 1227392} +{"current_steps": 2505, "total_steps": 7577, "loss": 0.2034, "lr": 1.6935417943062928e-06, "epoch": 0.3306057806519731, "percentage": 33.06, "elapsed_time": "0:06:27", "remaining_time": "0:13:05", "throughput": 3171.81, "total_tokens": 1229952} +{"current_steps": 2510, "total_steps": 7577, "loss": 0.0106, "lr": 1.6918804345969516e-06, "epoch": 0.33126567242972155, "percentage": 33.13, "elapsed_time": "0:06:28", "remaining_time": "0:13:03", "throughput": 3176.36, "total_tokens": 1232640} +{"current_steps": 2515, "total_steps": 7577, "loss": 0.0161, "lr": 1.6902154035085156e-06, "epoch": 0.33192556420747, "percentage": 33.19, "elapsed_time": "0:06:28", "remaining_time": "0:13:01", "throughput": 3180.59, "total_tokens": 1235200} +{"current_steps": 2520, "total_steps": 7577, "loss": 0.0893, "lr": 1.688546709876269e-06, "epoch": 0.33258545598521844, "percentage": 33.26, "elapsed_time": "0:06:28", "remaining_time": "0:12:59", "throughput": 3184.5, "total_tokens": 1237632} +{"current_steps": 2525, "total_steps": 7577, "loss": 0.0905, "lr": 1.6868743625549314e-06, "epoch": 0.3332453477629669, "percentage": 33.32, "elapsed_time": "0:06:28", "remaining_time": "0:12:58", "throughput": 3188.08, "total_tokens": 1239936} +{"current_steps": 2530, "total_steps": 7577, "loss": 0.0392, "lr": 1.6851983704186092e-06, "epoch": 0.3339052395407153, "percentage": 33.39, "elapsed_time": "0:06:29", "remaining_time": "0:12:56", "throughput": 3191.83, "total_tokens": 1242304} +{"current_steps": 2535, "total_steps": 7577, "loss": 0.0036, "lr": 1.6835187423607503e-06, "epoch": 0.33456513131846377, "percentage": 33.46, "elapsed_time": "0:06:29", "remaining_time": "0:12:54", "throughput": 3195.73, "total_tokens": 1244736} +{"current_steps": 2540, "total_steps": 7577, "loss": 0.2003, "lr": 1.681835487294096e-06, "epoch": 0.3352250230962122, "percentage": 33.52, "elapsed_time": "0:06:29", "remaining_time": "0:12:52", "throughput": 3200.41, "total_tokens": 1247488} +{"current_steps": 2545, "total_steps": 7577, "loss": 0.2557, "lr": 1.6801486141506342e-06, "epoch": 0.33588491487396066, "percentage": 33.59, "elapsed_time": "0:06:30", "remaining_time": "0:12:51", "throughput": 3204.61, "total_tokens": 1250048} +{"current_steps": 2550, "total_steps": 7577, "loss": 0.3749, "lr": 1.6784581318815514e-06, "epoch": 0.3365448066517091, "percentage": 33.65, "elapsed_time": "0:06:30", "remaining_time": "0:12:49", "throughput": 3209.59, "total_tokens": 1252928} +{"current_steps": 2555, "total_steps": 7577, "loss": 0.146, "lr": 1.6767640494571849e-06, "epoch": 0.33720469842945755, "percentage": 33.72, "elapsed_time": "0:06:30", "remaining_time": "0:12:47", "throughput": 3213.8, "total_tokens": 1255488} +{"current_steps": 2560, "total_steps": 7577, "loss": 0.3346, "lr": 1.6750663758669767e-06, "epoch": 0.337864590207206, "percentage": 33.79, "elapsed_time": "0:06:30", "remaining_time": "0:12:46", "throughput": 3217.8, "total_tokens": 1257984} +{"current_steps": 2565, "total_steps": 7577, "loss": 0.1044, "lr": 1.6733651201194245e-06, "epoch": 0.3385244819849545, "percentage": 33.85, "elapsed_time": "0:06:31", "remaining_time": "0:12:44", "throughput": 3221.68, "total_tokens": 1260416} +{"current_steps": 2570, "total_steps": 7577, "loss": 0.0797, "lr": 1.6716602912420342e-06, "epoch": 0.33918437376270294, "percentage": 33.92, "elapsed_time": "0:06:31", "remaining_time": "0:12:42", "throughput": 3226.33, "total_tokens": 1263168} +{"current_steps": 2575, "total_steps": 7577, "loss": 0.1608, "lr": 1.6699518982812726e-06, "epoch": 0.3398442655404514, "percentage": 33.98, "elapsed_time": "0:06:31", "remaining_time": "0:12:41", "throughput": 3230.19, "total_tokens": 1265600} +{"current_steps": 2580, "total_steps": 7577, "loss": 0.0033, "lr": 1.6682399503025183e-06, "epoch": 0.34050415731819983, "percentage": 34.05, "elapsed_time": "0:06:32", "remaining_time": "0:12:39", "throughput": 3234.06, "total_tokens": 1268032} +{"current_steps": 2585, "total_steps": 7577, "loss": 0.1571, "lr": 1.666524456390014e-06, "epoch": 0.3411640490959483, "percentage": 34.12, "elapsed_time": "0:06:32", "remaining_time": "0:12:37", "throughput": 3237.59, "total_tokens": 1270336} +{"current_steps": 2590, "total_steps": 7577, "loss": 0.0566, "lr": 1.664805425646819e-06, "epoch": 0.3418239408736967, "percentage": 34.18, "elapsed_time": "0:06:32", "remaining_time": "0:12:36", "throughput": 3242.22, "total_tokens": 1273088} +{"current_steps": 2595, "total_steps": 7577, "loss": 0.2203, "lr": 1.6630828671947606e-06, "epoch": 0.34248383265144516, "percentage": 34.25, "elapsed_time": "0:06:32", "remaining_time": "0:12:34", "throughput": 3245.9, "total_tokens": 1275456} +{"current_steps": 2600, "total_steps": 7577, "loss": 0.0365, "lr": 1.6613567901743842e-06, "epoch": 0.3431437244291936, "percentage": 34.31, "elapsed_time": "0:06:33", "remaining_time": "0:12:32", "throughput": 3249.75, "total_tokens": 1277888} +{"current_steps": 2605, "total_steps": 7577, "loss": 0.0013, "lr": 1.6596272037449075e-06, "epoch": 0.34380361620694205, "percentage": 34.38, "elapsed_time": "0:06:33", "remaining_time": "0:12:31", "throughput": 3253.73, "total_tokens": 1280384} +{"current_steps": 2610, "total_steps": 7577, "loss": 0.064, "lr": 1.6578941170841696e-06, "epoch": 0.3444635079846905, "percentage": 34.45, "elapsed_time": "0:06:33", "remaining_time": "0:12:29", "throughput": 3257.85, "total_tokens": 1282944} +{"current_steps": 2615, "total_steps": 7577, "loss": 0.0664, "lr": 1.6561575393885833e-06, "epoch": 0.34512339976243894, "percentage": 34.51, "elapsed_time": "0:06:34", "remaining_time": "0:12:27", "throughput": 3261.19, "total_tokens": 1285184} +{"current_steps": 2620, "total_steps": 7577, "loss": 0.1976, "lr": 1.6544174798730864e-06, "epoch": 0.3457832915401874, "percentage": 34.58, "elapsed_time": "0:06:34", "remaining_time": "0:12:26", "throughput": 3265.46, "total_tokens": 1287808} +{"current_steps": 2625, "total_steps": 7577, "loss": 0.1552, "lr": 1.6526739477710923e-06, "epoch": 0.34644318331793583, "percentage": 34.64, "elapsed_time": "0:06:34", "remaining_time": "0:12:24", "throughput": 3269.75, "total_tokens": 1290432} +{"current_steps": 2630, "total_steps": 7577, "loss": 0.2257, "lr": 1.650926952334441e-06, "epoch": 0.34710307509568433, "percentage": 34.71, "elapsed_time": "0:06:34", "remaining_time": "0:12:22", "throughput": 3273.24, "total_tokens": 1292736} +{"current_steps": 2635, "total_steps": 7577, "loss": 0.2674, "lr": 1.6491765028333516e-06, "epoch": 0.3477629668734328, "percentage": 34.78, "elapsed_time": "0:06:35", "remaining_time": "0:12:21", "throughput": 3276.87, "total_tokens": 1295104} +{"current_steps": 2640, "total_steps": 7577, "loss": 0.0204, "lr": 1.6474226085563693e-06, "epoch": 0.3484228586511812, "percentage": 34.84, "elapsed_time": "0:06:35", "remaining_time": "0:12:19", "throughput": 3280.82, "total_tokens": 1297600} +{"current_steps": 2645, "total_steps": 7577, "loss": 0.0496, "lr": 1.6456652788103215e-06, "epoch": 0.34908275042892967, "percentage": 34.91, "elapsed_time": "0:06:35", "remaining_time": "0:12:18", "throughput": 3285.07, "total_tokens": 1300224} +{"current_steps": 2650, "total_steps": 7577, "loss": 0.1152, "lr": 1.6439045229202631e-06, "epoch": 0.3497426422066781, "percentage": 34.97, "elapsed_time": "0:06:36", "remaining_time": "0:12:16", "throughput": 3288.51, "total_tokens": 1302528} +{"current_steps": 2653, "total_steps": 7577, "eval_loss": 0.12348020076751709, "epoch": 0.3501385772733272, "percentage": 35.01, "elapsed_time": "0:06:43", "remaining_time": "0:12:29", "throughput": 3228.54, "total_tokens": 1303872} +{"current_steps": 2655, "total_steps": 7577, "loss": 0.159, "lr": 1.6421403502294307e-06, "epoch": 0.35040253398442656, "percentage": 35.04, "elapsed_time": "0:07:30", "remaining_time": "0:13:54", "throughput": 2899.93, "total_tokens": 1305024} +{"current_steps": 2660, "total_steps": 7577, "loss": 0.1813, "lr": 1.6403727700991915e-06, "epoch": 0.351062425762175, "percentage": 35.11, "elapsed_time": "0:07:30", "remaining_time": "0:13:52", "throughput": 2903.35, "total_tokens": 1307392} +{"current_steps": 2665, "total_steps": 7577, "loss": 0.1581, "lr": 1.6386017919089933e-06, "epoch": 0.35172231753992345, "percentage": 35.17, "elapsed_time": "0:07:30", "remaining_time": "0:13:50", "throughput": 2907.32, "total_tokens": 1310016} +{"current_steps": 2670, "total_steps": 7577, "loss": 0.0066, "lr": 1.636827425056316e-06, "epoch": 0.3523822093176719, "percentage": 35.24, "elapsed_time": "0:07:30", "remaining_time": "0:13:48", "throughput": 2911.14, "total_tokens": 1312576} +{"current_steps": 2675, "total_steps": 7577, "loss": 0.1432, "lr": 1.635049678956621e-06, "epoch": 0.35304210109542034, "percentage": 35.3, "elapsed_time": "0:07:31", "remaining_time": "0:13:46", "throughput": 2914.81, "total_tokens": 1315072} +{"current_steps": 2680, "total_steps": 7577, "loss": 0.1222, "lr": 1.633268563043301e-06, "epoch": 0.3537019928731688, "percentage": 35.37, "elapsed_time": "0:07:31", "remaining_time": "0:13:44", "throughput": 2918.35, "total_tokens": 1317504} +{"current_steps": 2685, "total_steps": 7577, "loss": 0.0023, "lr": 1.63148408676763e-06, "epoch": 0.3543618846509172, "percentage": 35.44, "elapsed_time": "0:07:31", "remaining_time": "0:13:43", "throughput": 2921.34, "total_tokens": 1319680} +{"current_steps": 2690, "total_steps": 7577, "loss": 0.0014, "lr": 1.6296962595987141e-06, "epoch": 0.3550217764286657, "percentage": 35.5, "elapsed_time": "0:07:32", "remaining_time": "0:13:41", "throughput": 2925.15, "total_tokens": 1322240} +{"current_steps": 2695, "total_steps": 7577, "loss": 0.1142, "lr": 1.6279050910234392e-06, "epoch": 0.35568166820641417, "percentage": 35.57, "elapsed_time": "0:07:32", "remaining_time": "0:13:39", "throughput": 2928.81, "total_tokens": 1324736} +{"current_steps": 2700, "total_steps": 7577, "loss": 0.0407, "lr": 1.626110590546423e-06, "epoch": 0.3563415599841626, "percentage": 35.63, "elapsed_time": "0:07:32", "remaining_time": "0:13:37", "throughput": 2932.22, "total_tokens": 1327104} +{"current_steps": 2705, "total_steps": 7577, "loss": 0.248, "lr": 1.6243127676899635e-06, "epoch": 0.35700145176191106, "percentage": 35.7, "elapsed_time": "0:07:32", "remaining_time": "0:13:35", "throughput": 2936.54, "total_tokens": 1329920} +{"current_steps": 2710, "total_steps": 7577, "loss": 0.2153, "lr": 1.6225116319939884e-06, "epoch": 0.3576613435396595, "percentage": 35.77, "elapsed_time": "0:07:33", "remaining_time": "0:13:33", "throughput": 2940.06, "total_tokens": 1332352} +{"current_steps": 2715, "total_steps": 7577, "loss": 0.1084, "lr": 1.6207071930160044e-06, "epoch": 0.35832123531740795, "percentage": 35.83, "elapsed_time": "0:07:33", "remaining_time": "0:13:32", "throughput": 2944.12, "total_tokens": 1335040} +{"current_steps": 2720, "total_steps": 7577, "loss": 0.0054, "lr": 1.6188994603310468e-06, "epoch": 0.3589811270951564, "percentage": 35.9, "elapsed_time": "0:07:33", "remaining_time": "0:13:30", "throughput": 2947.64, "total_tokens": 1337472} +{"current_steps": 2725, "total_steps": 7577, "loss": 0.1694, "lr": 1.617088443531628e-06, "epoch": 0.35964101887290484, "percentage": 35.96, "elapsed_time": "0:07:34", "remaining_time": "0:13:28", "throughput": 2950.73, "total_tokens": 1339712} +{"current_steps": 2730, "total_steps": 7577, "loss": 0.0016, "lr": 1.6152741522276882e-06, "epoch": 0.3603009106506533, "percentage": 36.03, "elapsed_time": "0:07:34", "remaining_time": "0:13:26", "throughput": 2954.24, "total_tokens": 1342144} +{"current_steps": 2735, "total_steps": 7577, "loss": 0.108, "lr": 1.6134565960465425e-06, "epoch": 0.36096080242840173, "percentage": 36.1, "elapsed_time": "0:07:34", "remaining_time": "0:13:24", "throughput": 2957.61, "total_tokens": 1344512} +{"current_steps": 2740, "total_steps": 7577, "loss": 0.242, "lr": 1.6116357846328312e-06, "epoch": 0.3616206942061502, "percentage": 36.16, "elapsed_time": "0:07:34", "remaining_time": "0:13:23", "throughput": 2960.97, "total_tokens": 1346880} +{"current_steps": 2745, "total_steps": 7577, "loss": 0.1324, "lr": 1.609811727648468e-06, "epoch": 0.3622805859838986, "percentage": 36.23, "elapsed_time": "0:07:35", "remaining_time": "0:13:21", "throughput": 2963.9, "total_tokens": 1349056} +{"current_steps": 2750, "total_steps": 7577, "loss": 0.0724, "lr": 1.6079844347725882e-06, "epoch": 0.36294047776164706, "percentage": 36.29, "elapsed_time": "0:07:35", "remaining_time": "0:13:19", "throughput": 2967.37, "total_tokens": 1351488} +{"current_steps": 2755, "total_steps": 7577, "loss": 0.0532, "lr": 1.6061539157014987e-06, "epoch": 0.36360036953939556, "percentage": 36.36, "elapsed_time": "0:07:35", "remaining_time": "0:13:17", "throughput": 2970.84, "total_tokens": 1353920} +{"current_steps": 2760, "total_steps": 7577, "loss": 0.2916, "lr": 1.6043201801486257e-06, "epoch": 0.364260261317144, "percentage": 36.43, "elapsed_time": "0:07:36", "remaining_time": "0:13:15", "throughput": 2974.31, "total_tokens": 1356352} +{"current_steps": 2765, "total_steps": 7577, "loss": 0.2542, "lr": 1.6024832378444628e-06, "epoch": 0.36492015309489245, "percentage": 36.49, "elapsed_time": "0:07:36", "remaining_time": "0:13:14", "throughput": 2978.47, "total_tokens": 1359104} +{"current_steps": 2770, "total_steps": 7577, "loss": 0.2718, "lr": 1.6006430985365204e-06, "epoch": 0.3655800448726409, "percentage": 36.56, "elapsed_time": "0:07:36", "remaining_time": "0:13:12", "throughput": 2981.93, "total_tokens": 1361536} +{"current_steps": 2775, "total_steps": 7577, "loss": 0.2648, "lr": 1.5987997719892735e-06, "epoch": 0.36623993665038934, "percentage": 36.62, "elapsed_time": "0:07:36", "remaining_time": "0:13:10", "throughput": 2985.8, "total_tokens": 1364160} +{"current_steps": 2780, "total_steps": 7577, "loss": 0.0465, "lr": 1.5969532679841088e-06, "epoch": 0.3668998284281378, "percentage": 36.69, "elapsed_time": "0:07:37", "remaining_time": "0:13:08", "throughput": 2989.4, "total_tokens": 1366656} +{"current_steps": 2785, "total_steps": 7577, "loss": 0.0486, "lr": 1.5951035963192752e-06, "epoch": 0.36755972020588623, "percentage": 36.76, "elapsed_time": "0:07:37", "remaining_time": "0:13:07", "throughput": 2993.14, "total_tokens": 1369216} +{"current_steps": 2790, "total_steps": 7577, "loss": 0.2435, "lr": 1.593250766809829e-06, "epoch": 0.3682196119836347, "percentage": 36.82, "elapsed_time": "0:07:37", "remaining_time": "0:13:05", "throughput": 2996.71, "total_tokens": 1371712} +{"current_steps": 2795, "total_steps": 7577, "loss": 0.1572, "lr": 1.5913947892875842e-06, "epoch": 0.3688795037613831, "percentage": 36.89, "elapsed_time": "0:07:38", "remaining_time": "0:13:03", "throughput": 3000.02, "total_tokens": 1374080} +{"current_steps": 2800, "total_steps": 7577, "loss": 0.1055, "lr": 1.589535673601059e-06, "epoch": 0.36953939553913157, "percentage": 36.95, "elapsed_time": "0:07:38", "remaining_time": "0:13:01", "throughput": 3004.51, "total_tokens": 1377024} +{"current_steps": 2805, "total_steps": 7577, "loss": 0.0806, "lr": 1.587673429615424e-06, "epoch": 0.37019928731688, "percentage": 37.02, "elapsed_time": "0:07:38", "remaining_time": "0:13:00", "throughput": 3007.81, "total_tokens": 1379392} +{"current_steps": 2810, "total_steps": 7577, "loss": 0.1468, "lr": 1.5858080672124495e-06, "epoch": 0.37085917909462845, "percentage": 37.09, "elapsed_time": "0:07:38", "remaining_time": "0:12:58", "throughput": 3011.12, "total_tokens": 1381760} +{"current_steps": 2815, "total_steps": 7577, "loss": 0.0923, "lr": 1.5839395962904536e-06, "epoch": 0.37151907087237696, "percentage": 37.15, "elapsed_time": "0:07:39", "remaining_time": "0:12:56", "throughput": 3014.41, "total_tokens": 1384128} +{"current_steps": 2820, "total_steps": 7577, "loss": 0.0594, "lr": 1.5820680267642494e-06, "epoch": 0.3721789626501254, "percentage": 37.22, "elapsed_time": "0:07:39", "remaining_time": "0:12:55", "throughput": 3017.7, "total_tokens": 1386496} +{"current_steps": 2825, "total_steps": 7577, "loss": 0.0668, "lr": 1.5801933685650917e-06, "epoch": 0.37283885442787384, "percentage": 37.28, "elapsed_time": "0:07:39", "remaining_time": "0:12:53", "throughput": 3020.72, "total_tokens": 1388736} +{"current_steps": 2830, "total_steps": 7577, "loss": 0.002, "lr": 1.5783156316406259e-06, "epoch": 0.3734987462056223, "percentage": 37.35, "elapsed_time": "0:07:40", "remaining_time": "0:12:51", "throughput": 3023.87, "total_tokens": 1391040} +{"current_steps": 2835, "total_steps": 7577, "loss": 0.218, "lr": 1.5764348259548334e-06, "epoch": 0.37415863798337073, "percentage": 37.42, "elapsed_time": "0:07:40", "remaining_time": "0:12:49", "throughput": 3027.02, "total_tokens": 1393344} +{"current_steps": 2840, "total_steps": 7577, "loss": 0.056, "lr": 1.5745509614879806e-06, "epoch": 0.3748185297611192, "percentage": 37.48, "elapsed_time": "0:07:40", "remaining_time": "0:12:48", "throughput": 3030.17, "total_tokens": 1395648} +{"current_steps": 2845, "total_steps": 7577, "loss": 0.2865, "lr": 1.572664048236564e-06, "epoch": 0.3754784215388676, "percentage": 37.55, "elapsed_time": "0:07:40", "remaining_time": "0:12:46", "throughput": 3033.99, "total_tokens": 1398272} +{"current_steps": 2850, "total_steps": 7577, "loss": 0.0507, "lr": 1.570774096213259e-06, "epoch": 0.37613831331661607, "percentage": 37.61, "elapsed_time": "0:07:41", "remaining_time": "0:12:44", "throughput": 3037.11, "total_tokens": 1400576} +{"current_steps": 2855, "total_steps": 7577, "loss": 0.0513, "lr": 1.5688811154468649e-06, "epoch": 0.3767982050943645, "percentage": 37.68, "elapsed_time": "0:07:41", "remaining_time": "0:12:43", "throughput": 3040.78, "total_tokens": 1403136} +{"current_steps": 2860, "total_steps": 7577, "loss": 0.1228, "lr": 1.5669851159822532e-06, "epoch": 0.37745809687211296, "percentage": 37.75, "elapsed_time": "0:07:41", "remaining_time": "0:12:41", "throughput": 3044.04, "total_tokens": 1405504} +{"current_steps": 2865, "total_steps": 7577, "loss": 0.1389, "lr": 1.5650861078803137e-06, "epoch": 0.3781179886498614, "percentage": 37.81, "elapsed_time": "0:07:42", "remaining_time": "0:12:39", "throughput": 3047.15, "total_tokens": 1407808} +{"current_steps": 2870, "total_steps": 7577, "loss": 0.0692, "lr": 1.5631841012179013e-06, "epoch": 0.37877788042760985, "percentage": 37.88, "elapsed_time": "0:07:42", "remaining_time": "0:12:38", "throughput": 3050.66, "total_tokens": 1410304} +{"current_steps": 2875, "total_steps": 7577, "loss": 0.004, "lr": 1.5612791060877818e-06, "epoch": 0.37943777220535835, "percentage": 37.94, "elapsed_time": "0:07:42", "remaining_time": "0:12:36", "throughput": 3054.03, "total_tokens": 1412736} +{"current_steps": 2880, "total_steps": 7577, "loss": 0.0961, "lr": 1.5593711325985801e-06, "epoch": 0.3800976639831068, "percentage": 38.01, "elapsed_time": "0:07:42", "remaining_time": "0:12:34", "throughput": 3058.07, "total_tokens": 1415488} +{"current_steps": 2885, "total_steps": 7577, "loss": 0.21, "lr": 1.5574601908747245e-06, "epoch": 0.38075755576085524, "percentage": 38.08, "elapsed_time": "0:07:43", "remaining_time": "0:12:33", "throughput": 3061.28, "total_tokens": 1417856} +{"current_steps": 2890, "total_steps": 7577, "loss": 0.0664, "lr": 1.5555462910563936e-06, "epoch": 0.3814174475386037, "percentage": 38.14, "elapsed_time": "0:07:43", "remaining_time": "0:12:31", "throughput": 3064.24, "total_tokens": 1420096} +{"current_steps": 2895, "total_steps": 7577, "loss": 0.2344, "lr": 1.5536294432994636e-06, "epoch": 0.3820773393163521, "percentage": 38.21, "elapsed_time": "0:07:43", "remaining_time": "0:12:29", "throughput": 3067.86, "total_tokens": 1422656} +{"current_steps": 2900, "total_steps": 7577, "loss": 0.0884, "lr": 1.5517096577754528e-06, "epoch": 0.38273723109410057, "percentage": 38.27, "elapsed_time": "0:07:44", "remaining_time": "0:12:28", "throughput": 3071.36, "total_tokens": 1425152} +{"current_steps": 2905, "total_steps": 7577, "loss": 0.0623, "lr": 1.5497869446714695e-06, "epoch": 0.383397122871849, "percentage": 38.34, "elapsed_time": "0:07:44", "remaining_time": "0:12:26", "throughput": 3075.25, "total_tokens": 1427840} +{"current_steps": 2910, "total_steps": 7577, "loss": 0.0019, "lr": 1.5478613141901558e-06, "epoch": 0.38405701464959746, "percentage": 38.41, "elapsed_time": "0:07:44", "remaining_time": "0:12:25", "throughput": 3078.34, "total_tokens": 1430144} +{"current_steps": 2915, "total_steps": 7577, "loss": 0.1492, "lr": 1.5459327765496348e-06, "epoch": 0.3847169064273459, "percentage": 38.47, "elapsed_time": "0:07:44", "remaining_time": "0:12:23", "throughput": 3081.41, "total_tokens": 1432448} +{"current_steps": 2920, "total_steps": 7577, "loss": 0.0071, "lr": 1.5440013419834563e-06, "epoch": 0.38537679820509435, "percentage": 38.54, "elapsed_time": "0:07:45", "remaining_time": "0:12:21", "throughput": 3084.48, "total_tokens": 1434752} +{"current_steps": 2925, "total_steps": 7577, "loss": 0.0011, "lr": 1.5420670207405419e-06, "epoch": 0.3860366899828428, "percentage": 38.6, "elapsed_time": "0:07:45", "remaining_time": "0:12:20", "throughput": 3087.82, "total_tokens": 1437184} +{"current_steps": 2930, "total_steps": 7577, "loss": 0.1098, "lr": 1.5401298230851314e-06, "epoch": 0.38669658176059124, "percentage": 38.67, "elapsed_time": "0:07:45", "remaining_time": "0:12:18", "throughput": 3091.93, "total_tokens": 1440000} +{"current_steps": 2935, "total_steps": 7577, "loss": 0.0072, "lr": 1.5381897592967275e-06, "epoch": 0.3873564735383397, "percentage": 38.74, "elapsed_time": "0:07:46", "remaining_time": "0:12:17", "throughput": 3095.64, "total_tokens": 1442624} +{"current_steps": 2940, "total_steps": 7577, "loss": 0.0702, "lr": 1.5362468396700426e-06, "epoch": 0.3880163653160882, "percentage": 38.8, "elapsed_time": "0:07:46", "remaining_time": "0:12:15", "throughput": 3099.22, "total_tokens": 1445184} +{"current_steps": 2945, "total_steps": 7577, "loss": 0.322, "lr": 1.5343010745149418e-06, "epoch": 0.38867625709383663, "percentage": 38.87, "elapsed_time": "0:07:46", "remaining_time": "0:12:13", "throughput": 3102.51, "total_tokens": 1447616} +{"current_steps": 2950, "total_steps": 7577, "loss": 0.0715, "lr": 1.532352474156391e-06, "epoch": 0.3893361488715851, "percentage": 38.93, "elapsed_time": "0:07:46", "remaining_time": "0:12:12", "throughput": 3106.07, "total_tokens": 1450176} +{"current_steps": 2955, "total_steps": 7577, "loss": 0.4706, "lr": 1.5304010489343995e-06, "epoch": 0.3899960406493335, "percentage": 39.0, "elapsed_time": "0:07:47", "remaining_time": "0:12:10", "throughput": 3109.52, "total_tokens": 1452672} +{"current_steps": 2960, "total_steps": 7577, "loss": 0.2238, "lr": 1.528446809203968e-06, "epoch": 0.39065593242708196, "percentage": 39.07, "elapsed_time": "0:07:47", "remaining_time": "0:12:09", "throughput": 3113.1, "total_tokens": 1455232} +{"current_steps": 2965, "total_steps": 7577, "loss": 0.1729, "lr": 1.526489765335031e-06, "epoch": 0.3913158242048304, "percentage": 39.13, "elapsed_time": "0:07:47", "remaining_time": "0:12:07", "throughput": 3116.66, "total_tokens": 1457792} +{"current_steps": 2970, "total_steps": 7577, "loss": 0.1528, "lr": 1.5245299277124026e-06, "epoch": 0.39197571598257885, "percentage": 39.2, "elapsed_time": "0:07:48", "remaining_time": "0:12:05", "throughput": 3119.83, "total_tokens": 1460160} +{"current_steps": 2975, "total_steps": 7577, "loss": 0.1434, "lr": 1.5225673067357218e-06, "epoch": 0.3926356077603273, "percentage": 39.26, "elapsed_time": "0:07:48", "remaining_time": "0:12:04", "throughput": 3122.73, "total_tokens": 1462400} +{"current_steps": 2980, "total_steps": 7577, "loss": 0.1209, "lr": 1.5206019128193981e-06, "epoch": 0.39329549953807574, "percentage": 39.33, "elapsed_time": "0:07:48", "remaining_time": "0:12:02", "throughput": 3126.55, "total_tokens": 1465088} +{"current_steps": 2985, "total_steps": 7577, "loss": 0.1168, "lr": 1.5186337563925538e-06, "epoch": 0.3939553913158242, "percentage": 39.4, "elapsed_time": "0:07:48", "remaining_time": "0:12:01", "throughput": 3129.71, "total_tokens": 1467456} +{"current_steps": 2990, "total_steps": 7577, "loss": 0.0016, "lr": 1.516662847898971e-06, "epoch": 0.39461528309357263, "percentage": 39.46, "elapsed_time": "0:07:49", "remaining_time": "0:11:59", "throughput": 3133.25, "total_tokens": 1470016} +{"current_steps": 2995, "total_steps": 7577, "loss": 0.1024, "lr": 1.5146891977970349e-06, "epoch": 0.3952751748713211, "percentage": 39.53, "elapsed_time": "0:07:49", "remaining_time": "0:11:58", "throughput": 3136.54, "total_tokens": 1472448} +{"current_steps": 3000, "total_steps": 7577, "loss": 0.1009, "lr": 1.5127128165596794e-06, "epoch": 0.3959350666490696, "percentage": 39.59, "elapsed_time": "0:07:49", "remaining_time": "0:11:56", "throughput": 3140.18, "total_tokens": 1475072} +{"current_steps": 3005, "total_steps": 7577, "loss": 0.0499, "lr": 1.51073371467433e-06, "epoch": 0.396594958426818, "percentage": 39.66, "elapsed_time": "0:07:50", "remaining_time": "0:11:55", "throughput": 3143.31, "total_tokens": 1477440} +{"current_steps": 3010, "total_steps": 7577, "loss": 0.0043, "lr": 1.5087519026428498e-06, "epoch": 0.39725485020456647, "percentage": 39.73, "elapsed_time": "0:07:50", "remaining_time": "0:11:53", "throughput": 3146.57, "total_tokens": 1479872} +{"current_steps": 3015, "total_steps": 7577, "loss": 0.1242, "lr": 1.5067673909814818e-06, "epoch": 0.3979147419823149, "percentage": 39.79, "elapsed_time": "0:07:50", "remaining_time": "0:11:52", "throughput": 3149.02, "total_tokens": 1481920} +{"current_steps": 3020, "total_steps": 7577, "loss": 0.1901, "lr": 1.5047801902207953e-06, "epoch": 0.39857463376006336, "percentage": 39.86, "elapsed_time": "0:07:50", "remaining_time": "0:11:50", "throughput": 3153.57, "total_tokens": 1484992} +{"current_steps": 3025, "total_steps": 7577, "loss": 0.1508, "lr": 1.5027903109056288e-06, "epoch": 0.3992345255378118, "percentage": 39.92, "elapsed_time": "0:07:51", "remaining_time": "0:11:49", "throughput": 3156.43, "total_tokens": 1487232} +{"current_steps": 3030, "total_steps": 7577, "loss": 0.1615, "lr": 1.5007977635950336e-06, "epoch": 0.39989441731556025, "percentage": 39.99, "elapsed_time": "0:07:51", "remaining_time": "0:11:47", "throughput": 3159.8, "total_tokens": 1489728} +{"current_steps": 3032, "total_steps": 7577, "eval_loss": 0.13228875398635864, "epoch": 0.4001583740266596, "percentage": 40.02, "elapsed_time": "0:07:59", "remaining_time": "0:11:58", "throughput": 3110.38, "total_tokens": 1490688} +{"current_steps": 3035, "total_steps": 7577, "loss": 0.154, "lr": 1.498802558862219e-06, "epoch": 0.4005543090933087, "percentage": 40.06, "elapsed_time": "0:08:27", "remaining_time": "0:12:39", "throughput": 2939.48, "total_tokens": 1491968} +{"current_steps": 3040, "total_steps": 7577, "loss": 0.1078, "lr": 1.496804707294496e-06, "epoch": 0.40121420087105714, "percentage": 40.12, "elapsed_time": "0:08:27", "remaining_time": "0:12:37", "throughput": 2942.48, "total_tokens": 1494336} +{"current_steps": 3045, "total_steps": 7577, "loss": 0.0599, "lr": 1.4948042194932195e-06, "epoch": 0.4018740926488056, "percentage": 40.19, "elapsed_time": "0:08:28", "remaining_time": "0:12:36", "throughput": 2946.93, "total_tokens": 1497472} +{"current_steps": 3050, "total_steps": 7577, "loss": 0.0399, "lr": 1.4928011060737341e-06, "epoch": 0.402533984426554, "percentage": 40.25, "elapsed_time": "0:08:28", "remaining_time": "0:12:34", "throughput": 2950.18, "total_tokens": 1499968} +{"current_steps": 3055, "total_steps": 7577, "loss": 0.0741, "lr": 1.4907953776653171e-06, "epoch": 0.40319387620430247, "percentage": 40.32, "elapsed_time": "0:08:28", "remaining_time": "0:12:33", "throughput": 2953.18, "total_tokens": 1502336} +{"current_steps": 3060, "total_steps": 7577, "loss": 0.1581, "lr": 1.4887870449111206e-06, "epoch": 0.40385376798205097, "percentage": 40.39, "elapsed_time": "0:08:29", "remaining_time": "0:12:31", "throughput": 2955.93, "total_tokens": 1504576} +{"current_steps": 3065, "total_steps": 7577, "loss": 0.1605, "lr": 1.486776118468118e-06, "epoch": 0.4045136597597994, "percentage": 40.45, "elapsed_time": "0:08:29", "remaining_time": "0:12:29", "throughput": 2959.31, "total_tokens": 1507136} +{"current_steps": 3070, "total_steps": 7577, "loss": 0.0716, "lr": 1.4847626090070451e-06, "epoch": 0.40517355153754786, "percentage": 40.52, "elapsed_time": "0:08:29", "remaining_time": "0:12:28", "throughput": 2962.68, "total_tokens": 1509696} +{"current_steps": 3075, "total_steps": 7577, "loss": 0.299, "lr": 1.4827465272123439e-06, "epoch": 0.4058334433152963, "percentage": 40.58, "elapsed_time": "0:08:29", "remaining_time": "0:12:26", "throughput": 2965.92, "total_tokens": 1512192} +{"current_steps": 3080, "total_steps": 7577, "loss": 0.0453, "lr": 1.4807278837821063e-06, "epoch": 0.40649333509304475, "percentage": 40.65, "elapsed_time": "0:08:30", "remaining_time": "0:12:24", "throughput": 2969.28, "total_tokens": 1514752} +{"current_steps": 3085, "total_steps": 7577, "loss": 0.2992, "lr": 1.4787066894280178e-06, "epoch": 0.4071532268707932, "percentage": 40.72, "elapsed_time": "0:08:30", "remaining_time": "0:12:23", "throughput": 2972.84, "total_tokens": 1517440} +{"current_steps": 3090, "total_steps": 7577, "loss": 0.0637, "lr": 1.476682954875299e-06, "epoch": 0.40781311864854164, "percentage": 40.78, "elapsed_time": "0:08:30", "remaining_time": "0:12:21", "throughput": 2975.7, "total_tokens": 1519744} +{"current_steps": 3095, "total_steps": 7577, "loss": 0.0773, "lr": 1.4746566908626506e-06, "epoch": 0.4084730104262901, "percentage": 40.85, "elapsed_time": "0:08:31", "remaining_time": "0:12:20", "throughput": 2978.78, "total_tokens": 1522176} +{"current_steps": 3100, "total_steps": 7577, "loss": 0.0516, "lr": 1.4726279081421956e-06, "epoch": 0.40913290220403853, "percentage": 40.91, "elapsed_time": "0:08:31", "remaining_time": "0:12:18", "throughput": 2981.39, "total_tokens": 1524352} +{"current_steps": 3105, "total_steps": 7577, "loss": 0.2317, "lr": 1.4705966174794216e-06, "epoch": 0.409792793981787, "percentage": 40.98, "elapsed_time": "0:08:31", "remaining_time": "0:12:16", "throughput": 2984.83, "total_tokens": 1526976} +{"current_steps": 3110, "total_steps": 7577, "loss": 0.1563, "lr": 1.4685628296531248e-06, "epoch": 0.4104526857595354, "percentage": 41.05, "elapsed_time": "0:08:31", "remaining_time": "0:12:15", "throughput": 2987.43, "total_tokens": 1529152} +{"current_steps": 3115, "total_steps": 7577, "loss": 0.051, "lr": 1.466526555455352e-06, "epoch": 0.41111257753728386, "percentage": 41.11, "elapsed_time": "0:08:32", "remaining_time": "0:12:13", "throughput": 2990.63, "total_tokens": 1531648} +{"current_steps": 3120, "total_steps": 7577, "loss": 0.0057, "lr": 1.4644878056913432e-06, "epoch": 0.4117724693150323, "percentage": 41.18, "elapsed_time": "0:08:32", "remaining_time": "0:12:12", "throughput": 2993.48, "total_tokens": 1533952} +{"current_steps": 3125, "total_steps": 7577, "loss": 0.1887, "lr": 1.4624465911794764e-06, "epoch": 0.4124323610927808, "percentage": 41.24, "elapsed_time": "0:08:32", "remaining_time": "0:12:10", "throughput": 2997.02, "total_tokens": 1536640} +{"current_steps": 3130, "total_steps": 7577, "loss": 0.0053, "lr": 1.4604029227512062e-06, "epoch": 0.41309225287052925, "percentage": 41.31, "elapsed_time": "0:08:33", "remaining_time": "0:12:08", "throughput": 3000.31, "total_tokens": 1539200} +{"current_steps": 3135, "total_steps": 7577, "loss": 0.1908, "lr": 1.4583568112510108e-06, "epoch": 0.4137521446482777, "percentage": 41.38, "elapsed_time": "0:08:33", "remaining_time": "0:12:07", "throughput": 3003.39, "total_tokens": 1541632} +{"current_steps": 3140, "total_steps": 7577, "loss": 0.0965, "lr": 1.4563082675363302e-06, "epoch": 0.41441203642602614, "percentage": 41.44, "elapsed_time": "0:08:33", "remaining_time": "0:12:05", "throughput": 3006.58, "total_tokens": 1544128} +{"current_steps": 3145, "total_steps": 7577, "loss": 0.0228, "lr": 1.4542573024775122e-06, "epoch": 0.4150719282037746, "percentage": 41.51, "elapsed_time": "0:08:33", "remaining_time": "0:12:04", "throughput": 3009.27, "total_tokens": 1546368} +{"current_steps": 3150, "total_steps": 7577, "loss": 0.2984, "lr": 1.4522039269577521e-06, "epoch": 0.41573181998152303, "percentage": 41.57, "elapsed_time": "0:08:34", "remaining_time": "0:12:02", "throughput": 3012.22, "total_tokens": 1548736} +{"current_steps": 3155, "total_steps": 7577, "loss": 0.2461, "lr": 1.4501481518730372e-06, "epoch": 0.4163917117592715, "percentage": 41.64, "elapsed_time": "0:08:34", "remaining_time": "0:12:01", "throughput": 3015.27, "total_tokens": 1551168} +{"current_steps": 3160, "total_steps": 7577, "loss": 0.0719, "lr": 1.4480899881320868e-06, "epoch": 0.4170516035370199, "percentage": 41.71, "elapsed_time": "0:08:34", "remaining_time": "0:11:59", "throughput": 3018.44, "total_tokens": 1553664} +{"current_steps": 3165, "total_steps": 7577, "loss": 0.1771, "lr": 1.4460294466562956e-06, "epoch": 0.41771149531476837, "percentage": 41.77, "elapsed_time": "0:08:35", "remaining_time": "0:11:57", "throughput": 3021.25, "total_tokens": 1555968} +{"current_steps": 3170, "total_steps": 7577, "loss": 0.0399, "lr": 1.4439665383796756e-06, "epoch": 0.4183713870925168, "percentage": 41.84, "elapsed_time": "0:08:35", "remaining_time": "0:11:56", "throughput": 3023.93, "total_tokens": 1558208} +{"current_steps": 3175, "total_steps": 7577, "loss": 0.0054, "lr": 1.4419012742487972e-06, "epoch": 0.41903127887026526, "percentage": 41.9, "elapsed_time": "0:08:35", "remaining_time": "0:11:54", "throughput": 3026.97, "total_tokens": 1560640} +{"current_steps": 3180, "total_steps": 7577, "loss": 0.095, "lr": 1.4398336652227335e-06, "epoch": 0.4196911706480137, "percentage": 41.97, "elapsed_time": "0:08:35", "remaining_time": "0:11:53", "throughput": 3030.5, "total_tokens": 1563328} +{"current_steps": 3185, "total_steps": 7577, "loss": 0.1201, "lr": 1.4377637222729986e-06, "epoch": 0.4203510624257622, "percentage": 42.04, "elapsed_time": "0:08:36", "remaining_time": "0:11:51", "throughput": 3033.41, "total_tokens": 1565696} +{"current_steps": 3190, "total_steps": 7577, "loss": 0.1675, "lr": 1.435691456383493e-06, "epoch": 0.42101095420351065, "percentage": 42.1, "elapsed_time": "0:08:36", "remaining_time": "0:11:50", "throughput": 3037.39, "total_tokens": 1568640} +{"current_steps": 3195, "total_steps": 7577, "loss": 0.1212, "lr": 1.433616878550442e-06, "epoch": 0.4216708459812591, "percentage": 42.17, "elapsed_time": "0:08:36", "remaining_time": "0:11:48", "throughput": 3040.89, "total_tokens": 1571328} +{"current_steps": 3200, "total_steps": 7577, "loss": 0.3175, "lr": 1.4315399997823403e-06, "epoch": 0.42233073775900754, "percentage": 42.23, "elapsed_time": "0:08:37", "remaining_time": "0:11:47", "throughput": 3044.38, "total_tokens": 1574016} +{"current_steps": 3205, "total_steps": 7577, "loss": 0.2534, "lr": 1.429460831099891e-06, "epoch": 0.422990629536756, "percentage": 42.3, "elapsed_time": "0:08:37", "remaining_time": "0:11:45", "throughput": 3047.26, "total_tokens": 1576384} +{"current_steps": 3210, "total_steps": 7577, "loss": 0.2136, "lr": 1.4273793835359492e-06, "epoch": 0.4236505213145044, "percentage": 42.37, "elapsed_time": "0:08:37", "remaining_time": "0:11:44", "throughput": 3050.98, "total_tokens": 1579200} +{"current_steps": 3215, "total_steps": 7577, "loss": 0.0964, "lr": 1.4252956681354631e-06, "epoch": 0.42431041309225287, "percentage": 42.43, "elapsed_time": "0:08:37", "remaining_time": "0:11:42", "throughput": 3053.99, "total_tokens": 1581632} +{"current_steps": 3220, "total_steps": 7577, "loss": 0.0035, "lr": 1.4232096959554135e-06, "epoch": 0.4249703048700013, "percentage": 42.5, "elapsed_time": "0:08:38", "remaining_time": "0:11:41", "throughput": 3056.98, "total_tokens": 1584064} +{"current_steps": 3225, "total_steps": 7577, "loss": 0.0297, "lr": 1.4211214780647572e-06, "epoch": 0.42563019664774976, "percentage": 42.56, "elapsed_time": "0:08:38", "remaining_time": "0:11:39", "throughput": 3060.48, "total_tokens": 1586752} +{"current_steps": 3230, "total_steps": 7577, "loss": 0.0918, "lr": 1.4190310255443676e-06, "epoch": 0.4262900884254982, "percentage": 42.63, "elapsed_time": "0:08:38", "remaining_time": "0:11:38", "throughput": 3063.61, "total_tokens": 1589248} +{"current_steps": 3235, "total_steps": 7577, "loss": 0.0286, "lr": 1.4169383494869764e-06, "epoch": 0.42694998020324665, "percentage": 42.69, "elapsed_time": "0:08:39", "remaining_time": "0:11:36", "throughput": 3066.37, "total_tokens": 1591552} +{"current_steps": 3240, "total_steps": 7577, "loss": 0.0616, "lr": 1.414843460997113e-06, "epoch": 0.4276098719809951, "percentage": 42.76, "elapsed_time": "0:08:39", "remaining_time": "0:11:35", "throughput": 3069.49, "total_tokens": 1594048} +{"current_steps": 3245, "total_steps": 7577, "loss": 0.1517, "lr": 1.4127463711910483e-06, "epoch": 0.4282697637587436, "percentage": 42.83, "elapsed_time": "0:08:39", "remaining_time": "0:11:33", "throughput": 3072.6, "total_tokens": 1596544} +{"current_steps": 3250, "total_steps": 7577, "loss": 0.1214, "lr": 1.410647091196733e-06, "epoch": 0.42892965553649204, "percentage": 42.89, "elapsed_time": "0:08:39", "remaining_time": "0:11:32", "throughput": 3075.85, "total_tokens": 1599104} +{"current_steps": 3255, "total_steps": 7577, "loss": 0.124, "lr": 1.4085456321537402e-06, "epoch": 0.4295895473142405, "percentage": 42.96, "elapsed_time": "0:08:40", "remaining_time": "0:11:30", "throughput": 3078.46, "total_tokens": 1601344} +{"current_steps": 3260, "total_steps": 7577, "loss": 0.1022, "lr": 1.4064420052132056e-06, "epoch": 0.43024943909198893, "percentage": 43.02, "elapsed_time": "0:08:40", "remaining_time": "0:11:29", "throughput": 3081.8, "total_tokens": 1603968} +{"current_steps": 3265, "total_steps": 7577, "loss": 0.078, "lr": 1.4043362215377696e-06, "epoch": 0.4309093308697374, "percentage": 43.09, "elapsed_time": "0:08:40", "remaining_time": "0:11:27", "throughput": 3084.79, "total_tokens": 1606400} +{"current_steps": 3270, "total_steps": 7577, "loss": 0.1095, "lr": 1.4022282923015158e-06, "epoch": 0.4315692226474858, "percentage": 43.16, "elapsed_time": "0:08:41", "remaining_time": "0:11:26", "throughput": 3088.0, "total_tokens": 1608960} +{"current_steps": 3275, "total_steps": 7577, "loss": 0.0042, "lr": 1.4001182286899136e-06, "epoch": 0.43222911442523426, "percentage": 43.22, "elapsed_time": "0:08:41", "remaining_time": "0:11:24", "throughput": 3091.11, "total_tokens": 1611456} +{"current_steps": 3280, "total_steps": 7577, "loss": 0.0458, "lr": 1.398006041899758e-06, "epoch": 0.4328890062029827, "percentage": 43.29, "elapsed_time": "0:08:41", "remaining_time": "0:11:23", "throughput": 3094.21, "total_tokens": 1613952} +{"current_steps": 3285, "total_steps": 7577, "loss": 0.1192, "lr": 1.3958917431391102e-06, "epoch": 0.43354889798073115, "percentage": 43.35, "elapsed_time": "0:08:41", "remaining_time": "0:11:21", "throughput": 3097.08, "total_tokens": 1616320} +{"current_steps": 3290, "total_steps": 7577, "loss": 0.1763, "lr": 1.3937753436272388e-06, "epoch": 0.4342087897584796, "percentage": 43.42, "elapsed_time": "0:08:42", "remaining_time": "0:11:20", "throughput": 3100.76, "total_tokens": 1619136} +{"current_steps": 3295, "total_steps": 7577, "loss": 0.0483, "lr": 1.3916568545945597e-06, "epoch": 0.43486868153622804, "percentage": 43.49, "elapsed_time": "0:08:42", "remaining_time": "0:11:18", "throughput": 3103.84, "total_tokens": 1621632} +{"current_steps": 3300, "total_steps": 7577, "loss": 0.1352, "lr": 1.3895362872825764e-06, "epoch": 0.4355285733139765, "percentage": 43.55, "elapsed_time": "0:08:42", "remaining_time": "0:11:17", "throughput": 3106.8, "total_tokens": 1624064} +{"current_steps": 3305, "total_steps": 7577, "loss": 0.1454, "lr": 1.3874136529438205e-06, "epoch": 0.43618846509172493, "percentage": 43.62, "elapsed_time": "0:08:43", "remaining_time": "0:11:16", "throughput": 3109.77, "total_tokens": 1626496} +{"current_steps": 3310, "total_steps": 7577, "loss": 0.0691, "lr": 1.3852889628417918e-06, "epoch": 0.43684835686947343, "percentage": 43.68, "elapsed_time": "0:08:43", "remaining_time": "0:11:14", "throughput": 3112.48, "total_tokens": 1628800} +{"current_steps": 3315, "total_steps": 7577, "loss": 0.0967, "lr": 1.3831622282508994e-06, "epoch": 0.4375082486472219, "percentage": 43.75, "elapsed_time": "0:08:43", "remaining_time": "0:11:13", "throughput": 3115.44, "total_tokens": 1631232} +{"current_steps": 3320, "total_steps": 7577, "loss": 0.0018, "lr": 1.3810334604564007e-06, "epoch": 0.4381681404249703, "percentage": 43.82, "elapsed_time": "0:08:43", "remaining_time": "0:11:11", "throughput": 3118.49, "total_tokens": 1633728} +{"current_steps": 3325, "total_steps": 7577, "loss": 0.0695, "lr": 1.3789026707543423e-06, "epoch": 0.43882803220271877, "percentage": 43.88, "elapsed_time": "0:08:44", "remaining_time": "0:11:10", "throughput": 3121.56, "total_tokens": 1636224} +{"current_steps": 3330, "total_steps": 7577, "loss": 0.0631, "lr": 1.3767698704514998e-06, "epoch": 0.4394879239804672, "percentage": 43.95, "elapsed_time": "0:08:44", "remaining_time": "0:11:08", "throughput": 3123.78, "total_tokens": 1638272} +{"current_steps": 3335, "total_steps": 7577, "loss": 0.1898, "lr": 1.3746350708653175e-06, "epoch": 0.44014781575821565, "percentage": 44.01, "elapsed_time": "0:08:44", "remaining_time": "0:11:07", "throughput": 3126.34, "total_tokens": 1640512} +{"current_steps": 3340, "total_steps": 7577, "loss": 0.1903, "lr": 1.3724982833238495e-06, "epoch": 0.4408077075359641, "percentage": 44.08, "elapsed_time": "0:08:45", "remaining_time": "0:11:06", "throughput": 3129.27, "total_tokens": 1642944} +{"current_steps": 3345, "total_steps": 7577, "loss": 0.0559, "lr": 1.370359519165697e-06, "epoch": 0.44146759931371254, "percentage": 44.15, "elapsed_time": "0:08:45", "remaining_time": "0:11:04", "throughput": 3132.2, "total_tokens": 1645376} +{"current_steps": 3350, "total_steps": 7577, "loss": 0.0108, "lr": 1.368218789739952e-06, "epoch": 0.442127491091461, "percentage": 44.21, "elapsed_time": "0:08:45", "remaining_time": "0:11:03", "throughput": 3135.37, "total_tokens": 1647936} +{"current_steps": 3355, "total_steps": 7577, "loss": 0.065, "lr": 1.3660761064061337e-06, "epoch": 0.44278738286920943, "percentage": 44.28, "elapsed_time": "0:08:45", "remaining_time": "0:11:01", "throughput": 3138.52, "total_tokens": 1650496} +{"current_steps": 3360, "total_steps": 7577, "loss": 0.0935, "lr": 1.3639314805341297e-06, "epoch": 0.4434472746469579, "percentage": 44.34, "elapsed_time": "0:08:46", "remaining_time": "0:11:00", "throughput": 3141.56, "total_tokens": 1652992} +{"current_steps": 3365, "total_steps": 7577, "loss": 0.0665, "lr": 1.3617849235041355e-06, "epoch": 0.4441071664247063, "percentage": 44.41, "elapsed_time": "0:08:46", "remaining_time": "0:10:58", "throughput": 3144.59, "total_tokens": 1655488} +{"current_steps": 3370, "total_steps": 7577, "loss": 0.1599, "lr": 1.3596364467065938e-06, "epoch": 0.4447670582024548, "percentage": 44.48, "elapsed_time": "0:08:46", "remaining_time": "0:10:57", "throughput": 3147.61, "total_tokens": 1657984} +{"current_steps": 3375, "total_steps": 7577, "loss": 0.229, "lr": 1.3574860615421346e-06, "epoch": 0.44542694998020327, "percentage": 44.54, "elapsed_time": "0:08:47", "remaining_time": "0:10:56", "throughput": 3151.09, "total_tokens": 1660736} +{"current_steps": 3380, "total_steps": 7577, "loss": 0.192, "lr": 1.3553337794215147e-06, "epoch": 0.4460868417579517, "percentage": 44.61, "elapsed_time": "0:08:47", "remaining_time": "0:10:54", "throughput": 3153.86, "total_tokens": 1663104} +{"current_steps": 3385, "total_steps": 7577, "loss": 0.0766, "lr": 1.3531796117655565e-06, "epoch": 0.44674673353570016, "percentage": 44.67, "elapsed_time": "0:08:47", "remaining_time": "0:10:53", "throughput": 3156.4, "total_tokens": 1665344} +{"current_steps": 3390, "total_steps": 7577, "loss": 0.1651, "lr": 1.3510235700050873e-06, "epoch": 0.4474066253134486, "percentage": 44.74, "elapsed_time": "0:08:47", "remaining_time": "0:10:52", "throughput": 3159.87, "total_tokens": 1668096} +{"current_steps": 3395, "total_steps": 7577, "loss": 0.0679, "lr": 1.3488656655808801e-06, "epoch": 0.44806651709119705, "percentage": 44.81, "elapsed_time": "0:08:48", "remaining_time": "0:10:50", "throughput": 3162.3, "total_tokens": 1670272} +{"current_steps": 3400, "total_steps": 7577, "loss": 0.0905, "lr": 1.3467059099435912e-06, "epoch": 0.4487264088689455, "percentage": 44.87, "elapsed_time": "0:08:48", "remaining_time": "0:10:49", "throughput": 3164.7, "total_tokens": 1672448} +{"current_steps": 3405, "total_steps": 7577, "loss": 0.0608, "lr": 1.3445443145537002e-06, "epoch": 0.44938630064669394, "percentage": 44.94, "elapsed_time": "0:08:48", "remaining_time": "0:10:47", "throughput": 3168.15, "total_tokens": 1675200} +{"current_steps": 3410, "total_steps": 7577, "loss": 0.0698, "lr": 1.3423808908814494e-06, "epoch": 0.4500461924244424, "percentage": 45.0, "elapsed_time": "0:08:49", "remaining_time": "0:10:46", "throughput": 3171.14, "total_tokens": 1677696} +{"current_steps": 3411, "total_steps": 7577, "eval_loss": 0.1182408258318901, "epoch": 0.45017817077999206, "percentage": 45.02, "elapsed_time": "0:08:56", "remaining_time": "0:10:55", "throughput": 3126.86, "total_tokens": 1678208} +{"current_steps": 3415, "total_steps": 7577, "loss": 0.0969, "lr": 1.3402156504067826e-06, "epoch": 0.4507060842021908, "percentage": 45.07, "elapsed_time": "0:09:20", "remaining_time": "0:11:22", "throughput": 2999.06, "total_tokens": 1680256} +{"current_steps": 3420, "total_steps": 7577, "loss": 0.1191, "lr": 1.338048604619284e-06, "epoch": 0.45136597597993927, "percentage": 45.14, "elapsed_time": "0:09:20", "remaining_time": "0:11:21", "throughput": 3001.72, "total_tokens": 1682624} +{"current_steps": 3425, "total_steps": 7577, "loss": 0.0365, "lr": 1.3358797650181178e-06, "epoch": 0.4520258677576877, "percentage": 45.2, "elapsed_time": "0:09:20", "remaining_time": "0:11:19", "throughput": 3004.53, "total_tokens": 1685056} +{"current_steps": 3430, "total_steps": 7577, "loss": 0.1349, "lr": 1.3337091431119662e-06, "epoch": 0.45268575953543616, "percentage": 45.27, "elapsed_time": "0:09:21", "remaining_time": "0:11:18", "throughput": 3006.78, "total_tokens": 1687168} +{"current_steps": 3435, "total_steps": 7577, "loss": 0.3197, "lr": 1.3315367504189698e-06, "epoch": 0.45334565131318466, "percentage": 45.33, "elapsed_time": "0:09:21", "remaining_time": "0:11:16", "throughput": 3008.92, "total_tokens": 1689216} +{"current_steps": 3440, "total_steps": 7577, "loss": 0.0946, "lr": 1.3293625984666656e-06, "epoch": 0.4540055430909331, "percentage": 45.4, "elapsed_time": "0:09:21", "remaining_time": "0:11:15", "throughput": 3011.96, "total_tokens": 1691776} +{"current_steps": 3445, "total_steps": 7577, "loss": 0.0012, "lr": 1.3271866987919254e-06, "epoch": 0.45466543486868155, "percentage": 45.47, "elapsed_time": "0:09:21", "remaining_time": "0:11:14", "throughput": 3014.98, "total_tokens": 1694336} +{"current_steps": 3450, "total_steps": 7577, "loss": 0.2113, "lr": 1.325009062940895e-06, "epoch": 0.45532532664643, "percentage": 45.53, "elapsed_time": "0:09:22", "remaining_time": "0:11:12", "throughput": 3017.56, "total_tokens": 1696640} +{"current_steps": 3455, "total_steps": 7577, "loss": 0.0765, "lr": 1.3228297024689336e-06, "epoch": 0.45598521842417844, "percentage": 45.6, "elapsed_time": "0:09:22", "remaining_time": "0:11:11", "throughput": 3020.02, "total_tokens": 1698880} +{"current_steps": 3460, "total_steps": 7577, "loss": 0.1025, "lr": 1.3206486289405519e-06, "epoch": 0.4566451102019269, "percentage": 45.66, "elapsed_time": "0:09:22", "remaining_time": "0:11:09", "throughput": 3022.82, "total_tokens": 1701312} +{"current_steps": 3465, "total_steps": 7577, "loss": 0.1407, "lr": 1.3184658539293496e-06, "epoch": 0.45730500197967533, "percentage": 45.73, "elapsed_time": "0:09:23", "remaining_time": "0:11:08", "throughput": 3025.71, "total_tokens": 1703808} +{"current_steps": 3470, "total_steps": 7577, "loss": 0.125, "lr": 1.3162813890179564e-06, "epoch": 0.4579648937574238, "percentage": 45.8, "elapsed_time": "0:09:23", "remaining_time": "0:11:06", "throughput": 3028.61, "total_tokens": 1706304} +{"current_steps": 3475, "total_steps": 7577, "loss": 0.3138, "lr": 1.314095245797969e-06, "epoch": 0.4586247855351722, "percentage": 45.86, "elapsed_time": "0:09:23", "remaining_time": "0:11:05", "throughput": 3031.41, "total_tokens": 1708736} +{"current_steps": 3480, "total_steps": 7577, "loss": 0.1379, "lr": 1.3119074358698891e-06, "epoch": 0.45928467731292066, "percentage": 45.93, "elapsed_time": "0:09:23", "remaining_time": "0:11:03", "throughput": 3034.31, "total_tokens": 1711232} +{"current_steps": 3485, "total_steps": 7577, "loss": 0.0039, "lr": 1.3097179708430634e-06, "epoch": 0.4599445690906691, "percentage": 45.99, "elapsed_time": "0:09:24", "remaining_time": "0:11:02", "throughput": 3036.98, "total_tokens": 1713600} +{"current_steps": 3490, "total_steps": 7577, "loss": 0.2013, "lr": 1.3075268623356214e-06, "epoch": 0.46060446086841755, "percentage": 46.06, "elapsed_time": "0:09:24", "remaining_time": "0:11:01", "throughput": 3040.1, "total_tokens": 1716224} +{"current_steps": 3495, "total_steps": 7577, "loss": 0.1515, "lr": 1.305334121974412e-06, "epoch": 0.46126435264616605, "percentage": 46.13, "elapsed_time": "0:09:24", "remaining_time": "0:10:59", "throughput": 3042.99, "total_tokens": 1718720} +{"current_steps": 3500, "total_steps": 7577, "loss": 0.1062, "lr": 1.3031397613949448e-06, "epoch": 0.4619242444239145, "percentage": 46.19, "elapsed_time": "0:09:25", "remaining_time": "0:10:58", "throughput": 3045.97, "total_tokens": 1721280} +{"current_steps": 3505, "total_steps": 7577, "loss": 0.0727, "lr": 1.3009437922413266e-06, "epoch": 0.46258413620166294, "percentage": 46.26, "elapsed_time": "0:09:25", "remaining_time": "0:10:56", "throughput": 3048.71, "total_tokens": 1723712} +{"current_steps": 3510, "total_steps": 7577, "loss": 0.0932, "lr": 1.2987462261661994e-06, "epoch": 0.4632440279794114, "percentage": 46.32, "elapsed_time": "0:09:25", "remaining_time": "0:10:55", "throughput": 3051.15, "total_tokens": 1725952} +{"current_steps": 3515, "total_steps": 7577, "loss": 0.0048, "lr": 1.2965470748306798e-06, "epoch": 0.46390391975715983, "percentage": 46.39, "elapsed_time": "0:09:25", "remaining_time": "0:10:54", "throughput": 3054.13, "total_tokens": 1728512} +{"current_steps": 3520, "total_steps": 7577, "loss": 0.094, "lr": 1.2943463499042957e-06, "epoch": 0.4645638115349083, "percentage": 46.46, "elapsed_time": "0:09:26", "remaining_time": "0:10:52", "throughput": 3057.0, "total_tokens": 1731008} +{"current_steps": 3525, "total_steps": 7577, "loss": 0.1567, "lr": 1.2921440630649257e-06, "epoch": 0.4652237033126567, "percentage": 46.52, "elapsed_time": "0:09:26", "remaining_time": "0:10:51", "throughput": 3060.19, "total_tokens": 1733696} +{"current_steps": 3530, "total_steps": 7577, "loss": 0.0778, "lr": 1.2899402259987355e-06, "epoch": 0.46588359509040517, "percentage": 46.59, "elapsed_time": "0:09:26", "remaining_time": "0:10:49", "throughput": 3063.18, "total_tokens": 1736256} +{"current_steps": 3535, "total_steps": 7577, "loss": 0.2758, "lr": 1.287734850400118e-06, "epoch": 0.4665434868681536, "percentage": 46.65, "elapsed_time": "0:09:27", "remaining_time": "0:10:48", "throughput": 3066.37, "total_tokens": 1738944} +{"current_steps": 3540, "total_steps": 7577, "loss": 0.1846, "lr": 1.2855279479716297e-06, "epoch": 0.46720337864590206, "percentage": 46.72, "elapsed_time": "0:09:27", "remaining_time": "0:10:47", "throughput": 3069.45, "total_tokens": 1741568} +{"current_steps": 3545, "total_steps": 7577, "loss": 0.0017, "lr": 1.283319530423929e-06, "epoch": 0.4678632704236505, "percentage": 46.79, "elapsed_time": "0:09:27", "remaining_time": "0:10:45", "throughput": 3071.86, "total_tokens": 1743808} +{"current_steps": 3550, "total_steps": 7577, "loss": 0.0026, "lr": 1.2811096094757144e-06, "epoch": 0.46852316220139895, "percentage": 46.85, "elapsed_time": "0:09:27", "remaining_time": "0:10:44", "throughput": 3074.51, "total_tokens": 1746176} +{"current_steps": 3555, "total_steps": 7577, "loss": 0.1779, "lr": 1.2788981968536612e-06, "epoch": 0.46918305397914745, "percentage": 46.92, "elapsed_time": "0:09:28", "remaining_time": "0:10:42", "throughput": 3077.24, "total_tokens": 1748608} +{"current_steps": 3560, "total_steps": 7577, "loss": 0.1046, "lr": 1.2766853042923607e-06, "epoch": 0.4698429457568959, "percentage": 46.98, "elapsed_time": "0:09:28", "remaining_time": "0:10:41", "throughput": 3079.99, "total_tokens": 1751040} +{"current_steps": 3565, "total_steps": 7577, "loss": 0.0626, "lr": 1.2744709435342573e-06, "epoch": 0.47050283753464434, "percentage": 47.05, "elapsed_time": "0:09:28", "remaining_time": "0:10:40", "throughput": 3082.38, "total_tokens": 1753280} +{"current_steps": 3570, "total_steps": 7577, "loss": 0.2856, "lr": 1.2722551263295864e-06, "epoch": 0.4711627293123928, "percentage": 47.12, "elapsed_time": "0:09:29", "remaining_time": "0:10:38", "throughput": 3085.13, "total_tokens": 1755712} +{"current_steps": 3575, "total_steps": 7577, "loss": 0.1173, "lr": 1.2700378644363114e-06, "epoch": 0.4718226210901412, "percentage": 47.18, "elapsed_time": "0:09:29", "remaining_time": "0:10:37", "throughput": 3087.51, "total_tokens": 1757952} +{"current_steps": 3580, "total_steps": 7577, "loss": 0.0951, "lr": 1.2678191696200621e-06, "epoch": 0.47248251286788967, "percentage": 47.25, "elapsed_time": "0:09:29", "remaining_time": "0:10:36", "throughput": 3090.22, "total_tokens": 1760384} +{"current_steps": 3585, "total_steps": 7577, "loss": 0.0029, "lr": 1.2655990536540717e-06, "epoch": 0.4731424046456381, "percentage": 47.31, "elapsed_time": "0:09:29", "remaining_time": "0:10:34", "throughput": 3093.14, "total_tokens": 1762944} +{"current_steps": 3590, "total_steps": 7577, "loss": 0.275, "lr": 1.2633775283191144e-06, "epoch": 0.47380229642338656, "percentage": 47.38, "elapsed_time": "0:09:30", "remaining_time": "0:10:33", "throughput": 3096.1, "total_tokens": 1765504} +{"current_steps": 3595, "total_steps": 7577, "loss": 0.0527, "lr": 1.2611546054034436e-06, "epoch": 0.474462188201135, "percentage": 47.45, "elapsed_time": "0:09:30", "remaining_time": "0:10:31", "throughput": 3099.14, "total_tokens": 1768128} +{"current_steps": 3600, "total_steps": 7577, "loss": 0.1554, "lr": 1.2589302967027285e-06, "epoch": 0.47512207997888345, "percentage": 47.51, "elapsed_time": "0:09:30", "remaining_time": "0:10:30", "throughput": 3101.94, "total_tokens": 1770624} +{"current_steps": 3605, "total_steps": 7577, "loss": 0.2221, "lr": 1.2567046140199914e-06, "epoch": 0.4757819717566319, "percentage": 47.58, "elapsed_time": "0:09:31", "remaining_time": "0:10:29", "throughput": 3104.97, "total_tokens": 1773248} +{"current_steps": 3610, "total_steps": 7577, "loss": 0.0267, "lr": 1.2544775691655463e-06, "epoch": 0.47644186353438034, "percentage": 47.64, "elapsed_time": "0:09:31", "remaining_time": "0:10:27", "throughput": 3107.35, "total_tokens": 1775488} +{"current_steps": 3615, "total_steps": 7577, "loss": 0.1329, "lr": 1.2522491739569346e-06, "epoch": 0.4771017553121288, "percentage": 47.71, "elapsed_time": "0:09:31", "remaining_time": "0:10:26", "throughput": 3109.82, "total_tokens": 1777792} +{"current_steps": 3620, "total_steps": 7577, "loss": 0.0942, "lr": 1.250019440218864e-06, "epoch": 0.4777616470898773, "percentage": 47.78, "elapsed_time": "0:09:31", "remaining_time": "0:10:25", "throughput": 3112.73, "total_tokens": 1780352} +{"current_steps": 3625, "total_steps": 7577, "loss": 0.1692, "lr": 1.247788379783144e-06, "epoch": 0.47842153886762573, "percentage": 47.84, "elapsed_time": "0:09:32", "remaining_time": "0:10:23", "throughput": 3116.06, "total_tokens": 1783168} +{"current_steps": 3630, "total_steps": 7577, "loss": 0.0503, "lr": 1.2455560044886248e-06, "epoch": 0.4790814306453742, "percentage": 47.91, "elapsed_time": "0:09:32", "remaining_time": "0:10:22", "throughput": 3119.28, "total_tokens": 1785920} +{"current_steps": 3635, "total_steps": 7577, "loss": 0.1104, "lr": 1.2433223261811337e-06, "epoch": 0.4797413224231226, "percentage": 47.97, "elapsed_time": "0:09:32", "remaining_time": "0:10:21", "throughput": 3122.08, "total_tokens": 1788416} +{"current_steps": 3640, "total_steps": 7577, "loss": 0.0317, "lr": 1.2410873567134115e-06, "epoch": 0.48040121420087106, "percentage": 48.04, "elapsed_time": "0:09:33", "remaining_time": "0:10:19", "throughput": 3124.76, "total_tokens": 1790848} +{"current_steps": 3645, "total_steps": 7577, "loss": 0.0394, "lr": 1.238851107945051e-06, "epoch": 0.4810611059786195, "percentage": 48.11, "elapsed_time": "0:09:33", "remaining_time": "0:10:18", "throughput": 3127.45, "total_tokens": 1793280} +{"current_steps": 3650, "total_steps": 7577, "loss": 0.1043, "lr": 1.2366135917424341e-06, "epoch": 0.48172099775636795, "percentage": 48.17, "elapsed_time": "0:09:33", "remaining_time": "0:10:17", "throughput": 3130.01, "total_tokens": 1795648} +{"current_steps": 3655, "total_steps": 7577, "loss": 0.183, "lr": 1.2343748199786665e-06, "epoch": 0.4823808895341164, "percentage": 48.24, "elapsed_time": "0:09:33", "remaining_time": "0:10:15", "throughput": 3132.48, "total_tokens": 1797952} +{"current_steps": 3660, "total_steps": 7577, "loss": 0.0865, "lr": 1.2321348045335182e-06, "epoch": 0.48304078131186484, "percentage": 48.3, "elapsed_time": "0:09:34", "remaining_time": "0:10:14", "throughput": 3134.83, "total_tokens": 1800192} +{"current_steps": 3665, "total_steps": 7577, "loss": 0.1479, "lr": 1.2298935572933575e-06, "epoch": 0.4837006730896133, "percentage": 48.37, "elapsed_time": "0:09:34", "remaining_time": "0:10:13", "throughput": 3137.4, "total_tokens": 1802560} +{"current_steps": 3670, "total_steps": 7577, "loss": 0.1646, "lr": 1.2276510901510892e-06, "epoch": 0.48436056486736173, "percentage": 48.44, "elapsed_time": "0:09:34", "remaining_time": "0:10:11", "throughput": 3140.18, "total_tokens": 1805056} +{"current_steps": 3675, "total_steps": 7577, "loss": 0.1443, "lr": 1.2254074150060915e-06, "epoch": 0.4850204566451102, "percentage": 48.5, "elapsed_time": "0:09:35", "remaining_time": "0:10:10", "throughput": 3143.29, "total_tokens": 1807744} +{"current_steps": 3680, "total_steps": 7577, "loss": 0.0999, "lr": 1.2231625437641535e-06, "epoch": 0.4856803484228587, "percentage": 48.57, "elapsed_time": "0:09:35", "remaining_time": "0:10:09", "throughput": 3146.29, "total_tokens": 1810368} +{"current_steps": 3685, "total_steps": 7577, "loss": 0.0791, "lr": 1.2209164883374096e-06, "epoch": 0.4863402402006071, "percentage": 48.63, "elapsed_time": "0:09:35", "remaining_time": "0:10:08", "throughput": 3149.39, "total_tokens": 1813056} +{"current_steps": 3690, "total_steps": 7577, "loss": 0.2265, "lr": 1.2186692606442793e-06, "epoch": 0.48700013197835557, "percentage": 48.7, "elapsed_time": "0:09:35", "remaining_time": "0:10:06", "throughput": 3151.83, "total_tokens": 1815360} +{"current_steps": 3695, "total_steps": 7577, "loss": 0.1958, "lr": 1.216420872609402e-06, "epoch": 0.487660023756104, "percentage": 48.77, "elapsed_time": "0:09:36", "remaining_time": "0:10:05", "throughput": 3154.69, "total_tokens": 1817920} +{"current_steps": 3700, "total_steps": 7577, "loss": 0.0936, "lr": 1.2141713361635739e-06, "epoch": 0.48831991553385246, "percentage": 48.83, "elapsed_time": "0:09:36", "remaining_time": "0:10:04", "throughput": 3157.23, "total_tokens": 1820288} +{"current_steps": 3705, "total_steps": 7577, "loss": 0.157, "lr": 1.2119206632436864e-06, "epoch": 0.4889798073116009, "percentage": 48.9, "elapsed_time": "0:09:36", "remaining_time": "0:10:02", "throughput": 3159.79, "total_tokens": 1822656} +{"current_steps": 3710, "total_steps": 7577, "loss": 0.116, "lr": 1.209668865792661e-06, "epoch": 0.48963969908934935, "percentage": 48.96, "elapsed_time": "0:09:37", "remaining_time": "0:10:01", "throughput": 3162.02, "total_tokens": 1824832} +{"current_steps": 3715, "total_steps": 7577, "loss": 0.0906, "lr": 1.207415955759385e-06, "epoch": 0.4902995908670978, "percentage": 49.03, "elapsed_time": "0:09:37", "remaining_time": "0:10:00", "throughput": 3164.55, "total_tokens": 1827200} +{"current_steps": 3720, "total_steps": 7577, "loss": 0.1443, "lr": 1.2051619450986514e-06, "epoch": 0.49095948264484623, "percentage": 49.1, "elapsed_time": "0:09:37", "remaining_time": "0:09:58", "throughput": 3167.21, "total_tokens": 1829632} +{"current_steps": 3725, "total_steps": 7577, "loss": 0.076, "lr": 1.2029068457710923e-06, "epoch": 0.4916193744225947, "percentage": 49.16, "elapsed_time": "0:09:37", "remaining_time": "0:09:57", "throughput": 3170.07, "total_tokens": 1832192} +{"current_steps": 3730, "total_steps": 7577, "loss": 0.1089, "lr": 1.200650669743117e-06, "epoch": 0.4922792662003431, "percentage": 49.23, "elapsed_time": "0:09:38", "remaining_time": "0:09:56", "throughput": 3172.93, "total_tokens": 1834752} +{"current_steps": 3735, "total_steps": 7577, "loss": 0.0533, "lr": 1.1983934289868488e-06, "epoch": 0.49293915797809157, "percentage": 49.29, "elapsed_time": "0:09:38", "remaining_time": "0:09:55", "throughput": 3175.68, "total_tokens": 1837248} +{"current_steps": 3740, "total_steps": 7577, "loss": 0.2063, "lr": 1.1961351354800595e-06, "epoch": 0.49359904975584007, "percentage": 49.36, "elapsed_time": "0:09:38", "remaining_time": "0:09:53", "throughput": 3178.31, "total_tokens": 1839680} +{"current_steps": 3745, "total_steps": 7577, "loss": 0.1478, "lr": 1.193875801206109e-06, "epoch": 0.4942589415335885, "percentage": 49.43, "elapsed_time": "0:09:39", "remaining_time": "0:09:52", "throughput": 3181.27, "total_tokens": 1842304} +{"current_steps": 3750, "total_steps": 7577, "loss": 0.0398, "lr": 1.1916154381538786e-06, "epoch": 0.49491883331133696, "percentage": 49.49, "elapsed_time": "0:09:39", "remaining_time": "0:09:51", "throughput": 3183.47, "total_tokens": 1844480} +{"current_steps": 3755, "total_steps": 7577, "loss": 0.1799, "lr": 1.1893540583177083e-06, "epoch": 0.4955787250890854, "percentage": 49.56, "elapsed_time": "0:09:39", "remaining_time": "0:09:50", "throughput": 3186.09, "total_tokens": 1846912} +{"current_steps": 3760, "total_steps": 7577, "loss": 0.0861, "lr": 1.187091673697335e-06, "epoch": 0.49623861686683385, "percentage": 49.62, "elapsed_time": "0:09:39", "remaining_time": "0:09:48", "throughput": 3188.18, "total_tokens": 1849024} +{"current_steps": 3765, "total_steps": 7577, "loss": 0.0693, "lr": 1.184828296297826e-06, "epoch": 0.4968985086445823, "percentage": 49.69, "elapsed_time": "0:09:40", "remaining_time": "0:09:47", "throughput": 3191.24, "total_tokens": 1851712} +{"current_steps": 3770, "total_steps": 7577, "loss": 0.074, "lr": 1.182563938129518e-06, "epoch": 0.49755840042233074, "percentage": 49.76, "elapsed_time": "0:09:40", "remaining_time": "0:09:46", "throughput": 3193.97, "total_tokens": 1854208} +{"current_steps": 3775, "total_steps": 7577, "loss": 0.0972, "lr": 1.1802986112079507e-06, "epoch": 0.4982182922000792, "percentage": 49.82, "elapsed_time": "0:09:40", "remaining_time": "0:09:44", "throughput": 3196.71, "total_tokens": 1856704} +{"current_steps": 3780, "total_steps": 7577, "loss": 0.0812, "lr": 1.1780323275538056e-06, "epoch": 0.4988781839778276, "percentage": 49.89, "elapsed_time": "0:09:41", "remaining_time": "0:09:43", "throughput": 3198.99, "total_tokens": 1858944} +{"current_steps": 3785, "total_steps": 7577, "loss": 0.2014, "lr": 1.1757650991928393e-06, "epoch": 0.49953807575557607, "percentage": 49.95, "elapsed_time": "0:09:41", "remaining_time": "0:09:42", "throughput": 3202.14, "total_tokens": 1861696} +{"current_steps": 3790, "total_steps": 7577, "loss": 0.3465, "lr": 1.1734969381558235e-06, "epoch": 0.5001979675333246, "percentage": 50.02, "elapsed_time": "0:09:41", "remaining_time": "0:09:41", "throughput": 3204.75, "total_tokens": 1864128} +{"current_steps": 3790, "total_steps": 7577, "eval_loss": 0.13253989815711975, "epoch": 0.5001979675333246, "percentage": 50.02, "elapsed_time": "0:09:49", "remaining_time": "0:09:48", "throughput": 3163.09, "total_tokens": 1864128} +{"current_steps": 3795, "total_steps": 7577, "loss": 0.0012, "lr": 1.1712278564784774e-06, "epoch": 0.500857859311073, "percentage": 50.09, "elapsed_time": "0:10:07", "remaining_time": "0:10:05", "throughput": 3071.73, "total_tokens": 1866432} +{"current_steps": 3800, "total_steps": 7577, "loss": 0.071, "lr": 1.1689578662014064e-06, "epoch": 0.5015177510888215, "percentage": 50.15, "elapsed_time": "0:10:07", "remaining_time": "0:10:04", "throughput": 3074.02, "total_tokens": 1868736} +{"current_steps": 3805, "total_steps": 7577, "loss": 0.2416, "lr": 1.1666869793700362e-06, "epoch": 0.5021776428665699, "percentage": 50.22, "elapsed_time": "0:10:08", "remaining_time": "0:10:02", "throughput": 3076.85, "total_tokens": 1871360} +{"current_steps": 3810, "total_steps": 7577, "loss": 0.0019, "lr": 1.1644152080345515e-06, "epoch": 0.5028375346443184, "percentage": 50.28, "elapsed_time": "0:10:08", "remaining_time": "0:10:01", "throughput": 3078.93, "total_tokens": 1873536} +{"current_steps": 3815, "total_steps": 7577, "loss": 0.2788, "lr": 1.1621425642498289e-06, "epoch": 0.5034974264220667, "percentage": 50.35, "elapsed_time": "0:10:08", "remaining_time": "0:10:00", "throughput": 3081.33, "total_tokens": 1875904} +{"current_steps": 3820, "total_steps": 7577, "loss": 0.2056, "lr": 1.1598690600753759e-06, "epoch": 0.5041573181998152, "percentage": 50.42, "elapsed_time": "0:10:09", "remaining_time": "0:09:59", "throughput": 3084.02, "total_tokens": 1878464} +{"current_steps": 3825, "total_steps": 7577, "loss": 0.2253, "lr": 1.1575947075752644e-06, "epoch": 0.5048172099775636, "percentage": 50.48, "elapsed_time": "0:10:09", "remaining_time": "0:09:57", "throughput": 3086.1, "total_tokens": 1880640} +{"current_steps": 3830, "total_steps": 7577, "loss": 0.1243, "lr": 1.1553195188180691e-06, "epoch": 0.5054771017553121, "percentage": 50.55, "elapsed_time": "0:10:09", "remaining_time": "0:09:56", "throughput": 3088.36, "total_tokens": 1882944} +{"current_steps": 3835, "total_steps": 7577, "loss": 0.0629, "lr": 1.1530435058768008e-06, "epoch": 0.5061369935330606, "percentage": 50.61, "elapsed_time": "0:10:09", "remaining_time": "0:09:55", "throughput": 3090.64, "total_tokens": 1885248} +{"current_steps": 3840, "total_steps": 7577, "loss": 0.0576, "lr": 1.150766680828845e-06, "epoch": 0.506796885310809, "percentage": 50.68, "elapsed_time": "0:10:10", "remaining_time": "0:09:53", "throughput": 3093.44, "total_tokens": 1887872} +{"current_steps": 3845, "total_steps": 7577, "loss": 0.004, "lr": 1.1484890557558955e-06, "epoch": 0.5074567770885575, "percentage": 50.75, "elapsed_time": "0:10:10", "remaining_time": "0:09:52", "throughput": 3096.33, "total_tokens": 1890560} +{"current_steps": 3850, "total_steps": 7577, "loss": 0.0781, "lr": 1.146210642743892e-06, "epoch": 0.5081166688663059, "percentage": 50.81, "elapsed_time": "0:10:10", "remaining_time": "0:09:51", "throughput": 3098.89, "total_tokens": 1893056} +{"current_steps": 3855, "total_steps": 7577, "loss": 0.0498, "lr": 1.1439314538829554e-06, "epoch": 0.5087765606440544, "percentage": 50.88, "elapsed_time": "0:10:11", "remaining_time": "0:09:50", "throughput": 3101.15, "total_tokens": 1895360} +{"current_steps": 3860, "total_steps": 7577, "loss": 0.0617, "lr": 1.141651501267323e-06, "epoch": 0.5094364524218028, "percentage": 50.94, "elapsed_time": "0:10:11", "remaining_time": "0:09:48", "throughput": 3103.42, "total_tokens": 1897664} +{"current_steps": 3865, "total_steps": 7577, "loss": 0.1711, "lr": 1.1393707969952847e-06, "epoch": 0.5100963441995513, "percentage": 51.01, "elapsed_time": "0:10:11", "remaining_time": "0:09:47", "throughput": 3106.21, "total_tokens": 1900288} +{"current_steps": 3870, "total_steps": 7577, "loss": 0.1191, "lr": 1.13708935316912e-06, "epoch": 0.5107562359772997, "percentage": 51.08, "elapsed_time": "0:10:12", "remaining_time": "0:09:46", "throughput": 3109.21, "total_tokens": 1903040} +{"current_steps": 3875, "total_steps": 7577, "loss": 0.0025, "lr": 1.134807181895032e-06, "epoch": 0.5114161277550482, "percentage": 51.14, "elapsed_time": "0:10:12", "remaining_time": "0:09:45", "throughput": 3111.71, "total_tokens": 1905472} +{"current_steps": 3880, "total_steps": 7577, "loss": 0.1253, "lr": 1.132524295283084e-06, "epoch": 0.5120760195327966, "percentage": 51.21, "elapsed_time": "0:10:12", "remaining_time": "0:09:43", "throughput": 3113.89, "total_tokens": 1907712} +{"current_steps": 3885, "total_steps": 7577, "loss": 0.0096, "lr": 1.1302407054471355e-06, "epoch": 0.5127359113105451, "percentage": 51.27, "elapsed_time": "0:10:12", "remaining_time": "0:09:42", "throughput": 3116.28, "total_tokens": 1910080} +{"current_steps": 3890, "total_steps": 7577, "loss": 0.2717, "lr": 1.1279564245047767e-06, "epoch": 0.5133958030882935, "percentage": 51.34, "elapsed_time": "0:10:13", "remaining_time": "0:09:41", "throughput": 3118.78, "total_tokens": 1912512} +{"current_steps": 3895, "total_steps": 7577, "loss": 0.0696, "lr": 1.1256714645772662e-06, "epoch": 0.514055694866042, "percentage": 51.41, "elapsed_time": "0:10:13", "remaining_time": "0:09:39", "throughput": 3120.97, "total_tokens": 1914752} +{"current_steps": 3900, "total_steps": 7577, "loss": 0.0073, "lr": 1.1233858377894647e-06, "epoch": 0.5147155866437905, "percentage": 51.47, "elapsed_time": "0:10:13", "remaining_time": "0:09:38", "throughput": 3123.35, "total_tokens": 1917120} +{"current_steps": 3905, "total_steps": 7577, "loss": 0.0094, "lr": 1.1210995562697722e-06, "epoch": 0.5153754784215389, "percentage": 51.54, "elapsed_time": "0:10:14", "remaining_time": "0:09:37", "throughput": 3125.33, "total_tokens": 1919232} +{"current_steps": 3910, "total_steps": 7577, "loss": 0.0061, "lr": 1.1188126321500621e-06, "epoch": 0.5160353701992874, "percentage": 51.6, "elapsed_time": "0:10:14", "remaining_time": "0:09:36", "throughput": 3128.12, "total_tokens": 1921856} +{"current_steps": 3915, "total_steps": 7577, "loss": 0.1091, "lr": 1.1165250775656188e-06, "epoch": 0.5166952619770357, "percentage": 51.67, "elapsed_time": "0:10:14", "remaining_time": "0:09:34", "throughput": 3130.49, "total_tokens": 1924224} +{"current_steps": 3920, "total_steps": 7577, "loss": 0.0258, "lr": 1.1142369046550708e-06, "epoch": 0.5173551537547842, "percentage": 51.74, "elapsed_time": "0:10:14", "remaining_time": "0:09:33", "throughput": 3132.65, "total_tokens": 1926464} +{"current_steps": 3925, "total_steps": 7577, "loss": 0.253, "lr": 1.1119481255603289e-06, "epoch": 0.5180150455325326, "percentage": 51.8, "elapsed_time": "0:10:15", "remaining_time": "0:09:32", "throughput": 3135.15, "total_tokens": 1928896} +{"current_steps": 3930, "total_steps": 7577, "loss": 0.0598, "lr": 1.1096587524265197e-06, "epoch": 0.5186749373102811, "percentage": 51.87, "elapsed_time": "0:10:15", "remaining_time": "0:09:31", "throughput": 3137.43, "total_tokens": 1931200} +{"current_steps": 3935, "total_steps": 7577, "loss": 0.1918, "lr": 1.107368797401923e-06, "epoch": 0.5193348290880295, "percentage": 51.93, "elapsed_time": "0:10:15", "remaining_time": "0:09:29", "throughput": 3139.91, "total_tokens": 1933632} +{"current_steps": 3940, "total_steps": 7577, "loss": 0.0022, "lr": 1.1050782726379054e-06, "epoch": 0.519994720865778, "percentage": 52.0, "elapsed_time": "0:10:16", "remaining_time": "0:09:28", "throughput": 3142.06, "total_tokens": 1935872} +{"current_steps": 3945, "total_steps": 7577, "loss": 0.104, "lr": 1.1027871902888566e-06, "epoch": 0.5206546126435264, "percentage": 52.07, "elapsed_time": "0:10:16", "remaining_time": "0:09:27", "throughput": 3144.13, "total_tokens": 1938048} +{"current_steps": 3950, "total_steps": 7577, "loss": 0.059, "lr": 1.1004955625121257e-06, "epoch": 0.5213145044212749, "percentage": 52.13, "elapsed_time": "0:10:16", "remaining_time": "0:09:26", "throughput": 3146.81, "total_tokens": 1940608} +{"current_steps": 3955, "total_steps": 7577, "loss": 0.2127, "lr": 1.0982034014679561e-06, "epoch": 0.5219743961990233, "percentage": 52.2, "elapsed_time": "0:10:16", "remaining_time": "0:09:25", "throughput": 3149.27, "total_tokens": 1943040} +{"current_steps": 3960, "total_steps": 7577, "loss": 0.279, "lr": 1.0959107193194206e-06, "epoch": 0.5226342879767718, "percentage": 52.26, "elapsed_time": "0:10:17", "remaining_time": "0:09:23", "throughput": 3152.04, "total_tokens": 1945664} +{"current_steps": 3965, "total_steps": 7577, "loss": 0.0022, "lr": 1.0936175282323575e-06, "epoch": 0.5232941797545203, "percentage": 52.33, "elapsed_time": "0:10:17", "remaining_time": "0:09:22", "throughput": 3154.41, "total_tokens": 1948032} +{"current_steps": 3970, "total_steps": 7577, "loss": 0.0235, "lr": 1.091323840375305e-06, "epoch": 0.5239540715322687, "percentage": 52.4, "elapsed_time": "0:10:17", "remaining_time": "0:09:21", "throughput": 3156.45, "total_tokens": 1950208} +{"current_steps": 3975, "total_steps": 7577, "loss": 0.2217, "lr": 1.0890296679194378e-06, "epoch": 0.5246139633100172, "percentage": 52.46, "elapsed_time": "0:10:18", "remaining_time": "0:09:20", "throughput": 3159.31, "total_tokens": 1952896} +{"current_steps": 3980, "total_steps": 7577, "loss": 0.0466, "lr": 1.086735023038502e-06, "epoch": 0.5252738550877656, "percentage": 52.53, "elapsed_time": "0:10:18", "remaining_time": "0:09:18", "throughput": 3161.55, "total_tokens": 1955200} +{"current_steps": 3985, "total_steps": 7577, "loss": 0.0765, "lr": 1.0844399179087512e-06, "epoch": 0.5259337468655141, "percentage": 52.59, "elapsed_time": "0:10:18", "remaining_time": "0:09:17", "throughput": 3163.6, "total_tokens": 1957376} +{"current_steps": 3990, "total_steps": 7577, "loss": 0.2646, "lr": 1.0821443647088802e-06, "epoch": 0.5265936386432625, "percentage": 52.66, "elapsed_time": "0:10:19", "remaining_time": "0:09:16", "throughput": 3166.46, "total_tokens": 1960064} +{"current_steps": 3995, "total_steps": 7577, "loss": 0.1166, "lr": 1.0798483756199623e-06, "epoch": 0.527253530421011, "percentage": 52.73, "elapsed_time": "0:10:19", "remaining_time": "0:09:15", "throughput": 3169.12, "total_tokens": 1962624} +{"current_steps": 4000, "total_steps": 7577, "loss": 0.0901, "lr": 1.0775519628253833e-06, "epoch": 0.5279134221987594, "percentage": 52.79, "elapsed_time": "0:10:19", "remaining_time": "0:09:14", "throughput": 3171.57, "total_tokens": 1965056} +{"current_steps": 4005, "total_steps": 7577, "loss": 0.1363, "lr": 1.0752551385107772e-06, "epoch": 0.5285733139765079, "percentage": 52.86, "elapsed_time": "0:10:19", "remaining_time": "0:09:12", "throughput": 3173.91, "total_tokens": 1967424} +{"current_steps": 4010, "total_steps": 7577, "loss": 0.1608, "lr": 1.0729579148639621e-06, "epoch": 0.5292332057542563, "percentage": 52.92, "elapsed_time": "0:10:20", "remaining_time": "0:09:11", "throughput": 3176.34, "total_tokens": 1969856} +{"current_steps": 4015, "total_steps": 7577, "loss": 0.0527, "lr": 1.0706603040748747e-06, "epoch": 0.5298930975320048, "percentage": 52.99, "elapsed_time": "0:10:20", "remaining_time": "0:09:10", "throughput": 3179.18, "total_tokens": 1972544} +{"current_steps": 4020, "total_steps": 7577, "loss": 0.0851, "lr": 1.0683623183355071e-06, "epoch": 0.5305529893097533, "percentage": 53.06, "elapsed_time": "0:10:20", "remaining_time": "0:09:09", "throughput": 3181.51, "total_tokens": 1974912} +{"current_steps": 4025, "total_steps": 7577, "loss": 0.0918, "lr": 1.0660639698398392e-06, "epoch": 0.5312128810875016, "percentage": 53.12, "elapsed_time": "0:10:21", "remaining_time": "0:09:08", "throughput": 3183.74, "total_tokens": 1977216} +{"current_steps": 4030, "total_steps": 7577, "loss": 0.069, "lr": 1.0637652707837773e-06, "epoch": 0.5318727728652501, "percentage": 53.19, "elapsed_time": "0:10:21", "remaining_time": "0:09:06", "throughput": 3186.17, "total_tokens": 1979648} +{"current_steps": 4035, "total_steps": 7577, "loss": 0.0788, "lr": 1.0614662333650876e-06, "epoch": 0.5325326646429985, "percentage": 53.25, "elapsed_time": "0:10:21", "remaining_time": "0:09:05", "throughput": 3188.31, "total_tokens": 1981888} +{"current_steps": 4040, "total_steps": 7577, "loss": 0.199, "lr": 1.0591668697833311e-06, "epoch": 0.533192556420747, "percentage": 53.32, "elapsed_time": "0:10:21", "remaining_time": "0:09:04", "throughput": 3190.94, "total_tokens": 1984448} +{"current_steps": 4045, "total_steps": 7577, "loss": 0.1948, "lr": 1.0568671922398005e-06, "epoch": 0.5338524481984954, "percentage": 53.39, "elapsed_time": "0:10:22", "remaining_time": "0:09:03", "throughput": 3193.66, "total_tokens": 1987072} +{"current_steps": 4050, "total_steps": 7577, "loss": 0.1732, "lr": 1.054567212937454e-06, "epoch": 0.5345123399762439, "percentage": 53.45, "elapsed_time": "0:10:22", "remaining_time": "0:09:02", "throughput": 3196.29, "total_tokens": 1989632} +{"current_steps": 4055, "total_steps": 7577, "loss": 0.0482, "lr": 1.0522669440808508e-06, "epoch": 0.5351722317539923, "percentage": 53.52, "elapsed_time": "0:10:22", "remaining_time": "0:09:00", "throughput": 3198.92, "total_tokens": 1992192} +{"current_steps": 4060, "total_steps": 7577, "loss": 0.2351, "lr": 1.0499663978760871e-06, "epoch": 0.5358321235317408, "percentage": 53.58, "elapsed_time": "0:10:23", "remaining_time": "0:08:59", "throughput": 3201.34, "total_tokens": 1994624} +{"current_steps": 4065, "total_steps": 7577, "loss": 0.0567, "lr": 1.0476655865307308e-06, "epoch": 0.5364920153094892, "percentage": 53.65, "elapsed_time": "0:10:23", "remaining_time": "0:08:58", "throughput": 3203.76, "total_tokens": 1997056} +{"current_steps": 4070, "total_steps": 7577, "loss": 0.0665, "lr": 1.0453645222537556e-06, "epoch": 0.5371519070872377, "percentage": 53.72, "elapsed_time": "0:10:23", "remaining_time": "0:08:57", "throughput": 3205.98, "total_tokens": 1999360} +{"current_steps": 4075, "total_steps": 7577, "loss": 0.0719, "lr": 1.0430632172554796e-06, "epoch": 0.5378117988649861, "percentage": 53.78, "elapsed_time": "0:10:23", "remaining_time": "0:08:56", "throughput": 3208.5, "total_tokens": 2001856} +{"current_steps": 4080, "total_steps": 7577, "loss": 0.1029, "lr": 1.0407616837474963e-06, "epoch": 0.5384716906427346, "percentage": 53.85, "elapsed_time": "0:10:24", "remaining_time": "0:08:55", "throughput": 3210.91, "total_tokens": 2004288} +{"current_steps": 4085, "total_steps": 7577, "loss": 0.0145, "lr": 1.038459933942612e-06, "epoch": 0.5391315824204831, "percentage": 53.91, "elapsed_time": "0:10:24", "remaining_time": "0:08:53", "throughput": 3213.73, "total_tokens": 2006976} +{"current_steps": 4090, "total_steps": 7577, "loss": 0.0129, "lr": 1.036157980054782e-06, "epoch": 0.5397914741982315, "percentage": 53.98, "elapsed_time": "0:10:24", "remaining_time": "0:08:52", "throughput": 3215.94, "total_tokens": 2009280} +{"current_steps": 4095, "total_steps": 7577, "loss": 0.0985, "lr": 1.0338558342990431e-06, "epoch": 0.54045136597598, "percentage": 54.05, "elapsed_time": "0:10:25", "remaining_time": "0:08:51", "throughput": 3218.45, "total_tokens": 2011776} +{"current_steps": 4100, "total_steps": 7577, "loss": 0.2285, "lr": 1.0315535088914508e-06, "epoch": 0.5411112577537284, "percentage": 54.11, "elapsed_time": "0:10:25", "remaining_time": "0:08:50", "throughput": 3221.05, "total_tokens": 2014336} +{"current_steps": 4105, "total_steps": 7577, "loss": 0.1558, "lr": 1.0292510160490146e-06, "epoch": 0.5417711495314769, "percentage": 54.18, "elapsed_time": "0:10:25", "remaining_time": "0:08:49", "throughput": 3224.04, "total_tokens": 2017152} +{"current_steps": 4110, "total_steps": 7577, "loss": 0.1097, "lr": 1.0269483679896308e-06, "epoch": 0.5424310413092253, "percentage": 54.24, "elapsed_time": "0:10:25", "remaining_time": "0:08:48", "throughput": 3226.34, "total_tokens": 2019520} +{"current_steps": 4115, "total_steps": 7577, "loss": 0.164, "lr": 1.0246455769320211e-06, "epoch": 0.5430909330869738, "percentage": 54.31, "elapsed_time": "0:10:26", "remaining_time": "0:08:46", "throughput": 3228.14, "total_tokens": 2021632} +{"current_steps": 4120, "total_steps": 7577, "loss": 0.1157, "lr": 1.0223426550956647e-06, "epoch": 0.5437508248647221, "percentage": 54.38, "elapsed_time": "0:10:26", "remaining_time": "0:08:45", "throughput": 3230.02, "total_tokens": 2023744} +{"current_steps": 4125, "total_steps": 7577, "loss": 0.06, "lr": 1.0200396147007354e-06, "epoch": 0.5444107166424706, "percentage": 54.44, "elapsed_time": "0:10:26", "remaining_time": "0:08:44", "throughput": 3232.21, "total_tokens": 2026048} +{"current_steps": 4130, "total_steps": 7577, "loss": 0.1203, "lr": 1.0177364679680367e-06, "epoch": 0.545070608420219, "percentage": 54.51, "elapsed_time": "0:10:27", "remaining_time": "0:08:43", "throughput": 3234.41, "total_tokens": 2028352} +{"current_steps": 4135, "total_steps": 7577, "loss": 0.0494, "lr": 1.015433227118935e-06, "epoch": 0.5457305001979675, "percentage": 54.57, "elapsed_time": "0:10:27", "remaining_time": "0:08:42", "throughput": 3236.9, "total_tokens": 2030848} +{"current_steps": 4140, "total_steps": 7577, "loss": 0.1369, "lr": 1.0131299043752967e-06, "epoch": 0.5463903919757159, "percentage": 54.64, "elapsed_time": "0:10:27", "remaining_time": "0:08:41", "throughput": 3239.39, "total_tokens": 2033344} +{"current_steps": 4145, "total_steps": 7577, "loss": 0.0777, "lr": 1.0108265119594233e-06, "epoch": 0.5470502837534644, "percentage": 54.71, "elapsed_time": "0:10:27", "remaining_time": "0:08:39", "throughput": 3241.48, "total_tokens": 2035584} +{"current_steps": 4150, "total_steps": 7577, "loss": 0.0407, "lr": 1.0085230620939853e-06, "epoch": 0.5477101755312129, "percentage": 54.77, "elapsed_time": "0:10:28", "remaining_time": "0:08:38", "throughput": 3244.26, "total_tokens": 2038272} +{"current_steps": 4155, "total_steps": 7577, "loss": 0.0956, "lr": 1.0062195670019583e-06, "epoch": 0.5483700673089613, "percentage": 54.84, "elapsed_time": "0:10:28", "remaining_time": "0:08:37", "throughput": 3246.73, "total_tokens": 2040768} +{"current_steps": 4160, "total_steps": 7577, "loss": 0.1461, "lr": 1.0039160389065582e-06, "epoch": 0.5490299590867098, "percentage": 54.9, "elapsed_time": "0:10:28", "remaining_time": "0:08:36", "throughput": 3248.91, "total_tokens": 2043072} +{"current_steps": 4165, "total_steps": 7577, "loss": 0.1538, "lr": 1.0016124900311755e-06, "epoch": 0.5496898508644582, "percentage": 54.97, "elapsed_time": "0:10:29", "remaining_time": "0:08:35", "throughput": 3251.77, "total_tokens": 2045824} +{"current_steps": 4169, "total_steps": 7577, "eval_loss": 0.0976191833615303, "epoch": 0.550217764286657, "percentage": 55.02, "elapsed_time": "0:10:36", "remaining_time": "0:08:40", "throughput": 3214.63, "total_tokens": 2047552} +{"current_steps": 4170, "total_steps": 7577, "loss": 0.233, "lr": 9.99308932599311e-07, "epoch": 0.5503497426422067, "percentage": 55.03, "elapsed_time": "0:11:06", "remaining_time": "0:09:04", "throughput": 3070.57, "total_tokens": 2048064} +{"current_steps": 4175, "total_steps": 7577, "loss": 0.0557, "lr": 9.970053788345112e-07, "epoch": 0.5510096344199551, "percentage": 55.1, "elapsed_time": "0:11:07", "remaining_time": "0:09:03", "throughput": 3072.79, "total_tokens": 2050432} +{"current_steps": 4180, "total_steps": 7577, "loss": 0.0547, "lr": 9.947018409603036e-07, "epoch": 0.5516695261977036, "percentage": 55.17, "elapsed_time": "0:11:07", "remaining_time": "0:09:02", "throughput": 3075.2, "total_tokens": 2052928} +{"current_steps": 4185, "total_steps": 7577, "loss": 0.0658, "lr": 9.923983312001304e-07, "epoch": 0.552329417975452, "percentage": 55.23, "elapsed_time": "0:11:07", "remaining_time": "0:09:01", "throughput": 3077.6, "total_tokens": 2055424} +{"current_steps": 4190, "total_steps": 7577, "loss": 0.1874, "lr": 9.900948617772846e-07, "epoch": 0.5529893097532005, "percentage": 55.3, "elapsed_time": "0:11:08", "remaining_time": "0:09:00", "throughput": 3079.44, "total_tokens": 2057536} +{"current_steps": 4195, "total_steps": 7577, "loss": 0.1518, "lr": 9.877914449148462e-07, "epoch": 0.5536492015309489, "percentage": 55.36, "elapsed_time": "0:11:08", "remaining_time": "0:08:58", "throughput": 3081.57, "total_tokens": 2059840} +{"current_steps": 4200, "total_steps": 7577, "loss": 0.2201, "lr": 9.854880928356157e-07, "epoch": 0.5543090933086974, "percentage": 55.43, "elapsed_time": "0:11:08", "remaining_time": "0:08:57", "throughput": 3084.42, "total_tokens": 2062656} +{"current_steps": 4205, "total_steps": 7577, "loss": 0.22, "lr": 9.831848177620493e-07, "epoch": 0.5549689850864459, "percentage": 55.5, "elapsed_time": "0:11:09", "remaining_time": "0:08:56", "throughput": 3086.55, "total_tokens": 2064960} +{"current_steps": 4210, "total_steps": 7577, "loss": 0.2685, "lr": 9.808816319161961e-07, "epoch": 0.5556288768641943, "percentage": 55.56, "elapsed_time": "0:11:09", "remaining_time": "0:08:55", "throughput": 3088.29, "total_tokens": 2067008} +{"current_steps": 4215, "total_steps": 7577, "loss": 0.0021, "lr": 9.785785475196298e-07, "epoch": 0.5562887686419428, "percentage": 55.63, "elapsed_time": "0:11:09", "remaining_time": "0:08:54", "throughput": 3090.95, "total_tokens": 2069696} +{"current_steps": 4220, "total_steps": 7577, "loss": 0.0054, "lr": 9.76275576793387e-07, "epoch": 0.5569486604196912, "percentage": 55.69, "elapsed_time": "0:11:09", "remaining_time": "0:08:52", "throughput": 3093.51, "total_tokens": 2072320} +{"current_steps": 4225, "total_steps": 7577, "loss": 0.0023, "lr": 9.739727319579007e-07, "epoch": 0.5576085521974397, "percentage": 55.76, "elapsed_time": "0:11:10", "remaining_time": "0:08:51", "throughput": 3095.78, "total_tokens": 2074752} +{"current_steps": 4230, "total_steps": 7577, "loss": 0.0678, "lr": 9.716700252329361e-07, "epoch": 0.558268443975188, "percentage": 55.83, "elapsed_time": "0:11:10", "remaining_time": "0:08:50", "throughput": 3098.43, "total_tokens": 2077440} +{"current_steps": 4235, "total_steps": 7577, "loss": 0.2046, "lr": 9.693674688375254e-07, "epoch": 0.5589283357529365, "percentage": 55.89, "elapsed_time": "0:11:10", "remaining_time": "0:08:49", "throughput": 3100.9, "total_tokens": 2080000} +{"current_steps": 4240, "total_steps": 7577, "loss": 0.1257, "lr": 9.67065074989903e-07, "epoch": 0.5595882275306849, "percentage": 55.96, "elapsed_time": "0:11:11", "remaining_time": "0:08:48", "throughput": 3103.37, "total_tokens": 2082560} +{"current_steps": 4245, "total_steps": 7577, "loss": 0.0827, "lr": 9.647628559074415e-07, "epoch": 0.5602481193084334, "percentage": 56.02, "elapsed_time": "0:11:11", "remaining_time": "0:08:46", "throughput": 3105.47, "total_tokens": 2084864} +{"current_steps": 4250, "total_steps": 7577, "loss": 0.1167, "lr": 9.62460823806585e-07, "epoch": 0.5609080110861818, "percentage": 56.09, "elapsed_time": "0:11:11", "remaining_time": "0:08:45", "throughput": 3107.95, "total_tokens": 2087424} +{"current_steps": 4255, "total_steps": 7577, "loss": 0.2136, "lr": 9.601589909027857e-07, "epoch": 0.5615679028639303, "percentage": 56.16, "elapsed_time": "0:11:11", "remaining_time": "0:08:44", "throughput": 3110.52, "total_tokens": 2090048} +{"current_steps": 4260, "total_steps": 7577, "loss": 0.0795, "lr": 9.578573694104394e-07, "epoch": 0.5622277946416787, "percentage": 56.22, "elapsed_time": "0:11:12", "remaining_time": "0:08:43", "throughput": 3112.72, "total_tokens": 2092416} +{"current_steps": 4265, "total_steps": 7577, "loss": 0.0455, "lr": 9.555559715428199e-07, "epoch": 0.5628876864194272, "percentage": 56.29, "elapsed_time": "0:11:12", "remaining_time": "0:08:42", "throughput": 3114.72, "total_tokens": 2094656} +{"current_steps": 4270, "total_steps": 7577, "loss": 0.0031, "lr": 9.532548095120134e-07, "epoch": 0.5635475781971757, "percentage": 56.35, "elapsed_time": "0:11:12", "remaining_time": "0:08:41", "throughput": 3116.91, "total_tokens": 2097024} +{"current_steps": 4275, "total_steps": 7577, "loss": 0.0884, "lr": 9.509538955288564e-07, "epoch": 0.5642074699749241, "percentage": 56.42, "elapsed_time": "0:11:13", "remaining_time": "0:08:39", "throughput": 3119.1, "total_tokens": 2099392} +{"current_steps": 4280, "total_steps": 7577, "loss": 0.0815, "lr": 9.486532418028672e-07, "epoch": 0.5648673617526726, "percentage": 56.49, "elapsed_time": "0:11:13", "remaining_time": "0:08:38", "throughput": 3121.66, "total_tokens": 2102016} +{"current_steps": 4285, "total_steps": 7577, "loss": 0.117, "lr": 9.463528605421844e-07, "epoch": 0.565527253530421, "percentage": 56.55, "elapsed_time": "0:11:13", "remaining_time": "0:08:37", "throughput": 3123.75, "total_tokens": 2104320} +{"current_steps": 4290, "total_steps": 7577, "loss": 0.0795, "lr": 9.440527639535004e-07, "epoch": 0.5661871453081695, "percentage": 56.62, "elapsed_time": "0:11:13", "remaining_time": "0:08:36", "throughput": 3126.58, "total_tokens": 2107136} +{"current_steps": 4295, "total_steps": 7577, "loss": 0.0935, "lr": 9.417529642419971e-07, "epoch": 0.5668470370859179, "percentage": 56.68, "elapsed_time": "0:11:14", "remaining_time": "0:08:35", "throughput": 3129.31, "total_tokens": 2109888} +{"current_steps": 4300, "total_steps": 7577, "loss": 0.1225, "lr": 9.394534736112815e-07, "epoch": 0.5675069288636664, "percentage": 56.75, "elapsed_time": "0:11:14", "remaining_time": "0:08:34", "throughput": 3131.39, "total_tokens": 2112192} +{"current_steps": 4305, "total_steps": 7577, "loss": 0.1277, "lr": 9.371543042633192e-07, "epoch": 0.5681668206414148, "percentage": 56.82, "elapsed_time": "0:11:14", "remaining_time": "0:08:32", "throughput": 3133.84, "total_tokens": 2114752} +{"current_steps": 4310, "total_steps": 7577, "loss": 0.1616, "lr": 9.348554683983722e-07, "epoch": 0.5688267124191633, "percentage": 56.88, "elapsed_time": "0:11:15", "remaining_time": "0:08:31", "throughput": 3136.1, "total_tokens": 2117184} +{"current_steps": 4315, "total_steps": 7577, "loss": 0.0485, "lr": 9.325569782149323e-07, "epoch": 0.5694866041969117, "percentage": 56.95, "elapsed_time": "0:11:15", "remaining_time": "0:08:30", "throughput": 3138.27, "total_tokens": 2119552} +{"current_steps": 4320, "total_steps": 7577, "loss": 0.0897, "lr": 9.302588459096574e-07, "epoch": 0.5701464959746602, "percentage": 57.01, "elapsed_time": "0:11:15", "remaining_time": "0:08:29", "throughput": 3140.43, "total_tokens": 2121920} +{"current_steps": 4325, "total_steps": 7577, "loss": 0.1948, "lr": 9.279610836773064e-07, "epoch": 0.5708063877524086, "percentage": 57.08, "elapsed_time": "0:11:15", "remaining_time": "0:08:28", "throughput": 3142.31, "total_tokens": 2124096} +{"current_steps": 4330, "total_steps": 7577, "loss": 0.0979, "lr": 9.256637037106735e-07, "epoch": 0.571466279530157, "percentage": 57.15, "elapsed_time": "0:11:16", "remaining_time": "0:08:27", "throughput": 3144.55, "total_tokens": 2126528} +{"current_steps": 4335, "total_steps": 7577, "loss": 0.0585, "lr": 9.233667182005259e-07, "epoch": 0.5721261713079056, "percentage": 57.21, "elapsed_time": "0:11:16", "remaining_time": "0:08:25", "throughput": 3146.25, "total_tokens": 2128576} +{"current_steps": 4340, "total_steps": 7577, "loss": 0.1142, "lr": 9.210701393355361e-07, "epoch": 0.5727860630856539, "percentage": 57.28, "elapsed_time": "0:11:16", "remaining_time": "0:08:24", "throughput": 3148.02, "total_tokens": 2130688} +{"current_steps": 4345, "total_steps": 7577, "loss": 0.1147, "lr": 9.187739793022198e-07, "epoch": 0.5734459548634024, "percentage": 57.34, "elapsed_time": "0:11:17", "remaining_time": "0:08:23", "throughput": 3150.56, "total_tokens": 2133312} +{"current_steps": 4350, "total_steps": 7577, "loss": 0.0315, "lr": 9.164782502848702e-07, "epoch": 0.5741058466411508, "percentage": 57.41, "elapsed_time": "0:11:17", "remaining_time": "0:08:22", "throughput": 3152.71, "total_tokens": 2135680} +{"current_steps": 4355, "total_steps": 7577, "loss": 0.2153, "lr": 9.141829644654936e-07, "epoch": 0.5747657384188993, "percentage": 57.48, "elapsed_time": "0:11:17", "remaining_time": "0:08:21", "throughput": 3154.96, "total_tokens": 2138112} +{"current_steps": 4360, "total_steps": 7577, "loss": 0.3872, "lr": 9.118881340237432e-07, "epoch": 0.5754256301966477, "percentage": 57.54, "elapsed_time": "0:11:17", "remaining_time": "0:08:20", "throughput": 3156.93, "total_tokens": 2140352} +{"current_steps": 4365, "total_steps": 7577, "loss": 0.0637, "lr": 9.095937711368573e-07, "epoch": 0.5760855219743962, "percentage": 57.61, "elapsed_time": "0:11:18", "remaining_time": "0:08:19", "throughput": 3159.53, "total_tokens": 2143040} +{"current_steps": 4370, "total_steps": 7577, "loss": 0.1285, "lr": 9.072998879795923e-07, "epoch": 0.5767454137521446, "percentage": 57.67, "elapsed_time": "0:11:18", "remaining_time": "0:08:17", "throughput": 3161.5, "total_tokens": 2145280} +{"current_steps": 4375, "total_steps": 7577, "loss": 0.0807, "lr": 9.050064967241596e-07, "epoch": 0.5774053055298931, "percentage": 57.74, "elapsed_time": "0:11:18", "remaining_time": "0:08:16", "throughput": 3164.01, "total_tokens": 2147904} +{"current_steps": 4380, "total_steps": 7577, "loss": 0.0728, "lr": 9.027136095401598e-07, "epoch": 0.5780651973076415, "percentage": 57.81, "elapsed_time": "0:11:19", "remaining_time": "0:08:15", "throughput": 3166.33, "total_tokens": 2150400} +{"current_steps": 4385, "total_steps": 7577, "loss": 0.1274, "lr": 9.004212385945187e-07, "epoch": 0.57872508908539, "percentage": 57.87, "elapsed_time": "0:11:19", "remaining_time": "0:08:14", "throughput": 3168.92, "total_tokens": 2153088} +{"current_steps": 4390, "total_steps": 7577, "loss": 0.0495, "lr": 8.981293960514233e-07, "epoch": 0.5793849808631385, "percentage": 57.94, "elapsed_time": "0:11:19", "remaining_time": "0:08:13", "throughput": 3171.53, "total_tokens": 2155776} +{"current_steps": 4395, "total_steps": 7577, "loss": 0.1366, "lr": 8.958380940722564e-07, "epoch": 0.5800448726408869, "percentage": 58.0, "elapsed_time": "0:11:20", "remaining_time": "0:08:12", "throughput": 3174.02, "total_tokens": 2158400} +{"current_steps": 4400, "total_steps": 7577, "loss": 0.0731, "lr": 8.935473448155326e-07, "epoch": 0.5807047644186354, "percentage": 58.07, "elapsed_time": "0:11:20", "remaining_time": "0:08:11", "throughput": 3176.06, "total_tokens": 2160704} +{"current_steps": 4405, "total_steps": 7577, "loss": 0.0423, "lr": 8.912571604368324e-07, "epoch": 0.5813646561963838, "percentage": 58.14, "elapsed_time": "0:11:20", "remaining_time": "0:08:10", "throughput": 3178.39, "total_tokens": 2163200} +{"current_steps": 4410, "total_steps": 7577, "loss": 0.1252, "lr": 8.889675530887404e-07, "epoch": 0.5820245479741323, "percentage": 58.2, "elapsed_time": "0:11:20", "remaining_time": "0:08:08", "throughput": 3180.25, "total_tokens": 2165376} +{"current_steps": 4415, "total_steps": 7577, "loss": 0.131, "lr": 8.866785349207786e-07, "epoch": 0.5826844397518807, "percentage": 58.27, "elapsed_time": "0:11:21", "remaining_time": "0:08:07", "throughput": 3182.47, "total_tokens": 2167808} +{"current_steps": 4420, "total_steps": 7577, "loss": 0.1223, "lr": 8.843901180793423e-07, "epoch": 0.5833443315296292, "percentage": 58.33, "elapsed_time": "0:11:21", "remaining_time": "0:08:06", "throughput": 3184.51, "total_tokens": 2170112} +{"current_steps": 4425, "total_steps": 7577, "loss": 0.001, "lr": 8.821023147076362e-07, "epoch": 0.5840042233073776, "percentage": 58.4, "elapsed_time": "0:11:21", "remaining_time": "0:08:05", "throughput": 3186.64, "total_tokens": 2172480} +{"current_steps": 4430, "total_steps": 7577, "loss": 0.0822, "lr": 8.798151369456098e-07, "epoch": 0.5846641150851261, "percentage": 58.47, "elapsed_time": "0:11:22", "remaining_time": "0:08:04", "throughput": 3189.12, "total_tokens": 2175104} +{"current_steps": 4435, "total_steps": 7577, "loss": 0.0803, "lr": 8.775285969298931e-07, "epoch": 0.5853240068628744, "percentage": 58.53, "elapsed_time": "0:11:22", "remaining_time": "0:08:03", "throughput": 3190.98, "total_tokens": 2177280} +{"current_steps": 4440, "total_steps": 7577, "loss": 0.0628, "lr": 8.752427067937312e-07, "epoch": 0.585983898640623, "percentage": 58.6, "elapsed_time": "0:11:22", "remaining_time": "0:08:02", "throughput": 3193.29, "total_tokens": 2179776} +{"current_steps": 4445, "total_steps": 7577, "loss": 0.0845, "lr": 8.729574786669214e-07, "epoch": 0.5866437904183713, "percentage": 58.66, "elapsed_time": "0:11:22", "remaining_time": "0:08:01", "throughput": 3195.77, "total_tokens": 2182400} +{"current_steps": 4450, "total_steps": 7577, "loss": 0.06, "lr": 8.706729246757477e-07, "epoch": 0.5873036821961198, "percentage": 58.73, "elapsed_time": "0:11:23", "remaining_time": "0:08:00", "throughput": 3198.36, "total_tokens": 2185088} +{"current_steps": 4455, "total_steps": 7577, "loss": 0.0725, "lr": 8.683890569429173e-07, "epoch": 0.5879635739738683, "percentage": 58.8, "elapsed_time": "0:11:23", "remaining_time": "0:07:58", "throughput": 3200.95, "total_tokens": 2187776} +{"current_steps": 4460, "total_steps": 7577, "loss": 0.0027, "lr": 8.661058875874956e-07, "epoch": 0.5886234657516167, "percentage": 58.86, "elapsed_time": "0:11:23", "remaining_time": "0:07:57", "throughput": 3202.87, "total_tokens": 2190016} +{"current_steps": 4465, "total_steps": 7577, "loss": 0.0013, "lr": 8.638234287248423e-07, "epoch": 0.5892833575293652, "percentage": 58.93, "elapsed_time": "0:11:24", "remaining_time": "0:07:56", "throughput": 3204.9, "total_tokens": 2192320} +{"current_steps": 4470, "total_steps": 7577, "loss": 0.0578, "lr": 8.615416924665464e-07, "epoch": 0.5899432493071136, "percentage": 58.99, "elapsed_time": "0:11:24", "remaining_time": "0:07:55", "throughput": 3207.11, "total_tokens": 2194752} +{"current_steps": 4475, "total_steps": 7577, "loss": 0.0962, "lr": 8.592606909203629e-07, "epoch": 0.5906031410848621, "percentage": 59.06, "elapsed_time": "0:11:24", "remaining_time": "0:07:54", "throughput": 3209.14, "total_tokens": 2197056} +{"current_steps": 4480, "total_steps": 7577, "loss": 0.0401, "lr": 8.569804361901485e-07, "epoch": 0.5912630328626105, "percentage": 59.13, "elapsed_time": "0:11:24", "remaining_time": "0:07:53", "throughput": 3211.06, "total_tokens": 2199296} +{"current_steps": 4485, "total_steps": 7577, "loss": 0.4233, "lr": 8.547009403757963e-07, "epoch": 0.591922924640359, "percentage": 59.19, "elapsed_time": "0:11:25", "remaining_time": "0:07:52", "throughput": 3213.17, "total_tokens": 2201664} +{"current_steps": 4490, "total_steps": 7577, "loss": 0.1601, "lr": 8.524222155731731e-07, "epoch": 0.5925828164181074, "percentage": 59.26, "elapsed_time": "0:11:25", "remaining_time": "0:07:51", "throughput": 3215.62, "total_tokens": 2204288} +{"current_steps": 4495, "total_steps": 7577, "loss": 0.1259, "lr": 8.501442738740538e-07, "epoch": 0.5932427081958559, "percentage": 59.32, "elapsed_time": "0:11:25", "remaining_time": "0:07:50", "throughput": 3217.54, "total_tokens": 2206528} +{"current_steps": 4500, "total_steps": 7577, "loss": 0.0636, "lr": 8.47867127366058e-07, "epoch": 0.5939025999736043, "percentage": 59.39, "elapsed_time": "0:11:26", "remaining_time": "0:07:49", "throughput": 3219.82, "total_tokens": 2209024} +{"current_steps": 4505, "total_steps": 7577, "loss": 0.0027, "lr": 8.455907881325858e-07, "epoch": 0.5945624917513528, "percentage": 59.46, "elapsed_time": "0:11:26", "remaining_time": "0:07:48", "throughput": 3222.2, "total_tokens": 2211584} +{"current_steps": 4510, "total_steps": 7577, "loss": 0.1052, "lr": 8.433152682527533e-07, "epoch": 0.5952223835291012, "percentage": 59.52, "elapsed_time": "0:11:26", "remaining_time": "0:07:46", "throughput": 3224.3, "total_tokens": 2213952} +{"current_steps": 4515, "total_steps": 7577, "loss": 0.0747, "lr": 8.410405798013298e-07, "epoch": 0.5958822753068497, "percentage": 59.59, "elapsed_time": "0:11:26", "remaining_time": "0:07:45", "throughput": 3226.21, "total_tokens": 2216192} +{"current_steps": 4520, "total_steps": 7577, "loss": 0.0035, "lr": 8.387667348486712e-07, "epoch": 0.5965421670845982, "percentage": 59.65, "elapsed_time": "0:11:27", "remaining_time": "0:07:44", "throughput": 3228.48, "total_tokens": 2218688} +{"current_steps": 4525, "total_steps": 7577, "loss": 0.1296, "lr": 8.364937454606585e-07, "epoch": 0.5972020588623466, "percentage": 59.72, "elapsed_time": "0:11:27", "remaining_time": "0:07:43", "throughput": 3230.38, "total_tokens": 2220928} +{"current_steps": 4530, "total_steps": 7577, "loss": 0.0014, "lr": 8.342216236986329e-07, "epoch": 0.5978619506400951, "percentage": 59.79, "elapsed_time": "0:11:27", "remaining_time": "0:07:42", "throughput": 3232.55, "total_tokens": 2223360} +{"current_steps": 4535, "total_steps": 7577, "loss": 0.1463, "lr": 8.319503816193305e-07, "epoch": 0.5985218424178435, "percentage": 59.85, "elapsed_time": "0:11:28", "remaining_time": "0:07:41", "throughput": 3234.74, "total_tokens": 2225792} +{"current_steps": 4540, "total_steps": 7577, "loss": 0.1496, "lr": 8.296800312748206e-07, "epoch": 0.599181734195592, "percentage": 59.92, "elapsed_time": "0:11:28", "remaining_time": "0:07:40", "throughput": 3237.01, "total_tokens": 2228288} +{"current_steps": 4545, "total_steps": 7577, "loss": 0.1911, "lr": 8.274105847124404e-07, "epoch": 0.5998416259733403, "percentage": 59.98, "elapsed_time": "0:11:28", "remaining_time": "0:07:39", "throughput": 3239.36, "total_tokens": 2230848} +{"current_steps": 4548, "total_steps": 7577, "eval_loss": 0.11496574431657791, "epoch": 0.6002375610399895, "percentage": 60.02, "elapsed_time": "0:11:36", "remaining_time": "0:07:43", "throughput": 3205.32, "total_tokens": 2232448} +{"current_steps": 4550, "total_steps": 7577, "loss": 0.1187, "lr": 8.251420539747311e-07, "epoch": 0.6005015177510888, "percentage": 60.05, "elapsed_time": "0:12:04", "remaining_time": "0:08:01", "throughput": 3084.83, "total_tokens": 2233472} +{"current_steps": 4555, "total_steps": 7577, "loss": 0.1799, "lr": 8.228744510993742e-07, "epoch": 0.6011614095288372, "percentage": 60.12, "elapsed_time": "0:12:04", "remaining_time": "0:08:00", "throughput": 3087.14, "total_tokens": 2236096} +{"current_steps": 4560, "total_steps": 7577, "loss": 0.0908, "lr": 8.206077881191274e-07, "epoch": 0.6018213013065857, "percentage": 60.18, "elapsed_time": "0:12:04", "remaining_time": "0:07:59", "throughput": 3089.5, "total_tokens": 2238720} +{"current_steps": 4565, "total_steps": 7577, "loss": 0.1394, "lr": 8.183420770617614e-07, "epoch": 0.6024811930843341, "percentage": 60.25, "elapsed_time": "0:12:04", "remaining_time": "0:07:58", "throughput": 3091.7, "total_tokens": 2241216} +{"current_steps": 4570, "total_steps": 7577, "loss": 0.0631, "lr": 8.160773299499955e-07, "epoch": 0.6031410848620826, "percentage": 60.31, "elapsed_time": "0:12:05", "remaining_time": "0:07:57", "throughput": 3093.83, "total_tokens": 2243648} +{"current_steps": 4575, "total_steps": 7577, "loss": 0.0464, "lr": 8.138135588014339e-07, "epoch": 0.6038009766398311, "percentage": 60.38, "elapsed_time": "0:12:05", "remaining_time": "0:07:56", "throughput": 3095.97, "total_tokens": 2246080} +{"current_steps": 4580, "total_steps": 7577, "loss": 0.0632, "lr": 8.115507756285017e-07, "epoch": 0.6044608684175795, "percentage": 60.45, "elapsed_time": "0:12:05", "remaining_time": "0:07:54", "throughput": 3097.75, "total_tokens": 2248256} +{"current_steps": 4585, "total_steps": 7577, "loss": 0.1037, "lr": 8.092889924383819e-07, "epoch": 0.605120760195328, "percentage": 60.51, "elapsed_time": "0:12:06", "remaining_time": "0:07:53", "throughput": 3099.89, "total_tokens": 2250688} +{"current_steps": 4590, "total_steps": 7577, "loss": 0.0775, "lr": 8.070282212329508e-07, "epoch": 0.6057806519730764, "percentage": 60.58, "elapsed_time": "0:12:06", "remaining_time": "0:07:52", "throughput": 3102.02, "total_tokens": 2253120} +{"current_steps": 4595, "total_steps": 7577, "loss": 0.22, "lr": 8.047684740087156e-07, "epoch": 0.6064405437508249, "percentage": 60.64, "elapsed_time": "0:12:06", "remaining_time": "0:07:51", "throughput": 3103.9, "total_tokens": 2255360} +{"current_steps": 4600, "total_steps": 7577, "loss": 0.1834, "lr": 8.025097627567481e-07, "epoch": 0.6071004355285733, "percentage": 60.71, "elapsed_time": "0:12:06", "remaining_time": "0:07:50", "throughput": 3105.93, "total_tokens": 2257728} +{"current_steps": 4605, "total_steps": 7577, "loss": 0.0712, "lr": 8.002520994626247e-07, "epoch": 0.6077603273063218, "percentage": 60.78, "elapsed_time": "0:12:07", "remaining_time": "0:07:49", "throughput": 3108.14, "total_tokens": 2260224} +{"current_steps": 4610, "total_steps": 7577, "loss": 0.0733, "lr": 7.979954961063596e-07, "epoch": 0.6084202190840702, "percentage": 60.84, "elapsed_time": "0:12:07", "remaining_time": "0:07:48", "throughput": 3110.6, "total_tokens": 2262912} +{"current_steps": 4615, "total_steps": 7577, "loss": 0.3433, "lr": 7.957399646623436e-07, "epoch": 0.6090801108618187, "percentage": 60.91, "elapsed_time": "0:12:07", "remaining_time": "0:07:47", "throughput": 3112.47, "total_tokens": 2265152} +{"current_steps": 4620, "total_steps": 7577, "loss": 0.042, "lr": 7.934855170992788e-07, "epoch": 0.6097400026395671, "percentage": 60.97, "elapsed_time": "0:12:08", "remaining_time": "0:07:45", "throughput": 3115.08, "total_tokens": 2267968} +{"current_steps": 4625, "total_steps": 7577, "loss": 0.0468, "lr": 7.912321653801161e-07, "epoch": 0.6103998944173156, "percentage": 61.04, "elapsed_time": "0:12:08", "remaining_time": "0:07:44", "throughput": 3117.12, "total_tokens": 2270336} +{"current_steps": 4630, "total_steps": 7577, "loss": 0.1865, "lr": 7.889799214619919e-07, "epoch": 0.611059786195064, "percentage": 61.11, "elapsed_time": "0:12:08", "remaining_time": "0:07:43", "throughput": 3119.56, "total_tokens": 2273024} +{"current_steps": 4635, "total_steps": 7577, "loss": 0.0821, "lr": 7.867287972961629e-07, "epoch": 0.6117196779728125, "percentage": 61.17, "elapsed_time": "0:12:08", "remaining_time": "0:07:42", "throughput": 3121.41, "total_tokens": 2275264} +{"current_steps": 4640, "total_steps": 7577, "loss": 0.0704, "lr": 7.844788048279453e-07, "epoch": 0.612379569750561, "percentage": 61.24, "elapsed_time": "0:12:09", "remaining_time": "0:07:41", "throughput": 3123.78, "total_tokens": 2277888} +{"current_steps": 4645, "total_steps": 7577, "loss": 0.0007, "lr": 7.822299559966494e-07, "epoch": 0.6130394615283093, "percentage": 61.3, "elapsed_time": "0:12:09", "remaining_time": "0:07:40", "throughput": 3125.9, "total_tokens": 2280320} +{"current_steps": 4650, "total_steps": 7577, "loss": 0.0591, "lr": 7.799822627355171e-07, "epoch": 0.6136993533060578, "percentage": 61.37, "elapsed_time": "0:12:09", "remaining_time": "0:07:39", "throughput": 3127.75, "total_tokens": 2282560} +{"current_steps": 4655, "total_steps": 7577, "loss": 0.0842, "lr": 7.77735736971659e-07, "epoch": 0.6143592450838062, "percentage": 61.44, "elapsed_time": "0:12:10", "remaining_time": "0:07:38", "throughput": 3129.69, "total_tokens": 2284864} +{"current_steps": 4660, "total_steps": 7577, "loss": 0.1524, "lr": 7.754903906259889e-07, "epoch": 0.6150191368615547, "percentage": 61.5, "elapsed_time": "0:12:10", "remaining_time": "0:07:37", "throughput": 3131.63, "total_tokens": 2287168} +{"current_steps": 4665, "total_steps": 7577, "loss": 0.059, "lr": 7.732462356131637e-07, "epoch": 0.6156790286393031, "percentage": 61.57, "elapsed_time": "0:12:10", "remaining_time": "0:07:36", "throughput": 3133.73, "total_tokens": 2289600} +{"current_steps": 4670, "total_steps": 7577, "loss": 0.0896, "lr": 7.710032838415179e-07, "epoch": 0.6163389204170516, "percentage": 61.63, "elapsed_time": "0:12:10", "remaining_time": "0:07:34", "throughput": 3135.99, "total_tokens": 2292160} +{"current_steps": 4675, "total_steps": 7577, "loss": 0.155, "lr": 7.687615472130016e-07, "epoch": 0.6169988121948, "percentage": 61.7, "elapsed_time": "0:12:11", "remaining_time": "0:07:33", "throughput": 3138.52, "total_tokens": 2294912} +{"current_steps": 4680, "total_steps": 7577, "loss": 0.1138, "lr": 7.665210376231165e-07, "epoch": 0.6176587039725485, "percentage": 61.77, "elapsed_time": "0:12:11", "remaining_time": "0:07:32", "throughput": 3140.19, "total_tokens": 2297024} +{"current_steps": 4685, "total_steps": 7577, "loss": 0.1342, "lr": 7.642817669608536e-07, "epoch": 0.6183185957502969, "percentage": 61.83, "elapsed_time": "0:12:11", "remaining_time": "0:07:31", "throughput": 3142.29, "total_tokens": 2299456} +{"current_steps": 4690, "total_steps": 7577, "loss": 0.0194, "lr": 7.62043747108629e-07, "epoch": 0.6189784875280454, "percentage": 61.9, "elapsed_time": "0:12:12", "remaining_time": "0:07:30", "throughput": 3143.95, "total_tokens": 2301568} +{"current_steps": 4695, "total_steps": 7577, "loss": 0.1988, "lr": 7.598069899422221e-07, "epoch": 0.6196383793057938, "percentage": 61.96, "elapsed_time": "0:12:12", "remaining_time": "0:07:29", "throughput": 3145.97, "total_tokens": 2303936} +{"current_steps": 4700, "total_steps": 7577, "loss": 0.2107, "lr": 7.575715073307119e-07, "epoch": 0.6202982710835423, "percentage": 62.03, "elapsed_time": "0:12:12", "remaining_time": "0:07:28", "throughput": 3147.81, "total_tokens": 2306176} +{"current_steps": 4705, "total_steps": 7577, "loss": 0.0995, "lr": 7.55337311136414e-07, "epoch": 0.6209581628612908, "percentage": 62.1, "elapsed_time": "0:12:12", "remaining_time": "0:07:27", "throughput": 3150.07, "total_tokens": 2308736} +{"current_steps": 4710, "total_steps": 7577, "loss": 0.1775, "lr": 7.531044132148183e-07, "epoch": 0.6216180546390392, "percentage": 62.16, "elapsed_time": "0:12:13", "remaining_time": "0:07:26", "throughput": 3152.07, "total_tokens": 2311104} +{"current_steps": 4715, "total_steps": 7577, "loss": 0.0493, "lr": 7.508728254145245e-07, "epoch": 0.6222779464167877, "percentage": 62.23, "elapsed_time": "0:12:13", "remaining_time": "0:07:25", "throughput": 3154.16, "total_tokens": 2313536} +{"current_steps": 4720, "total_steps": 7577, "loss": 0.117, "lr": 7.486425595771817e-07, "epoch": 0.6229378381945361, "percentage": 62.29, "elapsed_time": "0:12:13", "remaining_time": "0:07:24", "throughput": 3156.33, "total_tokens": 2316032} +{"current_steps": 4725, "total_steps": 7577, "loss": 0.1853, "lr": 7.464136275374223e-07, "epoch": 0.6235977299722846, "percentage": 62.36, "elapsed_time": "0:12:14", "remaining_time": "0:07:23", "throughput": 3158.65, "total_tokens": 2318656} +{"current_steps": 4730, "total_steps": 7577, "loss": 0.1311, "lr": 7.441860411228029e-07, "epoch": 0.624257621750033, "percentage": 62.43, "elapsed_time": "0:12:14", "remaining_time": "0:07:22", "throughput": 3160.89, "total_tokens": 2321216} +{"current_steps": 4735, "total_steps": 7577, "loss": 0.1273, "lr": 7.419598121537387e-07, "epoch": 0.6249175135277815, "percentage": 62.49, "elapsed_time": "0:12:14", "remaining_time": "0:07:20", "throughput": 3162.96, "total_tokens": 2323648} +{"current_steps": 4740, "total_steps": 7577, "loss": 0.1446, "lr": 7.397349524434424e-07, "epoch": 0.6255774053055299, "percentage": 62.56, "elapsed_time": "0:12:14", "remaining_time": "0:07:19", "throughput": 3165.03, "total_tokens": 2326080} +{"current_steps": 4745, "total_steps": 7577, "loss": 0.0544, "lr": 7.375114737978605e-07, "epoch": 0.6262372970832784, "percentage": 62.62, "elapsed_time": "0:12:15", "remaining_time": "0:07:18", "throughput": 3167.11, "total_tokens": 2328512} +{"current_steps": 4750, "total_steps": 7577, "loss": 0.1048, "lr": 7.352893880156106e-07, "epoch": 0.6268971888610267, "percentage": 62.69, "elapsed_time": "0:12:15", "remaining_time": "0:07:17", "throughput": 3169.26, "total_tokens": 2331008} +{"current_steps": 4755, "total_steps": 7577, "loss": 0.0516, "lr": 7.330687068879202e-07, "epoch": 0.6275570806387752, "percentage": 62.76, "elapsed_time": "0:12:15", "remaining_time": "0:07:16", "throughput": 3171.27, "total_tokens": 2333376} +{"current_steps": 4760, "total_steps": 7577, "loss": 0.1411, "lr": 7.308494421985626e-07, "epoch": 0.6282169724165237, "percentage": 62.82, "elapsed_time": "0:12:16", "remaining_time": "0:07:15", "throughput": 3173.43, "total_tokens": 2335872} +{"current_steps": 4765, "total_steps": 7577, "loss": 0.0029, "lr": 7.286316057237951e-07, "epoch": 0.6288768641942721, "percentage": 62.89, "elapsed_time": "0:12:16", "remaining_time": "0:07:14", "throughput": 3175.68, "total_tokens": 2338432} +{"current_steps": 4770, "total_steps": 7577, "loss": 0.1567, "lr": 7.264152092322963e-07, "epoch": 0.6295367559720206, "percentage": 62.95, "elapsed_time": "0:12:16", "remaining_time": "0:07:13", "throughput": 3177.82, "total_tokens": 2340928} +{"current_steps": 4775, "total_steps": 7577, "loss": 0.0441, "lr": 7.242002644851035e-07, "epoch": 0.630196647749769, "percentage": 63.02, "elapsed_time": "0:12:16", "remaining_time": "0:07:12", "throughput": 3180.31, "total_tokens": 2343680} +{"current_steps": 4780, "total_steps": 7577, "loss": 0.0673, "lr": 7.219867832355508e-07, "epoch": 0.6308565395275175, "percentage": 63.09, "elapsed_time": "0:12:17", "remaining_time": "0:07:11", "throughput": 3182.54, "total_tokens": 2346240} +{"current_steps": 4785, "total_steps": 7577, "loss": 0.0718, "lr": 7.197747772292071e-07, "epoch": 0.6315164313052659, "percentage": 63.15, "elapsed_time": "0:12:17", "remaining_time": "0:07:10", "throughput": 3184.45, "total_tokens": 2348544} +{"current_steps": 4790, "total_steps": 7577, "loss": 0.2532, "lr": 7.17564258203811e-07, "epoch": 0.6321763230830144, "percentage": 63.22, "elapsed_time": "0:12:17", "remaining_time": "0:07:09", "throughput": 3186.5, "total_tokens": 2350976} +{"current_steps": 4795, "total_steps": 7577, "loss": 0.1214, "lr": 7.153552378892128e-07, "epoch": 0.6328362148607628, "percentage": 63.28, "elapsed_time": "0:12:18", "remaining_time": "0:07:08", "throughput": 3188.31, "total_tokens": 2353216} +{"current_steps": 4800, "total_steps": 7577, "loss": 0.1191, "lr": 7.131477280073091e-07, "epoch": 0.6334961066385113, "percentage": 63.35, "elapsed_time": "0:12:18", "remaining_time": "0:07:07", "throughput": 3190.28, "total_tokens": 2355584} +{"current_steps": 4805, "total_steps": 7577, "loss": 0.1127, "lr": 7.109417402719813e-07, "epoch": 0.6341559984162597, "percentage": 63.42, "elapsed_time": "0:12:18", "remaining_time": "0:07:06", "throughput": 3192.5, "total_tokens": 2358144} +{"current_steps": 4810, "total_steps": 7577, "loss": 0.0543, "lr": 7.087372863890346e-07, "epoch": 0.6348158901940082, "percentage": 63.48, "elapsed_time": "0:12:18", "remaining_time": "0:07:05", "throughput": 3194.97, "total_tokens": 2360896} +{"current_steps": 4815, "total_steps": 7577, "loss": 0.2546, "lr": 7.065343780561344e-07, "epoch": 0.6354757819717566, "percentage": 63.55, "elapsed_time": "0:12:19", "remaining_time": "0:07:04", "throughput": 3196.95, "total_tokens": 2363264} +{"current_steps": 4820, "total_steps": 7577, "loss": 0.0676, "lr": 7.043330269627448e-07, "epoch": 0.6361356737495051, "percentage": 63.61, "elapsed_time": "0:12:19", "remaining_time": "0:07:02", "throughput": 3198.92, "total_tokens": 2365632} +{"current_steps": 4825, "total_steps": 7577, "loss": 0.0018, "lr": 7.021332447900671e-07, "epoch": 0.6367955655272536, "percentage": 63.68, "elapsed_time": "0:12:19", "remaining_time": "0:07:01", "throughput": 3200.89, "total_tokens": 2368000} +{"current_steps": 4830, "total_steps": 7577, "loss": 0.1462, "lr": 6.999350432109766e-07, "epoch": 0.637455457305002, "percentage": 63.75, "elapsed_time": "0:12:20", "remaining_time": "0:07:00", "throughput": 3203.11, "total_tokens": 2370560} +{"current_steps": 4835, "total_steps": 7577, "loss": 0.001, "lr": 6.977384338899617e-07, "epoch": 0.6381153490827505, "percentage": 63.81, "elapsed_time": "0:12:20", "remaining_time": "0:06:59", "throughput": 3205.32, "total_tokens": 2373120} +{"current_steps": 4840, "total_steps": 7577, "loss": 0.0052, "lr": 6.955434284830619e-07, "epoch": 0.6387752408604989, "percentage": 63.88, "elapsed_time": "0:12:20", "remaining_time": "0:06:58", "throughput": 3207.78, "total_tokens": 2375872} +{"current_steps": 4845, "total_steps": 7577, "loss": 0.2037, "lr": 6.933500386378056e-07, "epoch": 0.6394351326382474, "percentage": 63.94, "elapsed_time": "0:12:20", "remaining_time": "0:06:57", "throughput": 3209.98, "total_tokens": 2378432} +{"current_steps": 4850, "total_steps": 7577, "loss": 0.1581, "lr": 6.911582759931482e-07, "epoch": 0.6400950244159958, "percentage": 64.01, "elapsed_time": "0:12:21", "remaining_time": "0:06:56", "throughput": 3211.96, "total_tokens": 2380800} +{"current_steps": 4855, "total_steps": 7577, "loss": 0.2158, "lr": 6.889681521794109e-07, "epoch": 0.6407549161937443, "percentage": 64.08, "elapsed_time": "0:12:21", "remaining_time": "0:06:55", "throughput": 3214.66, "total_tokens": 2383744} +{"current_steps": 4860, "total_steps": 7577, "loss": 0.0894, "lr": 6.867796788182181e-07, "epoch": 0.6414148079714926, "percentage": 64.14, "elapsed_time": "0:12:21", "remaining_time": "0:06:54", "throughput": 3216.62, "total_tokens": 2386112} +{"current_steps": 4865, "total_steps": 7577, "loss": 0.1499, "lr": 6.845928675224366e-07, "epoch": 0.6420746997492411, "percentage": 64.21, "elapsed_time": "0:12:22", "remaining_time": "0:06:53", "throughput": 3218.92, "total_tokens": 2388736} +{"current_steps": 4870, "total_steps": 7577, "loss": 0.0662, "lr": 6.82407729896114e-07, "epoch": 0.6427345915269895, "percentage": 64.27, "elapsed_time": "0:12:22", "remaining_time": "0:06:52", "throughput": 3220.88, "total_tokens": 2391104} +{"current_steps": 4875, "total_steps": 7577, "loss": 0.0747, "lr": 6.802242775344163e-07, "epoch": 0.643394483304738, "percentage": 64.34, "elapsed_time": "0:12:22", "remaining_time": "0:06:51", "throughput": 3223.16, "total_tokens": 2393728} +{"current_steps": 4880, "total_steps": 7577, "loss": 0.1309, "lr": 6.780425220235674e-07, "epoch": 0.6440543750824864, "percentage": 64.41, "elapsed_time": "0:12:22", "remaining_time": "0:06:50", "throughput": 3225.6, "total_tokens": 2396480} +{"current_steps": 4885, "total_steps": 7577, "loss": 0.008, "lr": 6.758624749407859e-07, "epoch": 0.6447142668602349, "percentage": 64.47, "elapsed_time": "0:12:23", "remaining_time": "0:06:49", "throughput": 3227.88, "total_tokens": 2399104} +{"current_steps": 4890, "total_steps": 7577, "loss": 0.0813, "lr": 6.736841478542264e-07, "epoch": 0.6453741586379834, "percentage": 64.54, "elapsed_time": "0:12:23", "remaining_time": "0:06:48", "throughput": 3230.08, "total_tokens": 2401664} +{"current_steps": 4895, "total_steps": 7577, "loss": 0.0084, "lr": 6.715075523229151e-07, "epoch": 0.6460340504157318, "percentage": 64.6, "elapsed_time": "0:12:23", "remaining_time": "0:06:47", "throughput": 3232.19, "total_tokens": 2404160} +{"current_steps": 4900, "total_steps": 7577, "loss": 0.129, "lr": 6.693326998966909e-07, "epoch": 0.6466939421934803, "percentage": 64.67, "elapsed_time": "0:12:24", "remaining_time": "0:06:46", "throughput": 3234.22, "total_tokens": 2406592} +{"current_steps": 4905, "total_steps": 7577, "loss": 0.0684, "lr": 6.671596021161431e-07, "epoch": 0.6473538339712287, "percentage": 64.74, "elapsed_time": "0:12:24", "remaining_time": "0:06:45", "throughput": 3236.33, "total_tokens": 2409088} +{"current_steps": 4910, "total_steps": 7577, "loss": 0.0965, "lr": 6.649882705125494e-07, "epoch": 0.6480137257489772, "percentage": 64.8, "elapsed_time": "0:12:24", "remaining_time": "0:06:44", "throughput": 3238.43, "total_tokens": 2411584} +{"current_steps": 4915, "total_steps": 7577, "loss": 0.4483, "lr": 6.628187166078163e-07, "epoch": 0.6486736175267256, "percentage": 64.87, "elapsed_time": "0:12:24", "remaining_time": "0:06:43", "throughput": 3240.95, "total_tokens": 2414400} +{"current_steps": 4920, "total_steps": 7577, "loss": 0.0583, "lr": 6.606509519144166e-07, "epoch": 0.6493335093044741, "percentage": 64.93, "elapsed_time": "0:12:25", "remaining_time": "0:06:42", "throughput": 3242.72, "total_tokens": 2416640} +{"current_steps": 4925, "total_steps": 7577, "loss": 0.1499, "lr": 6.584849879353289e-07, "epoch": 0.6499934010822225, "percentage": 65.0, "elapsed_time": "0:12:25", "remaining_time": "0:06:41", "throughput": 3244.82, "total_tokens": 2419136} +{"current_steps": 4927, "total_steps": 7577, "eval_loss": 0.09844312816858292, "epoch": 0.6502573577933219, "percentage": 65.03, "elapsed_time": "0:12:33", "remaining_time": "0:06:45", "throughput": 3213.32, "total_tokens": 2420096} +{"current_steps": 4930, "total_steps": 7577, "loss": 0.0307, "lr": 6.563208361639772e-07, "epoch": 0.650653292859971, "percentage": 65.07, "elapsed_time": "0:12:55", "remaining_time": "0:06:56", "throughput": 3120.78, "total_tokens": 2421440} +{"current_steps": 4935, "total_steps": 7577, "loss": 0.0015, "lr": 6.541585080841687e-07, "epoch": 0.6513131846377194, "percentage": 65.13, "elapsed_time": "0:12:56", "remaining_time": "0:06:55", "throughput": 3122.91, "total_tokens": 2424000} +{"current_steps": 4940, "total_steps": 7577, "loss": 0.0999, "lr": 6.519980151700332e-07, "epoch": 0.6519730764154679, "percentage": 65.2, "elapsed_time": "0:12:56", "remaining_time": "0:06:54", "throughput": 3124.64, "total_tokens": 2426240} +{"current_steps": 4945, "total_steps": 7577, "loss": 0.0789, "lr": 6.498393688859629e-07, "epoch": 0.6526329681932164, "percentage": 65.26, "elapsed_time": "0:12:56", "remaining_time": "0:06:53", "throughput": 3126.86, "total_tokens": 2428864} +{"current_steps": 4950, "total_steps": 7577, "loss": 0.0011, "lr": 6.47682580686551e-07, "epoch": 0.6532928599709648, "percentage": 65.33, "elapsed_time": "0:12:57", "remaining_time": "0:06:52", "throughput": 3128.84, "total_tokens": 2431296} +{"current_steps": 4955, "total_steps": 7577, "loss": 0.002, "lr": 6.455276620165307e-07, "epoch": 0.6539527517487133, "percentage": 65.4, "elapsed_time": "0:12:57", "remaining_time": "0:06:51", "throughput": 3131.12, "total_tokens": 2433984} +{"current_steps": 4960, "total_steps": 7577, "loss": 0.4195, "lr": 6.433746243107152e-07, "epoch": 0.6546126435264616, "percentage": 65.46, "elapsed_time": "0:12:57", "remaining_time": "0:06:50", "throughput": 3132.85, "total_tokens": 2436224} +{"current_steps": 4965, "total_steps": 7577, "loss": 0.229, "lr": 6.412234789939359e-07, "epoch": 0.6552725353042101, "percentage": 65.53, "elapsed_time": "0:12:57", "remaining_time": "0:06:49", "throughput": 3134.92, "total_tokens": 2438720} +{"current_steps": 4970, "total_steps": 7577, "loss": 0.0818, "lr": 6.390742374809832e-07, "epoch": 0.6559324270819585, "percentage": 65.59, "elapsed_time": "0:12:58", "remaining_time": "0:06:48", "throughput": 3136.65, "total_tokens": 2440960} +{"current_steps": 4975, "total_steps": 7577, "loss": 0.0417, "lr": 6.369269111765454e-07, "epoch": 0.656592318859707, "percentage": 65.66, "elapsed_time": "0:12:58", "remaining_time": "0:06:47", "throughput": 3138.51, "total_tokens": 2443328} +{"current_steps": 4980, "total_steps": 7577, "loss": 0.1413, "lr": 6.347815114751465e-07, "epoch": 0.6572522106374554, "percentage": 65.73, "elapsed_time": "0:12:58", "remaining_time": "0:06:46", "throughput": 3140.71, "total_tokens": 2445952} +{"current_steps": 4985, "total_steps": 7577, "loss": 0.1102, "lr": 6.326380497610886e-07, "epoch": 0.6579121024152039, "percentage": 65.79, "elapsed_time": "0:12:59", "remaining_time": "0:06:45", "throughput": 3142.91, "total_tokens": 2448576} +{"current_steps": 4990, "total_steps": 7577, "loss": 0.323, "lr": 6.304965374083899e-07, "epoch": 0.6585719941929523, "percentage": 65.86, "elapsed_time": "0:12:59", "remaining_time": "0:06:44", "throughput": 3145.02, "total_tokens": 2451136} +{"current_steps": 4995, "total_steps": 7577, "loss": 0.0022, "lr": 6.283569857807245e-07, "epoch": 0.6592318859707008, "percentage": 65.92, "elapsed_time": "0:12:59", "remaining_time": "0:06:43", "throughput": 3147.06, "total_tokens": 2453632} +{"current_steps": 5000, "total_steps": 7577, "loss": 0.0082, "lr": 6.262194062313615e-07, "epoch": 0.6598917777484492, "percentage": 65.99, "elapsed_time": "0:12:59", "remaining_time": "0:06:41", "throughput": 3149.17, "total_tokens": 2456192} +{"current_steps": 5005, "total_steps": 7577, "loss": 0.0012, "lr": 6.240838101031063e-07, "epoch": 0.6605516695261977, "percentage": 66.06, "elapsed_time": "0:13:00", "remaining_time": "0:06:40", "throughput": 3151.12, "total_tokens": 2458624} +{"current_steps": 5010, "total_steps": 7577, "loss": 0.134, "lr": 6.21950208728239e-07, "epoch": 0.6612115613039462, "percentage": 66.12, "elapsed_time": "0:13:00", "remaining_time": "0:06:39", "throughput": 3152.9, "total_tokens": 2460928} +{"current_steps": 5015, "total_steps": 7577, "loss": 0.1085, "lr": 6.198186134284554e-07, "epoch": 0.6618714530816946, "percentage": 66.19, "elapsed_time": "0:13:00", "remaining_time": "0:06:38", "throughput": 3155.09, "total_tokens": 2463552} +{"current_steps": 5020, "total_steps": 7577, "loss": 0.0561, "lr": 6.176890355148049e-07, "epoch": 0.6625313448594431, "percentage": 66.25, "elapsed_time": "0:13:01", "remaining_time": "0:06:37", "throughput": 3156.89, "total_tokens": 2465856} +{"current_steps": 5025, "total_steps": 7577, "loss": 0.0902, "lr": 6.155614862876335e-07, "epoch": 0.6631912366371915, "percentage": 66.32, "elapsed_time": "0:13:01", "remaining_time": "0:06:36", "throughput": 3158.82, "total_tokens": 2468288} +{"current_steps": 5030, "total_steps": 7577, "loss": 0.1482, "lr": 6.134359770365214e-07, "epoch": 0.66385112841494, "percentage": 66.39, "elapsed_time": "0:13:01", "remaining_time": "0:06:35", "throughput": 3161.0, "total_tokens": 2470912} +{"current_steps": 5035, "total_steps": 7577, "loss": 0.109, "lr": 6.11312519040224e-07, "epoch": 0.6645110201926884, "percentage": 66.45, "elapsed_time": "0:13:01", "remaining_time": "0:06:34", "throughput": 3163.17, "total_tokens": 2473536} +{"current_steps": 5040, "total_steps": 7577, "loss": 0.0013, "lr": 6.091911235666125e-07, "epoch": 0.6651709119704369, "percentage": 66.52, "elapsed_time": "0:13:02", "remaining_time": "0:06:33", "throughput": 3165.19, "total_tokens": 2476032} +{"current_steps": 5045, "total_steps": 7577, "loss": 0.1091, "lr": 6.070718018726124e-07, "epoch": 0.6658308037481853, "percentage": 66.58, "elapsed_time": "0:13:02", "remaining_time": "0:06:32", "throughput": 3166.81, "total_tokens": 2478208} +{"current_steps": 5050, "total_steps": 7577, "loss": 0.1482, "lr": 6.049545652041459e-07, "epoch": 0.6664906955259338, "percentage": 66.65, "elapsed_time": "0:13:02", "remaining_time": "0:06:31", "throughput": 3168.58, "total_tokens": 2480512} +{"current_steps": 5055, "total_steps": 7577, "loss": 0.1775, "lr": 6.028394247960709e-07, "epoch": 0.6671505873036822, "percentage": 66.72, "elapsed_time": "0:13:03", "remaining_time": "0:06:30", "throughput": 3170.59, "total_tokens": 2483008} +{"current_steps": 5060, "total_steps": 7577, "loss": 0.1572, "lr": 6.007263918721221e-07, "epoch": 0.6678104790814307, "percentage": 66.78, "elapsed_time": "0:13:03", "remaining_time": "0:06:29", "throughput": 3172.45, "total_tokens": 2485376} +{"current_steps": 5065, "total_steps": 7577, "loss": 0.0559, "lr": 5.986154776448507e-07, "epoch": 0.668470370859179, "percentage": 66.85, "elapsed_time": "0:13:03", "remaining_time": "0:06:28", "throughput": 3174.7, "total_tokens": 2488064} +{"current_steps": 5070, "total_steps": 7577, "loss": 0.0578, "lr": 5.965066933155656e-07, "epoch": 0.6691302626369275, "percentage": 66.91, "elapsed_time": "0:13:04", "remaining_time": "0:06:27", "throughput": 3176.79, "total_tokens": 2490624} +{"current_steps": 5075, "total_steps": 7577, "loss": 0.2826, "lr": 5.944000500742735e-07, "epoch": 0.669790154414676, "percentage": 66.98, "elapsed_time": "0:13:04", "remaining_time": "0:06:26", "throughput": 3178.95, "total_tokens": 2493248} +{"current_steps": 5080, "total_steps": 7577, "loss": 0.201, "lr": 5.922955590996195e-07, "epoch": 0.6704500461924244, "percentage": 67.05, "elapsed_time": "0:13:04", "remaining_time": "0:06:25", "throughput": 3180.96, "total_tokens": 2495744} +{"current_steps": 5085, "total_steps": 7577, "loss": 0.0019, "lr": 5.901932315588281e-07, "epoch": 0.6711099379701729, "percentage": 67.11, "elapsed_time": "0:13:04", "remaining_time": "0:06:24", "throughput": 3182.89, "total_tokens": 2498176} +{"current_steps": 5090, "total_steps": 7577, "loss": 0.1805, "lr": 5.880930786076441e-07, "epoch": 0.6717698297479213, "percentage": 67.18, "elapsed_time": "0:13:05", "remaining_time": "0:06:23", "throughput": 3184.57, "total_tokens": 2500416} +{"current_steps": 5095, "total_steps": 7577, "loss": 0.06, "lr": 5.859951113902728e-07, "epoch": 0.6724297215256698, "percentage": 67.24, "elapsed_time": "0:13:05", "remaining_time": "0:06:22", "throughput": 3186.5, "total_tokens": 2502848} +{"current_steps": 5100, "total_steps": 7577, "loss": 0.1099, "lr": 5.83899341039321e-07, "epoch": 0.6730896133034182, "percentage": 67.31, "elapsed_time": "0:13:05", "remaining_time": "0:06:21", "throughput": 3188.25, "total_tokens": 2505152} +{"current_steps": 5105, "total_steps": 7577, "loss": 0.1247, "lr": 5.818057786757386e-07, "epoch": 0.6737495050811667, "percentage": 67.37, "elapsed_time": "0:13:06", "remaining_time": "0:06:20", "throughput": 3190.25, "total_tokens": 2507648} +{"current_steps": 5110, "total_steps": 7577, "loss": 0.0989, "lr": 5.797144354087588e-07, "epoch": 0.6744093968589151, "percentage": 67.44, "elapsed_time": "0:13:06", "remaining_time": "0:06:19", "throughput": 3192.25, "total_tokens": 2510144} +{"current_steps": 5115, "total_steps": 7577, "loss": 0.076, "lr": 5.77625322335839e-07, "epoch": 0.6750692886366636, "percentage": 67.51, "elapsed_time": "0:13:06", "remaining_time": "0:06:18", "throughput": 3194.71, "total_tokens": 2513024} +{"current_steps": 5120, "total_steps": 7577, "loss": 0.0721, "lr": 5.755384505426032e-07, "epoch": 0.675729180414412, "percentage": 67.57, "elapsed_time": "0:13:06", "remaining_time": "0:06:17", "throughput": 3196.14, "total_tokens": 2515072} +{"current_steps": 5125, "total_steps": 7577, "loss": 0.0018, "lr": 5.734538311027819e-07, "epoch": 0.6763890721921605, "percentage": 67.64, "elapsed_time": "0:13:07", "remaining_time": "0:06:16", "throughput": 3197.89, "total_tokens": 2517376} +{"current_steps": 5130, "total_steps": 7577, "loss": 0.0036, "lr": 5.713714750781533e-07, "epoch": 0.677048963969909, "percentage": 67.7, "elapsed_time": "0:13:07", "remaining_time": "0:06:15", "throughput": 3200.11, "total_tokens": 2520064} +{"current_steps": 5135, "total_steps": 7577, "loss": 0.0685, "lr": 5.692913935184862e-07, "epoch": 0.6777088557476574, "percentage": 67.77, "elapsed_time": "0:13:07", "remaining_time": "0:06:14", "throughput": 3202.26, "total_tokens": 2522688} +{"current_steps": 5140, "total_steps": 7577, "loss": 0.0071, "lr": 5.672135974614794e-07, "epoch": 0.6783687475254059, "percentage": 67.84, "elapsed_time": "0:13:08", "remaining_time": "0:06:13", "throughput": 3204.24, "total_tokens": 2525184} +{"current_steps": 5145, "total_steps": 7577, "loss": 0.0014, "lr": 5.651380979327034e-07, "epoch": 0.6790286393031543, "percentage": 67.9, "elapsed_time": "0:13:08", "remaining_time": "0:06:12", "throughput": 3206.07, "total_tokens": 2527552} +{"current_steps": 5150, "total_steps": 7577, "loss": 0.0442, "lr": 5.630649059455444e-07, "epoch": 0.6796885310809028, "percentage": 67.97, "elapsed_time": "0:13:08", "remaining_time": "0:06:11", "throughput": 3208.3, "total_tokens": 2530240} +{"current_steps": 5155, "total_steps": 7577, "loss": 0.0023, "lr": 5.609940325011413e-07, "epoch": 0.6803484228586512, "percentage": 68.03, "elapsed_time": "0:13:08", "remaining_time": "0:06:10", "throughput": 3209.96, "total_tokens": 2532480} +{"current_steps": 5160, "total_steps": 7577, "loss": 0.0007, "lr": 5.589254885883325e-07, "epoch": 0.6810083146363997, "percentage": 68.1, "elapsed_time": "0:13:09", "remaining_time": "0:06:09", "throughput": 3211.85, "total_tokens": 2534912} +{"current_steps": 5165, "total_steps": 7577, "loss": 0.0598, "lr": 5.568592851835936e-07, "epoch": 0.681668206414148, "percentage": 68.17, "elapsed_time": "0:13:09", "remaining_time": "0:06:08", "throughput": 3213.84, "total_tokens": 2537408} +{"current_steps": 5170, "total_steps": 7577, "loss": 0.3023, "lr": 5.547954332509805e-07, "epoch": 0.6823280981918965, "percentage": 68.23, "elapsed_time": "0:13:09", "remaining_time": "0:06:07", "throughput": 3215.66, "total_tokens": 2539776} +{"current_steps": 5175, "total_steps": 7577, "loss": 0.0009, "lr": 5.527339437420717e-07, "epoch": 0.6829879899696449, "percentage": 68.3, "elapsed_time": "0:13:10", "remaining_time": "0:06:06", "throughput": 3217.57, "total_tokens": 2542208} +{"current_steps": 5180, "total_steps": 7577, "loss": 0.1061, "lr": 5.506748275959094e-07, "epoch": 0.6836478817473934, "percentage": 68.36, "elapsed_time": "0:13:10", "remaining_time": "0:06:05", "throughput": 3219.54, "total_tokens": 2544704} +{"current_steps": 5185, "total_steps": 7577, "loss": 0.0535, "lr": 5.48618095738943e-07, "epoch": 0.6843077735251418, "percentage": 68.43, "elapsed_time": "0:13:10", "remaining_time": "0:06:04", "throughput": 3221.36, "total_tokens": 2547072} +{"current_steps": 5190, "total_steps": 7577, "loss": 0.1301, "lr": 5.465637590849681e-07, "epoch": 0.6849676653028903, "percentage": 68.5, "elapsed_time": "0:13:10", "remaining_time": "0:06:03", "throughput": 3223.18, "total_tokens": 2549440} +{"current_steps": 5195, "total_steps": 7577, "loss": 0.2169, "lr": 5.445118285350723e-07, "epoch": 0.6856275570806388, "percentage": 68.56, "elapsed_time": "0:13:11", "remaining_time": "0:06:02", "throughput": 3225.4, "total_tokens": 2552128} +{"current_steps": 5200, "total_steps": 7577, "loss": 0.0681, "lr": 5.424623149775745e-07, "epoch": 0.6862874488583872, "percentage": 68.63, "elapsed_time": "0:13:11", "remaining_time": "0:06:01", "throughput": 3227.07, "total_tokens": 2554368} +{"current_steps": 5205, "total_steps": 7577, "loss": 0.1175, "lr": 5.404152292879676e-07, "epoch": 0.6869473406361357, "percentage": 68.69, "elapsed_time": "0:13:11", "remaining_time": "0:06:00", "throughput": 3229.14, "total_tokens": 2556928} +{"current_steps": 5210, "total_steps": 7577, "loss": 0.1624, "lr": 5.38370582328863e-07, "epoch": 0.6876072324138841, "percentage": 68.76, "elapsed_time": "0:13:12", "remaining_time": "0:05:59", "throughput": 3231.05, "total_tokens": 2559360} +{"current_steps": 5215, "total_steps": 7577, "loss": 0.1578, "lr": 5.363283849499293e-07, "epoch": 0.6882671241916326, "percentage": 68.83, "elapsed_time": "0:13:12", "remaining_time": "0:05:58", "throughput": 3233.04, "total_tokens": 2561856} +{"current_steps": 5220, "total_steps": 7577, "loss": 0.1794, "lr": 5.342886479878387e-07, "epoch": 0.688927015969381, "percentage": 68.89, "elapsed_time": "0:13:12", "remaining_time": "0:05:57", "throughput": 3235.01, "total_tokens": 2564352} +{"current_steps": 5225, "total_steps": 7577, "loss": 0.0437, "lr": 5.32251382266206e-07, "epoch": 0.6895869077471295, "percentage": 68.96, "elapsed_time": "0:13:12", "remaining_time": "0:05:56", "throughput": 3236.91, "total_tokens": 2566784} +{"current_steps": 5230, "total_steps": 7577, "loss": 0.0593, "lr": 5.302165985955327e-07, "epoch": 0.6902467995248779, "percentage": 69.02, "elapsed_time": "0:13:13", "remaining_time": "0:05:55", "throughput": 3238.72, "total_tokens": 2569152} +{"current_steps": 5235, "total_steps": 7577, "loss": 0.067, "lr": 5.281843077731511e-07, "epoch": 0.6909066913026264, "percentage": 69.09, "elapsed_time": "0:13:13", "remaining_time": "0:05:55", "throughput": 3240.54, "total_tokens": 2571520} +{"current_steps": 5240, "total_steps": 7577, "loss": 0.141, "lr": 5.26154520583163e-07, "epoch": 0.6915665830803748, "percentage": 69.16, "elapsed_time": "0:13:13", "remaining_time": "0:05:54", "throughput": 3242.6, "total_tokens": 2574080} +{"current_steps": 5245, "total_steps": 7577, "loss": 0.0595, "lr": 5.241272477963877e-07, "epoch": 0.6922264748581233, "percentage": 69.22, "elapsed_time": "0:13:14", "remaining_time": "0:05:53", "throughput": 3244.26, "total_tokens": 2576320} +{"current_steps": 5250, "total_steps": 7577, "loss": 0.0576, "lr": 5.221025001703e-07, "epoch": 0.6928863666358717, "percentage": 69.29, "elapsed_time": "0:13:14", "remaining_time": "0:05:52", "throughput": 3246.16, "total_tokens": 2578752} +{"current_steps": 5255, "total_steps": 7577, "loss": 0.1368, "lr": 5.200802884489768e-07, "epoch": 0.6935462584136202, "percentage": 69.35, "elapsed_time": "0:13:14", "remaining_time": "0:05:51", "throughput": 3248.06, "total_tokens": 2581184} +{"current_steps": 5260, "total_steps": 7577, "loss": 0.1654, "lr": 5.180606233630374e-07, "epoch": 0.6942061501913687, "percentage": 69.42, "elapsed_time": "0:13:14", "remaining_time": "0:05:50", "throughput": 3250.24, "total_tokens": 2583872} +{"current_steps": 5265, "total_steps": 7577, "loss": 0.1912, "lr": 5.160435156295879e-07, "epoch": 0.694866041969117, "percentage": 69.49, "elapsed_time": "0:13:15", "remaining_time": "0:05:49", "throughput": 3252.12, "total_tokens": 2586304} +{"current_steps": 5270, "total_steps": 7577, "loss": 0.0201, "lr": 5.14028975952165e-07, "epoch": 0.6955259337468656, "percentage": 69.55, "elapsed_time": "0:13:15", "remaining_time": "0:05:48", "throughput": 3254.39, "total_tokens": 2589056} +{"current_steps": 5275, "total_steps": 7577, "loss": 0.14, "lr": 5.120170150206768e-07, "epoch": 0.6961858255246139, "percentage": 69.62, "elapsed_time": "0:13:15", "remaining_time": "0:05:47", "throughput": 3256.28, "total_tokens": 2591488} +{"current_steps": 5280, "total_steps": 7577, "loss": 0.0542, "lr": 5.100076435113496e-07, "epoch": 0.6968457173023624, "percentage": 69.68, "elapsed_time": "0:13:16", "remaining_time": "0:05:46", "throughput": 3258.01, "total_tokens": 2593792} +{"current_steps": 5285, "total_steps": 7577, "loss": 0.1538, "lr": 5.080008720866673e-07, "epoch": 0.6975056090801108, "percentage": 69.75, "elapsed_time": "0:13:16", "remaining_time": "0:05:45", "throughput": 3259.58, "total_tokens": 2595968} +{"current_steps": 5290, "total_steps": 7577, "loss": 0.2123, "lr": 5.059967113953173e-07, "epoch": 0.6981655008578593, "percentage": 69.82, "elapsed_time": "0:13:16", "remaining_time": "0:05:44", "throughput": 3261.16, "total_tokens": 2598144} +{"current_steps": 5295, "total_steps": 7577, "loss": 0.0838, "lr": 5.039951720721349e-07, "epoch": 0.6988253926356077, "percentage": 69.88, "elapsed_time": "0:13:16", "remaining_time": "0:05:43", "throughput": 3262.89, "total_tokens": 2600448} +{"current_steps": 5300, "total_steps": 7577, "loss": 0.0167, "lr": 5.019962647380429e-07, "epoch": 0.6994852844133562, "percentage": 69.95, "elapsed_time": "0:13:17", "remaining_time": "0:05:42", "throughput": 3264.84, "total_tokens": 2602944} +{"current_steps": 5305, "total_steps": 7577, "loss": 0.2014, "lr": 5.000000000000002e-07, "epoch": 0.7001451761911046, "percentage": 70.01, "elapsed_time": "0:13:17", "remaining_time": "0:05:41", "throughput": 3266.41, "total_tokens": 2605120} +{"current_steps": 5306, "total_steps": 7577, "eval_loss": 0.09084735810756683, "epoch": 0.7002771545466544, "percentage": 70.03, "elapsed_time": "0:13:25", "remaining_time": "0:05:44", "throughput": 3235.64, "total_tokens": 2605504} +{"current_steps": 5310, "total_steps": 7577, "loss": 0.0377, "lr": 4.980063884509414e-07, "epoch": 0.7008050679688531, "percentage": 70.08, "elapsed_time": "0:13:45", "remaining_time": "0:05:52", "throughput": 3159.93, "total_tokens": 2607296} +{"current_steps": 5315, "total_steps": 7577, "loss": 0.0463, "lr": 4.960154406697229e-07, "epoch": 0.7014649597466015, "percentage": 70.15, "elapsed_time": "0:13:45", "remaining_time": "0:05:51", "throughput": 3161.76, "total_tokens": 2609728} +{"current_steps": 5320, "total_steps": 7577, "loss": 0.2924, "lr": 4.940271672210667e-07, "epoch": 0.70212485152435, "percentage": 70.21, "elapsed_time": "0:13:45", "remaining_time": "0:05:50", "throughput": 3163.67, "total_tokens": 2612224} +{"current_steps": 5325, "total_steps": 7577, "loss": 0.0513, "lr": 4.920415786555025e-07, "epoch": 0.7027847433020985, "percentage": 70.28, "elapsed_time": "0:13:45", "remaining_time": "0:05:49", "throughput": 3165.57, "total_tokens": 2614720} +{"current_steps": 5330, "total_steps": 7577, "loss": 0.3194, "lr": 4.900586855093144e-07, "epoch": 0.7034446350798469, "percentage": 70.34, "elapsed_time": "0:13:46", "remaining_time": "0:05:48", "throughput": 3167.62, "total_tokens": 2617344} +{"current_steps": 5335, "total_steps": 7577, "loss": 0.1166, "lr": 4.880784983044827e-07, "epoch": 0.7041045268575954, "percentage": 70.41, "elapsed_time": "0:13:46", "remaining_time": "0:05:47", "throughput": 3169.23, "total_tokens": 2619584} +{"current_steps": 5340, "total_steps": 7577, "loss": 0.0176, "lr": 4.861010275486284e-07, "epoch": 0.7047644186353438, "percentage": 70.48, "elapsed_time": "0:13:46", "remaining_time": "0:05:46", "throughput": 3170.92, "total_tokens": 2621888} +{"current_steps": 5345, "total_steps": 7577, "loss": 0.0731, "lr": 4.8412628373496e-07, "epoch": 0.7054243104130923, "percentage": 70.54, "elapsed_time": "0:13:47", "remaining_time": "0:05:45", "throughput": 3173.0, "total_tokens": 2624512} +{"current_steps": 5350, "total_steps": 7577, "loss": 0.0024, "lr": 4.821542773422136e-07, "epoch": 0.7060842021908407, "percentage": 70.61, "elapsed_time": "0:13:47", "remaining_time": "0:05:44", "throughput": 3174.91, "total_tokens": 2627008} +{"current_steps": 5355, "total_steps": 7577, "loss": 0.0019, "lr": 4.801850188346012e-07, "epoch": 0.7067440939685892, "percentage": 70.67, "elapsed_time": "0:13:47", "remaining_time": "0:05:43", "throughput": 3176.76, "total_tokens": 2629440} +{"current_steps": 5360, "total_steps": 7577, "loss": 0.1034, "lr": 4.782185186617523e-07, "epoch": 0.7074039857463376, "percentage": 70.74, "elapsed_time": "0:13:47", "remaining_time": "0:05:42", "throughput": 3178.6, "total_tokens": 2631872} +{"current_steps": 5365, "total_steps": 7577, "loss": 0.0814, "lr": 4.762547872586603e-07, "epoch": 0.7080638775240861, "percentage": 70.81, "elapsed_time": "0:13:48", "remaining_time": "0:05:41", "throughput": 3180.74, "total_tokens": 2634560} +{"current_steps": 5370, "total_steps": 7577, "loss": 0.1396, "lr": 4.7429383504562605e-07, "epoch": 0.7087237693018344, "percentage": 70.87, "elapsed_time": "0:13:48", "remaining_time": "0:05:40", "throughput": 3182.72, "total_tokens": 2637120} +{"current_steps": 5375, "total_steps": 7577, "loss": 0.0019, "lr": 4.723356724282029e-07, "epoch": 0.709383661079583, "percentage": 70.94, "elapsed_time": "0:13:48", "remaining_time": "0:05:39", "throughput": 3184.56, "total_tokens": 2639552} +{"current_steps": 5380, "total_steps": 7577, "loss": 0.1315, "lr": 4.703803097971426e-07, "epoch": 0.7100435528573315, "percentage": 71.0, "elapsed_time": "0:13:49", "remaining_time": "0:05:38", "throughput": 3186.4, "total_tokens": 2641984} +{"current_steps": 5385, "total_steps": 7577, "loss": 0.0708, "lr": 4.6842775752833763e-07, "epoch": 0.7107034446350798, "percentage": 71.07, "elapsed_time": "0:13:49", "remaining_time": "0:05:37", "throughput": 3188.16, "total_tokens": 2644352} +{"current_steps": 5390, "total_steps": 7577, "loss": 0.02, "lr": 4.664780259827689e-07, "epoch": 0.7113633364128283, "percentage": 71.14, "elapsed_time": "0:13:49", "remaining_time": "0:05:36", "throughput": 3190.3, "total_tokens": 2647040} +{"current_steps": 5395, "total_steps": 7577, "loss": 0.0013, "lr": 4.6453112550644857e-07, "epoch": 0.7120232281905767, "percentage": 71.2, "elapsed_time": "0:13:50", "remaining_time": "0:05:35", "throughput": 3192.13, "total_tokens": 2649472} +{"current_steps": 5400, "total_steps": 7577, "loss": 0.0411, "lr": 4.625870664303663e-07, "epoch": 0.7126831199683252, "percentage": 71.27, "elapsed_time": "0:13:50", "remaining_time": "0:05:34", "throughput": 3193.89, "total_tokens": 2651840} +{"current_steps": 5405, "total_steps": 7577, "loss": 0.0056, "lr": 4.6064585907043486e-07, "epoch": 0.7133430117460736, "percentage": 71.33, "elapsed_time": "0:13:50", "remaining_time": "0:05:33", "throughput": 3195.94, "total_tokens": 2654464} +{"current_steps": 5410, "total_steps": 7577, "loss": 0.0537, "lr": 4.587075137274334e-07, "epoch": 0.7140029035238221, "percentage": 71.4, "elapsed_time": "0:13:50", "remaining_time": "0:05:32", "throughput": 3197.4, "total_tokens": 2656576} +{"current_steps": 5415, "total_steps": 7577, "loss": 0.0546, "lr": 4.5677204068695597e-07, "epoch": 0.7146627953015705, "percentage": 71.47, "elapsed_time": "0:13:51", "remaining_time": "0:05:31", "throughput": 3199.23, "total_tokens": 2659008} +{"current_steps": 5420, "total_steps": 7577, "loss": 0.0401, "lr": 4.5483945021935356e-07, "epoch": 0.715322687079319, "percentage": 71.53, "elapsed_time": "0:13:51", "remaining_time": "0:05:30", "throughput": 3201.29, "total_tokens": 2661632} +{"current_steps": 5425, "total_steps": 7577, "loss": 0.0963, "lr": 4.5290975257968155e-07, "epoch": 0.7159825788570674, "percentage": 71.6, "elapsed_time": "0:13:51", "remaining_time": "0:05:29", "throughput": 3203.27, "total_tokens": 2664192} +{"current_steps": 5430, "total_steps": 7577, "loss": 0.1819, "lr": 4.509829580076452e-07, "epoch": 0.7166424706348159, "percentage": 71.66, "elapsed_time": "0:13:51", "remaining_time": "0:05:28", "throughput": 3205.1, "total_tokens": 2666624} +{"current_steps": 5435, "total_steps": 7577, "loss": 0.1842, "lr": 4.490590767275442e-07, "epoch": 0.7173023624125643, "percentage": 71.73, "elapsed_time": "0:13:52", "remaining_time": "0:05:28", "throughput": 3207.0, "total_tokens": 2669120} +{"current_steps": 5440, "total_steps": 7577, "loss": 0.102, "lr": 4.4713811894822064e-07, "epoch": 0.7179622541903128, "percentage": 71.8, "elapsed_time": "0:13:52", "remaining_time": "0:05:27", "throughput": 3208.84, "total_tokens": 2671552} +{"current_steps": 5445, "total_steps": 7577, "loss": 0.071, "lr": 4.4522009486300204e-07, "epoch": 0.7186221459680613, "percentage": 71.86, "elapsed_time": "0:13:52", "remaining_time": "0:05:26", "throughput": 3210.95, "total_tokens": 2674240} +{"current_steps": 5450, "total_steps": 7577, "loss": 0.1247, "lr": 4.43305014649649e-07, "epoch": 0.7192820377458097, "percentage": 71.93, "elapsed_time": "0:13:53", "remaining_time": "0:05:25", "throughput": 3212.62, "total_tokens": 2676544} +{"current_steps": 5455, "total_steps": 7577, "loss": 0.0005, "lr": 4.4139288847030155e-07, "epoch": 0.7199419295235582, "percentage": 71.99, "elapsed_time": "0:13:53", "remaining_time": "0:05:24", "throughput": 3214.37, "total_tokens": 2678912} +{"current_steps": 5460, "total_steps": 7577, "loss": 0.0554, "lr": 4.394837264714233e-07, "epoch": 0.7206018213013066, "percentage": 72.06, "elapsed_time": "0:13:53", "remaining_time": "0:05:23", "throughput": 3216.19, "total_tokens": 2681344} +{"current_steps": 5465, "total_steps": 7577, "loss": 0.0013, "lr": 4.3757753878375005e-07, "epoch": 0.7212617130790551, "percentage": 72.13, "elapsed_time": "0:13:53", "remaining_time": "0:05:22", "throughput": 3218.0, "total_tokens": 2683776} +{"current_steps": 5470, "total_steps": 7577, "loss": 0.0567, "lr": 4.3567433552223375e-07, "epoch": 0.7219216048568035, "percentage": 72.19, "elapsed_time": "0:13:54", "remaining_time": "0:05:21", "throughput": 3219.6, "total_tokens": 2686016} +{"current_steps": 5475, "total_steps": 7577, "loss": 0.1963, "lr": 4.3377412678599e-07, "epoch": 0.722581496634552, "percentage": 72.26, "elapsed_time": "0:13:54", "remaining_time": "0:05:20", "throughput": 3221.03, "total_tokens": 2688128} +{"current_steps": 5480, "total_steps": 7577, "loss": 0.1399, "lr": 4.318769226582454e-07, "epoch": 0.7232413884123003, "percentage": 72.32, "elapsed_time": "0:13:54", "remaining_time": "0:05:19", "throughput": 3222.63, "total_tokens": 2690368} +{"current_steps": 5485, "total_steps": 7577, "loss": 0.0348, "lr": 4.299827332062811e-07, "epoch": 0.7239012801900488, "percentage": 72.39, "elapsed_time": "0:13:55", "remaining_time": "0:05:18", "throughput": 3224.67, "total_tokens": 2692992} +{"current_steps": 5490, "total_steps": 7577, "loss": 0.0968, "lr": 4.2809156848138363e-07, "epoch": 0.7245611719677972, "percentage": 72.46, "elapsed_time": "0:13:55", "remaining_time": "0:05:17", "throughput": 3226.47, "total_tokens": 2695424} +{"current_steps": 5495, "total_steps": 7577, "loss": 0.1639, "lr": 4.2620343851878616e-07, "epoch": 0.7252210637455457, "percentage": 72.52, "elapsed_time": "0:13:55", "remaining_time": "0:05:16", "throughput": 3228.28, "total_tokens": 2697856} +{"current_steps": 5500, "total_steps": 7577, "loss": 0.0446, "lr": 4.2431835333762123e-07, "epoch": 0.7258809555232941, "percentage": 72.59, "elapsed_time": "0:13:55", "remaining_time": "0:05:15", "throughput": 3230.45, "total_tokens": 2700608} +{"current_steps": 5505, "total_steps": 7577, "loss": 0.0005, "lr": 4.224363229408628e-07, "epoch": 0.7265408473010426, "percentage": 72.65, "elapsed_time": "0:13:56", "remaining_time": "0:05:14", "throughput": 3232.32, "total_tokens": 2703104} +{"current_steps": 5510, "total_steps": 7577, "loss": 0.1834, "lr": 4.205573573152753e-07, "epoch": 0.7272007390787911, "percentage": 72.72, "elapsed_time": "0:13:56", "remaining_time": "0:05:13", "throughput": 3233.91, "total_tokens": 2705344} +{"current_steps": 5515, "total_steps": 7577, "loss": 0.0728, "lr": 4.18681466431361e-07, "epoch": 0.7278606308565395, "percentage": 72.79, "elapsed_time": "0:13:56", "remaining_time": "0:05:12", "throughput": 3235.41, "total_tokens": 2707520} +{"current_steps": 5520, "total_steps": 7577, "loss": 0.105, "lr": 4.168086602433055e-07, "epoch": 0.728520522634288, "percentage": 72.85, "elapsed_time": "0:13:57", "remaining_time": "0:05:11", "throughput": 3237.13, "total_tokens": 2709888} +{"current_steps": 5525, "total_steps": 7577, "loss": 0.1888, "lr": 4.1493894868892676e-07, "epoch": 0.7291804144120364, "percentage": 72.92, "elapsed_time": "0:13:57", "remaining_time": "0:05:11", "throughput": 3238.79, "total_tokens": 2712192} +{"current_steps": 5530, "total_steps": 7577, "loss": 0.0838, "lr": 4.1307234168962093e-07, "epoch": 0.7298403061897849, "percentage": 72.98, "elapsed_time": "0:13:57", "remaining_time": "0:05:10", "throughput": 3240.29, "total_tokens": 2714368} +{"current_steps": 5535, "total_steps": 7577, "loss": 0.0014, "lr": 4.112088491503095e-07, "epoch": 0.7305001979675333, "percentage": 73.05, "elapsed_time": "0:13:57", "remaining_time": "0:05:09", "throughput": 3241.87, "total_tokens": 2716608} +{"current_steps": 5540, "total_steps": 7577, "loss": 0.001, "lr": 4.0934848095938937e-07, "epoch": 0.7311600897452818, "percentage": 73.12, "elapsed_time": "0:13:58", "remaining_time": "0:05:08", "throughput": 3243.22, "total_tokens": 2718656} +{"current_steps": 5545, "total_steps": 7577, "loss": 0.098, "lr": 4.074912469886763e-07, "epoch": 0.7318199815230302, "percentage": 73.18, "elapsed_time": "0:13:58", "remaining_time": "0:05:07", "throughput": 3245.08, "total_tokens": 2721152} +{"current_steps": 5550, "total_steps": 7577, "loss": 0.0009, "lr": 4.0563715709335657e-07, "epoch": 0.7324798733007787, "percentage": 73.25, "elapsed_time": "0:13:58", "remaining_time": "0:05:06", "throughput": 3246.5, "total_tokens": 2723264} +{"current_steps": 5555, "total_steps": 7577, "loss": 0.2022, "lr": 4.037862211119315e-07, "epoch": 0.7331397650785271, "percentage": 73.31, "elapsed_time": "0:13:59", "remaining_time": "0:05:05", "throughput": 3248.15, "total_tokens": 2725568} +{"current_steps": 5560, "total_steps": 7577, "loss": 0.0389, "lr": 4.0193844886616715e-07, "epoch": 0.7337996568562756, "percentage": 73.38, "elapsed_time": "0:13:59", "remaining_time": "0:05:04", "throughput": 3250.17, "total_tokens": 2728192} +{"current_steps": 5565, "total_steps": 7577, "loss": 0.1632, "lr": 4.0009385016104137e-07, "epoch": 0.7344595486340241, "percentage": 73.45, "elapsed_time": "0:13:59", "remaining_time": "0:05:03", "throughput": 3252.46, "total_tokens": 2731072} +{"current_steps": 5570, "total_steps": 7577, "loss": 0.1455, "lr": 3.9825243478469164e-07, "epoch": 0.7351194404117725, "percentage": 73.51, "elapsed_time": "0:13:59", "remaining_time": "0:05:02", "throughput": 3254.17, "total_tokens": 2733440} +{"current_steps": 5575, "total_steps": 7577, "loss": 0.1211, "lr": 3.9641421250836484e-07, "epoch": 0.735779332189521, "percentage": 73.58, "elapsed_time": "0:14:00", "remaining_time": "0:05:01", "throughput": 3256.18, "total_tokens": 2736064} +{"current_steps": 5580, "total_steps": 7577, "loss": 0.0356, "lr": 3.945791930863622e-07, "epoch": 0.7364392239672694, "percentage": 73.64, "elapsed_time": "0:14:00", "remaining_time": "0:05:00", "throughput": 3257.97, "total_tokens": 2738496} +{"current_steps": 5585, "total_steps": 7577, "loss": 0.002, "lr": 3.9274738625599137e-07, "epoch": 0.7370991157450179, "percentage": 73.71, "elapsed_time": "0:14:00", "remaining_time": "0:04:59", "throughput": 3259.62, "total_tokens": 2740800} +{"current_steps": 5590, "total_steps": 7577, "loss": 0.0746, "lr": 3.909188017375112e-07, "epoch": 0.7377590075227662, "percentage": 73.78, "elapsed_time": "0:14:01", "remaining_time": "0:04:58", "throughput": 3261.25, "total_tokens": 2743104} +{"current_steps": 5595, "total_steps": 7577, "loss": 0.1553, "lr": 3.890934492340819e-07, "epoch": 0.7384188993005147, "percentage": 73.84, "elapsed_time": "0:14:01", "remaining_time": "0:04:58", "throughput": 3262.81, "total_tokens": 2745344} +{"current_steps": 5600, "total_steps": 7577, "loss": 0.062, "lr": 3.872713384317147e-07, "epoch": 0.7390787910782631, "percentage": 73.91, "elapsed_time": "0:14:01", "remaining_time": "0:04:57", "throughput": 3264.3, "total_tokens": 2747520} +{"current_steps": 5605, "total_steps": 7577, "loss": 0.1382, "lr": 3.8545247899921776e-07, "epoch": 0.7397386828560116, "percentage": 73.97, "elapsed_time": "0:14:01", "remaining_time": "0:04:56", "throughput": 3266.16, "total_tokens": 2750016} +{"current_steps": 5610, "total_steps": 7577, "loss": 0.1139, "lr": 3.8363688058814614e-07, "epoch": 0.74039857463376, "percentage": 74.04, "elapsed_time": "0:14:02", "remaining_time": "0:04:55", "throughput": 3268.24, "total_tokens": 2752704} +{"current_steps": 5615, "total_steps": 7577, "loss": 0.1544, "lr": 3.818245528327526e-07, "epoch": 0.7410584664115085, "percentage": 74.11, "elapsed_time": "0:14:02", "remaining_time": "0:04:54", "throughput": 3270.23, "total_tokens": 2755328} +{"current_steps": 5620, "total_steps": 7577, "loss": 0.0911, "lr": 3.8001550534993164e-07, "epoch": 0.7417183581892569, "percentage": 74.17, "elapsed_time": "0:14:02", "remaining_time": "0:04:53", "throughput": 3271.85, "total_tokens": 2757632} +{"current_steps": 5625, "total_steps": 7577, "loss": 0.0665, "lr": 3.7820974773917413e-07, "epoch": 0.7423782499670054, "percentage": 74.24, "elapsed_time": "0:14:03", "remaining_time": "0:04:52", "throughput": 3273.78, "total_tokens": 2760192} +{"current_steps": 5630, "total_steps": 7577, "loss": 0.001, "lr": 3.764072895825117e-07, "epoch": 0.7430381417447539, "percentage": 74.3, "elapsed_time": "0:14:03", "remaining_time": "0:04:51", "throughput": 3275.77, "total_tokens": 2762816} +{"current_steps": 5635, "total_steps": 7577, "loss": 0.0625, "lr": 3.7460814044446934e-07, "epoch": 0.7436980335225023, "percentage": 74.37, "elapsed_time": "0:14:03", "remaining_time": "0:04:50", "throughput": 3277.4, "total_tokens": 2765120} +{"current_steps": 5640, "total_steps": 7577, "loss": 0.0989, "lr": 3.72812309872012e-07, "epoch": 0.7443579253002508, "percentage": 74.44, "elapsed_time": "0:14:03", "remaining_time": "0:04:49", "throughput": 3279.45, "total_tokens": 2767808} +{"current_steps": 5645, "total_steps": 7577, "loss": 0.0518, "lr": 3.71019807394495e-07, "epoch": 0.7450178170779992, "percentage": 74.5, "elapsed_time": "0:14:04", "remaining_time": "0:04:48", "throughput": 3281.12, "total_tokens": 2770176} +{"current_steps": 5650, "total_steps": 7577, "loss": 0.0983, "lr": 3.6923064252361505e-07, "epoch": 0.7456777088557477, "percentage": 74.57, "elapsed_time": "0:14:04", "remaining_time": "0:04:48", "throughput": 3282.97, "total_tokens": 2772672} +{"current_steps": 5655, "total_steps": 7577, "loss": 0.1089, "lr": 3.674448247533561e-07, "epoch": 0.7463376006334961, "percentage": 74.63, "elapsed_time": "0:14:04", "remaining_time": "0:04:47", "throughput": 3284.74, "total_tokens": 2775104} +{"current_steps": 5660, "total_steps": 7577, "loss": 0.2327, "lr": 3.656623635599432e-07, "epoch": 0.7469974924112446, "percentage": 74.7, "elapsed_time": "0:14:05", "remaining_time": "0:04:46", "throughput": 3286.8, "total_tokens": 2777792} +{"current_steps": 5665, "total_steps": 7577, "loss": 0.1313, "lr": 3.6388326840178865e-07, "epoch": 0.747657384188993, "percentage": 74.77, "elapsed_time": "0:14:05", "remaining_time": "0:04:45", "throughput": 3288.78, "total_tokens": 2780416} +{"current_steps": 5670, "total_steps": 7577, "loss": 0.0056, "lr": 3.621075487194435e-07, "epoch": 0.7483172759667415, "percentage": 74.83, "elapsed_time": "0:14:05", "remaining_time": "0:04:44", "throughput": 3290.99, "total_tokens": 2783232} +{"current_steps": 5675, "total_steps": 7577, "loss": 0.1054, "lr": 3.603352139355483e-07, "epoch": 0.7489771677444899, "percentage": 74.9, "elapsed_time": "0:14:05", "remaining_time": "0:04:43", "throughput": 3292.76, "total_tokens": 2785664} +{"current_steps": 5680, "total_steps": 7577, "loss": 0.0984, "lr": 3.58566273454781e-07, "epoch": 0.7496370595222384, "percentage": 74.96, "elapsed_time": "0:14:06", "remaining_time": "0:04:42", "throughput": 3294.67, "total_tokens": 2788224} +{"current_steps": 5685, "total_steps": 7577, "loss": 0.0014, "lr": 3.5680073666380817e-07, "epoch": 0.7502969512999867, "percentage": 75.03, "elapsed_time": "0:14:06", "remaining_time": "0:04:41", "throughput": 3296.43, "total_tokens": 2790656} +{"current_steps": 5685, "total_steps": 7577, "eval_loss": 0.0956902727484703, "epoch": 0.7502969512999867, "percentage": 75.03, "elapsed_time": "0:14:14", "remaining_time": "0:04:44", "throughput": 3267.12, "total_tokens": 2790656} +{"current_steps": 5690, "total_steps": 7577, "loss": 0.1594, "lr": 3.5503861293123514e-07, "epoch": 0.7509568430777352, "percentage": 75.1, "elapsed_time": "0:15:12", "remaining_time": "0:05:02", "throughput": 3060.24, "total_tokens": 2792960} +{"current_steps": 5695, "total_steps": 7577, "loss": 0.0789, "lr": 3.532799116075571e-07, "epoch": 0.7516167348554837, "percentage": 75.16, "elapsed_time": "0:15:12", "remaining_time": "0:05:01", "throughput": 3062.2, "total_tokens": 2795648} +{"current_steps": 5700, "total_steps": 7577, "loss": 0.098, "lr": 3.5152464202510777e-07, "epoch": 0.7522766266332321, "percentage": 75.23, "elapsed_time": "0:15:13", "remaining_time": "0:05:00", "throughput": 3063.49, "total_tokens": 2797696} +{"current_steps": 5705, "total_steps": 7577, "loss": 0.1334, "lr": 3.4977281349801056e-07, "epoch": 0.7529365184109806, "percentage": 75.29, "elapsed_time": "0:15:13", "remaining_time": "0:04:59", "throughput": 3065.25, "total_tokens": 2800192} +{"current_steps": 5710, "total_steps": 7577, "loss": 0.0133, "lr": 3.4802443532213056e-07, "epoch": 0.753596410188729, "percentage": 75.36, "elapsed_time": "0:15:13", "remaining_time": "0:04:58", "throughput": 3066.87, "total_tokens": 2802560} +{"current_steps": 5715, "total_steps": 7577, "loss": 0.2453, "lr": 3.4627951677502233e-07, "epoch": 0.7542563019664775, "percentage": 75.43, "elapsed_time": "0:15:14", "remaining_time": "0:04:57", "throughput": 3068.55, "total_tokens": 2804992} +{"current_steps": 5720, "total_steps": 7577, "loss": 0.0492, "lr": 3.4453806711588397e-07, "epoch": 0.7549161937442259, "percentage": 75.49, "elapsed_time": "0:15:14", "remaining_time": "0:04:56", "throughput": 3070.1, "total_tokens": 2807296} +{"current_steps": 5725, "total_steps": 7577, "loss": 0.0303, "lr": 3.428000955855054e-07, "epoch": 0.7555760855219744, "percentage": 75.56, "elapsed_time": "0:15:14", "remaining_time": "0:04:55", "throughput": 3072.08, "total_tokens": 2809984} +{"current_steps": 5730, "total_steps": 7577, "loss": 0.0023, "lr": 3.4106561140621983e-07, "epoch": 0.7562359772997228, "percentage": 75.62, "elapsed_time": "0:15:14", "remaining_time": "0:04:54", "throughput": 3074.1, "total_tokens": 2812736} +{"current_steps": 5735, "total_steps": 7577, "loss": 0.1465, "lr": 3.393346237818567e-07, "epoch": 0.7568958690774713, "percentage": 75.69, "elapsed_time": "0:15:15", "remaining_time": "0:04:53", "throughput": 3075.65, "total_tokens": 2815040} +{"current_steps": 5740, "total_steps": 7577, "loss": 0.1114, "lr": 3.3760714189769015e-07, "epoch": 0.7575557608552197, "percentage": 75.76, "elapsed_time": "0:15:15", "remaining_time": "0:04:53", "throughput": 3077.2, "total_tokens": 2817344} +{"current_steps": 5745, "total_steps": 7577, "loss": 0.0357, "lr": 3.3588317492039266e-07, "epoch": 0.7582156526329682, "percentage": 75.82, "elapsed_time": "0:15:15", "remaining_time": "0:04:52", "throughput": 3078.75, "total_tokens": 2819648} +{"current_steps": 5750, "total_steps": 7577, "loss": 0.1254, "lr": 3.341627319979834e-07, "epoch": 0.7588755444107167, "percentage": 75.89, "elapsed_time": "0:15:16", "remaining_time": "0:04:51", "throughput": 3080.82, "total_tokens": 2822464} +{"current_steps": 5755, "total_steps": 7577, "loss": 0.1943, "lr": 3.324458222597839e-07, "epoch": 0.7595354361884651, "percentage": 75.95, "elapsed_time": "0:15:16", "remaining_time": "0:04:50", "throughput": 3082.5, "total_tokens": 2824896} +{"current_steps": 5760, "total_steps": 7577, "loss": 0.0749, "lr": 3.307324548163657e-07, "epoch": 0.7601953279662136, "percentage": 76.02, "elapsed_time": "0:15:16", "remaining_time": "0:04:49", "throughput": 3084.52, "total_tokens": 2827648} +{"current_steps": 5765, "total_steps": 7577, "loss": 0.114, "lr": 3.2902263875950374e-07, "epoch": 0.760855219743962, "percentage": 76.09, "elapsed_time": "0:15:17", "remaining_time": "0:04:48", "throughput": 3086.48, "total_tokens": 2830336} +{"current_steps": 5770, "total_steps": 7577, "loss": 0.0462, "lr": 3.2731638316212894e-07, "epoch": 0.7615151115217105, "percentage": 76.15, "elapsed_time": "0:15:17", "remaining_time": "0:04:47", "throughput": 3088.03, "total_tokens": 2832640} +{"current_steps": 5775, "total_steps": 7577, "loss": 0.049, "lr": 3.256136970782782e-07, "epoch": 0.7621750032994589, "percentage": 76.22, "elapsed_time": "0:15:17", "remaining_time": "0:04:46", "throughput": 3089.5, "total_tokens": 2834880} +{"current_steps": 5780, "total_steps": 7577, "loss": 0.1447, "lr": 3.23914589543047e-07, "epoch": 0.7628348950772074, "percentage": 76.28, "elapsed_time": "0:15:17", "remaining_time": "0:04:45", "throughput": 3091.31, "total_tokens": 2837440} +{"current_steps": 5785, "total_steps": 7577, "loss": 0.0424, "lr": 3.2221906957254276e-07, "epoch": 0.7634947868549558, "percentage": 76.35, "elapsed_time": "0:15:18", "remaining_time": "0:04:44", "throughput": 3092.91, "total_tokens": 2839808} +{"current_steps": 5790, "total_steps": 7577, "loss": 0.1412, "lr": 3.205271461638346e-07, "epoch": 0.7641546786327043, "percentage": 76.42, "elapsed_time": "0:15:18", "remaining_time": "0:04:43", "throughput": 3094.79, "total_tokens": 2842432} +{"current_steps": 5795, "total_steps": 7577, "loss": 0.1313, "lr": 3.188388282949085e-07, "epoch": 0.7648145704104526, "percentage": 76.48, "elapsed_time": "0:15:18", "remaining_time": "0:04:42", "throughput": 3096.74, "total_tokens": 2845120} +{"current_steps": 5800, "total_steps": 7577, "loss": 0.1633, "lr": 3.171541249246166e-07, "epoch": 0.7654744621882011, "percentage": 76.55, "elapsed_time": "0:15:19", "remaining_time": "0:04:41", "throughput": 3098.88, "total_tokens": 2848000} +{"current_steps": 5805, "total_steps": 7577, "loss": 0.161, "lr": 3.154730449926316e-07, "epoch": 0.7661343539659495, "percentage": 76.61, "elapsed_time": "0:15:19", "remaining_time": "0:04:40", "throughput": 3100.75, "total_tokens": 2850624} +{"current_steps": 5810, "total_steps": 7577, "loss": 0.121, "lr": 3.137955974194e-07, "epoch": 0.766794245743698, "percentage": 76.68, "elapsed_time": "0:15:19", "remaining_time": "0:04:39", "throughput": 3102.35, "total_tokens": 2852992} +{"current_steps": 5815, "total_steps": 7577, "loss": 0.1251, "lr": 3.1212179110609125e-07, "epoch": 0.7674541375214465, "percentage": 76.75, "elapsed_time": "0:15:19", "remaining_time": "0:04:38", "throughput": 3104.02, "total_tokens": 2855424} +{"current_steps": 5820, "total_steps": 7577, "loss": 0.137, "lr": 3.104516349345553e-07, "epoch": 0.7681140292991949, "percentage": 76.81, "elapsed_time": "0:15:20", "remaining_time": "0:04:37", "throughput": 3104.93, "total_tokens": 2857984} +{"current_steps": 5825, "total_steps": 7577, "loss": 0.0643, "lr": 3.0878513776727144e-07, "epoch": 0.7687739210769434, "percentage": 76.88, "elapsed_time": "0:15:20", "remaining_time": "0:04:36", "throughput": 3106.87, "total_tokens": 2860672} +{"current_steps": 5830, "total_steps": 7577, "loss": 0.1726, "lr": 3.0712230844730414e-07, "epoch": 0.7694338128546918, "percentage": 76.94, "elapsed_time": "0:15:21", "remaining_time": "0:04:35", "throughput": 3108.46, "total_tokens": 2863040} +{"current_steps": 5835, "total_steps": 7577, "loss": 0.0704, "lr": 3.054631557982539e-07, "epoch": 0.7700937046324403, "percentage": 77.01, "elapsed_time": "0:15:21", "remaining_time": "0:04:35", "throughput": 3110.53, "total_tokens": 2865856} +{"current_steps": 5840, "total_steps": 7577, "loss": 0.1005, "lr": 3.0380768862421156e-07, "epoch": 0.7707535964101887, "percentage": 77.08, "elapsed_time": "0:15:21", "remaining_time": "0:04:34", "throughput": 3111.98, "total_tokens": 2868096} +{"current_steps": 5845, "total_steps": 7577, "loss": 0.0013, "lr": 3.0215591570971234e-07, "epoch": 0.7714134881879372, "percentage": 77.14, "elapsed_time": "0:15:21", "remaining_time": "0:04:33", "throughput": 3113.92, "total_tokens": 2870784} +{"current_steps": 5850, "total_steps": 7577, "loss": 0.0712, "lr": 3.005078458196868e-07, "epoch": 0.7720733799656856, "percentage": 77.21, "elapsed_time": "0:15:22", "remaining_time": "0:04:32", "throughput": 3115.59, "total_tokens": 2873216} +{"current_steps": 5855, "total_steps": 7577, "loss": 0.0011, "lr": 2.988634876994175e-07, "epoch": 0.7727332717434341, "percentage": 77.27, "elapsed_time": "0:15:22", "remaining_time": "0:04:31", "throughput": 3117.38, "total_tokens": 2875776} +{"current_steps": 5860, "total_steps": 7577, "loss": 0.0336, "lr": 2.972228500744898e-07, "epoch": 0.7733931635211825, "percentage": 77.34, "elapsed_time": "0:15:22", "remaining_time": "0:04:30", "throughput": 3119.17, "total_tokens": 2878336} +{"current_steps": 5865, "total_steps": 7577, "loss": 0.1431, "lr": 2.955859416507467e-07, "epoch": 0.774053055298931, "percentage": 77.41, "elapsed_time": "0:15:23", "remaining_time": "0:04:29", "throughput": 3120.97, "total_tokens": 2880896} +{"current_steps": 5870, "total_steps": 7577, "loss": 0.0684, "lr": 2.9395277111424357e-07, "epoch": 0.7747129470766794, "percentage": 77.47, "elapsed_time": "0:15:23", "remaining_time": "0:04:28", "throughput": 3122.96, "total_tokens": 2883648} +{"current_steps": 5875, "total_steps": 7577, "loss": 0.0016, "lr": 2.9232334713120035e-07, "epoch": 0.7753728388544279, "percentage": 77.54, "elapsed_time": "0:15:23", "remaining_time": "0:04:27", "throughput": 3124.48, "total_tokens": 2885952} +{"current_steps": 5880, "total_steps": 7577, "loss": 0.0614, "lr": 2.9069767834795655e-07, "epoch": 0.7760327306321764, "percentage": 77.6, "elapsed_time": "0:15:23", "remaining_time": "0:04:26", "throughput": 3126.34, "total_tokens": 2888576} +{"current_steps": 5885, "total_steps": 7577, "loss": 0.1331, "lr": 2.8907577339092483e-07, "epoch": 0.7766926224099248, "percentage": 77.67, "elapsed_time": "0:15:24", "remaining_time": "0:04:25", "throughput": 3128.14, "total_tokens": 2891136} +{"current_steps": 5890, "total_steps": 7577, "loss": 0.0711, "lr": 2.8745764086654654e-07, "epoch": 0.7773525141876733, "percentage": 77.74, "elapsed_time": "0:15:24", "remaining_time": "0:04:24", "throughput": 3129.94, "total_tokens": 2893696} +{"current_steps": 5895, "total_steps": 7577, "loss": 0.0499, "lr": 2.8584328936124424e-07, "epoch": 0.7780124059654216, "percentage": 77.8, "elapsed_time": "0:15:24", "remaining_time": "0:04:23", "throughput": 3131.99, "total_tokens": 2896512} +{"current_steps": 5900, "total_steps": 7577, "loss": 0.1805, "lr": 2.8423272744137674e-07, "epoch": 0.7786722977431701, "percentage": 77.87, "elapsed_time": "0:15:25", "remaining_time": "0:04:22", "throughput": 3133.72, "total_tokens": 2899008} +{"current_steps": 5905, "total_steps": 7577, "loss": 0.0781, "lr": 2.82625963653195e-07, "epoch": 0.7793321895209185, "percentage": 77.93, "elapsed_time": "0:15:25", "remaining_time": "0:04:22", "throughput": 3135.32, "total_tokens": 2901376} +{"current_steps": 5910, "total_steps": 7577, "loss": 0.1989, "lr": 2.810230065227944e-07, "epoch": 0.779992081298667, "percentage": 78.0, "elapsed_time": "0:15:25", "remaining_time": "0:04:21", "throughput": 3137.05, "total_tokens": 2903872} +{"current_steps": 5915, "total_steps": 7577, "loss": 0.0016, "lr": 2.7942386455607203e-07, "epoch": 0.7806519730764154, "percentage": 78.07, "elapsed_time": "0:15:25", "remaining_time": "0:04:20", "throughput": 3138.65, "total_tokens": 2906240} +{"current_steps": 5920, "total_steps": 7577, "loss": 0.0695, "lr": 2.77828546238679e-07, "epoch": 0.7813118648541639, "percentage": 78.13, "elapsed_time": "0:15:26", "remaining_time": "0:04:19", "throughput": 3140.38, "total_tokens": 2908736} +{"current_steps": 5925, "total_steps": 7577, "loss": 0.1347, "lr": 2.762370600359774e-07, "epoch": 0.7819717566319123, "percentage": 78.2, "elapsed_time": "0:15:26", "remaining_time": "0:04:18", "throughput": 3141.97, "total_tokens": 2911104} +{"current_steps": 5930, "total_steps": 7577, "loss": 0.0614, "lr": 2.7464941439299484e-07, "epoch": 0.7826316484096608, "percentage": 78.26, "elapsed_time": "0:15:26", "remaining_time": "0:04:17", "throughput": 3143.57, "total_tokens": 2913472} +{"current_steps": 5935, "total_steps": 7577, "loss": 0.1583, "lr": 2.7306561773437887e-07, "epoch": 0.7832915401874093, "percentage": 78.33, "elapsed_time": "0:15:27", "remaining_time": "0:04:16", "throughput": 3145.13, "total_tokens": 2915840} +{"current_steps": 5940, "total_steps": 7577, "loss": 0.0047, "lr": 2.714856784643533e-07, "epoch": 0.7839514319651577, "percentage": 78.4, "elapsed_time": "0:15:27", "remaining_time": "0:04:15", "throughput": 3146.64, "total_tokens": 2918144} +{"current_steps": 5945, "total_steps": 7577, "loss": 0.1479, "lr": 2.6990960496667313e-07, "epoch": 0.7846113237429062, "percentage": 78.46, "elapsed_time": "0:15:27", "remaining_time": "0:04:14", "throughput": 3148.51, "total_tokens": 2920768} +{"current_steps": 5950, "total_steps": 7577, "loss": 0.067, "lr": 2.6833740560457976e-07, "epoch": 0.7852712155206546, "percentage": 78.53, "elapsed_time": "0:15:27", "remaining_time": "0:04:13", "throughput": 3150.09, "total_tokens": 2923136} +{"current_steps": 5955, "total_steps": 7577, "loss": 0.0702, "lr": 2.6676908872075757e-07, "epoch": 0.7859311072984031, "percentage": 78.59, "elapsed_time": "0:15:28", "remaining_time": "0:04:12", "throughput": 3151.74, "total_tokens": 2925568} +{"current_steps": 5960, "total_steps": 7577, "loss": 0.0576, "lr": 2.6520466263728836e-07, "epoch": 0.7865909990761515, "percentage": 78.66, "elapsed_time": "0:15:28", "remaining_time": "0:04:11", "throughput": 3153.46, "total_tokens": 2928064} +{"current_steps": 5965, "total_steps": 7577, "loss": 0.2178, "lr": 2.636441356556087e-07, "epoch": 0.7872508908539, "percentage": 78.73, "elapsed_time": "0:15:28", "remaining_time": "0:04:11", "throughput": 3154.98, "total_tokens": 2930368} +{"current_steps": 5970, "total_steps": 7577, "loss": 0.1005, "lr": 2.620875160564645e-07, "epoch": 0.7879107826316484, "percentage": 78.79, "elapsed_time": "0:15:29", "remaining_time": "0:04:10", "throughput": 3156.76, "total_tokens": 2932928} +{"current_steps": 5975, "total_steps": 7577, "loss": 0.418, "lr": 2.6053481209986715e-07, "epoch": 0.7885706744093969, "percentage": 78.86, "elapsed_time": "0:15:29", "remaining_time": "0:04:09", "throughput": 3158.41, "total_tokens": 2935360} +{"current_steps": 5980, "total_steps": 7577, "loss": 0.059, "lr": 2.5898603202505155e-07, "epoch": 0.7892305661871453, "percentage": 78.92, "elapsed_time": "0:15:29", "remaining_time": "0:04:08", "throughput": 3160.19, "total_tokens": 2937920} +{"current_steps": 5985, "total_steps": 7577, "loss": 0.0502, "lr": 2.5744118405042923e-07, "epoch": 0.7898904579648938, "percentage": 78.99, "elapsed_time": "0:15:29", "remaining_time": "0:04:07", "throughput": 3161.71, "total_tokens": 2940224} +{"current_steps": 5990, "total_steps": 7577, "loss": 0.0017, "lr": 2.559002763735485e-07, "epoch": 0.7905503497426422, "percentage": 79.06, "elapsed_time": "0:15:30", "remaining_time": "0:04:06", "throughput": 3163.56, "total_tokens": 2942848} +{"current_steps": 5995, "total_steps": 7577, "loss": 0.0591, "lr": 2.543633171710472e-07, "epoch": 0.7912102415203907, "percentage": 79.12, "elapsed_time": "0:15:30", "remaining_time": "0:04:05", "throughput": 3165.28, "total_tokens": 2945344} +{"current_steps": 6000, "total_steps": 7577, "loss": 0.0162, "lr": 2.5283031459861205e-07, "epoch": 0.7918701332981392, "percentage": 79.19, "elapsed_time": "0:15:30", "remaining_time": "0:04:04", "throughput": 3167.0, "total_tokens": 2947840} +{"current_steps": 6005, "total_steps": 7577, "loss": 0.0344, "lr": 2.5130127679093396e-07, "epoch": 0.7925300250758875, "percentage": 79.25, "elapsed_time": "0:15:31", "remaining_time": "0:04:03", "throughput": 3168.5, "total_tokens": 2950144} +{"current_steps": 6010, "total_steps": 7577, "loss": 0.0428, "lr": 2.497762118616652e-07, "epoch": 0.793189916853636, "percentage": 79.32, "elapsed_time": "0:15:31", "remaining_time": "0:04:02", "throughput": 3169.94, "total_tokens": 2952384} +{"current_steps": 6015, "total_steps": 7577, "loss": 0.0788, "lr": 2.4825512790337745e-07, "epoch": 0.7938498086313844, "percentage": 79.38, "elapsed_time": "0:15:31", "remaining_time": "0:04:01", "throughput": 3171.9, "total_tokens": 2955136} +{"current_steps": 6020, "total_steps": 7577, "loss": 0.0441, "lr": 2.467380329875163e-07, "epoch": 0.7945097004091329, "percentage": 79.45, "elapsed_time": "0:15:31", "remaining_time": "0:04:01", "throughput": 3173.8, "total_tokens": 2957824} +{"current_steps": 6025, "total_steps": 7577, "loss": 0.0038, "lr": 2.452249351643615e-07, "epoch": 0.7951695921868813, "percentage": 79.52, "elapsed_time": "0:15:32", "remaining_time": "0:04:00", "throughput": 3175.44, "total_tokens": 2960256} +{"current_steps": 6030, "total_steps": 7577, "loss": 0.0672, "lr": 2.437158424629817e-07, "epoch": 0.7958294839646298, "percentage": 79.58, "elapsed_time": "0:15:32", "remaining_time": "0:03:59", "throughput": 3177.35, "total_tokens": 2962944} +{"current_steps": 6035, "total_steps": 7577, "loss": 0.2047, "lr": 2.422107628911929e-07, "epoch": 0.7964893757423782, "percentage": 79.65, "elapsed_time": "0:15:32", "remaining_time": "0:03:58", "throughput": 3179.12, "total_tokens": 2965504} +{"current_steps": 6040, "total_steps": 7577, "loss": 0.2335, "lr": 2.4070970443551673e-07, "epoch": 0.7971492675201267, "percentage": 79.71, "elapsed_time": "0:15:33", "remaining_time": "0:03:57", "throughput": 3180.56, "total_tokens": 2967744} +{"current_steps": 6045, "total_steps": 7577, "loss": 0.0017, "lr": 2.392126750611362e-07, "epoch": 0.7978091592978751, "percentage": 79.78, "elapsed_time": "0:15:33", "remaining_time": "0:03:56", "throughput": 3182.26, "total_tokens": 2970240} +{"current_steps": 6050, "total_steps": 7577, "loss": 0.1777, "lr": 2.3771968271185538e-07, "epoch": 0.7984690510756236, "percentage": 79.85, "elapsed_time": "0:15:33", "remaining_time": "0:03:55", "throughput": 3184.15, "total_tokens": 2972928} +{"current_steps": 6055, "total_steps": 7577, "loss": 0.1485, "lr": 2.3623073531005579e-07, "epoch": 0.799128942853372, "percentage": 79.91, "elapsed_time": "0:15:33", "remaining_time": "0:03:54", "throughput": 3185.58, "total_tokens": 2975168} +{"current_steps": 6060, "total_steps": 7577, "loss": 0.1294, "lr": 2.3474584075665493e-07, "epoch": 0.7997888346311205, "percentage": 79.98, "elapsed_time": "0:15:34", "remaining_time": "0:03:53", "throughput": 3187.01, "total_tokens": 2977408} +{"current_steps": 6064, "total_steps": 7577, "eval_loss": 0.0954766720533371, "epoch": 0.8003167480533192, "percentage": 80.03, "elapsed_time": "0:15:41", "remaining_time": "0:03:55", "throughput": 3162.96, "total_tokens": 2979456} +{"current_steps": 6065, "total_steps": 7577, "loss": 0.0013, "lr": 2.3326500693106533e-07, "epoch": 0.800448726408869, "percentage": 80.04, "elapsed_time": "0:16:06", "remaining_time": "0:04:00", "throughput": 3082.97, "total_tokens": 2979968} +{"current_steps": 6070, "total_steps": 7577, "loss": 0.209, "lr": 2.3178824169114975e-07, "epoch": 0.8011086181866174, "percentage": 80.11, "elapsed_time": "0:16:06", "remaining_time": "0:04:00", "throughput": 3084.7, "total_tokens": 2982528} +{"current_steps": 6075, "total_steps": 7577, "loss": 0.0494, "lr": 2.303155528731837e-07, "epoch": 0.8017685099643659, "percentage": 80.18, "elapsed_time": "0:16:07", "remaining_time": "0:03:59", "throughput": 3086.18, "total_tokens": 2984832} +{"current_steps": 6080, "total_steps": 7577, "loss": 0.0014, "lr": 2.2884694829181016e-07, "epoch": 0.8024284017421143, "percentage": 80.24, "elapsed_time": "0:16:07", "remaining_time": "0:03:58", "throughput": 3087.82, "total_tokens": 2987328} +{"current_steps": 6085, "total_steps": 7577, "loss": 0.0083, "lr": 2.273824357400005e-07, "epoch": 0.8030882935198628, "percentage": 80.31, "elapsed_time": "0:16:07", "remaining_time": "0:03:57", "throughput": 3089.42, "total_tokens": 2989760} +{"current_steps": 6090, "total_steps": 7577, "loss": 0.0188, "lr": 2.2592202298901174e-07, "epoch": 0.8037481852976112, "percentage": 80.37, "elapsed_time": "0:16:08", "remaining_time": "0:03:56", "throughput": 3091.14, "total_tokens": 2992320} +{"current_steps": 6095, "total_steps": 7577, "loss": 0.0014, "lr": 2.2446571778834555e-07, "epoch": 0.8044080770753597, "percentage": 80.44, "elapsed_time": "0:16:08", "remaining_time": "0:03:55", "throughput": 3093.13, "total_tokens": 2995136} +{"current_steps": 6100, "total_steps": 7577, "loss": 0.0009, "lr": 2.2301352786570827e-07, "epoch": 0.805067968853108, "percentage": 80.51, "elapsed_time": "0:16:08", "remaining_time": "0:03:54", "throughput": 3095.17, "total_tokens": 2998016} +{"current_steps": 6105, "total_steps": 7577, "loss": 0.1425, "lr": 2.215654609269685e-07, "epoch": 0.8057278606308566, "percentage": 80.57, "elapsed_time": "0:16:08", "remaining_time": "0:03:53", "throughput": 3097.15, "total_tokens": 3000832} +{"current_steps": 6110, "total_steps": 7577, "loss": 0.1461, "lr": 2.201215246561161e-07, "epoch": 0.8063877524086049, "percentage": 80.64, "elapsed_time": "0:16:09", "remaining_time": "0:03:52", "throughput": 3099.05, "total_tokens": 3003584} +{"current_steps": 6115, "total_steps": 7577, "loss": 0.0738, "lr": 2.1868172671522357e-07, "epoch": 0.8070476441863534, "percentage": 80.7, "elapsed_time": "0:16:09", "remaining_time": "0:03:51", "throughput": 3101.07, "total_tokens": 3006464} +{"current_steps": 6120, "total_steps": 7577, "loss": 0.0824, "lr": 2.1724607474440216e-07, "epoch": 0.8077075359641019, "percentage": 80.77, "elapsed_time": "0:16:09", "remaining_time": "0:03:50", "throughput": 3102.66, "total_tokens": 3008896} +{"current_steps": 6125, "total_steps": 7577, "loss": 0.1463, "lr": 2.158145763617646e-07, "epoch": 0.8083674277418503, "percentage": 80.84, "elapsed_time": "0:16:10", "remaining_time": "0:03:49", "throughput": 3104.32, "total_tokens": 3011392} +{"current_steps": 6130, "total_steps": 7577, "loss": 0.2764, "lr": 2.1438723916338198e-07, "epoch": 0.8090273195195988, "percentage": 80.9, "elapsed_time": "0:16:10", "remaining_time": "0:03:49", "throughput": 3106.09, "total_tokens": 3014016} +{"current_steps": 6135, "total_steps": 7577, "loss": 0.1715, "lr": 2.1296407072324495e-07, "epoch": 0.8096872112973472, "percentage": 80.97, "elapsed_time": "0:16:10", "remaining_time": "0:03:48", "throughput": 3107.81, "total_tokens": 3016576} +{"current_steps": 6140, "total_steps": 7577, "loss": 0.0432, "lr": 2.1154507859322336e-07, "epoch": 0.8103471030750957, "percentage": 81.03, "elapsed_time": "0:16:10", "remaining_time": "0:03:47", "throughput": 3109.41, "total_tokens": 3019008} +{"current_steps": 6145, "total_steps": 7577, "loss": 0.1229, "lr": 2.101302703030252e-07, "epoch": 0.8110069948528441, "percentage": 81.1, "elapsed_time": "0:16:11", "remaining_time": "0:03:46", "throughput": 3111.07, "total_tokens": 3021504} +{"current_steps": 6150, "total_steps": 7577, "loss": 0.0575, "lr": 2.0871965336015885e-07, "epoch": 0.8116668866305926, "percentage": 81.17, "elapsed_time": "0:16:11", "remaining_time": "0:03:45", "throughput": 3112.27, "total_tokens": 3023552} +{"current_steps": 6155, "total_steps": 7577, "loss": 0.0704, "lr": 2.0731323524989031e-07, "epoch": 0.812326778408341, "percentage": 81.23, "elapsed_time": "0:16:11", "remaining_time": "0:03:44", "throughput": 3113.74, "total_tokens": 3025856} +{"current_steps": 6160, "total_steps": 7577, "loss": 0.2049, "lr": 2.0591102343520616e-07, "epoch": 0.8129866701860895, "percentage": 81.3, "elapsed_time": "0:16:12", "remaining_time": "0:03:43", "throughput": 3115.14, "total_tokens": 3028096} +{"current_steps": 6165, "total_steps": 7577, "loss": 0.159, "lr": 2.0451302535677206e-07, "epoch": 0.8136465619638379, "percentage": 81.36, "elapsed_time": "0:16:12", "remaining_time": "0:03:42", "throughput": 3116.72, "total_tokens": 3030528} +{"current_steps": 6170, "total_steps": 7577, "loss": 0.227, "lr": 2.0311924843289396e-07, "epoch": 0.8143064537415864, "percentage": 81.43, "elapsed_time": "0:16:12", "remaining_time": "0:03:41", "throughput": 3118.43, "total_tokens": 3033088} +{"current_steps": 6175, "total_steps": 7577, "loss": 0.0642, "lr": 2.017297000594794e-07, "epoch": 0.8149663455193348, "percentage": 81.5, "elapsed_time": "0:16:12", "remaining_time": "0:03:40", "throughput": 3119.7, "total_tokens": 3035200} +{"current_steps": 6180, "total_steps": 7577, "loss": 0.0604, "lr": 2.0034438760999696e-07, "epoch": 0.8156262372970833, "percentage": 81.56, "elapsed_time": "0:16:13", "remaining_time": "0:03:39", "throughput": 3121.35, "total_tokens": 3037696} +{"current_steps": 6185, "total_steps": 7577, "loss": 0.1423, "lr": 1.9896331843543856e-07, "epoch": 0.8162861290748318, "percentage": 81.63, "elapsed_time": "0:16:13", "remaining_time": "0:03:39", "throughput": 3122.94, "total_tokens": 3040128} +{"current_steps": 6190, "total_steps": 7577, "loss": 0.1184, "lr": 1.975864998642789e-07, "epoch": 0.8169460208525802, "percentage": 81.69, "elapsed_time": "0:16:13", "remaining_time": "0:03:38", "throughput": 3124.53, "total_tokens": 3042560} +{"current_steps": 6195, "total_steps": 7577, "loss": 0.2826, "lr": 1.9621393920243767e-07, "epoch": 0.8176059126303287, "percentage": 81.76, "elapsed_time": "0:16:14", "remaining_time": "0:03:37", "throughput": 3125.89, "total_tokens": 3044800} +{"current_steps": 6200, "total_steps": 7577, "loss": 0.1028, "lr": 1.9484564373324074e-07, "epoch": 0.8182658044080771, "percentage": 81.83, "elapsed_time": "0:16:14", "remaining_time": "0:03:36", "throughput": 3127.28, "total_tokens": 3047040} +{"current_steps": 6205, "total_steps": 7577, "loss": 0.0495, "lr": 1.934816207173805e-07, "epoch": 0.8189256961858256, "percentage": 81.89, "elapsed_time": "0:16:14", "remaining_time": "0:03:35", "throughput": 3129.0, "total_tokens": 3049600} +{"current_steps": 6210, "total_steps": 7577, "loss": 0.158, "lr": 1.9212187739287943e-07, "epoch": 0.819585587963574, "percentage": 81.96, "elapsed_time": "0:16:14", "remaining_time": "0:03:34", "throughput": 3130.94, "total_tokens": 3052416} +{"current_steps": 6215, "total_steps": 7577, "loss": 0.0135, "lr": 1.907664209750488e-07, "epoch": 0.8202454797413224, "percentage": 82.02, "elapsed_time": "0:16:15", "remaining_time": "0:03:33", "throughput": 3132.68, "total_tokens": 3055040} +{"current_steps": 6220, "total_steps": 7577, "loss": 0.0446, "lr": 1.8941525865645336e-07, "epoch": 0.8209053715190708, "percentage": 82.09, "elapsed_time": "0:16:15", "remaining_time": "0:03:32", "throughput": 3134.62, "total_tokens": 3057856} +{"current_steps": 6225, "total_steps": 7577, "loss": 0.2045, "lr": 1.8806839760687076e-07, "epoch": 0.8215652632968193, "percentage": 82.16, "elapsed_time": "0:16:15", "remaining_time": "0:03:31", "throughput": 3136.05, "total_tokens": 3060160} +{"current_steps": 6230, "total_steps": 7577, "loss": 0.1205, "lr": 1.867258449732545e-07, "epoch": 0.8222251550745677, "percentage": 82.22, "elapsed_time": "0:16:16", "remaining_time": "0:03:31", "throughput": 3137.58, "total_tokens": 3062592} +{"current_steps": 6235, "total_steps": 7577, "loss": 0.0502, "lr": 1.8538760787969676e-07, "epoch": 0.8228850468523162, "percentage": 82.29, "elapsed_time": "0:16:16", "remaining_time": "0:03:30", "throughput": 3139.21, "total_tokens": 3065088} +{"current_steps": 6240, "total_steps": 7577, "loss": 0.0019, "lr": 1.8405369342738907e-07, "epoch": 0.8235449386300646, "percentage": 82.35, "elapsed_time": "0:16:16", "remaining_time": "0:03:29", "throughput": 3140.97, "total_tokens": 3067712} +{"current_steps": 6245, "total_steps": 7577, "loss": 0.0876, "lr": 1.8272410869458598e-07, "epoch": 0.8242048304078131, "percentage": 82.42, "elapsed_time": "0:16:16", "remaining_time": "0:03:28", "throughput": 3142.53, "total_tokens": 3070144} +{"current_steps": 6250, "total_steps": 7577, "loss": 0.2369, "lr": 1.8139886073656653e-07, "epoch": 0.8248647221855616, "percentage": 82.49, "elapsed_time": "0:16:17", "remaining_time": "0:03:27", "throughput": 3143.96, "total_tokens": 3072448} +{"current_steps": 6255, "total_steps": 7577, "loss": 0.2066, "lr": 1.800779565855971e-07, "epoch": 0.82552461396331, "percentage": 82.55, "elapsed_time": "0:16:17", "remaining_time": "0:03:26", "throughput": 3145.72, "total_tokens": 3075072} +{"current_steps": 6260, "total_steps": 7577, "loss": 0.0029, "lr": 1.7876140325089463e-07, "epoch": 0.8261845057410585, "percentage": 82.62, "elapsed_time": "0:16:17", "remaining_time": "0:03:25", "throughput": 3147.14, "total_tokens": 3077376} +{"current_steps": 6265, "total_steps": 7577, "loss": 0.1344, "lr": 1.774492077185883e-07, "epoch": 0.8268443975188069, "percentage": 82.68, "elapsed_time": "0:16:18", "remaining_time": "0:03:24", "throughput": 3148.7, "total_tokens": 3079808} +{"current_steps": 6270, "total_steps": 7577, "loss": 0.0009, "lr": 1.7614137695168408e-07, "epoch": 0.8275042892965554, "percentage": 82.75, "elapsed_time": "0:16:18", "remaining_time": "0:03:23", "throughput": 3150.58, "total_tokens": 3082560} +{"current_steps": 6275, "total_steps": 7577, "loss": 0.0705, "lr": 1.748379178900261e-07, "epoch": 0.8281641810743038, "percentage": 82.82, "elapsed_time": "0:16:18", "remaining_time": "0:03:23", "throughput": 3151.72, "total_tokens": 3084608} +{"current_steps": 6280, "total_steps": 7577, "loss": 0.228, "lr": 1.7353883745026055e-07, "epoch": 0.8288240728520523, "percentage": 82.88, "elapsed_time": "0:16:18", "remaining_time": "0:03:22", "throughput": 3153.35, "total_tokens": 3087104} +{"current_steps": 6285, "total_steps": 7577, "loss": 0.1102, "lr": 1.722441425257999e-07, "epoch": 0.8294839646298007, "percentage": 82.95, "elapsed_time": "0:16:19", "remaining_time": "0:03:21", "throughput": 3154.76, "total_tokens": 3089408} +{"current_steps": 6290, "total_steps": 7577, "loss": 0.0552, "lr": 1.7095383998678402e-07, "epoch": 0.8301438564075492, "percentage": 83.01, "elapsed_time": "0:16:19", "remaining_time": "0:03:20", "throughput": 3156.23, "total_tokens": 3091776} +{"current_steps": 6295, "total_steps": 7577, "loss": 0.1083, "lr": 1.6966793668004653e-07, "epoch": 0.8308037481852976, "percentage": 83.08, "elapsed_time": "0:16:19", "remaining_time": "0:03:19", "throughput": 3157.79, "total_tokens": 3094208} +{"current_steps": 6300, "total_steps": 7577, "loss": 0.0801, "lr": 1.6838643942907625e-07, "epoch": 0.8314636399630461, "percentage": 83.15, "elapsed_time": "0:16:20", "remaining_time": "0:03:18", "throughput": 3159.48, "total_tokens": 3096768} +{"current_steps": 6305, "total_steps": 7577, "loss": 0.06, "lr": 1.671093550339815e-07, "epoch": 0.8321235317407946, "percentage": 83.21, "elapsed_time": "0:16:20", "remaining_time": "0:03:17", "throughput": 3161.28, "total_tokens": 3099456} +{"current_steps": 6310, "total_steps": 7577, "loss": 0.0046, "lr": 1.6583669027145542e-07, "epoch": 0.832783423518543, "percentage": 83.28, "elapsed_time": "0:16:20", "remaining_time": "0:03:16", "throughput": 3163.13, "total_tokens": 3102208} +{"current_steps": 6315, "total_steps": 7577, "loss": 0.0014, "lr": 1.6456845189473767e-07, "epoch": 0.8334433152962915, "percentage": 83.34, "elapsed_time": "0:16:21", "remaining_time": "0:03:16", "throughput": 3164.94, "total_tokens": 3104896} +{"current_steps": 6320, "total_steps": 7577, "loss": 0.1178, "lr": 1.6330464663358123e-07, "epoch": 0.8341032070740398, "percentage": 83.41, "elapsed_time": "0:16:21", "remaining_time": "0:03:15", "throughput": 3166.68, "total_tokens": 3107520} +{"current_steps": 6325, "total_steps": 7577, "loss": 0.0014, "lr": 1.6204528119421346e-07, "epoch": 0.8347630988517883, "percentage": 83.48, "elapsed_time": "0:16:21", "remaining_time": "0:03:14", "throughput": 3168.41, "total_tokens": 3110144} +{"current_steps": 6330, "total_steps": 7577, "loss": 0.0501, "lr": 1.607903622593042e-07, "epoch": 0.8354229906295367, "percentage": 83.54, "elapsed_time": "0:16:21", "remaining_time": "0:03:13", "throughput": 3170.14, "total_tokens": 3112768} +{"current_steps": 6335, "total_steps": 7577, "loss": 0.0007, "lr": 1.5953989648792743e-07, "epoch": 0.8360828824072852, "percentage": 83.61, "elapsed_time": "0:16:22", "remaining_time": "0:03:12", "throughput": 3171.82, "total_tokens": 3115328} +{"current_steps": 6340, "total_steps": 7577, "loss": 0.0323, "lr": 1.5829389051552678e-07, "epoch": 0.8367427741850336, "percentage": 83.67, "elapsed_time": "0:16:22", "remaining_time": "0:03:11", "throughput": 3173.5, "total_tokens": 3117888} +{"current_steps": 6345, "total_steps": 7577, "loss": 0.038, "lr": 1.5705235095388136e-07, "epoch": 0.8374026659627821, "percentage": 83.74, "elapsed_time": "0:16:22", "remaining_time": "0:03:10", "throughput": 3175.09, "total_tokens": 3120384} +{"current_steps": 6350, "total_steps": 7577, "loss": 0.0436, "lr": 1.5581528439106907e-07, "epoch": 0.8380625577405305, "percentage": 83.81, "elapsed_time": "0:16:23", "remaining_time": "0:03:09", "throughput": 3176.81, "total_tokens": 3123008} +{"current_steps": 6355, "total_steps": 7577, "loss": 0.0796, "lr": 1.5458269739143292e-07, "epoch": 0.838722449518279, "percentage": 83.87, "elapsed_time": "0:16:23", "remaining_time": "0:03:09", "throughput": 3178.4, "total_tokens": 3125504} +{"current_steps": 6360, "total_steps": 7577, "loss": 0.0025, "lr": 1.5335459649554538e-07, "epoch": 0.8393823412960274, "percentage": 83.94, "elapsed_time": "0:16:23", "remaining_time": "0:03:08", "throughput": 3179.75, "total_tokens": 3127744} +{"current_steps": 6365, "total_steps": 7577, "loss": 0.1043, "lr": 1.5213098822017357e-07, "epoch": 0.8400422330737759, "percentage": 84.0, "elapsed_time": "0:16:23", "remaining_time": "0:03:07", "throughput": 3181.16, "total_tokens": 3130048} +{"current_steps": 6370, "total_steps": 7577, "loss": 0.0469, "lr": 1.50911879058246e-07, "epoch": 0.8407021248515244, "percentage": 84.07, "elapsed_time": "0:16:24", "remaining_time": "0:03:06", "throughput": 3182.7, "total_tokens": 3132480} +{"current_steps": 6375, "total_steps": 7577, "loss": 0.1012, "lr": 1.4969727547881628e-07, "epoch": 0.8413620166292728, "percentage": 84.14, "elapsed_time": "0:16:24", "remaining_time": "0:03:05", "throughput": 3184.44, "total_tokens": 3135104} +{"current_steps": 6380, "total_steps": 7577, "loss": 0.1743, "lr": 1.4848718392703052e-07, "epoch": 0.8420219084070213, "percentage": 84.2, "elapsed_time": "0:16:24", "remaining_time": "0:03:04", "throughput": 3185.77, "total_tokens": 3137344} +{"current_steps": 6385, "total_steps": 7577, "loss": 0.1728, "lr": 1.472816108240915e-07, "epoch": 0.8426818001847697, "percentage": 84.27, "elapsed_time": "0:16:25", "remaining_time": "0:03:03", "throughput": 3187.63, "total_tokens": 3140096} +{"current_steps": 6390, "total_steps": 7577, "loss": 0.0782, "lr": 1.46080562567226e-07, "epoch": 0.8433416919625182, "percentage": 84.33, "elapsed_time": "0:16:25", "remaining_time": "0:03:03", "throughput": 3189.04, "total_tokens": 3142400} +{"current_steps": 6395, "total_steps": 7577, "loss": 0.0657, "lr": 1.4488404552964993e-07, "epoch": 0.8440015837402666, "percentage": 84.4, "elapsed_time": "0:16:25", "remaining_time": "0:03:02", "throughput": 3190.27, "total_tokens": 3144512} +{"current_steps": 6400, "total_steps": 7577, "loss": 0.0303, "lr": 1.4369206606053463e-07, "epoch": 0.8446614755180151, "percentage": 84.47, "elapsed_time": "0:16:25", "remaining_time": "0:03:01", "throughput": 3191.82, "total_tokens": 3146944} +{"current_steps": 6405, "total_steps": 7577, "loss": 0.0816, "lr": 1.425046304849742e-07, "epoch": 0.8453213672957635, "percentage": 84.53, "elapsed_time": "0:16:26", "remaining_time": "0:03:00", "throughput": 3193.35, "total_tokens": 3149376} +{"current_steps": 6410, "total_steps": 7577, "loss": 0.1094, "lr": 1.4132174510395024e-07, "epoch": 0.845981259073512, "percentage": 84.6, "elapsed_time": "0:16:26", "remaining_time": "0:02:59", "throughput": 3194.82, "total_tokens": 3151744} +{"current_steps": 6415, "total_steps": 7577, "loss": 0.0082, "lr": 1.4014341619430003e-07, "epoch": 0.8466411508512603, "percentage": 84.66, "elapsed_time": "0:16:26", "remaining_time": "0:02:58", "throughput": 3196.28, "total_tokens": 3154112} +{"current_steps": 6420, "total_steps": 7577, "loss": 0.0082, "lr": 1.3896965000868188e-07, "epoch": 0.8473010426290088, "percentage": 84.73, "elapsed_time": "0:16:27", "remaining_time": "0:02:57", "throughput": 3197.75, "total_tokens": 3156480} +{"current_steps": 6425, "total_steps": 7577, "loss": 0.138, "lr": 1.3780045277554276e-07, "epoch": 0.8479609344067572, "percentage": 84.8, "elapsed_time": "0:16:27", "remaining_time": "0:02:57", "throughput": 3199.13, "total_tokens": 3158784} +{"current_steps": 6430, "total_steps": 7577, "loss": 0.1674, "lr": 1.3663583069908535e-07, "epoch": 0.8486208261845057, "percentage": 84.86, "elapsed_time": "0:16:27", "remaining_time": "0:02:56", "throughput": 3200.58, "total_tokens": 3161152} +{"current_steps": 6435, "total_steps": 7577, "loss": 0.0385, "lr": 1.3547578995923447e-07, "epoch": 0.8492807179622542, "percentage": 84.93, "elapsed_time": "0:16:27", "remaining_time": "0:02:55", "throughput": 3202.3, "total_tokens": 3163776} +{"current_steps": 6440, "total_steps": 7577, "loss": 0.1202, "lr": 1.3432033671160458e-07, "epoch": 0.8499406097400026, "percentage": 84.99, "elapsed_time": "0:16:28", "remaining_time": "0:02:54", "throughput": 3203.9, "total_tokens": 3166272} +{"current_steps": 6443, "total_steps": 7577, "eval_loss": 0.09701072424650192, "epoch": 0.8503365448066517, "percentage": 85.03, "elapsed_time": "0:16:36", "remaining_time": "0:02:55", "throughput": 3179.6, "total_tokens": 3167488} +{"current_steps": 6445, "total_steps": 7577, "loss": 0.0653, "lr": 1.3316947708746762e-07, "epoch": 0.8506005015177511, "percentage": 85.06, "elapsed_time": "0:16:54", "remaining_time": "0:02:58", "throughput": 3123.59, "total_tokens": 3168640} +{"current_steps": 6450, "total_steps": 7577, "loss": 0.1256, "lr": 1.3202321719371967e-07, "epoch": 0.8512603932954995, "percentage": 85.13, "elapsed_time": "0:16:54", "remaining_time": "0:02:57", "throughput": 3125.03, "total_tokens": 3171008} +{"current_steps": 6455, "total_steps": 7577, "loss": 0.1099, "lr": 1.3088156311284893e-07, "epoch": 0.851920285073248, "percentage": 85.19, "elapsed_time": "0:16:54", "remaining_time": "0:02:56", "throughput": 3126.42, "total_tokens": 3173312} +{"current_steps": 6460, "total_steps": 7577, "loss": 0.2267, "lr": 1.2974452090290322e-07, "epoch": 0.8525801768509964, "percentage": 85.26, "elapsed_time": "0:16:55", "remaining_time": "0:02:55", "throughput": 3127.99, "total_tokens": 3175808} +{"current_steps": 6465, "total_steps": 7577, "loss": 0.0888, "lr": 1.2861209659745865e-07, "epoch": 0.8532400686287449, "percentage": 85.32, "elapsed_time": "0:16:55", "remaining_time": "0:02:54", "throughput": 3129.31, "total_tokens": 3178048} +{"current_steps": 6470, "total_steps": 7577, "loss": 0.0148, "lr": 1.2748429620558654e-07, "epoch": 0.8538999604064933, "percentage": 85.39, "elapsed_time": "0:16:55", "remaining_time": "0:02:53", "throughput": 3130.88, "total_tokens": 3180544} +{"current_steps": 6475, "total_steps": 7577, "loss": 0.1561, "lr": 1.2636112571182167e-07, "epoch": 0.8545598521842418, "percentage": 85.46, "elapsed_time": "0:16:56", "remaining_time": "0:02:52", "throughput": 3132.44, "total_tokens": 3183040} +{"current_steps": 6480, "total_steps": 7577, "loss": 0.1766, "lr": 1.2524259107613178e-07, "epoch": 0.8552197439619902, "percentage": 85.52, "elapsed_time": "0:16:56", "remaining_time": "0:02:52", "throughput": 3134.13, "total_tokens": 3185664} +{"current_steps": 6485, "total_steps": 7577, "loss": 0.146, "lr": 1.2412869823388382e-07, "epoch": 0.8558796357397387, "percentage": 85.59, "elapsed_time": "0:16:56", "remaining_time": "0:02:51", "throughput": 3136.17, "total_tokens": 3188672} +{"current_steps": 6490, "total_steps": 7577, "loss": 0.0385, "lr": 1.2301945309581486e-07, "epoch": 0.8565395275174872, "percentage": 85.65, "elapsed_time": "0:16:57", "remaining_time": "0:02:50", "throughput": 3137.73, "total_tokens": 3191168} +{"current_steps": 6495, "total_steps": 7577, "loss": 0.0822, "lr": 1.2191486154799846e-07, "epoch": 0.8571994192952356, "percentage": 85.72, "elapsed_time": "0:16:57", "remaining_time": "0:02:49", "throughput": 3139.29, "total_tokens": 3193664} +{"current_steps": 6500, "total_steps": 7577, "loss": 0.001, "lr": 1.208149294518147e-07, "epoch": 0.8578593110729841, "percentage": 85.79, "elapsed_time": "0:16:57", "remaining_time": "0:02:48", "throughput": 3140.92, "total_tokens": 3196224} +{"current_steps": 6505, "total_steps": 7577, "loss": 0.1988, "lr": 1.1971966264391954e-07, "epoch": 0.8585192028507325, "percentage": 85.85, "elapsed_time": "0:16:57", "remaining_time": "0:02:47", "throughput": 3142.54, "total_tokens": 3198784} +{"current_steps": 6510, "total_steps": 7577, "loss": 0.1104, "lr": 1.1862906693621233e-07, "epoch": 0.859179094628481, "percentage": 85.92, "elapsed_time": "0:16:58", "remaining_time": "0:02:46", "throughput": 3144.28, "total_tokens": 3201472} +{"current_steps": 6515, "total_steps": 7577, "loss": 0.1169, "lr": 1.1754314811580623e-07, "epoch": 0.8598389864062294, "percentage": 85.98, "elapsed_time": "0:16:58", "remaining_time": "0:02:46", "throughput": 3145.47, "total_tokens": 3203584} +{"current_steps": 6520, "total_steps": 7577, "loss": 0.0712, "lr": 1.1646191194499655e-07, "epoch": 0.8604988781839779, "percentage": 86.05, "elapsed_time": "0:16:58", "remaining_time": "0:02:45", "throughput": 3146.85, "total_tokens": 3205888} +{"current_steps": 6525, "total_steps": 7577, "loss": 0.1908, "lr": 1.1538536416123168e-07, "epoch": 0.8611587699617262, "percentage": 86.12, "elapsed_time": "0:16:59", "remaining_time": "0:02:44", "throughput": 3148.03, "total_tokens": 3208000} +{"current_steps": 6530, "total_steps": 7577, "loss": 0.0208, "lr": 1.1431351047708072e-07, "epoch": 0.8618186617394747, "percentage": 86.18, "elapsed_time": "0:16:59", "remaining_time": "0:02:43", "throughput": 3149.34, "total_tokens": 3210240} +{"current_steps": 6535, "total_steps": 7577, "loss": 0.1363, "lr": 1.1324635658020432e-07, "epoch": 0.8624785535172231, "percentage": 86.25, "elapsed_time": "0:16:59", "remaining_time": "0:02:42", "throughput": 3150.83, "total_tokens": 3212672} +{"current_steps": 6540, "total_steps": 7577, "loss": 0.1361, "lr": 1.1218390813332479e-07, "epoch": 0.8631384452949716, "percentage": 86.31, "elapsed_time": "0:16:59", "remaining_time": "0:02:41", "throughput": 3152.57, "total_tokens": 3215360} +{"current_steps": 6545, "total_steps": 7577, "loss": 0.1234, "lr": 1.1112617077419472e-07, "epoch": 0.86379833707272, "percentage": 86.38, "elapsed_time": "0:17:00", "remaining_time": "0:02:40", "throughput": 3154.36, "total_tokens": 3218112} +{"current_steps": 6550, "total_steps": 7577, "loss": 0.0346, "lr": 1.1007315011556884e-07, "epoch": 0.8644582288504685, "percentage": 86.45, "elapsed_time": "0:17:00", "remaining_time": "0:02:40", "throughput": 3155.6, "total_tokens": 3220288} +{"current_steps": 6555, "total_steps": 7577, "loss": 0.0015, "lr": 1.0902485174517251e-07, "epoch": 0.865118120628217, "percentage": 86.51, "elapsed_time": "0:17:00", "remaining_time": "0:02:39", "throughput": 3157.33, "total_tokens": 3222976} +{"current_steps": 6560, "total_steps": 7577, "loss": 0.0725, "lr": 1.0798128122567285e-07, "epoch": 0.8657780124059654, "percentage": 86.58, "elapsed_time": "0:17:01", "remaining_time": "0:02:38", "throughput": 3158.88, "total_tokens": 3225472} +{"current_steps": 6565, "total_steps": 7577, "loss": 0.1631, "lr": 1.0694244409464992e-07, "epoch": 0.8664379041837139, "percentage": 86.64, "elapsed_time": "0:17:01", "remaining_time": "0:02:37", "throughput": 3160.56, "total_tokens": 3228096} +{"current_steps": 6570, "total_steps": 7577, "loss": 0.1158, "lr": 1.0590834586456577e-07, "epoch": 0.8670977959614623, "percentage": 86.71, "elapsed_time": "0:17:01", "remaining_time": "0:02:36", "throughput": 3162.24, "total_tokens": 3230720} +{"current_steps": 6575, "total_steps": 7577, "loss": 0.1239, "lr": 1.0487899202273708e-07, "epoch": 0.8677576877392108, "percentage": 86.78, "elapsed_time": "0:17:01", "remaining_time": "0:02:35", "throughput": 3163.66, "total_tokens": 3233088} +{"current_steps": 6580, "total_steps": 7577, "loss": 0.1255, "lr": 1.0385438803130364e-07, "epoch": 0.8684175795169592, "percentage": 86.84, "elapsed_time": "0:17:02", "remaining_time": "0:02:34", "throughput": 3165.33, "total_tokens": 3235712} +{"current_steps": 6585, "total_steps": 7577, "loss": 0.1423, "lr": 1.0283453932720199e-07, "epoch": 0.8690774712947077, "percentage": 86.91, "elapsed_time": "0:17:02", "remaining_time": "0:02:34", "throughput": 3167.18, "total_tokens": 3238528} +{"current_steps": 6590, "total_steps": 7577, "loss": 0.0738, "lr": 1.0181945132213476e-07, "epoch": 0.8697373630724561, "percentage": 86.97, "elapsed_time": "0:17:02", "remaining_time": "0:02:33", "throughput": 3168.6, "total_tokens": 3240896} +{"current_steps": 6595, "total_steps": 7577, "loss": 0.0016, "lr": 1.0080912940254227e-07, "epoch": 0.8703972548502046, "percentage": 87.04, "elapsed_time": "0:17:03", "remaining_time": "0:02:32", "throughput": 3170.14, "total_tokens": 3243392} +{"current_steps": 6600, "total_steps": 7577, "loss": 0.0041, "lr": 9.980357892957492e-08, "epoch": 0.871057146627953, "percentage": 87.11, "elapsed_time": "0:17:03", "remaining_time": "0:02:31", "throughput": 3171.62, "total_tokens": 3245824} +{"current_steps": 6605, "total_steps": 7577, "loss": 0.0031, "lr": 9.880280523906337e-08, "epoch": 0.8717170384057015, "percentage": 87.17, "elapsed_time": "0:17:03", "remaining_time": "0:02:30", "throughput": 3172.98, "total_tokens": 3248128} +{"current_steps": 6610, "total_steps": 7577, "loss": 0.1351, "lr": 9.780681364149091e-08, "epoch": 0.8723769301834499, "percentage": 87.24, "elapsed_time": "0:17:03", "remaining_time": "0:02:29", "throughput": 3174.52, "total_tokens": 3250624} +{"current_steps": 6615, "total_steps": 7577, "loss": 0.1127, "lr": 9.681560942196587e-08, "epoch": 0.8730368219611984, "percentage": 87.3, "elapsed_time": "0:17:04", "remaining_time": "0:02:28", "throughput": 3176.24, "total_tokens": 3253312} +{"current_steps": 6620, "total_steps": 7577, "loss": 0.1168, "lr": 9.582919784019194e-08, "epoch": 0.8736967137389469, "percentage": 87.37, "elapsed_time": "0:17:04", "remaining_time": "0:02:28", "throughput": 3177.48, "total_tokens": 3255488} +{"current_steps": 6625, "total_steps": 7577, "loss": 0.0668, "lr": 9.484758413044236e-08, "epoch": 0.8743566055166953, "percentage": 87.44, "elapsed_time": "0:17:04", "remaining_time": "0:02:27", "throughput": 3178.67, "total_tokens": 3257664} +{"current_steps": 6630, "total_steps": 7577, "loss": 0.0542, "lr": 9.387077350153017e-08, "epoch": 0.8750164972944438, "percentage": 87.5, "elapsed_time": "0:17:05", "remaining_time": "0:02:26", "throughput": 3180.2, "total_tokens": 3260160} +{"current_steps": 6635, "total_steps": 7577, "loss": 0.0616, "lr": 9.289877113678168e-08, "epoch": 0.8756763890721921, "percentage": 87.57, "elapsed_time": "0:17:05", "remaining_time": "0:02:25", "throughput": 3181.63, "total_tokens": 3262528} +{"current_steps": 6640, "total_steps": 7577, "loss": 0.0576, "lr": 9.19315821940092e-08, "epoch": 0.8763362808499406, "percentage": 87.63, "elapsed_time": "0:17:05", "remaining_time": "0:02:24", "throughput": 3183.18, "total_tokens": 3265024} +{"current_steps": 6645, "total_steps": 7577, "loss": 0.1659, "lr": 9.096921180548234e-08, "epoch": 0.876996172627689, "percentage": 87.7, "elapsed_time": "0:17:05", "remaining_time": "0:02:23", "throughput": 3184.66, "total_tokens": 3267456} +{"current_steps": 6650, "total_steps": 7577, "loss": 0.0915, "lr": 9.001166507790259e-08, "epoch": 0.8776560644054375, "percentage": 87.77, "elapsed_time": "0:17:06", "remaining_time": "0:02:23", "throughput": 3186.44, "total_tokens": 3270208} +{"current_steps": 6655, "total_steps": 7577, "loss": 0.1045, "lr": 8.905894709237427e-08, "epoch": 0.8783159561831859, "percentage": 87.83, "elapsed_time": "0:17:06", "remaining_time": "0:02:22", "throughput": 3188.24, "total_tokens": 3272960} +{"current_steps": 6660, "total_steps": 7577, "loss": 0.0736, "lr": 8.811106290437975e-08, "epoch": 0.8789758479609344, "percentage": 87.9, "elapsed_time": "0:17:06", "remaining_time": "0:02:21", "throughput": 3189.48, "total_tokens": 3275136} +{"current_steps": 6665, "total_steps": 7577, "loss": 0.1122, "lr": 8.716801754375036e-08, "epoch": 0.8796357397386828, "percentage": 87.96, "elapsed_time": "0:17:07", "remaining_time": "0:02:20", "throughput": 3191.08, "total_tokens": 3277696} +{"current_steps": 6670, "total_steps": 7577, "loss": 0.1268, "lr": 8.62298160146413e-08, "epoch": 0.8802956315164313, "percentage": 88.03, "elapsed_time": "0:17:07", "remaining_time": "0:02:19", "throughput": 3192.51, "total_tokens": 3280064} +{"current_steps": 6675, "total_steps": 7577, "loss": 0.002, "lr": 8.529646329550466e-08, "epoch": 0.8809555232941798, "percentage": 88.1, "elapsed_time": "0:17:07", "remaining_time": "0:02:18", "throughput": 3193.8, "total_tokens": 3282304} +{"current_steps": 6680, "total_steps": 7577, "loss": 0.0268, "lr": 8.436796433906235e-08, "epoch": 0.8816154150719282, "percentage": 88.16, "elapsed_time": "0:17:07", "remaining_time": "0:02:18", "throughput": 3195.29, "total_tokens": 3284736} +{"current_steps": 6685, "total_steps": 7577, "loss": 0.0431, "lr": 8.344432407228141e-08, "epoch": 0.8822753068496767, "percentage": 88.23, "elapsed_time": "0:17:08", "remaining_time": "0:02:17", "throughput": 3196.77, "total_tokens": 3287168} +{"current_steps": 6690, "total_steps": 7577, "loss": 0.0486, "lr": 8.252554739634577e-08, "epoch": 0.8829351986274251, "percentage": 88.29, "elapsed_time": "0:17:08", "remaining_time": "0:02:16", "throughput": 3198.25, "total_tokens": 3289600} +{"current_steps": 6695, "total_steps": 7577, "loss": 0.0731, "lr": 8.16116391866316e-08, "epoch": 0.8835950904051736, "percentage": 88.36, "elapsed_time": "0:17:08", "remaining_time": "0:02:15", "throughput": 3199.86, "total_tokens": 3292160} +{"current_steps": 6700, "total_steps": 7577, "loss": 0.1312, "lr": 8.070260429268172e-08, "epoch": 0.884254982182922, "percentage": 88.43, "elapsed_time": "0:17:09", "remaining_time": "0:02:14", "throughput": 3201.33, "total_tokens": 3294592} +{"current_steps": 6705, "total_steps": 7577, "loss": 0.0078, "lr": 7.979844753817855e-08, "epoch": 0.8849148739606705, "percentage": 88.49, "elapsed_time": "0:17:09", "remaining_time": "0:02:13", "throughput": 3202.75, "total_tokens": 3296960} +{"current_steps": 6710, "total_steps": 7577, "loss": 0.0772, "lr": 7.889917372091982e-08, "epoch": 0.8855747657384189, "percentage": 88.56, "elapsed_time": "0:17:09", "remaining_time": "0:02:13", "throughput": 3204.05, "total_tokens": 3299200} +{"current_steps": 6715, "total_steps": 7577, "loss": 0.2034, "lr": 7.800478761279183e-08, "epoch": 0.8862346575161674, "percentage": 88.62, "elapsed_time": "0:17:09", "remaining_time": "0:02:12", "throughput": 3205.47, "total_tokens": 3301568} +{"current_steps": 6720, "total_steps": 7577, "loss": 0.1794, "lr": 7.711529395974592e-08, "epoch": 0.8868945492939158, "percentage": 88.69, "elapsed_time": "0:17:10", "remaining_time": "0:02:11", "throughput": 3206.98, "total_tokens": 3304064} +{"current_steps": 6725, "total_steps": 7577, "loss": 0.1778, "lr": 7.623069748177135e-08, "epoch": 0.8875544410716643, "percentage": 88.76, "elapsed_time": "0:17:10", "remaining_time": "0:02:10", "throughput": 3208.38, "total_tokens": 3306432} +{"current_steps": 6730, "total_steps": 7577, "loss": 0.1002, "lr": 7.535100287287111e-08, "epoch": 0.8882143328494126, "percentage": 88.82, "elapsed_time": "0:17:10", "remaining_time": "0:02:09", "throughput": 3209.72, "total_tokens": 3308736} +{"current_steps": 6735, "total_steps": 7577, "loss": 0.0022, "lr": 7.447621480103783e-08, "epoch": 0.8888742246271611, "percentage": 88.89, "elapsed_time": "0:17:11", "remaining_time": "0:02:08", "throughput": 3211.18, "total_tokens": 3311168} +{"current_steps": 6740, "total_steps": 7577, "loss": 0.2822, "lr": 7.360633790822713e-08, "epoch": 0.8895341164049096, "percentage": 88.95, "elapsed_time": "0:17:11", "remaining_time": "0:02:08", "throughput": 3212.69, "total_tokens": 3313664} +{"current_steps": 6745, "total_steps": 7577, "loss": 0.022, "lr": 7.274137681033498e-08, "epoch": 0.890194008182658, "percentage": 89.02, "elapsed_time": "0:17:11", "remaining_time": "0:02:07", "throughput": 3214.28, "total_tokens": 3316224} +{"current_steps": 6750, "total_steps": 7577, "loss": 0.0855, "lr": 7.188133609717184e-08, "epoch": 0.8908538999604065, "percentage": 89.09, "elapsed_time": "0:17:12", "remaining_time": "0:02:06", "throughput": 3215.54, "total_tokens": 3318464} +{"current_steps": 6755, "total_steps": 7577, "loss": 0.0011, "lr": 7.102622033243843e-08, "epoch": 0.8915137917381549, "percentage": 89.15, "elapsed_time": "0:17:12", "remaining_time": "0:02:05", "throughput": 3216.99, "total_tokens": 3320896} +{"current_steps": 6760, "total_steps": 7577, "loss": 0.1368, "lr": 7.017603405370276e-08, "epoch": 0.8921736835159034, "percentage": 89.22, "elapsed_time": "0:17:12", "remaining_time": "0:02:04", "throughput": 3218.73, "total_tokens": 3323648} +{"current_steps": 6765, "total_steps": 7577, "loss": 0.1476, "lr": 6.933078177237429e-08, "epoch": 0.8928335752936518, "percentage": 89.28, "elapsed_time": "0:17:12", "remaining_time": "0:02:03", "throughput": 3220.3, "total_tokens": 3326208} +{"current_steps": 6770, "total_steps": 7577, "loss": 0.0651, "lr": 6.849046797368108e-08, "epoch": 0.8934934670714003, "percentage": 89.35, "elapsed_time": "0:17:13", "remaining_time": "0:02:03", "throughput": 3221.7, "total_tokens": 3328576} +{"current_steps": 6775, "total_steps": 7577, "loss": 0.003, "lr": 6.765509711664574e-08, "epoch": 0.8941533588491487, "percentage": 89.42, "elapsed_time": "0:17:13", "remaining_time": "0:02:02", "throughput": 3223.62, "total_tokens": 3331520} +{"current_steps": 6780, "total_steps": 7577, "loss": 0.0235, "lr": 6.682467363406174e-08, "epoch": 0.8948132506268972, "percentage": 89.48, "elapsed_time": "0:17:13", "remaining_time": "0:02:01", "throughput": 3225.43, "total_tokens": 3334336} +{"current_steps": 6785, "total_steps": 7577, "loss": 0.0671, "lr": 6.59992019324701e-08, "epoch": 0.8954731424046456, "percentage": 89.55, "elapsed_time": "0:17:14", "remaining_time": "0:02:00", "throughput": 3227.0, "total_tokens": 3336896} +{"current_steps": 6790, "total_steps": 7577, "loss": 0.1574, "lr": 6.517868639213553e-08, "epoch": 0.8961330341823941, "percentage": 89.61, "elapsed_time": "0:17:14", "remaining_time": "0:01:59", "throughput": 3228.44, "total_tokens": 3339328} +{"current_steps": 6795, "total_steps": 7577, "loss": 0.0331, "lr": 6.436313136702387e-08, "epoch": 0.8967929259601425, "percentage": 89.68, "elapsed_time": "0:17:14", "remaining_time": "0:01:59", "throughput": 3229.89, "total_tokens": 3341760} +{"current_steps": 6800, "total_steps": 7577, "loss": 0.0527, "lr": 6.355254118477815e-08, "epoch": 0.897452817737891, "percentage": 89.75, "elapsed_time": "0:17:14", "remaining_time": "0:01:58", "throughput": 3231.58, "total_tokens": 3344448} +{"current_steps": 6805, "total_steps": 7577, "loss": 0.0009, "lr": 6.274692014669602e-08, "epoch": 0.8981127095156395, "percentage": 89.81, "elapsed_time": "0:17:15", "remaining_time": "0:01:57", "throughput": 3233.15, "total_tokens": 3347008} +{"current_steps": 6810, "total_steps": 7577, "loss": 0.0008, "lr": 6.194627252770768e-08, "epoch": 0.8987726012933879, "percentage": 89.88, "elapsed_time": "0:17:15", "remaining_time": "0:01:56", "throughput": 3234.94, "total_tokens": 3349824} +{"current_steps": 6815, "total_steps": 7577, "loss": 0.0687, "lr": 6.115060257635174e-08, "epoch": 0.8994324930711364, "percentage": 89.94, "elapsed_time": "0:17:15", "remaining_time": "0:01:55", "throughput": 3236.44, "total_tokens": 3352320} +{"current_steps": 6820, "total_steps": 7577, "loss": 0.0013, "lr": 6.035991451475375e-08, "epoch": 0.9000923848488848, "percentage": 90.01, "elapsed_time": "0:17:16", "remaining_time": "0:01:55", "throughput": 3237.82, "total_tokens": 3354688} +{"current_steps": 6822, "total_steps": 7577, "eval_loss": 0.09568765014410019, "epoch": 0.9003563415599841, "percentage": 90.04, "elapsed_time": "0:17:23", "remaining_time": "0:01:55", "throughput": 3214.8, "total_tokens": 3355520} +{"current_steps": 6825, "total_steps": 7577, "loss": 0.0891, "lr": 5.9574212538603505e-08, "epoch": 0.9007522766266333, "percentage": 90.08, "elapsed_time": "0:18:22", "remaining_time": "0:02:01", "throughput": 3046.08, "total_tokens": 3357056} +{"current_steps": 6830, "total_steps": 7577, "loss": 0.0683, "lr": 5.879350081713252e-08, "epoch": 0.9014121684043817, "percentage": 90.14, "elapsed_time": "0:18:22", "remaining_time": "0:02:00", "throughput": 3047.5, "total_tokens": 3359488} +{"current_steps": 6835, "total_steps": 7577, "loss": 0.2249, "lr": 5.8017783493092386e-08, "epoch": 0.9020720601821302, "percentage": 90.21, "elapsed_time": "0:18:22", "remaining_time": "0:01:59", "throughput": 3048.92, "total_tokens": 3361920} +{"current_steps": 6840, "total_steps": 7577, "loss": 0.0018, "lr": 5.7247064682732104e-08, "epoch": 0.9027319519598785, "percentage": 90.27, "elapsed_time": "0:18:22", "remaining_time": "0:01:58", "throughput": 3050.41, "total_tokens": 3364416} +{"current_steps": 6845, "total_steps": 7577, "loss": 0.0617, "lr": 5.6481348475777566e-08, "epoch": 0.903391843737627, "percentage": 90.34, "elapsed_time": "0:18:23", "remaining_time": "0:01:57", "throughput": 3051.9, "total_tokens": 3366912} +{"current_steps": 6850, "total_steps": 7577, "loss": 0.0014, "lr": 5.5720638935407796e-08, "epoch": 0.9040517355153754, "percentage": 90.41, "elapsed_time": "0:18:23", "remaining_time": "0:01:57", "throughput": 3053.1, "total_tokens": 3369088} +{"current_steps": 6855, "total_steps": 7577, "loss": 0.1392, "lr": 5.49649400982356e-08, "epoch": 0.9047116272931239, "percentage": 90.47, "elapsed_time": "0:18:23", "remaining_time": "0:01:56", "throughput": 3054.46, "total_tokens": 3371520} +{"current_steps": 6860, "total_steps": 7577, "loss": 0.0007, "lr": 5.421425597428442e-08, "epoch": 0.9053715190708723, "percentage": 90.54, "elapsed_time": "0:18:24", "remaining_time": "0:01:55", "throughput": 3055.99, "total_tokens": 3374080} +{"current_steps": 6865, "total_steps": 7577, "loss": 0.0786, "lr": 5.346859054696784e-08, "epoch": 0.9060314108486208, "percentage": 90.6, "elapsed_time": "0:18:24", "remaining_time": "0:01:54", "throughput": 3057.52, "total_tokens": 3376640} +{"current_steps": 6870, "total_steps": 7577, "loss": 0.0794, "lr": 5.2727947773068773e-08, "epoch": 0.9066913026263693, "percentage": 90.67, "elapsed_time": "0:18:24", "remaining_time": "0:01:53", "throughput": 3058.93, "total_tokens": 3379072} +{"current_steps": 6875, "total_steps": 7577, "loss": 0.0732, "lr": 5.199233158271732e-08, "epoch": 0.9073511944041177, "percentage": 90.74, "elapsed_time": "0:18:24", "remaining_time": "0:01:52", "throughput": 3060.51, "total_tokens": 3381696} +{"current_steps": 6880, "total_steps": 7577, "loss": 0.2058, "lr": 5.126174587937149e-08, "epoch": 0.9080110861818662, "percentage": 90.8, "elapsed_time": "0:18:25", "remaining_time": "0:01:51", "throughput": 3061.88, "total_tokens": 3384064} +{"current_steps": 6885, "total_steps": 7577, "loss": 0.0036, "lr": 5.053619453979485e-08, "epoch": 0.9086709779596146, "percentage": 90.87, "elapsed_time": "0:18:25", "remaining_time": "0:01:51", "throughput": 3063.12, "total_tokens": 3386304} +{"current_steps": 6890, "total_steps": 7577, "loss": 0.1486, "lr": 4.9815681414037025e-08, "epoch": 0.9093308697373631, "percentage": 90.93, "elapsed_time": "0:18:25", "remaining_time": "0:01:50", "throughput": 3064.59, "total_tokens": 3388800} +{"current_steps": 6895, "total_steps": 7577, "loss": 0.0886, "lr": 4.910021032541334e-08, "epoch": 0.9099907615151115, "percentage": 91.0, "elapsed_time": "0:18:26", "remaining_time": "0:01:49", "throughput": 3066.0, "total_tokens": 3391232} +{"current_steps": 6900, "total_steps": 7577, "loss": 0.0815, "lr": 4.838978507048319e-08, "epoch": 0.91065065329286, "percentage": 91.07, "elapsed_time": "0:18:26", "remaining_time": "0:01:48", "throughput": 3067.4, "total_tokens": 3393664} +{"current_steps": 6905, "total_steps": 7577, "loss": 0.0055, "lr": 4.768440941903207e-08, "epoch": 0.9113105450706084, "percentage": 91.13, "elapsed_time": "0:18:26", "remaining_time": "0:01:47", "throughput": 3068.7, "total_tokens": 3395968} +{"current_steps": 6910, "total_steps": 7577, "loss": 0.0122, "lr": 4.698408711404944e-08, "epoch": 0.9119704368483569, "percentage": 91.2, "elapsed_time": "0:18:26", "remaining_time": "0:01:46", "throughput": 3070.0, "total_tokens": 3398272} +{"current_steps": 6915, "total_steps": 7577, "loss": 0.0763, "lr": 4.628882187171046e-08, "epoch": 0.9126303286261053, "percentage": 91.26, "elapsed_time": "0:18:27", "remaining_time": "0:01:45", "throughput": 3071.63, "total_tokens": 3400960} +{"current_steps": 6920, "total_steps": 7577, "loss": 0.155, "lr": 4.559861738135506e-08, "epoch": 0.9132902204038538, "percentage": 91.33, "elapsed_time": "0:18:27", "remaining_time": "0:01:45", "throughput": 3073.15, "total_tokens": 3403520} +{"current_steps": 6925, "total_steps": 7577, "loss": 0.229, "lr": 4.491347730546913e-08, "epoch": 0.9139501121816023, "percentage": 91.4, "elapsed_time": "0:18:27", "remaining_time": "0:01:44", "throughput": 3074.56, "total_tokens": 3405952} +{"current_steps": 6930, "total_steps": 7577, "loss": 0.128, "lr": 4.423340527966512e-08, "epoch": 0.9146100039593507, "percentage": 91.46, "elapsed_time": "0:18:28", "remaining_time": "0:01:43", "throughput": 3075.92, "total_tokens": 3408320} +{"current_steps": 6935, "total_steps": 7577, "loss": 0.0052, "lr": 4.355840491266205e-08, "epoch": 0.9152698957370992, "percentage": 91.53, "elapsed_time": "0:18:28", "remaining_time": "0:01:42", "throughput": 3077.42, "total_tokens": 3410880} +{"current_steps": 6940, "total_steps": 7577, "loss": 0.0576, "lr": 4.288847978626686e-08, "epoch": 0.9159297875148475, "percentage": 91.59, "elapsed_time": "0:18:28", "remaining_time": "0:01:41", "throughput": 3078.95, "total_tokens": 3413440} +{"current_steps": 6945, "total_steps": 7577, "loss": 0.1275, "lr": 4.222363345535585e-08, "epoch": 0.916589679292596, "percentage": 91.66, "elapsed_time": "0:18:28", "remaining_time": "0:01:40", "throughput": 3080.46, "total_tokens": 3416000} +{"current_steps": 6950, "total_steps": 7577, "loss": 0.1253, "lr": 4.1563869447854505e-08, "epoch": 0.9172495710703444, "percentage": 91.72, "elapsed_time": "0:18:29", "remaining_time": "0:01:40", "throughput": 3081.7, "total_tokens": 3418240} +{"current_steps": 6955, "total_steps": 7577, "loss": 0.1407, "lr": 4.090919126472048e-08, "epoch": 0.9179094628480929, "percentage": 91.79, "elapsed_time": "0:18:29", "remaining_time": "0:01:39", "throughput": 3083.1, "total_tokens": 3420672} +{"current_steps": 6960, "total_steps": 7577, "loss": 0.0538, "lr": 4.025960237992332e-08, "epoch": 0.9185693546258413, "percentage": 91.86, "elapsed_time": "0:18:29", "remaining_time": "0:01:38", "throughput": 3084.32, "total_tokens": 3422912} +{"current_steps": 6965, "total_steps": 7577, "loss": 0.0027, "lr": 3.961510624042741e-08, "epoch": 0.9192292464035898, "percentage": 91.92, "elapsed_time": "0:18:30", "remaining_time": "0:01:37", "throughput": 3085.76, "total_tokens": 3425408} +{"current_steps": 6970, "total_steps": 7577, "loss": 0.1111, "lr": 3.8975706266172636e-08, "epoch": 0.9198891381813382, "percentage": 91.99, "elapsed_time": "0:18:30", "remaining_time": "0:01:36", "throughput": 3087.09, "total_tokens": 3427776} +{"current_steps": 6975, "total_steps": 7577, "loss": 0.0538, "lr": 3.834140585005696e-08, "epoch": 0.9205490299590867, "percentage": 92.05, "elapsed_time": "0:18:30", "remaining_time": "0:01:35", "throughput": 3088.59, "total_tokens": 3430336} +{"current_steps": 6980, "total_steps": 7577, "loss": 0.2688, "lr": 3.771220835791844e-08, "epoch": 0.9212089217368351, "percentage": 92.12, "elapsed_time": "0:18:30", "remaining_time": "0:01:35", "throughput": 3090.09, "total_tokens": 3432896} +{"current_steps": 6985, "total_steps": 7577, "loss": 0.0703, "lr": 3.708811712851634e-08, "epoch": 0.9218688135145836, "percentage": 92.19, "elapsed_time": "0:18:31", "remaining_time": "0:01:34", "throughput": 3091.31, "total_tokens": 3435136} +{"current_steps": 6990, "total_steps": 7577, "loss": 0.0604, "lr": 3.6469135473514936e-08, "epoch": 0.9225287052923321, "percentage": 92.25, "elapsed_time": "0:18:31", "remaining_time": "0:01:33", "throughput": 3092.92, "total_tokens": 3437824} +{"current_steps": 6995, "total_steps": 7577, "loss": 0.0066, "lr": 3.5855266677464744e-08, "epoch": 0.9231885970700805, "percentage": 92.32, "elapsed_time": "0:18:31", "remaining_time": "0:01:32", "throughput": 3094.36, "total_tokens": 3440320} +{"current_steps": 7000, "total_steps": 7577, "loss": 0.0499, "lr": 3.524651399778555e-08, "epoch": 0.923848488847829, "percentage": 92.38, "elapsed_time": "0:18:32", "remaining_time": "0:01:31", "throughput": 3095.84, "total_tokens": 3442880} +{"current_steps": 7005, "total_steps": 7577, "loss": 0.0009, "lr": 3.4642880664749296e-08, "epoch": 0.9245083806255774, "percentage": 92.45, "elapsed_time": "0:18:32", "remaining_time": "0:01:30", "throughput": 3097.06, "total_tokens": 3445120} +{"current_steps": 7010, "total_steps": 7577, "loss": 0.1758, "lr": 3.404436988146242e-08, "epoch": 0.9251682724033259, "percentage": 92.52, "elapsed_time": "0:18:32", "remaining_time": "0:01:29", "throughput": 3098.33, "total_tokens": 3447424} +{"current_steps": 7015, "total_steps": 7577, "loss": 0.0461, "lr": 3.345098482384956e-08, "epoch": 0.9258281641810743, "percentage": 92.58, "elapsed_time": "0:18:32", "remaining_time": "0:01:29", "throughput": 3099.77, "total_tokens": 3449920} +{"current_steps": 7020, "total_steps": 7577, "loss": 0.0499, "lr": 3.2862728640636105e-08, "epoch": 0.9264880559588228, "percentage": 92.65, "elapsed_time": "0:18:33", "remaining_time": "0:01:28", "throughput": 3101.21, "total_tokens": 3452416} +{"current_steps": 7025, "total_steps": 7577, "loss": 0.1119, "lr": 3.227960445333155e-08, "epoch": 0.9271479477365712, "percentage": 92.71, "elapsed_time": "0:18:33", "remaining_time": "0:01:27", "throughput": 3102.66, "total_tokens": 3454912} +{"current_steps": 7030, "total_steps": 7577, "loss": 0.0654, "lr": 3.1701615356213295e-08, "epoch": 0.9278078395143197, "percentage": 92.78, "elapsed_time": "0:18:33", "remaining_time": "0:01:26", "throughput": 3104.16, "total_tokens": 3457472} +{"current_steps": 7035, "total_steps": 7577, "loss": 0.0654, "lr": 3.112876441630985e-08, "epoch": 0.928467731292068, "percentage": 92.85, "elapsed_time": "0:18:34", "remaining_time": "0:01:25", "throughput": 3105.39, "total_tokens": 3459712} +{"current_steps": 7040, "total_steps": 7577, "loss": 0.0532, "lr": 3.05610546733851e-08, "epoch": 0.9291276230698166, "percentage": 92.91, "elapsed_time": "0:18:34", "remaining_time": "0:01:25", "throughput": 3106.77, "total_tokens": 3462144} +{"current_steps": 7045, "total_steps": 7577, "loss": 0.2881, "lr": 2.99984891399212e-08, "epoch": 0.9297875148475649, "percentage": 92.98, "elapsed_time": "0:18:34", "remaining_time": "0:01:24", "throughput": 3108.11, "total_tokens": 3464512} +{"current_steps": 7050, "total_steps": 7577, "loss": 0.0061, "lr": 2.9441070801103808e-08, "epoch": 0.9304474066253134, "percentage": 93.04, "elapsed_time": "0:18:34", "remaining_time": "0:01:23", "throughput": 3109.45, "total_tokens": 3466880} +{"current_steps": 7055, "total_steps": 7577, "loss": 0.1035, "lr": 2.8888802614805085e-08, "epoch": 0.931107298403062, "percentage": 93.11, "elapsed_time": "0:18:35", "remaining_time": "0:01:22", "throughput": 3110.78, "total_tokens": 3469248} +{"current_steps": 7060, "total_steps": 7577, "loss": 0.2707, "lr": 2.8341687511568734e-08, "epoch": 0.9317671901808103, "percentage": 93.18, "elapsed_time": "0:18:35", "remaining_time": "0:01:21", "throughput": 3112.11, "total_tokens": 3471616} +{"current_steps": 7065, "total_steps": 7577, "loss": 0.0773, "lr": 2.7799728394594547e-08, "epoch": 0.9324270819585588, "percentage": 93.24, "elapsed_time": "0:18:35", "remaining_time": "0:01:20", "throughput": 3113.66, "total_tokens": 3474240} +{"current_steps": 7070, "total_steps": 7577, "loss": 0.0759, "lr": 2.7262928139722198e-08, "epoch": 0.9330869737363072, "percentage": 93.31, "elapsed_time": "0:18:36", "remaining_time": "0:01:20", "throughput": 3115.15, "total_tokens": 3476800} +{"current_steps": 7075, "total_steps": 7577, "loss": 0.0879, "lr": 2.673128959541693e-08, "epoch": 0.9337468655140557, "percentage": 93.37, "elapsed_time": "0:18:36", "remaining_time": "0:01:19", "throughput": 3116.75, "total_tokens": 3479488} +{"current_steps": 7080, "total_steps": 7577, "loss": 0.0007, "lr": 2.620481558275367e-08, "epoch": 0.9344067572918041, "percentage": 93.44, "elapsed_time": "0:18:36", "remaining_time": "0:01:18", "throughput": 3118.35, "total_tokens": 3482176} +{"current_steps": 7085, "total_steps": 7577, "loss": 0.0318, "lr": 2.5683508895402382e-08, "epoch": 0.9350666490695526, "percentage": 93.51, "elapsed_time": "0:18:36", "remaining_time": "0:01:17", "throughput": 3119.89, "total_tokens": 3484800} +{"current_steps": 7090, "total_steps": 7577, "loss": 0.1076, "lr": 2.5167372299613853e-08, "epoch": 0.935726540847301, "percentage": 93.57, "elapsed_time": "0:18:37", "remaining_time": "0:01:16", "throughput": 3121.48, "total_tokens": 3487488} +{"current_steps": 7095, "total_steps": 7577, "loss": 0.238, "lr": 2.4656408534203365e-08, "epoch": 0.9363864326250495, "percentage": 93.64, "elapsed_time": "0:18:37", "remaining_time": "0:01:15", "throughput": 3122.68, "total_tokens": 3489728} +{"current_steps": 7100, "total_steps": 7577, "loss": 0.2424, "lr": 2.4150620310538273e-08, "epoch": 0.9370463244027979, "percentage": 93.7, "elapsed_time": "0:18:37", "remaining_time": "0:01:15", "throughput": 3123.82, "total_tokens": 3491904} +{"current_steps": 7105, "total_steps": 7577, "loss": 0.0751, "lr": 2.3650010312521673e-08, "epoch": 0.9377062161805464, "percentage": 93.77, "elapsed_time": "0:18:38", "remaining_time": "0:01:14", "throughput": 3125.42, "total_tokens": 3494592} +{"current_steps": 7110, "total_steps": 7577, "loss": 0.1782, "lr": 2.3154581196579648e-08, "epoch": 0.9383661079582949, "percentage": 93.84, "elapsed_time": "0:18:38", "remaining_time": "0:01:13", "throughput": 3126.84, "total_tokens": 3497088} +{"current_steps": 7115, "total_steps": 7577, "loss": 0.0552, "lr": 2.2664335591646377e-08, "epoch": 0.9390259997360433, "percentage": 93.9, "elapsed_time": "0:18:38", "remaining_time": "0:01:12", "throughput": 3128.21, "total_tokens": 3499520} +{"current_steps": 7120, "total_steps": 7577, "loss": 0.1962, "lr": 2.2179276099150158e-08, "epoch": 0.9396858915137918, "percentage": 93.97, "elapsed_time": "0:18:38", "remaining_time": "0:01:11", "throughput": 3129.8, "total_tokens": 3502208} +{"current_steps": 7125, "total_steps": 7577, "loss": 0.1811, "lr": 2.1699405293000182e-08, "epoch": 0.9403457832915402, "percentage": 94.03, "elapsed_time": "0:18:39", "remaining_time": "0:01:11", "throughput": 3131.17, "total_tokens": 3504640} +{"current_steps": 7130, "total_steps": 7577, "loss": 0.0653, "lr": 2.1224725719572235e-08, "epoch": 0.9410056750692887, "percentage": 94.1, "elapsed_time": "0:18:39", "remaining_time": "0:01:10", "throughput": 3132.42, "total_tokens": 3506944} +{"current_steps": 7135, "total_steps": 7577, "loss": 0.1533, "lr": 2.0755239897695453e-08, "epoch": 0.9416655668470371, "percentage": 94.17, "elapsed_time": "0:18:39", "remaining_time": "0:01:09", "throughput": 3133.78, "total_tokens": 3509376} +{"current_steps": 7140, "total_steps": 7577, "loss": 0.1645, "lr": 2.0290950318639256e-08, "epoch": 0.9423254586247856, "percentage": 94.23, "elapsed_time": "0:18:40", "remaining_time": "0:01:08", "throughput": 3135.03, "total_tokens": 3511680} +{"current_steps": 7145, "total_steps": 7577, "loss": 0.0611, "lr": 1.983185944609944e-08, "epoch": 0.942985350402534, "percentage": 94.3, "elapsed_time": "0:18:40", "remaining_time": "0:01:07", "throughput": 3136.4, "total_tokens": 3514112} +{"current_steps": 7150, "total_steps": 7577, "loss": 0.0665, "lr": 1.9377969716185994e-08, "epoch": 0.9436452421802825, "percentage": 94.36, "elapsed_time": "0:18:40", "remaining_time": "0:01:06", "throughput": 3137.71, "total_tokens": 3516480} +{"current_steps": 7155, "total_steps": 7577, "loss": 0.1162, "lr": 1.8929283537408968e-08, "epoch": 0.9443051339580308, "percentage": 94.43, "elapsed_time": "0:18:41", "remaining_time": "0:01:06", "throughput": 3138.9, "total_tokens": 3518720} +{"current_steps": 7160, "total_steps": 7577, "loss": 0.0086, "lr": 1.848580329066718e-08, "epoch": 0.9449650257357793, "percentage": 94.5, "elapsed_time": "0:18:41", "remaining_time": "0:01:05", "throughput": 3140.32, "total_tokens": 3521216} +{"current_steps": 7165, "total_steps": 7577, "loss": 0.3859, "lr": 1.804753132923431e-08, "epoch": 0.9456249175135277, "percentage": 94.56, "elapsed_time": "0:18:41", "remaining_time": "0:01:04", "throughput": 3141.79, "total_tokens": 3523776} +{"current_steps": 7170, "total_steps": 7577, "loss": 0.0012, "lr": 1.7614469978746827e-08, "epoch": 0.9462848092912762, "percentage": 94.63, "elapsed_time": "0:18:41", "remaining_time": "0:01:03", "throughput": 3143.21, "total_tokens": 3526272} +{"current_steps": 7175, "total_steps": 7577, "loss": 0.0324, "lr": 1.7186621537192304e-08, "epoch": 0.9469447010690247, "percentage": 94.69, "elapsed_time": "0:18:42", "remaining_time": "0:01:02", "throughput": 3144.45, "total_tokens": 3528576} +{"current_steps": 7180, "total_steps": 7577, "loss": 0.0012, "lr": 1.6763988274896003e-08, "epoch": 0.9476045928467731, "percentage": 94.76, "elapsed_time": "0:18:42", "remaining_time": "0:01:02", "throughput": 3145.91, "total_tokens": 3531136} +{"current_steps": 7185, "total_steps": 7577, "loss": 0.1503, "lr": 1.6346572434509876e-08, "epoch": 0.9482644846245216, "percentage": 94.83, "elapsed_time": "0:18:42", "remaining_time": "0:01:01", "throughput": 3147.37, "total_tokens": 3533696} +{"current_steps": 7190, "total_steps": 7577, "loss": 0.1569, "lr": 1.5934376231000248e-08, "epoch": 0.94892437640227, "percentage": 94.89, "elapsed_time": "0:18:43", "remaining_time": "0:01:00", "throughput": 3148.67, "total_tokens": 3536064} +{"current_steps": 7195, "total_steps": 7577, "loss": 0.0044, "lr": 1.55274018516357e-08, "epoch": 0.9495842681800185, "percentage": 94.96, "elapsed_time": "0:18:43", "remaining_time": "0:00:59", "throughput": 3149.97, "total_tokens": 3538432} +{"current_steps": 7200, "total_steps": 7577, "loss": 0.05, "lr": 1.512565145597633e-08, "epoch": 0.9502441599577669, "percentage": 95.02, "elapsed_time": "0:18:43", "remaining_time": "0:00:58", "throughput": 3151.55, "total_tokens": 3541120} +{"current_steps": 7201, "total_steps": 7577, "eval_loss": 0.09555233269929886, "epoch": 0.9503761383133166, "percentage": 95.04, "elapsed_time": "0:18:51", "remaining_time": "0:00:59", "throughput": 3130.63, "total_tokens": 3541632} +{"current_steps": 7205, "total_steps": 7577, "loss": 0.0498, "lr": 1.47291271758615e-08, "epoch": 0.9509040517355154, "percentage": 95.09, "elapsed_time": "0:19:08", "remaining_time": "0:00:59", "throughput": 3086.44, "total_tokens": 3543680} +{"current_steps": 7210, "total_steps": 7577, "loss": 0.1477, "lr": 1.4337831115398991e-08, "epoch": 0.9515639435132638, "percentage": 95.16, "elapsed_time": "0:19:08", "remaining_time": "0:00:58", "throughput": 3087.67, "total_tokens": 3545984} +{"current_steps": 7215, "total_steps": 7577, "loss": 0.1276, "lr": 1.3951765350953548e-08, "epoch": 0.9522238352910123, "percentage": 95.22, "elapsed_time": "0:19:08", "remaining_time": "0:00:57", "throughput": 3089.11, "total_tokens": 3548544} +{"current_steps": 7220, "total_steps": 7577, "loss": 0.1596, "lr": 1.3570931931136009e-08, "epoch": 0.9528837270687607, "percentage": 95.29, "elapsed_time": "0:19:09", "remaining_time": "0:00:56", "throughput": 3090.5, "total_tokens": 3551040} +{"current_steps": 7225, "total_steps": 7577, "loss": 0.0839, "lr": 1.3195332876792532e-08, "epoch": 0.9535436188465092, "percentage": 95.35, "elapsed_time": "0:19:09", "remaining_time": "0:00:55", "throughput": 3091.89, "total_tokens": 3553536} +{"current_steps": 7230, "total_steps": 7577, "loss": 0.1149, "lr": 1.2824970180993488e-08, "epoch": 0.9542035106242576, "percentage": 95.42, "elapsed_time": "0:19:09", "remaining_time": "0:00:55", "throughput": 3093.0, "total_tokens": 3555712} +{"current_steps": 7235, "total_steps": 7577, "loss": 0.1233, "lr": 1.2459845809023484e-08, "epoch": 0.9548634024020061, "percentage": 95.49, "elapsed_time": "0:19:09", "remaining_time": "0:00:54", "throughput": 3094.28, "total_tokens": 3558080} +{"current_steps": 7240, "total_steps": 7577, "loss": 0.3036, "lr": 1.2099961698370353e-08, "epoch": 0.9555232941797546, "percentage": 95.55, "elapsed_time": "0:19:10", "remaining_time": "0:00:53", "throughput": 3095.74, "total_tokens": 3560640} +{"current_steps": 7245, "total_steps": 7577, "loss": 0.0906, "lr": 1.1745319758715288e-08, "epoch": 0.956183185957503, "percentage": 95.62, "elapsed_time": "0:19:10", "remaining_time": "0:00:52", "throughput": 3097.35, "total_tokens": 3563392} +{"current_steps": 7250, "total_steps": 7577, "loss": 0.1414, "lr": 1.1395921871922509e-08, "epoch": 0.9568430777352515, "percentage": 95.68, "elapsed_time": "0:19:10", "remaining_time": "0:00:51", "throughput": 3098.68, "total_tokens": 3565824} +{"current_steps": 7255, "total_steps": 7577, "loss": 0.0009, "lr": 1.105176989202905e-08, "epoch": 0.9575029695129998, "percentage": 95.75, "elapsed_time": "0:19:11", "remaining_time": "0:00:51", "throughput": 3100.02, "total_tokens": 3568256} +{"current_steps": 7260, "total_steps": 7577, "loss": 0.0157, "lr": 1.0712865645235659e-08, "epoch": 0.9581628612907483, "percentage": 95.82, "elapsed_time": "0:19:11", "remaining_time": "0:00:50", "throughput": 3101.4, "total_tokens": 3570752} +{"current_steps": 7265, "total_steps": 7577, "loss": 0.0805, "lr": 1.0379210929896131e-08, "epoch": 0.9588227530684967, "percentage": 95.88, "elapsed_time": "0:19:11", "remaining_time": "0:00:49", "throughput": 3102.52, "total_tokens": 3572928} +{"current_steps": 7270, "total_steps": 7577, "loss": 0.2674, "lr": 1.0050807516508553e-08, "epoch": 0.9594826448462452, "percentage": 95.95, "elapsed_time": "0:19:11", "remaining_time": "0:00:48", "throughput": 3103.79, "total_tokens": 3575296} +{"current_steps": 7275, "total_steps": 7577, "loss": 0.0011, "lr": 9.727657147705737e-09, "epoch": 0.9601425366239936, "percentage": 96.01, "elapsed_time": "0:19:12", "remaining_time": "0:00:47", "throughput": 3105.07, "total_tokens": 3577664} +{"current_steps": 7280, "total_steps": 7577, "loss": 0.1992, "lr": 9.409761538245575e-09, "epoch": 0.9608024284017421, "percentage": 96.08, "elapsed_time": "0:19:12", "remaining_time": "0:00:47", "throughput": 3106.45, "total_tokens": 3580160} +{"current_steps": 7285, "total_steps": 7577, "loss": 0.0761, "lr": 9.097122375002264e-09, "epoch": 0.9614623201794905, "percentage": 96.15, "elapsed_time": "0:19:12", "remaining_time": "0:00:46", "throughput": 3107.67, "total_tokens": 3582464} +{"current_steps": 7290, "total_steps": 7577, "loss": 0.1308, "lr": 8.789741316957312e-09, "epoch": 0.962122211957239, "percentage": 96.21, "elapsed_time": "0:19:13", "remaining_time": "0:00:45", "throughput": 3109.01, "total_tokens": 3584896} +{"current_steps": 7295, "total_steps": 7577, "loss": 0.005, "lr": 8.487619995190986e-09, "epoch": 0.9627821037349875, "percentage": 96.28, "elapsed_time": "0:19:13", "remaining_time": "0:00:44", "throughput": 3110.55, "total_tokens": 3587584} +{"current_steps": 7300, "total_steps": 7577, "loss": 0.1393, "lr": 8.19076001287311e-09, "epoch": 0.9634419955127359, "percentage": 96.34, "elapsed_time": "0:19:13", "remaining_time": "0:00:43", "throughput": 3112.0, "total_tokens": 3590144} +{"current_steps": 7305, "total_steps": 7577, "loss": 0.0012, "lr": 7.899162945254945e-09, "epoch": 0.9641018872904844, "percentage": 96.41, "elapsed_time": "0:19:13", "remaining_time": "0:00:42", "throughput": 3113.55, "total_tokens": 3592832} +{"current_steps": 7310, "total_steps": 7577, "loss": 0.0509, "lr": 7.612830339660758e-09, "epoch": 0.9647617790682328, "percentage": 96.48, "elapsed_time": "0:19:14", "remaining_time": "0:00:42", "throughput": 3115.04, "total_tokens": 3595456} +{"current_steps": 7315, "total_steps": 7577, "loss": 0.1043, "lr": 7.3317637154796105e-09, "epoch": 0.9654216708459813, "percentage": 96.54, "elapsed_time": "0:19:14", "remaining_time": "0:00:41", "throughput": 3116.38, "total_tokens": 3597888} +{"current_steps": 7320, "total_steps": 7577, "loss": 0.0687, "lr": 7.0559645641572465e-09, "epoch": 0.9660815626237297, "percentage": 96.61, "elapsed_time": "0:19:14", "remaining_time": "0:00:40", "throughput": 3117.76, "total_tokens": 3600384} +{"current_steps": 7325, "total_steps": 7577, "loss": 0.1628, "lr": 6.785434349188102e-09, "epoch": 0.9667414544014782, "percentage": 96.67, "elapsed_time": "0:19:15", "remaining_time": "0:00:39", "throughput": 3119.15, "total_tokens": 3602880} +{"current_steps": 7330, "total_steps": 7577, "loss": 0.0423, "lr": 6.520174506107867e-09, "epoch": 0.9674013461792266, "percentage": 96.74, "elapsed_time": "0:19:15", "remaining_time": "0:00:38", "throughput": 3120.43, "total_tokens": 3605248} +{"current_steps": 7335, "total_steps": 7577, "loss": 0.0011, "lr": 6.260186442485494e-09, "epoch": 0.9680612379569751, "percentage": 96.81, "elapsed_time": "0:19:15", "remaining_time": "0:00:38", "throughput": 3121.87, "total_tokens": 3607808} +{"current_steps": 7340, "total_steps": 7577, "loss": 0.1108, "lr": 6.005471537915863e-09, "epoch": 0.9687211297347235, "percentage": 96.87, "elapsed_time": "0:19:15", "remaining_time": "0:00:37", "throughput": 3123.09, "total_tokens": 3610112} +{"current_steps": 7345, "total_steps": 7577, "loss": 0.0454, "lr": 5.756031144012685e-09, "epoch": 0.969381021512472, "percentage": 96.94, "elapsed_time": "0:19:16", "remaining_time": "0:00:36", "throughput": 3124.27, "total_tokens": 3612352} +{"current_steps": 7350, "total_steps": 7577, "loss": 0.1715, "lr": 5.511866584400837e-09, "epoch": 0.9700409132902204, "percentage": 97.0, "elapsed_time": "0:19:16", "remaining_time": "0:00:35", "throughput": 3125.65, "total_tokens": 3614848} +{"current_steps": 7355, "total_steps": 7577, "loss": 0.0017, "lr": 5.2729791547097e-09, "epoch": 0.9707008050679689, "percentage": 97.07, "elapsed_time": "0:19:16", "remaining_time": "0:00:34", "throughput": 3127.09, "total_tokens": 3617408} +{"current_steps": 7360, "total_steps": 7577, "loss": 0.0783, "lr": 5.039370122566389e-09, "epoch": 0.9713606968457174, "percentage": 97.14, "elapsed_time": "0:19:17", "remaining_time": "0:00:34", "throughput": 3128.53, "total_tokens": 3619968} +{"current_steps": 7365, "total_steps": 7577, "loss": 0.0965, "lr": 4.811040727588755e-09, "epoch": 0.9720205886234657, "percentage": 97.2, "elapsed_time": "0:19:17", "remaining_time": "0:00:33", "throughput": 3129.53, "total_tokens": 3622016} +{"current_steps": 7370, "total_steps": 7577, "loss": 0.1156, "lr": 4.58799218137873e-09, "epoch": 0.9726804804012142, "percentage": 97.27, "elapsed_time": "0:19:17", "remaining_time": "0:00:32", "throughput": 3130.64, "total_tokens": 3624192} +{"current_steps": 7375, "total_steps": 7577, "loss": 0.0009, "lr": 4.370225667516325e-09, "epoch": 0.9733403721789626, "percentage": 97.33, "elapsed_time": "0:19:17", "remaining_time": "0:00:31", "throughput": 3131.97, "total_tokens": 3626624} +{"current_steps": 7380, "total_steps": 7577, "loss": 0.1827, "lr": 4.157742341552861e-09, "epoch": 0.9740002639567111, "percentage": 97.4, "elapsed_time": "0:19:18", "remaining_time": "0:00:30", "throughput": 3133.18, "total_tokens": 3628928} +{"current_steps": 7385, "total_steps": 7577, "loss": 0.0786, "lr": 3.950543331005307e-09, "epoch": 0.9746601557344595, "percentage": 97.47, "elapsed_time": "0:19:18", "remaining_time": "0:00:30", "throughput": 3134.67, "total_tokens": 3631552} +{"current_steps": 7390, "total_steps": 7577, "loss": 0.0009, "lr": 3.748629735349839e-09, "epoch": 0.975320047512208, "percentage": 97.53, "elapsed_time": "0:19:18", "remaining_time": "0:00:29", "throughput": 3135.99, "total_tokens": 3633984} +{"current_steps": 7395, "total_steps": 7577, "loss": 0.1332, "lr": 3.552002626016293e-09, "epoch": 0.9759799392899564, "percentage": 97.6, "elapsed_time": "0:19:19", "remaining_time": "0:00:28", "throughput": 3137.16, "total_tokens": 3636224} +{"current_steps": 7400, "total_steps": 7577, "loss": 0.2453, "lr": 3.3606630463824947e-09, "epoch": 0.9766398310677049, "percentage": 97.66, "elapsed_time": "0:19:19", "remaining_time": "0:00:27", "throughput": 3138.48, "total_tokens": 3638656} +{"current_steps": 7405, "total_steps": 7577, "loss": 0.0011, "lr": 3.174612011768607e-09, "epoch": 0.9772997228454533, "percentage": 97.73, "elapsed_time": "0:19:19", "remaining_time": "0:00:26", "throughput": 3140.07, "total_tokens": 3641408} +{"current_steps": 7410, "total_steps": 7577, "loss": 0.1615, "lr": 2.9938505094316834e-09, "epoch": 0.9779596146232018, "percentage": 97.8, "elapsed_time": "0:19:19", "remaining_time": "0:00:26", "throughput": 3141.39, "total_tokens": 3643840} +{"current_steps": 7415, "total_steps": 7577, "loss": 0.0006, "lr": 2.8183794985605637e-09, "epoch": 0.9786195064009502, "percentage": 97.86, "elapsed_time": "0:19:20", "remaining_time": "0:00:25", "throughput": 3142.77, "total_tokens": 3646336} +{"current_steps": 7420, "total_steps": 7577, "loss": 0.0664, "lr": 2.6481999102707654e-09, "epoch": 0.9792793981786987, "percentage": 97.93, "elapsed_time": "0:19:20", "remaining_time": "0:00:24", "throughput": 3144.25, "total_tokens": 3648960} +{"current_steps": 7425, "total_steps": 7577, "loss": 0.0011, "lr": 2.4833126475994894e-09, "epoch": 0.9799392899564472, "percentage": 97.99, "elapsed_time": "0:19:20", "remaining_time": "0:00:23", "throughput": 3145.4, "total_tokens": 3651200} +{"current_steps": 7430, "total_steps": 7577, "loss": 0.0056, "lr": 2.3237185855008443e-09, "epoch": 0.9805991817341956, "percentage": 98.06, "elapsed_time": "0:19:21", "remaining_time": "0:00:22", "throughput": 3146.61, "total_tokens": 3653504} +{"current_steps": 7435, "total_steps": 7577, "loss": 0.2456, "lr": 2.1694185708414083e-09, "epoch": 0.9812590735119441, "percentage": 98.13, "elapsed_time": "0:19:21", "remaining_time": "0:00:22", "throughput": 3148.03, "total_tokens": 3656064} +{"current_steps": 7440, "total_steps": 7577, "loss": 0.2749, "lr": 2.0204134223952284e-09, "epoch": 0.9819189652896925, "percentage": 98.19, "elapsed_time": "0:19:21", "remaining_time": "0:00:21", "throughput": 3149.02, "total_tokens": 3658112} +{"current_steps": 7445, "total_steps": 7577, "loss": 0.1727, "lr": 1.87670393083994e-09, "epoch": 0.982578857067441, "percentage": 98.26, "elapsed_time": "0:19:21", "remaining_time": "0:00:20", "throughput": 3150.66, "total_tokens": 3660928} +{"current_steps": 7450, "total_steps": 7577, "loss": 0.0298, "lr": 1.7382908587525447e-09, "epoch": 0.9832387488451894, "percentage": 98.32, "elapsed_time": "0:19:22", "remaining_time": "0:00:19", "throughput": 3151.86, "total_tokens": 3663232} +{"current_steps": 7455, "total_steps": 7577, "loss": 0.0013, "lr": 1.6051749406049697e-09, "epoch": 0.9838986406229379, "percentage": 98.39, "elapsed_time": "0:19:22", "remaining_time": "0:00:19", "throughput": 3153.12, "total_tokens": 3665600} +{"current_steps": 7460, "total_steps": 7577, "loss": 0.0008, "lr": 1.4773568827607386e-09, "epoch": 0.9845585324006862, "percentage": 98.46, "elapsed_time": "0:19:22", "remaining_time": "0:00:18", "throughput": 3154.49, "total_tokens": 3668096} +{"current_steps": 7465, "total_steps": 7577, "loss": 0.0016, "lr": 1.354837363470529e-09, "epoch": 0.9852184241784347, "percentage": 98.52, "elapsed_time": "0:19:23", "remaining_time": "0:00:17", "throughput": 3155.91, "total_tokens": 3670656} +{"current_steps": 7470, "total_steps": 7577, "loss": 0.1466, "lr": 1.23761703286962e-09, "epoch": 0.9858783159561831, "percentage": 98.59, "elapsed_time": "0:19:23", "remaining_time": "0:00:16", "throughput": 3157.17, "total_tokens": 3673024} +{"current_steps": 7475, "total_steps": 7577, "loss": 0.0012, "lr": 1.1256965129730068e-09, "epoch": 0.9865382077339316, "percentage": 98.65, "elapsed_time": "0:19:23", "remaining_time": "0:00:15", "throughput": 3158.69, "total_tokens": 3675712} +{"current_steps": 7480, "total_steps": 7577, "loss": 0.1029, "lr": 1.0190763976734018e-09, "epoch": 0.9871980995116801, "percentage": 98.72, "elapsed_time": "0:19:23", "remaining_time": "0:00:15", "throughput": 3159.95, "total_tokens": 3678080} +{"current_steps": 7485, "total_steps": 7577, "loss": 0.1536, "lr": 9.177572527375721e-10, "epoch": 0.9878579912894285, "percentage": 98.79, "elapsed_time": "0:19:24", "remaining_time": "0:00:14", "throughput": 3161.21, "total_tokens": 3680448} +{"current_steps": 7490, "total_steps": 7577, "loss": 0.0019, "lr": 8.217396158030076e-10, "epoch": 0.988517883067177, "percentage": 98.85, "elapsed_time": "0:19:24", "remaining_time": "0:00:13", "throughput": 3162.42, "total_tokens": 3682752} +{"current_steps": 7495, "total_steps": 7577, "loss": 0.1517, "lr": 7.310239963755904e-10, "epoch": 0.9891777748449254, "percentage": 98.92, "elapsed_time": "0:19:24", "remaining_time": "0:00:12", "throughput": 3163.88, "total_tokens": 3685376} +{"current_steps": 7500, "total_steps": 7577, "loss": 0.0016, "lr": 6.456108758268186e-10, "epoch": 0.9898376666226739, "percentage": 98.98, "elapsed_time": "0:19:25", "remaining_time": "0:00:11", "throughput": 3165.13, "total_tokens": 3687744} +{"current_steps": 7505, "total_steps": 7577, "loss": 0.1517, "lr": 5.655007073909202e-10, "epoch": 0.9904975584004223, "percentage": 99.05, "elapsed_time": "0:19:25", "remaining_time": "0:00:11", "throughput": 3166.5, "total_tokens": 3690240} +{"current_steps": 7510, "total_steps": 7577, "loss": 0.0507, "lr": 4.906939161627432e-10, "epoch": 0.9911574501781708, "percentage": 99.12, "elapsed_time": "0:19:25", "remaining_time": "0:00:10", "throughput": 3167.86, "total_tokens": 3692736} +{"current_steps": 7515, "total_steps": 7577, "loss": 0.201, "lr": 4.2119089909542495e-10, "epoch": 0.9918173419559192, "percentage": 99.18, "elapsed_time": "0:19:25", "remaining_time": "0:00:09", "throughput": 3169.33, "total_tokens": 3695360} +{"current_steps": 7520, "total_steps": 7577, "loss": 0.0593, "lr": 3.569920249981706e-10, "epoch": 0.9924772337336677, "percentage": 99.25, "elapsed_time": "0:19:26", "remaining_time": "0:00:08", "throughput": 3170.69, "total_tokens": 3697856} +{"current_steps": 7525, "total_steps": 7577, "loss": 0.027, "lr": 2.980976345344777e-10, "epoch": 0.9931371255114161, "percentage": 99.31, "elapsed_time": "0:19:26", "remaining_time": "0:00:08", "throughput": 3171.94, "total_tokens": 3700224} +{"current_steps": 7530, "total_steps": 7577, "loss": 0.0772, "lr": 2.445080402202482e-10, "epoch": 0.9937970172891646, "percentage": 99.38, "elapsed_time": "0:19:26", "remaining_time": "0:00:07", "throughput": 3173.2, "total_tokens": 3702592} +{"current_steps": 7535, "total_steps": 7577, "loss": 0.1564, "lr": 1.962235264222345e-10, "epoch": 0.994456909066913, "percentage": 99.45, "elapsed_time": "0:19:27", "remaining_time": "0:00:06", "throughput": 3174.4, "total_tokens": 3704896} +{"current_steps": 7540, "total_steps": 7577, "loss": 0.0446, "lr": 1.5324434935615195e-10, "epoch": 0.9951168008446615, "percentage": 99.51, "elapsed_time": "0:19:27", "remaining_time": "0:00:05", "throughput": 3175.65, "total_tokens": 3707264} +{"current_steps": 7545, "total_steps": 7577, "loss": 0.1154, "lr": 1.1557073708579057e-10, "epoch": 0.99577669262241, "percentage": 99.58, "elapsed_time": "0:19:27", "remaining_time": "0:00:04", "throughput": 3177.06, "total_tokens": 3709824} +{"current_steps": 7550, "total_steps": 7577, "loss": 0.1286, "lr": 8.320288952168297e-11, "epoch": 0.9964365844001584, "percentage": 99.64, "elapsed_time": "0:19:27", "remaining_time": "0:00:04", "throughput": 3178.31, "total_tokens": 3712192} +{"current_steps": 7555, "total_steps": 7577, "loss": 0.0832, "lr": 5.614097841988297e-11, "epoch": 0.9970964761779069, "percentage": 99.71, "elapsed_time": "0:19:28", "remaining_time": "0:00:03", "throughput": 3179.82, "total_tokens": 3714880} +{"current_steps": 7560, "total_steps": 7577, "loss": 0.0311, "lr": 3.43851473808554e-11, "epoch": 0.9977563679556553, "percentage": 99.78, "elapsed_time": "0:19:28", "remaining_time": "0:00:02", "throughput": 3181.01, "total_tokens": 3717184} +{"current_steps": 7565, "total_steps": 7577, "loss": 0.0522, "lr": 1.7935511849587192e-11, "epoch": 0.9984162597334038, "percentage": 99.84, "elapsed_time": "0:19:28", "remaining_time": "0:00:01", "throughput": 3182.16, "total_tokens": 3719424} +{"current_steps": 7570, "total_steps": 7577, "loss": 0.1208, "lr": 6.792159113921947e-12, "epoch": 0.9990761515111521, "percentage": 99.91, "elapsed_time": "0:19:29", "remaining_time": "0:00:01", "throughput": 3183.52, "total_tokens": 3721920} +{"current_steps": 7575, "total_steps": 7577, "loss": 0.1169, "lr": 9.55148304560005e-13, "epoch": 0.9997360432889006, "percentage": 99.97, "elapsed_time": "0:19:29", "remaining_time": "0:00:00", "throughput": 3184.78, "total_tokens": 3724288} +{"current_steps": 7577, "total_steps": 7577, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:20:13", "remaining_time": "0:00:00", "throughput": 3069.72, "total_tokens": 3725120} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..6bafcf9 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,12335 @@ +{ + "best_global_step": 5306, + "best_metric": 0.09084735810756683, + "best_model_checkpoint": "saves_bts_preliminary/freeze/llama-3.2-1b-instruct/train_sst2_42_1779354537/checkpoint-5306", + "epoch": 1.0, + "eval_steps": 379, + "global_step": 7577, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006598917777484492, + "grad_norm": 442.1559753417969, + "learning_rate": 1.0554089709762531e-08, + "loss": 1.413, + "num_input_tokens_seen": 2240, + "step": 5 + }, + { + "epoch": 0.0013197835554968984, + "grad_norm": 437.0361328125, + "learning_rate": 2.3746701846965696e-08, + "loss": 1.5134, + "num_input_tokens_seen": 4672, + "step": 10 + }, + { + "epoch": 0.0019796753332453477, + "grad_norm": 441.0426330566406, + "learning_rate": 3.6939313984168866e-08, + "loss": 1.3995, + "num_input_tokens_seen": 7040, + "step": 15 + }, + { + "epoch": 0.002639567110993797, + "grad_norm": 481.140380859375, + "learning_rate": 5.013192612137203e-08, + "loss": 1.422, + "num_input_tokens_seen": 9600, + "step": 20 + }, + { + "epoch": 0.0032994588887422464, + "grad_norm": 432.22509765625, + "learning_rate": 6.33245382585752e-08, + "loss": 1.2976, + "num_input_tokens_seen": 12160, + "step": 25 + }, + { + "epoch": 0.0039593506664906955, + "grad_norm": 370.72210693359375, + "learning_rate": 7.651715039577835e-08, + "loss": 1.3834, + "num_input_tokens_seen": 14528, + "step": 30 + }, + { + "epoch": 0.004619242444239145, + "grad_norm": 356.2071533203125, + "learning_rate": 8.970976253298153e-08, + "loss": 1.1343, + "num_input_tokens_seen": 16768, + "step": 35 + }, + { + "epoch": 0.005279134221987594, + "grad_norm": 318.7434997558594, + "learning_rate": 1.0290237467018468e-07, + "loss": 1.1513, + "num_input_tokens_seen": 19264, + "step": 40 + }, + { + "epoch": 0.005939025999736044, + "grad_norm": 264.9164733886719, + "learning_rate": 1.1609498680738786e-07, + "loss": 0.8287, + "num_input_tokens_seen": 21632, + "step": 45 + }, + { + "epoch": 0.006598917777484493, + "grad_norm": 186.7604217529297, + "learning_rate": 1.29287598944591e-07, + "loss": 0.7425, + "num_input_tokens_seen": 24000, + "step": 50 + }, + { + "epoch": 0.007258809555232942, + "grad_norm": 162.783203125, + "learning_rate": 1.424802110817942e-07, + "loss": 0.7064, + "num_input_tokens_seen": 26496, + "step": 55 + }, + { + "epoch": 0.007918701332981391, + "grad_norm": 45.531829833984375, + "learning_rate": 1.5567282321899736e-07, + "loss": 0.3853, + "num_input_tokens_seen": 29120, + "step": 60 + }, + { + "epoch": 0.008578593110729841, + "grad_norm": 28.7725887298584, + "learning_rate": 1.688654353562005e-07, + "loss": 0.3076, + "num_input_tokens_seen": 31744, + "step": 65 + }, + { + "epoch": 0.00923848488847829, + "grad_norm": 51.50053024291992, + "learning_rate": 1.820580474934037e-07, + "loss": 0.2971, + "num_input_tokens_seen": 34176, + "step": 70 + }, + { + "epoch": 0.009898376666226739, + "grad_norm": 48.889060974121094, + "learning_rate": 1.9525065963060686e-07, + "loss": 0.3004, + "num_input_tokens_seen": 36864, + "step": 75 + }, + { + "epoch": 0.010558268443975187, + "grad_norm": 26.879812240600586, + "learning_rate": 2.0844327176781002e-07, + "loss": 0.2532, + "num_input_tokens_seen": 39424, + "step": 80 + }, + { + "epoch": 0.011218160221723637, + "grad_norm": 41.192012786865234, + "learning_rate": 2.2163588390501316e-07, + "loss": 0.2616, + "num_input_tokens_seen": 42112, + "step": 85 + }, + { + "epoch": 0.011878051999472087, + "grad_norm": 29.90664291381836, + "learning_rate": 2.3482849604221635e-07, + "loss": 0.2528, + "num_input_tokens_seen": 44544, + "step": 90 + }, + { + "epoch": 0.012537943777220536, + "grad_norm": 42.19837951660156, + "learning_rate": 2.480211081794195e-07, + "loss": 0.1904, + "num_input_tokens_seen": 47104, + "step": 95 + }, + { + "epoch": 0.013197835554968985, + "grad_norm": 23.84099769592285, + "learning_rate": 2.612137203166227e-07, + "loss": 0.1653, + "num_input_tokens_seen": 49664, + "step": 100 + }, + { + "epoch": 0.013857727332717434, + "grad_norm": 22.509811401367188, + "learning_rate": 2.744063324538258e-07, + "loss": 0.137, + "num_input_tokens_seen": 52352, + "step": 105 + }, + { + "epoch": 0.014517619110465884, + "grad_norm": 86.81343078613281, + "learning_rate": 2.8759894459102903e-07, + "loss": 0.102, + "num_input_tokens_seen": 54720, + "step": 110 + }, + { + "epoch": 0.015177510888214334, + "grad_norm": 43.25410461425781, + "learning_rate": 3.007915567282322e-07, + "loss": 0.1477, + "num_input_tokens_seen": 57152, + "step": 115 + }, + { + "epoch": 0.015837402665962782, + "grad_norm": 98.41868591308594, + "learning_rate": 3.139841688654353e-07, + "loss": 0.2005, + "num_input_tokens_seen": 59776, + "step": 120 + }, + { + "epoch": 0.01649729444371123, + "grad_norm": 23.211767196655273, + "learning_rate": 3.271767810026385e-07, + "loss": 0.1417, + "num_input_tokens_seen": 62464, + "step": 125 + }, + { + "epoch": 0.017157186221459682, + "grad_norm": 80.11978912353516, + "learning_rate": 3.403693931398417e-07, + "loss": 0.1226, + "num_input_tokens_seen": 65088, + "step": 130 + }, + { + "epoch": 0.01781707799920813, + "grad_norm": 74.83419036865234, + "learning_rate": 3.5356200527704485e-07, + "loss": 0.2123, + "num_input_tokens_seen": 67776, + "step": 135 + }, + { + "epoch": 0.01847696977695658, + "grad_norm": 67.47618865966797, + "learning_rate": 3.66754617414248e-07, + "loss": 0.2606, + "num_input_tokens_seen": 70400, + "step": 140 + }, + { + "epoch": 0.01913686155470503, + "grad_norm": 51.34063720703125, + "learning_rate": 3.7994722955145113e-07, + "loss": 0.1463, + "num_input_tokens_seen": 72704, + "step": 145 + }, + { + "epoch": 0.019796753332453478, + "grad_norm": 63.031131744384766, + "learning_rate": 3.9313984168865435e-07, + "loss": 0.346, + "num_input_tokens_seen": 75136, + "step": 150 + }, + { + "epoch": 0.020456645110201926, + "grad_norm": 25.44994354248047, + "learning_rate": 4.063324538258575e-07, + "loss": 0.0609, + "num_input_tokens_seen": 77632, + "step": 155 + }, + { + "epoch": 0.021116536887950375, + "grad_norm": 72.61922454833984, + "learning_rate": 4.195250659630606e-07, + "loss": 0.2492, + "num_input_tokens_seen": 80320, + "step": 160 + }, + { + "epoch": 0.021776428665698826, + "grad_norm": 107.4610824584961, + "learning_rate": 4.3271767810026384e-07, + "loss": 0.1542, + "num_input_tokens_seen": 82752, + "step": 165 + }, + { + "epoch": 0.022436320443447275, + "grad_norm": 148.21913146972656, + "learning_rate": 4.45910290237467e-07, + "loss": 0.3095, + "num_input_tokens_seen": 85248, + "step": 170 + }, + { + "epoch": 0.023096212221195723, + "grad_norm": 166.9718475341797, + "learning_rate": 4.5910290237467017e-07, + "loss": 0.2917, + "num_input_tokens_seen": 87872, + "step": 175 + }, + { + "epoch": 0.023756103998944175, + "grad_norm": 49.3593864440918, + "learning_rate": 4.7229551451187333e-07, + "loss": 0.1369, + "num_input_tokens_seen": 90368, + "step": 180 + }, + { + "epoch": 0.024415995776692623, + "grad_norm": 39.84811019897461, + "learning_rate": 4.854881266490765e-07, + "loss": 0.0624, + "num_input_tokens_seen": 92928, + "step": 185 + }, + { + "epoch": 0.02507588755444107, + "grad_norm": 61.82024383544922, + "learning_rate": 4.986807387862796e-07, + "loss": 0.1617, + "num_input_tokens_seen": 95296, + "step": 190 + }, + { + "epoch": 0.02573577933218952, + "grad_norm": 56.04179763793945, + "learning_rate": 5.118733509234829e-07, + "loss": 0.1049, + "num_input_tokens_seen": 97984, + "step": 195 + }, + { + "epoch": 0.02639567110993797, + "grad_norm": 102.43315124511719, + "learning_rate": 5.250659630606859e-07, + "loss": 0.1992, + "num_input_tokens_seen": 100352, + "step": 200 + }, + { + "epoch": 0.02705556288768642, + "grad_norm": 17.690580368041992, + "learning_rate": 5.382585751978892e-07, + "loss": 0.1434, + "num_input_tokens_seen": 102464, + "step": 205 + }, + { + "epoch": 0.027715454665434867, + "grad_norm": 14.600909233093262, + "learning_rate": 5.514511873350924e-07, + "loss": 0.1237, + "num_input_tokens_seen": 105088, + "step": 210 + }, + { + "epoch": 0.02837534644318332, + "grad_norm": 16.153587341308594, + "learning_rate": 5.646437994722954e-07, + "loss": 0.3024, + "num_input_tokens_seen": 107648, + "step": 215 + }, + { + "epoch": 0.029035238220931767, + "grad_norm": 108.86598205566406, + "learning_rate": 5.778364116094987e-07, + "loss": 0.2383, + "num_input_tokens_seen": 110144, + "step": 220 + }, + { + "epoch": 0.029695129998680216, + "grad_norm": 30.326353073120117, + "learning_rate": 5.910290237467019e-07, + "loss": 0.1367, + "num_input_tokens_seen": 112576, + "step": 225 + }, + { + "epoch": 0.030355021776428667, + "grad_norm": 56.80265808105469, + "learning_rate": 6.042216358839049e-07, + "loss": 0.2351, + "num_input_tokens_seen": 115264, + "step": 230 + }, + { + "epoch": 0.031014913554177116, + "grad_norm": 17.4112606048584, + "learning_rate": 6.174142480211082e-07, + "loss": 0.0839, + "num_input_tokens_seen": 117888, + "step": 235 + }, + { + "epoch": 0.031674805331925564, + "grad_norm": 34.93491744995117, + "learning_rate": 6.306068601583114e-07, + "loss": 0.0189, + "num_input_tokens_seen": 120320, + "step": 240 + }, + { + "epoch": 0.032334697109674015, + "grad_norm": 89.36637115478516, + "learning_rate": 6.437994722955144e-07, + "loss": 0.1832, + "num_input_tokens_seen": 122688, + "step": 245 + }, + { + "epoch": 0.03299458888742246, + "grad_norm": 80.80409240722656, + "learning_rate": 6.569920844327177e-07, + "loss": 0.2718, + "num_input_tokens_seen": 125248, + "step": 250 + }, + { + "epoch": 0.03365448066517091, + "grad_norm": 89.6211166381836, + "learning_rate": 6.701846965699208e-07, + "loss": 0.2037, + "num_input_tokens_seen": 127680, + "step": 255 + }, + { + "epoch": 0.034314372442919364, + "grad_norm": 17.70661735534668, + "learning_rate": 6.833773087071239e-07, + "loss": 0.0833, + "num_input_tokens_seen": 130496, + "step": 260 + }, + { + "epoch": 0.03497426422066781, + "grad_norm": 31.946949005126953, + "learning_rate": 6.965699208443272e-07, + "loss": 0.19, + "num_input_tokens_seen": 132992, + "step": 265 + }, + { + "epoch": 0.03563415599841626, + "grad_norm": 58.67094802856445, + "learning_rate": 7.097625329815303e-07, + "loss": 0.1429, + "num_input_tokens_seen": 135040, + "step": 270 + }, + { + "epoch": 0.03629404777616471, + "grad_norm": 67.81129455566406, + "learning_rate": 7.229551451187335e-07, + "loss": 0.2488, + "num_input_tokens_seen": 137600, + "step": 275 + }, + { + "epoch": 0.03695393955391316, + "grad_norm": 61.778663635253906, + "learning_rate": 7.361477572559367e-07, + "loss": 0.1125, + "num_input_tokens_seen": 139904, + "step": 280 + }, + { + "epoch": 0.03761383133166161, + "grad_norm": 15.433332443237305, + "learning_rate": 7.493403693931398e-07, + "loss": 0.1465, + "num_input_tokens_seen": 142016, + "step": 285 + }, + { + "epoch": 0.03827372310941006, + "grad_norm": 8.442534446716309, + "learning_rate": 7.62532981530343e-07, + "loss": 0.0092, + "num_input_tokens_seen": 144576, + "step": 290 + }, + { + "epoch": 0.038933614887158505, + "grad_norm": 112.9028091430664, + "learning_rate": 7.757255936675461e-07, + "loss": 0.0894, + "num_input_tokens_seen": 146880, + "step": 295 + }, + { + "epoch": 0.039593506664906956, + "grad_norm": 95.21463775634766, + "learning_rate": 7.889182058047493e-07, + "loss": 0.3917, + "num_input_tokens_seen": 149184, + "step": 300 + }, + { + "epoch": 0.0402533984426554, + "grad_norm": 0.18586313724517822, + "learning_rate": 8.021108179419525e-07, + "loss": 0.2216, + "num_input_tokens_seen": 151296, + "step": 305 + }, + { + "epoch": 0.04091329022040385, + "grad_norm": 79.36148071289062, + "learning_rate": 8.153034300791555e-07, + "loss": 0.2024, + "num_input_tokens_seen": 153664, + "step": 310 + }, + { + "epoch": 0.041573181998152305, + "grad_norm": 55.61602020263672, + "learning_rate": 8.284960422163588e-07, + "loss": 0.2624, + "num_input_tokens_seen": 156032, + "step": 315 + }, + { + "epoch": 0.04223307377590075, + "grad_norm": 0.2602592408657074, + "learning_rate": 8.41688654353562e-07, + "loss": 0.2276, + "num_input_tokens_seen": 158528, + "step": 320 + }, + { + "epoch": 0.0428929655536492, + "grad_norm": 18.09028434753418, + "learning_rate": 8.54881266490765e-07, + "loss": 0.1227, + "num_input_tokens_seen": 160704, + "step": 325 + }, + { + "epoch": 0.04355285733139765, + "grad_norm": 9.590105056762695, + "learning_rate": 8.680738786279683e-07, + "loss": 0.1889, + "num_input_tokens_seen": 163072, + "step": 330 + }, + { + "epoch": 0.0442127491091461, + "grad_norm": 17.7542724609375, + "learning_rate": 8.812664907651715e-07, + "loss": 0.1251, + "num_input_tokens_seen": 165568, + "step": 335 + }, + { + "epoch": 0.04487264088689455, + "grad_norm": 0.6858607530593872, + "learning_rate": 8.944591029023745e-07, + "loss": 0.2737, + "num_input_tokens_seen": 167936, + "step": 340 + }, + { + "epoch": 0.045532532664643, + "grad_norm": 168.79275512695312, + "learning_rate": 9.076517150395778e-07, + "loss": 0.0704, + "num_input_tokens_seen": 170176, + "step": 345 + }, + { + "epoch": 0.046192424442391446, + "grad_norm": 137.41461181640625, + "learning_rate": 9.20844327176781e-07, + "loss": 0.1521, + "num_input_tokens_seen": 172352, + "step": 350 + }, + { + "epoch": 0.0468523162201399, + "grad_norm": 42.20560073852539, + "learning_rate": 9.340369393139841e-07, + "loss": 0.2593, + "num_input_tokens_seen": 175168, + "step": 355 + }, + { + "epoch": 0.04751220799788835, + "grad_norm": 1.3534170389175415, + "learning_rate": 9.472295514511873e-07, + "loss": 0.1364, + "num_input_tokens_seen": 177856, + "step": 360 + }, + { + "epoch": 0.048172099775636794, + "grad_norm": 38.15507125854492, + "learning_rate": 9.604221635883904e-07, + "loss": 0.3046, + "num_input_tokens_seen": 180160, + "step": 365 + }, + { + "epoch": 0.048831991553385246, + "grad_norm": 143.81930541992188, + "learning_rate": 9.736147757255936e-07, + "loss": 0.1873, + "num_input_tokens_seen": 182784, + "step": 370 + }, + { + "epoch": 0.0494918833311337, + "grad_norm": 0.975854754447937, + "learning_rate": 9.86807387862797e-07, + "loss": 0.0584, + "num_input_tokens_seen": 185024, + "step": 375 + }, + { + "epoch": 0.05001979675333245, + "eval_loss": 0.17531457543373108, + "eval_runtime": 7.4576, + "eval_samples_per_second": 903.11, + "eval_steps_per_second": 112.905, + "num_input_tokens_seen": 187072, + "step": 379 + }, + { + "epoch": 0.05015177510888214, + "grad_norm": 59.641075134277344, + "learning_rate": 1e-06, + "loss": 0.1671, + "num_input_tokens_seen": 187712, + "step": 380 + }, + { + "epoch": 0.050811666886630594, + "grad_norm": 49.6882209777832, + "learning_rate": 1.0131926121372032e-06, + "loss": 0.2583, + "num_input_tokens_seen": 190400, + "step": 385 + }, + { + "epoch": 0.05147155866437904, + "grad_norm": 53.95302963256836, + "learning_rate": 1.0263852242744063e-06, + "loss": 0.1182, + "num_input_tokens_seen": 193280, + "step": 390 + }, + { + "epoch": 0.05213145044212749, + "grad_norm": 30.752676010131836, + "learning_rate": 1.0395778364116096e-06, + "loss": 0.1517, + "num_input_tokens_seen": 195584, + "step": 395 + }, + { + "epoch": 0.05279134221987594, + "grad_norm": 14.899801254272461, + "learning_rate": 1.0527704485488126e-06, + "loss": 0.0928, + "num_input_tokens_seen": 198208, + "step": 400 + }, + { + "epoch": 0.05345123399762439, + "grad_norm": 85.27983093261719, + "learning_rate": 1.0659630606860157e-06, + "loss": 0.2377, + "num_input_tokens_seen": 200704, + "step": 405 + }, + { + "epoch": 0.05411112577537284, + "grad_norm": 81.29493713378906, + "learning_rate": 1.079155672823219e-06, + "loss": 0.2115, + "num_input_tokens_seen": 203136, + "step": 410 + }, + { + "epoch": 0.05477101755312129, + "grad_norm": 64.4339370727539, + "learning_rate": 1.0923482849604222e-06, + "loss": 0.1501, + "num_input_tokens_seen": 205504, + "step": 415 + }, + { + "epoch": 0.055430909330869735, + "grad_norm": 108.44839477539062, + "learning_rate": 1.1055408970976253e-06, + "loss": 0.2442, + "num_input_tokens_seen": 208000, + "step": 420 + }, + { + "epoch": 0.056090801108618187, + "grad_norm": 229.58607482910156, + "learning_rate": 1.1187335092348285e-06, + "loss": 0.0951, + "num_input_tokens_seen": 210560, + "step": 425 + }, + { + "epoch": 0.05675069288636664, + "grad_norm": 34.30244827270508, + "learning_rate": 1.1319261213720316e-06, + "loss": 0.1749, + "num_input_tokens_seen": 213056, + "step": 430 + }, + { + "epoch": 0.05741058466411508, + "grad_norm": 48.8880500793457, + "learning_rate": 1.1451187335092347e-06, + "loss": 0.1071, + "num_input_tokens_seen": 215296, + "step": 435 + }, + { + "epoch": 0.058070476441863535, + "grad_norm": 0.0803152322769165, + "learning_rate": 1.158311345646438e-06, + "loss": 0.0089, + "num_input_tokens_seen": 217472, + "step": 440 + }, + { + "epoch": 0.058730368219611986, + "grad_norm": 173.73971557617188, + "learning_rate": 1.1715039577836412e-06, + "loss": 0.0408, + "num_input_tokens_seen": 219968, + "step": 445 + }, + { + "epoch": 0.05939025999736043, + "grad_norm": 0.06401122361421585, + "learning_rate": 1.1846965699208443e-06, + "loss": 0.0381, + "num_input_tokens_seen": 222592, + "step": 450 + }, + { + "epoch": 0.06005015177510888, + "grad_norm": 118.57979583740234, + "learning_rate": 1.1978891820580475e-06, + "loss": 0.5112, + "num_input_tokens_seen": 224768, + "step": 455 + }, + { + "epoch": 0.060710043552857335, + "grad_norm": 3.2848386764526367, + "learning_rate": 1.2110817941952508e-06, + "loss": 0.2367, + "num_input_tokens_seen": 227264, + "step": 460 + }, + { + "epoch": 0.06136993533060578, + "grad_norm": 39.09904861450195, + "learning_rate": 1.2242744063324536e-06, + "loss": 0.2476, + "num_input_tokens_seen": 229760, + "step": 465 + }, + { + "epoch": 0.06202982710835423, + "grad_norm": 75.33295440673828, + "learning_rate": 1.237467018469657e-06, + "loss": 0.1846, + "num_input_tokens_seen": 232000, + "step": 470 + }, + { + "epoch": 0.06268971888610268, + "grad_norm": 1.241437315940857, + "learning_rate": 1.2506596306068602e-06, + "loss": 0.1636, + "num_input_tokens_seen": 234176, + "step": 475 + }, + { + "epoch": 0.06334961066385113, + "grad_norm": 0.22346267104148865, + "learning_rate": 1.2638522427440632e-06, + "loss": 0.1056, + "num_input_tokens_seen": 236736, + "step": 480 + }, + { + "epoch": 0.06400950244159957, + "grad_norm": 0.07724567502737045, + "learning_rate": 1.2770448548812665e-06, + "loss": 0.0032, + "num_input_tokens_seen": 239104, + "step": 485 + }, + { + "epoch": 0.06466939421934803, + "grad_norm": 162.94638061523438, + "learning_rate": 1.2902374670184698e-06, + "loss": 0.4157, + "num_input_tokens_seen": 241600, + "step": 490 + }, + { + "epoch": 0.06532928599709648, + "grad_norm": 143.5639190673828, + "learning_rate": 1.3034300791556726e-06, + "loss": 0.3641, + "num_input_tokens_seen": 244160, + "step": 495 + }, + { + "epoch": 0.06598917777484492, + "grad_norm": 0.8768963813781738, + "learning_rate": 1.316622691292876e-06, + "loss": 0.175, + "num_input_tokens_seen": 246464, + "step": 500 + }, + { + "epoch": 0.06664906955259338, + "grad_norm": 1.546777606010437, + "learning_rate": 1.3298153034300792e-06, + "loss": 0.0893, + "num_input_tokens_seen": 248960, + "step": 505 + }, + { + "epoch": 0.06730896133034182, + "grad_norm": 82.6065902709961, + "learning_rate": 1.3430079155672822e-06, + "loss": 0.1824, + "num_input_tokens_seen": 251392, + "step": 510 + }, + { + "epoch": 0.06796885310809027, + "grad_norm": 29.751699447631836, + "learning_rate": 1.3562005277044855e-06, + "loss": 0.2085, + "num_input_tokens_seen": 253888, + "step": 515 + }, + { + "epoch": 0.06862874488583873, + "grad_norm": 26.04964828491211, + "learning_rate": 1.3693931398416888e-06, + "loss": 0.1502, + "num_input_tokens_seen": 256384, + "step": 520 + }, + { + "epoch": 0.06928863666358717, + "grad_norm": 60.13207244873047, + "learning_rate": 1.3825857519788916e-06, + "loss": 0.1721, + "num_input_tokens_seen": 258496, + "step": 525 + }, + { + "epoch": 0.06994852844133562, + "grad_norm": 0.15003204345703125, + "learning_rate": 1.3957783641160949e-06, + "loss": 0.0959, + "num_input_tokens_seen": 260864, + "step": 530 + }, + { + "epoch": 0.07060842021908408, + "grad_norm": 19.173173904418945, + "learning_rate": 1.4089709762532982e-06, + "loss": 0.0484, + "num_input_tokens_seen": 263360, + "step": 535 + }, + { + "epoch": 0.07126831199683252, + "grad_norm": 233.016357421875, + "learning_rate": 1.4221635883905012e-06, + "loss": 0.306, + "num_input_tokens_seen": 266112, + "step": 540 + }, + { + "epoch": 0.07192820377458096, + "grad_norm": 39.65471267700195, + "learning_rate": 1.4353562005277045e-06, + "loss": 0.2425, + "num_input_tokens_seen": 268544, + "step": 545 + }, + { + "epoch": 0.07258809555232942, + "grad_norm": 0.22139348089694977, + "learning_rate": 1.4485488126649078e-06, + "loss": 0.4216, + "num_input_tokens_seen": 270912, + "step": 550 + }, + { + "epoch": 0.07324798733007787, + "grad_norm": 0.2836262285709381, + "learning_rate": 1.4617414248021108e-06, + "loss": 0.0023, + "num_input_tokens_seen": 273664, + "step": 555 + }, + { + "epoch": 0.07390787910782631, + "grad_norm": 91.80794525146484, + "learning_rate": 1.4749340369393139e-06, + "loss": 0.1737, + "num_input_tokens_seen": 276160, + "step": 560 + }, + { + "epoch": 0.07456777088557477, + "grad_norm": 0.3804304897785187, + "learning_rate": 1.4881266490765171e-06, + "loss": 0.0057, + "num_input_tokens_seen": 278784, + "step": 565 + }, + { + "epoch": 0.07522766266332322, + "grad_norm": 25.789026260375977, + "learning_rate": 1.5013192612137202e-06, + "loss": 0.2339, + "num_input_tokens_seen": 281152, + "step": 570 + }, + { + "epoch": 0.07588755444107166, + "grad_norm": 56.50286865234375, + "learning_rate": 1.5145118733509235e-06, + "loss": 0.2223, + "num_input_tokens_seen": 283456, + "step": 575 + }, + { + "epoch": 0.07654744621882012, + "grad_norm": 3.2999558448791504, + "learning_rate": 1.5277044854881265e-06, + "loss": 0.0743, + "num_input_tokens_seen": 286016, + "step": 580 + }, + { + "epoch": 0.07720733799656856, + "grad_norm": 0.23049865663051605, + "learning_rate": 1.5408970976253298e-06, + "loss": 0.1493, + "num_input_tokens_seen": 288448, + "step": 585 + }, + { + "epoch": 0.07786722977431701, + "grad_norm": 0.022031376138329506, + "learning_rate": 1.5540897097625329e-06, + "loss": 0.1879, + "num_input_tokens_seen": 290816, + "step": 590 + }, + { + "epoch": 0.07852712155206547, + "grad_norm": 71.99144744873047, + "learning_rate": 1.567282321899736e-06, + "loss": 0.2187, + "num_input_tokens_seen": 293504, + "step": 595 + }, + { + "epoch": 0.07918701332981391, + "grad_norm": 0.02517612837255001, + "learning_rate": 1.5804749340369392e-06, + "loss": 0.1335, + "num_input_tokens_seen": 295744, + "step": 600 + }, + { + "epoch": 0.07984690510756236, + "grad_norm": 0.04881107434630394, + "learning_rate": 1.5936675461741425e-06, + "loss": 0.0816, + "num_input_tokens_seen": 298112, + "step": 605 + }, + { + "epoch": 0.0805067968853108, + "grad_norm": 197.50973510742188, + "learning_rate": 1.6068601583113455e-06, + "loss": 0.1134, + "num_input_tokens_seen": 300608, + "step": 610 + }, + { + "epoch": 0.08116668866305926, + "grad_norm": 0.06382615864276886, + "learning_rate": 1.6200527704485488e-06, + "loss": 0.0047, + "num_input_tokens_seen": 303360, + "step": 615 + }, + { + "epoch": 0.0818265804408077, + "grad_norm": 9.255777359008789, + "learning_rate": 1.633245382585752e-06, + "loss": 0.0712, + "num_input_tokens_seen": 305920, + "step": 620 + }, + { + "epoch": 0.08248647221855615, + "grad_norm": 11.119955062866211, + "learning_rate": 1.646437994722955e-06, + "loss": 0.0795, + "num_input_tokens_seen": 308416, + "step": 625 + }, + { + "epoch": 0.08314636399630461, + "grad_norm": 0.05398223549127579, + "learning_rate": 1.6596306068601582e-06, + "loss": 0.1324, + "num_input_tokens_seen": 310848, + "step": 630 + }, + { + "epoch": 0.08380625577405305, + "grad_norm": 55.00618362426758, + "learning_rate": 1.6728232189973614e-06, + "loss": 0.2123, + "num_input_tokens_seen": 313408, + "step": 635 + }, + { + "epoch": 0.0844661475518015, + "grad_norm": 111.69770050048828, + "learning_rate": 1.6860158311345645e-06, + "loss": 0.1099, + "num_input_tokens_seen": 315904, + "step": 640 + }, + { + "epoch": 0.08512603932954996, + "grad_norm": 48.804962158203125, + "learning_rate": 1.6992084432717678e-06, + "loss": 0.2301, + "num_input_tokens_seen": 318080, + "step": 645 + }, + { + "epoch": 0.0857859311072984, + "grad_norm": 6.783302307128906, + "learning_rate": 1.712401055408971e-06, + "loss": 0.0621, + "num_input_tokens_seen": 320256, + "step": 650 + }, + { + "epoch": 0.08644582288504685, + "grad_norm": 0.6253184676170349, + "learning_rate": 1.7255936675461739e-06, + "loss": 0.0199, + "num_input_tokens_seen": 322496, + "step": 655 + }, + { + "epoch": 0.0871057146627953, + "grad_norm": 257.97125244140625, + "learning_rate": 1.7387862796833772e-06, + "loss": 0.1723, + "num_input_tokens_seen": 325120, + "step": 660 + }, + { + "epoch": 0.08776560644054375, + "grad_norm": 29.855276107788086, + "learning_rate": 1.7519788918205804e-06, + "loss": 0.0485, + "num_input_tokens_seen": 327296, + "step": 665 + }, + { + "epoch": 0.0884254982182922, + "grad_norm": 42.55568313598633, + "learning_rate": 1.7651715039577835e-06, + "loss": 0.4327, + "num_input_tokens_seen": 329664, + "step": 670 + }, + { + "epoch": 0.08908538999604065, + "grad_norm": 23.53718376159668, + "learning_rate": 1.7783641160949868e-06, + "loss": 0.0918, + "num_input_tokens_seen": 332416, + "step": 675 + }, + { + "epoch": 0.0897452817737891, + "grad_norm": 0.3492559790611267, + "learning_rate": 1.79155672823219e-06, + "loss": 0.0255, + "num_input_tokens_seen": 334976, + "step": 680 + }, + { + "epoch": 0.09040517355153754, + "grad_norm": 0.0223238542675972, + "learning_rate": 1.8047493403693929e-06, + "loss": 0.0856, + "num_input_tokens_seen": 337472, + "step": 685 + }, + { + "epoch": 0.091065065329286, + "grad_norm": 8.461647033691406, + "learning_rate": 1.8179419525065961e-06, + "loss": 0.1861, + "num_input_tokens_seen": 339904, + "step": 690 + }, + { + "epoch": 0.09172495710703445, + "grad_norm": 131.66806030273438, + "learning_rate": 1.8311345646437994e-06, + "loss": 0.1639, + "num_input_tokens_seen": 342272, + "step": 695 + }, + { + "epoch": 0.09238484888478289, + "grad_norm": 0.0658893883228302, + "learning_rate": 1.8443271767810025e-06, + "loss": 0.1908, + "num_input_tokens_seen": 344640, + "step": 700 + }, + { + "epoch": 0.09304474066253135, + "grad_norm": 0.02002323418855667, + "learning_rate": 1.8575197889182057e-06, + "loss": 0.0427, + "num_input_tokens_seen": 347072, + "step": 705 + }, + { + "epoch": 0.0937046324402798, + "grad_norm": 0.023283669725060463, + "learning_rate": 1.870712401055409e-06, + "loss": 0.194, + "num_input_tokens_seen": 349696, + "step": 710 + }, + { + "epoch": 0.09436452421802824, + "grad_norm": 12.705941200256348, + "learning_rate": 1.883905013192612e-06, + "loss": 0.0821, + "num_input_tokens_seen": 352256, + "step": 715 + }, + { + "epoch": 0.0950244159957767, + "grad_norm": 0.16988928616046906, + "learning_rate": 1.8970976253298151e-06, + "loss": 0.1312, + "num_input_tokens_seen": 355008, + "step": 720 + }, + { + "epoch": 0.09568430777352514, + "grad_norm": 82.4719009399414, + "learning_rate": 1.9102902374670186e-06, + "loss": 0.2885, + "num_input_tokens_seen": 357376, + "step": 725 + }, + { + "epoch": 0.09634419955127359, + "grad_norm": 6.462850093841553, + "learning_rate": 1.9234828496042215e-06, + "loss": 0.1712, + "num_input_tokens_seen": 359680, + "step": 730 + }, + { + "epoch": 0.09700409132902205, + "grad_norm": 2.923388957977295, + "learning_rate": 1.9366754617414247e-06, + "loss": 0.1537, + "num_input_tokens_seen": 362176, + "step": 735 + }, + { + "epoch": 0.09766398310677049, + "grad_norm": 104.2777328491211, + "learning_rate": 1.949868073878628e-06, + "loss": 0.2028, + "num_input_tokens_seen": 365056, + "step": 740 + }, + { + "epoch": 0.09832387488451894, + "grad_norm": 2.3851282596588135, + "learning_rate": 1.963060686015831e-06, + "loss": 0.2106, + "num_input_tokens_seen": 367488, + "step": 745 + }, + { + "epoch": 0.0989837666622674, + "grad_norm": 1.8862087726593018, + "learning_rate": 1.976253298153034e-06, + "loss": 0.2852, + "num_input_tokens_seen": 369792, + "step": 750 + }, + { + "epoch": 0.09964365844001584, + "grad_norm": 102.61363220214844, + "learning_rate": 1.9894459102902374e-06, + "loss": 0.1154, + "num_input_tokens_seen": 372160, + "step": 755 + }, + { + "epoch": 0.1000395935066649, + "eval_loss": 0.129482701420784, + "eval_runtime": 7.7189, + "eval_samples_per_second": 872.532, + "eval_steps_per_second": 109.083, + "num_input_tokens_seen": 373504, + "step": 758 + }, + { + "epoch": 0.10030355021776428, + "grad_norm": 4.566295146942139, + "learning_rate": 1.9999998938723955e-06, + "loss": 0.0874, + "num_input_tokens_seen": 374272, + "step": 760 + }, + { + "epoch": 0.10096344199551274, + "grad_norm": 25.750286102294922, + "learning_rate": 1.9999961794086063e-06, + "loss": 0.0774, + "num_input_tokens_seen": 376704, + "step": 765 + }, + { + "epoch": 0.10162333377326119, + "grad_norm": 149.0970458984375, + "learning_rate": 1.999987158587122e-06, + "loss": 0.2165, + "num_input_tokens_seen": 379136, + "step": 770 + }, + { + "epoch": 0.10228322555100963, + "grad_norm": 47.778255462646484, + "learning_rate": 1.9999728314558114e-06, + "loss": 0.1505, + "num_input_tokens_seen": 381568, + "step": 775 + }, + { + "epoch": 0.10294311732875808, + "grad_norm": 0.1281862109899521, + "learning_rate": 1.9999531980906988e-06, + "loss": 0.2297, + "num_input_tokens_seen": 384128, + "step": 780 + }, + { + "epoch": 0.10360300910650654, + "grad_norm": 105.48400115966797, + "learning_rate": 1.999928258595967e-06, + "loss": 0.0893, + "num_input_tokens_seen": 386304, + "step": 785 + }, + { + "epoch": 0.10426290088425498, + "grad_norm": 16.267196655273438, + "learning_rate": 1.9998980131039534e-06, + "loss": 0.2538, + "num_input_tokens_seen": 388864, + "step": 790 + }, + { + "epoch": 0.10492279266200343, + "grad_norm": 34.18339920043945, + "learning_rate": 1.999862461775153e-06, + "loss": 0.0914, + "num_input_tokens_seen": 391104, + "step": 795 + }, + { + "epoch": 0.10558268443975188, + "grad_norm": 14.670069694519043, + "learning_rate": 1.999821604798214e-06, + "loss": 0.1431, + "num_input_tokens_seen": 393856, + "step": 800 + }, + { + "epoch": 0.10624257621750033, + "grad_norm": 32.27194595336914, + "learning_rate": 1.999775442389939e-06, + "loss": 0.3214, + "num_input_tokens_seen": 396352, + "step": 805 + }, + { + "epoch": 0.10690246799524877, + "grad_norm": 1.3998618125915527, + "learning_rate": 1.9997239747952843e-06, + "loss": 0.1422, + "num_input_tokens_seen": 398592, + "step": 810 + }, + { + "epoch": 0.10756235977299723, + "grad_norm": 177.2610321044922, + "learning_rate": 1.9996672022873546e-06, + "loss": 0.0609, + "num_input_tokens_seen": 401088, + "step": 815 + }, + { + "epoch": 0.10822225155074568, + "grad_norm": 182.47579956054688, + "learning_rate": 1.9996051251674073e-06, + "loss": 0.0726, + "num_input_tokens_seen": 403456, + "step": 820 + }, + { + "epoch": 0.10888214332849412, + "grad_norm": 31.635814666748047, + "learning_rate": 1.999537743764847e-06, + "loss": 0.1602, + "num_input_tokens_seen": 405696, + "step": 825 + }, + { + "epoch": 0.10954203510624258, + "grad_norm": 20.697343826293945, + "learning_rate": 1.999465058437225e-06, + "loss": 0.4649, + "num_input_tokens_seen": 408128, + "step": 830 + }, + { + "epoch": 0.11020192688399102, + "grad_norm": 0.6629725694656372, + "learning_rate": 1.9993870695702364e-06, + "loss": 0.0112, + "num_input_tokens_seen": 411008, + "step": 835 + }, + { + "epoch": 0.11086181866173947, + "grad_norm": 247.9231719970703, + "learning_rate": 1.9993037775777206e-06, + "loss": 0.3035, + "num_input_tokens_seen": 413312, + "step": 840 + }, + { + "epoch": 0.11152171043948793, + "grad_norm": 0.9605908989906311, + "learning_rate": 1.999215182901656e-06, + "loss": 0.1141, + "num_input_tokens_seen": 415616, + "step": 845 + }, + { + "epoch": 0.11218160221723637, + "grad_norm": 75.42913055419922, + "learning_rate": 1.9991212860121587e-06, + "loss": 0.1391, + "num_input_tokens_seen": 418368, + "step": 850 + }, + { + "epoch": 0.11284149399498482, + "grad_norm": 0.684021532535553, + "learning_rate": 1.999022087407482e-06, + "loss": 0.0502, + "num_input_tokens_seen": 420864, + "step": 855 + }, + { + "epoch": 0.11350138577273328, + "grad_norm": 61.68302536010742, + "learning_rate": 1.998917587614011e-06, + "loss": 0.3102, + "num_input_tokens_seen": 423040, + "step": 860 + }, + { + "epoch": 0.11416127755048172, + "grad_norm": 26.822439193725586, + "learning_rate": 1.9988077871862615e-06, + "loss": 0.3563, + "num_input_tokens_seen": 425344, + "step": 865 + }, + { + "epoch": 0.11482116932823017, + "grad_norm": 1.1649112701416016, + "learning_rate": 1.9986926867068752e-06, + "loss": 0.0052, + "num_input_tokens_seen": 427968, + "step": 870 + }, + { + "epoch": 0.11548106110597862, + "grad_norm": 0.3206559121608734, + "learning_rate": 1.998572286786619e-06, + "loss": 0.2265, + "num_input_tokens_seen": 430592, + "step": 875 + }, + { + "epoch": 0.11614095288372707, + "grad_norm": 51.00387954711914, + "learning_rate": 1.9984465880643807e-06, + "loss": 0.295, + "num_input_tokens_seen": 433152, + "step": 880 + }, + { + "epoch": 0.11680084466147551, + "grad_norm": 94.77568817138672, + "learning_rate": 1.998315591207165e-06, + "loss": 0.0961, + "num_input_tokens_seen": 435456, + "step": 885 + }, + { + "epoch": 0.11746073643922397, + "grad_norm": 14.036933898925781, + "learning_rate": 1.9981792969100912e-06, + "loss": 0.1703, + "num_input_tokens_seen": 438080, + "step": 890 + }, + { + "epoch": 0.11812062821697242, + "grad_norm": 8.309388160705566, + "learning_rate": 1.9980377058963875e-06, + "loss": 0.2036, + "num_input_tokens_seen": 440640, + "step": 895 + }, + { + "epoch": 0.11878051999472086, + "grad_norm": 2.4462878704071045, + "learning_rate": 1.99789081891739e-06, + "loss": 0.0225, + "num_input_tokens_seen": 443008, + "step": 900 + }, + { + "epoch": 0.11944041177246932, + "grad_norm": 77.5080337524414, + "learning_rate": 1.997738636752536e-06, + "loss": 0.2203, + "num_input_tokens_seen": 445312, + "step": 905 + }, + { + "epoch": 0.12010030355021777, + "grad_norm": 0.06471412628889084, + "learning_rate": 1.9975811602093624e-06, + "loss": 0.0016, + "num_input_tokens_seen": 447680, + "step": 910 + }, + { + "epoch": 0.12076019532796621, + "grad_norm": 123.2421875, + "learning_rate": 1.9974183901234984e-06, + "loss": 0.3289, + "num_input_tokens_seen": 450368, + "step": 915 + }, + { + "epoch": 0.12142008710571467, + "grad_norm": 5.337376594543457, + "learning_rate": 1.997250327358664e-06, + "loss": 0.3008, + "num_input_tokens_seen": 452800, + "step": 920 + }, + { + "epoch": 0.12207997888346311, + "grad_norm": 15.385493278503418, + "learning_rate": 1.997076972806664e-06, + "loss": 0.1352, + "num_input_tokens_seen": 455744, + "step": 925 + }, + { + "epoch": 0.12273987066121156, + "grad_norm": 60.07589340209961, + "learning_rate": 1.9968983273873827e-06, + "loss": 0.2869, + "num_input_tokens_seen": 458176, + "step": 930 + }, + { + "epoch": 0.12339976243896002, + "grad_norm": 1.007535457611084, + "learning_rate": 1.99671439204878e-06, + "loss": 0.1694, + "num_input_tokens_seen": 460480, + "step": 935 + }, + { + "epoch": 0.12405965421670846, + "grad_norm": 0.9021179676055908, + "learning_rate": 1.9965251677668873e-06, + "loss": 0.1448, + "num_input_tokens_seen": 462656, + "step": 940 + }, + { + "epoch": 0.1247195459944569, + "grad_norm": 130.83981323242188, + "learning_rate": 1.9963306555458e-06, + "loss": 0.2976, + "num_input_tokens_seen": 465344, + "step": 945 + }, + { + "epoch": 0.12537943777220537, + "grad_norm": 55.70017623901367, + "learning_rate": 1.9961308564176723e-06, + "loss": 0.3325, + "num_input_tokens_seen": 467712, + "step": 950 + }, + { + "epoch": 0.1260393295499538, + "grad_norm": 21.9194278717041, + "learning_rate": 1.9959257714427147e-06, + "loss": 0.1471, + "num_input_tokens_seen": 470080, + "step": 955 + }, + { + "epoch": 0.12669922132770225, + "grad_norm": 9.642888069152832, + "learning_rate": 1.995715401709186e-06, + "loss": 0.1476, + "num_input_tokens_seen": 472512, + "step": 960 + }, + { + "epoch": 0.1273591131054507, + "grad_norm": 22.075639724731445, + "learning_rate": 1.995499748333387e-06, + "loss": 0.1639, + "num_input_tokens_seen": 474752, + "step": 965 + }, + { + "epoch": 0.12801900488319914, + "grad_norm": 34.80426025390625, + "learning_rate": 1.9952788124596555e-06, + "loss": 0.1539, + "num_input_tokens_seen": 477440, + "step": 970 + }, + { + "epoch": 0.12867889666094762, + "grad_norm": 16.518495559692383, + "learning_rate": 1.9950525952603617e-06, + "loss": 0.1507, + "num_input_tokens_seen": 480000, + "step": 975 + }, + { + "epoch": 0.12933878843869606, + "grad_norm": 63.373817443847656, + "learning_rate": 1.994821097935899e-06, + "loss": 0.1434, + "num_input_tokens_seen": 482368, + "step": 980 + }, + { + "epoch": 0.1299986802164445, + "grad_norm": 18.813962936401367, + "learning_rate": 1.9945843217146804e-06, + "loss": 0.0706, + "num_input_tokens_seen": 484544, + "step": 985 + }, + { + "epoch": 0.13065857199419295, + "grad_norm": 0.16079047322273254, + "learning_rate": 1.9943422678531293e-06, + "loss": 0.1142, + "num_input_tokens_seen": 486720, + "step": 990 + }, + { + "epoch": 0.1313184637719414, + "grad_norm": 0.1510768085718155, + "learning_rate": 1.994094937635675e-06, + "loss": 0.0692, + "num_input_tokens_seen": 489344, + "step": 995 + }, + { + "epoch": 0.13197835554968984, + "grad_norm": 19.409828186035156, + "learning_rate": 1.9938423323747457e-06, + "loss": 0.0421, + "num_input_tokens_seen": 491776, + "step": 1000 + }, + { + "epoch": 0.1326382473274383, + "grad_norm": 170.2490997314453, + "learning_rate": 1.99358445341076e-06, + "loss": 0.1827, + "num_input_tokens_seen": 493952, + "step": 1005 + }, + { + "epoch": 0.13329813910518676, + "grad_norm": 15.425086975097656, + "learning_rate": 1.993321302112121e-06, + "loss": 0.2152, + "num_input_tokens_seen": 496320, + "step": 1010 + }, + { + "epoch": 0.1339580308829352, + "grad_norm": 117.909423828125, + "learning_rate": 1.993052879875209e-06, + "loss": 0.0299, + "num_input_tokens_seen": 498496, + "step": 1015 + }, + { + "epoch": 0.13461792266068365, + "grad_norm": 47.48206329345703, + "learning_rate": 1.992779188124374e-06, + "loss": 0.1351, + "num_input_tokens_seen": 501056, + "step": 1020 + }, + { + "epoch": 0.1352778144384321, + "grad_norm": 1.8825244903564453, + "learning_rate": 1.992500228311928e-06, + "loss": 0.0501, + "num_input_tokens_seen": 503296, + "step": 1025 + }, + { + "epoch": 0.13593770621618054, + "grad_norm": 12.106839179992676, + "learning_rate": 1.9922160019181372e-06, + "loss": 0.3259, + "num_input_tokens_seen": 505856, + "step": 1030 + }, + { + "epoch": 0.13659759799392898, + "grad_norm": 0.0899241715669632, + "learning_rate": 1.9919265104512138e-06, + "loss": 0.1532, + "num_input_tokens_seen": 508416, + "step": 1035 + }, + { + "epoch": 0.13725748977167745, + "grad_norm": 2.1223573684692383, + "learning_rate": 1.9916317554473094e-06, + "loss": 0.2708, + "num_input_tokens_seen": 511040, + "step": 1040 + }, + { + "epoch": 0.1379173815494259, + "grad_norm": 57.11883544921875, + "learning_rate": 1.9913317384705052e-06, + "loss": 0.188, + "num_input_tokens_seen": 513216, + "step": 1045 + }, + { + "epoch": 0.13857727332717434, + "grad_norm": 12.335477828979492, + "learning_rate": 1.991026461112805e-06, + "loss": 0.1146, + "num_input_tokens_seen": 515456, + "step": 1050 + }, + { + "epoch": 0.1392371651049228, + "grad_norm": 0.4140935242176056, + "learning_rate": 1.9907159249941257e-06, + "loss": 0.1353, + "num_input_tokens_seen": 517824, + "step": 1055 + }, + { + "epoch": 0.13989705688267123, + "grad_norm": 101.3670425415039, + "learning_rate": 1.990400131762289e-06, + "loss": 0.112, + "num_input_tokens_seen": 520320, + "step": 1060 + }, + { + "epoch": 0.14055694866041968, + "grad_norm": 0.5620743632316589, + "learning_rate": 1.9900790830930134e-06, + "loss": 0.0702, + "num_input_tokens_seen": 522752, + "step": 1065 + }, + { + "epoch": 0.14121684043816815, + "grad_norm": 2.9457738399505615, + "learning_rate": 1.9897527806899047e-06, + "loss": 0.1085, + "num_input_tokens_seen": 525376, + "step": 1070 + }, + { + "epoch": 0.1418767322159166, + "grad_norm": 628.5321655273438, + "learning_rate": 1.9894212262844465e-06, + "loss": 0.2922, + "num_input_tokens_seen": 527808, + "step": 1075 + }, + { + "epoch": 0.14253662399366504, + "grad_norm": 0.10223134607076645, + "learning_rate": 1.989084421635992e-06, + "loss": 0.1607, + "num_input_tokens_seen": 530304, + "step": 1080 + }, + { + "epoch": 0.14319651577141349, + "grad_norm": 9.022106170654297, + "learning_rate": 1.988742368531754e-06, + "loss": 0.2576, + "num_input_tokens_seen": 532480, + "step": 1085 + }, + { + "epoch": 0.14385640754916193, + "grad_norm": 3.4002270698547363, + "learning_rate": 1.9883950687867947e-06, + "loss": 0.0676, + "num_input_tokens_seen": 535168, + "step": 1090 + }, + { + "epoch": 0.14451629932691037, + "grad_norm": 19.492107391357422, + "learning_rate": 1.9880425242440187e-06, + "loss": 0.1067, + "num_input_tokens_seen": 537600, + "step": 1095 + }, + { + "epoch": 0.14517619110465885, + "grad_norm": 77.36679077148438, + "learning_rate": 1.9876847367741607e-06, + "loss": 0.1435, + "num_input_tokens_seen": 540096, + "step": 1100 + }, + { + "epoch": 0.1458360828824073, + "grad_norm": 0.1341482549905777, + "learning_rate": 1.987321708275776e-06, + "loss": 0.1568, + "num_input_tokens_seen": 542592, + "step": 1105 + }, + { + "epoch": 0.14649597466015574, + "grad_norm": 0.15566033124923706, + "learning_rate": 1.986953440675231e-06, + "loss": 0.0017, + "num_input_tokens_seen": 544960, + "step": 1110 + }, + { + "epoch": 0.14715586643790418, + "grad_norm": 0.3482903242111206, + "learning_rate": 1.9865799359266925e-06, + "loss": 0.0812, + "num_input_tokens_seen": 547136, + "step": 1115 + }, + { + "epoch": 0.14781575821565263, + "grad_norm": 0.914465606212616, + "learning_rate": 1.986201196012118e-06, + "loss": 0.0878, + "num_input_tokens_seen": 549440, + "step": 1120 + }, + { + "epoch": 0.14847564999340107, + "grad_norm": 57.043827056884766, + "learning_rate": 1.985817222941245e-06, + "loss": 0.2476, + "num_input_tokens_seen": 552064, + "step": 1125 + }, + { + "epoch": 0.14913554177114954, + "grad_norm": 10.63588809967041, + "learning_rate": 1.9854280187515794e-06, + "loss": 0.082, + "num_input_tokens_seen": 554432, + "step": 1130 + }, + { + "epoch": 0.149795433548898, + "grad_norm": 0.7898812890052795, + "learning_rate": 1.985033585508386e-06, + "loss": 0.0745, + "num_input_tokens_seen": 556800, + "step": 1135 + }, + { + "epoch": 0.15005939025999737, + "eval_loss": 0.19488762319087982, + "eval_runtime": 7.5589, + "eval_samples_per_second": 890.998, + "eval_steps_per_second": 111.391, + "num_input_tokens_seen": 557824, + "step": 1137 + }, + { + "epoch": 0.15045532532664643, + "grad_norm": 250.70848083496094, + "learning_rate": 1.9846339253046766e-06, + "loss": 0.5451, + "num_input_tokens_seen": 559296, + "step": 1140 + }, + { + "epoch": 0.15111521710439488, + "grad_norm": 27.038022994995117, + "learning_rate": 1.984229040261199e-06, + "loss": 0.1735, + "num_input_tokens_seen": 562112, + "step": 1145 + }, + { + "epoch": 0.15177510888214332, + "grad_norm": 0.19544407725334167, + "learning_rate": 1.9838189325264263e-06, + "loss": 0.2349, + "num_input_tokens_seen": 564288, + "step": 1150 + }, + { + "epoch": 0.15243500065989177, + "grad_norm": 199.0702667236328, + "learning_rate": 1.983403604276546e-06, + "loss": 0.0845, + "num_input_tokens_seen": 566848, + "step": 1155 + }, + { + "epoch": 0.15309489243764024, + "grad_norm": 44.44175720214844, + "learning_rate": 1.9829830577154457e-06, + "loss": 0.394, + "num_input_tokens_seen": 569152, + "step": 1160 + }, + { + "epoch": 0.15375478421538868, + "grad_norm": 19.46393585205078, + "learning_rate": 1.982557295074705e-06, + "loss": 0.0604, + "num_input_tokens_seen": 571456, + "step": 1165 + }, + { + "epoch": 0.15441467599313713, + "grad_norm": 0.14685490727424622, + "learning_rate": 1.982126318613581e-06, + "loss": 0.1545, + "num_input_tokens_seen": 573824, + "step": 1170 + }, + { + "epoch": 0.15507456777088557, + "grad_norm": 0.41161906719207764, + "learning_rate": 1.9816901306189977e-06, + "loss": 0.0016, + "num_input_tokens_seen": 576128, + "step": 1175 + }, + { + "epoch": 0.15573445954863402, + "grad_norm": 2.800428867340088, + "learning_rate": 1.9812487334055342e-06, + "loss": 0.139, + "num_input_tokens_seen": 578432, + "step": 1180 + }, + { + "epoch": 0.15639435132638246, + "grad_norm": 0.6683080196380615, + "learning_rate": 1.98080212931541e-06, + "loss": 0.1618, + "num_input_tokens_seen": 580736, + "step": 1185 + }, + { + "epoch": 0.15705424310413094, + "grad_norm": 0.13595707714557648, + "learning_rate": 1.980350320718476e-06, + "loss": 0.0846, + "num_input_tokens_seen": 583040, + "step": 1190 + }, + { + "epoch": 0.15771413488187938, + "grad_norm": 0.4379376471042633, + "learning_rate": 1.9798933100121985e-06, + "loss": 0.0073, + "num_input_tokens_seen": 585344, + "step": 1195 + }, + { + "epoch": 0.15837402665962783, + "grad_norm": 4.045234203338623, + "learning_rate": 1.97943109962165e-06, + "loss": 0.0793, + "num_input_tokens_seen": 587904, + "step": 1200 + }, + { + "epoch": 0.15903391843737627, + "grad_norm": 0.751695990562439, + "learning_rate": 1.978963691999493e-06, + "loss": 0.1511, + "num_input_tokens_seen": 590208, + "step": 1205 + }, + { + "epoch": 0.15969381021512472, + "grad_norm": 21.781272888183594, + "learning_rate": 1.978491089625969e-06, + "loss": 0.0853, + "num_input_tokens_seen": 592512, + "step": 1210 + }, + { + "epoch": 0.16035370199287316, + "grad_norm": 0.15117277204990387, + "learning_rate": 1.9780132950088854e-06, + "loss": 0.1785, + "num_input_tokens_seen": 595072, + "step": 1215 + }, + { + "epoch": 0.1610135937706216, + "grad_norm": 23.114465713500977, + "learning_rate": 1.9775303106836e-06, + "loss": 0.2842, + "num_input_tokens_seen": 597632, + "step": 1220 + }, + { + "epoch": 0.16167348554837008, + "grad_norm": 0.19639664888381958, + "learning_rate": 1.977042139213011e-06, + "loss": 0.0847, + "num_input_tokens_seen": 600192, + "step": 1225 + }, + { + "epoch": 0.16233337732611852, + "grad_norm": 0.22633503377437592, + "learning_rate": 1.9765487831875404e-06, + "loss": 0.0931, + "num_input_tokens_seen": 602304, + "step": 1230 + }, + { + "epoch": 0.16299326910386697, + "grad_norm": 0.8158997297286987, + "learning_rate": 1.9760502452251217e-06, + "loss": 0.1418, + "num_input_tokens_seen": 604608, + "step": 1235 + }, + { + "epoch": 0.1636531608816154, + "grad_norm": 9.417763710021973, + "learning_rate": 1.975546527971186e-06, + "loss": 0.1102, + "num_input_tokens_seen": 606976, + "step": 1240 + }, + { + "epoch": 0.16431305265936386, + "grad_norm": 0.3996043801307678, + "learning_rate": 1.9750376340986472e-06, + "loss": 0.0447, + "num_input_tokens_seen": 609600, + "step": 1245 + }, + { + "epoch": 0.1649729444371123, + "grad_norm": 17.319820404052734, + "learning_rate": 1.974523566307889e-06, + "loss": 0.1681, + "num_input_tokens_seen": 611840, + "step": 1250 + }, + { + "epoch": 0.16563283621486077, + "grad_norm": 17.101892471313477, + "learning_rate": 1.9740043273267487e-06, + "loss": 0.1085, + "num_input_tokens_seen": 614528, + "step": 1255 + }, + { + "epoch": 0.16629272799260922, + "grad_norm": 0.14512968063354492, + "learning_rate": 1.973479919910505e-06, + "loss": 0.0217, + "num_input_tokens_seen": 617024, + "step": 1260 + }, + { + "epoch": 0.16695261977035766, + "grad_norm": 26.45575523376465, + "learning_rate": 1.972950346841862e-06, + "loss": 0.1141, + "num_input_tokens_seen": 619392, + "step": 1265 + }, + { + "epoch": 0.1676125115481061, + "grad_norm": 46.64674758911133, + "learning_rate": 1.972415610930934e-06, + "loss": 0.0049, + "num_input_tokens_seen": 621888, + "step": 1270 + }, + { + "epoch": 0.16827240332585455, + "grad_norm": 48.495487213134766, + "learning_rate": 1.9718757150152324e-06, + "loss": 0.2469, + "num_input_tokens_seen": 624192, + "step": 1275 + }, + { + "epoch": 0.168932295103603, + "grad_norm": 0.485227108001709, + "learning_rate": 1.9713306619596488e-06, + "loss": 0.0511, + "num_input_tokens_seen": 626624, + "step": 1280 + }, + { + "epoch": 0.16959218688135147, + "grad_norm": 34.0601692199707, + "learning_rate": 1.9707804546564407e-06, + "loss": 0.0686, + "num_input_tokens_seen": 628928, + "step": 1285 + }, + { + "epoch": 0.17025207865909991, + "grad_norm": 2.3748066425323486, + "learning_rate": 1.9702250960252164e-06, + "loss": 0.0234, + "num_input_tokens_seen": 631616, + "step": 1290 + }, + { + "epoch": 0.17091197043684836, + "grad_norm": 0.013682112097740173, + "learning_rate": 1.969664589012918e-06, + "loss": 0.0015, + "num_input_tokens_seen": 634112, + "step": 1295 + }, + { + "epoch": 0.1715718622145968, + "grad_norm": 314.3330078125, + "learning_rate": 1.9690989365938077e-06, + "loss": 0.3855, + "num_input_tokens_seen": 636416, + "step": 1300 + }, + { + "epoch": 0.17223175399234525, + "grad_norm": 0.05270430073142052, + "learning_rate": 1.9685281417694513e-06, + "loss": 0.0051, + "num_input_tokens_seen": 638848, + "step": 1305 + }, + { + "epoch": 0.1728916457700937, + "grad_norm": 0.23324760794639587, + "learning_rate": 1.967952207568702e-06, + "loss": 0.1125, + "num_input_tokens_seen": 641216, + "step": 1310 + }, + { + "epoch": 0.17355153754784217, + "grad_norm": 0.26865366101264954, + "learning_rate": 1.967371137047685e-06, + "loss": 0.0011, + "num_input_tokens_seen": 644032, + "step": 1315 + }, + { + "epoch": 0.1742114293255906, + "grad_norm": 0.24145404994487762, + "learning_rate": 1.966784933289778e-06, + "loss": 0.1494, + "num_input_tokens_seen": 646528, + "step": 1320 + }, + { + "epoch": 0.17487132110333906, + "grad_norm": 0.08738990128040314, + "learning_rate": 1.9661935994056014e-06, + "loss": 0.1951, + "num_input_tokens_seen": 649088, + "step": 1325 + }, + { + "epoch": 0.1755312128810875, + "grad_norm": 0.9014714956283569, + "learning_rate": 1.965597138532996e-06, + "loss": 0.0093, + "num_input_tokens_seen": 651520, + "step": 1330 + }, + { + "epoch": 0.17619110465883595, + "grad_norm": 0.6617699265480042, + "learning_rate": 1.964995553837009e-06, + "loss": 0.0409, + "num_input_tokens_seen": 654016, + "step": 1335 + }, + { + "epoch": 0.1768509964365844, + "grad_norm": 0.015406480059027672, + "learning_rate": 1.964388848509875e-06, + "loss": 0.1143, + "num_input_tokens_seen": 656320, + "step": 1340 + }, + { + "epoch": 0.17751088821433286, + "grad_norm": 0.025758925825357437, + "learning_rate": 1.9637770257710026e-06, + "loss": 0.1683, + "num_input_tokens_seen": 658880, + "step": 1345 + }, + { + "epoch": 0.1781707799920813, + "grad_norm": 0.08237680792808533, + "learning_rate": 1.9631600888669545e-06, + "loss": 0.0205, + "num_input_tokens_seen": 661184, + "step": 1350 + }, + { + "epoch": 0.17883067176982975, + "grad_norm": 0.07278398424386978, + "learning_rate": 1.962538041071431e-06, + "loss": 0.0664, + "num_input_tokens_seen": 663680, + "step": 1355 + }, + { + "epoch": 0.1794905635475782, + "grad_norm": 25.769346237182617, + "learning_rate": 1.961910885685253e-06, + "loss": 0.0688, + "num_input_tokens_seen": 666048, + "step": 1360 + }, + { + "epoch": 0.18015045532532664, + "grad_norm": 164.78553771972656, + "learning_rate": 1.9612786260363436e-06, + "loss": 0.2636, + "num_input_tokens_seen": 668480, + "step": 1365 + }, + { + "epoch": 0.1808103471030751, + "grad_norm": 0.17773790657520294, + "learning_rate": 1.9606412654797116e-06, + "loss": 0.1108, + "num_input_tokens_seen": 671488, + "step": 1370 + }, + { + "epoch": 0.18147023888082353, + "grad_norm": 66.89860534667969, + "learning_rate": 1.9599988073974332e-06, + "loss": 0.1088, + "num_input_tokens_seen": 673920, + "step": 1375 + }, + { + "epoch": 0.182130130658572, + "grad_norm": 187.47903442382812, + "learning_rate": 1.959351255198634e-06, + "loss": 0.1413, + "num_input_tokens_seen": 676416, + "step": 1380 + }, + { + "epoch": 0.18279002243632045, + "grad_norm": 2.0588765144348145, + "learning_rate": 1.9586986123194704e-06, + "loss": 0.0008, + "num_input_tokens_seen": 679040, + "step": 1385 + }, + { + "epoch": 0.1834499142140689, + "grad_norm": 0.09783805161714554, + "learning_rate": 1.958040882223112e-06, + "loss": 0.1041, + "num_input_tokens_seen": 681920, + "step": 1390 + }, + { + "epoch": 0.18410980599181734, + "grad_norm": 0.06469712406396866, + "learning_rate": 1.9573780683997235e-06, + "loss": 0.04, + "num_input_tokens_seen": 684416, + "step": 1395 + }, + { + "epoch": 0.18476969776956578, + "grad_norm": 222.29971313476562, + "learning_rate": 1.956710174366445e-06, + "loss": 0.3574, + "num_input_tokens_seen": 686976, + "step": 1400 + }, + { + "epoch": 0.18542958954731423, + "grad_norm": 0.0895252674818039, + "learning_rate": 1.9560372036673764e-06, + "loss": 0.2731, + "num_input_tokens_seen": 689408, + "step": 1405 + }, + { + "epoch": 0.1860894813250627, + "grad_norm": 0.06232970580458641, + "learning_rate": 1.955359159873553e-06, + "loss": 0.0238, + "num_input_tokens_seen": 691712, + "step": 1410 + }, + { + "epoch": 0.18674937310281114, + "grad_norm": 0.0344870463013649, + "learning_rate": 1.954676046582932e-06, + "loss": 0.1341, + "num_input_tokens_seen": 694080, + "step": 1415 + }, + { + "epoch": 0.1874092648805596, + "grad_norm": 38.71489334106445, + "learning_rate": 1.9539878674203706e-06, + "loss": 0.2135, + "num_input_tokens_seen": 696640, + "step": 1420 + }, + { + "epoch": 0.18806915665830803, + "grad_norm": 0.06116657704114914, + "learning_rate": 1.9532946260376076e-06, + "loss": 0.0011, + "num_input_tokens_seen": 699136, + "step": 1425 + }, + { + "epoch": 0.18872904843605648, + "grad_norm": 53.41019821166992, + "learning_rate": 1.952596326113244e-06, + "loss": 0.3109, + "num_input_tokens_seen": 701696, + "step": 1430 + }, + { + "epoch": 0.18938894021380492, + "grad_norm": 1.2848087549209595, + "learning_rate": 1.9518929713527226e-06, + "loss": 0.1812, + "num_input_tokens_seen": 704384, + "step": 1435 + }, + { + "epoch": 0.1900488319915534, + "grad_norm": 2.0177323818206787, + "learning_rate": 1.9511845654883097e-06, + "loss": 0.0066, + "num_input_tokens_seen": 706560, + "step": 1440 + }, + { + "epoch": 0.19070872376930184, + "grad_norm": 0.4617765247821808, + "learning_rate": 1.9504711122790754e-06, + "loss": 0.0755, + "num_input_tokens_seen": 709248, + "step": 1445 + }, + { + "epoch": 0.19136861554705029, + "grad_norm": 45.93152618408203, + "learning_rate": 1.949752615510871e-06, + "loss": 0.2258, + "num_input_tokens_seen": 711296, + "step": 1450 + }, + { + "epoch": 0.19202850732479873, + "grad_norm": 0.20753158628940582, + "learning_rate": 1.949029078996313e-06, + "loss": 0.0457, + "num_input_tokens_seen": 713728, + "step": 1455 + }, + { + "epoch": 0.19268839910254718, + "grad_norm": 13.657062530517578, + "learning_rate": 1.9483005065747584e-06, + "loss": 0.1224, + "num_input_tokens_seen": 716224, + "step": 1460 + }, + { + "epoch": 0.19334829088029562, + "grad_norm": 13.369616508483887, + "learning_rate": 1.947566902112289e-06, + "loss": 0.3816, + "num_input_tokens_seen": 718528, + "step": 1465 + }, + { + "epoch": 0.1940081826580441, + "grad_norm": 60.77271270751953, + "learning_rate": 1.9468282695016863e-06, + "loss": 0.1841, + "num_input_tokens_seen": 720960, + "step": 1470 + }, + { + "epoch": 0.19466807443579254, + "grad_norm": 1.3714667558670044, + "learning_rate": 1.946084612662415e-06, + "loss": 0.1318, + "num_input_tokens_seen": 723200, + "step": 1475 + }, + { + "epoch": 0.19532796621354098, + "grad_norm": 114.1025619506836, + "learning_rate": 1.9453359355405987e-06, + "loss": 0.1708, + "num_input_tokens_seen": 725888, + "step": 1480 + }, + { + "epoch": 0.19598785799128943, + "grad_norm": 0.23408390581607819, + "learning_rate": 1.944582242109002e-06, + "loss": 0.0194, + "num_input_tokens_seen": 728256, + "step": 1485 + }, + { + "epoch": 0.19664774976903787, + "grad_norm": 0.22887404263019562, + "learning_rate": 1.943823536367006e-06, + "loss": 0.1454, + "num_input_tokens_seen": 730688, + "step": 1490 + }, + { + "epoch": 0.19730764154678632, + "grad_norm": 2.37292742729187, + "learning_rate": 1.9430598223405913e-06, + "loss": 0.1624, + "num_input_tokens_seen": 732992, + "step": 1495 + }, + { + "epoch": 0.1979675333245348, + "grad_norm": 0.2745613157749176, + "learning_rate": 1.9422911040823125e-06, + "loss": 0.1476, + "num_input_tokens_seen": 735424, + "step": 1500 + }, + { + "epoch": 0.19862742510228323, + "grad_norm": 132.48385620117188, + "learning_rate": 1.941517385671279e-06, + "loss": 0.3263, + "num_input_tokens_seen": 737664, + "step": 1505 + }, + { + "epoch": 0.19928731688003168, + "grad_norm": 0.1534176468849182, + "learning_rate": 1.940738671213134e-06, + "loss": 0.0942, + "num_input_tokens_seen": 740096, + "step": 1510 + }, + { + "epoch": 0.19994720865778012, + "grad_norm": 46.99830627441406, + "learning_rate": 1.93995496484003e-06, + "loss": 0.1712, + "num_input_tokens_seen": 742912, + "step": 1515 + }, + { + "epoch": 0.2000791870133298, + "eval_loss": 0.1068890318274498, + "eval_runtime": 7.6888, + "eval_samples_per_second": 875.951, + "eval_steps_per_second": 109.51, + "num_input_tokens_seen": 743424, + "step": 1516 + }, + { + "epoch": 0.20060710043552857, + "grad_norm": 160.11495971679688, + "learning_rate": 1.9391662707106092e-06, + "loss": 0.1021, + "num_input_tokens_seen": 745536, + "step": 1520 + }, + { + "epoch": 0.201266992213277, + "grad_norm": 0.16469451785087585, + "learning_rate": 1.9383725930099814e-06, + "loss": 0.0031, + "num_input_tokens_seen": 747968, + "step": 1525 + }, + { + "epoch": 0.20192688399102549, + "grad_norm": 0.772555947303772, + "learning_rate": 1.9375739359497e-06, + "loss": 0.1222, + "num_input_tokens_seen": 750464, + "step": 1530 + }, + { + "epoch": 0.20258677576877393, + "grad_norm": 0.41962626576423645, + "learning_rate": 1.936770303767741e-06, + "loss": 0.2416, + "num_input_tokens_seen": 752896, + "step": 1535 + }, + { + "epoch": 0.20324666754652237, + "grad_norm": 11.837217330932617, + "learning_rate": 1.9359617007284815e-06, + "loss": 0.1974, + "num_input_tokens_seen": 755648, + "step": 1540 + }, + { + "epoch": 0.20390655932427082, + "grad_norm": 9.827956199645996, + "learning_rate": 1.9351481311226738e-06, + "loss": 0.2312, + "num_input_tokens_seen": 758144, + "step": 1545 + }, + { + "epoch": 0.20456645110201926, + "grad_norm": 0.8918312788009644, + "learning_rate": 1.934329599267426e-06, + "loss": 0.1313, + "num_input_tokens_seen": 760704, + "step": 1550 + }, + { + "epoch": 0.2052263428797677, + "grad_norm": 43.78156280517578, + "learning_rate": 1.933506109506178e-06, + "loss": 0.0468, + "num_input_tokens_seen": 763136, + "step": 1555 + }, + { + "epoch": 0.20588623465751615, + "grad_norm": 1.698026418685913, + "learning_rate": 1.9326776662086765e-06, + "loss": 0.1132, + "num_input_tokens_seen": 766016, + "step": 1560 + }, + { + "epoch": 0.20654612643526463, + "grad_norm": 59.669952392578125, + "learning_rate": 1.9318442737709565e-06, + "loss": 0.3367, + "num_input_tokens_seen": 768512, + "step": 1565 + }, + { + "epoch": 0.20720601821301307, + "grad_norm": 0.267106831073761, + "learning_rate": 1.9310059366153116e-06, + "loss": 0.2047, + "num_input_tokens_seen": 770816, + "step": 1570 + }, + { + "epoch": 0.20786590999076152, + "grad_norm": 0.7591071724891663, + "learning_rate": 1.930162659190277e-06, + "loss": 0.2302, + "num_input_tokens_seen": 773312, + "step": 1575 + }, + { + "epoch": 0.20852580176850996, + "grad_norm": 1.2925443649291992, + "learning_rate": 1.9293144459706007e-06, + "loss": 0.0029, + "num_input_tokens_seen": 775680, + "step": 1580 + }, + { + "epoch": 0.2091856935462584, + "grad_norm": 17.853742599487305, + "learning_rate": 1.928461301457223e-06, + "loss": 0.1877, + "num_input_tokens_seen": 778048, + "step": 1585 + }, + { + "epoch": 0.20984558532400685, + "grad_norm": 0.08952134847640991, + "learning_rate": 1.92760323017725e-06, + "loss": 0.3027, + "num_input_tokens_seen": 780672, + "step": 1590 + }, + { + "epoch": 0.21050547710175532, + "grad_norm": 0.1787254512310028, + "learning_rate": 1.9267402366839338e-06, + "loss": 0.216, + "num_input_tokens_seen": 783360, + "step": 1595 + }, + { + "epoch": 0.21116536887950377, + "grad_norm": 9.013484954833984, + "learning_rate": 1.9258723255566433e-06, + "loss": 0.1268, + "num_input_tokens_seen": 785856, + "step": 1600 + }, + { + "epoch": 0.2118252606572522, + "grad_norm": 1.6822223663330078, + "learning_rate": 1.924999501400843e-06, + "loss": 0.1832, + "num_input_tokens_seen": 788480, + "step": 1605 + }, + { + "epoch": 0.21248515243500066, + "grad_norm": 0.5006535649299622, + "learning_rate": 1.924121768848068e-06, + "loss": 0.1511, + "num_input_tokens_seen": 791040, + "step": 1610 + }, + { + "epoch": 0.2131450442127491, + "grad_norm": 0.24185070395469666, + "learning_rate": 1.923239132555899e-06, + "loss": 0.1088, + "num_input_tokens_seen": 793600, + "step": 1615 + }, + { + "epoch": 0.21380493599049755, + "grad_norm": 1.2802025079727173, + "learning_rate": 1.9223515972079378e-06, + "loss": 0.1302, + "num_input_tokens_seen": 795968, + "step": 1620 + }, + { + "epoch": 0.21446482776824602, + "grad_norm": 6.617660999298096, + "learning_rate": 1.9214591675137813e-06, + "loss": 0.049, + "num_input_tokens_seen": 798272, + "step": 1625 + }, + { + "epoch": 0.21512471954599446, + "grad_norm": 111.40785217285156, + "learning_rate": 1.9205618482090003e-06, + "loss": 0.144, + "num_input_tokens_seen": 801024, + "step": 1630 + }, + { + "epoch": 0.2157846113237429, + "grad_norm": 22.670175552368164, + "learning_rate": 1.91965964405511e-06, + "loss": 0.1374, + "num_input_tokens_seen": 803584, + "step": 1635 + }, + { + "epoch": 0.21644450310149135, + "grad_norm": 56.14551544189453, + "learning_rate": 1.9187525598395457e-06, + "loss": 0.0117, + "num_input_tokens_seen": 805952, + "step": 1640 + }, + { + "epoch": 0.2171043948792398, + "grad_norm": 35.73996353149414, + "learning_rate": 1.9178406003756396e-06, + "loss": 0.1249, + "num_input_tokens_seen": 808512, + "step": 1645 + }, + { + "epoch": 0.21776428665698824, + "grad_norm": 68.0622787475586, + "learning_rate": 1.9169237705025936e-06, + "loss": 0.0819, + "num_input_tokens_seen": 811136, + "step": 1650 + }, + { + "epoch": 0.21842417843473672, + "grad_norm": 0.02525966428220272, + "learning_rate": 1.9160020750854533e-06, + "loss": 0.0183, + "num_input_tokens_seen": 813376, + "step": 1655 + }, + { + "epoch": 0.21908407021248516, + "grad_norm": 180.0795135498047, + "learning_rate": 1.915075519015083e-06, + "loss": 0.199, + "num_input_tokens_seen": 815872, + "step": 1660 + }, + { + "epoch": 0.2197439619902336, + "grad_norm": 1.1811161041259766, + "learning_rate": 1.914144107208139e-06, + "loss": 0.0725, + "num_input_tokens_seen": 818240, + "step": 1665 + }, + { + "epoch": 0.22040385376798205, + "grad_norm": 0.16843393445014954, + "learning_rate": 1.913207844607045e-06, + "loss": 0.0539, + "num_input_tokens_seen": 820736, + "step": 1670 + }, + { + "epoch": 0.2210637455457305, + "grad_norm": 21.720033645629883, + "learning_rate": 1.912266736179964e-06, + "loss": 0.2528, + "num_input_tokens_seen": 823616, + "step": 1675 + }, + { + "epoch": 0.22172363732347894, + "grad_norm": 8.987836837768555, + "learning_rate": 1.9113207869207727e-06, + "loss": 0.1707, + "num_input_tokens_seen": 826112, + "step": 1680 + }, + { + "epoch": 0.2223835291012274, + "grad_norm": 0.8188716769218445, + "learning_rate": 1.9103700018490365e-06, + "loss": 0.1356, + "num_input_tokens_seen": 828672, + "step": 1685 + }, + { + "epoch": 0.22304342087897586, + "grad_norm": 1.7912400960922241, + "learning_rate": 1.9094143860099787e-06, + "loss": 0.1711, + "num_input_tokens_seen": 831296, + "step": 1690 + }, + { + "epoch": 0.2237033126567243, + "grad_norm": 108.46529388427734, + "learning_rate": 1.9084539444744594e-06, + "loss": 0.0895, + "num_input_tokens_seen": 833856, + "step": 1695 + }, + { + "epoch": 0.22436320443447275, + "grad_norm": 0.1527111977338791, + "learning_rate": 1.907488682338944e-06, + "loss": 0.1324, + "num_input_tokens_seen": 836480, + "step": 1700 + }, + { + "epoch": 0.2250230962122212, + "grad_norm": 63.81155776977539, + "learning_rate": 1.9065186047254782e-06, + "loss": 0.0553, + "num_input_tokens_seen": 838976, + "step": 1705 + }, + { + "epoch": 0.22568298798996964, + "grad_norm": 0.6900471448898315, + "learning_rate": 1.9055437167816604e-06, + "loss": 0.2205, + "num_input_tokens_seen": 841728, + "step": 1710 + }, + { + "epoch": 0.22634287976771808, + "grad_norm": 0.05360851809382439, + "learning_rate": 1.9045640236806149e-06, + "loss": 0.0143, + "num_input_tokens_seen": 843968, + "step": 1715 + }, + { + "epoch": 0.22700277154546655, + "grad_norm": 0.35860204696655273, + "learning_rate": 1.903579530620963e-06, + "loss": 0.3401, + "num_input_tokens_seen": 846464, + "step": 1720 + }, + { + "epoch": 0.227662663323215, + "grad_norm": 0.1349165141582489, + "learning_rate": 1.9025902428267975e-06, + "loss": 0.0967, + "num_input_tokens_seen": 849088, + "step": 1725 + }, + { + "epoch": 0.22832255510096344, + "grad_norm": 0.09187756478786469, + "learning_rate": 1.901596165547653e-06, + "loss": 0.2082, + "num_input_tokens_seen": 851712, + "step": 1730 + }, + { + "epoch": 0.2289824468787119, + "grad_norm": 0.5252279043197632, + "learning_rate": 1.9005973040584796e-06, + "loss": 0.102, + "num_input_tokens_seen": 854208, + "step": 1735 + }, + { + "epoch": 0.22964233865646033, + "grad_norm": 0.3394613564014435, + "learning_rate": 1.8995936636596138e-06, + "loss": 0.088, + "num_input_tokens_seen": 856576, + "step": 1740 + }, + { + "epoch": 0.23030223043420878, + "grad_norm": 0.196676105260849, + "learning_rate": 1.8985852496767504e-06, + "loss": 0.1348, + "num_input_tokens_seen": 859008, + "step": 1745 + }, + { + "epoch": 0.23096212221195725, + "grad_norm": 83.27044677734375, + "learning_rate": 1.897572067460916e-06, + "loss": 0.1643, + "num_input_tokens_seen": 861440, + "step": 1750 + }, + { + "epoch": 0.2316220139897057, + "grad_norm": 1.6085481643676758, + "learning_rate": 1.8965541223884377e-06, + "loss": 0.0848, + "num_input_tokens_seen": 863936, + "step": 1755 + }, + { + "epoch": 0.23228190576745414, + "grad_norm": 23.993480682373047, + "learning_rate": 1.8955314198609171e-06, + "loss": 0.1238, + "num_input_tokens_seen": 866176, + "step": 1760 + }, + { + "epoch": 0.23294179754520258, + "grad_norm": 0.1617557555437088, + "learning_rate": 1.8945039653052005e-06, + "loss": 0.0977, + "num_input_tokens_seen": 868480, + "step": 1765 + }, + { + "epoch": 0.23360168932295103, + "grad_norm": 0.15750542283058167, + "learning_rate": 1.8934717641733498e-06, + "loss": 0.0877, + "num_input_tokens_seen": 870976, + "step": 1770 + }, + { + "epoch": 0.23426158110069947, + "grad_norm": 38.80494689941406, + "learning_rate": 1.8924348219426143e-06, + "loss": 0.2471, + "num_input_tokens_seen": 873088, + "step": 1775 + }, + { + "epoch": 0.23492147287844795, + "grad_norm": 11.684532165527344, + "learning_rate": 1.8913931441154016e-06, + "loss": 0.2694, + "num_input_tokens_seen": 875520, + "step": 1780 + }, + { + "epoch": 0.2355813646561964, + "grad_norm": 32.83953857421875, + "learning_rate": 1.8903467362192482e-06, + "loss": 0.0401, + "num_input_tokens_seen": 877632, + "step": 1785 + }, + { + "epoch": 0.23624125643394484, + "grad_norm": 97.62303161621094, + "learning_rate": 1.8892956038067895e-06, + "loss": 0.0696, + "num_input_tokens_seen": 880000, + "step": 1790 + }, + { + "epoch": 0.23690114821169328, + "grad_norm": 97.3688735961914, + "learning_rate": 1.8882397524557317e-06, + "loss": 0.0238, + "num_input_tokens_seen": 882176, + "step": 1795 + }, + { + "epoch": 0.23756103998944172, + "grad_norm": 0.09657946974039078, + "learning_rate": 1.8871791877688208e-06, + "loss": 0.0642, + "num_input_tokens_seen": 884800, + "step": 1800 + }, + { + "epoch": 0.23822093176719017, + "grad_norm": 0.5457859635353088, + "learning_rate": 1.8861139153738143e-06, + "loss": 0.0068, + "num_input_tokens_seen": 887104, + "step": 1805 + }, + { + "epoch": 0.23888082354493864, + "grad_norm": 2.4806833267211914, + "learning_rate": 1.8850439409234498e-06, + "loss": 0.0012, + "num_input_tokens_seen": 889408, + "step": 1810 + }, + { + "epoch": 0.2395407153226871, + "grad_norm": 156.25328063964844, + "learning_rate": 1.8839692700954161e-06, + "loss": 0.1943, + "num_input_tokens_seen": 891648, + "step": 1815 + }, + { + "epoch": 0.24020060710043553, + "grad_norm": 184.66175842285156, + "learning_rate": 1.8828899085923234e-06, + "loss": 0.3211, + "num_input_tokens_seen": 894208, + "step": 1820 + }, + { + "epoch": 0.24086049887818398, + "grad_norm": 0.037798941135406494, + "learning_rate": 1.881805862141671e-06, + "loss": 0.2085, + "num_input_tokens_seen": 896704, + "step": 1825 + }, + { + "epoch": 0.24152039065593242, + "grad_norm": 0.04176515340805054, + "learning_rate": 1.8807171364958196e-06, + "loss": 0.082, + "num_input_tokens_seen": 899264, + "step": 1830 + }, + { + "epoch": 0.24218028243368087, + "grad_norm": 0.2008335441350937, + "learning_rate": 1.879623737431959e-06, + "loss": 0.0206, + "num_input_tokens_seen": 901760, + "step": 1835 + }, + { + "epoch": 0.24284017421142934, + "grad_norm": 62.211387634277344, + "learning_rate": 1.8785256707520778e-06, + "loss": 0.3077, + "num_input_tokens_seen": 903872, + "step": 1840 + }, + { + "epoch": 0.24350006598917778, + "grad_norm": 0.0855240523815155, + "learning_rate": 1.8774229422829325e-06, + "loss": 0.0012, + "num_input_tokens_seen": 906368, + "step": 1845 + }, + { + "epoch": 0.24415995776692623, + "grad_norm": 0.062163472175598145, + "learning_rate": 1.8763155578760181e-06, + "loss": 0.0491, + "num_input_tokens_seen": 908864, + "step": 1850 + }, + { + "epoch": 0.24481984954467467, + "grad_norm": 0.138889878988266, + "learning_rate": 1.8752035234075336e-06, + "loss": 0.0892, + "num_input_tokens_seen": 911040, + "step": 1855 + }, + { + "epoch": 0.24547974132242312, + "grad_norm": 21.301368713378906, + "learning_rate": 1.8740868447783554e-06, + "loss": 0.1932, + "num_input_tokens_seen": 913408, + "step": 1860 + }, + { + "epoch": 0.24613963310017156, + "grad_norm": 77.18330383300781, + "learning_rate": 1.8729655279140012e-06, + "loss": 0.2285, + "num_input_tokens_seen": 915968, + "step": 1865 + }, + { + "epoch": 0.24679952487792003, + "grad_norm": 28.638566970825195, + "learning_rate": 1.8718395787646029e-06, + "loss": 0.1745, + "num_input_tokens_seen": 918528, + "step": 1870 + }, + { + "epoch": 0.24745941665566848, + "grad_norm": 0.12477682530879974, + "learning_rate": 1.870709003304872e-06, + "loss": 0.0009, + "num_input_tokens_seen": 921152, + "step": 1875 + }, + { + "epoch": 0.24811930843341692, + "grad_norm": 0.03698311001062393, + "learning_rate": 1.8695738075340693e-06, + "loss": 0.0005, + "num_input_tokens_seen": 923520, + "step": 1880 + }, + { + "epoch": 0.24877920021116537, + "grad_norm": 0.9180229902267456, + "learning_rate": 1.8684339974759723e-06, + "loss": 0.1696, + "num_input_tokens_seen": 925888, + "step": 1885 + }, + { + "epoch": 0.2494390919889138, + "grad_norm": 14.323315620422363, + "learning_rate": 1.8672895791788445e-06, + "loss": 0.0881, + "num_input_tokens_seen": 928704, + "step": 1890 + }, + { + "epoch": 0.2500989837666623, + "grad_norm": 118.73922729492188, + "learning_rate": 1.8661405587154017e-06, + "loss": 0.2865, + "num_input_tokens_seen": 930944, + "step": 1895 + }, + { + "epoch": 0.2500989837666623, + "eval_loss": 0.12773367762565613, + "eval_runtime": 7.6378, + "eval_samples_per_second": 881.797, + "eval_steps_per_second": 110.241, + "num_input_tokens_seen": 930944, + "step": 1895 + }, + { + "epoch": 0.25075887554441073, + "grad_norm": 12.755705833435059, + "learning_rate": 1.8649869421827808e-06, + "loss": 0.2389, + "num_input_tokens_seen": 933376, + "step": 1900 + }, + { + "epoch": 0.2514187673221592, + "grad_norm": 0.946739137172699, + "learning_rate": 1.863828735702507e-06, + "loss": 0.0517, + "num_input_tokens_seen": 936000, + "step": 1905 + }, + { + "epoch": 0.2520786590999076, + "grad_norm": 19.37730598449707, + "learning_rate": 1.862665945420462e-06, + "loss": 0.0611, + "num_input_tokens_seen": 938432, + "step": 1910 + }, + { + "epoch": 0.25273855087765607, + "grad_norm": 4.929696559906006, + "learning_rate": 1.8614985775068498e-06, + "loss": 0.0838, + "num_input_tokens_seen": 941312, + "step": 1915 + }, + { + "epoch": 0.2533984426554045, + "grad_norm": 59.3293342590332, + "learning_rate": 1.860326638156167e-06, + "loss": 0.0099, + "num_input_tokens_seen": 943488, + "step": 1920 + }, + { + "epoch": 0.25405833443315295, + "grad_norm": 32.63521194458008, + "learning_rate": 1.8591501335871653e-06, + "loss": 0.1064, + "num_input_tokens_seen": 945856, + "step": 1925 + }, + { + "epoch": 0.2547182262109014, + "grad_norm": 163.6297149658203, + "learning_rate": 1.857969070042824e-06, + "loss": 0.2861, + "num_input_tokens_seen": 948352, + "step": 1930 + }, + { + "epoch": 0.25537811798864984, + "grad_norm": 0.6843308210372925, + "learning_rate": 1.8567834537903116e-06, + "loss": 0.0541, + "num_input_tokens_seen": 950976, + "step": 1935 + }, + { + "epoch": 0.2560380097663983, + "grad_norm": 15.151936531066895, + "learning_rate": 1.8555932911209565e-06, + "loss": 0.1499, + "num_input_tokens_seen": 953216, + "step": 1940 + }, + { + "epoch": 0.25669790154414673, + "grad_norm": 1.8535500764846802, + "learning_rate": 1.8543985883502119e-06, + "loss": 0.0338, + "num_input_tokens_seen": 955648, + "step": 1945 + }, + { + "epoch": 0.25735779332189523, + "grad_norm": 0.5665189623832703, + "learning_rate": 1.8531993518176216e-06, + "loss": 0.0462, + "num_input_tokens_seen": 957888, + "step": 1950 + }, + { + "epoch": 0.2580176850996437, + "grad_norm": 91.90030670166016, + "learning_rate": 1.8519955878867889e-06, + "loss": 0.1275, + "num_input_tokens_seen": 960128, + "step": 1955 + }, + { + "epoch": 0.2586775768773921, + "grad_norm": 0.0542287677526474, + "learning_rate": 1.8507873029453392e-06, + "loss": 0.1778, + "num_input_tokens_seen": 962496, + "step": 1960 + }, + { + "epoch": 0.25933746865514057, + "grad_norm": 9.215625762939453, + "learning_rate": 1.8495745034048896e-06, + "loss": 0.2342, + "num_input_tokens_seen": 965120, + "step": 1965 + }, + { + "epoch": 0.259997360432889, + "grad_norm": 0.16024070978164673, + "learning_rate": 1.8483571957010127e-06, + "loss": 0.0074, + "num_input_tokens_seen": 967616, + "step": 1970 + }, + { + "epoch": 0.26065725221063746, + "grad_norm": 0.10168848931789398, + "learning_rate": 1.8471353862932035e-06, + "loss": 0.0688, + "num_input_tokens_seen": 970240, + "step": 1975 + }, + { + "epoch": 0.2613171439883859, + "grad_norm": 71.9769515991211, + "learning_rate": 1.8459090816648444e-06, + "loss": 0.1752, + "num_input_tokens_seen": 972544, + "step": 1980 + }, + { + "epoch": 0.26197703576613435, + "grad_norm": 0.1407454013824463, + "learning_rate": 1.8446782883231713e-06, + "loss": 0.2913, + "num_input_tokens_seen": 974912, + "step": 1985 + }, + { + "epoch": 0.2626369275438828, + "grad_norm": 17.948266983032227, + "learning_rate": 1.8434430127992387e-06, + "loss": 0.3162, + "num_input_tokens_seen": 977088, + "step": 1990 + }, + { + "epoch": 0.26329681932163124, + "grad_norm": 32.353912353515625, + "learning_rate": 1.8422032616478857e-06, + "loss": 0.1709, + "num_input_tokens_seen": 979648, + "step": 1995 + }, + { + "epoch": 0.2639567110993797, + "grad_norm": 1.2442036867141724, + "learning_rate": 1.8409590414477001e-06, + "loss": 0.1159, + "num_input_tokens_seen": 982336, + "step": 2000 + }, + { + "epoch": 0.2646166028771281, + "grad_norm": 3.403188705444336, + "learning_rate": 1.839710358800985e-06, + "loss": 0.0056, + "num_input_tokens_seen": 984768, + "step": 2005 + }, + { + "epoch": 0.2652764946548766, + "grad_norm": 5.241207599639893, + "learning_rate": 1.8384572203337224e-06, + "loss": 0.0349, + "num_input_tokens_seen": 987136, + "step": 2010 + }, + { + "epoch": 0.26593638643262507, + "grad_norm": 0.26890337467193604, + "learning_rate": 1.837199632695538e-06, + "loss": 0.1309, + "num_input_tokens_seen": 989824, + "step": 2015 + }, + { + "epoch": 0.2665962782103735, + "grad_norm": 53.87063217163086, + "learning_rate": 1.8359376025596682e-06, + "loss": 0.3374, + "num_input_tokens_seen": 992064, + "step": 2020 + }, + { + "epoch": 0.26725616998812196, + "grad_norm": 17.814453125, + "learning_rate": 1.8346711366229215e-06, + "loss": 0.1366, + "num_input_tokens_seen": 994368, + "step": 2025 + }, + { + "epoch": 0.2679160617658704, + "grad_norm": 18.101577758789062, + "learning_rate": 1.8334002416056442e-06, + "loss": 0.215, + "num_input_tokens_seen": 996864, + "step": 2030 + }, + { + "epoch": 0.26857595354361885, + "grad_norm": 0.25549983978271484, + "learning_rate": 1.8321249242516865e-06, + "loss": 0.2084, + "num_input_tokens_seen": 999360, + "step": 2035 + }, + { + "epoch": 0.2692358453213673, + "grad_norm": 0.35009151697158813, + "learning_rate": 1.8308451913283638e-06, + "loss": 0.0868, + "num_input_tokens_seen": 1001920, + "step": 2040 + }, + { + "epoch": 0.26989573709911574, + "grad_norm": 0.3472491502761841, + "learning_rate": 1.8295610496264229e-06, + "loss": 0.0602, + "num_input_tokens_seen": 1004224, + "step": 2045 + }, + { + "epoch": 0.2705556288768642, + "grad_norm": 0.34922727942466736, + "learning_rate": 1.828272505960005e-06, + "loss": 0.0027, + "num_input_tokens_seen": 1006528, + "step": 2050 + }, + { + "epoch": 0.27121552065461263, + "grad_norm": 0.13754220306873322, + "learning_rate": 1.8269795671666098e-06, + "loss": 0.1856, + "num_input_tokens_seen": 1008896, + "step": 2055 + }, + { + "epoch": 0.2718754124323611, + "grad_norm": 2.371704339981079, + "learning_rate": 1.8256822401070591e-06, + "loss": 0.1347, + "num_input_tokens_seen": 1011648, + "step": 2060 + }, + { + "epoch": 0.2725353042101095, + "grad_norm": 125.92493438720703, + "learning_rate": 1.8243805316654611e-06, + "loss": 0.0254, + "num_input_tokens_seen": 1014208, + "step": 2065 + }, + { + "epoch": 0.27319519598785796, + "grad_norm": 6.873655796051025, + "learning_rate": 1.823074448749172e-06, + "loss": 0.2187, + "num_input_tokens_seen": 1016640, + "step": 2070 + }, + { + "epoch": 0.27385508776560646, + "grad_norm": 0.0956018716096878, + "learning_rate": 1.8217639982887623e-06, + "loss": 0.0403, + "num_input_tokens_seen": 1019328, + "step": 2075 + }, + { + "epoch": 0.2745149795433549, + "grad_norm": 0.05648243427276611, + "learning_rate": 1.8204491872379769e-06, + "loss": 0.0603, + "num_input_tokens_seen": 1021696, + "step": 2080 + }, + { + "epoch": 0.27517487132110335, + "grad_norm": 148.0998077392578, + "learning_rate": 1.8191300225737e-06, + "loss": 0.0996, + "num_input_tokens_seen": 1024256, + "step": 2085 + }, + { + "epoch": 0.2758347630988518, + "grad_norm": 23.68135643005371, + "learning_rate": 1.8178065112959184e-06, + "loss": 0.2261, + "num_input_tokens_seen": 1026560, + "step": 2090 + }, + { + "epoch": 0.27649465487660024, + "grad_norm": 23.90264129638672, + "learning_rate": 1.8164786604276832e-06, + "loss": 0.3078, + "num_input_tokens_seen": 1029184, + "step": 2095 + }, + { + "epoch": 0.2771545466543487, + "grad_norm": 1.5052696466445923, + "learning_rate": 1.8151464770150727e-06, + "loss": 0.1119, + "num_input_tokens_seen": 1031744, + "step": 2100 + }, + { + "epoch": 0.27781443843209713, + "grad_norm": 0.3358094096183777, + "learning_rate": 1.8138099681271558e-06, + "loss": 0.2357, + "num_input_tokens_seen": 1034048, + "step": 2105 + }, + { + "epoch": 0.2784743302098456, + "grad_norm": 0.09559042006731033, + "learning_rate": 1.8124691408559536e-06, + "loss": 0.1489, + "num_input_tokens_seen": 1036544, + "step": 2110 + }, + { + "epoch": 0.279134221987594, + "grad_norm": 0.12438903003931046, + "learning_rate": 1.8111240023164023e-06, + "loss": 0.1057, + "num_input_tokens_seen": 1038848, + "step": 2115 + }, + { + "epoch": 0.27979411376534247, + "grad_norm": 0.17464138567447662, + "learning_rate": 1.809774559646316e-06, + "loss": 0.0049, + "num_input_tokens_seen": 1041152, + "step": 2120 + }, + { + "epoch": 0.2804540055430909, + "grad_norm": 15.5991792678833, + "learning_rate": 1.8084208200063469e-06, + "loss": 0.1192, + "num_input_tokens_seen": 1043968, + "step": 2125 + }, + { + "epoch": 0.28111389732083936, + "grad_norm": 16.062332153320312, + "learning_rate": 1.8070627905799496e-06, + "loss": 0.2678, + "num_input_tokens_seen": 1046272, + "step": 2130 + }, + { + "epoch": 0.28177378909858786, + "grad_norm": 38.3685302734375, + "learning_rate": 1.8057004785733413e-06, + "loss": 0.0892, + "num_input_tokens_seen": 1048448, + "step": 2135 + }, + { + "epoch": 0.2824336808763363, + "grad_norm": 1.2147469520568848, + "learning_rate": 1.8043338912154647e-06, + "loss": 0.171, + "num_input_tokens_seen": 1051072, + "step": 2140 + }, + { + "epoch": 0.28309357265408475, + "grad_norm": 2.57453989982605, + "learning_rate": 1.8029630357579486e-06, + "loss": 0.0537, + "num_input_tokens_seen": 1053312, + "step": 2145 + }, + { + "epoch": 0.2837534644318332, + "grad_norm": 0.08693689852952957, + "learning_rate": 1.8015879194750702e-06, + "loss": 0.0727, + "num_input_tokens_seen": 1055680, + "step": 2150 + }, + { + "epoch": 0.28441335620958164, + "grad_norm": 0.2852920889854431, + "learning_rate": 1.8002085496637165e-06, + "loss": 0.1279, + "num_input_tokens_seen": 1057984, + "step": 2155 + }, + { + "epoch": 0.2850732479873301, + "grad_norm": 0.3056880831718445, + "learning_rate": 1.7988249336433448e-06, + "loss": 0.1195, + "num_input_tokens_seen": 1060736, + "step": 2160 + }, + { + "epoch": 0.2857331397650785, + "grad_norm": 40.0366096496582, + "learning_rate": 1.7974370787559447e-06, + "loss": 0.1191, + "num_input_tokens_seen": 1063424, + "step": 2165 + }, + { + "epoch": 0.28639303154282697, + "grad_norm": 0.06313258409500122, + "learning_rate": 1.796044992365999e-06, + "loss": 0.0407, + "num_input_tokens_seen": 1065728, + "step": 2170 + }, + { + "epoch": 0.2870529233205754, + "grad_norm": 0.0497964546084404, + "learning_rate": 1.794648681860444e-06, + "loss": 0.0343, + "num_input_tokens_seen": 1068160, + "step": 2175 + }, + { + "epoch": 0.28771281509832386, + "grad_norm": 0.049737598747015, + "learning_rate": 1.7932481546486312e-06, + "loss": 0.2582, + "num_input_tokens_seen": 1070592, + "step": 2180 + }, + { + "epoch": 0.2883727068760723, + "grad_norm": 15.30761432647705, + "learning_rate": 1.791843418162287e-06, + "loss": 0.161, + "num_input_tokens_seen": 1073280, + "step": 2185 + }, + { + "epoch": 0.28903259865382075, + "grad_norm": 2.0795376300811768, + "learning_rate": 1.7904344798554748e-06, + "loss": 0.0127, + "num_input_tokens_seen": 1075584, + "step": 2190 + }, + { + "epoch": 0.28969249043156925, + "grad_norm": 2.7730255126953125, + "learning_rate": 1.789021347204553e-06, + "loss": 0.0962, + "num_input_tokens_seen": 1078016, + "step": 2195 + }, + { + "epoch": 0.2903523822093177, + "grad_norm": 42.26045608520508, + "learning_rate": 1.7876040277081381e-06, + "loss": 0.1665, + "num_input_tokens_seen": 1080512, + "step": 2200 + }, + { + "epoch": 0.29101227398706614, + "grad_norm": 49.438175201416016, + "learning_rate": 1.7861825288870632e-06, + "loss": 0.1979, + "num_input_tokens_seen": 1082752, + "step": 2205 + }, + { + "epoch": 0.2916721657648146, + "grad_norm": 19.054122924804688, + "learning_rate": 1.7847568582843376e-06, + "loss": 0.3436, + "num_input_tokens_seen": 1085184, + "step": 2210 + }, + { + "epoch": 0.29233205754256303, + "grad_norm": 13.935524940490723, + "learning_rate": 1.7833270234651088e-06, + "loss": 0.1458, + "num_input_tokens_seen": 1087360, + "step": 2215 + }, + { + "epoch": 0.2929919493203115, + "grad_norm": 12.456001281738281, + "learning_rate": 1.781893032016621e-06, + "loss": 0.0619, + "num_input_tokens_seen": 1089984, + "step": 2220 + }, + { + "epoch": 0.2936518410980599, + "grad_norm": 0.7875421643257141, + "learning_rate": 1.7804548915481746e-06, + "loss": 0.0185, + "num_input_tokens_seen": 1092608, + "step": 2225 + }, + { + "epoch": 0.29431173287580836, + "grad_norm": 0.38268232345581055, + "learning_rate": 1.7790126096910865e-06, + "loss": 0.1235, + "num_input_tokens_seen": 1095040, + "step": 2230 + }, + { + "epoch": 0.2949716246535568, + "grad_norm": 8.145883560180664, + "learning_rate": 1.7775661940986492e-06, + "loss": 0.064, + "num_input_tokens_seen": 1097728, + "step": 2235 + }, + { + "epoch": 0.29563151643130525, + "grad_norm": 4.164917469024658, + "learning_rate": 1.776115652446091e-06, + "loss": 0.2202, + "num_input_tokens_seen": 1100096, + "step": 2240 + }, + { + "epoch": 0.2962914082090537, + "grad_norm": 51.62826919555664, + "learning_rate": 1.7746609924305336e-06, + "loss": 0.1252, + "num_input_tokens_seen": 1102400, + "step": 2245 + }, + { + "epoch": 0.29695129998680214, + "grad_norm": 0.07670289278030396, + "learning_rate": 1.7732022217709534e-06, + "loss": 0.1016, + "num_input_tokens_seen": 1104960, + "step": 2250 + }, + { + "epoch": 0.2976111917645506, + "grad_norm": 9.224966049194336, + "learning_rate": 1.7717393482081384e-06, + "loss": 0.0905, + "num_input_tokens_seen": 1107520, + "step": 2255 + }, + { + "epoch": 0.2982710835422991, + "grad_norm": 26.702306747436523, + "learning_rate": 1.7702723795046492e-06, + "loss": 0.1454, + "num_input_tokens_seen": 1109952, + "step": 2260 + }, + { + "epoch": 0.29893097532004753, + "grad_norm": 0.32974258065223694, + "learning_rate": 1.7688013234447757e-06, + "loss": 0.0226, + "num_input_tokens_seen": 1112128, + "step": 2265 + }, + { + "epoch": 0.299590867097796, + "grad_norm": 0.09137266874313354, + "learning_rate": 1.7673261878344973e-06, + "loss": 0.1225, + "num_input_tokens_seen": 1114688, + "step": 2270 + }, + { + "epoch": 0.30011878051999474, + "eval_loss": 0.10979828983545303, + "eval_runtime": 7.5343, + "eval_samples_per_second": 893.91, + "eval_steps_per_second": 111.755, + "num_input_tokens_seen": 1116800, + "step": 2274 + }, + { + "epoch": 0.3002507588755444, + "grad_norm": 0.13611248135566711, + "learning_rate": 1.7658469805014414e-06, + "loss": 0.1963, + "num_input_tokens_seen": 1117248, + "step": 2275 + }, + { + "epoch": 0.30091065065329287, + "grad_norm": 11.078180313110352, + "learning_rate": 1.7643637092948415e-06, + "loss": 0.1312, + "num_input_tokens_seen": 1119808, + "step": 2280 + }, + { + "epoch": 0.3015705424310413, + "grad_norm": 13.604643821716309, + "learning_rate": 1.7628763820854948e-06, + "loss": 0.2181, + "num_input_tokens_seen": 1122112, + "step": 2285 + }, + { + "epoch": 0.30223043420878976, + "grad_norm": 0.21895840764045715, + "learning_rate": 1.7613850067657216e-06, + "loss": 0.0905, + "num_input_tokens_seen": 1124544, + "step": 2290 + }, + { + "epoch": 0.3028903259865382, + "grad_norm": 0.23491473495960236, + "learning_rate": 1.7598895912493232e-06, + "loss": 0.0688, + "num_input_tokens_seen": 1127104, + "step": 2295 + }, + { + "epoch": 0.30355021776428665, + "grad_norm": 3.027573823928833, + "learning_rate": 1.7583901434715397e-06, + "loss": 0.0735, + "num_input_tokens_seen": 1129536, + "step": 2300 + }, + { + "epoch": 0.3042101095420351, + "grad_norm": 0.24895241856575012, + "learning_rate": 1.7568866713890074e-06, + "loss": 0.0694, + "num_input_tokens_seen": 1131840, + "step": 2305 + }, + { + "epoch": 0.30487000131978353, + "grad_norm": 0.47908198833465576, + "learning_rate": 1.7553791829797175e-06, + "loss": 0.1669, + "num_input_tokens_seen": 1134336, + "step": 2310 + }, + { + "epoch": 0.305529893097532, + "grad_norm": 8.449284553527832, + "learning_rate": 1.7538676862429737e-06, + "loss": 0.2863, + "num_input_tokens_seen": 1136640, + "step": 2315 + }, + { + "epoch": 0.3061897848752805, + "grad_norm": 20.579036712646484, + "learning_rate": 1.7523521891993486e-06, + "loss": 0.1177, + "num_input_tokens_seen": 1139136, + "step": 2320 + }, + { + "epoch": 0.3068496766530289, + "grad_norm": 0.1681635081768036, + "learning_rate": 1.7508326998906422e-06, + "loss": 0.0919, + "num_input_tokens_seen": 1141568, + "step": 2325 + }, + { + "epoch": 0.30750956843077737, + "grad_norm": 0.1681900918483734, + "learning_rate": 1.7493092263798394e-06, + "loss": 0.004, + "num_input_tokens_seen": 1143936, + "step": 2330 + }, + { + "epoch": 0.3081694602085258, + "grad_norm": 96.35763549804688, + "learning_rate": 1.7477817767510664e-06, + "loss": 0.037, + "num_input_tokens_seen": 1146624, + "step": 2335 + }, + { + "epoch": 0.30882935198627426, + "grad_norm": 0.5398063063621521, + "learning_rate": 1.7462503591095484e-06, + "loss": 0.0055, + "num_input_tokens_seen": 1149120, + "step": 2340 + }, + { + "epoch": 0.3094892437640227, + "grad_norm": 0.031823668628931046, + "learning_rate": 1.7447149815815659e-06, + "loss": 0.0421, + "num_input_tokens_seen": 1151488, + "step": 2345 + }, + { + "epoch": 0.31014913554177115, + "grad_norm": 0.048917245119810104, + "learning_rate": 1.7431756523144126e-06, + "loss": 0.1083, + "num_input_tokens_seen": 1153600, + "step": 2350 + }, + { + "epoch": 0.3108090273195196, + "grad_norm": 0.025062644854187965, + "learning_rate": 1.7416323794763512e-06, + "loss": 0.0665, + "num_input_tokens_seen": 1156224, + "step": 2355 + }, + { + "epoch": 0.31146891909726804, + "grad_norm": 0.01710972562432289, + "learning_rate": 1.7400851712565707e-06, + "loss": 0.2148, + "num_input_tokens_seen": 1158656, + "step": 2360 + }, + { + "epoch": 0.3121288108750165, + "grad_norm": 0.0433725044131279, + "learning_rate": 1.7385340358651432e-06, + "loss": 0.2065, + "num_input_tokens_seen": 1161408, + "step": 2365 + }, + { + "epoch": 0.3127887026527649, + "grad_norm": 119.09558868408203, + "learning_rate": 1.736978981532979e-06, + "loss": 0.0283, + "num_input_tokens_seen": 1163904, + "step": 2370 + }, + { + "epoch": 0.31344859443051337, + "grad_norm": 0.16062359511852264, + "learning_rate": 1.7354200165117838e-06, + "loss": 0.2238, + "num_input_tokens_seen": 1166208, + "step": 2375 + }, + { + "epoch": 0.3141084862082619, + "grad_norm": 226.05735778808594, + "learning_rate": 1.733857149074016e-06, + "loss": 0.2442, + "num_input_tokens_seen": 1168512, + "step": 2380 + }, + { + "epoch": 0.3147683779860103, + "grad_norm": 0.14660975337028503, + "learning_rate": 1.7322903875128402e-06, + "loss": 0.1859, + "num_input_tokens_seen": 1171072, + "step": 2385 + }, + { + "epoch": 0.31542826976375876, + "grad_norm": 0.15770386159420013, + "learning_rate": 1.7307197401420858e-06, + "loss": 0.0042, + "num_input_tokens_seen": 1173312, + "step": 2390 + }, + { + "epoch": 0.3160881615415072, + "grad_norm": 0.41774991154670715, + "learning_rate": 1.7291452152962018e-06, + "loss": 0.0649, + "num_input_tokens_seen": 1175744, + "step": 2395 + }, + { + "epoch": 0.31674805331925565, + "grad_norm": 0.3258494734764099, + "learning_rate": 1.7275668213302116e-06, + "loss": 0.1831, + "num_input_tokens_seen": 1178112, + "step": 2400 + }, + { + "epoch": 0.3174079450970041, + "grad_norm": 57.20950698852539, + "learning_rate": 1.72598456661967e-06, + "loss": 0.0443, + "num_input_tokens_seen": 1180352, + "step": 2405 + }, + { + "epoch": 0.31806783687475254, + "grad_norm": 0.16907945275306702, + "learning_rate": 1.7243984595606191e-06, + "loss": 0.1393, + "num_input_tokens_seen": 1182528, + "step": 2410 + }, + { + "epoch": 0.318727728652501, + "grad_norm": 0.5881856679916382, + "learning_rate": 1.722808508569542e-06, + "loss": 0.0891, + "num_input_tokens_seen": 1185280, + "step": 2415 + }, + { + "epoch": 0.31938762043024943, + "grad_norm": 14.51777458190918, + "learning_rate": 1.72121472208332e-06, + "loss": 0.0768, + "num_input_tokens_seen": 1188032, + "step": 2420 + }, + { + "epoch": 0.3200475122079979, + "grad_norm": 83.25537872314453, + "learning_rate": 1.7196171085591864e-06, + "loss": 0.2321, + "num_input_tokens_seen": 1190464, + "step": 2425 + }, + { + "epoch": 0.3207074039857463, + "grad_norm": 222.58181762695312, + "learning_rate": 1.7180156764746824e-06, + "loss": 0.2085, + "num_input_tokens_seen": 1192960, + "step": 2430 + }, + { + "epoch": 0.32136729576349476, + "grad_norm": 0.1936943382024765, + "learning_rate": 1.7164104343276113e-06, + "loss": 0.0272, + "num_input_tokens_seen": 1195072, + "step": 2435 + }, + { + "epoch": 0.3220271875412432, + "grad_norm": 0.019381973892450333, + "learning_rate": 1.714801390635996e-06, + "loss": 0.0063, + "num_input_tokens_seen": 1197376, + "step": 2440 + }, + { + "epoch": 0.3226870793189917, + "grad_norm": 0.04609265550971031, + "learning_rate": 1.7131885539380297e-06, + "loss": 0.038, + "num_input_tokens_seen": 1199936, + "step": 2445 + }, + { + "epoch": 0.32334697109674015, + "grad_norm": 31.58684539794922, + "learning_rate": 1.7115719327920335e-06, + "loss": 0.1487, + "num_input_tokens_seen": 1202368, + "step": 2450 + }, + { + "epoch": 0.3240068628744886, + "grad_norm": 0.015219883061945438, + "learning_rate": 1.70995153577641e-06, + "loss": 0.0011, + "num_input_tokens_seen": 1204800, + "step": 2455 + }, + { + "epoch": 0.32466675465223704, + "grad_norm": 27.071508407592773, + "learning_rate": 1.7083273714895991e-06, + "loss": 0.0641, + "num_input_tokens_seen": 1207552, + "step": 2460 + }, + { + "epoch": 0.3253266464299855, + "grad_norm": 0.03585462644696236, + "learning_rate": 1.7066994485500298e-06, + "loss": 0.2123, + "num_input_tokens_seen": 1209856, + "step": 2465 + }, + { + "epoch": 0.32598653820773393, + "grad_norm": 0.20441071689128876, + "learning_rate": 1.7050677755960762e-06, + "loss": 0.0982, + "num_input_tokens_seen": 1212352, + "step": 2470 + }, + { + "epoch": 0.3266464299854824, + "grad_norm": 99.99305725097656, + "learning_rate": 1.7034323612860124e-06, + "loss": 0.1048, + "num_input_tokens_seen": 1214912, + "step": 2475 + }, + { + "epoch": 0.3273063217632308, + "grad_norm": 0.028969811275601387, + "learning_rate": 1.7017932142979645e-06, + "loss": 0.0354, + "num_input_tokens_seen": 1217088, + "step": 2480 + }, + { + "epoch": 0.32796621354097927, + "grad_norm": 4.167091369628906, + "learning_rate": 1.700150343329866e-06, + "loss": 0.2006, + "num_input_tokens_seen": 1219584, + "step": 2485 + }, + { + "epoch": 0.3286261053187277, + "grad_norm": 30.39582633972168, + "learning_rate": 1.6985037570994113e-06, + "loss": 0.1335, + "num_input_tokens_seen": 1222336, + "step": 2490 + }, + { + "epoch": 0.32928599709647616, + "grad_norm": 0.7080073952674866, + "learning_rate": 1.6968534643440088e-06, + "loss": 0.0688, + "num_input_tokens_seen": 1224832, + "step": 2495 + }, + { + "epoch": 0.3299458888742246, + "grad_norm": 36.44700241088867, + "learning_rate": 1.6951994738207364e-06, + "loss": 0.1821, + "num_input_tokens_seen": 1227392, + "step": 2500 + }, + { + "epoch": 0.3306057806519731, + "grad_norm": 12.277377128601074, + "learning_rate": 1.6935417943062928e-06, + "loss": 0.2034, + "num_input_tokens_seen": 1229952, + "step": 2505 + }, + { + "epoch": 0.33126567242972155, + "grad_norm": 0.4052470028400421, + "learning_rate": 1.6918804345969516e-06, + "loss": 0.0106, + "num_input_tokens_seen": 1232640, + "step": 2510 + }, + { + "epoch": 0.33192556420747, + "grad_norm": 26.028976440429688, + "learning_rate": 1.6902154035085156e-06, + "loss": 0.0161, + "num_input_tokens_seen": 1235200, + "step": 2515 + }, + { + "epoch": 0.33258545598521844, + "grad_norm": 0.1962713748216629, + "learning_rate": 1.688546709876269e-06, + "loss": 0.0893, + "num_input_tokens_seen": 1237632, + "step": 2520 + }, + { + "epoch": 0.3332453477629669, + "grad_norm": 1.0130256414413452, + "learning_rate": 1.6868743625549314e-06, + "loss": 0.0905, + "num_input_tokens_seen": 1239936, + "step": 2525 + }, + { + "epoch": 0.3339052395407153, + "grad_norm": 0.6345663666725159, + "learning_rate": 1.6851983704186092e-06, + "loss": 0.0392, + "num_input_tokens_seen": 1242304, + "step": 2530 + }, + { + "epoch": 0.33456513131846377, + "grad_norm": 0.025052571669220924, + "learning_rate": 1.6835187423607503e-06, + "loss": 0.0036, + "num_input_tokens_seen": 1244736, + "step": 2535 + }, + { + "epoch": 0.3352250230962122, + "grad_norm": 0.6511954665184021, + "learning_rate": 1.681835487294096e-06, + "loss": 0.2003, + "num_input_tokens_seen": 1247488, + "step": 2540 + }, + { + "epoch": 0.33588491487396066, + "grad_norm": 0.022017456591129303, + "learning_rate": 1.6801486141506342e-06, + "loss": 0.2557, + "num_input_tokens_seen": 1250048, + "step": 2545 + }, + { + "epoch": 0.3365448066517091, + "grad_norm": 160.31907653808594, + "learning_rate": 1.6784581318815514e-06, + "loss": 0.3749, + "num_input_tokens_seen": 1252928, + "step": 2550 + }, + { + "epoch": 0.33720469842945755, + "grad_norm": 0.035775672644376755, + "learning_rate": 1.6767640494571849e-06, + "loss": 0.146, + "num_input_tokens_seen": 1255488, + "step": 2555 + }, + { + "epoch": 0.337864590207206, + "grad_norm": 25.89168357849121, + "learning_rate": 1.6750663758669767e-06, + "loss": 0.3346, + "num_input_tokens_seen": 1257984, + "step": 2560 + }, + { + "epoch": 0.3385244819849545, + "grad_norm": 0.10507479310035706, + "learning_rate": 1.6733651201194245e-06, + "loss": 0.1044, + "num_input_tokens_seen": 1260416, + "step": 2565 + }, + { + "epoch": 0.33918437376270294, + "grad_norm": 36.37501907348633, + "learning_rate": 1.6716602912420342e-06, + "loss": 0.0797, + "num_input_tokens_seen": 1263168, + "step": 2570 + }, + { + "epoch": 0.3398442655404514, + "grad_norm": 0.687891960144043, + "learning_rate": 1.6699518982812726e-06, + "loss": 0.1608, + "num_input_tokens_seen": 1265600, + "step": 2575 + }, + { + "epoch": 0.34050415731819983, + "grad_norm": 0.11499731987714767, + "learning_rate": 1.6682399503025183e-06, + "loss": 0.0033, + "num_input_tokens_seen": 1268032, + "step": 2580 + }, + { + "epoch": 0.3411640490959483, + "grad_norm": 42.86396408081055, + "learning_rate": 1.666524456390014e-06, + "loss": 0.1571, + "num_input_tokens_seen": 1270336, + "step": 2585 + }, + { + "epoch": 0.3418239408736967, + "grad_norm": 30.411161422729492, + "learning_rate": 1.664805425646819e-06, + "loss": 0.0566, + "num_input_tokens_seen": 1273088, + "step": 2590 + }, + { + "epoch": 0.34248383265144516, + "grad_norm": 0.1486613005399704, + "learning_rate": 1.6630828671947606e-06, + "loss": 0.2203, + "num_input_tokens_seen": 1275456, + "step": 2595 + }, + { + "epoch": 0.3431437244291936, + "grad_norm": 0.21017670631408691, + "learning_rate": 1.6613567901743842e-06, + "loss": 0.0365, + "num_input_tokens_seen": 1277888, + "step": 2600 + }, + { + "epoch": 0.34380361620694205, + "grad_norm": 0.2567872107028961, + "learning_rate": 1.6596272037449075e-06, + "loss": 0.0013, + "num_input_tokens_seen": 1280384, + "step": 2605 + }, + { + "epoch": 0.3444635079846905, + "grad_norm": 35.565086364746094, + "learning_rate": 1.6578941170841696e-06, + "loss": 0.064, + "num_input_tokens_seen": 1282944, + "step": 2610 + }, + { + "epoch": 0.34512339976243894, + "grad_norm": 0.281055212020874, + "learning_rate": 1.6561575393885833e-06, + "loss": 0.0664, + "num_input_tokens_seen": 1285184, + "step": 2615 + }, + { + "epoch": 0.3457832915401874, + "grad_norm": 0.0915956199169159, + "learning_rate": 1.6544174798730864e-06, + "loss": 0.1976, + "num_input_tokens_seen": 1287808, + "step": 2620 + }, + { + "epoch": 0.34644318331793583, + "grad_norm": 0.18124467134475708, + "learning_rate": 1.6526739477710923e-06, + "loss": 0.1552, + "num_input_tokens_seen": 1290432, + "step": 2625 + }, + { + "epoch": 0.34710307509568433, + "grad_norm": 0.16865764558315277, + "learning_rate": 1.650926952334441e-06, + "loss": 0.2257, + "num_input_tokens_seen": 1292736, + "step": 2630 + }, + { + "epoch": 0.3477629668734328, + "grad_norm": 0.4664243757724762, + "learning_rate": 1.6491765028333516e-06, + "loss": 0.2674, + "num_input_tokens_seen": 1295104, + "step": 2635 + }, + { + "epoch": 0.3484228586511812, + "grad_norm": 0.6427319645881653, + "learning_rate": 1.6474226085563693e-06, + "loss": 0.0204, + "num_input_tokens_seen": 1297600, + "step": 2640 + }, + { + "epoch": 0.34908275042892967, + "grad_norm": 0.10163812339305878, + "learning_rate": 1.6456652788103215e-06, + "loss": 0.0496, + "num_input_tokens_seen": 1300224, + "step": 2645 + }, + { + "epoch": 0.3497426422066781, + "grad_norm": 2.703385829925537, + "learning_rate": 1.6439045229202631e-06, + "loss": 0.1152, + "num_input_tokens_seen": 1302528, + "step": 2650 + }, + { + "epoch": 0.3501385772733272, + "eval_loss": 0.12348020076751709, + "eval_runtime": 7.625, + "eval_samples_per_second": 883.275, + "eval_steps_per_second": 110.426, + "num_input_tokens_seen": 1303872, + "step": 2653 + }, + { + "epoch": 0.35040253398442656, + "grad_norm": 0.24891549348831177, + "learning_rate": 1.6421403502294307e-06, + "loss": 0.159, + "num_input_tokens_seen": 1305024, + "step": 2655 + }, + { + "epoch": 0.351062425762175, + "grad_norm": 0.2551489472389221, + "learning_rate": 1.6403727700991915e-06, + "loss": 0.1813, + "num_input_tokens_seen": 1307392, + "step": 2660 + }, + { + "epoch": 0.35172231753992345, + "grad_norm": 0.29944464564323425, + "learning_rate": 1.6386017919089933e-06, + "loss": 0.1581, + "num_input_tokens_seen": 1310016, + "step": 2665 + }, + { + "epoch": 0.3523822093176719, + "grad_norm": 0.0917045846581459, + "learning_rate": 1.636827425056316e-06, + "loss": 0.0066, + "num_input_tokens_seen": 1312576, + "step": 2670 + }, + { + "epoch": 0.35304210109542034, + "grad_norm": 0.10297297686338425, + "learning_rate": 1.635049678956621e-06, + "loss": 0.1432, + "num_input_tokens_seen": 1315072, + "step": 2675 + }, + { + "epoch": 0.3537019928731688, + "grad_norm": 32.947994232177734, + "learning_rate": 1.633268563043301e-06, + "loss": 0.1222, + "num_input_tokens_seen": 1317504, + "step": 2680 + }, + { + "epoch": 0.3543618846509172, + "grad_norm": 0.27496451139450073, + "learning_rate": 1.63148408676763e-06, + "loss": 0.0023, + "num_input_tokens_seen": 1319680, + "step": 2685 + }, + { + "epoch": 0.3550217764286657, + "grad_norm": 0.06333144754171371, + "learning_rate": 1.6296962595987141e-06, + "loss": 0.0014, + "num_input_tokens_seen": 1322240, + "step": 2690 + }, + { + "epoch": 0.35568166820641417, + "grad_norm": 1.364142894744873, + "learning_rate": 1.6279050910234392e-06, + "loss": 0.1142, + "num_input_tokens_seen": 1324736, + "step": 2695 + }, + { + "epoch": 0.3563415599841626, + "grad_norm": 0.07366377115249634, + "learning_rate": 1.626110590546423e-06, + "loss": 0.0407, + "num_input_tokens_seen": 1327104, + "step": 2700 + }, + { + "epoch": 0.35700145176191106, + "grad_norm": 134.90122985839844, + "learning_rate": 1.6243127676899635e-06, + "loss": 0.248, + "num_input_tokens_seen": 1329920, + "step": 2705 + }, + { + "epoch": 0.3576613435396595, + "grad_norm": 0.04035777971148491, + "learning_rate": 1.6225116319939884e-06, + "loss": 0.2153, + "num_input_tokens_seen": 1332352, + "step": 2710 + }, + { + "epoch": 0.35832123531740795, + "grad_norm": 75.87095642089844, + "learning_rate": 1.6207071930160044e-06, + "loss": 0.1084, + "num_input_tokens_seen": 1335040, + "step": 2715 + }, + { + "epoch": 0.3589811270951564, + "grad_norm": 0.1767151951789856, + "learning_rate": 1.6188994603310468e-06, + "loss": 0.0054, + "num_input_tokens_seen": 1337472, + "step": 2720 + }, + { + "epoch": 0.35964101887290484, + "grad_norm": 3.7952630519866943, + "learning_rate": 1.617088443531628e-06, + "loss": 0.1694, + "num_input_tokens_seen": 1339712, + "step": 2725 + }, + { + "epoch": 0.3603009106506533, + "grad_norm": 0.17187942564487457, + "learning_rate": 1.6152741522276882e-06, + "loss": 0.0016, + "num_input_tokens_seen": 1342144, + "step": 2730 + }, + { + "epoch": 0.36096080242840173, + "grad_norm": 0.7987899780273438, + "learning_rate": 1.6134565960465425e-06, + "loss": 0.108, + "num_input_tokens_seen": 1344512, + "step": 2735 + }, + { + "epoch": 0.3616206942061502, + "grad_norm": 0.12640990316867828, + "learning_rate": 1.6116357846328312e-06, + "loss": 0.242, + "num_input_tokens_seen": 1346880, + "step": 2740 + }, + { + "epoch": 0.3622805859838986, + "grad_norm": 0.04579659551382065, + "learning_rate": 1.609811727648468e-06, + "loss": 0.1324, + "num_input_tokens_seen": 1349056, + "step": 2745 + }, + { + "epoch": 0.36294047776164706, + "grad_norm": 0.21617701649665833, + "learning_rate": 1.6079844347725882e-06, + "loss": 0.0724, + "num_input_tokens_seen": 1351488, + "step": 2750 + }, + { + "epoch": 0.36360036953939556, + "grad_norm": 0.17689555883407593, + "learning_rate": 1.6061539157014987e-06, + "loss": 0.0532, + "num_input_tokens_seen": 1353920, + "step": 2755 + }, + { + "epoch": 0.364260261317144, + "grad_norm": 0.18878047168254852, + "learning_rate": 1.6043201801486257e-06, + "loss": 0.2916, + "num_input_tokens_seen": 1356352, + "step": 2760 + }, + { + "epoch": 0.36492015309489245, + "grad_norm": 1.1614915132522583, + "learning_rate": 1.6024832378444628e-06, + "loss": 0.2542, + "num_input_tokens_seen": 1359104, + "step": 2765 + }, + { + "epoch": 0.3655800448726409, + "grad_norm": 26.108612060546875, + "learning_rate": 1.6006430985365204e-06, + "loss": 0.2718, + "num_input_tokens_seen": 1361536, + "step": 2770 + }, + { + "epoch": 0.36623993665038934, + "grad_norm": 113.12171936035156, + "learning_rate": 1.5987997719892735e-06, + "loss": 0.2648, + "num_input_tokens_seen": 1364160, + "step": 2775 + }, + { + "epoch": 0.3668998284281378, + "grad_norm": 0.58730149269104, + "learning_rate": 1.5969532679841088e-06, + "loss": 0.0465, + "num_input_tokens_seen": 1366656, + "step": 2780 + }, + { + "epoch": 0.36755972020588623, + "grad_norm": 32.10945510864258, + "learning_rate": 1.5951035963192752e-06, + "loss": 0.0486, + "num_input_tokens_seen": 1369216, + "step": 2785 + }, + { + "epoch": 0.3682196119836347, + "grad_norm": 1.1091487407684326, + "learning_rate": 1.593250766809829e-06, + "loss": 0.2435, + "num_input_tokens_seen": 1371712, + "step": 2790 + }, + { + "epoch": 0.3688795037613831, + "grad_norm": 61.50751495361328, + "learning_rate": 1.5913947892875842e-06, + "loss": 0.1572, + "num_input_tokens_seen": 1374080, + "step": 2795 + }, + { + "epoch": 0.36953939553913157, + "grad_norm": 0.4279724657535553, + "learning_rate": 1.589535673601059e-06, + "loss": 0.1055, + "num_input_tokens_seen": 1377024, + "step": 2800 + }, + { + "epoch": 0.37019928731688, + "grad_norm": 42.588748931884766, + "learning_rate": 1.587673429615424e-06, + "loss": 0.0806, + "num_input_tokens_seen": 1379392, + "step": 2805 + }, + { + "epoch": 0.37085917909462845, + "grad_norm": 0.18637718260288239, + "learning_rate": 1.5858080672124495e-06, + "loss": 0.1468, + "num_input_tokens_seen": 1381760, + "step": 2810 + }, + { + "epoch": 0.37151907087237696, + "grad_norm": 0.43665367364883423, + "learning_rate": 1.5839395962904536e-06, + "loss": 0.0923, + "num_input_tokens_seen": 1384128, + "step": 2815 + }, + { + "epoch": 0.3721789626501254, + "grad_norm": 0.0831814855337143, + "learning_rate": 1.5820680267642494e-06, + "loss": 0.0594, + "num_input_tokens_seen": 1386496, + "step": 2820 + }, + { + "epoch": 0.37283885442787384, + "grad_norm": 0.25996115803718567, + "learning_rate": 1.5801933685650917e-06, + "loss": 0.0668, + "num_input_tokens_seen": 1388736, + "step": 2825 + }, + { + "epoch": 0.3734987462056223, + "grad_norm": 2.1776347160339355, + "learning_rate": 1.5783156316406259e-06, + "loss": 0.002, + "num_input_tokens_seen": 1391040, + "step": 2830 + }, + { + "epoch": 0.37415863798337073, + "grad_norm": 66.52011108398438, + "learning_rate": 1.5764348259548334e-06, + "loss": 0.218, + "num_input_tokens_seen": 1393344, + "step": 2835 + }, + { + "epoch": 0.3748185297611192, + "grad_norm": 234.61207580566406, + "learning_rate": 1.5745509614879806e-06, + "loss": 0.056, + "num_input_tokens_seen": 1395648, + "step": 2840 + }, + { + "epoch": 0.3754784215388676, + "grad_norm": 0.03497995808720589, + "learning_rate": 1.572664048236564e-06, + "loss": 0.2865, + "num_input_tokens_seen": 1398272, + "step": 2845 + }, + { + "epoch": 0.37613831331661607, + "grad_norm": 0.07777401059865952, + "learning_rate": 1.570774096213259e-06, + "loss": 0.0507, + "num_input_tokens_seen": 1400576, + "step": 2850 + }, + { + "epoch": 0.3767982050943645, + "grad_norm": 0.07564707100391388, + "learning_rate": 1.5688811154468649e-06, + "loss": 0.0513, + "num_input_tokens_seen": 1403136, + "step": 2855 + }, + { + "epoch": 0.37745809687211296, + "grad_norm": 0.08237399160861969, + "learning_rate": 1.5669851159822532e-06, + "loss": 0.1228, + "num_input_tokens_seen": 1405504, + "step": 2860 + }, + { + "epoch": 0.3781179886498614, + "grad_norm": 42.22079086303711, + "learning_rate": 1.5650861078803137e-06, + "loss": 0.1389, + "num_input_tokens_seen": 1407808, + "step": 2865 + }, + { + "epoch": 0.37877788042760985, + "grad_norm": 6.883021831512451, + "learning_rate": 1.5631841012179013e-06, + "loss": 0.0692, + "num_input_tokens_seen": 1410304, + "step": 2870 + }, + { + "epoch": 0.37943777220535835, + "grad_norm": 0.3424462676048279, + "learning_rate": 1.5612791060877818e-06, + "loss": 0.004, + "num_input_tokens_seen": 1412736, + "step": 2875 + }, + { + "epoch": 0.3800976639831068, + "grad_norm": 75.88460540771484, + "learning_rate": 1.5593711325985801e-06, + "loss": 0.0961, + "num_input_tokens_seen": 1415488, + "step": 2880 + }, + { + "epoch": 0.38075755576085524, + "grad_norm": 0.043806418776512146, + "learning_rate": 1.5574601908747245e-06, + "loss": 0.21, + "num_input_tokens_seen": 1417856, + "step": 2885 + }, + { + "epoch": 0.3814174475386037, + "grad_norm": 0.06361314654350281, + "learning_rate": 1.5555462910563936e-06, + "loss": 0.0664, + "num_input_tokens_seen": 1420096, + "step": 2890 + }, + { + "epoch": 0.3820773393163521, + "grad_norm": 25.98211097717285, + "learning_rate": 1.5536294432994636e-06, + "loss": 0.2344, + "num_input_tokens_seen": 1422656, + "step": 2895 + }, + { + "epoch": 0.38273723109410057, + "grad_norm": 92.6849594116211, + "learning_rate": 1.5517096577754528e-06, + "loss": 0.0884, + "num_input_tokens_seen": 1425152, + "step": 2900 + }, + { + "epoch": 0.383397122871849, + "grad_norm": 0.08511543273925781, + "learning_rate": 1.5497869446714695e-06, + "loss": 0.0623, + "num_input_tokens_seen": 1427840, + "step": 2905 + }, + { + "epoch": 0.38405701464959746, + "grad_norm": 0.13399949669837952, + "learning_rate": 1.5478613141901558e-06, + "loss": 0.0019, + "num_input_tokens_seen": 1430144, + "step": 2910 + }, + { + "epoch": 0.3847169064273459, + "grad_norm": 0.18390312790870667, + "learning_rate": 1.5459327765496348e-06, + "loss": 0.1492, + "num_input_tokens_seen": 1432448, + "step": 2915 + }, + { + "epoch": 0.38537679820509435, + "grad_norm": 1.747375726699829, + "learning_rate": 1.5440013419834563e-06, + "loss": 0.0071, + "num_input_tokens_seen": 1434752, + "step": 2920 + }, + { + "epoch": 0.3860366899828428, + "grad_norm": 0.4480796158313751, + "learning_rate": 1.5420670207405419e-06, + "loss": 0.0011, + "num_input_tokens_seen": 1437184, + "step": 2925 + }, + { + "epoch": 0.38669658176059124, + "grad_norm": 7.325652122497559, + "learning_rate": 1.5401298230851314e-06, + "loss": 0.1098, + "num_input_tokens_seen": 1440000, + "step": 2930 + }, + { + "epoch": 0.3873564735383397, + "grad_norm": 5.879019737243652, + "learning_rate": 1.5381897592967275e-06, + "loss": 0.0072, + "num_input_tokens_seen": 1442624, + "step": 2935 + }, + { + "epoch": 0.3880163653160882, + "grad_norm": 0.20650486648082733, + "learning_rate": 1.5362468396700426e-06, + "loss": 0.0702, + "num_input_tokens_seen": 1445184, + "step": 2940 + }, + { + "epoch": 0.38867625709383663, + "grad_norm": 22.289382934570312, + "learning_rate": 1.5343010745149418e-06, + "loss": 0.322, + "num_input_tokens_seen": 1447616, + "step": 2945 + }, + { + "epoch": 0.3893361488715851, + "grad_norm": 0.035571977496147156, + "learning_rate": 1.532352474156391e-06, + "loss": 0.0715, + "num_input_tokens_seen": 1450176, + "step": 2950 + }, + { + "epoch": 0.3899960406493335, + "grad_norm": 3.2316651344299316, + "learning_rate": 1.5304010489343995e-06, + "loss": 0.4706, + "num_input_tokens_seen": 1452672, + "step": 2955 + }, + { + "epoch": 0.39065593242708196, + "grad_norm": 0.06907609850168228, + "learning_rate": 1.528446809203968e-06, + "loss": 0.2238, + "num_input_tokens_seen": 1455232, + "step": 2960 + }, + { + "epoch": 0.3913158242048304, + "grad_norm": 82.65614318847656, + "learning_rate": 1.526489765335031e-06, + "loss": 0.1729, + "num_input_tokens_seen": 1457792, + "step": 2965 + }, + { + "epoch": 0.39197571598257885, + "grad_norm": 0.3325257897377014, + "learning_rate": 1.5245299277124026e-06, + "loss": 0.1528, + "num_input_tokens_seen": 1460160, + "step": 2970 + }, + { + "epoch": 0.3926356077603273, + "grad_norm": 0.9707848429679871, + "learning_rate": 1.5225673067357218e-06, + "loss": 0.1434, + "num_input_tokens_seen": 1462400, + "step": 2975 + }, + { + "epoch": 0.39329549953807574, + "grad_norm": 22.089210510253906, + "learning_rate": 1.5206019128193981e-06, + "loss": 0.1209, + "num_input_tokens_seen": 1465088, + "step": 2980 + }, + { + "epoch": 0.3939553913158242, + "grad_norm": 1.0957697629928589, + "learning_rate": 1.5186337563925538e-06, + "loss": 0.1168, + "num_input_tokens_seen": 1467456, + "step": 2985 + }, + { + "epoch": 0.39461528309357263, + "grad_norm": 0.22268956899642944, + "learning_rate": 1.516662847898971e-06, + "loss": 0.0016, + "num_input_tokens_seen": 1470016, + "step": 2990 + }, + { + "epoch": 0.3952751748713211, + "grad_norm": 0.2794409990310669, + "learning_rate": 1.5146891977970349e-06, + "loss": 0.1024, + "num_input_tokens_seen": 1472448, + "step": 2995 + }, + { + "epoch": 0.3959350666490696, + "grad_norm": 55.23267364501953, + "learning_rate": 1.5127128165596794e-06, + "loss": 0.1009, + "num_input_tokens_seen": 1475072, + "step": 3000 + }, + { + "epoch": 0.396594958426818, + "grad_norm": 0.32357192039489746, + "learning_rate": 1.51073371467433e-06, + "loss": 0.0499, + "num_input_tokens_seen": 1477440, + "step": 3005 + }, + { + "epoch": 0.39725485020456647, + "grad_norm": 2.3438990116119385, + "learning_rate": 1.5087519026428498e-06, + "loss": 0.0043, + "num_input_tokens_seen": 1479872, + "step": 3010 + }, + { + "epoch": 0.3979147419823149, + "grad_norm": 214.1775665283203, + "learning_rate": 1.5067673909814818e-06, + "loss": 0.1242, + "num_input_tokens_seen": 1481920, + "step": 3015 + }, + { + "epoch": 0.39857463376006336, + "grad_norm": 0.06694573163986206, + "learning_rate": 1.5047801902207953e-06, + "loss": 0.1901, + "num_input_tokens_seen": 1484992, + "step": 3020 + }, + { + "epoch": 0.3992345255378118, + "grad_norm": 37.85984802246094, + "learning_rate": 1.5027903109056288e-06, + "loss": 0.1508, + "num_input_tokens_seen": 1487232, + "step": 3025 + }, + { + "epoch": 0.39989441731556025, + "grad_norm": 22.730335235595703, + "learning_rate": 1.5007977635950336e-06, + "loss": 0.1615, + "num_input_tokens_seen": 1489728, + "step": 3030 + }, + { + "epoch": 0.4001583740266596, + "eval_loss": 0.13228875398635864, + "eval_runtime": 7.7073, + "eval_samples_per_second": 873.842, + "eval_steps_per_second": 109.246, + "num_input_tokens_seen": 1490688, + "step": 3032 + }, + { + "epoch": 0.4005543090933087, + "grad_norm": 96.17181396484375, + "learning_rate": 1.498802558862219e-06, + "loss": 0.154, + "num_input_tokens_seen": 1491968, + "step": 3035 + }, + { + "epoch": 0.40121420087105714, + "grad_norm": 0.3932342231273651, + "learning_rate": 1.496804707294496e-06, + "loss": 0.1078, + "num_input_tokens_seen": 1494336, + "step": 3040 + }, + { + "epoch": 0.4018740926488056, + "grad_norm": 0.33634519577026367, + "learning_rate": 1.4948042194932195e-06, + "loss": 0.0599, + "num_input_tokens_seen": 1497472, + "step": 3045 + }, + { + "epoch": 0.402533984426554, + "grad_norm": 0.19691598415374756, + "learning_rate": 1.4928011060737341e-06, + "loss": 0.0399, + "num_input_tokens_seen": 1499968, + "step": 3050 + }, + { + "epoch": 0.40319387620430247, + "grad_norm": 0.058707304298877716, + "learning_rate": 1.4907953776653171e-06, + "loss": 0.0741, + "num_input_tokens_seen": 1502336, + "step": 3055 + }, + { + "epoch": 0.40385376798205097, + "grad_norm": 17.177833557128906, + "learning_rate": 1.4887870449111206e-06, + "loss": 0.1581, + "num_input_tokens_seen": 1504576, + "step": 3060 + }, + { + "epoch": 0.4045136597597994, + "grad_norm": 0.7955127954483032, + "learning_rate": 1.486776118468118e-06, + "loss": 0.1605, + "num_input_tokens_seen": 1507136, + "step": 3065 + }, + { + "epoch": 0.40517355153754786, + "grad_norm": 0.5847259163856506, + "learning_rate": 1.4847626090070451e-06, + "loss": 0.0716, + "num_input_tokens_seen": 1509696, + "step": 3070 + }, + { + "epoch": 0.4058334433152963, + "grad_norm": 0.25745320320129395, + "learning_rate": 1.4827465272123439e-06, + "loss": 0.299, + "num_input_tokens_seen": 1512192, + "step": 3075 + }, + { + "epoch": 0.40649333509304475, + "grad_norm": 0.3554550111293793, + "learning_rate": 1.4807278837821063e-06, + "loss": 0.0453, + "num_input_tokens_seen": 1514752, + "step": 3080 + }, + { + "epoch": 0.4071532268707932, + "grad_norm": 12.156785011291504, + "learning_rate": 1.4787066894280178e-06, + "loss": 0.2992, + "num_input_tokens_seen": 1517440, + "step": 3085 + }, + { + "epoch": 0.40781311864854164, + "grad_norm": 0.10129724442958832, + "learning_rate": 1.476682954875299e-06, + "loss": 0.0637, + "num_input_tokens_seen": 1519744, + "step": 3090 + }, + { + "epoch": 0.4084730104262901, + "grad_norm": 84.23600769042969, + "learning_rate": 1.4746566908626506e-06, + "loss": 0.0773, + "num_input_tokens_seen": 1522176, + "step": 3095 + }, + { + "epoch": 0.40913290220403853, + "grad_norm": 1.9050307273864746, + "learning_rate": 1.4726279081421956e-06, + "loss": 0.0516, + "num_input_tokens_seen": 1524352, + "step": 3100 + }, + { + "epoch": 0.409792793981787, + "grad_norm": 35.056800842285156, + "learning_rate": 1.4705966174794216e-06, + "loss": 0.2317, + "num_input_tokens_seen": 1526976, + "step": 3105 + }, + { + "epoch": 0.4104526857595354, + "grad_norm": 0.22622281312942505, + "learning_rate": 1.4685628296531248e-06, + "loss": 0.1563, + "num_input_tokens_seen": 1529152, + "step": 3110 + }, + { + "epoch": 0.41111257753728386, + "grad_norm": 1.48894202709198, + "learning_rate": 1.466526555455352e-06, + "loss": 0.051, + "num_input_tokens_seen": 1531648, + "step": 3115 + }, + { + "epoch": 0.4117724693150323, + "grad_norm": 0.444116473197937, + "learning_rate": 1.4644878056913432e-06, + "loss": 0.0057, + "num_input_tokens_seen": 1533952, + "step": 3120 + }, + { + "epoch": 0.4124323610927808, + "grad_norm": 48.74332046508789, + "learning_rate": 1.4624465911794764e-06, + "loss": 0.1887, + "num_input_tokens_seen": 1536640, + "step": 3125 + }, + { + "epoch": 0.41309225287052925, + "grad_norm": 0.06482608616352081, + "learning_rate": 1.4604029227512062e-06, + "loss": 0.0053, + "num_input_tokens_seen": 1539200, + "step": 3130 + }, + { + "epoch": 0.4137521446482777, + "grad_norm": 81.11097717285156, + "learning_rate": 1.4583568112510108e-06, + "loss": 0.1908, + "num_input_tokens_seen": 1541632, + "step": 3135 + }, + { + "epoch": 0.41441203642602614, + "grad_norm": 12.146714210510254, + "learning_rate": 1.4563082675363302e-06, + "loss": 0.0965, + "num_input_tokens_seen": 1544128, + "step": 3140 + }, + { + "epoch": 0.4150719282037746, + "grad_norm": 0.2594153583049774, + "learning_rate": 1.4542573024775122e-06, + "loss": 0.0228, + "num_input_tokens_seen": 1546368, + "step": 3145 + }, + { + "epoch": 0.41573181998152303, + "grad_norm": 4.159293174743652, + "learning_rate": 1.4522039269577521e-06, + "loss": 0.2984, + "num_input_tokens_seen": 1548736, + "step": 3150 + }, + { + "epoch": 0.4163917117592715, + "grad_norm": 0.10340887308120728, + "learning_rate": 1.4501481518730372e-06, + "loss": 0.2461, + "num_input_tokens_seen": 1551168, + "step": 3155 + }, + { + "epoch": 0.4170516035370199, + "grad_norm": 0.2676301598548889, + "learning_rate": 1.4480899881320868e-06, + "loss": 0.0719, + "num_input_tokens_seen": 1553664, + "step": 3160 + }, + { + "epoch": 0.41771149531476837, + "grad_norm": 25.496265411376953, + "learning_rate": 1.4460294466562956e-06, + "loss": 0.1771, + "num_input_tokens_seen": 1555968, + "step": 3165 + }, + { + "epoch": 0.4183713870925168, + "grad_norm": 0.47720712423324585, + "learning_rate": 1.4439665383796756e-06, + "loss": 0.0399, + "num_input_tokens_seen": 1558208, + "step": 3170 + }, + { + "epoch": 0.41903127887026526, + "grad_norm": 2.1485588550567627, + "learning_rate": 1.4419012742487972e-06, + "loss": 0.0054, + "num_input_tokens_seen": 1560640, + "step": 3175 + }, + { + "epoch": 0.4196911706480137, + "grad_norm": 5.430055618286133, + "learning_rate": 1.4398336652227335e-06, + "loss": 0.095, + "num_input_tokens_seen": 1563328, + "step": 3180 + }, + { + "epoch": 0.4203510624257622, + "grad_norm": 0.05566899850964546, + "learning_rate": 1.4377637222729986e-06, + "loss": 0.1201, + "num_input_tokens_seen": 1565696, + "step": 3185 + }, + { + "epoch": 0.42101095420351065, + "grad_norm": 0.08947694301605225, + "learning_rate": 1.435691456383493e-06, + "loss": 0.1675, + "num_input_tokens_seen": 1568640, + "step": 3190 + }, + { + "epoch": 0.4216708459812591, + "grad_norm": 2.342318058013916, + "learning_rate": 1.433616878550442e-06, + "loss": 0.1212, + "num_input_tokens_seen": 1571328, + "step": 3195 + }, + { + "epoch": 0.42233073775900754, + "grad_norm": 18.465282440185547, + "learning_rate": 1.4315399997823403e-06, + "loss": 0.3175, + "num_input_tokens_seen": 1574016, + "step": 3200 + }, + { + "epoch": 0.422990629536756, + "grad_norm": 12.997380256652832, + "learning_rate": 1.429460831099891e-06, + "loss": 0.2534, + "num_input_tokens_seen": 1576384, + "step": 3205 + }, + { + "epoch": 0.4236505213145044, + "grad_norm": 0.08205987513065338, + "learning_rate": 1.4273793835359492e-06, + "loss": 0.2136, + "num_input_tokens_seen": 1579200, + "step": 3210 + }, + { + "epoch": 0.42431041309225287, + "grad_norm": 66.97320556640625, + "learning_rate": 1.4252956681354631e-06, + "loss": 0.0964, + "num_input_tokens_seen": 1581632, + "step": 3215 + }, + { + "epoch": 0.4249703048700013, + "grad_norm": 0.7273184657096863, + "learning_rate": 1.4232096959554135e-06, + "loss": 0.0035, + "num_input_tokens_seen": 1584064, + "step": 3220 + }, + { + "epoch": 0.42563019664774976, + "grad_norm": 65.00259399414062, + "learning_rate": 1.4211214780647572e-06, + "loss": 0.0297, + "num_input_tokens_seen": 1586752, + "step": 3225 + }, + { + "epoch": 0.4262900884254982, + "grad_norm": 9.714056968688965, + "learning_rate": 1.4190310255443676e-06, + "loss": 0.0918, + "num_input_tokens_seen": 1589248, + "step": 3230 + }, + { + "epoch": 0.42694998020324665, + "grad_norm": 0.03953593969345093, + "learning_rate": 1.4169383494869764e-06, + "loss": 0.0286, + "num_input_tokens_seen": 1591552, + "step": 3235 + }, + { + "epoch": 0.4276098719809951, + "grad_norm": 117.95477294921875, + "learning_rate": 1.414843460997113e-06, + "loss": 0.0616, + "num_input_tokens_seen": 1594048, + "step": 3240 + }, + { + "epoch": 0.4282697637587436, + "grad_norm": 17.138263702392578, + "learning_rate": 1.4127463711910483e-06, + "loss": 0.1517, + "num_input_tokens_seen": 1596544, + "step": 3245 + }, + { + "epoch": 0.42892965553649204, + "grad_norm": 5.194220542907715, + "learning_rate": 1.410647091196733e-06, + "loss": 0.1214, + "num_input_tokens_seen": 1599104, + "step": 3250 + }, + { + "epoch": 0.4295895473142405, + "grad_norm": 0.02321782521903515, + "learning_rate": 1.4085456321537402e-06, + "loss": 0.124, + "num_input_tokens_seen": 1601344, + "step": 3255 + }, + { + "epoch": 0.43024943909198893, + "grad_norm": 10.903656005859375, + "learning_rate": 1.4064420052132056e-06, + "loss": 0.1022, + "num_input_tokens_seen": 1603968, + "step": 3260 + }, + { + "epoch": 0.4309093308697374, + "grad_norm": 75.95123291015625, + "learning_rate": 1.4043362215377696e-06, + "loss": 0.078, + "num_input_tokens_seen": 1606400, + "step": 3265 + }, + { + "epoch": 0.4315692226474858, + "grad_norm": 0.12190647423267365, + "learning_rate": 1.4022282923015158e-06, + "loss": 0.1095, + "num_input_tokens_seen": 1608960, + "step": 3270 + }, + { + "epoch": 0.43222911442523426, + "grad_norm": 0.8287085294723511, + "learning_rate": 1.4001182286899136e-06, + "loss": 0.0042, + "num_input_tokens_seen": 1611456, + "step": 3275 + }, + { + "epoch": 0.4328890062029827, + "grad_norm": 0.0886739045381546, + "learning_rate": 1.398006041899758e-06, + "loss": 0.0458, + "num_input_tokens_seen": 1613952, + "step": 3280 + }, + { + "epoch": 0.43354889798073115, + "grad_norm": 27.18416404724121, + "learning_rate": 1.3958917431391102e-06, + "loss": 0.1192, + "num_input_tokens_seen": 1616320, + "step": 3285 + }, + { + "epoch": 0.4342087897584796, + "grad_norm": 0.13577166199684143, + "learning_rate": 1.3937753436272388e-06, + "loss": 0.1763, + "num_input_tokens_seen": 1619136, + "step": 3290 + }, + { + "epoch": 0.43486868153622804, + "grad_norm": 431.9822082519531, + "learning_rate": 1.3916568545945597e-06, + "loss": 0.0483, + "num_input_tokens_seen": 1621632, + "step": 3295 + }, + { + "epoch": 0.4355285733139765, + "grad_norm": 0.2625204920768738, + "learning_rate": 1.3895362872825764e-06, + "loss": 0.1352, + "num_input_tokens_seen": 1624064, + "step": 3300 + }, + { + "epoch": 0.43618846509172493, + "grad_norm": 0.5975183844566345, + "learning_rate": 1.3874136529438205e-06, + "loss": 0.1454, + "num_input_tokens_seen": 1626496, + "step": 3305 + }, + { + "epoch": 0.43684835686947343, + "grad_norm": 9.573996543884277, + "learning_rate": 1.3852889628417918e-06, + "loss": 0.0691, + "num_input_tokens_seen": 1628800, + "step": 3310 + }, + { + "epoch": 0.4375082486472219, + "grad_norm": 2.738884925842285, + "learning_rate": 1.3831622282508994e-06, + "loss": 0.0967, + "num_input_tokens_seen": 1631232, + "step": 3315 + }, + { + "epoch": 0.4381681404249703, + "grad_norm": 0.1655990183353424, + "learning_rate": 1.3810334604564007e-06, + "loss": 0.0018, + "num_input_tokens_seen": 1633728, + "step": 3320 + }, + { + "epoch": 0.43882803220271877, + "grad_norm": 0.21200844645500183, + "learning_rate": 1.3789026707543423e-06, + "loss": 0.0695, + "num_input_tokens_seen": 1636224, + "step": 3325 + }, + { + "epoch": 0.4394879239804672, + "grad_norm": 0.12617841362953186, + "learning_rate": 1.3767698704514998e-06, + "loss": 0.0631, + "num_input_tokens_seen": 1638272, + "step": 3330 + }, + { + "epoch": 0.44014781575821565, + "grad_norm": 0.025392625480890274, + "learning_rate": 1.3746350708653175e-06, + "loss": 0.1898, + "num_input_tokens_seen": 1640512, + "step": 3335 + }, + { + "epoch": 0.4408077075359641, + "grad_norm": 51.78602981567383, + "learning_rate": 1.3724982833238495e-06, + "loss": 0.1903, + "num_input_tokens_seen": 1642944, + "step": 3340 + }, + { + "epoch": 0.44146759931371254, + "grad_norm": 0.11096933484077454, + "learning_rate": 1.370359519165697e-06, + "loss": 0.0559, + "num_input_tokens_seen": 1645376, + "step": 3345 + }, + { + "epoch": 0.442127491091461, + "grad_norm": 259.23699951171875, + "learning_rate": 1.368218789739952e-06, + "loss": 0.0108, + "num_input_tokens_seen": 1647936, + "step": 3350 + }, + { + "epoch": 0.44278738286920943, + "grad_norm": 0.37444016337394714, + "learning_rate": 1.3660761064061337e-06, + "loss": 0.065, + "num_input_tokens_seen": 1650496, + "step": 3355 + }, + { + "epoch": 0.4434472746469579, + "grad_norm": 0.05476607382297516, + "learning_rate": 1.3639314805341297e-06, + "loss": 0.0935, + "num_input_tokens_seen": 1652992, + "step": 3360 + }, + { + "epoch": 0.4441071664247063, + "grad_norm": 0.11798688024282455, + "learning_rate": 1.3617849235041355e-06, + "loss": 0.0665, + "num_input_tokens_seen": 1655488, + "step": 3365 + }, + { + "epoch": 0.4447670582024548, + "grad_norm": 0.04145582392811775, + "learning_rate": 1.3596364467065938e-06, + "loss": 0.1599, + "num_input_tokens_seen": 1657984, + "step": 3370 + }, + { + "epoch": 0.44542694998020327, + "grad_norm": 90.30973052978516, + "learning_rate": 1.3574860615421346e-06, + "loss": 0.229, + "num_input_tokens_seen": 1660736, + "step": 3375 + }, + { + "epoch": 0.4460868417579517, + "grad_norm": 12.61612319946289, + "learning_rate": 1.3553337794215147e-06, + "loss": 0.192, + "num_input_tokens_seen": 1663104, + "step": 3380 + }, + { + "epoch": 0.44674673353570016, + "grad_norm": 75.10413360595703, + "learning_rate": 1.3531796117655565e-06, + "loss": 0.0766, + "num_input_tokens_seen": 1665344, + "step": 3385 + }, + { + "epoch": 0.4474066253134486, + "grad_norm": 30.948253631591797, + "learning_rate": 1.3510235700050873e-06, + "loss": 0.1651, + "num_input_tokens_seen": 1668096, + "step": 3390 + }, + { + "epoch": 0.44806651709119705, + "grad_norm": 22.553556442260742, + "learning_rate": 1.3488656655808801e-06, + "loss": 0.0679, + "num_input_tokens_seen": 1670272, + "step": 3395 + }, + { + "epoch": 0.4487264088689455, + "grad_norm": 1.1050207614898682, + "learning_rate": 1.3467059099435912e-06, + "loss": 0.0905, + "num_input_tokens_seen": 1672448, + "step": 3400 + }, + { + "epoch": 0.44938630064669394, + "grad_norm": 0.16898778080940247, + "learning_rate": 1.3445443145537002e-06, + "loss": 0.0608, + "num_input_tokens_seen": 1675200, + "step": 3405 + }, + { + "epoch": 0.4500461924244424, + "grad_norm": 1.0715267658233643, + "learning_rate": 1.3423808908814494e-06, + "loss": 0.0698, + "num_input_tokens_seen": 1677696, + "step": 3410 + }, + { + "epoch": 0.45017817077999206, + "eval_loss": 0.1182408258318901, + "eval_runtime": 7.6199, + "eval_samples_per_second": 883.874, + "eval_steps_per_second": 110.501, + "num_input_tokens_seen": 1678208, + "step": 3411 + }, + { + "epoch": 0.4507060842021908, + "grad_norm": 14.29131031036377, + "learning_rate": 1.3402156504067826e-06, + "loss": 0.0969, + "num_input_tokens_seen": 1680256, + "step": 3415 + }, + { + "epoch": 0.45136597597993927, + "grad_norm": 0.1442999541759491, + "learning_rate": 1.338048604619284e-06, + "loss": 0.1191, + "num_input_tokens_seen": 1682624, + "step": 3420 + }, + { + "epoch": 0.4520258677576877, + "grad_norm": 33.37054443359375, + "learning_rate": 1.3358797650181178e-06, + "loss": 0.0365, + "num_input_tokens_seen": 1685056, + "step": 3425 + }, + { + "epoch": 0.45268575953543616, + "grad_norm": 132.64529418945312, + "learning_rate": 1.3337091431119662e-06, + "loss": 0.1349, + "num_input_tokens_seen": 1687168, + "step": 3430 + }, + { + "epoch": 0.45334565131318466, + "grad_norm": 168.06629943847656, + "learning_rate": 1.3315367504189698e-06, + "loss": 0.3197, + "num_input_tokens_seen": 1689216, + "step": 3435 + }, + { + "epoch": 0.4540055430909331, + "grad_norm": 86.57543182373047, + "learning_rate": 1.3293625984666656e-06, + "loss": 0.0946, + "num_input_tokens_seen": 1691776, + "step": 3440 + }, + { + "epoch": 0.45466543486868155, + "grad_norm": 0.10748296976089478, + "learning_rate": 1.3271866987919254e-06, + "loss": 0.0012, + "num_input_tokens_seen": 1694336, + "step": 3445 + }, + { + "epoch": 0.45532532664643, + "grad_norm": 0.3375436067581177, + "learning_rate": 1.325009062940895e-06, + "loss": 0.2113, + "num_input_tokens_seen": 1696640, + "step": 3450 + }, + { + "epoch": 0.45598521842417844, + "grad_norm": 15.320273399353027, + "learning_rate": 1.3228297024689336e-06, + "loss": 0.0765, + "num_input_tokens_seen": 1698880, + "step": 3455 + }, + { + "epoch": 0.4566451102019269, + "grad_norm": 23.91095733642578, + "learning_rate": 1.3206486289405519e-06, + "loss": 0.1025, + "num_input_tokens_seen": 1701312, + "step": 3460 + }, + { + "epoch": 0.45730500197967533, + "grad_norm": 44.923030853271484, + "learning_rate": 1.3184658539293496e-06, + "loss": 0.1407, + "num_input_tokens_seen": 1703808, + "step": 3465 + }, + { + "epoch": 0.4579648937574238, + "grad_norm": 65.6329116821289, + "learning_rate": 1.3162813890179564e-06, + "loss": 0.125, + "num_input_tokens_seen": 1706304, + "step": 3470 + }, + { + "epoch": 0.4586247855351722, + "grad_norm": 12.479512214660645, + "learning_rate": 1.314095245797969e-06, + "loss": 0.3138, + "num_input_tokens_seen": 1708736, + "step": 3475 + }, + { + "epoch": 0.45928467731292066, + "grad_norm": 0.6768988370895386, + "learning_rate": 1.3119074358698891e-06, + "loss": 0.1379, + "num_input_tokens_seen": 1711232, + "step": 3480 + }, + { + "epoch": 0.4599445690906691, + "grad_norm": 0.6303845047950745, + "learning_rate": 1.3097179708430634e-06, + "loss": 0.0039, + "num_input_tokens_seen": 1713600, + "step": 3485 + }, + { + "epoch": 0.46060446086841755, + "grad_norm": 0.1511518806219101, + "learning_rate": 1.3075268623356214e-06, + "loss": 0.2013, + "num_input_tokens_seen": 1716224, + "step": 3490 + }, + { + "epoch": 0.46126435264616605, + "grad_norm": 34.9669189453125, + "learning_rate": 1.305334121974412e-06, + "loss": 0.1515, + "num_input_tokens_seen": 1718720, + "step": 3495 + }, + { + "epoch": 0.4619242444239145, + "grad_norm": 46.562442779541016, + "learning_rate": 1.3031397613949448e-06, + "loss": 0.1062, + "num_input_tokens_seen": 1721280, + "step": 3500 + }, + { + "epoch": 0.46258413620166294, + "grad_norm": 93.35523986816406, + "learning_rate": 1.3009437922413266e-06, + "loss": 0.0727, + "num_input_tokens_seen": 1723712, + "step": 3505 + }, + { + "epoch": 0.4632440279794114, + "grad_norm": 87.05264282226562, + "learning_rate": 1.2987462261661994e-06, + "loss": 0.0932, + "num_input_tokens_seen": 1725952, + "step": 3510 + }, + { + "epoch": 0.46390391975715983, + "grad_norm": 58.2432975769043, + "learning_rate": 1.2965470748306798e-06, + "loss": 0.0048, + "num_input_tokens_seen": 1728512, + "step": 3515 + }, + { + "epoch": 0.4645638115349083, + "grad_norm": 9.179746627807617, + "learning_rate": 1.2943463499042957e-06, + "loss": 0.094, + "num_input_tokens_seen": 1731008, + "step": 3520 + }, + { + "epoch": 0.4652237033126567, + "grad_norm": 0.5701031684875488, + "learning_rate": 1.2921440630649257e-06, + "loss": 0.1567, + "num_input_tokens_seen": 1733696, + "step": 3525 + }, + { + "epoch": 0.46588359509040517, + "grad_norm": 245.243408203125, + "learning_rate": 1.2899402259987355e-06, + "loss": 0.0778, + "num_input_tokens_seen": 1736256, + "step": 3530 + }, + { + "epoch": 0.4665434868681536, + "grad_norm": 0.34011900424957275, + "learning_rate": 1.287734850400118e-06, + "loss": 0.2758, + "num_input_tokens_seen": 1738944, + "step": 3535 + }, + { + "epoch": 0.46720337864590206, + "grad_norm": 19.37761116027832, + "learning_rate": 1.2855279479716297e-06, + "loss": 0.1846, + "num_input_tokens_seen": 1741568, + "step": 3540 + }, + { + "epoch": 0.4678632704236505, + "grad_norm": 0.1848049759864807, + "learning_rate": 1.283319530423929e-06, + "loss": 0.0017, + "num_input_tokens_seen": 1743808, + "step": 3545 + }, + { + "epoch": 0.46852316220139895, + "grad_norm": 0.10032381117343903, + "learning_rate": 1.2811096094757144e-06, + "loss": 0.0026, + "num_input_tokens_seen": 1746176, + "step": 3550 + }, + { + "epoch": 0.46918305397914745, + "grad_norm": 0.09643909335136414, + "learning_rate": 1.2788981968536612e-06, + "loss": 0.1779, + "num_input_tokens_seen": 1748608, + "step": 3555 + }, + { + "epoch": 0.4698429457568959, + "grad_norm": 0.24367760121822357, + "learning_rate": 1.2766853042923607e-06, + "loss": 0.1046, + "num_input_tokens_seen": 1751040, + "step": 3560 + }, + { + "epoch": 0.47050283753464434, + "grad_norm": 1.557897686958313, + "learning_rate": 1.2744709435342573e-06, + "loss": 0.0626, + "num_input_tokens_seen": 1753280, + "step": 3565 + }, + { + "epoch": 0.4711627293123928, + "grad_norm": 13.281846046447754, + "learning_rate": 1.2722551263295864e-06, + "loss": 0.2856, + "num_input_tokens_seen": 1755712, + "step": 3570 + }, + { + "epoch": 0.4718226210901412, + "grad_norm": 53.76845169067383, + "learning_rate": 1.2700378644363114e-06, + "loss": 0.1173, + "num_input_tokens_seen": 1757952, + "step": 3575 + }, + { + "epoch": 0.47248251286788967, + "grad_norm": 23.442663192749023, + "learning_rate": 1.2678191696200621e-06, + "loss": 0.0951, + "num_input_tokens_seen": 1760384, + "step": 3580 + }, + { + "epoch": 0.4731424046456381, + "grad_norm": 0.13637100160121918, + "learning_rate": 1.2655990536540717e-06, + "loss": 0.0029, + "num_input_tokens_seen": 1762944, + "step": 3585 + }, + { + "epoch": 0.47380229642338656, + "grad_norm": 36.00935363769531, + "learning_rate": 1.2633775283191144e-06, + "loss": 0.275, + "num_input_tokens_seen": 1765504, + "step": 3590 + }, + { + "epoch": 0.474462188201135, + "grad_norm": 0.4418662190437317, + "learning_rate": 1.2611546054034436e-06, + "loss": 0.0527, + "num_input_tokens_seen": 1768128, + "step": 3595 + }, + { + "epoch": 0.47512207997888345, + "grad_norm": 0.2341255098581314, + "learning_rate": 1.2589302967027285e-06, + "loss": 0.1554, + "num_input_tokens_seen": 1770624, + "step": 3600 + }, + { + "epoch": 0.4757819717566319, + "grad_norm": 23.149660110473633, + "learning_rate": 1.2567046140199914e-06, + "loss": 0.2221, + "num_input_tokens_seen": 1773248, + "step": 3605 + }, + { + "epoch": 0.47644186353438034, + "grad_norm": 1.1026215553283691, + "learning_rate": 1.2544775691655463e-06, + "loss": 0.0267, + "num_input_tokens_seen": 1775488, + "step": 3610 + }, + { + "epoch": 0.4771017553121288, + "grad_norm": 0.24849441647529602, + "learning_rate": 1.2522491739569346e-06, + "loss": 0.1329, + "num_input_tokens_seen": 1777792, + "step": 3615 + }, + { + "epoch": 0.4777616470898773, + "grad_norm": 1.301603078842163, + "learning_rate": 1.250019440218864e-06, + "loss": 0.0942, + "num_input_tokens_seen": 1780352, + "step": 3620 + }, + { + "epoch": 0.47842153886762573, + "grad_norm": 0.6911696195602417, + "learning_rate": 1.247788379783144e-06, + "loss": 0.1692, + "num_input_tokens_seen": 1783168, + "step": 3625 + }, + { + "epoch": 0.4790814306453742, + "grad_norm": 97.18595123291016, + "learning_rate": 1.2455560044886248e-06, + "loss": 0.0503, + "num_input_tokens_seen": 1785920, + "step": 3630 + }, + { + "epoch": 0.4797413224231226, + "grad_norm": 0.041064053773880005, + "learning_rate": 1.2433223261811337e-06, + "loss": 0.1104, + "num_input_tokens_seen": 1788416, + "step": 3635 + }, + { + "epoch": 0.48040121420087106, + "grad_norm": 0.06536306440830231, + "learning_rate": 1.2410873567134115e-06, + "loss": 0.0317, + "num_input_tokens_seen": 1790848, + "step": 3640 + }, + { + "epoch": 0.4810611059786195, + "grad_norm": 2.3887031078338623, + "learning_rate": 1.238851107945051e-06, + "loss": 0.0394, + "num_input_tokens_seen": 1793280, + "step": 3645 + }, + { + "epoch": 0.48172099775636795, + "grad_norm": 0.03385510668158531, + "learning_rate": 1.2366135917424341e-06, + "loss": 0.1043, + "num_input_tokens_seen": 1795648, + "step": 3650 + }, + { + "epoch": 0.4823808895341164, + "grad_norm": 23.26211929321289, + "learning_rate": 1.2343748199786665e-06, + "loss": 0.183, + "num_input_tokens_seen": 1797952, + "step": 3655 + }, + { + "epoch": 0.48304078131186484, + "grad_norm": 0.2056346982717514, + "learning_rate": 1.2321348045335182e-06, + "loss": 0.0865, + "num_input_tokens_seen": 1800192, + "step": 3660 + }, + { + "epoch": 0.4837006730896133, + "grad_norm": 0.4568115174770355, + "learning_rate": 1.2298935572933575e-06, + "loss": 0.1479, + "num_input_tokens_seen": 1802560, + "step": 3665 + }, + { + "epoch": 0.48436056486736173, + "grad_norm": 23.873966217041016, + "learning_rate": 1.2276510901510892e-06, + "loss": 0.1646, + "num_input_tokens_seen": 1805056, + "step": 3670 + }, + { + "epoch": 0.4850204566451102, + "grad_norm": 2.0380196571350098, + "learning_rate": 1.2254074150060915e-06, + "loss": 0.1443, + "num_input_tokens_seen": 1807744, + "step": 3675 + }, + { + "epoch": 0.4856803484228587, + "grad_norm": 56.635318756103516, + "learning_rate": 1.2231625437641535e-06, + "loss": 0.0999, + "num_input_tokens_seen": 1810368, + "step": 3680 + }, + { + "epoch": 0.4863402402006071, + "grad_norm": 0.2982792258262634, + "learning_rate": 1.2209164883374096e-06, + "loss": 0.0791, + "num_input_tokens_seen": 1813056, + "step": 3685 + }, + { + "epoch": 0.48700013197835557, + "grad_norm": 0.19904585182666779, + "learning_rate": 1.2186692606442793e-06, + "loss": 0.2265, + "num_input_tokens_seen": 1815360, + "step": 3690 + }, + { + "epoch": 0.487660023756104, + "grad_norm": 144.61109924316406, + "learning_rate": 1.216420872609402e-06, + "loss": 0.1958, + "num_input_tokens_seen": 1817920, + "step": 3695 + }, + { + "epoch": 0.48831991553385246, + "grad_norm": 12.121625900268555, + "learning_rate": 1.2141713361635739e-06, + "loss": 0.0936, + "num_input_tokens_seen": 1820288, + "step": 3700 + }, + { + "epoch": 0.4889798073116009, + "grad_norm": 0.04935774579644203, + "learning_rate": 1.2119206632436864e-06, + "loss": 0.157, + "num_input_tokens_seen": 1822656, + "step": 3705 + }, + { + "epoch": 0.48963969908934935, + "grad_norm": 0.5263445973396301, + "learning_rate": 1.209668865792661e-06, + "loss": 0.116, + "num_input_tokens_seen": 1824832, + "step": 3710 + }, + { + "epoch": 0.4902995908670978, + "grad_norm": 35.05288314819336, + "learning_rate": 1.207415955759385e-06, + "loss": 0.0906, + "num_input_tokens_seen": 1827200, + "step": 3715 + }, + { + "epoch": 0.49095948264484623, + "grad_norm": 10.884110450744629, + "learning_rate": 1.2051619450986514e-06, + "loss": 0.1443, + "num_input_tokens_seen": 1829632, + "step": 3720 + }, + { + "epoch": 0.4916193744225947, + "grad_norm": 1.7360846996307373, + "learning_rate": 1.2029068457710923e-06, + "loss": 0.076, + "num_input_tokens_seen": 1832192, + "step": 3725 + }, + { + "epoch": 0.4922792662003431, + "grad_norm": 3.593554973602295, + "learning_rate": 1.200650669743117e-06, + "loss": 0.1089, + "num_input_tokens_seen": 1834752, + "step": 3730 + }, + { + "epoch": 0.49293915797809157, + "grad_norm": 24.667346954345703, + "learning_rate": 1.1983934289868488e-06, + "loss": 0.0533, + "num_input_tokens_seen": 1837248, + "step": 3735 + }, + { + "epoch": 0.49359904975584007, + "grad_norm": 40.43445587158203, + "learning_rate": 1.1961351354800595e-06, + "loss": 0.2063, + "num_input_tokens_seen": 1839680, + "step": 3740 + }, + { + "epoch": 0.4942589415335885, + "grad_norm": 0.25334975123405457, + "learning_rate": 1.193875801206109e-06, + "loss": 0.1478, + "num_input_tokens_seen": 1842304, + "step": 3745 + }, + { + "epoch": 0.49491883331133696, + "grad_norm": 0.46043312549591064, + "learning_rate": 1.1916154381538786e-06, + "loss": 0.0398, + "num_input_tokens_seen": 1844480, + "step": 3750 + }, + { + "epoch": 0.4955787250890854, + "grad_norm": 0.318348228931427, + "learning_rate": 1.1893540583177083e-06, + "loss": 0.1799, + "num_input_tokens_seen": 1846912, + "step": 3755 + }, + { + "epoch": 0.49623861686683385, + "grad_norm": 13.051739692687988, + "learning_rate": 1.187091673697335e-06, + "loss": 0.0861, + "num_input_tokens_seen": 1849024, + "step": 3760 + }, + { + "epoch": 0.4968985086445823, + "grad_norm": 0.8000279068946838, + "learning_rate": 1.184828296297826e-06, + "loss": 0.0693, + "num_input_tokens_seen": 1851712, + "step": 3765 + }, + { + "epoch": 0.49755840042233074, + "grad_norm": 26.590360641479492, + "learning_rate": 1.182563938129518e-06, + "loss": 0.074, + "num_input_tokens_seen": 1854208, + "step": 3770 + }, + { + "epoch": 0.4982182922000792, + "grad_norm": 0.07655533403158188, + "learning_rate": 1.1802986112079507e-06, + "loss": 0.0972, + "num_input_tokens_seen": 1856704, + "step": 3775 + }, + { + "epoch": 0.4988781839778276, + "grad_norm": 2.7111520767211914, + "learning_rate": 1.1780323275538056e-06, + "loss": 0.0812, + "num_input_tokens_seen": 1858944, + "step": 3780 + }, + { + "epoch": 0.49953807575557607, + "grad_norm": 2.1287126541137695, + "learning_rate": 1.1757650991928393e-06, + "loss": 0.2014, + "num_input_tokens_seen": 1861696, + "step": 3785 + }, + { + "epoch": 0.5001979675333246, + "grad_norm": 0.28718459606170654, + "learning_rate": 1.1734969381558235e-06, + "loss": 0.3465, + "num_input_tokens_seen": 1864128, + "step": 3790 + }, + { + "epoch": 0.5001979675333246, + "eval_loss": 0.13253989815711975, + "eval_runtime": 7.6606, + "eval_samples_per_second": 879.171, + "eval_steps_per_second": 109.913, + "num_input_tokens_seen": 1864128, + "step": 3790 + }, + { + "epoch": 0.500857859311073, + "grad_norm": 0.05410047248005867, + "learning_rate": 1.1712278564784774e-06, + "loss": 0.0012, + "num_input_tokens_seen": 1866432, + "step": 3795 + }, + { + "epoch": 0.5015177510888215, + "grad_norm": 50.43254089355469, + "learning_rate": 1.1689578662014064e-06, + "loss": 0.071, + "num_input_tokens_seen": 1868736, + "step": 3800 + }, + { + "epoch": 0.5021776428665699, + "grad_norm": 10.290699005126953, + "learning_rate": 1.1666869793700362e-06, + "loss": 0.2416, + "num_input_tokens_seen": 1871360, + "step": 3805 + }, + { + "epoch": 0.5028375346443184, + "grad_norm": 0.025802727788686752, + "learning_rate": 1.1644152080345515e-06, + "loss": 0.0019, + "num_input_tokens_seen": 1873536, + "step": 3810 + }, + { + "epoch": 0.5034974264220667, + "grad_norm": 32.99125289916992, + "learning_rate": 1.1621425642498289e-06, + "loss": 0.2788, + "num_input_tokens_seen": 1875904, + "step": 3815 + }, + { + "epoch": 0.5041573181998152, + "grad_norm": 88.0829849243164, + "learning_rate": 1.1598690600753759e-06, + "loss": 0.2056, + "num_input_tokens_seen": 1878464, + "step": 3820 + }, + { + "epoch": 0.5048172099775636, + "grad_norm": 69.9671630859375, + "learning_rate": 1.1575947075752644e-06, + "loss": 0.2253, + "num_input_tokens_seen": 1880640, + "step": 3825 + }, + { + "epoch": 0.5054771017553121, + "grad_norm": 16.678607940673828, + "learning_rate": 1.1553195188180691e-06, + "loss": 0.1243, + "num_input_tokens_seen": 1882944, + "step": 3830 + }, + { + "epoch": 0.5061369935330606, + "grad_norm": 0.3082711398601532, + "learning_rate": 1.1530435058768008e-06, + "loss": 0.0629, + "num_input_tokens_seen": 1885248, + "step": 3835 + }, + { + "epoch": 0.506796885310809, + "grad_norm": 16.876184463500977, + "learning_rate": 1.150766680828845e-06, + "loss": 0.0576, + "num_input_tokens_seen": 1887872, + "step": 3840 + }, + { + "epoch": 0.5074567770885575, + "grad_norm": 11.138367652893066, + "learning_rate": 1.1484890557558955e-06, + "loss": 0.004, + "num_input_tokens_seen": 1890560, + "step": 3845 + }, + { + "epoch": 0.5081166688663059, + "grad_norm": 11.504974365234375, + "learning_rate": 1.146210642743892e-06, + "loss": 0.0781, + "num_input_tokens_seen": 1893056, + "step": 3850 + }, + { + "epoch": 0.5087765606440544, + "grad_norm": 0.10916353017091751, + "learning_rate": 1.1439314538829554e-06, + "loss": 0.0498, + "num_input_tokens_seen": 1895360, + "step": 3855 + }, + { + "epoch": 0.5094364524218028, + "grad_norm": 0.09748303145170212, + "learning_rate": 1.141651501267323e-06, + "loss": 0.0617, + "num_input_tokens_seen": 1897664, + "step": 3860 + }, + { + "epoch": 0.5100963441995513, + "grad_norm": 126.38017272949219, + "learning_rate": 1.1393707969952847e-06, + "loss": 0.1711, + "num_input_tokens_seen": 1900288, + "step": 3865 + }, + { + "epoch": 0.5107562359772997, + "grad_norm": 139.21932983398438, + "learning_rate": 1.13708935316912e-06, + "loss": 0.1191, + "num_input_tokens_seen": 1903040, + "step": 3870 + }, + { + "epoch": 0.5114161277550482, + "grad_norm": 2.1678948402404785, + "learning_rate": 1.134807181895032e-06, + "loss": 0.0025, + "num_input_tokens_seen": 1905472, + "step": 3875 + }, + { + "epoch": 0.5120760195327966, + "grad_norm": 75.74095916748047, + "learning_rate": 1.132524295283084e-06, + "loss": 0.1253, + "num_input_tokens_seen": 1907712, + "step": 3880 + }, + { + "epoch": 0.5127359113105451, + "grad_norm": 0.061001695692539215, + "learning_rate": 1.1302407054471355e-06, + "loss": 0.0096, + "num_input_tokens_seen": 1910080, + "step": 3885 + }, + { + "epoch": 0.5133958030882935, + "grad_norm": 64.87725067138672, + "learning_rate": 1.1279564245047767e-06, + "loss": 0.2717, + "num_input_tokens_seen": 1912512, + "step": 3890 + }, + { + "epoch": 0.514055694866042, + "grad_norm": 0.10021132230758667, + "learning_rate": 1.1256714645772662e-06, + "loss": 0.0696, + "num_input_tokens_seen": 1914752, + "step": 3895 + }, + { + "epoch": 0.5147155866437905, + "grad_norm": 0.13533316552639008, + "learning_rate": 1.1233858377894647e-06, + "loss": 0.0073, + "num_input_tokens_seen": 1917120, + "step": 3900 + }, + { + "epoch": 0.5153754784215389, + "grad_norm": 72.85858917236328, + "learning_rate": 1.1210995562697722e-06, + "loss": 0.0094, + "num_input_tokens_seen": 1919232, + "step": 3905 + }, + { + "epoch": 0.5160353701992874, + "grad_norm": 20.26717758178711, + "learning_rate": 1.1188126321500621e-06, + "loss": 0.0061, + "num_input_tokens_seen": 1921856, + "step": 3910 + }, + { + "epoch": 0.5166952619770357, + "grad_norm": 105.2000732421875, + "learning_rate": 1.1165250775656188e-06, + "loss": 0.1091, + "num_input_tokens_seen": 1924224, + "step": 3915 + }, + { + "epoch": 0.5173551537547842, + "grad_norm": 0.042006537318229675, + "learning_rate": 1.1142369046550708e-06, + "loss": 0.0258, + "num_input_tokens_seen": 1926464, + "step": 3920 + }, + { + "epoch": 0.5180150455325326, + "grad_norm": 0.038001008331775665, + "learning_rate": 1.1119481255603289e-06, + "loss": 0.253, + "num_input_tokens_seen": 1928896, + "step": 3925 + }, + { + "epoch": 0.5186749373102811, + "grad_norm": 3.7172634601593018, + "learning_rate": 1.1096587524265197e-06, + "loss": 0.0598, + "num_input_tokens_seen": 1931200, + "step": 3930 + }, + { + "epoch": 0.5193348290880295, + "grad_norm": 0.01534217782318592, + "learning_rate": 1.107368797401923e-06, + "loss": 0.1918, + "num_input_tokens_seen": 1933632, + "step": 3935 + }, + { + "epoch": 0.519994720865778, + "grad_norm": 0.24886855483055115, + "learning_rate": 1.1050782726379054e-06, + "loss": 0.0022, + "num_input_tokens_seen": 1935872, + "step": 3940 + }, + { + "epoch": 0.5206546126435264, + "grad_norm": 0.19433605670928955, + "learning_rate": 1.1027871902888566e-06, + "loss": 0.104, + "num_input_tokens_seen": 1938048, + "step": 3945 + }, + { + "epoch": 0.5213145044212749, + "grad_norm": 46.62074661254883, + "learning_rate": 1.1004955625121257e-06, + "loss": 0.059, + "num_input_tokens_seen": 1940608, + "step": 3950 + }, + { + "epoch": 0.5219743961990233, + "grad_norm": 4.506015777587891, + "learning_rate": 1.0982034014679561e-06, + "loss": 0.2127, + "num_input_tokens_seen": 1943040, + "step": 3955 + }, + { + "epoch": 0.5226342879767718, + "grad_norm": 1.7702916860580444, + "learning_rate": 1.0959107193194206e-06, + "loss": 0.279, + "num_input_tokens_seen": 1945664, + "step": 3960 + }, + { + "epoch": 0.5232941797545203, + "grad_norm": 0.04471131041646004, + "learning_rate": 1.0936175282323575e-06, + "loss": 0.0022, + "num_input_tokens_seen": 1948032, + "step": 3965 + }, + { + "epoch": 0.5239540715322687, + "grad_norm": 96.3348617553711, + "learning_rate": 1.091323840375305e-06, + "loss": 0.0235, + "num_input_tokens_seen": 1950208, + "step": 3970 + }, + { + "epoch": 0.5246139633100172, + "grad_norm": 46.00945281982422, + "learning_rate": 1.0890296679194378e-06, + "loss": 0.2217, + "num_input_tokens_seen": 1952896, + "step": 3975 + }, + { + "epoch": 0.5252738550877656, + "grad_norm": 0.07118234783411026, + "learning_rate": 1.086735023038502e-06, + "loss": 0.0466, + "num_input_tokens_seen": 1955200, + "step": 3980 + }, + { + "epoch": 0.5259337468655141, + "grad_norm": 0.24527551233768463, + "learning_rate": 1.0844399179087512e-06, + "loss": 0.0765, + "num_input_tokens_seen": 1957376, + "step": 3985 + }, + { + "epoch": 0.5265936386432625, + "grad_norm": 0.29095086455345154, + "learning_rate": 1.0821443647088802e-06, + "loss": 0.2646, + "num_input_tokens_seen": 1960064, + "step": 3990 + }, + { + "epoch": 0.527253530421011, + "grad_norm": 0.09518618881702423, + "learning_rate": 1.0798483756199623e-06, + "loss": 0.1166, + "num_input_tokens_seen": 1962624, + "step": 3995 + }, + { + "epoch": 0.5279134221987594, + "grad_norm": 0.0448361411690712, + "learning_rate": 1.0775519628253833e-06, + "loss": 0.0901, + "num_input_tokens_seen": 1965056, + "step": 4000 + }, + { + "epoch": 0.5285733139765079, + "grad_norm": 0.339200496673584, + "learning_rate": 1.0752551385107772e-06, + "loss": 0.1363, + "num_input_tokens_seen": 1967424, + "step": 4005 + }, + { + "epoch": 0.5292332057542563, + "grad_norm": 12.845752716064453, + "learning_rate": 1.0729579148639621e-06, + "loss": 0.1608, + "num_input_tokens_seen": 1969856, + "step": 4010 + }, + { + "epoch": 0.5298930975320048, + "grad_norm": 0.18415102362632751, + "learning_rate": 1.0706603040748747e-06, + "loss": 0.0527, + "num_input_tokens_seen": 1972544, + "step": 4015 + }, + { + "epoch": 0.5305529893097533, + "grad_norm": 0.05650022253394127, + "learning_rate": 1.0683623183355071e-06, + "loss": 0.0851, + "num_input_tokens_seen": 1974912, + "step": 4020 + }, + { + "epoch": 0.5312128810875016, + "grad_norm": 13.724897384643555, + "learning_rate": 1.0660639698398392e-06, + "loss": 0.0918, + "num_input_tokens_seen": 1977216, + "step": 4025 + }, + { + "epoch": 0.5318727728652501, + "grad_norm": 4.000504970550537, + "learning_rate": 1.0637652707837773e-06, + "loss": 0.069, + "num_input_tokens_seen": 1979648, + "step": 4030 + }, + { + "epoch": 0.5325326646429985, + "grad_norm": 63.9135627746582, + "learning_rate": 1.0614662333650876e-06, + "loss": 0.0788, + "num_input_tokens_seen": 1981888, + "step": 4035 + }, + { + "epoch": 0.533192556420747, + "grad_norm": 15.316259384155273, + "learning_rate": 1.0591668697833311e-06, + "loss": 0.199, + "num_input_tokens_seen": 1984448, + "step": 4040 + }, + { + "epoch": 0.5338524481984954, + "grad_norm": 31.211254119873047, + "learning_rate": 1.0568671922398005e-06, + "loss": 0.1948, + "num_input_tokens_seen": 1987072, + "step": 4045 + }, + { + "epoch": 0.5345123399762439, + "grad_norm": 0.47070229053497314, + "learning_rate": 1.054567212937454e-06, + "loss": 0.1732, + "num_input_tokens_seen": 1989632, + "step": 4050 + }, + { + "epoch": 0.5351722317539923, + "grad_norm": 0.44888266921043396, + "learning_rate": 1.0522669440808508e-06, + "loss": 0.0482, + "num_input_tokens_seen": 1992192, + "step": 4055 + }, + { + "epoch": 0.5358321235317408, + "grad_norm": 1.2094718217849731, + "learning_rate": 1.0499663978760871e-06, + "loss": 0.2351, + "num_input_tokens_seen": 1994624, + "step": 4060 + }, + { + "epoch": 0.5364920153094892, + "grad_norm": 9.957518577575684, + "learning_rate": 1.0476655865307308e-06, + "loss": 0.0567, + "num_input_tokens_seen": 1997056, + "step": 4065 + }, + { + "epoch": 0.5371519070872377, + "grad_norm": 0.34155920147895813, + "learning_rate": 1.0453645222537556e-06, + "loss": 0.0665, + "num_input_tokens_seen": 1999360, + "step": 4070 + }, + { + "epoch": 0.5378117988649861, + "grad_norm": 111.1448974609375, + "learning_rate": 1.0430632172554796e-06, + "loss": 0.0719, + "num_input_tokens_seen": 2001856, + "step": 4075 + }, + { + "epoch": 0.5384716906427346, + "grad_norm": 36.95001220703125, + "learning_rate": 1.0407616837474963e-06, + "loss": 0.1029, + "num_input_tokens_seen": 2004288, + "step": 4080 + }, + { + "epoch": 0.5391315824204831, + "grad_norm": 1.12558114528656, + "learning_rate": 1.038459933942612e-06, + "loss": 0.0145, + "num_input_tokens_seen": 2006976, + "step": 4085 + }, + { + "epoch": 0.5397914741982315, + "grad_norm": 11.313764572143555, + "learning_rate": 1.036157980054782e-06, + "loss": 0.0129, + "num_input_tokens_seen": 2009280, + "step": 4090 + }, + { + "epoch": 0.54045136597598, + "grad_norm": 168.36546325683594, + "learning_rate": 1.0338558342990431e-06, + "loss": 0.0985, + "num_input_tokens_seen": 2011776, + "step": 4095 + }, + { + "epoch": 0.5411112577537284, + "grad_norm": 0.4781351089477539, + "learning_rate": 1.0315535088914508e-06, + "loss": 0.2285, + "num_input_tokens_seen": 2014336, + "step": 4100 + }, + { + "epoch": 0.5417711495314769, + "grad_norm": 33.78492736816406, + "learning_rate": 1.0292510160490146e-06, + "loss": 0.1558, + "num_input_tokens_seen": 2017152, + "step": 4105 + }, + { + "epoch": 0.5424310413092253, + "grad_norm": 17.072744369506836, + "learning_rate": 1.0269483679896308e-06, + "loss": 0.1097, + "num_input_tokens_seen": 2019520, + "step": 4110 + }, + { + "epoch": 0.5430909330869738, + "grad_norm": 32.48529052734375, + "learning_rate": 1.0246455769320211e-06, + "loss": 0.164, + "num_input_tokens_seen": 2021632, + "step": 4115 + }, + { + "epoch": 0.5437508248647221, + "grad_norm": 1.9809108972549438, + "learning_rate": 1.0223426550956647e-06, + "loss": 0.1157, + "num_input_tokens_seen": 2023744, + "step": 4120 + }, + { + "epoch": 0.5444107166424706, + "grad_norm": 2.384786367416382, + "learning_rate": 1.0200396147007354e-06, + "loss": 0.06, + "num_input_tokens_seen": 2026048, + "step": 4125 + }, + { + "epoch": 0.545070608420219, + "grad_norm": 0.03345398232340813, + "learning_rate": 1.0177364679680367e-06, + "loss": 0.1203, + "num_input_tokens_seen": 2028352, + "step": 4130 + }, + { + "epoch": 0.5457305001979675, + "grad_norm": 0.08933035284280777, + "learning_rate": 1.015433227118935e-06, + "loss": 0.0494, + "num_input_tokens_seen": 2030848, + "step": 4135 + }, + { + "epoch": 0.5463903919757159, + "grad_norm": 0.14635981619358063, + "learning_rate": 1.0131299043752967e-06, + "loss": 0.1369, + "num_input_tokens_seen": 2033344, + "step": 4140 + }, + { + "epoch": 0.5470502837534644, + "grad_norm": 0.22250190377235413, + "learning_rate": 1.0108265119594233e-06, + "loss": 0.0777, + "num_input_tokens_seen": 2035584, + "step": 4145 + }, + { + "epoch": 0.5477101755312129, + "grad_norm": 13.305469512939453, + "learning_rate": 1.0085230620939853e-06, + "loss": 0.0407, + "num_input_tokens_seen": 2038272, + "step": 4150 + }, + { + "epoch": 0.5483700673089613, + "grad_norm": 11.508169174194336, + "learning_rate": 1.0062195670019583e-06, + "loss": 0.0956, + "num_input_tokens_seen": 2040768, + "step": 4155 + }, + { + "epoch": 0.5490299590867098, + "grad_norm": 114.46903991699219, + "learning_rate": 1.0039160389065582e-06, + "loss": 0.1461, + "num_input_tokens_seen": 2043072, + "step": 4160 + }, + { + "epoch": 0.5496898508644582, + "grad_norm": 9.968348503112793, + "learning_rate": 1.0016124900311755e-06, + "loss": 0.1538, + "num_input_tokens_seen": 2045824, + "step": 4165 + }, + { + "epoch": 0.550217764286657, + "eval_loss": 0.0976191833615303, + "eval_runtime": 7.5976, + "eval_samples_per_second": 886.459, + "eval_steps_per_second": 110.824, + "num_input_tokens_seen": 2047552, + "step": 4169 + }, + { + "epoch": 0.5503497426422067, + "grad_norm": 24.443077087402344, + "learning_rate": 9.99308932599311e-07, + "loss": 0.233, + "num_input_tokens_seen": 2048064, + "step": 4170 + }, + { + "epoch": 0.5510096344199551, + "grad_norm": 0.5319744944572449, + "learning_rate": 9.970053788345112e-07, + "loss": 0.0557, + "num_input_tokens_seen": 2050432, + "step": 4175 + }, + { + "epoch": 0.5516695261977036, + "grad_norm": 0.8921132683753967, + "learning_rate": 9.947018409603036e-07, + "loss": 0.0547, + "num_input_tokens_seen": 2052928, + "step": 4180 + }, + { + "epoch": 0.552329417975452, + "grad_norm": 0.3344038724899292, + "learning_rate": 9.923983312001304e-07, + "loss": 0.0658, + "num_input_tokens_seen": 2055424, + "step": 4185 + }, + { + "epoch": 0.5529893097532005, + "grad_norm": 0.5421162843704224, + "learning_rate": 9.900948617772846e-07, + "loss": 0.1874, + "num_input_tokens_seen": 2057536, + "step": 4190 + }, + { + "epoch": 0.5536492015309489, + "grad_norm": 43.32229995727539, + "learning_rate": 9.877914449148462e-07, + "loss": 0.1518, + "num_input_tokens_seen": 2059840, + "step": 4195 + }, + { + "epoch": 0.5543090933086974, + "grad_norm": 87.34823608398438, + "learning_rate": 9.854880928356157e-07, + "loss": 0.2201, + "num_input_tokens_seen": 2062656, + "step": 4200 + }, + { + "epoch": 0.5549689850864459, + "grad_norm": 0.3885681629180908, + "learning_rate": 9.831848177620493e-07, + "loss": 0.22, + "num_input_tokens_seen": 2064960, + "step": 4205 + }, + { + "epoch": 0.5556288768641943, + "grad_norm": 18.198888778686523, + "learning_rate": 9.808816319161961e-07, + "loss": 0.2685, + "num_input_tokens_seen": 2067008, + "step": 4210 + }, + { + "epoch": 0.5562887686419428, + "grad_norm": 0.18500889837741852, + "learning_rate": 9.785785475196298e-07, + "loss": 0.0021, + "num_input_tokens_seen": 2069696, + "step": 4215 + }, + { + "epoch": 0.5569486604196912, + "grad_norm": 1.4052083492279053, + "learning_rate": 9.76275576793387e-07, + "loss": 0.0054, + "num_input_tokens_seen": 2072320, + "step": 4220 + }, + { + "epoch": 0.5576085521974397, + "grad_norm": 1.9056949615478516, + "learning_rate": 9.739727319579007e-07, + "loss": 0.0023, + "num_input_tokens_seen": 2074752, + "step": 4225 + }, + { + "epoch": 0.558268443975188, + "grad_norm": 1.0958954095840454, + "learning_rate": 9.716700252329361e-07, + "loss": 0.0678, + "num_input_tokens_seen": 2077440, + "step": 4230 + }, + { + "epoch": 0.5589283357529365, + "grad_norm": 20.575729370117188, + "learning_rate": 9.693674688375254e-07, + "loss": 0.2046, + "num_input_tokens_seen": 2080000, + "step": 4235 + }, + { + "epoch": 0.5595882275306849, + "grad_norm": 0.2594149708747864, + "learning_rate": 9.67065074989903e-07, + "loss": 0.1257, + "num_input_tokens_seen": 2082560, + "step": 4240 + }, + { + "epoch": 0.5602481193084334, + "grad_norm": 36.21245193481445, + "learning_rate": 9.647628559074415e-07, + "loss": 0.0827, + "num_input_tokens_seen": 2084864, + "step": 4245 + }, + { + "epoch": 0.5609080110861818, + "grad_norm": 0.03890296071767807, + "learning_rate": 9.62460823806585e-07, + "loss": 0.1167, + "num_input_tokens_seen": 2087424, + "step": 4250 + }, + { + "epoch": 0.5615679028639303, + "grad_norm": 4.345874786376953, + "learning_rate": 9.601589909027857e-07, + "loss": 0.2136, + "num_input_tokens_seen": 2090048, + "step": 4255 + }, + { + "epoch": 0.5622277946416787, + "grad_norm": 0.06426483392715454, + "learning_rate": 9.578573694104394e-07, + "loss": 0.0795, + "num_input_tokens_seen": 2092416, + "step": 4260 + }, + { + "epoch": 0.5628876864194272, + "grad_norm": 5.784552097320557, + "learning_rate": 9.555559715428199e-07, + "loss": 0.0455, + "num_input_tokens_seen": 2094656, + "step": 4265 + }, + { + "epoch": 0.5635475781971757, + "grad_norm": 0.20891836285591125, + "learning_rate": 9.532548095120134e-07, + "loss": 0.0031, + "num_input_tokens_seen": 2097024, + "step": 4270 + }, + { + "epoch": 0.5642074699749241, + "grad_norm": 0.08341825008392334, + "learning_rate": 9.509538955288564e-07, + "loss": 0.0884, + "num_input_tokens_seen": 2099392, + "step": 4275 + }, + { + "epoch": 0.5648673617526726, + "grad_norm": 0.749411940574646, + "learning_rate": 9.486532418028672e-07, + "loss": 0.0815, + "num_input_tokens_seen": 2102016, + "step": 4280 + }, + { + "epoch": 0.565527253530421, + "grad_norm": 25.93520164489746, + "learning_rate": 9.463528605421844e-07, + "loss": 0.117, + "num_input_tokens_seen": 2104320, + "step": 4285 + }, + { + "epoch": 0.5661871453081695, + "grad_norm": 45.35911178588867, + "learning_rate": 9.440527639535004e-07, + "loss": 0.0795, + "num_input_tokens_seen": 2107136, + "step": 4290 + }, + { + "epoch": 0.5668470370859179, + "grad_norm": 0.20163391530513763, + "learning_rate": 9.417529642419971e-07, + "loss": 0.0935, + "num_input_tokens_seen": 2109888, + "step": 4295 + }, + { + "epoch": 0.5675069288636664, + "grad_norm": 24.672039031982422, + "learning_rate": 9.394534736112815e-07, + "loss": 0.1225, + "num_input_tokens_seen": 2112192, + "step": 4300 + }, + { + "epoch": 0.5681668206414148, + "grad_norm": 0.07875992357730865, + "learning_rate": 9.371543042633192e-07, + "loss": 0.1277, + "num_input_tokens_seen": 2114752, + "step": 4305 + }, + { + "epoch": 0.5688267124191633, + "grad_norm": 0.11948826909065247, + "learning_rate": 9.348554683983722e-07, + "loss": 0.1616, + "num_input_tokens_seen": 2117184, + "step": 4310 + }, + { + "epoch": 0.5694866041969117, + "grad_norm": 0.17669005692005157, + "learning_rate": 9.325569782149323e-07, + "loss": 0.0485, + "num_input_tokens_seen": 2119552, + "step": 4315 + }, + { + "epoch": 0.5701464959746602, + "grad_norm": 18.713947296142578, + "learning_rate": 9.302588459096574e-07, + "loss": 0.0897, + "num_input_tokens_seen": 2121920, + "step": 4320 + }, + { + "epoch": 0.5708063877524086, + "grad_norm": 8.844649314880371, + "learning_rate": 9.279610836773064e-07, + "loss": 0.1948, + "num_input_tokens_seen": 2124096, + "step": 4325 + }, + { + "epoch": 0.571466279530157, + "grad_norm": 62.913169860839844, + "learning_rate": 9.256637037106735e-07, + "loss": 0.0979, + "num_input_tokens_seen": 2126528, + "step": 4330 + }, + { + "epoch": 0.5721261713079056, + "grad_norm": 35.835323333740234, + "learning_rate": 9.233667182005259e-07, + "loss": 0.0585, + "num_input_tokens_seen": 2128576, + "step": 4335 + }, + { + "epoch": 0.5727860630856539, + "grad_norm": 236.8058319091797, + "learning_rate": 9.210701393355361e-07, + "loss": 0.1142, + "num_input_tokens_seen": 2130688, + "step": 4340 + }, + { + "epoch": 0.5734459548634024, + "grad_norm": 0.6673513650894165, + "learning_rate": 9.187739793022198e-07, + "loss": 0.1147, + "num_input_tokens_seen": 2133312, + "step": 4345 + }, + { + "epoch": 0.5741058466411508, + "grad_norm": 0.05369502305984497, + "learning_rate": 9.164782502848702e-07, + "loss": 0.0315, + "num_input_tokens_seen": 2135680, + "step": 4350 + }, + { + "epoch": 0.5747657384188993, + "grad_norm": 0.035501688718795776, + "learning_rate": 9.141829644654936e-07, + "loss": 0.2153, + "num_input_tokens_seen": 2138112, + "step": 4355 + }, + { + "epoch": 0.5754256301966477, + "grad_norm": 7.459763526916504, + "learning_rate": 9.118881340237432e-07, + "loss": 0.3872, + "num_input_tokens_seen": 2140352, + "step": 4360 + }, + { + "epoch": 0.5760855219743962, + "grad_norm": 0.08102209866046906, + "learning_rate": 9.095937711368573e-07, + "loss": 0.0637, + "num_input_tokens_seen": 2143040, + "step": 4365 + }, + { + "epoch": 0.5767454137521446, + "grad_norm": 0.06749647855758667, + "learning_rate": 9.072998879795923e-07, + "loss": 0.1285, + "num_input_tokens_seen": 2145280, + "step": 4370 + }, + { + "epoch": 0.5774053055298931, + "grad_norm": 51.86709976196289, + "learning_rate": 9.050064967241596e-07, + "loss": 0.0807, + "num_input_tokens_seen": 2147904, + "step": 4375 + }, + { + "epoch": 0.5780651973076415, + "grad_norm": 0.10375242680311203, + "learning_rate": 9.027136095401598e-07, + "loss": 0.0728, + "num_input_tokens_seen": 2150400, + "step": 4380 + }, + { + "epoch": 0.57872508908539, + "grad_norm": 0.2877858281135559, + "learning_rate": 9.004212385945187e-07, + "loss": 0.1274, + "num_input_tokens_seen": 2153088, + "step": 4385 + }, + { + "epoch": 0.5793849808631385, + "grad_norm": 0.05926657095551491, + "learning_rate": 8.981293960514233e-07, + "loss": 0.0495, + "num_input_tokens_seen": 2155776, + "step": 4390 + }, + { + "epoch": 0.5800448726408869, + "grad_norm": 1.292005181312561, + "learning_rate": 8.958380940722564e-07, + "loss": 0.1366, + "num_input_tokens_seen": 2158400, + "step": 4395 + }, + { + "epoch": 0.5807047644186354, + "grad_norm": 0.3705070912837982, + "learning_rate": 8.935473448155326e-07, + "loss": 0.0731, + "num_input_tokens_seen": 2160704, + "step": 4400 + }, + { + "epoch": 0.5813646561963838, + "grad_norm": 26.712739944458008, + "learning_rate": 8.912571604368324e-07, + "loss": 0.0423, + "num_input_tokens_seen": 2163200, + "step": 4405 + }, + { + "epoch": 0.5820245479741323, + "grad_norm": 68.38367462158203, + "learning_rate": 8.889675530887404e-07, + "loss": 0.1252, + "num_input_tokens_seen": 2165376, + "step": 4410 + }, + { + "epoch": 0.5826844397518807, + "grad_norm": 0.06487785279750824, + "learning_rate": 8.866785349207786e-07, + "loss": 0.131, + "num_input_tokens_seen": 2167808, + "step": 4415 + }, + { + "epoch": 0.5833443315296292, + "grad_norm": 15.265974044799805, + "learning_rate": 8.843901180793423e-07, + "loss": 0.1223, + "num_input_tokens_seen": 2170112, + "step": 4420 + }, + { + "epoch": 0.5840042233073776, + "grad_norm": 1.6116943359375, + "learning_rate": 8.821023147076362e-07, + "loss": 0.001, + "num_input_tokens_seen": 2172480, + "step": 4425 + }, + { + "epoch": 0.5846641150851261, + "grad_norm": 4.275770664215088, + "learning_rate": 8.798151369456098e-07, + "loss": 0.0822, + "num_input_tokens_seen": 2175104, + "step": 4430 + }, + { + "epoch": 0.5853240068628744, + "grad_norm": 12.192449569702148, + "learning_rate": 8.775285969298931e-07, + "loss": 0.0803, + "num_input_tokens_seen": 2177280, + "step": 4435 + }, + { + "epoch": 0.585983898640623, + "grad_norm": 0.0718933716416359, + "learning_rate": 8.752427067937312e-07, + "loss": 0.0628, + "num_input_tokens_seen": 2179776, + "step": 4440 + }, + { + "epoch": 0.5866437904183713, + "grad_norm": 0.020002318546175957, + "learning_rate": 8.729574786669214e-07, + "loss": 0.0845, + "num_input_tokens_seen": 2182400, + "step": 4445 + }, + { + "epoch": 0.5873036821961198, + "grad_norm": 0.39394357800483704, + "learning_rate": 8.706729246757477e-07, + "loss": 0.06, + "num_input_tokens_seen": 2185088, + "step": 4450 + }, + { + "epoch": 0.5879635739738683, + "grad_norm": 1.8858518600463867, + "learning_rate": 8.683890569429173e-07, + "loss": 0.0725, + "num_input_tokens_seen": 2187776, + "step": 4455 + }, + { + "epoch": 0.5886234657516167, + "grad_norm": 0.07854912430047989, + "learning_rate": 8.661058875874956e-07, + "loss": 0.0027, + "num_input_tokens_seen": 2190016, + "step": 4460 + }, + { + "epoch": 0.5892833575293652, + "grad_norm": 0.09435324370861053, + "learning_rate": 8.638234287248423e-07, + "loss": 0.0013, + "num_input_tokens_seen": 2192320, + "step": 4465 + }, + { + "epoch": 0.5899432493071136, + "grad_norm": 44.07099533081055, + "learning_rate": 8.615416924665464e-07, + "loss": 0.0578, + "num_input_tokens_seen": 2194752, + "step": 4470 + }, + { + "epoch": 0.5906031410848621, + "grad_norm": 0.29922375082969666, + "learning_rate": 8.592606909203629e-07, + "loss": 0.0962, + "num_input_tokens_seen": 2197056, + "step": 4475 + }, + { + "epoch": 0.5912630328626105, + "grad_norm": 0.052084218710660934, + "learning_rate": 8.569804361901485e-07, + "loss": 0.0401, + "num_input_tokens_seen": 2199296, + "step": 4480 + }, + { + "epoch": 0.591922924640359, + "grad_norm": 59.697113037109375, + "learning_rate": 8.547009403757963e-07, + "loss": 0.4233, + "num_input_tokens_seen": 2201664, + "step": 4485 + }, + { + "epoch": 0.5925828164181074, + "grad_norm": 16.623720169067383, + "learning_rate": 8.524222155731731e-07, + "loss": 0.1601, + "num_input_tokens_seen": 2204288, + "step": 4490 + }, + { + "epoch": 0.5932427081958559, + "grad_norm": 82.14921569824219, + "learning_rate": 8.501442738740538e-07, + "loss": 0.1259, + "num_input_tokens_seen": 2206528, + "step": 4495 + }, + { + "epoch": 0.5939025999736043, + "grad_norm": 0.7616731524467468, + "learning_rate": 8.47867127366058e-07, + "loss": 0.0636, + "num_input_tokens_seen": 2209024, + "step": 4500 + }, + { + "epoch": 0.5945624917513528, + "grad_norm": 0.1041426807641983, + "learning_rate": 8.455907881325858e-07, + "loss": 0.0027, + "num_input_tokens_seen": 2211584, + "step": 4505 + }, + { + "epoch": 0.5952223835291012, + "grad_norm": 1.8390711545944214, + "learning_rate": 8.433152682527533e-07, + "loss": 0.1052, + "num_input_tokens_seen": 2213952, + "step": 4510 + }, + { + "epoch": 0.5958822753068497, + "grad_norm": 0.08113599568605423, + "learning_rate": 8.410405798013298e-07, + "loss": 0.0747, + "num_input_tokens_seen": 2216192, + "step": 4515 + }, + { + "epoch": 0.5965421670845982, + "grad_norm": 16.143348693847656, + "learning_rate": 8.387667348486712e-07, + "loss": 0.0035, + "num_input_tokens_seen": 2218688, + "step": 4520 + }, + { + "epoch": 0.5972020588623466, + "grad_norm": 135.14500427246094, + "learning_rate": 8.364937454606585e-07, + "loss": 0.1296, + "num_input_tokens_seen": 2220928, + "step": 4525 + }, + { + "epoch": 0.5978619506400951, + "grad_norm": 12.444659233093262, + "learning_rate": 8.342216236986329e-07, + "loss": 0.0014, + "num_input_tokens_seen": 2223360, + "step": 4530 + }, + { + "epoch": 0.5985218424178435, + "grad_norm": 0.052838534116744995, + "learning_rate": 8.319503816193305e-07, + "loss": 0.1463, + "num_input_tokens_seen": 2225792, + "step": 4535 + }, + { + "epoch": 0.599181734195592, + "grad_norm": 29.65154457092285, + "learning_rate": 8.296800312748206e-07, + "loss": 0.1496, + "num_input_tokens_seen": 2228288, + "step": 4540 + }, + { + "epoch": 0.5998416259733403, + "grad_norm": 1.4917051792144775, + "learning_rate": 8.274105847124404e-07, + "loss": 0.1911, + "num_input_tokens_seen": 2230848, + "step": 4545 + }, + { + "epoch": 0.6002375610399895, + "eval_loss": 0.11496574431657791, + "eval_runtime": 7.6571, + "eval_samples_per_second": 879.582, + "eval_steps_per_second": 109.964, + "num_input_tokens_seen": 2232448, + "step": 4548 + }, + { + "epoch": 0.6005015177510888, + "grad_norm": 35.08987808227539, + "learning_rate": 8.251420539747311e-07, + "loss": 0.1187, + "num_input_tokens_seen": 2233472, + "step": 4550 + }, + { + "epoch": 0.6011614095288372, + "grad_norm": 0.22071610391139984, + "learning_rate": 8.228744510993742e-07, + "loss": 0.1799, + "num_input_tokens_seen": 2236096, + "step": 4555 + }, + { + "epoch": 0.6018213013065857, + "grad_norm": 0.21558649837970734, + "learning_rate": 8.206077881191274e-07, + "loss": 0.0908, + "num_input_tokens_seen": 2238720, + "step": 4560 + }, + { + "epoch": 0.6024811930843341, + "grad_norm": 24.909807205200195, + "learning_rate": 8.183420770617614e-07, + "loss": 0.1394, + "num_input_tokens_seen": 2241216, + "step": 4565 + }, + { + "epoch": 0.6031410848620826, + "grad_norm": 2.2823469638824463, + "learning_rate": 8.160773299499955e-07, + "loss": 0.0631, + "num_input_tokens_seen": 2243648, + "step": 4570 + }, + { + "epoch": 0.6038009766398311, + "grad_norm": 1.838703989982605, + "learning_rate": 8.138135588014339e-07, + "loss": 0.0464, + "num_input_tokens_seen": 2246080, + "step": 4575 + }, + { + "epoch": 0.6044608684175795, + "grad_norm": 22.0809268951416, + "learning_rate": 8.115507756285017e-07, + "loss": 0.0632, + "num_input_tokens_seen": 2248256, + "step": 4580 + }, + { + "epoch": 0.605120760195328, + "grad_norm": 0.09841513633728027, + "learning_rate": 8.092889924383819e-07, + "loss": 0.1037, + "num_input_tokens_seen": 2250688, + "step": 4585 + }, + { + "epoch": 0.6057806519730764, + "grad_norm": 3.10756516456604, + "learning_rate": 8.070282212329508e-07, + "loss": 0.0775, + "num_input_tokens_seen": 2253120, + "step": 4590 + }, + { + "epoch": 0.6064405437508249, + "grad_norm": 170.31297302246094, + "learning_rate": 8.047684740087156e-07, + "loss": 0.22, + "num_input_tokens_seen": 2255360, + "step": 4595 + }, + { + "epoch": 0.6071004355285733, + "grad_norm": 46.11749267578125, + "learning_rate": 8.025097627567481e-07, + "loss": 0.1834, + "num_input_tokens_seen": 2257728, + "step": 4600 + }, + { + "epoch": 0.6077603273063218, + "grad_norm": 0.045084141194820404, + "learning_rate": 8.002520994626247e-07, + "loss": 0.0712, + "num_input_tokens_seen": 2260224, + "step": 4605 + }, + { + "epoch": 0.6084202190840702, + "grad_norm": 0.1346772313117981, + "learning_rate": 7.979954961063596e-07, + "loss": 0.0733, + "num_input_tokens_seen": 2262912, + "step": 4610 + }, + { + "epoch": 0.6090801108618187, + "grad_norm": 18.890954971313477, + "learning_rate": 7.957399646623436e-07, + "loss": 0.3433, + "num_input_tokens_seen": 2265152, + "step": 4615 + }, + { + "epoch": 0.6097400026395671, + "grad_norm": 0.26090413331985474, + "learning_rate": 7.934855170992788e-07, + "loss": 0.042, + "num_input_tokens_seen": 2267968, + "step": 4620 + }, + { + "epoch": 0.6103998944173156, + "grad_norm": 0.09057987481355667, + "learning_rate": 7.912321653801161e-07, + "loss": 0.0468, + "num_input_tokens_seen": 2270336, + "step": 4625 + }, + { + "epoch": 0.611059786195064, + "grad_norm": 19.550853729248047, + "learning_rate": 7.889799214619919e-07, + "loss": 0.1865, + "num_input_tokens_seen": 2273024, + "step": 4630 + }, + { + "epoch": 0.6117196779728125, + "grad_norm": 0.048422493040561676, + "learning_rate": 7.867287972961629e-07, + "loss": 0.0821, + "num_input_tokens_seen": 2275264, + "step": 4635 + }, + { + "epoch": 0.612379569750561, + "grad_norm": 0.2724073529243469, + "learning_rate": 7.844788048279453e-07, + "loss": 0.0704, + "num_input_tokens_seen": 2277888, + "step": 4640 + }, + { + "epoch": 0.6130394615283093, + "grad_norm": 0.041433185338974, + "learning_rate": 7.822299559966494e-07, + "loss": 0.0007, + "num_input_tokens_seen": 2280320, + "step": 4645 + }, + { + "epoch": 0.6136993533060578, + "grad_norm": 0.03420973941683769, + "learning_rate": 7.799822627355171e-07, + "loss": 0.0591, + "num_input_tokens_seen": 2282560, + "step": 4650 + }, + { + "epoch": 0.6143592450838062, + "grad_norm": 0.13017447292804718, + "learning_rate": 7.77735736971659e-07, + "loss": 0.0842, + "num_input_tokens_seen": 2284864, + "step": 4655 + }, + { + "epoch": 0.6150191368615547, + "grad_norm": 0.07092246413230896, + "learning_rate": 7.754903906259889e-07, + "loss": 0.1524, + "num_input_tokens_seen": 2287168, + "step": 4660 + }, + { + "epoch": 0.6156790286393031, + "grad_norm": 166.20501708984375, + "learning_rate": 7.732462356131637e-07, + "loss": 0.059, + "num_input_tokens_seen": 2289600, + "step": 4665 + }, + { + "epoch": 0.6163389204170516, + "grad_norm": 0.29940545558929443, + "learning_rate": 7.710032838415179e-07, + "loss": 0.0896, + "num_input_tokens_seen": 2292160, + "step": 4670 + }, + { + "epoch": 0.6169988121948, + "grad_norm": 0.06602998822927475, + "learning_rate": 7.687615472130016e-07, + "loss": 0.155, + "num_input_tokens_seen": 2294912, + "step": 4675 + }, + { + "epoch": 0.6176587039725485, + "grad_norm": 0.13479486107826233, + "learning_rate": 7.665210376231165e-07, + "loss": 0.1138, + "num_input_tokens_seen": 2297024, + "step": 4680 + }, + { + "epoch": 0.6183185957502969, + "grad_norm": 7.841771125793457, + "learning_rate": 7.642817669608536e-07, + "loss": 0.1342, + "num_input_tokens_seen": 2299456, + "step": 4685 + }, + { + "epoch": 0.6189784875280454, + "grad_norm": 0.10149969160556793, + "learning_rate": 7.62043747108629e-07, + "loss": 0.0194, + "num_input_tokens_seen": 2301568, + "step": 4690 + }, + { + "epoch": 0.6196383793057938, + "grad_norm": 122.03047943115234, + "learning_rate": 7.598069899422221e-07, + "loss": 0.1988, + "num_input_tokens_seen": 2303936, + "step": 4695 + }, + { + "epoch": 0.6202982710835423, + "grad_norm": 8.390487670898438, + "learning_rate": 7.575715073307119e-07, + "loss": 0.2107, + "num_input_tokens_seen": 2306176, + "step": 4700 + }, + { + "epoch": 0.6209581628612908, + "grad_norm": 0.41683492064476013, + "learning_rate": 7.55337311136414e-07, + "loss": 0.0995, + "num_input_tokens_seen": 2308736, + "step": 4705 + }, + { + "epoch": 0.6216180546390392, + "grad_norm": 14.42542839050293, + "learning_rate": 7.531044132148183e-07, + "loss": 0.1775, + "num_input_tokens_seen": 2311104, + "step": 4710 + }, + { + "epoch": 0.6222779464167877, + "grad_norm": 16.42903709411621, + "learning_rate": 7.508728254145245e-07, + "loss": 0.0493, + "num_input_tokens_seen": 2313536, + "step": 4715 + }, + { + "epoch": 0.6229378381945361, + "grad_norm": 26.883657455444336, + "learning_rate": 7.486425595771817e-07, + "loss": 0.117, + "num_input_tokens_seen": 2316032, + "step": 4720 + }, + { + "epoch": 0.6235977299722846, + "grad_norm": 8.713482856750488, + "learning_rate": 7.464136275374223e-07, + "loss": 0.1853, + "num_input_tokens_seen": 2318656, + "step": 4725 + }, + { + "epoch": 0.624257621750033, + "grad_norm": 10.561690330505371, + "learning_rate": 7.441860411228029e-07, + "loss": 0.1311, + "num_input_tokens_seen": 2321216, + "step": 4730 + }, + { + "epoch": 0.6249175135277815, + "grad_norm": 39.474449157714844, + "learning_rate": 7.419598121537387e-07, + "loss": 0.1273, + "num_input_tokens_seen": 2323648, + "step": 4735 + }, + { + "epoch": 0.6255774053055299, + "grad_norm": 18.26643943786621, + "learning_rate": 7.397349524434424e-07, + "loss": 0.1446, + "num_input_tokens_seen": 2326080, + "step": 4740 + }, + { + "epoch": 0.6262372970832784, + "grad_norm": 8.37359619140625, + "learning_rate": 7.375114737978605e-07, + "loss": 0.0544, + "num_input_tokens_seen": 2328512, + "step": 4745 + }, + { + "epoch": 0.6268971888610267, + "grad_norm": 4.634432315826416, + "learning_rate": 7.352893880156106e-07, + "loss": 0.1048, + "num_input_tokens_seen": 2331008, + "step": 4750 + }, + { + "epoch": 0.6275570806387752, + "grad_norm": 1.4395649433135986, + "learning_rate": 7.330687068879202e-07, + "loss": 0.0516, + "num_input_tokens_seen": 2333376, + "step": 4755 + }, + { + "epoch": 0.6282169724165237, + "grad_norm": 135.29498291015625, + "learning_rate": 7.308494421985626e-07, + "loss": 0.1411, + "num_input_tokens_seen": 2335872, + "step": 4760 + }, + { + "epoch": 0.6288768641942721, + "grad_norm": 0.25262773036956787, + "learning_rate": 7.286316057237951e-07, + "loss": 0.0029, + "num_input_tokens_seen": 2338432, + "step": 4765 + }, + { + "epoch": 0.6295367559720206, + "grad_norm": 20.409406661987305, + "learning_rate": 7.264152092322963e-07, + "loss": 0.1567, + "num_input_tokens_seen": 2340928, + "step": 4770 + }, + { + "epoch": 0.630196647749769, + "grad_norm": 0.3037130832672119, + "learning_rate": 7.242002644851035e-07, + "loss": 0.0441, + "num_input_tokens_seen": 2343680, + "step": 4775 + }, + { + "epoch": 0.6308565395275175, + "grad_norm": 0.19966571033000946, + "learning_rate": 7.219867832355508e-07, + "loss": 0.0673, + "num_input_tokens_seen": 2346240, + "step": 4780 + }, + { + "epoch": 0.6315164313052659, + "grad_norm": 0.16028675436973572, + "learning_rate": 7.197747772292071e-07, + "loss": 0.0718, + "num_input_tokens_seen": 2348544, + "step": 4785 + }, + { + "epoch": 0.6321763230830144, + "grad_norm": 0.05191419646143913, + "learning_rate": 7.17564258203811e-07, + "loss": 0.2532, + "num_input_tokens_seen": 2350976, + "step": 4790 + }, + { + "epoch": 0.6328362148607628, + "grad_norm": 21.26822280883789, + "learning_rate": 7.153552378892128e-07, + "loss": 0.1214, + "num_input_tokens_seen": 2353216, + "step": 4795 + }, + { + "epoch": 0.6334961066385113, + "grad_norm": 0.49603065848350525, + "learning_rate": 7.131477280073091e-07, + "loss": 0.1191, + "num_input_tokens_seen": 2355584, + "step": 4800 + }, + { + "epoch": 0.6341559984162597, + "grad_norm": 0.12939685583114624, + "learning_rate": 7.109417402719813e-07, + "loss": 0.1127, + "num_input_tokens_seen": 2358144, + "step": 4805 + }, + { + "epoch": 0.6348158901940082, + "grad_norm": 14.447181701660156, + "learning_rate": 7.087372863890346e-07, + "loss": 0.0543, + "num_input_tokens_seen": 2360896, + "step": 4810 + }, + { + "epoch": 0.6354757819717566, + "grad_norm": 25.439424514770508, + "learning_rate": 7.065343780561344e-07, + "loss": 0.2546, + "num_input_tokens_seen": 2363264, + "step": 4815 + }, + { + "epoch": 0.6361356737495051, + "grad_norm": 10.288759231567383, + "learning_rate": 7.043330269627448e-07, + "loss": 0.0676, + "num_input_tokens_seen": 2365632, + "step": 4820 + }, + { + "epoch": 0.6367955655272536, + "grad_norm": 0.07639932632446289, + "learning_rate": 7.021332447900671e-07, + "loss": 0.0018, + "num_input_tokens_seen": 2368000, + "step": 4825 + }, + { + "epoch": 0.637455457305002, + "grad_norm": 63.019187927246094, + "learning_rate": 6.999350432109766e-07, + "loss": 0.1462, + "num_input_tokens_seen": 2370560, + "step": 4830 + }, + { + "epoch": 0.6381153490827505, + "grad_norm": 0.08439631760120392, + "learning_rate": 6.977384338899617e-07, + "loss": 0.001, + "num_input_tokens_seen": 2373120, + "step": 4835 + }, + { + "epoch": 0.6387752408604989, + "grad_norm": 0.06181376054883003, + "learning_rate": 6.955434284830619e-07, + "loss": 0.0052, + "num_input_tokens_seen": 2375872, + "step": 4840 + }, + { + "epoch": 0.6394351326382474, + "grad_norm": 0.05570792779326439, + "learning_rate": 6.933500386378056e-07, + "loss": 0.2037, + "num_input_tokens_seen": 2378432, + "step": 4845 + }, + { + "epoch": 0.6400950244159958, + "grad_norm": 50.27269744873047, + "learning_rate": 6.911582759931482e-07, + "loss": 0.1581, + "num_input_tokens_seen": 2380800, + "step": 4850 + }, + { + "epoch": 0.6407549161937443, + "grad_norm": 0.039350103586912155, + "learning_rate": 6.889681521794109e-07, + "loss": 0.2158, + "num_input_tokens_seen": 2383744, + "step": 4855 + }, + { + "epoch": 0.6414148079714926, + "grad_norm": 11.155346870422363, + "learning_rate": 6.867796788182181e-07, + "loss": 0.0894, + "num_input_tokens_seen": 2386112, + "step": 4860 + }, + { + "epoch": 0.6420746997492411, + "grad_norm": 8.643911361694336, + "learning_rate": 6.845928675224366e-07, + "loss": 0.1499, + "num_input_tokens_seen": 2388736, + "step": 4865 + }, + { + "epoch": 0.6427345915269895, + "grad_norm": 0.24349497258663177, + "learning_rate": 6.82407729896114e-07, + "loss": 0.0662, + "num_input_tokens_seen": 2391104, + "step": 4870 + }, + { + "epoch": 0.643394483304738, + "grad_norm": 15.820056915283203, + "learning_rate": 6.802242775344163e-07, + "loss": 0.0747, + "num_input_tokens_seen": 2393728, + "step": 4875 + }, + { + "epoch": 0.6440543750824864, + "grad_norm": 0.12001825124025345, + "learning_rate": 6.780425220235674e-07, + "loss": 0.1309, + "num_input_tokens_seen": 2396480, + "step": 4880 + }, + { + "epoch": 0.6447142668602349, + "grad_norm": 0.08038333803415298, + "learning_rate": 6.758624749407859e-07, + "loss": 0.008, + "num_input_tokens_seen": 2399104, + "step": 4885 + }, + { + "epoch": 0.6453741586379834, + "grad_norm": 15.686113357543945, + "learning_rate": 6.736841478542264e-07, + "loss": 0.0813, + "num_input_tokens_seen": 2401664, + "step": 4890 + }, + { + "epoch": 0.6460340504157318, + "grad_norm": 0.3630061447620392, + "learning_rate": 6.715075523229151e-07, + "loss": 0.0084, + "num_input_tokens_seen": 2404160, + "step": 4895 + }, + { + "epoch": 0.6466939421934803, + "grad_norm": 29.911376953125, + "learning_rate": 6.693326998966909e-07, + "loss": 0.129, + "num_input_tokens_seen": 2406592, + "step": 4900 + }, + { + "epoch": 0.6473538339712287, + "grad_norm": 0.05508751794695854, + "learning_rate": 6.671596021161431e-07, + "loss": 0.0684, + "num_input_tokens_seen": 2409088, + "step": 4905 + }, + { + "epoch": 0.6480137257489772, + "grad_norm": 0.06392798572778702, + "learning_rate": 6.649882705125494e-07, + "loss": 0.0965, + "num_input_tokens_seen": 2411584, + "step": 4910 + }, + { + "epoch": 0.6486736175267256, + "grad_norm": 0.36957481503486633, + "learning_rate": 6.628187166078163e-07, + "loss": 0.4483, + "num_input_tokens_seen": 2414400, + "step": 4915 + }, + { + "epoch": 0.6493335093044741, + "grad_norm": 18.36041259765625, + "learning_rate": 6.606509519144166e-07, + "loss": 0.0583, + "num_input_tokens_seen": 2416640, + "step": 4920 + }, + { + "epoch": 0.6499934010822225, + "grad_norm": 61.96574783325195, + "learning_rate": 6.584849879353289e-07, + "loss": 0.1499, + "num_input_tokens_seen": 2419136, + "step": 4925 + }, + { + "epoch": 0.6502573577933219, + "eval_loss": 0.09844312816858292, + "eval_runtime": 7.5167, + "eval_samples_per_second": 896.002, + "eval_steps_per_second": 112.017, + "num_input_tokens_seen": 2420096, + "step": 4927 + }, + { + "epoch": 0.650653292859971, + "grad_norm": 0.3677075207233429, + "learning_rate": 6.563208361639772e-07, + "loss": 0.0307, + "num_input_tokens_seen": 2421440, + "step": 4930 + }, + { + "epoch": 0.6513131846377194, + "grad_norm": 0.08293258398771286, + "learning_rate": 6.541585080841687e-07, + "loss": 0.0015, + "num_input_tokens_seen": 2424000, + "step": 4935 + }, + { + "epoch": 0.6519730764154679, + "grad_norm": 88.83380126953125, + "learning_rate": 6.519980151700332e-07, + "loss": 0.0999, + "num_input_tokens_seen": 2426240, + "step": 4940 + }, + { + "epoch": 0.6526329681932164, + "grad_norm": 0.1625138372182846, + "learning_rate": 6.498393688859629e-07, + "loss": 0.0789, + "num_input_tokens_seen": 2428864, + "step": 4945 + }, + { + "epoch": 0.6532928599709648, + "grad_norm": 0.05900685489177704, + "learning_rate": 6.47682580686551e-07, + "loss": 0.0011, + "num_input_tokens_seen": 2431296, + "step": 4950 + }, + { + "epoch": 0.6539527517487133, + "grad_norm": 0.054225701838731766, + "learning_rate": 6.455276620165307e-07, + "loss": 0.002, + "num_input_tokens_seen": 2433984, + "step": 4955 + }, + { + "epoch": 0.6546126435264616, + "grad_norm": 0.02803809382021427, + "learning_rate": 6.433746243107152e-07, + "loss": 0.4195, + "num_input_tokens_seen": 2436224, + "step": 4960 + }, + { + "epoch": 0.6552725353042101, + "grad_norm": 0.09517721086740494, + "learning_rate": 6.412234789939359e-07, + "loss": 0.229, + "num_input_tokens_seen": 2438720, + "step": 4965 + }, + { + "epoch": 0.6559324270819585, + "grad_norm": 0.13722281157970428, + "learning_rate": 6.390742374809832e-07, + "loss": 0.0818, + "num_input_tokens_seen": 2440960, + "step": 4970 + }, + { + "epoch": 0.656592318859707, + "grad_norm": 0.6646612286567688, + "learning_rate": 6.369269111765454e-07, + "loss": 0.0417, + "num_input_tokens_seen": 2443328, + "step": 4975 + }, + { + "epoch": 0.6572522106374554, + "grad_norm": 0.05688225477933884, + "learning_rate": 6.347815114751465e-07, + "loss": 0.1413, + "num_input_tokens_seen": 2445952, + "step": 4980 + }, + { + "epoch": 0.6579121024152039, + "grad_norm": 0.027482135221362114, + "learning_rate": 6.326380497610886e-07, + "loss": 0.1102, + "num_input_tokens_seen": 2448576, + "step": 4985 + }, + { + "epoch": 0.6585719941929523, + "grad_norm": 52.316715240478516, + "learning_rate": 6.304965374083899e-07, + "loss": 0.323, + "num_input_tokens_seen": 2451136, + "step": 4990 + }, + { + "epoch": 0.6592318859707008, + "grad_norm": 0.18591034412384033, + "learning_rate": 6.283569857807245e-07, + "loss": 0.0022, + "num_input_tokens_seen": 2453632, + "step": 4995 + }, + { + "epoch": 0.6598917777484492, + "grad_norm": 0.1707799881696701, + "learning_rate": 6.262194062313615e-07, + "loss": 0.0082, + "num_input_tokens_seen": 2456192, + "step": 5000 + }, + { + "epoch": 0.6605516695261977, + "grad_norm": 0.05098792165517807, + "learning_rate": 6.240838101031063e-07, + "loss": 0.0012, + "num_input_tokens_seen": 2458624, + "step": 5005 + }, + { + "epoch": 0.6612115613039462, + "grad_norm": 0.10480757057666779, + "learning_rate": 6.21950208728239e-07, + "loss": 0.134, + "num_input_tokens_seen": 2460928, + "step": 5010 + }, + { + "epoch": 0.6618714530816946, + "grad_norm": 0.10895920544862747, + "learning_rate": 6.198186134284554e-07, + "loss": 0.1085, + "num_input_tokens_seen": 2463552, + "step": 5015 + }, + { + "epoch": 0.6625313448594431, + "grad_norm": 25.51168441772461, + "learning_rate": 6.176890355148049e-07, + "loss": 0.0561, + "num_input_tokens_seen": 2465856, + "step": 5020 + }, + { + "epoch": 0.6631912366371915, + "grad_norm": 3.873609781265259, + "learning_rate": 6.155614862876335e-07, + "loss": 0.0902, + "num_input_tokens_seen": 2468288, + "step": 5025 + }, + { + "epoch": 0.66385112841494, + "grad_norm": 75.29798889160156, + "learning_rate": 6.134359770365214e-07, + "loss": 0.1482, + "num_input_tokens_seen": 2470912, + "step": 5030 + }, + { + "epoch": 0.6645110201926884, + "grad_norm": 0.2568621039390564, + "learning_rate": 6.11312519040224e-07, + "loss": 0.109, + "num_input_tokens_seen": 2473536, + "step": 5035 + }, + { + "epoch": 0.6651709119704369, + "grad_norm": 0.05576321855187416, + "learning_rate": 6.091911235666125e-07, + "loss": 0.0013, + "num_input_tokens_seen": 2476032, + "step": 5040 + }, + { + "epoch": 0.6658308037481853, + "grad_norm": 0.13206513226032257, + "learning_rate": 6.070718018726124e-07, + "loss": 0.1091, + "num_input_tokens_seen": 2478208, + "step": 5045 + }, + { + "epoch": 0.6664906955259338, + "grad_norm": 0.10654900968074799, + "learning_rate": 6.049545652041459e-07, + "loss": 0.1482, + "num_input_tokens_seen": 2480512, + "step": 5050 + }, + { + "epoch": 0.6671505873036822, + "grad_norm": 0.07339984178543091, + "learning_rate": 6.028394247960709e-07, + "loss": 0.1775, + "num_input_tokens_seen": 2483008, + "step": 5055 + }, + { + "epoch": 0.6678104790814307, + "grad_norm": 0.04593325033783913, + "learning_rate": 6.007263918721221e-07, + "loss": 0.1572, + "num_input_tokens_seen": 2485376, + "step": 5060 + }, + { + "epoch": 0.668470370859179, + "grad_norm": 0.19269201159477234, + "learning_rate": 5.986154776448507e-07, + "loss": 0.0559, + "num_input_tokens_seen": 2488064, + "step": 5065 + }, + { + "epoch": 0.6691302626369275, + "grad_norm": 13.757147789001465, + "learning_rate": 5.965066933155656e-07, + "loss": 0.0578, + "num_input_tokens_seen": 2490624, + "step": 5070 + }, + { + "epoch": 0.669790154414676, + "grad_norm": 20.430967330932617, + "learning_rate": 5.944000500742735e-07, + "loss": 0.2826, + "num_input_tokens_seen": 2493248, + "step": 5075 + }, + { + "epoch": 0.6704500461924244, + "grad_norm": 51.80553436279297, + "learning_rate": 5.922955590996195e-07, + "loss": 0.201, + "num_input_tokens_seen": 2495744, + "step": 5080 + }, + { + "epoch": 0.6711099379701729, + "grad_norm": 0.12118737399578094, + "learning_rate": 5.901932315588281e-07, + "loss": 0.0019, + "num_input_tokens_seen": 2498176, + "step": 5085 + }, + { + "epoch": 0.6717698297479213, + "grad_norm": 20.142244338989258, + "learning_rate": 5.880930786076441e-07, + "loss": 0.1805, + "num_input_tokens_seen": 2500416, + "step": 5090 + }, + { + "epoch": 0.6724297215256698, + "grad_norm": 0.4407406747341156, + "learning_rate": 5.859951113902728e-07, + "loss": 0.06, + "num_input_tokens_seen": 2502848, + "step": 5095 + }, + { + "epoch": 0.6730896133034182, + "grad_norm": 32.401554107666016, + "learning_rate": 5.83899341039321e-07, + "loss": 0.1099, + "num_input_tokens_seen": 2505152, + "step": 5100 + }, + { + "epoch": 0.6737495050811667, + "grad_norm": 34.423946380615234, + "learning_rate": 5.818057786757386e-07, + "loss": 0.1247, + "num_input_tokens_seen": 2507648, + "step": 5105 + }, + { + "epoch": 0.6744093968589151, + "grad_norm": 0.2243095338344574, + "learning_rate": 5.797144354087588e-07, + "loss": 0.0989, + "num_input_tokens_seen": 2510144, + "step": 5110 + }, + { + "epoch": 0.6750692886366636, + "grad_norm": 0.06958218663930893, + "learning_rate": 5.77625322335839e-07, + "loss": 0.076, + "num_input_tokens_seen": 2513024, + "step": 5115 + }, + { + "epoch": 0.675729180414412, + "grad_norm": 0.2868078649044037, + "learning_rate": 5.755384505426032e-07, + "loss": 0.0721, + "num_input_tokens_seen": 2515072, + "step": 5120 + }, + { + "epoch": 0.6763890721921605, + "grad_norm": 0.19552133977413177, + "learning_rate": 5.734538311027819e-07, + "loss": 0.0018, + "num_input_tokens_seen": 2517376, + "step": 5125 + }, + { + "epoch": 0.677048963969909, + "grad_norm": 0.6387649178504944, + "learning_rate": 5.713714750781533e-07, + "loss": 0.0036, + "num_input_tokens_seen": 2520064, + "step": 5130 + }, + { + "epoch": 0.6777088557476574, + "grad_norm": 0.640417218208313, + "learning_rate": 5.692913935184862e-07, + "loss": 0.0685, + "num_input_tokens_seen": 2522688, + "step": 5135 + }, + { + "epoch": 0.6783687475254059, + "grad_norm": 0.32035917043685913, + "learning_rate": 5.672135974614794e-07, + "loss": 0.0071, + "num_input_tokens_seen": 2525184, + "step": 5140 + }, + { + "epoch": 0.6790286393031543, + "grad_norm": 0.08546182513237, + "learning_rate": 5.651380979327034e-07, + "loss": 0.0014, + "num_input_tokens_seen": 2527552, + "step": 5145 + }, + { + "epoch": 0.6796885310809028, + "grad_norm": 1.3679804801940918, + "learning_rate": 5.630649059455444e-07, + "loss": 0.0442, + "num_input_tokens_seen": 2530240, + "step": 5150 + }, + { + "epoch": 0.6803484228586512, + "grad_norm": 0.5069653391838074, + "learning_rate": 5.609940325011413e-07, + "loss": 0.0023, + "num_input_tokens_seen": 2532480, + "step": 5155 + }, + { + "epoch": 0.6810083146363997, + "grad_norm": 0.1547362059354782, + "learning_rate": 5.589254885883325e-07, + "loss": 0.0007, + "num_input_tokens_seen": 2534912, + "step": 5160 + }, + { + "epoch": 0.681668206414148, + "grad_norm": 0.09271689504384995, + "learning_rate": 5.568592851835936e-07, + "loss": 0.0598, + "num_input_tokens_seen": 2537408, + "step": 5165 + }, + { + "epoch": 0.6823280981918965, + "grad_norm": 0.12092125415802002, + "learning_rate": 5.547954332509805e-07, + "loss": 0.3023, + "num_input_tokens_seen": 2539776, + "step": 5170 + }, + { + "epoch": 0.6829879899696449, + "grad_norm": 0.06238294392824173, + "learning_rate": 5.527339437420717e-07, + "loss": 0.0009, + "num_input_tokens_seen": 2542208, + "step": 5175 + }, + { + "epoch": 0.6836478817473934, + "grad_norm": 109.36412811279297, + "learning_rate": 5.506748275959094e-07, + "loss": 0.1061, + "num_input_tokens_seen": 2544704, + "step": 5180 + }, + { + "epoch": 0.6843077735251418, + "grad_norm": 0.061365850269794464, + "learning_rate": 5.48618095738943e-07, + "loss": 0.0535, + "num_input_tokens_seen": 2547072, + "step": 5185 + }, + { + "epoch": 0.6849676653028903, + "grad_norm": 0.15806028246879578, + "learning_rate": 5.465637590849681e-07, + "loss": 0.1301, + "num_input_tokens_seen": 2549440, + "step": 5190 + }, + { + "epoch": 0.6856275570806388, + "grad_norm": 21.357271194458008, + "learning_rate": 5.445118285350723e-07, + "loss": 0.2169, + "num_input_tokens_seen": 2552128, + "step": 5195 + }, + { + "epoch": 0.6862874488583872, + "grad_norm": 0.09460903704166412, + "learning_rate": 5.424623149775745e-07, + "loss": 0.0681, + "num_input_tokens_seen": 2554368, + "step": 5200 + }, + { + "epoch": 0.6869473406361357, + "grad_norm": 0.0203552208840847, + "learning_rate": 5.404152292879676e-07, + "loss": 0.1175, + "num_input_tokens_seen": 2556928, + "step": 5205 + }, + { + "epoch": 0.6876072324138841, + "grad_norm": 16.716796875, + "learning_rate": 5.38370582328863e-07, + "loss": 0.1624, + "num_input_tokens_seen": 2559360, + "step": 5210 + }, + { + "epoch": 0.6882671241916326, + "grad_norm": 0.22735337913036346, + "learning_rate": 5.363283849499293e-07, + "loss": 0.1578, + "num_input_tokens_seen": 2561856, + "step": 5215 + }, + { + "epoch": 0.688927015969381, + "grad_norm": 33.698936462402344, + "learning_rate": 5.342886479878387e-07, + "loss": 0.1794, + "num_input_tokens_seen": 2564352, + "step": 5220 + }, + { + "epoch": 0.6895869077471295, + "grad_norm": 0.5594123601913452, + "learning_rate": 5.32251382266206e-07, + "loss": 0.0437, + "num_input_tokens_seen": 2566784, + "step": 5225 + }, + { + "epoch": 0.6902467995248779, + "grad_norm": 0.27059707045555115, + "learning_rate": 5.302165985955327e-07, + "loss": 0.0593, + "num_input_tokens_seen": 2569152, + "step": 5230 + }, + { + "epoch": 0.6909066913026264, + "grad_norm": 0.09355846047401428, + "learning_rate": 5.281843077731511e-07, + "loss": 0.067, + "num_input_tokens_seen": 2571520, + "step": 5235 + }, + { + "epoch": 0.6915665830803748, + "grad_norm": 121.53573608398438, + "learning_rate": 5.26154520583163e-07, + "loss": 0.141, + "num_input_tokens_seen": 2574080, + "step": 5240 + }, + { + "epoch": 0.6922264748581233, + "grad_norm": 0.16486892104148865, + "learning_rate": 5.241272477963877e-07, + "loss": 0.0595, + "num_input_tokens_seen": 2576320, + "step": 5245 + }, + { + "epoch": 0.6928863666358717, + "grad_norm": 1.9759496450424194, + "learning_rate": 5.221025001703e-07, + "loss": 0.0576, + "num_input_tokens_seen": 2578752, + "step": 5250 + }, + { + "epoch": 0.6935462584136202, + "grad_norm": 17.89307403564453, + "learning_rate": 5.200802884489768e-07, + "loss": 0.1368, + "num_input_tokens_seen": 2581184, + "step": 5255 + }, + { + "epoch": 0.6942061501913687, + "grad_norm": 0.03805484250187874, + "learning_rate": 5.180606233630374e-07, + "loss": 0.1654, + "num_input_tokens_seen": 2583872, + "step": 5260 + }, + { + "epoch": 0.694866041969117, + "grad_norm": 0.12207946926355362, + "learning_rate": 5.160435156295879e-07, + "loss": 0.1912, + "num_input_tokens_seen": 2586304, + "step": 5265 + }, + { + "epoch": 0.6955259337468656, + "grad_norm": 0.035935211926698685, + "learning_rate": 5.14028975952165e-07, + "loss": 0.0201, + "num_input_tokens_seen": 2589056, + "step": 5270 + }, + { + "epoch": 0.6961858255246139, + "grad_norm": 9.020354270935059, + "learning_rate": 5.120170150206768e-07, + "loss": 0.14, + "num_input_tokens_seen": 2591488, + "step": 5275 + }, + { + "epoch": 0.6968457173023624, + "grad_norm": 18.322715759277344, + "learning_rate": 5.100076435113496e-07, + "loss": 0.0542, + "num_input_tokens_seen": 2593792, + "step": 5280 + }, + { + "epoch": 0.6975056090801108, + "grad_norm": 55.9955940246582, + "learning_rate": 5.080008720866673e-07, + "loss": 0.1538, + "num_input_tokens_seen": 2595968, + "step": 5285 + }, + { + "epoch": 0.6981655008578593, + "grad_norm": 11.932297706604004, + "learning_rate": 5.059967113953173e-07, + "loss": 0.2123, + "num_input_tokens_seen": 2598144, + "step": 5290 + }, + { + "epoch": 0.6988253926356077, + "grad_norm": 0.08165155351161957, + "learning_rate": 5.039951720721349e-07, + "loss": 0.0838, + "num_input_tokens_seen": 2600448, + "step": 5295 + }, + { + "epoch": 0.6994852844133562, + "grad_norm": 0.32456350326538086, + "learning_rate": 5.019962647380429e-07, + "loss": 0.0167, + "num_input_tokens_seen": 2602944, + "step": 5300 + }, + { + "epoch": 0.7001451761911046, + "grad_norm": 20.51830291748047, + "learning_rate": 5.000000000000002e-07, + "loss": 0.2014, + "num_input_tokens_seen": 2605120, + "step": 5305 + }, + { + "epoch": 0.7002771545466544, + "eval_loss": 0.09084735810756683, + "eval_runtime": 7.6666, + "eval_samples_per_second": 878.487, + "eval_steps_per_second": 109.827, + "num_input_tokens_seen": 2605504, + "step": 5306 + }, + { + "epoch": 0.7008050679688531, + "grad_norm": 1.9377256631851196, + "learning_rate": 4.980063884509414e-07, + "loss": 0.0377, + "num_input_tokens_seen": 2607296, + "step": 5310 + }, + { + "epoch": 0.7014649597466015, + "grad_norm": 0.11374177783727646, + "learning_rate": 4.960154406697229e-07, + "loss": 0.0463, + "num_input_tokens_seen": 2609728, + "step": 5315 + }, + { + "epoch": 0.70212485152435, + "grad_norm": 11.871938705444336, + "learning_rate": 4.940271672210667e-07, + "loss": 0.2924, + "num_input_tokens_seen": 2612224, + "step": 5320 + }, + { + "epoch": 0.7027847433020985, + "grad_norm": 0.26750093698501587, + "learning_rate": 4.920415786555025e-07, + "loss": 0.0513, + "num_input_tokens_seen": 2614720, + "step": 5325 + }, + { + "epoch": 0.7034446350798469, + "grad_norm": 0.12440818548202515, + "learning_rate": 4.900586855093144e-07, + "loss": 0.3194, + "num_input_tokens_seen": 2617344, + "step": 5330 + }, + { + "epoch": 0.7041045268575954, + "grad_norm": 23.306577682495117, + "learning_rate": 4.880784983044827e-07, + "loss": 0.1166, + "num_input_tokens_seen": 2619584, + "step": 5335 + }, + { + "epoch": 0.7047644186353438, + "grad_norm": 0.1234973892569542, + "learning_rate": 4.861010275486284e-07, + "loss": 0.0176, + "num_input_tokens_seen": 2621888, + "step": 5340 + }, + { + "epoch": 0.7054243104130923, + "grad_norm": 0.14019837975502014, + "learning_rate": 4.8412628373496e-07, + "loss": 0.0731, + "num_input_tokens_seen": 2624512, + "step": 5345 + }, + { + "epoch": 0.7060842021908407, + "grad_norm": 0.18232476711273193, + "learning_rate": 4.821542773422136e-07, + "loss": 0.0024, + "num_input_tokens_seen": 2627008, + "step": 5350 + }, + { + "epoch": 0.7067440939685892, + "grad_norm": 0.28430455923080444, + "learning_rate": 4.801850188346012e-07, + "loss": 0.0019, + "num_input_tokens_seen": 2629440, + "step": 5355 + }, + { + "epoch": 0.7074039857463376, + "grad_norm": 0.19436050951480865, + "learning_rate": 4.782185186617523e-07, + "loss": 0.1034, + "num_input_tokens_seen": 2631872, + "step": 5360 + }, + { + "epoch": 0.7080638775240861, + "grad_norm": 0.2109547257423401, + "learning_rate": 4.762547872586603e-07, + "loss": 0.0814, + "num_input_tokens_seen": 2634560, + "step": 5365 + }, + { + "epoch": 0.7087237693018344, + "grad_norm": 0.2513101100921631, + "learning_rate": 4.7429383504562605e-07, + "loss": 0.1396, + "num_input_tokens_seen": 2637120, + "step": 5370 + }, + { + "epoch": 0.709383661079583, + "grad_norm": 0.30243685841560364, + "learning_rate": 4.723356724282029e-07, + "loss": 0.0019, + "num_input_tokens_seen": 2639552, + "step": 5375 + }, + { + "epoch": 0.7100435528573315, + "grad_norm": 24.248998641967773, + "learning_rate": 4.703803097971426e-07, + "loss": 0.1315, + "num_input_tokens_seen": 2641984, + "step": 5380 + }, + { + "epoch": 0.7107034446350798, + "grad_norm": 8.986465454101562, + "learning_rate": 4.6842775752833763e-07, + "loss": 0.0708, + "num_input_tokens_seen": 2644352, + "step": 5385 + }, + { + "epoch": 0.7113633364128283, + "grad_norm": 0.1666085124015808, + "learning_rate": 4.664780259827689e-07, + "loss": 0.02, + "num_input_tokens_seen": 2647040, + "step": 5390 + }, + { + "epoch": 0.7120232281905767, + "grad_norm": 0.05778901278972626, + "learning_rate": 4.6453112550644857e-07, + "loss": 0.0013, + "num_input_tokens_seen": 2649472, + "step": 5395 + }, + { + "epoch": 0.7126831199683252, + "grad_norm": 0.1988663524389267, + "learning_rate": 4.625870664303663e-07, + "loss": 0.0411, + "num_input_tokens_seen": 2651840, + "step": 5400 + }, + { + "epoch": 0.7133430117460736, + "grad_norm": 0.19517682492733002, + "learning_rate": 4.6064585907043486e-07, + "loss": 0.0056, + "num_input_tokens_seen": 2654464, + "step": 5405 + }, + { + "epoch": 0.7140029035238221, + "grad_norm": 0.02337566576898098, + "learning_rate": 4.587075137274334e-07, + "loss": 0.0537, + "num_input_tokens_seen": 2656576, + "step": 5410 + }, + { + "epoch": 0.7146627953015705, + "grad_norm": 1.0309412479400635, + "learning_rate": 4.5677204068695597e-07, + "loss": 0.0546, + "num_input_tokens_seen": 2659008, + "step": 5415 + }, + { + "epoch": 0.715322687079319, + "grad_norm": 0.022054580971598625, + "learning_rate": 4.5483945021935356e-07, + "loss": 0.0401, + "num_input_tokens_seen": 2661632, + "step": 5420 + }, + { + "epoch": 0.7159825788570674, + "grad_norm": 0.02314288541674614, + "learning_rate": 4.5290975257968155e-07, + "loss": 0.0963, + "num_input_tokens_seen": 2664192, + "step": 5425 + }, + { + "epoch": 0.7166424706348159, + "grad_norm": 22.84745216369629, + "learning_rate": 4.509829580076452e-07, + "loss": 0.1819, + "num_input_tokens_seen": 2666624, + "step": 5430 + }, + { + "epoch": 0.7173023624125643, + "grad_norm": 0.063370481133461, + "learning_rate": 4.490590767275442e-07, + "loss": 0.1842, + "num_input_tokens_seen": 2669120, + "step": 5435 + }, + { + "epoch": 0.7179622541903128, + "grad_norm": 0.49410998821258545, + "learning_rate": 4.4713811894822064e-07, + "loss": 0.102, + "num_input_tokens_seen": 2671552, + "step": 5440 + }, + { + "epoch": 0.7186221459680613, + "grad_norm": 0.3350347578525543, + "learning_rate": 4.4522009486300204e-07, + "loss": 0.071, + "num_input_tokens_seen": 2674240, + "step": 5445 + }, + { + "epoch": 0.7192820377458097, + "grad_norm": 0.07053118199110031, + "learning_rate": 4.43305014649649e-07, + "loss": 0.1247, + "num_input_tokens_seen": 2676544, + "step": 5450 + }, + { + "epoch": 0.7199419295235582, + "grad_norm": 0.14452704787254333, + "learning_rate": 4.4139288847030155e-07, + "loss": 0.0005, + "num_input_tokens_seen": 2678912, + "step": 5455 + }, + { + "epoch": 0.7206018213013066, + "grad_norm": 0.08119305223226547, + "learning_rate": 4.394837264714233e-07, + "loss": 0.0554, + "num_input_tokens_seen": 2681344, + "step": 5460 + }, + { + "epoch": 0.7212617130790551, + "grad_norm": 0.15848740935325623, + "learning_rate": 4.3757753878375005e-07, + "loss": 0.0013, + "num_input_tokens_seen": 2683776, + "step": 5465 + }, + { + "epoch": 0.7219216048568035, + "grad_norm": 0.031311068683862686, + "learning_rate": 4.3567433552223375e-07, + "loss": 0.0567, + "num_input_tokens_seen": 2686016, + "step": 5470 + }, + { + "epoch": 0.722581496634552, + "grad_norm": 0.839226484298706, + "learning_rate": 4.3377412678599e-07, + "loss": 0.1963, + "num_input_tokens_seen": 2688128, + "step": 5475 + }, + { + "epoch": 0.7232413884123003, + "grad_norm": 0.17472581565380096, + "learning_rate": 4.318769226582454e-07, + "loss": 0.1399, + "num_input_tokens_seen": 2690368, + "step": 5480 + }, + { + "epoch": 0.7239012801900488, + "grad_norm": 0.12172765284776688, + "learning_rate": 4.299827332062811e-07, + "loss": 0.0348, + "num_input_tokens_seen": 2692992, + "step": 5485 + }, + { + "epoch": 0.7245611719677972, + "grad_norm": 75.4613037109375, + "learning_rate": 4.2809156848138363e-07, + "loss": 0.0968, + "num_input_tokens_seen": 2695424, + "step": 5490 + }, + { + "epoch": 0.7252210637455457, + "grad_norm": 71.69564056396484, + "learning_rate": 4.2620343851878616e-07, + "loss": 0.1639, + "num_input_tokens_seen": 2697856, + "step": 5495 + }, + { + "epoch": 0.7258809555232941, + "grad_norm": 0.060778968036174774, + "learning_rate": 4.2431835333762123e-07, + "loss": 0.0446, + "num_input_tokens_seen": 2700608, + "step": 5500 + }, + { + "epoch": 0.7265408473010426, + "grad_norm": 0.6623153686523438, + "learning_rate": 4.224363229408628e-07, + "loss": 0.0005, + "num_input_tokens_seen": 2703104, + "step": 5505 + }, + { + "epoch": 0.7272007390787911, + "grad_norm": 0.34537097811698914, + "learning_rate": 4.205573573152753e-07, + "loss": 0.1834, + "num_input_tokens_seen": 2705344, + "step": 5510 + }, + { + "epoch": 0.7278606308565395, + "grad_norm": 0.14280956983566284, + "learning_rate": 4.18681466431361e-07, + "loss": 0.0728, + "num_input_tokens_seen": 2707520, + "step": 5515 + }, + { + "epoch": 0.728520522634288, + "grad_norm": 1.0312310457229614, + "learning_rate": 4.168086602433055e-07, + "loss": 0.105, + "num_input_tokens_seen": 2709888, + "step": 5520 + }, + { + "epoch": 0.7291804144120364, + "grad_norm": 0.46186262369155884, + "learning_rate": 4.1493894868892676e-07, + "loss": 0.1888, + "num_input_tokens_seen": 2712192, + "step": 5525 + }, + { + "epoch": 0.7298403061897849, + "grad_norm": 0.5339822769165039, + "learning_rate": 4.1307234168962093e-07, + "loss": 0.0838, + "num_input_tokens_seen": 2714368, + "step": 5530 + }, + { + "epoch": 0.7305001979675333, + "grad_norm": 0.03589556738734245, + "learning_rate": 4.112088491503095e-07, + "loss": 0.0014, + "num_input_tokens_seen": 2716608, + "step": 5535 + }, + { + "epoch": 0.7311600897452818, + "grad_norm": 0.08683586120605469, + "learning_rate": 4.0934848095938937e-07, + "loss": 0.001, + "num_input_tokens_seen": 2718656, + "step": 5540 + }, + { + "epoch": 0.7318199815230302, + "grad_norm": 0.04359391704201698, + "learning_rate": 4.074912469886763e-07, + "loss": 0.098, + "num_input_tokens_seen": 2721152, + "step": 5545 + }, + { + "epoch": 0.7324798733007787, + "grad_norm": 0.47505855560302734, + "learning_rate": 4.0563715709335657e-07, + "loss": 0.0009, + "num_input_tokens_seen": 2723264, + "step": 5550 + }, + { + "epoch": 0.7331397650785271, + "grad_norm": 40.96418380737305, + "learning_rate": 4.037862211119315e-07, + "loss": 0.2022, + "num_input_tokens_seen": 2725568, + "step": 5555 + }, + { + "epoch": 0.7337996568562756, + "grad_norm": 0.221147358417511, + "learning_rate": 4.0193844886616715e-07, + "loss": 0.0389, + "num_input_tokens_seen": 2728192, + "step": 5560 + }, + { + "epoch": 0.7344595486340241, + "grad_norm": 28.70302391052246, + "learning_rate": 4.0009385016104137e-07, + "loss": 0.1632, + "num_input_tokens_seen": 2731072, + "step": 5565 + }, + { + "epoch": 0.7351194404117725, + "grad_norm": 0.036642443388700485, + "learning_rate": 3.9825243478469164e-07, + "loss": 0.1455, + "num_input_tokens_seen": 2733440, + "step": 5570 + }, + { + "epoch": 0.735779332189521, + "grad_norm": 8.163640022277832, + "learning_rate": 3.9641421250836484e-07, + "loss": 0.1211, + "num_input_tokens_seen": 2736064, + "step": 5575 + }, + { + "epoch": 0.7364392239672694, + "grad_norm": 36.30949401855469, + "learning_rate": 3.945791930863622e-07, + "loss": 0.0356, + "num_input_tokens_seen": 2738496, + "step": 5580 + }, + { + "epoch": 0.7370991157450179, + "grad_norm": 0.6677089333534241, + "learning_rate": 3.9274738625599137e-07, + "loss": 0.002, + "num_input_tokens_seen": 2740800, + "step": 5585 + }, + { + "epoch": 0.7377590075227662, + "grad_norm": 0.42139413952827454, + "learning_rate": 3.909188017375112e-07, + "loss": 0.0746, + "num_input_tokens_seen": 2743104, + "step": 5590 + }, + { + "epoch": 0.7384188993005147, + "grad_norm": 0.15833111107349396, + "learning_rate": 3.890934492340819e-07, + "loss": 0.1553, + "num_input_tokens_seen": 2745344, + "step": 5595 + }, + { + "epoch": 0.7390787910782631, + "grad_norm": 19.225963592529297, + "learning_rate": 3.872713384317147e-07, + "loss": 0.062, + "num_input_tokens_seen": 2747520, + "step": 5600 + }, + { + "epoch": 0.7397386828560116, + "grad_norm": 0.0576261468231678, + "learning_rate": 3.8545247899921776e-07, + "loss": 0.1382, + "num_input_tokens_seen": 2750016, + "step": 5605 + }, + { + "epoch": 0.74039857463376, + "grad_norm": 0.09810295701026917, + "learning_rate": 3.8363688058814614e-07, + "loss": 0.1139, + "num_input_tokens_seen": 2752704, + "step": 5610 + }, + { + "epoch": 0.7410584664115085, + "grad_norm": 37.168209075927734, + "learning_rate": 3.818245528327526e-07, + "loss": 0.1544, + "num_input_tokens_seen": 2755328, + "step": 5615 + }, + { + "epoch": 0.7417183581892569, + "grad_norm": 12.660454750061035, + "learning_rate": 3.8001550534993164e-07, + "loss": 0.0911, + "num_input_tokens_seen": 2757632, + "step": 5620 + }, + { + "epoch": 0.7423782499670054, + "grad_norm": 0.21876884996891022, + "learning_rate": 3.7820974773917413e-07, + "loss": 0.0665, + "num_input_tokens_seen": 2760192, + "step": 5625 + }, + { + "epoch": 0.7430381417447539, + "grad_norm": 0.09194961190223694, + "learning_rate": 3.764072895825117e-07, + "loss": 0.001, + "num_input_tokens_seen": 2762816, + "step": 5630 + }, + { + "epoch": 0.7436980335225023, + "grad_norm": 11.263919830322266, + "learning_rate": 3.7460814044446934e-07, + "loss": 0.0625, + "num_input_tokens_seen": 2765120, + "step": 5635 + }, + { + "epoch": 0.7443579253002508, + "grad_norm": 0.1615023910999298, + "learning_rate": 3.72812309872012e-07, + "loss": 0.0989, + "num_input_tokens_seen": 2767808, + "step": 5640 + }, + { + "epoch": 0.7450178170779992, + "grad_norm": 3.175365447998047, + "learning_rate": 3.71019807394495e-07, + "loss": 0.0518, + "num_input_tokens_seen": 2770176, + "step": 5645 + }, + { + "epoch": 0.7456777088557477, + "grad_norm": 0.27845051884651184, + "learning_rate": 3.6923064252361505e-07, + "loss": 0.0983, + "num_input_tokens_seen": 2772672, + "step": 5650 + }, + { + "epoch": 0.7463376006334961, + "grad_norm": 0.15922772884368896, + "learning_rate": 3.674448247533561e-07, + "loss": 0.1089, + "num_input_tokens_seen": 2775104, + "step": 5655 + }, + { + "epoch": 0.7469974924112446, + "grad_norm": 0.11377550661563873, + "learning_rate": 3.656623635599432e-07, + "loss": 0.2327, + "num_input_tokens_seen": 2777792, + "step": 5660 + }, + { + "epoch": 0.747657384188993, + "grad_norm": 55.64900207519531, + "learning_rate": 3.6388326840178865e-07, + "loss": 0.1313, + "num_input_tokens_seen": 2780416, + "step": 5665 + }, + { + "epoch": 0.7483172759667415, + "grad_norm": 0.06009421497583389, + "learning_rate": 3.621075487194435e-07, + "loss": 0.0056, + "num_input_tokens_seen": 2783232, + "step": 5670 + }, + { + "epoch": 0.7489771677444899, + "grad_norm": 11.293038368225098, + "learning_rate": 3.603352139355483e-07, + "loss": 0.1054, + "num_input_tokens_seen": 2785664, + "step": 5675 + }, + { + "epoch": 0.7496370595222384, + "grad_norm": 8.784896850585938, + "learning_rate": 3.58566273454781e-07, + "loss": 0.0984, + "num_input_tokens_seen": 2788224, + "step": 5680 + }, + { + "epoch": 0.7502969512999867, + "grad_norm": 0.07352028787136078, + "learning_rate": 3.5680073666380817e-07, + "loss": 0.0014, + "num_input_tokens_seen": 2790656, + "step": 5685 + }, + { + "epoch": 0.7502969512999867, + "eval_loss": 0.0956902727484703, + "eval_runtime": 7.594, + "eval_samples_per_second": 886.88, + "eval_steps_per_second": 110.876, + "num_input_tokens_seen": 2790656, + "step": 5685 + }, + { + "epoch": 0.7509568430777352, + "grad_norm": 0.028005223721265793, + "learning_rate": 3.5503861293123514e-07, + "loss": 0.1594, + "num_input_tokens_seen": 2792960, + "step": 5690 + }, + { + "epoch": 0.7516167348554837, + "grad_norm": 0.16075754165649414, + "learning_rate": 3.532799116075571e-07, + "loss": 0.0789, + "num_input_tokens_seen": 2795648, + "step": 5695 + }, + { + "epoch": 0.7522766266332321, + "grad_norm": 53.65882873535156, + "learning_rate": 3.5152464202510777e-07, + "loss": 0.098, + "num_input_tokens_seen": 2797696, + "step": 5700 + }, + { + "epoch": 0.7529365184109806, + "grad_norm": 15.231353759765625, + "learning_rate": 3.4977281349801056e-07, + "loss": 0.1334, + "num_input_tokens_seen": 2800192, + "step": 5705 + }, + { + "epoch": 0.753596410188729, + "grad_norm": 0.14780941605567932, + "learning_rate": 3.4802443532213056e-07, + "loss": 0.0133, + "num_input_tokens_seen": 2802560, + "step": 5710 + }, + { + "epoch": 0.7542563019664775, + "grad_norm": 0.030415428802371025, + "learning_rate": 3.4627951677502233e-07, + "loss": 0.2453, + "num_input_tokens_seen": 2804992, + "step": 5715 + }, + { + "epoch": 0.7549161937442259, + "grad_norm": 0.889872133731842, + "learning_rate": 3.4453806711588397e-07, + "loss": 0.0492, + "num_input_tokens_seen": 2807296, + "step": 5720 + }, + { + "epoch": 0.7555760855219744, + "grad_norm": 0.08172111958265305, + "learning_rate": 3.428000955855054e-07, + "loss": 0.0303, + "num_input_tokens_seen": 2809984, + "step": 5725 + }, + { + "epoch": 0.7562359772997228, + "grad_norm": 0.09841586649417877, + "learning_rate": 3.4106561140621983e-07, + "loss": 0.0023, + "num_input_tokens_seen": 2812736, + "step": 5730 + }, + { + "epoch": 0.7568958690774713, + "grad_norm": 0.37726613879203796, + "learning_rate": 3.393346237818567e-07, + "loss": 0.1465, + "num_input_tokens_seen": 2815040, + "step": 5735 + }, + { + "epoch": 0.7575557608552197, + "grad_norm": 0.7347794771194458, + "learning_rate": 3.3760714189769015e-07, + "loss": 0.1114, + "num_input_tokens_seen": 2817344, + "step": 5740 + }, + { + "epoch": 0.7582156526329682, + "grad_norm": 0.10844270884990692, + "learning_rate": 3.3588317492039266e-07, + "loss": 0.0357, + "num_input_tokens_seen": 2819648, + "step": 5745 + }, + { + "epoch": 0.7588755444107167, + "grad_norm": 46.741573333740234, + "learning_rate": 3.341627319979834e-07, + "loss": 0.1254, + "num_input_tokens_seen": 2822464, + "step": 5750 + }, + { + "epoch": 0.7595354361884651, + "grad_norm": 150.2995147705078, + "learning_rate": 3.324458222597839e-07, + "loss": 0.1943, + "num_input_tokens_seen": 2824896, + "step": 5755 + }, + { + "epoch": 0.7601953279662136, + "grad_norm": 0.15239302814006805, + "learning_rate": 3.307324548163657e-07, + "loss": 0.0749, + "num_input_tokens_seen": 2827648, + "step": 5760 + }, + { + "epoch": 0.760855219743962, + "grad_norm": 0.6753157377243042, + "learning_rate": 3.2902263875950374e-07, + "loss": 0.114, + "num_input_tokens_seen": 2830336, + "step": 5765 + }, + { + "epoch": 0.7615151115217105, + "grad_norm": 16.683015823364258, + "learning_rate": 3.2731638316212894e-07, + "loss": 0.0462, + "num_input_tokens_seen": 2832640, + "step": 5770 + }, + { + "epoch": 0.7621750032994589, + "grad_norm": 1.5054552555084229, + "learning_rate": 3.256136970782782e-07, + "loss": 0.049, + "num_input_tokens_seen": 2834880, + "step": 5775 + }, + { + "epoch": 0.7628348950772074, + "grad_norm": 0.033258408308029175, + "learning_rate": 3.23914589543047e-07, + "loss": 0.1447, + "num_input_tokens_seen": 2837440, + "step": 5780 + }, + { + "epoch": 0.7634947868549558, + "grad_norm": 103.95304107666016, + "learning_rate": 3.2221906957254276e-07, + "loss": 0.0424, + "num_input_tokens_seen": 2839808, + "step": 5785 + }, + { + "epoch": 0.7641546786327043, + "grad_norm": 5.235893726348877, + "learning_rate": 3.205271461638346e-07, + "loss": 0.1412, + "num_input_tokens_seen": 2842432, + "step": 5790 + }, + { + "epoch": 0.7648145704104526, + "grad_norm": 0.035734184086322784, + "learning_rate": 3.188388282949085e-07, + "loss": 0.1313, + "num_input_tokens_seen": 2845120, + "step": 5795 + }, + { + "epoch": 0.7654744621882011, + "grad_norm": 61.18632125854492, + "learning_rate": 3.171541249246166e-07, + "loss": 0.1633, + "num_input_tokens_seen": 2848000, + "step": 5800 + }, + { + "epoch": 0.7661343539659495, + "grad_norm": 14.463330268859863, + "learning_rate": 3.154730449926316e-07, + "loss": 0.161, + "num_input_tokens_seen": 2850624, + "step": 5805 + }, + { + "epoch": 0.766794245743698, + "grad_norm": 0.18341617286205292, + "learning_rate": 3.137955974194e-07, + "loss": 0.121, + "num_input_tokens_seen": 2852992, + "step": 5810 + }, + { + "epoch": 0.7674541375214465, + "grad_norm": 28.731979370117188, + "learning_rate": 3.1212179110609125e-07, + "loss": 0.1251, + "num_input_tokens_seen": 2855424, + "step": 5815 + }, + { + "epoch": 0.7681140292991949, + "grad_norm": 0.0925399586558342, + "learning_rate": 3.104516349345553e-07, + "loss": 0.137, + "num_input_tokens_seen": 2857984, + "step": 5820 + }, + { + "epoch": 0.7687739210769434, + "grad_norm": 0.09687471389770508, + "learning_rate": 3.0878513776727144e-07, + "loss": 0.0643, + "num_input_tokens_seen": 2860672, + "step": 5825 + }, + { + "epoch": 0.7694338128546918, + "grad_norm": 10.534875869750977, + "learning_rate": 3.0712230844730414e-07, + "loss": 0.1726, + "num_input_tokens_seen": 2863040, + "step": 5830 + }, + { + "epoch": 0.7700937046324403, + "grad_norm": 0.4192121624946594, + "learning_rate": 3.054631557982539e-07, + "loss": 0.0704, + "num_input_tokens_seen": 2865856, + "step": 5835 + }, + { + "epoch": 0.7707535964101887, + "grad_norm": 0.11545547842979431, + "learning_rate": 3.0380768862421156e-07, + "loss": 0.1005, + "num_input_tokens_seen": 2868096, + "step": 5840 + }, + { + "epoch": 0.7714134881879372, + "grad_norm": 0.13741333782672882, + "learning_rate": 3.0215591570971234e-07, + "loss": 0.0013, + "num_input_tokens_seen": 2870784, + "step": 5845 + }, + { + "epoch": 0.7720733799656856, + "grad_norm": 55.587005615234375, + "learning_rate": 3.005078458196868e-07, + "loss": 0.0712, + "num_input_tokens_seen": 2873216, + "step": 5850 + }, + { + "epoch": 0.7727332717434341, + "grad_norm": 0.19076700508594513, + "learning_rate": 2.988634876994175e-07, + "loss": 0.0011, + "num_input_tokens_seen": 2875776, + "step": 5855 + }, + { + "epoch": 0.7733931635211825, + "grad_norm": 0.12881390750408173, + "learning_rate": 2.972228500744898e-07, + "loss": 0.0336, + "num_input_tokens_seen": 2878336, + "step": 5860 + }, + { + "epoch": 0.774053055298931, + "grad_norm": 22.819622039794922, + "learning_rate": 2.955859416507467e-07, + "loss": 0.1431, + "num_input_tokens_seen": 2880896, + "step": 5865 + }, + { + "epoch": 0.7747129470766794, + "grad_norm": 0.040956467390060425, + "learning_rate": 2.9395277111424357e-07, + "loss": 0.0684, + "num_input_tokens_seen": 2883648, + "step": 5870 + }, + { + "epoch": 0.7753728388544279, + "grad_norm": 0.0455995537340641, + "learning_rate": 2.9232334713120035e-07, + "loss": 0.0016, + "num_input_tokens_seen": 2885952, + "step": 5875 + }, + { + "epoch": 0.7760327306321764, + "grad_norm": 0.3208160996437073, + "learning_rate": 2.9069767834795655e-07, + "loss": 0.0614, + "num_input_tokens_seen": 2888576, + "step": 5880 + }, + { + "epoch": 0.7766926224099248, + "grad_norm": 3.3780038356781006, + "learning_rate": 2.8907577339092483e-07, + "loss": 0.1331, + "num_input_tokens_seen": 2891136, + "step": 5885 + }, + { + "epoch": 0.7773525141876733, + "grad_norm": 0.030515162274241447, + "learning_rate": 2.8745764086654654e-07, + "loss": 0.0711, + "num_input_tokens_seen": 2893696, + "step": 5890 + }, + { + "epoch": 0.7780124059654216, + "grad_norm": 37.801578521728516, + "learning_rate": 2.8584328936124424e-07, + "loss": 0.0499, + "num_input_tokens_seen": 2896512, + "step": 5895 + }, + { + "epoch": 0.7786722977431701, + "grad_norm": 13.73175048828125, + "learning_rate": 2.8423272744137674e-07, + "loss": 0.1805, + "num_input_tokens_seen": 2899008, + "step": 5900 + }, + { + "epoch": 0.7793321895209185, + "grad_norm": 1.2914345264434814, + "learning_rate": 2.82625963653195e-07, + "loss": 0.0781, + "num_input_tokens_seen": 2901376, + "step": 5905 + }, + { + "epoch": 0.779992081298667, + "grad_norm": 8.905738830566406, + "learning_rate": 2.810230065227944e-07, + "loss": 0.1989, + "num_input_tokens_seen": 2903872, + "step": 5910 + }, + { + "epoch": 0.7806519730764154, + "grad_norm": 0.1153329461812973, + "learning_rate": 2.7942386455607203e-07, + "loss": 0.0016, + "num_input_tokens_seen": 2906240, + "step": 5915 + }, + { + "epoch": 0.7813118648541639, + "grad_norm": 0.40870046615600586, + "learning_rate": 2.77828546238679e-07, + "loss": 0.0695, + "num_input_tokens_seen": 2908736, + "step": 5920 + }, + { + "epoch": 0.7819717566319123, + "grad_norm": 50.935813903808594, + "learning_rate": 2.762370600359774e-07, + "loss": 0.1347, + "num_input_tokens_seen": 2911104, + "step": 5925 + }, + { + "epoch": 0.7826316484096608, + "grad_norm": 0.06911960244178772, + "learning_rate": 2.7464941439299484e-07, + "loss": 0.0614, + "num_input_tokens_seen": 2913472, + "step": 5930 + }, + { + "epoch": 0.7832915401874093, + "grad_norm": 12.452083587646484, + "learning_rate": 2.7306561773437887e-07, + "loss": 0.1583, + "num_input_tokens_seen": 2915840, + "step": 5935 + }, + { + "epoch": 0.7839514319651577, + "grad_norm": 0.09292475879192352, + "learning_rate": 2.714856784643533e-07, + "loss": 0.0047, + "num_input_tokens_seen": 2918144, + "step": 5940 + }, + { + "epoch": 0.7846113237429062, + "grad_norm": 0.06648958474397659, + "learning_rate": 2.6990960496667313e-07, + "loss": 0.1479, + "num_input_tokens_seen": 2920768, + "step": 5945 + }, + { + "epoch": 0.7852712155206546, + "grad_norm": 0.07045161724090576, + "learning_rate": 2.6833740560457976e-07, + "loss": 0.067, + "num_input_tokens_seen": 2923136, + "step": 5950 + }, + { + "epoch": 0.7859311072984031, + "grad_norm": 0.12214231491088867, + "learning_rate": 2.6676908872075757e-07, + "loss": 0.0702, + "num_input_tokens_seen": 2925568, + "step": 5955 + }, + { + "epoch": 0.7865909990761515, + "grad_norm": 0.0641525536775589, + "learning_rate": 2.6520466263728836e-07, + "loss": 0.0576, + "num_input_tokens_seen": 2928064, + "step": 5960 + }, + { + "epoch": 0.7872508908539, + "grad_norm": 38.231407165527344, + "learning_rate": 2.636441356556087e-07, + "loss": 0.2178, + "num_input_tokens_seen": 2930368, + "step": 5965 + }, + { + "epoch": 0.7879107826316484, + "grad_norm": 13.16163158416748, + "learning_rate": 2.620875160564645e-07, + "loss": 0.1005, + "num_input_tokens_seen": 2932928, + "step": 5970 + }, + { + "epoch": 0.7885706744093969, + "grad_norm": 208.31663513183594, + "learning_rate": 2.6053481209986715e-07, + "loss": 0.418, + "num_input_tokens_seen": 2935360, + "step": 5975 + }, + { + "epoch": 0.7892305661871453, + "grad_norm": 0.08345562219619751, + "learning_rate": 2.5898603202505155e-07, + "loss": 0.059, + "num_input_tokens_seen": 2937920, + "step": 5980 + }, + { + "epoch": 0.7898904579648938, + "grad_norm": 0.3885025382041931, + "learning_rate": 2.5744118405042923e-07, + "loss": 0.0502, + "num_input_tokens_seen": 2940224, + "step": 5985 + }, + { + "epoch": 0.7905503497426422, + "grad_norm": 0.12759974598884583, + "learning_rate": 2.559002763735485e-07, + "loss": 0.0017, + "num_input_tokens_seen": 2942848, + "step": 5990 + }, + { + "epoch": 0.7912102415203907, + "grad_norm": 0.2042687088251114, + "learning_rate": 2.543633171710472e-07, + "loss": 0.0591, + "num_input_tokens_seen": 2945344, + "step": 5995 + }, + { + "epoch": 0.7918701332981392, + "grad_norm": 2.7166707515716553, + "learning_rate": 2.5283031459861205e-07, + "loss": 0.0162, + "num_input_tokens_seen": 2947840, + "step": 6000 + }, + { + "epoch": 0.7925300250758875, + "grad_norm": 0.08386794477701187, + "learning_rate": 2.5130127679093396e-07, + "loss": 0.0344, + "num_input_tokens_seen": 2950144, + "step": 6005 + }, + { + "epoch": 0.793189916853636, + "grad_norm": 59.384368896484375, + "learning_rate": 2.497762118616652e-07, + "loss": 0.0428, + "num_input_tokens_seen": 2952384, + "step": 6010 + }, + { + "epoch": 0.7938498086313844, + "grad_norm": 0.061096593737602234, + "learning_rate": 2.4825512790337745e-07, + "loss": 0.0788, + "num_input_tokens_seen": 2955136, + "step": 6015 + }, + { + "epoch": 0.7945097004091329, + "grad_norm": 38.1906623840332, + "learning_rate": 2.467380329875163e-07, + "loss": 0.0441, + "num_input_tokens_seen": 2957824, + "step": 6020 + }, + { + "epoch": 0.7951695921868813, + "grad_norm": 0.729996919631958, + "learning_rate": 2.452249351643615e-07, + "loss": 0.0038, + "num_input_tokens_seen": 2960256, + "step": 6025 + }, + { + "epoch": 0.7958294839646298, + "grad_norm": 0.31032249331474304, + "learning_rate": 2.437158424629817e-07, + "loss": 0.0672, + "num_input_tokens_seen": 2962944, + "step": 6030 + }, + { + "epoch": 0.7964893757423782, + "grad_norm": 0.5417336821556091, + "learning_rate": 2.422107628911929e-07, + "loss": 0.2047, + "num_input_tokens_seen": 2965504, + "step": 6035 + }, + { + "epoch": 0.7971492675201267, + "grad_norm": 0.07609419524669647, + "learning_rate": 2.4070970443551673e-07, + "loss": 0.2335, + "num_input_tokens_seen": 2967744, + "step": 6040 + }, + { + "epoch": 0.7978091592978751, + "grad_norm": 0.5796427726745605, + "learning_rate": 2.392126750611362e-07, + "loss": 0.0017, + "num_input_tokens_seen": 2970240, + "step": 6045 + }, + { + "epoch": 0.7984690510756236, + "grad_norm": 19.56314468383789, + "learning_rate": 2.3771968271185538e-07, + "loss": 0.1777, + "num_input_tokens_seen": 2972928, + "step": 6050 + }, + { + "epoch": 0.799128942853372, + "grad_norm": 0.015146835707128048, + "learning_rate": 2.3623073531005579e-07, + "loss": 0.1485, + "num_input_tokens_seen": 2975168, + "step": 6055 + }, + { + "epoch": 0.7997888346311205, + "grad_norm": 0.11885405331850052, + "learning_rate": 2.3474584075665493e-07, + "loss": 0.1294, + "num_input_tokens_seen": 2977408, + "step": 6060 + }, + { + "epoch": 0.8003167480533192, + "eval_loss": 0.0954766720533371, + "eval_runtime": 7.5442, + "eval_samples_per_second": 892.739, + "eval_steps_per_second": 111.609, + "num_input_tokens_seen": 2979456, + "step": 6064 + }, + { + "epoch": 0.800448726408869, + "grad_norm": 0.06911212205886841, + "learning_rate": 2.3326500693106533e-07, + "loss": 0.0013, + "num_input_tokens_seen": 2979968, + "step": 6065 + }, + { + "epoch": 0.8011086181866174, + "grad_norm": 82.93882751464844, + "learning_rate": 2.3178824169114975e-07, + "loss": 0.209, + "num_input_tokens_seen": 2982528, + "step": 6070 + }, + { + "epoch": 0.8017685099643659, + "grad_norm": 0.18306070566177368, + "learning_rate": 2.303155528731837e-07, + "loss": 0.0494, + "num_input_tokens_seen": 2984832, + "step": 6075 + }, + { + "epoch": 0.8024284017421143, + "grad_norm": 1.8438490629196167, + "learning_rate": 2.2884694829181016e-07, + "loss": 0.0014, + "num_input_tokens_seen": 2987328, + "step": 6080 + }, + { + "epoch": 0.8030882935198628, + "grad_norm": 0.09328246116638184, + "learning_rate": 2.273824357400005e-07, + "loss": 0.0083, + "num_input_tokens_seen": 2989760, + "step": 6085 + }, + { + "epoch": 0.8037481852976112, + "grad_norm": 0.17672888934612274, + "learning_rate": 2.2592202298901174e-07, + "loss": 0.0188, + "num_input_tokens_seen": 2992320, + "step": 6090 + }, + { + "epoch": 0.8044080770753597, + "grad_norm": 0.12940169870853424, + "learning_rate": 2.2446571778834555e-07, + "loss": 0.0014, + "num_input_tokens_seen": 2995136, + "step": 6095 + }, + { + "epoch": 0.805067968853108, + "grad_norm": 0.075173020362854, + "learning_rate": 2.2301352786570827e-07, + "loss": 0.0009, + "num_input_tokens_seen": 2998016, + "step": 6100 + }, + { + "epoch": 0.8057278606308566, + "grad_norm": 0.03360762447118759, + "learning_rate": 2.215654609269685e-07, + "loss": 0.1425, + "num_input_tokens_seen": 3000832, + "step": 6105 + }, + { + "epoch": 0.8063877524086049, + "grad_norm": 15.617521286010742, + "learning_rate": 2.201215246561161e-07, + "loss": 0.1461, + "num_input_tokens_seen": 3003584, + "step": 6110 + }, + { + "epoch": 0.8070476441863534, + "grad_norm": 62.255313873291016, + "learning_rate": 2.1868172671522357e-07, + "loss": 0.0738, + "num_input_tokens_seen": 3006464, + "step": 6115 + }, + { + "epoch": 0.8077075359641019, + "grad_norm": 0.16907618939876556, + "learning_rate": 2.1724607474440216e-07, + "loss": 0.0824, + "num_input_tokens_seen": 3008896, + "step": 6120 + }, + { + "epoch": 0.8083674277418503, + "grad_norm": 21.065229415893555, + "learning_rate": 2.158145763617646e-07, + "loss": 0.1463, + "num_input_tokens_seen": 3011392, + "step": 6125 + }, + { + "epoch": 0.8090273195195988, + "grad_norm": 0.23459585011005402, + "learning_rate": 2.1438723916338198e-07, + "loss": 0.2764, + "num_input_tokens_seen": 3014016, + "step": 6130 + }, + { + "epoch": 0.8096872112973472, + "grad_norm": 0.41196635365486145, + "learning_rate": 2.1296407072324495e-07, + "loss": 0.1715, + "num_input_tokens_seen": 3016576, + "step": 6135 + }, + { + "epoch": 0.8103471030750957, + "grad_norm": 163.61370849609375, + "learning_rate": 2.1154507859322336e-07, + "loss": 0.0432, + "num_input_tokens_seen": 3019008, + "step": 6140 + }, + { + "epoch": 0.8110069948528441, + "grad_norm": 36.33530044555664, + "learning_rate": 2.101302703030252e-07, + "loss": 0.1229, + "num_input_tokens_seen": 3021504, + "step": 6145 + }, + { + "epoch": 0.8116668866305926, + "grad_norm": 10.142012596130371, + "learning_rate": 2.0871965336015885e-07, + "loss": 0.0575, + "num_input_tokens_seen": 3023552, + "step": 6150 + }, + { + "epoch": 0.812326778408341, + "grad_norm": 0.6284022331237793, + "learning_rate": 2.0731323524989031e-07, + "loss": 0.0704, + "num_input_tokens_seen": 3025856, + "step": 6155 + }, + { + "epoch": 0.8129866701860895, + "grad_norm": 1.1452326774597168, + "learning_rate": 2.0591102343520616e-07, + "loss": 0.2049, + "num_input_tokens_seen": 3028096, + "step": 6160 + }, + { + "epoch": 0.8136465619638379, + "grad_norm": 44.43306350708008, + "learning_rate": 2.0451302535677206e-07, + "loss": 0.159, + "num_input_tokens_seen": 3030528, + "step": 6165 + }, + { + "epoch": 0.8143064537415864, + "grad_norm": 113.5491943359375, + "learning_rate": 2.0311924843289396e-07, + "loss": 0.227, + "num_input_tokens_seen": 3033088, + "step": 6170 + }, + { + "epoch": 0.8149663455193348, + "grad_norm": 0.11563657224178314, + "learning_rate": 2.017297000594794e-07, + "loss": 0.0642, + "num_input_tokens_seen": 3035200, + "step": 6175 + }, + { + "epoch": 0.8156262372970833, + "grad_norm": 0.15644113719463348, + "learning_rate": 2.0034438760999696e-07, + "loss": 0.0604, + "num_input_tokens_seen": 3037696, + "step": 6180 + }, + { + "epoch": 0.8162861290748318, + "grad_norm": 14.904664039611816, + "learning_rate": 1.9896331843543856e-07, + "loss": 0.1423, + "num_input_tokens_seen": 3040128, + "step": 6185 + }, + { + "epoch": 0.8169460208525802, + "grad_norm": 0.2976359724998474, + "learning_rate": 1.975864998642789e-07, + "loss": 0.1184, + "num_input_tokens_seen": 3042560, + "step": 6190 + }, + { + "epoch": 0.8176059126303287, + "grad_norm": 37.44635772705078, + "learning_rate": 1.9621393920243767e-07, + "loss": 0.2826, + "num_input_tokens_seen": 3044800, + "step": 6195 + }, + { + "epoch": 0.8182658044080771, + "grad_norm": 105.21582794189453, + "learning_rate": 1.9484564373324074e-07, + "loss": 0.1028, + "num_input_tokens_seen": 3047040, + "step": 6200 + }, + { + "epoch": 0.8189256961858256, + "grad_norm": 0.048264820128679276, + "learning_rate": 1.934816207173805e-07, + "loss": 0.0495, + "num_input_tokens_seen": 3049600, + "step": 6205 + }, + { + "epoch": 0.819585587963574, + "grad_norm": 0.17185015976428986, + "learning_rate": 1.9212187739287943e-07, + "loss": 0.158, + "num_input_tokens_seen": 3052416, + "step": 6210 + }, + { + "epoch": 0.8202454797413224, + "grad_norm": 0.28126591444015503, + "learning_rate": 1.907664209750488e-07, + "loss": 0.0135, + "num_input_tokens_seen": 3055040, + "step": 6215 + }, + { + "epoch": 0.8209053715190708, + "grad_norm": 0.4188820719718933, + "learning_rate": 1.8941525865645336e-07, + "loss": 0.0446, + "num_input_tokens_seen": 3057856, + "step": 6220 + }, + { + "epoch": 0.8215652632968193, + "grad_norm": 49.17670440673828, + "learning_rate": 1.8806839760687076e-07, + "loss": 0.2045, + "num_input_tokens_seen": 3060160, + "step": 6225 + }, + { + "epoch": 0.8222251550745677, + "grad_norm": 0.09683864563703537, + "learning_rate": 1.867258449732545e-07, + "loss": 0.1205, + "num_input_tokens_seen": 3062592, + "step": 6230 + }, + { + "epoch": 0.8228850468523162, + "grad_norm": 15.184978485107422, + "learning_rate": 1.8538760787969676e-07, + "loss": 0.0502, + "num_input_tokens_seen": 3065088, + "step": 6235 + }, + { + "epoch": 0.8235449386300646, + "grad_norm": 1.2835743427276611, + "learning_rate": 1.8405369342738907e-07, + "loss": 0.0019, + "num_input_tokens_seen": 3067712, + "step": 6240 + }, + { + "epoch": 0.8242048304078131, + "grad_norm": 122.01753997802734, + "learning_rate": 1.8272410869458598e-07, + "loss": 0.0876, + "num_input_tokens_seen": 3070144, + "step": 6245 + }, + { + "epoch": 0.8248647221855616, + "grad_norm": 0.3181722164154053, + "learning_rate": 1.8139886073656653e-07, + "loss": 0.2369, + "num_input_tokens_seen": 3072448, + "step": 6250 + }, + { + "epoch": 0.82552461396331, + "grad_norm": 0.28298959136009216, + "learning_rate": 1.800779565855971e-07, + "loss": 0.2066, + "num_input_tokens_seen": 3075072, + "step": 6255 + }, + { + "epoch": 0.8261845057410585, + "grad_norm": 0.08039449155330658, + "learning_rate": 1.7876140325089463e-07, + "loss": 0.0029, + "num_input_tokens_seen": 3077376, + "step": 6260 + }, + { + "epoch": 0.8268443975188069, + "grad_norm": 0.7492879629135132, + "learning_rate": 1.774492077185883e-07, + "loss": 0.1344, + "num_input_tokens_seen": 3079808, + "step": 6265 + }, + { + "epoch": 0.8275042892965554, + "grad_norm": 0.0355597622692585, + "learning_rate": 1.7614137695168408e-07, + "loss": 0.0009, + "num_input_tokens_seen": 3082560, + "step": 6270 + }, + { + "epoch": 0.8281641810743038, + "grad_norm": 0.3212199807167053, + "learning_rate": 1.748379178900261e-07, + "loss": 0.0705, + "num_input_tokens_seen": 3084608, + "step": 6275 + }, + { + "epoch": 0.8288240728520523, + "grad_norm": 17.22373390197754, + "learning_rate": 1.7353883745026055e-07, + "loss": 0.228, + "num_input_tokens_seen": 3087104, + "step": 6280 + }, + { + "epoch": 0.8294839646298007, + "grad_norm": 1.737422227859497, + "learning_rate": 1.722441425257999e-07, + "loss": 0.1102, + "num_input_tokens_seen": 3089408, + "step": 6285 + }, + { + "epoch": 0.8301438564075492, + "grad_norm": 0.23113372921943665, + "learning_rate": 1.7095383998678402e-07, + "loss": 0.0552, + "num_input_tokens_seen": 3091776, + "step": 6290 + }, + { + "epoch": 0.8308037481852976, + "grad_norm": 0.0835813581943512, + "learning_rate": 1.6966793668004653e-07, + "loss": 0.1083, + "num_input_tokens_seen": 3094208, + "step": 6295 + }, + { + "epoch": 0.8314636399630461, + "grad_norm": 0.1578727513551712, + "learning_rate": 1.6838643942907625e-07, + "loss": 0.0801, + "num_input_tokens_seen": 3096768, + "step": 6300 + }, + { + "epoch": 0.8321235317407946, + "grad_norm": 72.27742004394531, + "learning_rate": 1.671093550339815e-07, + "loss": 0.06, + "num_input_tokens_seen": 3099456, + "step": 6305 + }, + { + "epoch": 0.832783423518543, + "grad_norm": 0.853486180305481, + "learning_rate": 1.6583669027145542e-07, + "loss": 0.0046, + "num_input_tokens_seen": 3102208, + "step": 6310 + }, + { + "epoch": 0.8334433152962915, + "grad_norm": 0.05901863053441048, + "learning_rate": 1.6456845189473767e-07, + "loss": 0.0014, + "num_input_tokens_seen": 3104896, + "step": 6315 + }, + { + "epoch": 0.8341032070740398, + "grad_norm": 0.11249249428510666, + "learning_rate": 1.6330464663358123e-07, + "loss": 0.1178, + "num_input_tokens_seen": 3107520, + "step": 6320 + }, + { + "epoch": 0.8347630988517883, + "grad_norm": 0.057853005826473236, + "learning_rate": 1.6204528119421346e-07, + "loss": 0.0014, + "num_input_tokens_seen": 3110144, + "step": 6325 + }, + { + "epoch": 0.8354229906295367, + "grad_norm": 0.1531873196363449, + "learning_rate": 1.607903622593042e-07, + "loss": 0.0501, + "num_input_tokens_seen": 3112768, + "step": 6330 + }, + { + "epoch": 0.8360828824072852, + "grad_norm": 0.04447514936327934, + "learning_rate": 1.5953989648792743e-07, + "loss": 0.0007, + "num_input_tokens_seen": 3115328, + "step": 6335 + }, + { + "epoch": 0.8367427741850336, + "grad_norm": 0.14438007771968842, + "learning_rate": 1.5829389051552678e-07, + "loss": 0.0323, + "num_input_tokens_seen": 3117888, + "step": 6340 + }, + { + "epoch": 0.8374026659627821, + "grad_norm": 101.97913360595703, + "learning_rate": 1.5705235095388136e-07, + "loss": 0.038, + "num_input_tokens_seen": 3120384, + "step": 6345 + }, + { + "epoch": 0.8380625577405305, + "grad_norm": 0.08870197832584381, + "learning_rate": 1.5581528439106907e-07, + "loss": 0.0436, + "num_input_tokens_seen": 3123008, + "step": 6350 + }, + { + "epoch": 0.838722449518279, + "grad_norm": 0.36987948417663574, + "learning_rate": 1.5458269739143292e-07, + "loss": 0.0796, + "num_input_tokens_seen": 3125504, + "step": 6355 + }, + { + "epoch": 0.8393823412960274, + "grad_norm": 2.8769209384918213, + "learning_rate": 1.5335459649554538e-07, + "loss": 0.0025, + "num_input_tokens_seen": 3127744, + "step": 6360 + }, + { + "epoch": 0.8400422330737759, + "grad_norm": 0.1269061416387558, + "learning_rate": 1.5213098822017357e-07, + "loss": 0.1043, + "num_input_tokens_seen": 3130048, + "step": 6365 + }, + { + "epoch": 0.8407021248515244, + "grad_norm": 0.26083889603614807, + "learning_rate": 1.50911879058246e-07, + "loss": 0.0469, + "num_input_tokens_seen": 3132480, + "step": 6370 + }, + { + "epoch": 0.8413620166292728, + "grad_norm": 107.53121948242188, + "learning_rate": 1.4969727547881628e-07, + "loss": 0.1012, + "num_input_tokens_seen": 3135104, + "step": 6375 + }, + { + "epoch": 0.8420219084070213, + "grad_norm": 14.834338188171387, + "learning_rate": 1.4848718392703052e-07, + "loss": 0.1743, + "num_input_tokens_seen": 3137344, + "step": 6380 + }, + { + "epoch": 0.8426818001847697, + "grad_norm": 16.3470401763916, + "learning_rate": 1.472816108240915e-07, + "loss": 0.1728, + "num_input_tokens_seen": 3140096, + "step": 6385 + }, + { + "epoch": 0.8433416919625182, + "grad_norm": 0.3881288170814514, + "learning_rate": 1.46080562567226e-07, + "loss": 0.0782, + "num_input_tokens_seen": 3142400, + "step": 6390 + }, + { + "epoch": 0.8440015837402666, + "grad_norm": 0.1693449318408966, + "learning_rate": 1.4488404552964993e-07, + "loss": 0.0657, + "num_input_tokens_seen": 3144512, + "step": 6395 + }, + { + "epoch": 0.8446614755180151, + "grad_norm": 68.23955535888672, + "learning_rate": 1.4369206606053463e-07, + "loss": 0.0303, + "num_input_tokens_seen": 3146944, + "step": 6400 + }, + { + "epoch": 0.8453213672957635, + "grad_norm": 0.20148129761219025, + "learning_rate": 1.425046304849742e-07, + "loss": 0.0816, + "num_input_tokens_seen": 3149376, + "step": 6405 + }, + { + "epoch": 0.845981259073512, + "grad_norm": 0.5425065755844116, + "learning_rate": 1.4132174510395024e-07, + "loss": 0.1094, + "num_input_tokens_seen": 3151744, + "step": 6410 + }, + { + "epoch": 0.8466411508512603, + "grad_norm": 0.209197536110878, + "learning_rate": 1.4014341619430003e-07, + "loss": 0.0082, + "num_input_tokens_seen": 3154112, + "step": 6415 + }, + { + "epoch": 0.8473010426290088, + "grad_norm": 0.13178904354572296, + "learning_rate": 1.3896965000868188e-07, + "loss": 0.0082, + "num_input_tokens_seen": 3156480, + "step": 6420 + }, + { + "epoch": 0.8479609344067572, + "grad_norm": 33.16168975830078, + "learning_rate": 1.3780045277554276e-07, + "loss": 0.138, + "num_input_tokens_seen": 3158784, + "step": 6425 + }, + { + "epoch": 0.8486208261845057, + "grad_norm": 0.1764160841703415, + "learning_rate": 1.3663583069908535e-07, + "loss": 0.1674, + "num_input_tokens_seen": 3161152, + "step": 6430 + }, + { + "epoch": 0.8492807179622542, + "grad_norm": 0.02807113528251648, + "learning_rate": 1.3547578995923447e-07, + "loss": 0.0385, + "num_input_tokens_seen": 3163776, + "step": 6435 + }, + { + "epoch": 0.8499406097400026, + "grad_norm": 34.52450180053711, + "learning_rate": 1.3432033671160458e-07, + "loss": 0.1202, + "num_input_tokens_seen": 3166272, + "step": 6440 + }, + { + "epoch": 0.8503365448066517, + "eval_loss": 0.09701072424650192, + "eval_runtime": 7.7873, + "eval_samples_per_second": 864.874, + "eval_steps_per_second": 108.125, + "num_input_tokens_seen": 3167488, + "step": 6443 + }, + { + "epoch": 0.8506005015177511, + "grad_norm": 1.1372543573379517, + "learning_rate": 1.3316947708746762e-07, + "loss": 0.0653, + "num_input_tokens_seen": 3168640, + "step": 6445 + }, + { + "epoch": 0.8512603932954995, + "grad_norm": 0.052345190197229385, + "learning_rate": 1.3202321719371967e-07, + "loss": 0.1256, + "num_input_tokens_seen": 3171008, + "step": 6450 + }, + { + "epoch": 0.851920285073248, + "grad_norm": 0.021850943565368652, + "learning_rate": 1.3088156311284893e-07, + "loss": 0.1099, + "num_input_tokens_seen": 3173312, + "step": 6455 + }, + { + "epoch": 0.8525801768509964, + "grad_norm": 0.08195928484201431, + "learning_rate": 1.2974452090290322e-07, + "loss": 0.2267, + "num_input_tokens_seen": 3175808, + "step": 6460 + }, + { + "epoch": 0.8532400686287449, + "grad_norm": 0.05846588686108589, + "learning_rate": 1.2861209659745865e-07, + "loss": 0.0888, + "num_input_tokens_seen": 3178048, + "step": 6465 + }, + { + "epoch": 0.8538999604064933, + "grad_norm": 0.08579205721616745, + "learning_rate": 1.2748429620558654e-07, + "loss": 0.0148, + "num_input_tokens_seen": 3180544, + "step": 6470 + }, + { + "epoch": 0.8545598521842418, + "grad_norm": 9.61294937133789, + "learning_rate": 1.2636112571182167e-07, + "loss": 0.1561, + "num_input_tokens_seen": 3183040, + "step": 6475 + }, + { + "epoch": 0.8552197439619902, + "grad_norm": 0.1357005089521408, + "learning_rate": 1.2524259107613178e-07, + "loss": 0.1766, + "num_input_tokens_seen": 3185664, + "step": 6480 + }, + { + "epoch": 0.8558796357397387, + "grad_norm": 15.378425598144531, + "learning_rate": 1.2412869823388382e-07, + "loss": 0.146, + "num_input_tokens_seen": 3188672, + "step": 6485 + }, + { + "epoch": 0.8565395275174872, + "grad_norm": 0.08732222765684128, + "learning_rate": 1.2301945309581486e-07, + "loss": 0.0385, + "num_input_tokens_seen": 3191168, + "step": 6490 + }, + { + "epoch": 0.8571994192952356, + "grad_norm": 1.1724306344985962, + "learning_rate": 1.2191486154799846e-07, + "loss": 0.0822, + "num_input_tokens_seen": 3193664, + "step": 6495 + }, + { + "epoch": 0.8578593110729841, + "grad_norm": 0.05099056288599968, + "learning_rate": 1.208149294518147e-07, + "loss": 0.001, + "num_input_tokens_seen": 3196224, + "step": 6500 + }, + { + "epoch": 0.8585192028507325, + "grad_norm": 0.06140226498246193, + "learning_rate": 1.1971966264391954e-07, + "loss": 0.1988, + "num_input_tokens_seen": 3198784, + "step": 6505 + }, + { + "epoch": 0.859179094628481, + "grad_norm": 0.07323023676872253, + "learning_rate": 1.1862906693621233e-07, + "loss": 0.1104, + "num_input_tokens_seen": 3201472, + "step": 6510 + }, + { + "epoch": 0.8598389864062294, + "grad_norm": 0.11436515301465988, + "learning_rate": 1.1754314811580623e-07, + "loss": 0.1169, + "num_input_tokens_seen": 3203584, + "step": 6515 + }, + { + "epoch": 0.8604988781839779, + "grad_norm": 0.09268505871295929, + "learning_rate": 1.1646191194499655e-07, + "loss": 0.0712, + "num_input_tokens_seen": 3205888, + "step": 6520 + }, + { + "epoch": 0.8611587699617262, + "grad_norm": 1.127016544342041, + "learning_rate": 1.1538536416123168e-07, + "loss": 0.1908, + "num_input_tokens_seen": 3208000, + "step": 6525 + }, + { + "epoch": 0.8618186617394747, + "grad_norm": 0.36033815145492554, + "learning_rate": 1.1431351047708072e-07, + "loss": 0.0208, + "num_input_tokens_seen": 3210240, + "step": 6530 + }, + { + "epoch": 0.8624785535172231, + "grad_norm": 46.454463958740234, + "learning_rate": 1.1324635658020432e-07, + "loss": 0.1363, + "num_input_tokens_seen": 3212672, + "step": 6535 + }, + { + "epoch": 0.8631384452949716, + "grad_norm": 9.648067474365234, + "learning_rate": 1.1218390813332479e-07, + "loss": 0.1361, + "num_input_tokens_seen": 3215360, + "step": 6540 + }, + { + "epoch": 0.86379833707272, + "grad_norm": 0.07266692072153091, + "learning_rate": 1.1112617077419472e-07, + "loss": 0.1234, + "num_input_tokens_seen": 3218112, + "step": 6545 + }, + { + "epoch": 0.8644582288504685, + "grad_norm": 0.30243945121765137, + "learning_rate": 1.1007315011556884e-07, + "loss": 0.0346, + "num_input_tokens_seen": 3220288, + "step": 6550 + }, + { + "epoch": 0.865118120628217, + "grad_norm": 0.1822911947965622, + "learning_rate": 1.0902485174517251e-07, + "loss": 0.0015, + "num_input_tokens_seen": 3222976, + "step": 6555 + }, + { + "epoch": 0.8657780124059654, + "grad_norm": 0.06488798558712006, + "learning_rate": 1.0798128122567285e-07, + "loss": 0.0725, + "num_input_tokens_seen": 3225472, + "step": 6560 + }, + { + "epoch": 0.8664379041837139, + "grad_norm": 0.4437669813632965, + "learning_rate": 1.0694244409464992e-07, + "loss": 0.1631, + "num_input_tokens_seen": 3228096, + "step": 6565 + }, + { + "epoch": 0.8670977959614623, + "grad_norm": 0.030641254037618637, + "learning_rate": 1.0590834586456577e-07, + "loss": 0.1158, + "num_input_tokens_seen": 3230720, + "step": 6570 + }, + { + "epoch": 0.8677576877392108, + "grad_norm": 149.95916748046875, + "learning_rate": 1.0487899202273708e-07, + "loss": 0.1239, + "num_input_tokens_seen": 3233088, + "step": 6575 + }, + { + "epoch": 0.8684175795169592, + "grad_norm": 26.74003791809082, + "learning_rate": 1.0385438803130364e-07, + "loss": 0.1255, + "num_input_tokens_seen": 3235712, + "step": 6580 + }, + { + "epoch": 0.8690774712947077, + "grad_norm": 0.10586915165185928, + "learning_rate": 1.0283453932720199e-07, + "loss": 0.1423, + "num_input_tokens_seen": 3238528, + "step": 6585 + }, + { + "epoch": 0.8697373630724561, + "grad_norm": 0.10404416173696518, + "learning_rate": 1.0181945132213476e-07, + "loss": 0.0738, + "num_input_tokens_seen": 3240896, + "step": 6590 + }, + { + "epoch": 0.8703972548502046, + "grad_norm": 0.10048986971378326, + "learning_rate": 1.0080912940254227e-07, + "loss": 0.0016, + "num_input_tokens_seen": 3243392, + "step": 6595 + }, + { + "epoch": 0.871057146627953, + "grad_norm": 0.277065247297287, + "learning_rate": 9.980357892957492e-08, + "loss": 0.0041, + "num_input_tokens_seen": 3245824, + "step": 6600 + }, + { + "epoch": 0.8717170384057015, + "grad_norm": 7.954074859619141, + "learning_rate": 9.880280523906337e-08, + "loss": 0.0031, + "num_input_tokens_seen": 3248128, + "step": 6605 + }, + { + "epoch": 0.8723769301834499, + "grad_norm": 13.635552406311035, + "learning_rate": 9.780681364149091e-08, + "loss": 0.1351, + "num_input_tokens_seen": 3250624, + "step": 6610 + }, + { + "epoch": 0.8730368219611984, + "grad_norm": 0.3135831356048584, + "learning_rate": 9.681560942196587e-08, + "loss": 0.1127, + "num_input_tokens_seen": 3253312, + "step": 6615 + }, + { + "epoch": 0.8736967137389469, + "grad_norm": 0.04287222400307655, + "learning_rate": 9.582919784019194e-08, + "loss": 0.1168, + "num_input_tokens_seen": 3255488, + "step": 6620 + }, + { + "epoch": 0.8743566055166953, + "grad_norm": 0.04105079546570778, + "learning_rate": 9.484758413044236e-08, + "loss": 0.0668, + "num_input_tokens_seen": 3257664, + "step": 6625 + }, + { + "epoch": 0.8750164972944438, + "grad_norm": 0.12905164062976837, + "learning_rate": 9.387077350153017e-08, + "loss": 0.0542, + "num_input_tokens_seen": 3260160, + "step": 6630 + }, + { + "epoch": 0.8756763890721921, + "grad_norm": 13.714322090148926, + "learning_rate": 9.289877113678168e-08, + "loss": 0.0616, + "num_input_tokens_seen": 3262528, + "step": 6635 + }, + { + "epoch": 0.8763362808499406, + "grad_norm": 0.020841121673583984, + "learning_rate": 9.19315821940092e-08, + "loss": 0.0576, + "num_input_tokens_seen": 3265024, + "step": 6640 + }, + { + "epoch": 0.876996172627689, + "grad_norm": 0.17767778038978577, + "learning_rate": 9.096921180548234e-08, + "loss": 0.1659, + "num_input_tokens_seen": 3267456, + "step": 6645 + }, + { + "epoch": 0.8776560644054375, + "grad_norm": 0.2553451657295227, + "learning_rate": 9.001166507790259e-08, + "loss": 0.0915, + "num_input_tokens_seen": 3270208, + "step": 6650 + }, + { + "epoch": 0.8783159561831859, + "grad_norm": 12.365303993225098, + "learning_rate": 8.905894709237427e-08, + "loss": 0.1045, + "num_input_tokens_seen": 3272960, + "step": 6655 + }, + { + "epoch": 0.8789758479609344, + "grad_norm": 0.07707412540912628, + "learning_rate": 8.811106290437975e-08, + "loss": 0.0736, + "num_input_tokens_seen": 3275136, + "step": 6660 + }, + { + "epoch": 0.8796357397386828, + "grad_norm": 169.89840698242188, + "learning_rate": 8.716801754375036e-08, + "loss": 0.1122, + "num_input_tokens_seen": 3277696, + "step": 6665 + }, + { + "epoch": 0.8802956315164313, + "grad_norm": 12.09985065460205, + "learning_rate": 8.62298160146413e-08, + "loss": 0.1268, + "num_input_tokens_seen": 3280064, + "step": 6670 + }, + { + "epoch": 0.8809555232941798, + "grad_norm": 0.17147305607795715, + "learning_rate": 8.529646329550466e-08, + "loss": 0.002, + "num_input_tokens_seen": 3282304, + "step": 6675 + }, + { + "epoch": 0.8816154150719282, + "grad_norm": 0.1868370920419693, + "learning_rate": 8.436796433906235e-08, + "loss": 0.0268, + "num_input_tokens_seen": 3284736, + "step": 6680 + }, + { + "epoch": 0.8822753068496767, + "grad_norm": 16.801742553710938, + "learning_rate": 8.344432407228141e-08, + "loss": 0.0431, + "num_input_tokens_seen": 3287168, + "step": 6685 + }, + { + "epoch": 0.8829351986274251, + "grad_norm": 1.1702982187271118, + "learning_rate": 8.252554739634577e-08, + "loss": 0.0486, + "num_input_tokens_seen": 3289600, + "step": 6690 + }, + { + "epoch": 0.8835950904051736, + "grad_norm": 0.043459370732307434, + "learning_rate": 8.16116391866316e-08, + "loss": 0.0731, + "num_input_tokens_seen": 3292160, + "step": 6695 + }, + { + "epoch": 0.884254982182922, + "grad_norm": 17.922903060913086, + "learning_rate": 8.070260429268172e-08, + "loss": 0.1312, + "num_input_tokens_seen": 3294592, + "step": 6700 + }, + { + "epoch": 0.8849148739606705, + "grad_norm": 129.87510681152344, + "learning_rate": 7.979844753817855e-08, + "loss": 0.0078, + "num_input_tokens_seen": 3296960, + "step": 6705 + }, + { + "epoch": 0.8855747657384189, + "grad_norm": 29.814653396606445, + "learning_rate": 7.889917372091982e-08, + "loss": 0.0772, + "num_input_tokens_seen": 3299200, + "step": 6710 + }, + { + "epoch": 0.8862346575161674, + "grad_norm": 8.637088775634766, + "learning_rate": 7.800478761279183e-08, + "loss": 0.2034, + "num_input_tokens_seen": 3301568, + "step": 6715 + }, + { + "epoch": 0.8868945492939158, + "grad_norm": 11.602696418762207, + "learning_rate": 7.711529395974592e-08, + "loss": 0.1794, + "num_input_tokens_seen": 3304064, + "step": 6720 + }, + { + "epoch": 0.8875544410716643, + "grad_norm": 0.04998312518000603, + "learning_rate": 7.623069748177135e-08, + "loss": 0.1778, + "num_input_tokens_seen": 3306432, + "step": 6725 + }, + { + "epoch": 0.8882143328494126, + "grad_norm": 0.4295664131641388, + "learning_rate": 7.535100287287111e-08, + "loss": 0.1002, + "num_input_tokens_seen": 3308736, + "step": 6730 + }, + { + "epoch": 0.8888742246271611, + "grad_norm": 0.11967656761407852, + "learning_rate": 7.447621480103783e-08, + "loss": 0.0022, + "num_input_tokens_seen": 3311168, + "step": 6735 + }, + { + "epoch": 0.8895341164049096, + "grad_norm": 17.428560256958008, + "learning_rate": 7.360633790822713e-08, + "loss": 0.2822, + "num_input_tokens_seen": 3313664, + "step": 6740 + }, + { + "epoch": 0.890194008182658, + "grad_norm": 0.2180459350347519, + "learning_rate": 7.274137681033498e-08, + "loss": 0.022, + "num_input_tokens_seen": 3316224, + "step": 6745 + }, + { + "epoch": 0.8908538999604065, + "grad_norm": 0.13484865427017212, + "learning_rate": 7.188133609717184e-08, + "loss": 0.0855, + "num_input_tokens_seen": 3318464, + "step": 6750 + }, + { + "epoch": 0.8915137917381549, + "grad_norm": 0.0493309311568737, + "learning_rate": 7.102622033243843e-08, + "loss": 0.0011, + "num_input_tokens_seen": 3320896, + "step": 6755 + }, + { + "epoch": 0.8921736835159034, + "grad_norm": 0.22488893568515778, + "learning_rate": 7.017603405370276e-08, + "loss": 0.1368, + "num_input_tokens_seen": 3323648, + "step": 6760 + }, + { + "epoch": 0.8928335752936518, + "grad_norm": 0.15953336656093597, + "learning_rate": 6.933078177237429e-08, + "loss": 0.1476, + "num_input_tokens_seen": 3326208, + "step": 6765 + }, + { + "epoch": 0.8934934670714003, + "grad_norm": 0.4283379912376404, + "learning_rate": 6.849046797368108e-08, + "loss": 0.0651, + "num_input_tokens_seen": 3328576, + "step": 6770 + }, + { + "epoch": 0.8941533588491487, + "grad_norm": 28.798320770263672, + "learning_rate": 6.765509711664574e-08, + "loss": 0.003, + "num_input_tokens_seen": 3331520, + "step": 6775 + }, + { + "epoch": 0.8948132506268972, + "grad_norm": 0.33185452222824097, + "learning_rate": 6.682467363406174e-08, + "loss": 0.0235, + "num_input_tokens_seen": 3334336, + "step": 6780 + }, + { + "epoch": 0.8954731424046456, + "grad_norm": 0.24480366706848145, + "learning_rate": 6.59992019324701e-08, + "loss": 0.0671, + "num_input_tokens_seen": 3336896, + "step": 6785 + }, + { + "epoch": 0.8961330341823941, + "grad_norm": 9.714395523071289, + "learning_rate": 6.517868639213553e-08, + "loss": 0.1574, + "num_input_tokens_seen": 3339328, + "step": 6790 + }, + { + "epoch": 0.8967929259601425, + "grad_norm": 0.48568111658096313, + "learning_rate": 6.436313136702387e-08, + "loss": 0.0331, + "num_input_tokens_seen": 3341760, + "step": 6795 + }, + { + "epoch": 0.897452817737891, + "grad_norm": 0.3631482720375061, + "learning_rate": 6.355254118477815e-08, + "loss": 0.0527, + "num_input_tokens_seen": 3344448, + "step": 6800 + }, + { + "epoch": 0.8981127095156395, + "grad_norm": 0.10991880297660828, + "learning_rate": 6.274692014669602e-08, + "loss": 0.0009, + "num_input_tokens_seen": 3347008, + "step": 6805 + }, + { + "epoch": 0.8987726012933879, + "grad_norm": 0.15773239731788635, + "learning_rate": 6.194627252770768e-08, + "loss": 0.0008, + "num_input_tokens_seen": 3349824, + "step": 6810 + }, + { + "epoch": 0.8994324930711364, + "grad_norm": 0.0758163183927536, + "learning_rate": 6.115060257635174e-08, + "loss": 0.0687, + "num_input_tokens_seen": 3352320, + "step": 6815 + }, + { + "epoch": 0.9000923848488848, + "grad_norm": 0.21164242923259735, + "learning_rate": 6.035991451475375e-08, + "loss": 0.0013, + "num_input_tokens_seen": 3354688, + "step": 6820 + }, + { + "epoch": 0.9003563415599841, + "eval_loss": 0.09568765014410019, + "eval_runtime": 7.581, + "eval_samples_per_second": 888.409, + "eval_steps_per_second": 111.068, + "num_input_tokens_seen": 3355520, + "step": 6822 + }, + { + "epoch": 0.9007522766266333, + "grad_norm": 0.030890563502907753, + "learning_rate": 5.9574212538603505e-08, + "loss": 0.0891, + "num_input_tokens_seen": 3357056, + "step": 6825 + }, + { + "epoch": 0.9014121684043817, + "grad_norm": 0.39177563786506653, + "learning_rate": 5.879350081713252e-08, + "loss": 0.0683, + "num_input_tokens_seen": 3359488, + "step": 6830 + }, + { + "epoch": 0.9020720601821302, + "grad_norm": 0.23050019145011902, + "learning_rate": 5.8017783493092386e-08, + "loss": 0.2249, + "num_input_tokens_seen": 3361920, + "step": 6835 + }, + { + "epoch": 0.9027319519598785, + "grad_norm": 0.1468856930732727, + "learning_rate": 5.7247064682732104e-08, + "loss": 0.0018, + "num_input_tokens_seen": 3364416, + "step": 6840 + }, + { + "epoch": 0.903391843737627, + "grad_norm": 0.22081144154071808, + "learning_rate": 5.6481348475777566e-08, + "loss": 0.0617, + "num_input_tokens_seen": 3366912, + "step": 6845 + }, + { + "epoch": 0.9040517355153754, + "grad_norm": 0.021701961755752563, + "learning_rate": 5.5720638935407796e-08, + "loss": 0.0014, + "num_input_tokens_seen": 3369088, + "step": 6850 + }, + { + "epoch": 0.9047116272931239, + "grad_norm": 0.013656373135745525, + "learning_rate": 5.49649400982356e-08, + "loss": 0.1392, + "num_input_tokens_seen": 3371520, + "step": 6855 + }, + { + "epoch": 0.9053715190708723, + "grad_norm": 0.04417372867465019, + "learning_rate": 5.421425597428442e-08, + "loss": 0.0007, + "num_input_tokens_seen": 3374080, + "step": 6860 + }, + { + "epoch": 0.9060314108486208, + "grad_norm": 126.34750366210938, + "learning_rate": 5.346859054696784e-08, + "loss": 0.0786, + "num_input_tokens_seen": 3376640, + "step": 6865 + }, + { + "epoch": 0.9066913026263693, + "grad_norm": 0.02389339543879032, + "learning_rate": 5.2727947773068773e-08, + "loss": 0.0794, + "num_input_tokens_seen": 3379072, + "step": 6870 + }, + { + "epoch": 0.9073511944041177, + "grad_norm": 0.42352914810180664, + "learning_rate": 5.199233158271732e-08, + "loss": 0.0732, + "num_input_tokens_seen": 3381696, + "step": 6875 + }, + { + "epoch": 0.9080110861818662, + "grad_norm": 11.932153701782227, + "learning_rate": 5.126174587937149e-08, + "loss": 0.2058, + "num_input_tokens_seen": 3384064, + "step": 6880 + }, + { + "epoch": 0.9086709779596146, + "grad_norm": 0.0787430927157402, + "learning_rate": 5.053619453979485e-08, + "loss": 0.0036, + "num_input_tokens_seen": 3386304, + "step": 6885 + }, + { + "epoch": 0.9093308697373631, + "grad_norm": 0.03228071704506874, + "learning_rate": 4.9815681414037025e-08, + "loss": 0.1486, + "num_input_tokens_seen": 3388800, + "step": 6890 + }, + { + "epoch": 0.9099907615151115, + "grad_norm": 0.38972869515419006, + "learning_rate": 4.910021032541334e-08, + "loss": 0.0886, + "num_input_tokens_seen": 3391232, + "step": 6895 + }, + { + "epoch": 0.91065065329286, + "grad_norm": 29.313077926635742, + "learning_rate": 4.838978507048319e-08, + "loss": 0.0815, + "num_input_tokens_seen": 3393664, + "step": 6900 + }, + { + "epoch": 0.9113105450706084, + "grad_norm": 2.1044397354125977, + "learning_rate": 4.768440941903207e-08, + "loss": 0.0055, + "num_input_tokens_seen": 3395968, + "step": 6905 + }, + { + "epoch": 0.9119704368483569, + "grad_norm": 0.08754704892635345, + "learning_rate": 4.698408711404944e-08, + "loss": 0.0122, + "num_input_tokens_seen": 3398272, + "step": 6910 + }, + { + "epoch": 0.9126303286261053, + "grad_norm": 19.10022735595703, + "learning_rate": 4.628882187171046e-08, + "loss": 0.0763, + "num_input_tokens_seen": 3400960, + "step": 6915 + }, + { + "epoch": 0.9132902204038538, + "grad_norm": 20.788782119750977, + "learning_rate": 4.559861738135506e-08, + "loss": 0.155, + "num_input_tokens_seen": 3403520, + "step": 6920 + }, + { + "epoch": 0.9139501121816023, + "grad_norm": 1.3679563999176025, + "learning_rate": 4.491347730546913e-08, + "loss": 0.229, + "num_input_tokens_seen": 3405952, + "step": 6925 + }, + { + "epoch": 0.9146100039593507, + "grad_norm": 18.096542358398438, + "learning_rate": 4.423340527966512e-08, + "loss": 0.128, + "num_input_tokens_seen": 3408320, + "step": 6930 + }, + { + "epoch": 0.9152698957370992, + "grad_norm": 0.17555084824562073, + "learning_rate": 4.355840491266205e-08, + "loss": 0.0052, + "num_input_tokens_seen": 3410880, + "step": 6935 + }, + { + "epoch": 0.9159297875148475, + "grad_norm": 0.056320879608392715, + "learning_rate": 4.288847978626686e-08, + "loss": 0.0576, + "num_input_tokens_seen": 3413440, + "step": 6940 + }, + { + "epoch": 0.916589679292596, + "grad_norm": 26.998863220214844, + "learning_rate": 4.222363345535585e-08, + "loss": 0.1275, + "num_input_tokens_seen": 3416000, + "step": 6945 + }, + { + "epoch": 0.9172495710703444, + "grad_norm": 12.58722972869873, + "learning_rate": 4.1563869447854505e-08, + "loss": 0.1253, + "num_input_tokens_seen": 3418240, + "step": 6950 + }, + { + "epoch": 0.9179094628480929, + "grad_norm": 0.30387794971466064, + "learning_rate": 4.090919126472048e-08, + "loss": 0.1407, + "num_input_tokens_seen": 3420672, + "step": 6955 + }, + { + "epoch": 0.9185693546258413, + "grad_norm": 65.01815795898438, + "learning_rate": 4.025960237992332e-08, + "loss": 0.0538, + "num_input_tokens_seen": 3422912, + "step": 6960 + }, + { + "epoch": 0.9192292464035898, + "grad_norm": 7.150808334350586, + "learning_rate": 3.961510624042741e-08, + "loss": 0.0027, + "num_input_tokens_seen": 3425408, + "step": 6965 + }, + { + "epoch": 0.9198891381813382, + "grad_norm": 14.636774063110352, + "learning_rate": 3.8975706266172636e-08, + "loss": 0.1111, + "num_input_tokens_seen": 3427776, + "step": 6970 + }, + { + "epoch": 0.9205490299590867, + "grad_norm": 41.149513244628906, + "learning_rate": 3.834140585005696e-08, + "loss": 0.0538, + "num_input_tokens_seen": 3430336, + "step": 6975 + }, + { + "epoch": 0.9212089217368351, + "grad_norm": 37.5268669128418, + "learning_rate": 3.771220835791844e-08, + "loss": 0.2688, + "num_input_tokens_seen": 3432896, + "step": 6980 + }, + { + "epoch": 0.9218688135145836, + "grad_norm": 0.18734599649906158, + "learning_rate": 3.708811712851634e-08, + "loss": 0.0703, + "num_input_tokens_seen": 3435136, + "step": 6985 + }, + { + "epoch": 0.9225287052923321, + "grad_norm": 0.09961698204278946, + "learning_rate": 3.6469135473514936e-08, + "loss": 0.0604, + "num_input_tokens_seen": 3437824, + "step": 6990 + }, + { + "epoch": 0.9231885970700805, + "grad_norm": 0.04659373685717583, + "learning_rate": 3.5855266677464744e-08, + "loss": 0.0066, + "num_input_tokens_seen": 3440320, + "step": 6995 + }, + { + "epoch": 0.923848488847829, + "grad_norm": 0.21239009499549866, + "learning_rate": 3.524651399778555e-08, + "loss": 0.0499, + "num_input_tokens_seen": 3442880, + "step": 7000 + }, + { + "epoch": 0.9245083806255774, + "grad_norm": 0.08486049622297287, + "learning_rate": 3.4642880664749296e-08, + "loss": 0.0009, + "num_input_tokens_seen": 3445120, + "step": 7005 + }, + { + "epoch": 0.9251682724033259, + "grad_norm": 0.2830374538898468, + "learning_rate": 3.404436988146242e-08, + "loss": 0.1758, + "num_input_tokens_seen": 3447424, + "step": 7010 + }, + { + "epoch": 0.9258281641810743, + "grad_norm": 0.012739721685647964, + "learning_rate": 3.345098482384956e-08, + "loss": 0.0461, + "num_input_tokens_seen": 3449920, + "step": 7015 + }, + { + "epoch": 0.9264880559588228, + "grad_norm": 0.5981858968734741, + "learning_rate": 3.2862728640636105e-08, + "loss": 0.0499, + "num_input_tokens_seen": 3452416, + "step": 7020 + }, + { + "epoch": 0.9271479477365712, + "grad_norm": 16.553138732910156, + "learning_rate": 3.227960445333155e-08, + "loss": 0.1119, + "num_input_tokens_seen": 3454912, + "step": 7025 + }, + { + "epoch": 0.9278078395143197, + "grad_norm": 0.03474080190062523, + "learning_rate": 3.1701615356213295e-08, + "loss": 0.0654, + "num_input_tokens_seen": 3457472, + "step": 7030 + }, + { + "epoch": 0.928467731292068, + "grad_norm": 0.11611025035381317, + "learning_rate": 3.112876441630985e-08, + "loss": 0.0654, + "num_input_tokens_seen": 3459712, + "step": 7035 + }, + { + "epoch": 0.9291276230698166, + "grad_norm": 0.19927047193050385, + "learning_rate": 3.05610546733851e-08, + "loss": 0.0532, + "num_input_tokens_seen": 3462144, + "step": 7040 + }, + { + "epoch": 0.9297875148475649, + "grad_norm": 13.10682201385498, + "learning_rate": 2.99984891399212e-08, + "loss": 0.2881, + "num_input_tokens_seen": 3464512, + "step": 7045 + }, + { + "epoch": 0.9304474066253134, + "grad_norm": 0.17246191203594208, + "learning_rate": 2.9441070801103808e-08, + "loss": 0.0061, + "num_input_tokens_seen": 3466880, + "step": 7050 + }, + { + "epoch": 0.931107298403062, + "grad_norm": 0.28195682168006897, + "learning_rate": 2.8888802614805085e-08, + "loss": 0.1035, + "num_input_tokens_seen": 3469248, + "step": 7055 + }, + { + "epoch": 0.9317671901808103, + "grad_norm": 41.38626480102539, + "learning_rate": 2.8341687511568734e-08, + "loss": 0.2707, + "num_input_tokens_seen": 3471616, + "step": 7060 + }, + { + "epoch": 0.9324270819585588, + "grad_norm": 0.20374363660812378, + "learning_rate": 2.7799728394594547e-08, + "loss": 0.0773, + "num_input_tokens_seen": 3474240, + "step": 7065 + }, + { + "epoch": 0.9330869737363072, + "grad_norm": 0.10206926614046097, + "learning_rate": 2.7262928139722198e-08, + "loss": 0.0759, + "num_input_tokens_seen": 3476800, + "step": 7070 + }, + { + "epoch": 0.9337468655140557, + "grad_norm": 0.04854326695203781, + "learning_rate": 2.673128959541693e-08, + "loss": 0.0879, + "num_input_tokens_seen": 3479488, + "step": 7075 + }, + { + "epoch": 0.9344067572918041, + "grad_norm": 0.021472515538334846, + "learning_rate": 2.620481558275367e-08, + "loss": 0.0007, + "num_input_tokens_seen": 3482176, + "step": 7080 + }, + { + "epoch": 0.9350666490695526, + "grad_norm": 69.08782958984375, + "learning_rate": 2.5683508895402382e-08, + "loss": 0.0318, + "num_input_tokens_seen": 3484800, + "step": 7085 + }, + { + "epoch": 0.935726540847301, + "grad_norm": 0.1581341028213501, + "learning_rate": 2.5167372299613853e-08, + "loss": 0.1076, + "num_input_tokens_seen": 3487488, + "step": 7090 + }, + { + "epoch": 0.9363864326250495, + "grad_norm": 11.627638816833496, + "learning_rate": 2.4656408534203365e-08, + "loss": 0.238, + "num_input_tokens_seen": 3489728, + "step": 7095 + }, + { + "epoch": 0.9370463244027979, + "grad_norm": 0.025092612951993942, + "learning_rate": 2.4150620310538273e-08, + "loss": 0.2424, + "num_input_tokens_seen": 3491904, + "step": 7100 + }, + { + "epoch": 0.9377062161805464, + "grad_norm": 12.157607078552246, + "learning_rate": 2.3650010312521673e-08, + "loss": 0.0751, + "num_input_tokens_seen": 3494592, + "step": 7105 + }, + { + "epoch": 0.9383661079582949, + "grad_norm": 0.0817142128944397, + "learning_rate": 2.3154581196579648e-08, + "loss": 0.1782, + "num_input_tokens_seen": 3497088, + "step": 7110 + }, + { + "epoch": 0.9390259997360433, + "grad_norm": 0.06925242394208908, + "learning_rate": 2.2664335591646377e-08, + "loss": 0.0552, + "num_input_tokens_seen": 3499520, + "step": 7115 + }, + { + "epoch": 0.9396858915137918, + "grad_norm": 0.029523100703954697, + "learning_rate": 2.2179276099150158e-08, + "loss": 0.1962, + "num_input_tokens_seen": 3502208, + "step": 7120 + }, + { + "epoch": 0.9403457832915402, + "grad_norm": 121.08486938476562, + "learning_rate": 2.1699405293000182e-08, + "loss": 0.1811, + "num_input_tokens_seen": 3504640, + "step": 7125 + }, + { + "epoch": 0.9410056750692887, + "grad_norm": 0.1253107488155365, + "learning_rate": 2.1224725719572235e-08, + "loss": 0.0653, + "num_input_tokens_seen": 3506944, + "step": 7130 + }, + { + "epoch": 0.9416655668470371, + "grad_norm": 46.052162170410156, + "learning_rate": 2.0755239897695453e-08, + "loss": 0.1533, + "num_input_tokens_seen": 3509376, + "step": 7135 + }, + { + "epoch": 0.9423254586247856, + "grad_norm": 0.4726586639881134, + "learning_rate": 2.0290950318639256e-08, + "loss": 0.1645, + "num_input_tokens_seen": 3511680, + "step": 7140 + }, + { + "epoch": 0.942985350402534, + "grad_norm": 3.1492843627929688, + "learning_rate": 1.983185944609944e-08, + "loss": 0.0611, + "num_input_tokens_seen": 3514112, + "step": 7145 + }, + { + "epoch": 0.9436452421802825, + "grad_norm": 0.20620296895503998, + "learning_rate": 1.9377969716185994e-08, + "loss": 0.0665, + "num_input_tokens_seen": 3516480, + "step": 7150 + }, + { + "epoch": 0.9443051339580308, + "grad_norm": 0.07421538978815079, + "learning_rate": 1.8929283537408968e-08, + "loss": 0.1162, + "num_input_tokens_seen": 3518720, + "step": 7155 + }, + { + "epoch": 0.9449650257357793, + "grad_norm": 0.12716051936149597, + "learning_rate": 1.848580329066718e-08, + "loss": 0.0086, + "num_input_tokens_seen": 3521216, + "step": 7160 + }, + { + "epoch": 0.9456249175135277, + "grad_norm": 18.4110164642334, + "learning_rate": 1.804753132923431e-08, + "loss": 0.3859, + "num_input_tokens_seen": 3523776, + "step": 7165 + }, + { + "epoch": 0.9462848092912762, + "grad_norm": 0.3863736093044281, + "learning_rate": 1.7614469978746827e-08, + "loss": 0.0012, + "num_input_tokens_seen": 3526272, + "step": 7170 + }, + { + "epoch": 0.9469447010690247, + "grad_norm": 67.91719818115234, + "learning_rate": 1.7186621537192304e-08, + "loss": 0.0324, + "num_input_tokens_seen": 3528576, + "step": 7175 + }, + { + "epoch": 0.9476045928467731, + "grad_norm": 0.12099135667085648, + "learning_rate": 1.6763988274896003e-08, + "loss": 0.0012, + "num_input_tokens_seen": 3531136, + "step": 7180 + }, + { + "epoch": 0.9482644846245216, + "grad_norm": 14.467368125915527, + "learning_rate": 1.6346572434509876e-08, + "loss": 0.1503, + "num_input_tokens_seen": 3533696, + "step": 7185 + }, + { + "epoch": 0.94892437640227, + "grad_norm": 0.32109934091567993, + "learning_rate": 1.5934376231000248e-08, + "loss": 0.1569, + "num_input_tokens_seen": 3536064, + "step": 7190 + }, + { + "epoch": 0.9495842681800185, + "grad_norm": 0.11669757217168808, + "learning_rate": 1.55274018516357e-08, + "loss": 0.0044, + "num_input_tokens_seen": 3538432, + "step": 7195 + }, + { + "epoch": 0.9502441599577669, + "grad_norm": 0.06187443807721138, + "learning_rate": 1.512565145597633e-08, + "loss": 0.05, + "num_input_tokens_seen": 3541120, + "step": 7200 + }, + { + "epoch": 0.9503761383133166, + "eval_loss": 0.09555233269929886, + "eval_runtime": 7.635, + "eval_samples_per_second": 882.126, + "eval_steps_per_second": 110.282, + "num_input_tokens_seen": 3541632, + "step": 7201 + }, + { + "epoch": 0.9509040517355154, + "grad_norm": 72.59228515625, + "learning_rate": 1.47291271758615e-08, + "loss": 0.0498, + "num_input_tokens_seen": 3543680, + "step": 7205 + }, + { + "epoch": 0.9515639435132638, + "grad_norm": 107.97425079345703, + "learning_rate": 1.4337831115398991e-08, + "loss": 0.1477, + "num_input_tokens_seen": 3545984, + "step": 7210 + }, + { + "epoch": 0.9522238352910123, + "grad_norm": 17.91878318786621, + "learning_rate": 1.3951765350953548e-08, + "loss": 0.1276, + "num_input_tokens_seen": 3548544, + "step": 7215 + }, + { + "epoch": 0.9528837270687607, + "grad_norm": 0.03271764516830444, + "learning_rate": 1.3570931931136009e-08, + "loss": 0.1596, + "num_input_tokens_seen": 3551040, + "step": 7220 + }, + { + "epoch": 0.9535436188465092, + "grad_norm": 0.11517995595932007, + "learning_rate": 1.3195332876792532e-08, + "loss": 0.0839, + "num_input_tokens_seen": 3553536, + "step": 7225 + }, + { + "epoch": 0.9542035106242576, + "grad_norm": 0.08386459946632385, + "learning_rate": 1.2824970180993488e-08, + "loss": 0.1149, + "num_input_tokens_seen": 3555712, + "step": 7230 + }, + { + "epoch": 0.9548634024020061, + "grad_norm": 0.11899381130933762, + "learning_rate": 1.2459845809023484e-08, + "loss": 0.1233, + "num_input_tokens_seen": 3558080, + "step": 7235 + }, + { + "epoch": 0.9555232941797546, + "grad_norm": 25.366180419921875, + "learning_rate": 1.2099961698370353e-08, + "loss": 0.3036, + "num_input_tokens_seen": 3560640, + "step": 7240 + }, + { + "epoch": 0.956183185957503, + "grad_norm": 11.879151344299316, + "learning_rate": 1.1745319758715288e-08, + "loss": 0.0906, + "num_input_tokens_seen": 3563392, + "step": 7245 + }, + { + "epoch": 0.9568430777352515, + "grad_norm": 15.595983505249023, + "learning_rate": 1.1395921871922509e-08, + "loss": 0.1414, + "num_input_tokens_seen": 3565824, + "step": 7250 + }, + { + "epoch": 0.9575029695129998, + "grad_norm": 0.1044035479426384, + "learning_rate": 1.105176989202905e-08, + "loss": 0.0009, + "num_input_tokens_seen": 3568256, + "step": 7255 + }, + { + "epoch": 0.9581628612907483, + "grad_norm": 194.96958923339844, + "learning_rate": 1.0712865645235659e-08, + "loss": 0.0157, + "num_input_tokens_seen": 3570752, + "step": 7260 + }, + { + "epoch": 0.9588227530684967, + "grad_norm": 0.13970717787742615, + "learning_rate": 1.0379210929896131e-08, + "loss": 0.0805, + "num_input_tokens_seen": 3572928, + "step": 7265 + }, + { + "epoch": 0.9594826448462452, + "grad_norm": 26.312641143798828, + "learning_rate": 1.0050807516508553e-08, + "loss": 0.2674, + "num_input_tokens_seen": 3575296, + "step": 7270 + }, + { + "epoch": 0.9601425366239936, + "grad_norm": 0.14852392673492432, + "learning_rate": 9.727657147705737e-09, + "loss": 0.0011, + "num_input_tokens_seen": 3577664, + "step": 7275 + }, + { + "epoch": 0.9608024284017421, + "grad_norm": 0.4168717563152313, + "learning_rate": 9.409761538245575e-09, + "loss": 0.1992, + "num_input_tokens_seen": 3580160, + "step": 7280 + }, + { + "epoch": 0.9614623201794905, + "grad_norm": 0.20540349185466766, + "learning_rate": 9.097122375002264e-09, + "loss": 0.0761, + "num_input_tokens_seen": 3582464, + "step": 7285 + }, + { + "epoch": 0.962122211957239, + "grad_norm": 0.28160926699638367, + "learning_rate": 8.789741316957312e-09, + "loss": 0.1308, + "num_input_tokens_seen": 3584896, + "step": 7290 + }, + { + "epoch": 0.9627821037349875, + "grad_norm": 35.051509857177734, + "learning_rate": 8.487619995190986e-09, + "loss": 0.005, + "num_input_tokens_seen": 3587584, + "step": 7295 + }, + { + "epoch": 0.9634419955127359, + "grad_norm": 19.22887420654297, + "learning_rate": 8.19076001287311e-09, + "loss": 0.1393, + "num_input_tokens_seen": 3590144, + "step": 7300 + }, + { + "epoch": 0.9641018872904844, + "grad_norm": 0.0941128209233284, + "learning_rate": 7.899162945254945e-09, + "loss": 0.0012, + "num_input_tokens_seen": 3592832, + "step": 7305 + }, + { + "epoch": 0.9647617790682328, + "grad_norm": 35.726531982421875, + "learning_rate": 7.612830339660758e-09, + "loss": 0.0509, + "num_input_tokens_seen": 3595456, + "step": 7310 + }, + { + "epoch": 0.9654216708459813, + "grad_norm": 20.33081817626953, + "learning_rate": 7.3317637154796105e-09, + "loss": 0.1043, + "num_input_tokens_seen": 3597888, + "step": 7315 + }, + { + "epoch": 0.9660815626237297, + "grad_norm": 19.57093048095703, + "learning_rate": 7.0559645641572465e-09, + "loss": 0.0687, + "num_input_tokens_seen": 3600384, + "step": 7320 + }, + { + "epoch": 0.9667414544014782, + "grad_norm": 19.5794677734375, + "learning_rate": 6.785434349188102e-09, + "loss": 0.1628, + "num_input_tokens_seen": 3602880, + "step": 7325 + }, + { + "epoch": 0.9674013461792266, + "grad_norm": 0.20137454569339752, + "learning_rate": 6.520174506107867e-09, + "loss": 0.0423, + "num_input_tokens_seen": 3605248, + "step": 7330 + }, + { + "epoch": 0.9680612379569751, + "grad_norm": 0.1589362919330597, + "learning_rate": 6.260186442485494e-09, + "loss": 0.0011, + "num_input_tokens_seen": 3607808, + "step": 7335 + }, + { + "epoch": 0.9687211297347235, + "grad_norm": 6.267104148864746, + "learning_rate": 6.005471537915863e-09, + "loss": 0.1108, + "num_input_tokens_seen": 3610112, + "step": 7340 + }, + { + "epoch": 0.969381021512472, + "grad_norm": 6.449219226837158, + "learning_rate": 5.756031144012685e-09, + "loss": 0.0454, + "num_input_tokens_seen": 3612352, + "step": 7345 + }, + { + "epoch": 0.9700409132902204, + "grad_norm": 17.85236930847168, + "learning_rate": 5.511866584400837e-09, + "loss": 0.1715, + "num_input_tokens_seen": 3614848, + "step": 7350 + }, + { + "epoch": 0.9707008050679689, + "grad_norm": 0.49963685870170593, + "learning_rate": 5.2729791547097e-09, + "loss": 0.0017, + "num_input_tokens_seen": 3617408, + "step": 7355 + }, + { + "epoch": 0.9713606968457174, + "grad_norm": 20.02973175048828, + "learning_rate": 5.039370122566389e-09, + "loss": 0.0783, + "num_input_tokens_seen": 3619968, + "step": 7360 + }, + { + "epoch": 0.9720205886234657, + "grad_norm": 0.46782761812210083, + "learning_rate": 4.811040727588755e-09, + "loss": 0.0965, + "num_input_tokens_seen": 3622016, + "step": 7365 + }, + { + "epoch": 0.9726804804012142, + "grad_norm": 14.681106567382812, + "learning_rate": 4.58799218137873e-09, + "loss": 0.1156, + "num_input_tokens_seen": 3624192, + "step": 7370 + }, + { + "epoch": 0.9733403721789626, + "grad_norm": 0.10944530367851257, + "learning_rate": 4.370225667516325e-09, + "loss": 0.0009, + "num_input_tokens_seen": 3626624, + "step": 7375 + }, + { + "epoch": 0.9740002639567111, + "grad_norm": 10.72696304321289, + "learning_rate": 4.157742341552861e-09, + "loss": 0.1827, + "num_input_tokens_seen": 3628928, + "step": 7380 + }, + { + "epoch": 0.9746601557344595, + "grad_norm": 0.05703306198120117, + "learning_rate": 3.950543331005307e-09, + "loss": 0.0786, + "num_input_tokens_seen": 3631552, + "step": 7385 + }, + { + "epoch": 0.975320047512208, + "grad_norm": 0.22080153226852417, + "learning_rate": 3.748629735349839e-09, + "loss": 0.0009, + "num_input_tokens_seen": 3633984, + "step": 7390 + }, + { + "epoch": 0.9759799392899564, + "grad_norm": 0.05749522149562836, + "learning_rate": 3.552002626016293e-09, + "loss": 0.1332, + "num_input_tokens_seen": 3636224, + "step": 7395 + }, + { + "epoch": 0.9766398310677049, + "grad_norm": 0.09394296258687973, + "learning_rate": 3.3606630463824947e-09, + "loss": 0.2453, + "num_input_tokens_seen": 3638656, + "step": 7400 + }, + { + "epoch": 0.9772997228454533, + "grad_norm": 0.21530580520629883, + "learning_rate": 3.174612011768607e-09, + "loss": 0.0011, + "num_input_tokens_seen": 3641408, + "step": 7405 + }, + { + "epoch": 0.9779596146232018, + "grad_norm": 11.811311721801758, + "learning_rate": 2.9938505094316834e-09, + "loss": 0.1615, + "num_input_tokens_seen": 3643840, + "step": 7410 + }, + { + "epoch": 0.9786195064009502, + "grad_norm": 0.10265132784843445, + "learning_rate": 2.8183794985605637e-09, + "loss": 0.0006, + "num_input_tokens_seen": 3646336, + "step": 7415 + }, + { + "epoch": 0.9792793981786987, + "grad_norm": 0.0780910775065422, + "learning_rate": 2.6481999102707654e-09, + "loss": 0.0664, + "num_input_tokens_seen": 3648960, + "step": 7420 + }, + { + "epoch": 0.9799392899564472, + "grad_norm": 0.4066920280456543, + "learning_rate": 2.4833126475994894e-09, + "loss": 0.0011, + "num_input_tokens_seen": 3651200, + "step": 7425 + }, + { + "epoch": 0.9805991817341956, + "grad_norm": 7.960684776306152, + "learning_rate": 2.3237185855008443e-09, + "loss": 0.0056, + "num_input_tokens_seen": 3653504, + "step": 7430 + }, + { + "epoch": 0.9812590735119441, + "grad_norm": 62.566429138183594, + "learning_rate": 2.1694185708414083e-09, + "loss": 0.2456, + "num_input_tokens_seen": 3656064, + "step": 7435 + }, + { + "epoch": 0.9819189652896925, + "grad_norm": 156.15281677246094, + "learning_rate": 2.0204134223952284e-09, + "loss": 0.2749, + "num_input_tokens_seen": 3658112, + "step": 7440 + }, + { + "epoch": 0.982578857067441, + "grad_norm": 16.413667678833008, + "learning_rate": 1.87670393083994e-09, + "loss": 0.1727, + "num_input_tokens_seen": 3660928, + "step": 7445 + }, + { + "epoch": 0.9832387488451894, + "grad_norm": 23.082048416137695, + "learning_rate": 1.7382908587525447e-09, + "loss": 0.0298, + "num_input_tokens_seen": 3663232, + "step": 7450 + }, + { + "epoch": 0.9838986406229379, + "grad_norm": 0.05155172944068909, + "learning_rate": 1.6051749406049697e-09, + "loss": 0.0013, + "num_input_tokens_seen": 3665600, + "step": 7455 + }, + { + "epoch": 0.9845585324006862, + "grad_norm": 0.047289684414863586, + "learning_rate": 1.4773568827607386e-09, + "loss": 0.0008, + "num_input_tokens_seen": 3668096, + "step": 7460 + }, + { + "epoch": 0.9852184241784347, + "grad_norm": 0.13699422776699066, + "learning_rate": 1.354837363470529e-09, + "loss": 0.0016, + "num_input_tokens_seen": 3670656, + "step": 7465 + }, + { + "epoch": 0.9858783159561831, + "grad_norm": 117.30753326416016, + "learning_rate": 1.23761703286962e-09, + "loss": 0.1466, + "num_input_tokens_seen": 3673024, + "step": 7470 + }, + { + "epoch": 0.9865382077339316, + "grad_norm": 0.04031828045845032, + "learning_rate": 1.1256965129730068e-09, + "loss": 0.0012, + "num_input_tokens_seen": 3675712, + "step": 7475 + }, + { + "epoch": 0.9871980995116801, + "grad_norm": 0.14629097282886505, + "learning_rate": 1.0190763976734018e-09, + "loss": 0.1029, + "num_input_tokens_seen": 3678080, + "step": 7480 + }, + { + "epoch": 0.9878579912894285, + "grad_norm": 59.77119064331055, + "learning_rate": 9.177572527375721e-10, + "loss": 0.1536, + "num_input_tokens_seen": 3680448, + "step": 7485 + }, + { + "epoch": 0.988517883067177, + "grad_norm": 9.770659446716309, + "learning_rate": 8.217396158030076e-10, + "loss": 0.0019, + "num_input_tokens_seen": 3682752, + "step": 7490 + }, + { + "epoch": 0.9891777748449254, + "grad_norm": 77.23456573486328, + "learning_rate": 7.310239963755904e-10, + "loss": 0.1517, + "num_input_tokens_seen": 3685376, + "step": 7495 + }, + { + "epoch": 0.9898376666226739, + "grad_norm": 0.24194952845573425, + "learning_rate": 6.456108758268186e-10, + "loss": 0.0016, + "num_input_tokens_seen": 3687744, + "step": 7500 + }, + { + "epoch": 0.9904975584004223, + "grad_norm": 49.376041412353516, + "learning_rate": 5.655007073909202e-10, + "loss": 0.1517, + "num_input_tokens_seen": 3690240, + "step": 7505 + }, + { + "epoch": 0.9911574501781708, + "grad_norm": 0.04725305363535881, + "learning_rate": 4.906939161627432e-10, + "loss": 0.0507, + "num_input_tokens_seen": 3692736, + "step": 7510 + }, + { + "epoch": 0.9918173419559192, + "grad_norm": 14.56191635131836, + "learning_rate": 4.2119089909542495e-10, + "loss": 0.201, + "num_input_tokens_seen": 3695360, + "step": 7515 + }, + { + "epoch": 0.9924772337336677, + "grad_norm": 0.656075119972229, + "learning_rate": 3.569920249981706e-10, + "loss": 0.0593, + "num_input_tokens_seen": 3697856, + "step": 7520 + }, + { + "epoch": 0.9931371255114161, + "grad_norm": 1.89599609375, + "learning_rate": 2.980976345344777e-10, + "loss": 0.027, + "num_input_tokens_seen": 3700224, + "step": 7525 + }, + { + "epoch": 0.9937970172891646, + "grad_norm": 0.08695349097251892, + "learning_rate": 2.445080402202482e-10, + "loss": 0.0772, + "num_input_tokens_seen": 3702592, + "step": 7530 + }, + { + "epoch": 0.994456909066913, + "grad_norm": 0.4214935898780823, + "learning_rate": 1.962235264222345e-10, + "loss": 0.1564, + "num_input_tokens_seen": 3704896, + "step": 7535 + }, + { + "epoch": 0.9951168008446615, + "grad_norm": 4.679486274719238, + "learning_rate": 1.5324434935615195e-10, + "loss": 0.0446, + "num_input_tokens_seen": 3707264, + "step": 7540 + }, + { + "epoch": 0.99577669262241, + "grad_norm": 11.486593246459961, + "learning_rate": 1.1557073708579057e-10, + "loss": 0.1154, + "num_input_tokens_seen": 3709824, + "step": 7545 + }, + { + "epoch": 0.9964365844001584, + "grad_norm": 1.4587411880493164, + "learning_rate": 8.320288952168297e-11, + "loss": 0.1286, + "num_input_tokens_seen": 3712192, + "step": 7550 + }, + { + "epoch": 0.9970964761779069, + "grad_norm": 97.42974090576172, + "learning_rate": 5.614097841988297e-11, + "loss": 0.0832, + "num_input_tokens_seen": 3714880, + "step": 7555 + }, + { + "epoch": 0.9977563679556553, + "grad_norm": 0.4481765627861023, + "learning_rate": 3.43851473808554e-11, + "loss": 0.0311, + "num_input_tokens_seen": 3717184, + "step": 7560 + }, + { + "epoch": 0.9984162597334038, + "grad_norm": 0.09104231745004654, + "learning_rate": 1.7935511849587192e-11, + "loss": 0.0522, + "num_input_tokens_seen": 3719424, + "step": 7565 + }, + { + "epoch": 0.9990761515111521, + "grad_norm": 0.0702981948852539, + "learning_rate": 6.792159113921947e-12, + "loss": 0.1208, + "num_input_tokens_seen": 3721920, + "step": 7570 + }, + { + "epoch": 0.9997360432889006, + "grad_norm": 39.189273834228516, + "learning_rate": 9.55148304560005e-13, + "loss": 0.1169, + "num_input_tokens_seen": 3724288, + "step": 7575 + }, + { + "epoch": 1.0, + "num_input_tokens_seen": 3725120, + "step": 7577, + "total_flos": 2.175051626840064e+16, + "train_loss": 0.12470523826255549, + "train_runtime": 1215.5483, + "train_samples_per_second": 49.866, + "train_steps_per_second": 6.233 + } + ], + "logging_steps": 5, + "max_steps": 7577, + "num_input_tokens_seen": 3725120, + "num_train_epochs": 1, + "save_steps": 379, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.175051626840064e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..dca7ef2 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3be5956f3244a54af871b31e72ef81b8b4797a3bf991a496b7013ecba3258ed7 +size 6289 diff --git a/training_eval_loss.png b/training_eval_loss.png new file mode 100644 index 0000000..82a1cd8 Binary files /dev/null and b/training_eval_loss.png differ diff --git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000..67fe0d4 Binary files /dev/null and b/training_loss.png differ