commit aa0222e4a1022a09b092d4d170d4ec9c1b6866ce Author: ModelHub XC Date: Sun May 3 10:17:08 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: rbelanec/train_cola_42_1776331560 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..1e7819f --- /dev/null +++ b/README.md @@ -0,0 +1,81 @@ +--- +library_name: transformers +license: llama3.2 +base_model: meta-llama/Llama-3.2-1B-Instruct +tags: +- peft-factory +- full +- llama-factory +- generated_from_trainer +model-index: +- name: train_cola_42_1776331560 + results: [] +--- + + + +# train_cola_42_1776331560 + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on the cola dataset. +It achieves the following results on the evaluation set: +- Loss: 0.1763 +- Num Input Tokens Seen: 1932608 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 5e-06 +- train_batch_size: 8 +- eval_batch_size: 8 +- seed: 42 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 5 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | Input Tokens Seen | +|:-------------:|:------:|:----:|:---------------:|:-----------------:| +| 0.2021 | 0.2505 | 241 | 0.2780 | 97664 | +| 0.2402 | 0.5010 | 482 | 0.2002 | 194560 | +| 0.1906 | 0.7516 | 723 | 0.2094 | 291712 | +| 0.2397 | 1.0021 | 964 | 0.1763 | 387464 | +| 0.0622 | 1.2526 | 1205 | 0.2676 | 485192 | +| 0.0911 | 1.5031 | 1446 | 0.3146 | 581704 | +| 0.1042 | 1.7536 | 1687 | 0.2114 | 677576 | +| 0.096 | 2.0042 | 1928 | 0.3562 | 775312 | +| 0.0094 | 2.2547 | 2169 | 0.3035 | 873104 | +| 0.0894 | 2.5052 | 2410 | 0.3649 | 969360 | +| 0.0705 | 2.7557 | 2651 | 0.3061 | 1065232 | +| 0.0016 | 3.0062 | 2892 | 0.2698 | 1162016 | +| 0.0469 | 3.2568 | 3133 | 0.3603 | 1259168 | +| 0.0682 | 3.5073 | 3374 | 0.4128 | 1355552 | +| 0.0128 | 3.7578 | 3615 | 0.3697 | 1453088 | +| 0.0238 | 4.0083 | 3856 | 0.3716 | 1549360 | +| 0.0 | 4.2588 | 4097 | 0.4492 | 1645808 | +| 0.0202 | 4.5094 | 4338 | 0.4368 | 1742960 | +| 0.0001 | 4.7599 | 4579 | 0.4381 | 1839344 | + + +### Framework versions + +- Transformers 4.51.3 +- Pytorch 2.10.0+cu128 +- Datasets 4.0.0 +- Tokenizers 0.21.4 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..7911c8d --- /dev/null +++ b/all_results.json @@ -0,0 +1,13 @@ +{ + "epoch": 5.0, + "eval_loss": 0.17627178132534027, + "eval_runtime": 1.0826, + "eval_samples_per_second": 790.725, + "eval_steps_per_second": 98.841, + "num_input_tokens_seen": 1932608, + "total_flos": 1.1284259767320576e+16, + "train_loss": 0.10950150515592155, + "train_runtime": 1431.7139, + "train_samples_per_second": 26.873, + "train_steps_per_second": 3.36 +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..5a2b93f --- /dev/null +++ b/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/eval_results.json b/eval_results.json new file mode 100644 index 0000000..b63aa32 --- /dev/null +++ b/eval_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 5.0, + "eval_loss": 0.17627178132534027, + "eval_runtime": 1.0826, + "eval_samples_per_second": 790.725, + "eval_steps_per_second": 98.841, + "num_input_tokens_seen": 1932608 +} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..2b8ae57 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.3" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..58a821d --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b2332dbb96615a9369e293d07ed67624654b5e3f03d1013b2867122f224c12e +size 4943274328 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..14daf45 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..ddc3ce0 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2069 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/train.yaml b/train.yaml new file mode 100644 index 0000000..050d7bb --- /dev/null +++ b/train.yaml @@ -0,0 +1,55 @@ +seed: 42 + +### model +model_name_or_path: meta-llama/Llama-3.2-1B-Instruct +trust_remote_code: true +flash_attn: auto +use_cache: false + +### method +stage: sft +do_train: true +finetuning_type: full + +### dataset +dataset: cola +template: llama3 +cutoff_len: 2048 +overwrite_cache: true +preprocessing_num_workers: 4 +dataloader_num_workers: 4 +packing: false + +### output +output_dir: saves_bts_preliminary/base/llama-3.2-1b-instruct/train_cola_42_1776331560 +logging_steps: 5 +save_steps: 0.05 +overwrite_output_dir: true +save_only_model: false +plot_loss: true +include_num_input_tokens_seen: true +push_to_hub: true +push_to_hub_organization: rbelanec +load_best_model_at_end: true +save_total_limit: 1 + +### train +per_device_train_batch_size: 8 +learning_rate: 5.0e-6 +num_train_epochs: 5 +weight_decay: 1.0e-5 +lr_scheduler_type: cosine +bf16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null +warmup_ratio: 0.1 +optim: adamw_torch +report_to: +- wandb +run_name: base_llama-3.2-1b-instruct_train_cola_42_1776331560 + +### eval +per_device_eval_batch_size: 8 +eval_strategy: steps +eval_steps: 0.05 +val_size: 0.1 \ No newline at end of file diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..f1b956a --- /dev/null +++ b/train_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 5.0, + "num_input_tokens_seen": 1932608, + "total_flos": 1.1284259767320576e+16, + "train_loss": 0.10950150515592155, + "train_runtime": 1431.7139, + "train_samples_per_second": 26.873, + "train_steps_per_second": 3.36 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000..ce58b2c --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,982 @@ +{"current_steps": 5, "total_steps": 4810, "loss": 1.2917, "lr": 4.158004158004159e-08, "epoch": 0.005197505197505198, "percentage": 0.1, "elapsed_time": "0:00:00", "remaining_time": "0:10:09", "throughput": 3231.57, "total_tokens": 2048} +{"current_steps": 10, "total_steps": 4810, "loss": 1.2491, "lr": 9.355509355509357e-08, "epoch": 0.010395010395010396, "percentage": 0.21, "elapsed_time": "0:00:00", "remaining_time": "0:07:44", "throughput": 4366.27, "total_tokens": 4224} +{"current_steps": 15, "total_steps": 4810, "loss": 1.3117, "lr": 1.4553014553014554e-07, "epoch": 0.015592515592515593, "percentage": 0.31, "elapsed_time": "0:00:01", "remaining_time": "0:06:52", "throughput": 4863.9, "total_tokens": 6272} +{"current_steps": 20, "total_steps": 4810, "loss": 1.1366, "lr": 1.9750519750519752e-07, "epoch": 0.02079002079002079, "percentage": 0.42, "elapsed_time": "0:00:01", "remaining_time": "0:06:26", "throughput": 5199.14, "total_tokens": 8384} +{"current_steps": 25, "total_steps": 4810, "loss": 0.7792, "lr": 2.494802494802495e-07, "epoch": 0.02598752598752599, "percentage": 0.52, "elapsed_time": "0:00:01", "remaining_time": "0:06:10", "throughput": 5421.16, "total_tokens": 10496} +{"current_steps": 30, "total_steps": 4810, "loss": 0.523, "lr": 3.014553014553015e-07, "epoch": 0.031185031185031187, "percentage": 0.62, "elapsed_time": "0:00:02", "remaining_time": "0:05:59", "throughput": 5553.26, "total_tokens": 12544} +{"current_steps": 35, "total_steps": 4810, "loss": 0.3194, "lr": 3.534303534303535e-07, "epoch": 0.036382536382536385, "percentage": 0.73, "elapsed_time": "0:00:02", "remaining_time": "0:05:51", "throughput": 5631.11, "total_tokens": 14528} +{"current_steps": 40, "total_steps": 4810, "loss": 0.3482, "lr": 4.0540540540540546e-07, "epoch": 0.04158004158004158, "percentage": 0.83, "elapsed_time": "0:00:02", "remaining_time": "0:05:46", "throughput": 5711.68, "total_tokens": 16576} +{"current_steps": 45, "total_steps": 4810, "loss": 0.377, "lr": 4.5738045738045745e-07, "epoch": 0.04677754677754678, "percentage": 0.94, "elapsed_time": "0:00:03", "remaining_time": "0:05:41", "throughput": 5757.88, "total_tokens": 18560} +{"current_steps": 50, "total_steps": 4810, "loss": 0.2695, "lr": 5.093555093555094e-07, "epoch": 0.05197505197505198, "percentage": 1.04, "elapsed_time": "0:00:03", "remaining_time": "0:05:37", "throughput": 5813.34, "total_tokens": 20608} +{"current_steps": 55, "total_steps": 4810, "loss": 0.2431, "lr": 5.613305613305614e-07, "epoch": 0.057172557172557176, "percentage": 1.14, "elapsed_time": "0:00:03", "remaining_time": "0:05:34", "throughput": 5859.47, "total_tokens": 22656} +{"current_steps": 60, "total_steps": 4810, "loss": 0.3162, "lr": 6.133056133056134e-07, "epoch": 0.062370062370062374, "percentage": 1.25, "elapsed_time": "0:00:04", "remaining_time": "0:05:31", "throughput": 5883.2, "total_tokens": 24640} +{"current_steps": 65, "total_steps": 4810, "loss": 0.3171, "lr": 6.652806652806654e-07, "epoch": 0.06756756756756757, "percentage": 1.35, "elapsed_time": "0:00:04", "remaining_time": "0:05:29", "throughput": 5928.15, "total_tokens": 26752} +{"current_steps": 70, "total_steps": 4810, "loss": 0.4943, "lr": 7.172557172557173e-07, "epoch": 0.07276507276507277, "percentage": 1.46, "elapsed_time": "0:00:04", "remaining_time": "0:05:28", "throughput": 5898.9, "total_tokens": 28608} +{"current_steps": 75, "total_steps": 4810, "loss": 0.3067, "lr": 7.692307692307694e-07, "epoch": 0.07796257796257797, "percentage": 1.56, "elapsed_time": "0:00:05", "remaining_time": "0:05:27", "throughput": 5957.08, "total_tokens": 30912} +{"current_steps": 80, "total_steps": 4810, "loss": 0.5316, "lr": 8.212058212058213e-07, "epoch": 0.08316008316008316, "percentage": 1.66, "elapsed_time": "0:00:05", "remaining_time": "0:05:25", "throughput": 5967.54, "total_tokens": 32896} +{"current_steps": 85, "total_steps": 4810, "loss": 0.3228, "lr": 8.731808731808733e-07, "epoch": 0.08835758835758836, "percentage": 1.77, "elapsed_time": "0:00:05", "remaining_time": "0:05:24", "throughput": 5965.9, "total_tokens": 34816} +{"current_steps": 90, "total_steps": 4810, "loss": 0.3339, "lr": 9.251559251559253e-07, "epoch": 0.09355509355509356, "percentage": 1.87, "elapsed_time": "0:00:06", "remaining_time": "0:05:22", "throughput": 5966.22, "total_tokens": 36736} +{"current_steps": 95, "total_steps": 4810, "loss": 0.2951, "lr": 9.771309771309773e-07, "epoch": 0.09875259875259876, "percentage": 1.98, "elapsed_time": "0:00:06", "remaining_time": "0:05:21", "throughput": 5974.7, "total_tokens": 38720} +{"current_steps": 100, "total_steps": 4810, "loss": 0.2279, "lr": 1.0291060291060292e-06, "epoch": 0.10395010395010396, "percentage": 2.08, "elapsed_time": "0:00:06", "remaining_time": "0:05:20", "throughput": 5971.58, "total_tokens": 40640} +{"current_steps": 105, "total_steps": 4810, "loss": 0.2562, "lr": 1.0810810810810812e-06, "epoch": 0.10914760914760915, "percentage": 2.18, "elapsed_time": "0:00:07", "remaining_time": "0:05:19", "throughput": 5987.29, "total_tokens": 42688} +{"current_steps": 110, "total_steps": 4810, "loss": 0.2794, "lr": 1.1330561330561333e-06, "epoch": 0.11434511434511435, "percentage": 2.29, "elapsed_time": "0:00:07", "remaining_time": "0:05:18", "throughput": 5978.08, "total_tokens": 44544} +{"current_steps": 115, "total_steps": 4810, "loss": 0.2511, "lr": 1.1850311850311852e-06, "epoch": 0.11954261954261955, "percentage": 2.39, "elapsed_time": "0:00:07", "remaining_time": "0:05:17", "throughput": 5969.5, "total_tokens": 46400} +{"current_steps": 120, "total_steps": 4810, "loss": 0.2503, "lr": 1.2370062370062372e-06, "epoch": 0.12474012474012475, "percentage": 2.49, "elapsed_time": "0:00:08", "remaining_time": "0:05:16", "throughput": 5983.08, "total_tokens": 48448} +{"current_steps": 125, "total_steps": 4810, "loss": 0.3088, "lr": 1.288981288981289e-06, "epoch": 0.12993762993762994, "percentage": 2.6, "elapsed_time": "0:00:08", "remaining_time": "0:05:15", "throughput": 5996.41, "total_tokens": 50496} +{"current_steps": 130, "total_steps": 4810, "loss": 0.2409, "lr": 1.340956340956341e-06, "epoch": 0.13513513513513514, "percentage": 2.7, "elapsed_time": "0:00:08", "remaining_time": "0:05:14", "throughput": 5994.04, "total_tokens": 52416} +{"current_steps": 135, "total_steps": 4810, "loss": 0.2575, "lr": 1.3929313929313932e-06, "epoch": 0.14033264033264034, "percentage": 2.81, "elapsed_time": "0:00:09", "remaining_time": "0:05:14", "throughput": 6006.12, "total_tokens": 54464} +{"current_steps": 140, "total_steps": 4810, "loss": 0.2283, "lr": 1.4449064449064451e-06, "epoch": 0.14553014553014554, "percentage": 2.91, "elapsed_time": "0:00:09", "remaining_time": "0:05:13", "throughput": 6010.95, "total_tokens": 56448} +{"current_steps": 145, "total_steps": 4810, "loss": 0.2004, "lr": 1.496881496881497e-06, "epoch": 0.15072765072765074, "percentage": 3.01, "elapsed_time": "0:00:09", "remaining_time": "0:05:12", "throughput": 6009.87, "total_tokens": 58368} +{"current_steps": 150, "total_steps": 4810, "loss": 0.3166, "lr": 1.548856548856549e-06, "epoch": 0.15592515592515593, "percentage": 3.12, "elapsed_time": "0:00:10", "remaining_time": "0:05:11", "throughput": 6032.8, "total_tokens": 60544} +{"current_steps": 155, "total_steps": 4810, "loss": 0.3419, "lr": 1.6008316008316011e-06, "epoch": 0.16112266112266113, "percentage": 3.22, "elapsed_time": "0:00:10", "remaining_time": "0:05:11", "throughput": 6042.22, "total_tokens": 62592} +{"current_steps": 160, "total_steps": 4810, "loss": 0.2605, "lr": 1.652806652806653e-06, "epoch": 0.16632016632016633, "percentage": 3.33, "elapsed_time": "0:00:10", "remaining_time": "0:05:10", "throughput": 6045.74, "total_tokens": 64576} +{"current_steps": 165, "total_steps": 4810, "loss": 0.2617, "lr": 1.704781704781705e-06, "epoch": 0.17151767151767153, "percentage": 3.43, "elapsed_time": "0:00:11", "remaining_time": "0:05:09", "throughput": 6059.71, "total_tokens": 66688} +{"current_steps": 170, "total_steps": 4810, "loss": 0.2505, "lr": 1.756756756756757e-06, "epoch": 0.17671517671517672, "percentage": 3.53, "elapsed_time": "0:00:11", "remaining_time": "0:05:09", "throughput": 6051.99, "total_tokens": 68544} +{"current_steps": 175, "total_steps": 4810, "loss": 0.2672, "lr": 1.808731808731809e-06, "epoch": 0.18191268191268192, "percentage": 3.64, "elapsed_time": "0:00:11", "remaining_time": "0:05:08", "throughput": 6059.76, "total_tokens": 70592} +{"current_steps": 180, "total_steps": 4810, "loss": 0.2488, "lr": 1.860706860706861e-06, "epoch": 0.18711018711018712, "percentage": 3.74, "elapsed_time": "0:00:11", "remaining_time": "0:05:07", "throughput": 6062.52, "total_tokens": 72576} +{"current_steps": 185, "total_steps": 4810, "loss": 0.1863, "lr": 1.912681912681913e-06, "epoch": 0.19230769230769232, "percentage": 3.85, "elapsed_time": "0:00:12", "remaining_time": "0:05:07", "throughput": 6070.09, "total_tokens": 74624} +{"current_steps": 190, "total_steps": 4810, "loss": 0.1639, "lr": 1.964656964656965e-06, "epoch": 0.19750519750519752, "percentage": 3.95, "elapsed_time": "0:00:12", "remaining_time": "0:05:06", "throughput": 6072.27, "total_tokens": 76608} +{"current_steps": 195, "total_steps": 4810, "loss": 0.3799, "lr": 2.016632016632017e-06, "epoch": 0.20270270270270271, "percentage": 4.05, "elapsed_time": "0:00:12", "remaining_time": "0:05:06", "throughput": 6083.92, "total_tokens": 78720} +{"current_steps": 200, "total_steps": 4810, "loss": 0.455, "lr": 2.0686070686070687e-06, "epoch": 0.2079002079002079, "percentage": 4.16, "elapsed_time": "0:00:13", "remaining_time": "0:05:06", "throughput": 6112.65, "total_tokens": 81152} +{"current_steps": 205, "total_steps": 4810, "loss": 0.2303, "lr": 2.120582120582121e-06, "epoch": 0.2130977130977131, "percentage": 4.26, "elapsed_time": "0:00:13", "remaining_time": "0:05:05", "throughput": 6118.26, "total_tokens": 83200} +{"current_steps": 210, "total_steps": 4810, "loss": 0.2985, "lr": 2.172557172557173e-06, "epoch": 0.2182952182952183, "percentage": 4.37, "elapsed_time": "0:00:13", "remaining_time": "0:05:04", "throughput": 6118.97, "total_tokens": 85184} +{"current_steps": 215, "total_steps": 4810, "loss": 0.1462, "lr": 2.2245322245322247e-06, "epoch": 0.2234927234927235, "percentage": 4.47, "elapsed_time": "0:00:14", "remaining_time": "0:05:04", "throughput": 6123.62, "total_tokens": 87232} +{"current_steps": 220, "total_steps": 4810, "loss": 0.2323, "lr": 2.276507276507277e-06, "epoch": 0.2286902286902287, "percentage": 4.57, "elapsed_time": "0:00:14", "remaining_time": "0:05:03", "throughput": 6120.1, "total_tokens": 89152} +{"current_steps": 225, "total_steps": 4810, "loss": 0.3453, "lr": 2.3284823284823286e-06, "epoch": 0.2338877338877339, "percentage": 4.68, "elapsed_time": "0:00:14", "remaining_time": "0:05:03", "throughput": 6132.76, "total_tokens": 91328} +{"current_steps": 230, "total_steps": 4810, "loss": 0.1965, "lr": 2.3804573804573807e-06, "epoch": 0.2390852390852391, "percentage": 4.78, "elapsed_time": "0:00:15", "remaining_time": "0:05:02", "throughput": 6133.17, "total_tokens": 93312} +{"current_steps": 235, "total_steps": 4810, "loss": 0.186, "lr": 2.432432432432433e-06, "epoch": 0.2442827442827443, "percentage": 4.89, "elapsed_time": "0:00:15", "remaining_time": "0:05:02", "throughput": 6133.43, "total_tokens": 95296} +{"current_steps": 240, "total_steps": 4810, "loss": 0.2021, "lr": 2.4844074844074846e-06, "epoch": 0.2494802494802495, "percentage": 4.99, "elapsed_time": "0:00:15", "remaining_time": "0:05:01", "throughput": 6129.99, "total_tokens": 97216} +{"current_steps": 241, "total_steps": 4810, "eval_loss": 0.2780425250530243, "epoch": 0.2505197505197505, "percentage": 5.01, "elapsed_time": "0:00:16", "remaining_time": "0:05:20", "throughput": 5768.67, "total_tokens": 97664} +{"current_steps": 245, "total_steps": 4810, "loss": 0.2316, "lr": 2.5363825363825367e-06, "epoch": 0.25467775467775466, "percentage": 5.09, "elapsed_time": "0:01:58", "remaining_time": "0:36:52", "throughput": 835.93, "total_tokens": 99264} +{"current_steps": 250, "total_steps": 4810, "loss": 0.2509, "lr": 2.5883575883575885e-06, "epoch": 0.2598752598752599, "percentage": 5.2, "elapsed_time": "0:01:59", "remaining_time": "0:36:11", "throughput": 849.79, "total_tokens": 101184} +{"current_steps": 255, "total_steps": 4810, "loss": 0.1696, "lr": 2.6403326403326406e-06, "epoch": 0.26507276507276506, "percentage": 5.3, "elapsed_time": "0:01:59", "remaining_time": "0:35:32", "throughput": 865.18, "total_tokens": 103296} +{"current_steps": 260, "total_steps": 4810, "loss": 0.3451, "lr": 2.6923076923076923e-06, "epoch": 0.2702702702702703, "percentage": 5.41, "elapsed_time": "0:01:59", "remaining_time": "0:34:55", "throughput": 879.95, "total_tokens": 105344} +{"current_steps": 265, "total_steps": 4810, "loss": 0.1851, "lr": 2.7442827442827445e-06, "epoch": 0.27546777546777546, "percentage": 5.51, "elapsed_time": "0:02:00", "remaining_time": "0:34:18", "throughput": 894.65, "total_tokens": 107392} +{"current_steps": 270, "total_steps": 4810, "loss": 0.2177, "lr": 2.796257796257796e-06, "epoch": 0.2806652806652807, "percentage": 5.61, "elapsed_time": "0:02:00", "remaining_time": "0:33:43", "throughput": 909.27, "total_tokens": 109440} +{"current_steps": 275, "total_steps": 4810, "loss": 0.2767, "lr": 2.8482328482328488e-06, "epoch": 0.28586278586278585, "percentage": 5.72, "elapsed_time": "0:02:00", "remaining_time": "0:33:10", "throughput": 923.27, "total_tokens": 111424} +{"current_steps": 280, "total_steps": 4810, "loss": 0.3802, "lr": 2.9002079002079005e-06, "epoch": 0.2910602910602911, "percentage": 5.82, "elapsed_time": "0:02:01", "remaining_time": "0:32:37", "throughput": 937.2, "total_tokens": 113408} +{"current_steps": 285, "total_steps": 4810, "loss": 0.2173, "lr": 2.9521829521829526e-06, "epoch": 0.29625779625779625, "percentage": 5.93, "elapsed_time": "0:02:01", "remaining_time": "0:32:06", "throughput": 951.06, "total_tokens": 115392} +{"current_steps": 290, "total_steps": 4810, "loss": 0.2155, "lr": 3.0041580041580043e-06, "epoch": 0.30145530145530147, "percentage": 6.03, "elapsed_time": "0:02:01", "remaining_time": "0:31:36", "throughput": 965.36, "total_tokens": 117440} +{"current_steps": 295, "total_steps": 4810, "loss": 0.2116, "lr": 3.0561330561330565e-06, "epoch": 0.30665280665280664, "percentage": 6.13, "elapsed_time": "0:02:01", "remaining_time": "0:31:06", "throughput": 979.05, "total_tokens": 119424} +{"current_steps": 300, "total_steps": 4810, "loss": 0.2158, "lr": 3.1081081081081082e-06, "epoch": 0.31185031185031187, "percentage": 6.24, "elapsed_time": "0:02:02", "remaining_time": "0:30:38", "throughput": 992.17, "total_tokens": 121344} +{"current_steps": 305, "total_steps": 4810, "loss": 0.2048, "lr": 3.1600831600831604e-06, "epoch": 0.31704781704781704, "percentage": 6.34, "elapsed_time": "0:02:02", "remaining_time": "0:30:11", "throughput": 1005.21, "total_tokens": 123264} +{"current_steps": 310, "total_steps": 4810, "loss": 0.2194, "lr": 3.212058212058212e-06, "epoch": 0.32224532224532226, "percentage": 6.44, "elapsed_time": "0:02:02", "remaining_time": "0:29:44", "throughput": 1018.19, "total_tokens": 125184} +{"current_steps": 315, "total_steps": 4810, "loss": 0.1868, "lr": 3.2640332640332646e-06, "epoch": 0.32744282744282743, "percentage": 6.55, "elapsed_time": "0:02:03", "remaining_time": "0:29:19", "throughput": 1032.65, "total_tokens": 127296} +{"current_steps": 320, "total_steps": 4810, "loss": 0.217, "lr": 3.3160083160083164e-06, "epoch": 0.33264033264033266, "percentage": 6.65, "elapsed_time": "0:02:03", "remaining_time": "0:28:54", "throughput": 1047.03, "total_tokens": 129408} +{"current_steps": 325, "total_steps": 4810, "loss": 0.179, "lr": 3.3679833679833685e-06, "epoch": 0.33783783783783783, "percentage": 6.76, "elapsed_time": "0:02:03", "remaining_time": "0:28:30", "throughput": 1061.33, "total_tokens": 131520} +{"current_steps": 330, "total_steps": 4810, "loss": 0.2437, "lr": 3.4199584199584202e-06, "epoch": 0.34303534303534305, "percentage": 6.86, "elapsed_time": "0:02:04", "remaining_time": "0:28:06", "throughput": 1075.06, "total_tokens": 133568} +{"current_steps": 335, "total_steps": 4810, "loss": 0.1981, "lr": 3.4719334719334724e-06, "epoch": 0.3482328482328482, "percentage": 6.96, "elapsed_time": "0:02:04", "remaining_time": "0:27:43", "throughput": 1088.72, "total_tokens": 135616} +{"current_steps": 340, "total_steps": 4810, "loss": 0.3892, "lr": 3.523908523908524e-06, "epoch": 0.35343035343035345, "percentage": 7.07, "elapsed_time": "0:02:04", "remaining_time": "0:27:21", "throughput": 1102.31, "total_tokens": 137664} +{"current_steps": 345, "total_steps": 4810, "loss": 0.1327, "lr": 3.5758835758835762e-06, "epoch": 0.3586278586278586, "percentage": 7.17, "elapsed_time": "0:02:05", "remaining_time": "0:27:00", "throughput": 1114.82, "total_tokens": 139584} +{"current_steps": 350, "total_steps": 4810, "loss": 0.2165, "lr": 3.627858627858628e-06, "epoch": 0.36382536382536385, "percentage": 7.28, "elapsed_time": "0:02:05", "remaining_time": "0:26:39", "throughput": 1127.25, "total_tokens": 141504} +{"current_steps": 355, "total_steps": 4810, "loss": 0.2907, "lr": 3.6798336798336805e-06, "epoch": 0.369022869022869, "percentage": 7.38, "elapsed_time": "0:02:05", "remaining_time": "0:26:19", "throughput": 1140.64, "total_tokens": 143552} +{"current_steps": 360, "total_steps": 4810, "loss": 0.3012, "lr": 3.7318087318087322e-06, "epoch": 0.37422037422037424, "percentage": 7.48, "elapsed_time": "0:02:06", "remaining_time": "0:25:59", "throughput": 1153.46, "total_tokens": 145536} +{"current_steps": 365, "total_steps": 4810, "loss": 0.2293, "lr": 3.7837837837837844e-06, "epoch": 0.3794178794178794, "percentage": 7.59, "elapsed_time": "0:02:06", "remaining_time": "0:25:40", "throughput": 1165.73, "total_tokens": 147456} +{"current_steps": 370, "total_steps": 4810, "loss": 0.1782, "lr": 3.835758835758836e-06, "epoch": 0.38461538461538464, "percentage": 7.69, "elapsed_time": "0:02:06", "remaining_time": "0:25:21", "throughput": 1178.44, "total_tokens": 149440} +{"current_steps": 375, "total_steps": 4810, "loss": 0.3696, "lr": 3.887733887733889e-06, "epoch": 0.3898128898128898, "percentage": 7.8, "elapsed_time": "0:02:07", "remaining_time": "0:25:03", "throughput": 1190.6, "total_tokens": 151360} +{"current_steps": 380, "total_steps": 4810, "loss": 0.309, "lr": 3.9397089397089396e-06, "epoch": 0.39501039501039503, "percentage": 7.9, "elapsed_time": "0:02:07", "remaining_time": "0:24:45", "throughput": 1203.19, "total_tokens": 153344} +{"current_steps": 385, "total_steps": 4810, "loss": 0.2413, "lr": 3.991683991683992e-06, "epoch": 0.4002079002079002, "percentage": 8.0, "elapsed_time": "0:02:07", "remaining_time": "0:24:28", "throughput": 1215.23, "total_tokens": 155264} +{"current_steps": 390, "total_steps": 4810, "loss": 0.3064, "lr": 4.043659043659044e-06, "epoch": 0.40540540540540543, "percentage": 8.11, "elapsed_time": "0:02:08", "remaining_time": "0:24:11", "throughput": 1227.69, "total_tokens": 157248} +{"current_steps": 395, "total_steps": 4810, "loss": 0.2798, "lr": 4.095634095634096e-06, "epoch": 0.4106029106029106, "percentage": 8.21, "elapsed_time": "0:02:08", "remaining_time": "0:23:55", "throughput": 1240.58, "total_tokens": 159296} +{"current_steps": 400, "total_steps": 4810, "loss": 0.3488, "lr": 4.147609147609148e-06, "epoch": 0.4158004158004158, "percentage": 8.32, "elapsed_time": "0:02:08", "remaining_time": "0:23:39", "throughput": 1253.41, "total_tokens": 161344} +{"current_steps": 405, "total_steps": 4810, "loss": 0.2072, "lr": 4.1995841995842e-06, "epoch": 0.420997920997921, "percentage": 8.42, "elapsed_time": "0:02:09", "remaining_time": "0:23:23", "throughput": 1265.68, "total_tokens": 163328} +{"current_steps": 410, "total_steps": 4810, "loss": 0.1704, "lr": 4.2515592515592516e-06, "epoch": 0.4261954261954262, "percentage": 8.52, "elapsed_time": "0:02:09", "remaining_time": "0:23:08", "throughput": 1277.89, "total_tokens": 165312} +{"current_steps": 415, "total_steps": 4810, "loss": 0.0573, "lr": 4.303534303534304e-06, "epoch": 0.4313929313929314, "percentage": 8.63, "elapsed_time": "0:02:09", "remaining_time": "0:22:53", "throughput": 1290.54, "total_tokens": 167360} +{"current_steps": 420, "total_steps": 4810, "loss": 0.9576, "lr": 4.355509355509356e-06, "epoch": 0.4365904365904366, "percentage": 8.73, "elapsed_time": "0:02:10", "remaining_time": "0:22:38", "throughput": 1302.64, "total_tokens": 169344} +{"current_steps": 425, "total_steps": 4810, "loss": 0.3222, "lr": 4.4074844074844084e-06, "epoch": 0.4417879417879418, "percentage": 8.84, "elapsed_time": "0:02:10", "remaining_time": "0:22:24", "throughput": 1315.63, "total_tokens": 171456} +{"current_steps": 430, "total_steps": 4810, "loss": 0.3442, "lr": 4.45945945945946e-06, "epoch": 0.446985446985447, "percentage": 8.94, "elapsed_time": "0:02:10", "remaining_time": "0:22:10", "throughput": 1328.57, "total_tokens": 173568} +{"current_steps": 435, "total_steps": 4810, "loss": 0.1851, "lr": 4.511434511434512e-06, "epoch": 0.4521829521829522, "percentage": 9.04, "elapsed_time": "0:02:10", "remaining_time": "0:21:57", "throughput": 1340.46, "total_tokens": 175552} +{"current_steps": 440, "total_steps": 4810, "loss": 0.2573, "lr": 4.563409563409564e-06, "epoch": 0.4573804573804574, "percentage": 9.15, "elapsed_time": "0:02:11", "remaining_time": "0:21:43", "throughput": 1352.29, "total_tokens": 177536} +{"current_steps": 445, "total_steps": 4810, "loss": 0.2972, "lr": 4.615384615384616e-06, "epoch": 0.4625779625779626, "percentage": 9.25, "elapsed_time": "0:02:11", "remaining_time": "0:21:30", "throughput": 1364.54, "total_tokens": 179584} +{"current_steps": 450, "total_steps": 4810, "loss": 0.2247, "lr": 4.667359667359668e-06, "epoch": 0.4677754677754678, "percentage": 9.36, "elapsed_time": "0:02:11", "remaining_time": "0:21:18", "throughput": 1376.29, "total_tokens": 181568} +{"current_steps": 455, "total_steps": 4810, "loss": 0.2355, "lr": 4.71933471933472e-06, "epoch": 0.47297297297297297, "percentage": 9.46, "elapsed_time": "0:02:12", "remaining_time": "0:21:05", "throughput": 1387.97, "total_tokens": 183552} +{"current_steps": 460, "total_steps": 4810, "loss": 0.1821, "lr": 4.771309771309771e-06, "epoch": 0.4781704781704782, "percentage": 9.56, "elapsed_time": "0:02:12", "remaining_time": "0:20:53", "throughput": 1400.06, "total_tokens": 185600} +{"current_steps": 465, "total_steps": 4810, "loss": 0.1938, "lr": 4.823284823284824e-06, "epoch": 0.48336798336798337, "percentage": 9.67, "elapsed_time": "0:02:12", "remaining_time": "0:20:41", "throughput": 1411.62, "total_tokens": 187584} +{"current_steps": 470, "total_steps": 4810, "loss": 0.2747, "lr": 4.875259875259876e-06, "epoch": 0.4885654885654886, "percentage": 9.77, "elapsed_time": "0:02:13", "remaining_time": "0:20:30", "throughput": 1423.13, "total_tokens": 189568} +{"current_steps": 475, "total_steps": 4810, "loss": 0.2394, "lr": 4.927234927234928e-06, "epoch": 0.49376299376299376, "percentage": 9.88, "elapsed_time": "0:02:13", "remaining_time": "0:20:18", "throughput": 1435.53, "total_tokens": 191680} +{"current_steps": 480, "total_steps": 4810, "loss": 0.2402, "lr": 4.97920997920998e-06, "epoch": 0.498960498960499, "percentage": 9.98, "elapsed_time": "0:02:13", "remaining_time": "0:20:07", "throughput": 1447.4, "total_tokens": 193728} +{"current_steps": 482, "total_steps": 4810, "eval_loss": 0.20022711157798767, "epoch": 0.501039501039501, "percentage": 10.02, "elapsed_time": "0:02:14", "remaining_time": "0:20:12", "throughput": 1441.25, "total_tokens": 194560} +{"current_steps": 485, "total_steps": 4810, "loss": 0.1873, "lr": 4.999994075155936e-06, "epoch": 0.5041580041580042, "percentage": 10.08, "elapsed_time": "0:03:02", "remaining_time": "0:27:06", "throughput": 1073.4, "total_tokens": 195776} +{"current_steps": 490, "total_steps": 4810, "loss": 0.1905, "lr": 4.999957867877242e-06, "epoch": 0.5093555093555093, "percentage": 10.19, "elapsed_time": "0:03:02", "remaining_time": "0:26:50", "throughput": 1082.04, "total_tokens": 197696} +{"current_steps": 495, "total_steps": 4810, "loss": 0.1952, "lr": 4.999888745376028e-06, "epoch": 0.5145530145530145, "percentage": 10.29, "elapsed_time": "0:03:03", "remaining_time": "0:26:35", "throughput": 1091.0, "total_tokens": 199680} +{"current_steps": 500, "total_steps": 4810, "loss": 0.2149, "lr": 4.999786708562382e-06, "epoch": 0.5197505197505198, "percentage": 10.4, "elapsed_time": "0:03:03", "remaining_time": "0:26:20", "throughput": 1100.61, "total_tokens": 201792} +{"current_steps": 505, "total_steps": 4810, "loss": 0.2066, "lr": 4.999651758779753e-06, "epoch": 0.524948024948025, "percentage": 10.5, "elapsed_time": "0:03:03", "remaining_time": "0:26:05", "throughput": 1109.81, "total_tokens": 203840} +{"current_steps": 510, "total_steps": 4810, "loss": 0.2161, "lr": 4.999483897804933e-06, "epoch": 0.5301455301455301, "percentage": 10.6, "elapsed_time": "0:03:03", "remaining_time": "0:25:51", "throughput": 1118.63, "total_tokens": 205824} +{"current_steps": 515, "total_steps": 4810, "loss": 0.1777, "lr": 4.999283127848029e-06, "epoch": 0.5353430353430353, "percentage": 10.71, "elapsed_time": "0:03:04", "remaining_time": "0:25:37", "throughput": 1128.13, "total_tokens": 207936} +{"current_steps": 520, "total_steps": 4810, "loss": 0.1931, "lr": 4.999049451552443e-06, "epoch": 0.5405405405405406, "percentage": 10.81, "elapsed_time": "0:03:04", "remaining_time": "0:25:23", "throughput": 1137.24, "total_tokens": 209984} +{"current_steps": 525, "total_steps": 4810, "loss": 0.3235, "lr": 4.998782871994828e-06, "epoch": 0.5457380457380457, "percentage": 10.91, "elapsed_time": "0:03:04", "remaining_time": "0:25:09", "throughput": 1146.67, "total_tokens": 212096} +{"current_steps": 530, "total_steps": 4810, "loss": 0.2083, "lr": 4.998483392685055e-06, "epoch": 0.5509355509355509, "percentage": 11.02, "elapsed_time": "0:03:05", "remaining_time": "0:24:56", "throughput": 1155.38, "total_tokens": 214080} +{"current_steps": 535, "total_steps": 4810, "loss": 0.2592, "lr": 4.9981510175661606e-06, "epoch": 0.5561330561330561, "percentage": 11.12, "elapsed_time": "0:03:05", "remaining_time": "0:24:43", "throughput": 1164.4, "total_tokens": 216128} +{"current_steps": 540, "total_steps": 4810, "loss": 0.2199, "lr": 4.9977857510143e-06, "epoch": 0.5613305613305614, "percentage": 11.23, "elapsed_time": "0:03:05", "remaining_time": "0:24:30", "throughput": 1173.38, "total_tokens": 218176} +{"current_steps": 545, "total_steps": 4810, "loss": 0.1414, "lr": 4.997387597838684e-06, "epoch": 0.5665280665280665, "percentage": 11.33, "elapsed_time": "0:03:06", "remaining_time": "0:24:17", "throughput": 1181.66, "total_tokens": 220096} +{"current_steps": 550, "total_steps": 4810, "loss": 0.1874, "lr": 4.996956563281524e-06, "epoch": 0.5717255717255717, "percentage": 11.43, "elapsed_time": "0:03:06", "remaining_time": "0:24:05", "throughput": 1190.24, "total_tokens": 222080} +{"current_steps": 555, "total_steps": 4810, "loss": 0.2643, "lr": 4.996492653017953e-06, "epoch": 0.5769230769230769, "percentage": 11.54, "elapsed_time": "0:03:06", "remaining_time": "0:23:52", "throughput": 1198.47, "total_tokens": 224000} +{"current_steps": 560, "total_steps": 4810, "loss": 0.2975, "lr": 4.995995873155958e-06, "epoch": 0.5821205821205822, "percentage": 11.64, "elapsed_time": "0:03:07", "remaining_time": "0:23:40", "throughput": 1207.01, "total_tokens": 225984} +{"current_steps": 565, "total_steps": 4810, "loss": 0.1955, "lr": 4.995466230236298e-06, "epoch": 0.5873180873180873, "percentage": 11.75, "elapsed_time": "0:03:07", "remaining_time": "0:23:29", "throughput": 1214.83, "total_tokens": 227840} +{"current_steps": 570, "total_steps": 4810, "loss": 0.2476, "lr": 4.994903731232415e-06, "epoch": 0.5925155925155925, "percentage": 11.85, "elapsed_time": "0:03:07", "remaining_time": "0:23:17", "throughput": 1223.3, "total_tokens": 229824} +{"current_steps": 575, "total_steps": 4810, "loss": 0.213, "lr": 4.994308383550347e-06, "epoch": 0.5977130977130977, "percentage": 11.95, "elapsed_time": "0:03:08", "remaining_time": "0:23:06", "throughput": 1232.09, "total_tokens": 231872} +{"current_steps": 580, "total_steps": 4810, "loss": 0.2039, "lr": 4.993680195028626e-06, "epoch": 0.6029106029106029, "percentage": 12.06, "elapsed_time": "0:03:08", "remaining_time": "0:22:54", "throughput": 1240.84, "total_tokens": 233920} +{"current_steps": 585, "total_steps": 4810, "loss": 0.2036, "lr": 4.993019173938178e-06, "epoch": 0.6081081081081081, "percentage": 12.16, "elapsed_time": "0:03:08", "remaining_time": "0:22:43", "throughput": 1248.89, "total_tokens": 235840} +{"current_steps": 590, "total_steps": 4810, "loss": 0.2111, "lr": 4.992325328982212e-06, "epoch": 0.6133056133056133, "percentage": 12.27, "elapsed_time": "0:03:09", "remaining_time": "0:22:33", "throughput": 1258.25, "total_tokens": 238016} +{"current_steps": 595, "total_steps": 4810, "loss": 0.1706, "lr": 4.991598669296105e-06, "epoch": 0.6185031185031185, "percentage": 12.37, "elapsed_time": "0:03:09", "remaining_time": "0:22:22", "throughput": 1266.9, "total_tokens": 240064} +{"current_steps": 600, "total_steps": 4810, "loss": 0.2236, "lr": 4.990839204447287e-06, "epoch": 0.6237006237006237, "percentage": 12.47, "elapsed_time": "0:03:09", "remaining_time": "0:22:11", "throughput": 1275.19, "total_tokens": 242048} +{"current_steps": 605, "total_steps": 4810, "loss": 0.1908, "lr": 4.990046944435105e-06, "epoch": 0.6288981288981289, "percentage": 12.58, "elapsed_time": "0:03:10", "remaining_time": "0:22:01", "throughput": 1283.14, "total_tokens": 243968} +{"current_steps": 610, "total_steps": 4810, "loss": 0.2409, "lr": 4.989221899690704e-06, "epoch": 0.6340956340956341, "percentage": 12.68, "elapsed_time": "0:03:10", "remaining_time": "0:21:51", "throughput": 1291.71, "total_tokens": 246016} +{"current_steps": 615, "total_steps": 4810, "loss": 0.2135, "lr": 4.988364081076877e-06, "epoch": 0.6392931392931392, "percentage": 12.79, "elapsed_time": "0:03:10", "remaining_time": "0:21:41", "throughput": 1299.93, "total_tokens": 248000} +{"current_steps": 620, "total_steps": 4810, "loss": 0.203, "lr": 4.987473499887932e-06, "epoch": 0.6444906444906445, "percentage": 12.89, "elapsed_time": "0:03:11", "remaining_time": "0:21:31", "throughput": 1308.44, "total_tokens": 250048} +{"current_steps": 625, "total_steps": 4810, "loss": 0.1867, "lr": 4.986550167849538e-06, "epoch": 0.6496881496881497, "percentage": 12.99, "elapsed_time": "0:03:11", "remaining_time": "0:21:21", "throughput": 1316.93, "total_tokens": 252096} +{"current_steps": 630, "total_steps": 4810, "loss": 0.1162, "lr": 4.9855940971185705e-06, "epoch": 0.6548856548856549, "percentage": 13.1, "elapsed_time": "0:03:11", "remaining_time": "0:21:12", "throughput": 1325.39, "total_tokens": 254144} +{"current_steps": 635, "total_steps": 4810, "loss": 0.2562, "lr": 4.984605300282955e-06, "epoch": 0.66008316008316, "percentage": 13.2, "elapsed_time": "0:03:12", "remaining_time": "0:21:02", "throughput": 1333.5, "total_tokens": 256128} +{"current_steps": 640, "total_steps": 4810, "loss": 0.1389, "lr": 4.983583790361497e-06, "epoch": 0.6652806652806653, "percentage": 13.31, "elapsed_time": "0:03:12", "remaining_time": "0:20:53", "throughput": 1341.24, "total_tokens": 258048} +{"current_steps": 645, "total_steps": 4810, "loss": 0.3054, "lr": 4.982529580803714e-06, "epoch": 0.6704781704781705, "percentage": 13.41, "elapsed_time": "0:03:12", "remaining_time": "0:20:44", "throughput": 1350.93, "total_tokens": 260352} +{"current_steps": 650, "total_steps": 4810, "loss": 0.2884, "lr": 4.981442685489659e-06, "epoch": 0.6756756756756757, "percentage": 13.51, "elapsed_time": "0:03:13", "remaining_time": "0:20:35", "throughput": 1358.61, "total_tokens": 262272} +{"current_steps": 655, "total_steps": 4810, "loss": 0.1599, "lr": 4.9803231187297305e-06, "epoch": 0.6808731808731808, "percentage": 13.62, "elapsed_time": "0:03:13", "remaining_time": "0:20:26", "throughput": 1366.93, "total_tokens": 264320} +{"current_steps": 660, "total_steps": 4810, "loss": 0.1946, "lr": 4.979170895264494e-06, "epoch": 0.6860706860706861, "percentage": 13.72, "elapsed_time": "0:03:13", "remaining_time": "0:20:17", "throughput": 1374.57, "total_tokens": 266240} +{"current_steps": 665, "total_steps": 4810, "loss": 0.2128, "lr": 4.977986030264483e-06, "epoch": 0.6912681912681913, "percentage": 13.83, "elapsed_time": "0:03:14", "remaining_time": "0:20:09", "throughput": 1382.51, "total_tokens": 268224} +{"current_steps": 670, "total_steps": 4810, "loss": 0.2326, "lr": 4.9767685393299946e-06, "epoch": 0.6964656964656964, "percentage": 13.93, "elapsed_time": "0:03:14", "remaining_time": "0:20:00", "throughput": 1390.75, "total_tokens": 270272} +{"current_steps": 675, "total_steps": 4810, "loss": 0.2276, "lr": 4.975518438490897e-06, "epoch": 0.7016632016632016, "percentage": 14.03, "elapsed_time": "0:03:14", "remaining_time": "0:19:52", "throughput": 1398.64, "total_tokens": 272256} +{"current_steps": 680, "total_steps": 4810, "loss": 0.1786, "lr": 4.974235744206405e-06, "epoch": 0.7068607068607069, "percentage": 14.14, "elapsed_time": "0:03:14", "remaining_time": "0:19:44", "throughput": 1406.5, "total_tokens": 274240} +{"current_steps": 685, "total_steps": 4810, "loss": 0.1923, "lr": 4.972920473364869e-06, "epoch": 0.7120582120582121, "percentage": 14.24, "elapsed_time": "0:03:15", "remaining_time": "0:19:36", "throughput": 1414.64, "total_tokens": 276288} +{"current_steps": 690, "total_steps": 4810, "loss": 0.1661, "lr": 4.971572643283557e-06, "epoch": 0.7172557172557172, "percentage": 14.35, "elapsed_time": "0:03:15", "remaining_time": "0:19:28", "throughput": 1422.45, "total_tokens": 278272} +{"current_steps": 695, "total_steps": 4810, "loss": 0.1867, "lr": 4.970192271708416e-06, "epoch": 0.7224532224532224, "percentage": 14.45, "elapsed_time": "0:03:15", "remaining_time": "0:19:20", "throughput": 1430.88, "total_tokens": 280384} +{"current_steps": 700, "total_steps": 4810, "loss": 0.3333, "lr": 4.968779376813849e-06, "epoch": 0.7276507276507277, "percentage": 14.55, "elapsed_time": "0:03:16", "remaining_time": "0:19:12", "throughput": 1438.63, "total_tokens": 282368} +{"current_steps": 705, "total_steps": 4810, "loss": 0.1327, "lr": 4.967333977202469e-06, "epoch": 0.7328482328482329, "percentage": 14.66, "elapsed_time": "0:03:16", "remaining_time": "0:19:04", "throughput": 1446.68, "total_tokens": 284416} +{"current_steps": 710, "total_steps": 4810, "loss": 0.2235, "lr": 4.965856091904855e-06, "epoch": 0.738045738045738, "percentage": 14.76, "elapsed_time": "0:03:16", "remaining_time": "0:18:57", "throughput": 1454.7, "total_tokens": 286464} +{"current_steps": 715, "total_steps": 4810, "loss": 0.3413, "lr": 4.964345740379307e-06, "epoch": 0.7432432432432432, "percentage": 14.86, "elapsed_time": "0:03:17", "remaining_time": "0:18:49", "throughput": 1462.39, "total_tokens": 288448} +{"current_steps": 720, "total_steps": 4810, "loss": 0.1906, "lr": 4.962802942511582e-06, "epoch": 0.7484407484407485, "percentage": 14.97, "elapsed_time": "0:03:17", "remaining_time": "0:18:42", "throughput": 1470.35, "total_tokens": 290496} +{"current_steps": 723, "total_steps": 4810, "eval_loss": 0.20943090319633484, "epoch": 0.7515592515592515, "percentage": 15.03, "elapsed_time": "0:03:18", "remaining_time": "0:18:43", "throughput": 1467.63, "total_tokens": 291712} +{"current_steps": 725, "total_steps": 4810, "loss": 0.2576, "lr": 4.961227718614634e-06, "epoch": 0.7536382536382537, "percentage": 15.07, "elapsed_time": "0:04:03", "remaining_time": "0:22:52", "throughput": 1200.99, "total_tokens": 292480} +{"current_steps": 730, "total_steps": 4810, "loss": 0.2352, "lr": 4.959620089428354e-06, "epoch": 0.7588357588357588, "percentage": 15.18, "elapsed_time": "0:04:03", "remaining_time": "0:22:42", "throughput": 1207.48, "total_tokens": 294464} +{"current_steps": 735, "total_steps": 4810, "loss": 0.2617, "lr": 4.957980076119285e-06, "epoch": 0.764033264033264, "percentage": 15.28, "elapsed_time": "0:04:04", "remaining_time": "0:22:33", "throughput": 1213.96, "total_tokens": 296448} +{"current_steps": 740, "total_steps": 4810, "loss": 0.2079, "lr": 4.956307700280354e-06, "epoch": 0.7692307692307693, "percentage": 15.38, "elapsed_time": "0:04:04", "remaining_time": "0:22:24", "throughput": 1220.42, "total_tokens": 298432} +{"current_steps": 745, "total_steps": 4810, "loss": 0.2712, "lr": 4.954602983930581e-06, "epoch": 0.7744282744282744, "percentage": 15.49, "elapsed_time": "0:04:04", "remaining_time": "0:22:16", "throughput": 1227.12, "total_tokens": 300480} +{"current_steps": 750, "total_steps": 4810, "loss": 0.2211, "lr": 4.95286594951479e-06, "epoch": 0.7796257796257796, "percentage": 15.59, "elapsed_time": "0:04:05", "remaining_time": "0:22:07", "throughput": 1233.29, "total_tokens": 302400} +{"current_steps": 755, "total_steps": 4810, "loss": 0.2161, "lr": 4.951096619903317e-06, "epoch": 0.7848232848232848, "percentage": 15.7, "elapsed_time": "0:04:05", "remaining_time": "0:21:58", "throughput": 1239.43, "total_tokens": 304320} +{"current_steps": 760, "total_steps": 4810, "loss": 0.1828, "lr": 4.949295018391707e-06, "epoch": 0.7900207900207901, "percentage": 15.8, "elapsed_time": "0:04:05", "remaining_time": "0:21:50", "throughput": 1245.58, "total_tokens": 306240} +{"current_steps": 765, "total_steps": 4810, "loss": 0.2155, "lr": 4.9474611687004025e-06, "epoch": 0.7952182952182952, "percentage": 15.9, "elapsed_time": "0:04:06", "remaining_time": "0:21:41", "throughput": 1251.17, "total_tokens": 308032} +{"current_steps": 770, "total_steps": 4810, "loss": 0.2009, "lr": 4.945595094974442e-06, "epoch": 0.8004158004158004, "percentage": 16.01, "elapsed_time": "0:04:06", "remaining_time": "0:21:33", "throughput": 1257.29, "total_tokens": 309952} +{"current_steps": 775, "total_steps": 4810, "loss": 0.1813, "lr": 4.94369682178313e-06, "epoch": 0.8056133056133056, "percentage": 16.11, "elapsed_time": "0:04:06", "remaining_time": "0:21:25", "throughput": 1263.69, "total_tokens": 311936} +{"current_steps": 780, "total_steps": 4810, "loss": 0.1603, "lr": 4.941766374119724e-06, "epoch": 0.8108108108108109, "percentage": 16.22, "elapsed_time": "0:04:07", "remaining_time": "0:21:17", "throughput": 1270.07, "total_tokens": 313920} +{"current_steps": 785, "total_steps": 4810, "loss": 0.2613, "lr": 4.939803777401096e-06, "epoch": 0.816008316008316, "percentage": 16.32, "elapsed_time": "0:04:07", "remaining_time": "0:21:08", "throughput": 1276.7, "total_tokens": 315968} +{"current_steps": 790, "total_steps": 4810, "loss": 0.2641, "lr": 4.937809057467404e-06, "epoch": 0.8212058212058212, "percentage": 16.42, "elapsed_time": "0:04:07", "remaining_time": "0:21:01", "throughput": 1283.05, "total_tokens": 317952} +{"current_steps": 795, "total_steps": 4810, "loss": 0.1934, "lr": 4.935782240581753e-06, "epoch": 0.8264033264033264, "percentage": 16.53, "elapsed_time": "0:04:08", "remaining_time": "0:20:53", "throughput": 1289.13, "total_tokens": 319872} +{"current_steps": 800, "total_steps": 4810, "loss": 0.2498, "lr": 4.933723353429842e-06, "epoch": 0.8316008316008316, "percentage": 16.63, "elapsed_time": "0:04:08", "remaining_time": "0:20:45", "throughput": 1295.44, "total_tokens": 321856} +{"current_steps": 805, "total_steps": 4810, "loss": 0.1671, "lr": 4.931632423119621e-06, "epoch": 0.8367983367983368, "percentage": 16.74, "elapsed_time": "0:04:08", "remaining_time": "0:20:37", "throughput": 1302.26, "total_tokens": 323968} +{"current_steps": 810, "total_steps": 4810, "loss": 0.2092, "lr": 4.929509477180929e-06, "epoch": 0.841995841995842, "percentage": 16.84, "elapsed_time": "0:04:09", "remaining_time": "0:20:30", "throughput": 1308.54, "total_tokens": 325952} +{"current_steps": 815, "total_steps": 4810, "loss": 0.0581, "lr": 4.927354543565131e-06, "epoch": 0.8471933471933472, "percentage": 16.94, "elapsed_time": "0:04:09", "remaining_time": "0:20:22", "throughput": 1315.07, "total_tokens": 328000} +{"current_steps": 820, "total_steps": 4810, "loss": 0.1592, "lr": 4.925167650644752e-06, "epoch": 0.8523908523908524, "percentage": 17.05, "elapsed_time": "0:04:09", "remaining_time": "0:20:15", "throughput": 1321.33, "total_tokens": 329984} +{"current_steps": 825, "total_steps": 4810, "loss": 0.4462, "lr": 4.922948827213107e-06, "epoch": 0.8575883575883576, "percentage": 17.15, "elapsed_time": "0:04:10", "remaining_time": "0:20:07", "throughput": 1327.32, "total_tokens": 331904} +{"current_steps": 830, "total_steps": 4810, "loss": 0.4518, "lr": 4.920698102483913e-06, "epoch": 0.8627858627858628, "percentage": 17.26, "elapsed_time": "0:04:10", "remaining_time": "0:20:00", "throughput": 1333.55, "total_tokens": 333888} +{"current_steps": 835, "total_steps": 4810, "loss": 0.2671, "lr": 4.9184155060909115e-06, "epoch": 0.867983367983368, "percentage": 17.36, "elapsed_time": "0:04:10", "remaining_time": "0:19:53", "throughput": 1339.76, "total_tokens": 335872} +{"current_steps": 840, "total_steps": 4810, "loss": 0.3681, "lr": 4.916101068087477e-06, "epoch": 0.8731808731808732, "percentage": 17.46, "elapsed_time": "0:04:11", "remaining_time": "0:19:46", "throughput": 1345.96, "total_tokens": 337856} +{"current_steps": 845, "total_steps": 4810, "loss": 0.2011, "lr": 4.9137548189462185e-06, "epoch": 0.8783783783783784, "percentage": 17.57, "elapsed_time": "0:04:11", "remaining_time": "0:19:39", "throughput": 1351.89, "total_tokens": 339776} +{"current_steps": 850, "total_steps": 4810, "loss": 0.1852, "lr": 4.911376789558584e-06, "epoch": 0.8835758835758836, "percentage": 17.67, "elapsed_time": "0:04:11", "remaining_time": "0:19:32", "throughput": 1358.05, "total_tokens": 341760} +{"current_steps": 855, "total_steps": 4810, "loss": 0.3553, "lr": 4.908967011234446e-06, "epoch": 0.8887733887733887, "percentage": 17.78, "elapsed_time": "0:04:11", "remaining_time": "0:19:25", "throughput": 1363.96, "total_tokens": 343680} +{"current_steps": 860, "total_steps": 4810, "loss": 0.2092, "lr": 4.9065255157016955e-06, "epoch": 0.893970893970894, "percentage": 17.88, "elapsed_time": "0:04:12", "remaining_time": "0:19:18", "throughput": 1369.84, "total_tokens": 345600} +{"current_steps": 865, "total_steps": 4810, "loss": 0.2165, "lr": 4.904052335105822e-06, "epoch": 0.8991683991683992, "percentage": 17.98, "elapsed_time": "0:04:12", "remaining_time": "0:19:12", "throughput": 1375.72, "total_tokens": 347520} +{"current_steps": 870, "total_steps": 4810, "loss": 0.1773, "lr": 4.90154750200949e-06, "epoch": 0.9043659043659044, "percentage": 18.09, "elapsed_time": "0:04:12", "remaining_time": "0:19:05", "throughput": 1382.07, "total_tokens": 349568} +{"current_steps": 875, "total_steps": 4810, "loss": 0.1146, "lr": 4.899011049392111e-06, "epoch": 0.9095634095634095, "percentage": 18.19, "elapsed_time": "0:04:13", "remaining_time": "0:18:58", "throughput": 1388.16, "total_tokens": 351552} +{"current_steps": 880, "total_steps": 4810, "loss": 0.1213, "lr": 4.896443010649408e-06, "epoch": 0.9147609147609148, "percentage": 18.3, "elapsed_time": "0:04:13", "remaining_time": "0:18:52", "throughput": 1393.98, "total_tokens": 353472} +{"current_steps": 885, "total_steps": 4810, "loss": 0.123, "lr": 4.893843419592977e-06, "epoch": 0.91995841995842, "percentage": 18.4, "elapsed_time": "0:04:13", "remaining_time": "0:18:46", "throughput": 1399.78, "total_tokens": 355392} +{"current_steps": 890, "total_steps": 4810, "loss": 0.1794, "lr": 4.891212310449845e-06, "epoch": 0.9251559251559252, "percentage": 18.5, "elapsed_time": "0:04:14", "remaining_time": "0:18:39", "throughput": 1406.08, "total_tokens": 357440} +{"current_steps": 895, "total_steps": 4810, "loss": 0.1822, "lr": 4.88854971786201e-06, "epoch": 0.9303534303534303, "percentage": 18.61, "elapsed_time": "0:04:14", "remaining_time": "0:18:33", "throughput": 1412.35, "total_tokens": 359488} +{"current_steps": 900, "total_steps": 4810, "loss": 0.282, "lr": 4.885855676885995e-06, "epoch": 0.9355509355509356, "percentage": 18.71, "elapsed_time": "0:04:14", "remaining_time": "0:18:27", "throughput": 1418.11, "total_tokens": 361408} +{"current_steps": 905, "total_steps": 4810, "loss": 0.1931, "lr": 4.88313022299238e-06, "epoch": 0.9407484407484408, "percentage": 18.81, "elapsed_time": "0:04:15", "remaining_time": "0:18:21", "throughput": 1424.11, "total_tokens": 363392} +{"current_steps": 910, "total_steps": 4810, "loss": 0.318, "lr": 4.880373392065339e-06, "epoch": 0.9459459459459459, "percentage": 18.92, "elapsed_time": "0:04:15", "remaining_time": "0:18:14", "throughput": 1430.35, "total_tokens": 365440} +{"current_steps": 915, "total_steps": 4810, "loss": 0.1793, "lr": 4.877585220402167e-06, "epoch": 0.9511434511434511, "percentage": 19.02, "elapsed_time": "0:04:15", "remaining_time": "0:18:08", "throughput": 1437.05, "total_tokens": 367616} +{"current_steps": 920, "total_steps": 4810, "loss": 0.1164, "lr": 4.874765744712796e-06, "epoch": 0.9563409563409564, "percentage": 19.13, "elapsed_time": "0:04:16", "remaining_time": "0:18:02", "throughput": 1443.0, "total_tokens": 369600} +{"current_steps": 925, "total_steps": 4810, "loss": 0.2515, "lr": 4.8719150021193206e-06, "epoch": 0.9615384615384616, "percentage": 19.23, "elapsed_time": "0:04:16", "remaining_time": "0:17:57", "throughput": 1448.7, "total_tokens": 371520} +{"current_steps": 930, "total_steps": 4810, "loss": 0.3492, "lr": 4.869033030155504e-06, "epoch": 0.9667359667359667, "percentage": 19.33, "elapsed_time": "0:04:16", "remaining_time": "0:17:51", "throughput": 1454.86, "total_tokens": 373568} +{"current_steps": 935, "total_steps": 4810, "loss": 0.1902, "lr": 4.866119866766286e-06, "epoch": 0.9719334719334719, "percentage": 19.44, "elapsed_time": "0:04:17", "remaining_time": "0:17:45", "throughput": 1460.53, "total_tokens": 375488} +{"current_steps": 940, "total_steps": 4810, "loss": 0.2238, "lr": 4.86317555030728e-06, "epoch": 0.9771309771309772, "percentage": 19.54, "elapsed_time": "0:04:17", "remaining_time": "0:17:39", "throughput": 1467.39, "total_tokens": 377728} +{"current_steps": 945, "total_steps": 4810, "loss": 0.11, "lr": 4.860200119544273e-06, "epoch": 0.9823284823284824, "percentage": 19.65, "elapsed_time": "0:04:17", "remaining_time": "0:17:34", "throughput": 1473.75, "total_tokens": 379840} +{"current_steps": 950, "total_steps": 4810, "loss": 0.2154, "lr": 4.857193613652711e-06, "epoch": 0.9875259875259875, "percentage": 19.75, "elapsed_time": "0:04:18", "remaining_time": "0:17:28", "throughput": 1479.37, "total_tokens": 381760} +{"current_steps": 955, "total_steps": 4810, "loss": 0.1666, "lr": 4.854156072217185e-06, "epoch": 0.9927234927234927, "percentage": 19.85, "elapsed_time": "0:04:18", "remaining_time": "0:17:22", "throughput": 1485.47, "total_tokens": 383808} +{"current_steps": 960, "total_steps": 4810, "loss": 0.2397, "lr": 4.851087535230911e-06, "epoch": 0.997920997920998, "percentage": 19.96, "elapsed_time": "0:04:18", "remaining_time": "0:17:17", "throughput": 1491.56, "total_tokens": 385856} +{"current_steps": 964, "total_steps": 4810, "eval_loss": 0.17627178132534027, "epoch": 1.002079002079002, "percentage": 20.04, "elapsed_time": "0:04:20", "remaining_time": "0:17:17", "throughput": 1490.01, "total_tokens": 387464} +{"current_steps": 965, "total_steps": 4810, "loss": 0.176, "lr": 4.8479880430952e-06, "epoch": 1.003118503118503, "percentage": 20.06, "elapsed_time": "0:05:53", "remaining_time": "0:23:28", "throughput": 1096.8, "total_tokens": 387848} +{"current_steps": 970, "total_steps": 4810, "loss": 0.0833, "lr": 4.844857636618928e-06, "epoch": 1.0083160083160083, "percentage": 20.17, "elapsed_time": "0:05:53", "remaining_time": "0:23:21", "throughput": 1100.85, "total_tokens": 389640} +{"current_steps": 975, "total_steps": 4810, "loss": 0.1134, "lr": 4.841696357018003e-06, "epoch": 1.0135135135135136, "percentage": 20.27, "elapsed_time": "0:05:54", "remaining_time": "0:23:13", "throughput": 1105.44, "total_tokens": 391624} +{"current_steps": 980, "total_steps": 4810, "loss": 0.0776, "lr": 4.838504245914812e-06, "epoch": 1.0187110187110187, "percentage": 20.37, "elapsed_time": "0:05:54", "remaining_time": "0:23:05", "throughput": 1110.2, "total_tokens": 393672} +{"current_steps": 985, "total_steps": 4810, "loss": 0.0266, "lr": 4.835281345337684e-06, "epoch": 1.023908523908524, "percentage": 20.48, "elapsed_time": "0:05:54", "remaining_time": "0:22:58", "throughput": 1115.14, "total_tokens": 395784} +{"current_steps": 990, "total_steps": 4810, "loss": 0.2075, "lr": 4.832027697720329e-06, "epoch": 1.0291060291060292, "percentage": 20.58, "elapsed_time": "0:05:55", "remaining_time": "0:22:50", "throughput": 1119.7, "total_tokens": 397768} +{"current_steps": 995, "total_steps": 4810, "loss": 0.4063, "lr": 4.828743345901285e-06, "epoch": 1.0343035343035343, "percentage": 20.69, "elapsed_time": "0:05:55", "remaining_time": "0:22:43", "throughput": 1124.44, "total_tokens": 399816} +{"current_steps": 1000, "total_steps": 4810, "loss": 0.1017, "lr": 4.825428333123346e-06, "epoch": 1.0395010395010396, "percentage": 20.79, "elapsed_time": "0:05:55", "remaining_time": "0:22:35", "throughput": 1129.35, "total_tokens": 401928} +{"current_steps": 1005, "total_steps": 4810, "loss": 0.0338, "lr": 4.822082703033003e-06, "epoch": 1.0446985446985446, "percentage": 20.89, "elapsed_time": "0:05:56", "remaining_time": "0:22:28", "throughput": 1133.89, "total_tokens": 403912} +{"current_steps": 1010, "total_steps": 4810, "loss": 0.1392, "lr": 4.818706499679862e-06, "epoch": 1.04989604989605, "percentage": 21.0, "elapsed_time": "0:05:56", "remaining_time": "0:22:21", "throughput": 1138.24, "total_tokens": 405832} +{"current_steps": 1015, "total_steps": 4810, "loss": 0.1168, "lr": 4.815299767516065e-06, "epoch": 1.0550935550935552, "percentage": 21.1, "elapsed_time": "0:05:56", "remaining_time": "0:22:14", "throughput": 1142.95, "total_tokens": 407880} +{"current_steps": 1020, "total_steps": 4810, "loss": 0.1001, "lr": 4.811862551395707e-06, "epoch": 1.0602910602910602, "percentage": 21.21, "elapsed_time": "0:05:57", "remaining_time": "0:22:07", "throughput": 1148.18, "total_tokens": 410120} +{"current_steps": 1025, "total_steps": 4810, "loss": 0.0944, "lr": 4.808394896574246e-06, "epoch": 1.0654885654885655, "percentage": 21.31, "elapsed_time": "0:05:57", "remaining_time": "0:22:00", "throughput": 1152.87, "total_tokens": 412168} +{"current_steps": 1030, "total_steps": 4810, "loss": 0.1433, "lr": 4.8048968487079e-06, "epoch": 1.0706860706860706, "percentage": 21.41, "elapsed_time": "0:05:57", "remaining_time": "0:21:53", "throughput": 1158.24, "total_tokens": 414472} +{"current_steps": 1035, "total_steps": 4810, "loss": 0.3131, "lr": 4.801368453853057e-06, "epoch": 1.0758835758835759, "percentage": 21.52, "elapsed_time": "0:05:58", "remaining_time": "0:21:46", "throughput": 1162.91, "total_tokens": 416520} +{"current_steps": 1040, "total_steps": 4810, "loss": 0.171, "lr": 4.79780975846566e-06, "epoch": 1.0810810810810811, "percentage": 21.62, "elapsed_time": "0:05:58", "remaining_time": "0:21:39", "throughput": 1167.57, "total_tokens": 418568} +{"current_steps": 1045, "total_steps": 4810, "loss": 0.1287, "lr": 4.7942208094006e-06, "epoch": 1.0862785862785862, "percentage": 21.73, "elapsed_time": "0:05:58", "remaining_time": "0:21:32", "throughput": 1171.86, "total_tokens": 420488} +{"current_steps": 1050, "total_steps": 4810, "loss": 0.1098, "lr": 4.790601653911094e-06, "epoch": 1.0914760914760915, "percentage": 21.83, "elapsed_time": "0:05:59", "remaining_time": "0:21:26", "throughput": 1176.33, "total_tokens": 422472} +{"current_steps": 1055, "total_steps": 4810, "loss": 0.297, "lr": 4.786952339648071e-06, "epoch": 1.0966735966735968, "percentage": 21.93, "elapsed_time": "0:05:59", "remaining_time": "0:21:19", "throughput": 1180.79, "total_tokens": 424456} +{"current_steps": 1060, "total_steps": 4810, "loss": 0.3308, "lr": 4.783272914659535e-06, "epoch": 1.1018711018711018, "percentage": 22.04, "elapsed_time": "0:05:59", "remaining_time": "0:21:12", "throughput": 1185.59, "total_tokens": 426568} +{"current_steps": 1065, "total_steps": 4810, "loss": 0.1061, "lr": 4.77956342738994e-06, "epoch": 1.107068607068607, "percentage": 22.14, "elapsed_time": "0:06:00", "remaining_time": "0:21:06", "throughput": 1190.03, "total_tokens": 428552} +{"current_steps": 1070, "total_steps": 4810, "loss": 0.0996, "lr": 4.775823926679549e-06, "epoch": 1.1122661122661124, "percentage": 22.25, "elapsed_time": "0:06:00", "remaining_time": "0:20:59", "throughput": 1194.29, "total_tokens": 430472} +{"current_steps": 1075, "total_steps": 4810, "loss": 0.1315, "lr": 4.77205446176379e-06, "epoch": 1.1174636174636174, "percentage": 22.35, "elapsed_time": "0:06:00", "remaining_time": "0:20:53", "throughput": 1198.36, "total_tokens": 432328} +{"current_steps": 1080, "total_steps": 4810, "loss": 0.2841, "lr": 4.768255082272612e-06, "epoch": 1.1226611226611227, "percentage": 22.45, "elapsed_time": "0:06:01", "remaining_time": "0:20:47", "throughput": 1203.13, "total_tokens": 434440} +{"current_steps": 1085, "total_steps": 4810, "loss": 0.0783, "lr": 4.764425838229823e-06, "epoch": 1.1278586278586278, "percentage": 22.56, "elapsed_time": "0:06:01", "remaining_time": "0:20:40", "throughput": 1207.72, "total_tokens": 436488} +{"current_steps": 1090, "total_steps": 4810, "loss": 0.346, "lr": 4.760566780052445e-06, "epoch": 1.133056133056133, "percentage": 22.66, "elapsed_time": "0:06:01", "remaining_time": "0:20:34", "throughput": 1212.12, "total_tokens": 438472} +{"current_steps": 1095, "total_steps": 4810, "loss": 0.4155, "lr": 4.756677958550035e-06, "epoch": 1.1382536382536383, "percentage": 22.77, "elapsed_time": "0:06:02", "remaining_time": "0:20:28", "throughput": 1216.52, "total_tokens": 440456} +{"current_steps": 1100, "total_steps": 4810, "loss": 0.1236, "lr": 4.752759424924026e-06, "epoch": 1.1434511434511434, "percentage": 22.87, "elapsed_time": "0:06:02", "remaining_time": "0:20:22", "throughput": 1220.9, "total_tokens": 442440} +{"current_steps": 1105, "total_steps": 4810, "loss": 0.099, "lr": 4.7488112307670515e-06, "epoch": 1.1486486486486487, "percentage": 22.97, "elapsed_time": "0:06:02", "remaining_time": "0:20:16", "throughput": 1225.29, "total_tokens": 444424} +{"current_steps": 1110, "total_steps": 4810, "loss": 0.1891, "lr": 4.7448334280622624e-06, "epoch": 1.1538461538461537, "percentage": 23.08, "elapsed_time": "0:06:03", "remaining_time": "0:20:10", "throughput": 1229.3, "total_tokens": 446280} +{"current_steps": 1115, "total_steps": 4810, "loss": 0.1802, "lr": 4.740826069182645e-06, "epoch": 1.159043659043659, "percentage": 23.18, "elapsed_time": "0:06:03", "remaining_time": "0:20:04", "throughput": 1233.66, "total_tokens": 448264} +{"current_steps": 1120, "total_steps": 4810, "loss": 0.2325, "lr": 4.736789206890332e-06, "epoch": 1.1642411642411643, "percentage": 23.28, "elapsed_time": "0:06:03", "remaining_time": "0:19:58", "throughput": 1238.37, "total_tokens": 450376} +{"current_steps": 1125, "total_steps": 4810, "loss": 0.1142, "lr": 4.732722894335909e-06, "epoch": 1.1694386694386694, "percentage": 23.39, "elapsed_time": "0:06:04", "remaining_time": "0:19:52", "throughput": 1243.24, "total_tokens": 452552} +{"current_steps": 1130, "total_steps": 4810, "loss": 0.1432, "lr": 4.728627185057711e-06, "epoch": 1.1746361746361746, "percentage": 23.49, "elapsed_time": "0:06:04", "remaining_time": "0:19:46", "throughput": 1247.75, "total_tokens": 454600} +{"current_steps": 1135, "total_steps": 4810, "loss": 0.1061, "lr": 4.724502132981119e-06, "epoch": 1.17983367983368, "percentage": 23.6, "elapsed_time": "0:06:04", "remaining_time": "0:19:40", "throughput": 1252.26, "total_tokens": 456648} +{"current_steps": 1140, "total_steps": 4810, "loss": 0.078, "lr": 4.720347792417851e-06, "epoch": 1.185031185031185, "percentage": 23.7, "elapsed_time": "0:06:04", "remaining_time": "0:19:34", "throughput": 1256.59, "total_tokens": 458632} +{"current_steps": 1145, "total_steps": 4810, "loss": 0.1068, "lr": 4.716164218065246e-06, "epoch": 1.1902286902286903, "percentage": 23.8, "elapsed_time": "0:06:05", "remaining_time": "0:19:29", "throughput": 1261.08, "total_tokens": 460680} +{"current_steps": 1150, "total_steps": 4810, "loss": 0.2177, "lr": 4.711951465005548e-06, "epoch": 1.1954261954261955, "percentage": 23.91, "elapsed_time": "0:06:05", "remaining_time": "0:19:23", "throughput": 1265.56, "total_tokens": 462728} +{"current_steps": 1155, "total_steps": 4810, "loss": 0.058, "lr": 4.707709588705169e-06, "epoch": 1.2006237006237006, "percentage": 24.01, "elapsed_time": "0:06:05", "remaining_time": "0:19:18", "throughput": 1270.03, "total_tokens": 464776} +{"current_steps": 1160, "total_steps": 4810, "loss": 0.3544, "lr": 4.7034386450139735e-06, "epoch": 1.2058212058212059, "percentage": 24.12, "elapsed_time": "0:06:06", "remaining_time": "0:19:12", "throughput": 1274.14, "total_tokens": 466696} +{"current_steps": 1165, "total_steps": 4810, "loss": 0.1744, "lr": 4.699138690164533e-06, "epoch": 1.211018711018711, "percentage": 24.22, "elapsed_time": "0:06:06", "remaining_time": "0:19:07", "throughput": 1278.26, "total_tokens": 468616} +{"current_steps": 1170, "total_steps": 4810, "loss": 0.1842, "lr": 4.694809780771391e-06, "epoch": 1.2162162162162162, "percentage": 24.32, "elapsed_time": "0:06:06", "remaining_time": "0:19:01", "throughput": 1282.89, "total_tokens": 470728} +{"current_steps": 1175, "total_steps": 4810, "loss": 0.1067, "lr": 4.690451973830314e-06, "epoch": 1.2214137214137215, "percentage": 24.43, "elapsed_time": "0:06:07", "remaining_time": "0:18:56", "throughput": 1287.34, "total_tokens": 472776} +{"current_steps": 1180, "total_steps": 4810, "loss": 0.177, "lr": 4.6860653267175425e-06, "epoch": 1.2266112266112266, "percentage": 24.53, "elapsed_time": "0:06:07", "remaining_time": "0:18:50", "throughput": 1291.76, "total_tokens": 474824} +{"current_steps": 1185, "total_steps": 4810, "loss": 0.2562, "lr": 4.681649897189036e-06, "epoch": 1.2318087318087318, "percentage": 24.64, "elapsed_time": "0:06:07", "remaining_time": "0:18:45", "throughput": 1295.83, "total_tokens": 476744} +{"current_steps": 1190, "total_steps": 4810, "loss": 0.053, "lr": 4.677205743379714e-06, "epoch": 1.237006237006237, "percentage": 24.74, "elapsed_time": "0:06:08", "remaining_time": "0:18:40", "throughput": 1300.4, "total_tokens": 478856} +{"current_steps": 1195, "total_steps": 4810, "loss": 0.1686, "lr": 4.672732923802685e-06, "epoch": 1.2422037422037422, "percentage": 24.84, "elapsed_time": "0:06:08", "remaining_time": "0:18:34", "throughput": 1304.45, "total_tokens": 480776} +{"current_steps": 1200, "total_steps": 4810, "loss": 0.0292, "lr": 4.6682314973484844e-06, "epoch": 1.2474012474012475, "percentage": 24.95, "elapsed_time": "0:06:08", "remaining_time": "0:18:29", "throughput": 1309.19, "total_tokens": 482952} +{"current_steps": 1205, "total_steps": 4810, "loss": 0.0622, "lr": 4.663701523284291e-06, "epoch": 1.2525987525987525, "percentage": 25.05, "elapsed_time": "0:06:09", "remaining_time": "0:18:24", "throughput": 1314.1, "total_tokens": 485192} +{"current_steps": 1205, "total_steps": 4810, "eval_loss": 0.26757940649986267, "epoch": 1.2525987525987525, "percentage": 25.05, "elapsed_time": "0:06:10", "remaining_time": "0:18:27", "throughput": 1310.35, "total_tokens": 485192} +{"current_steps": 1210, "total_steps": 4810, "loss": 0.1299, "lr": 4.659143061253152e-06, "epoch": 1.2577962577962578, "percentage": 25.16, "elapsed_time": "0:07:02", "remaining_time": "0:20:55", "throughput": 1153.92, "total_tokens": 487112} +{"current_steps": 1215, "total_steps": 4810, "loss": 0.2685, "lr": 4.654556171273196e-06, "epoch": 1.262993762993763, "percentage": 25.26, "elapsed_time": "0:07:02", "remaining_time": "0:20:49", "throughput": 1157.89, "total_tokens": 489160} +{"current_steps": 1220, "total_steps": 4810, "loss": 0.2017, "lr": 4.649940913736841e-06, "epoch": 1.2681912681912682, "percentage": 25.36, "elapsed_time": "0:07:02", "remaining_time": "0:20:44", "throughput": 1161.54, "total_tokens": 491080} +{"current_steps": 1225, "total_steps": 4810, "loss": 0.0607, "lr": 4.645297349410005e-06, "epoch": 1.2733887733887734, "percentage": 25.47, "elapsed_time": "0:07:03", "remaining_time": "0:20:38", "throughput": 1165.34, "total_tokens": 493064} +{"current_steps": 1230, "total_steps": 4810, "loss": 0.1537, "lr": 4.640625539431298e-06, "epoch": 1.2785862785862787, "percentage": 25.57, "elapsed_time": "0:07:03", "remaining_time": "0:20:32", "throughput": 1168.98, "total_tokens": 494984} +{"current_steps": 1235, "total_steps": 4810, "loss": 0.2946, "lr": 4.635925545311224e-06, "epoch": 1.2837837837837838, "percentage": 25.68, "elapsed_time": "0:07:03", "remaining_time": "0:20:26", "throughput": 1172.76, "total_tokens": 496968} +{"current_steps": 1240, "total_steps": 4810, "loss": 0.0799, "lr": 4.631197428931365e-06, "epoch": 1.288981288981289, "percentage": 25.78, "elapsed_time": "0:07:04", "remaining_time": "0:20:20", "throughput": 1176.24, "total_tokens": 498824} +{"current_steps": 1245, "total_steps": 4810, "loss": 0.0804, "lr": 4.626441252543572e-06, "epoch": 1.2941787941787941, "percentage": 25.88, "elapsed_time": "0:07:04", "remaining_time": "0:20:15", "throughput": 1180.02, "total_tokens": 500808} +{"current_steps": 1250, "total_steps": 4810, "loss": 0.251, "lr": 4.621657078769143e-06, "epoch": 1.2993762993762994, "percentage": 25.99, "elapsed_time": "0:07:04", "remaining_time": "0:20:09", "throughput": 1183.94, "total_tokens": 502856} +{"current_steps": 1255, "total_steps": 4810, "loss": 0.0856, "lr": 4.616844970597996e-06, "epoch": 1.3045738045738045, "percentage": 26.09, "elapsed_time": "0:07:05", "remaining_time": "0:20:04", "throughput": 1187.4, "total_tokens": 504712} +{"current_steps": 1260, "total_steps": 4810, "loss": 0.3719, "lr": 4.612004991387843e-06, "epoch": 1.3097713097713097, "percentage": 26.2, "elapsed_time": "0:07:05", "remaining_time": "0:19:58", "throughput": 1191.16, "total_tokens": 506696} +{"current_steps": 1265, "total_steps": 4810, "loss": 0.0936, "lr": 4.607137204863356e-06, "epoch": 1.314968814968815, "percentage": 26.3, "elapsed_time": "0:07:05", "remaining_time": "0:19:53", "throughput": 1194.85, "total_tokens": 508680} +{"current_steps": 1270, "total_steps": 4810, "loss": 0.1072, "lr": 4.602241675115326e-06, "epoch": 1.32016632016632, "percentage": 26.4, "elapsed_time": "0:07:06", "remaining_time": "0:19:47", "throughput": 1198.75, "total_tokens": 510728} +{"current_steps": 1275, "total_steps": 4810, "loss": 0.0841, "lr": 4.597318466599819e-06, "epoch": 1.3253638253638254, "percentage": 26.51, "elapsed_time": "0:07:06", "remaining_time": "0:19:42", "throughput": 1202.49, "total_tokens": 512712} +{"current_steps": 1280, "total_steps": 4810, "loss": 0.1067, "lr": 4.592367644137329e-06, "epoch": 1.3305613305613306, "percentage": 26.61, "elapsed_time": "0:07:06", "remaining_time": "0:19:36", "throughput": 1206.21, "total_tokens": 514696} +{"current_steps": 1285, "total_steps": 4810, "loss": 0.1895, "lr": 4.587389272911923e-06, "epoch": 1.3357588357588357, "percentage": 26.72, "elapsed_time": "0:07:07", "remaining_time": "0:19:31", "throughput": 1210.23, "total_tokens": 516808} +{"current_steps": 1290, "total_steps": 4810, "loss": 0.2118, "lr": 4.582383418470386e-06, "epoch": 1.340956340956341, "percentage": 26.82, "elapsed_time": "0:07:07", "remaining_time": "0:19:26", "throughput": 1213.95, "total_tokens": 518792} +{"current_steps": 1295, "total_steps": 4810, "loss": 0.1325, "lr": 4.5773501467213525e-06, "epoch": 1.3461538461538463, "percentage": 26.92, "elapsed_time": "0:07:07", "remaining_time": "0:19:20", "throughput": 1217.82, "total_tokens": 520840} +{"current_steps": 1300, "total_steps": 4810, "loss": 0.0526, "lr": 4.572289523934444e-06, "epoch": 1.3513513513513513, "percentage": 27.03, "elapsed_time": "0:07:08", "remaining_time": "0:19:15", "throughput": 1221.39, "total_tokens": 522760} +{"current_steps": 1305, "total_steps": 4810, "loss": 0.2152, "lr": 4.567201616739393e-06, "epoch": 1.3565488565488566, "percentage": 27.13, "elapsed_time": "0:07:08", "remaining_time": "0:19:10", "throughput": 1225.4, "total_tokens": 524872} +{"current_steps": 1310, "total_steps": 4810, "loss": 0.1978, "lr": 4.562086492125167e-06, "epoch": 1.3617463617463619, "percentage": 27.23, "elapsed_time": "0:07:08", "remaining_time": "0:19:05", "throughput": 1229.26, "total_tokens": 526920} +{"current_steps": 1315, "total_steps": 4810, "loss": 0.1374, "lr": 4.5569442174390885e-06, "epoch": 1.366943866943867, "percentage": 27.34, "elapsed_time": "0:07:08", "remaining_time": "0:19:00", "throughput": 1233.11, "total_tokens": 528968} +{"current_steps": 1320, "total_steps": 4810, "loss": 0.0818, "lr": 4.551774860385944e-06, "epoch": 1.3721413721413722, "percentage": 27.44, "elapsed_time": "0:07:09", "remaining_time": "0:18:55", "throughput": 1236.66, "total_tokens": 530888} +{"current_steps": 1325, "total_steps": 4810, "loss": 0.1644, "lr": 4.546578489027095e-06, "epoch": 1.3773388773388773, "percentage": 27.55, "elapsed_time": "0:07:09", "remaining_time": "0:18:49", "throughput": 1240.36, "total_tokens": 532872} +{"current_steps": 1330, "total_steps": 4810, "loss": 0.118, "lr": 4.541355171779582e-06, "epoch": 1.3825363825363826, "percentage": 27.65, "elapsed_time": "0:07:09", "remaining_time": "0:18:44", "throughput": 1244.19, "total_tokens": 534920} +{"current_steps": 1335, "total_steps": 4810, "loss": 0.0039, "lr": 4.536104977415225e-06, "epoch": 1.3877338877338876, "percentage": 27.75, "elapsed_time": "0:07:10", "remaining_time": "0:18:39", "throughput": 1247.73, "total_tokens": 536840} +{"current_steps": 1340, "total_steps": 4810, "loss": 0.3705, "lr": 4.530827975059715e-06, "epoch": 1.392931392931393, "percentage": 27.86, "elapsed_time": "0:07:10", "remaining_time": "0:18:34", "throughput": 1251.26, "total_tokens": 538760} +{"current_steps": 1345, "total_steps": 4810, "loss": 0.2364, "lr": 4.525524234191705e-06, "epoch": 1.3981288981288982, "percentage": 27.96, "elapsed_time": "0:07:10", "remaining_time": "0:18:30", "throughput": 1254.78, "total_tokens": 540680} +{"current_steps": 1350, "total_steps": 4810, "loss": 0.1405, "lr": 4.520193824641898e-06, "epoch": 1.4033264033264032, "percentage": 28.07, "elapsed_time": "0:07:11", "remaining_time": "0:18:25", "throughput": 1258.44, "total_tokens": 542664} +{"current_steps": 1355, "total_steps": 4810, "loss": 0.1596, "lr": 4.51483681659212e-06, "epoch": 1.4085239085239085, "percentage": 28.17, "elapsed_time": "0:07:11", "remaining_time": "0:18:20", "throughput": 1262.25, "total_tokens": 544712} +{"current_steps": 1360, "total_steps": 4810, "loss": 0.2662, "lr": 4.5094532805744075e-06, "epoch": 1.4137214137214138, "percentage": 28.27, "elapsed_time": "0:07:11", "remaining_time": "0:18:15", "throughput": 1266.2, "total_tokens": 546824} +{"current_steps": 1365, "total_steps": 4810, "loss": 0.0791, "lr": 4.504043287470068e-06, "epoch": 1.4189189189189189, "percentage": 28.38, "elapsed_time": "0:07:12", "remaining_time": "0:18:10", "throughput": 1270.12, "total_tokens": 548936} +{"current_steps": 1370, "total_steps": 4810, "loss": 0.1218, "lr": 4.498606908508754e-06, "epoch": 1.4241164241164241, "percentage": 28.48, "elapsed_time": "0:07:12", "remaining_time": "0:18:06", "throughput": 1273.73, "total_tokens": 550920} +{"current_steps": 1375, "total_steps": 4810, "loss": 0.0307, "lr": 4.493144215267519e-06, "epoch": 1.4293139293139294, "percentage": 28.59, "elapsed_time": "0:07:12", "remaining_time": "0:18:01", "throughput": 1277.37, "total_tokens": 552904} +{"current_steps": 1380, "total_steps": 4810, "loss": 0.1616, "lr": 4.4876552796698814e-06, "epoch": 1.4345114345114345, "percentage": 28.69, "elapsed_time": "0:07:13", "remaining_time": "0:17:56", "throughput": 1280.84, "total_tokens": 554824} +{"current_steps": 1385, "total_steps": 4810, "loss": 0.214, "lr": 4.482140173984875e-06, "epoch": 1.4397089397089398, "percentage": 28.79, "elapsed_time": "0:07:13", "remaining_time": "0:17:51", "throughput": 1284.62, "total_tokens": 556872} +{"current_steps": 1390, "total_steps": 4810, "loss": 0.1453, "lr": 4.476598970826093e-06, "epoch": 1.444906444906445, "percentage": 28.9, "elapsed_time": "0:07:13", "remaining_time": "0:17:47", "throughput": 1288.53, "total_tokens": 558984} +{"current_steps": 1395, "total_steps": 4810, "loss": 0.2061, "lr": 4.471031743150744e-06, "epoch": 1.45010395010395, "percentage": 29.0, "elapsed_time": "0:07:14", "remaining_time": "0:17:42", "throughput": 1292.14, "total_tokens": 560968} +{"current_steps": 1400, "total_steps": 4810, "loss": 0.2358, "lr": 4.465438564258673e-06, "epoch": 1.4553014553014554, "percentage": 29.11, "elapsed_time": "0:07:14", "remaining_time": "0:17:38", "throughput": 1295.74, "total_tokens": 562952} +{"current_steps": 1405, "total_steps": 4810, "loss": 0.0357, "lr": 4.459819507791415e-06, "epoch": 1.4604989604989604, "percentage": 29.21, "elapsed_time": "0:07:14", "remaining_time": "0:17:33", "throughput": 1299.63, "total_tokens": 565064} +{"current_steps": 1410, "total_steps": 4810, "loss": 0.1194, "lr": 4.454174647731213e-06, "epoch": 1.4656964656964657, "percentage": 29.31, "elapsed_time": "0:07:15", "remaining_time": "0:17:29", "throughput": 1303.37, "total_tokens": 567112} +{"current_steps": 1415, "total_steps": 4810, "loss": 0.2261, "lr": 4.448504058400052e-06, "epoch": 1.4708939708939708, "percentage": 29.42, "elapsed_time": "0:07:15", "remaining_time": "0:17:24", "throughput": 1307.11, "total_tokens": 569160} +{"current_steps": 1420, "total_steps": 4810, "loss": 0.1794, "lr": 4.4428078144586715e-06, "epoch": 1.476091476091476, "percentage": 29.52, "elapsed_time": "0:07:15", "remaining_time": "0:17:20", "throughput": 1311.13, "total_tokens": 571336} +{"current_steps": 1425, "total_steps": 4810, "loss": 0.2622, "lr": 4.437085990905591e-06, "epoch": 1.4812889812889813, "percentage": 29.63, "elapsed_time": "0:07:16", "remaining_time": "0:17:15", "throughput": 1314.86, "total_tokens": 573384} +{"current_steps": 1430, "total_steps": 4810, "loss": 0.1625, "lr": 4.431338663076119e-06, "epoch": 1.4864864864864864, "percentage": 29.73, "elapsed_time": "0:07:16", "remaining_time": "0:17:11", "throughput": 1318.3, "total_tokens": 575304} +{"current_steps": 1435, "total_steps": 4810, "loss": 0.0647, "lr": 4.42556590664136e-06, "epoch": 1.4916839916839917, "percentage": 29.83, "elapsed_time": "0:07:16", "remaining_time": "0:17:07", "throughput": 1321.57, "total_tokens": 577160} +{"current_steps": 1440, "total_steps": 4810, "loss": 0.11, "lr": 4.41976779760722e-06, "epoch": 1.496881496881497, "percentage": 29.94, "elapsed_time": "0:07:17", "remaining_time": "0:17:02", "throughput": 1325.26, "total_tokens": 579208} +{"current_steps": 1445, "total_steps": 4810, "loss": 0.0911, "lr": 4.413944412313405e-06, "epoch": 1.502079002079002, "percentage": 30.04, "elapsed_time": "0:07:17", "remaining_time": "0:16:58", "throughput": 1328.96, "total_tokens": 581256} +{"current_steps": 1446, "total_steps": 4810, "eval_loss": 0.3145564794540405, "epoch": 1.503118503118503, "percentage": 30.06, "elapsed_time": "0:07:18", "remaining_time": "0:17:00", "throughput": 1326.51, "total_tokens": 581704} +{"current_steps": 1450, "total_steps": 4810, "loss": 0.1191, "lr": 4.408095827432416e-06, "epoch": 1.5072765072765073, "percentage": 30.15, "elapsed_time": "0:08:14", "remaining_time": "0:19:04", "throughput": 1180.65, "total_tokens": 583304} +{"current_steps": 1455, "total_steps": 4810, "loss": 0.3479, "lr": 4.40222211996854e-06, "epoch": 1.5124740124740126, "percentage": 30.25, "elapsed_time": "0:08:14", "remaining_time": "0:18:59", "throughput": 1183.77, "total_tokens": 585224} +{"current_steps": 1460, "total_steps": 4810, "loss": 0.2617, "lr": 4.396323367256836e-06, "epoch": 1.5176715176715176, "percentage": 30.35, "elapsed_time": "0:08:14", "remaining_time": "0:18:55", "throughput": 1187.13, "total_tokens": 587272} +{"current_steps": 1465, "total_steps": 4810, "loss": 0.1985, "lr": 4.390399646962117e-06, "epoch": 1.5228690228690227, "percentage": 30.46, "elapsed_time": "0:08:15", "remaining_time": "0:18:50", "throughput": 1190.49, "total_tokens": 589320} +{"current_steps": 1470, "total_steps": 4810, "loss": 0.1369, "lr": 4.384451037077924e-06, "epoch": 1.5280665280665282, "percentage": 30.56, "elapsed_time": "0:08:15", "remaining_time": "0:18:45", "throughput": 1193.72, "total_tokens": 591304} +{"current_steps": 1475, "total_steps": 4810, "loss": 0.1433, "lr": 4.378477615925506e-06, "epoch": 1.5332640332640333, "percentage": 30.67, "elapsed_time": "0:08:15", "remaining_time": "0:18:40", "throughput": 1196.82, "total_tokens": 593224} +{"current_steps": 1480, "total_steps": 4810, "loss": 0.1273, "lr": 4.372479462152781e-06, "epoch": 1.5384615384615383, "percentage": 30.77, "elapsed_time": "0:08:15", "remaining_time": "0:18:35", "throughput": 1200.29, "total_tokens": 595336} +{"current_steps": 1485, "total_steps": 4810, "loss": 0.2715, "lr": 4.366456654733308e-06, "epoch": 1.5436590436590436, "percentage": 30.87, "elapsed_time": "0:08:16", "remaining_time": "0:18:31", "throughput": 1203.38, "total_tokens": 597256} +{"current_steps": 1490, "total_steps": 4810, "loss": 0.1859, "lr": 4.360409272965242e-06, "epoch": 1.5488565488565489, "percentage": 30.98, "elapsed_time": "0:08:16", "remaining_time": "0:18:26", "throughput": 1206.72, "total_tokens": 599304} +{"current_steps": 1495, "total_steps": 4810, "loss": 0.0745, "lr": 4.354337396470291e-06, "epoch": 1.554054054054054, "percentage": 31.08, "elapsed_time": "0:08:16", "remaining_time": "0:18:21", "throughput": 1209.93, "total_tokens": 601288} +{"current_steps": 1500, "total_steps": 4810, "loss": 0.1641, "lr": 4.348241105192668e-06, "epoch": 1.5592515592515592, "percentage": 31.19, "elapsed_time": "0:08:17", "remaining_time": "0:18:17", "throughput": 1213.13, "total_tokens": 603272} +{"current_steps": 1505, "total_steps": 4810, "loss": 0.1365, "lr": 4.34212047939804e-06, "epoch": 1.5644490644490645, "percentage": 31.29, "elapsed_time": "0:08:17", "remaining_time": "0:18:12", "throughput": 1216.33, "total_tokens": 605256} +{"current_steps": 1510, "total_steps": 4810, "loss": 0.0868, "lr": 4.335975599672469e-06, "epoch": 1.5696465696465696, "percentage": 31.39, "elapsed_time": "0:08:17", "remaining_time": "0:18:08", "throughput": 1219.66, "total_tokens": 607304} +{"current_steps": 1515, "total_steps": 4810, "loss": 0.1281, "lr": 4.329806546921354e-06, "epoch": 1.5748440748440748, "percentage": 31.5, "elapsed_time": "0:08:18", "remaining_time": "0:18:03", "throughput": 1222.72, "total_tokens": 609224} +{"current_steps": 1520, "total_steps": 4810, "loss": 0.0465, "lr": 4.3236134023683565e-06, "epoch": 1.5800415800415801, "percentage": 31.6, "elapsed_time": "0:08:18", "remaining_time": "0:17:59", "throughput": 1226.16, "total_tokens": 611336} +{"current_steps": 1525, "total_steps": 4810, "loss": 0.1156, "lr": 4.3173962475543475e-06, "epoch": 1.5852390852390852, "percentage": 31.7, "elapsed_time": "0:08:18", "remaining_time": "0:17:54", "throughput": 1229.34, "total_tokens": 613320} +{"current_steps": 1530, "total_steps": 4810, "loss": 0.2405, "lr": 4.311155164336318e-06, "epoch": 1.5904365904365905, "percentage": 31.81, "elapsed_time": "0:08:19", "remaining_time": "0:17:50", "throughput": 1232.26, "total_tokens": 615176} +{"current_steps": 1535, "total_steps": 4810, "loss": 0.1673, "lr": 4.3048902348863116e-06, "epoch": 1.5956340956340958, "percentage": 31.91, "elapsed_time": "0:08:19", "remaining_time": "0:17:45", "throughput": 1235.56, "total_tokens": 617224} +{"current_steps": 1540, "total_steps": 4810, "loss": 0.1683, "lr": 4.298601541690336e-06, "epoch": 1.6008316008316008, "percentage": 32.02, "elapsed_time": "0:08:19", "remaining_time": "0:17:41", "throughput": 1238.73, "total_tokens": 619208} +{"current_steps": 1545, "total_steps": 4810, "loss": 0.221, "lr": 4.292289167547281e-06, "epoch": 1.6060291060291059, "percentage": 32.12, "elapsed_time": "0:08:20", "remaining_time": "0:17:37", "throughput": 1241.9, "total_tokens": 621192} +{"current_steps": 1550, "total_steps": 4810, "loss": 0.1458, "lr": 4.285953195567827e-06, "epoch": 1.6112266112266114, "percentage": 32.22, "elapsed_time": "0:08:20", "remaining_time": "0:17:32", "throughput": 1245.06, "total_tokens": 623176} +{"current_steps": 1555, "total_steps": 4810, "loss": 0.246, "lr": 4.279593709173352e-06, "epoch": 1.6164241164241164, "percentage": 32.33, "elapsed_time": "0:08:20", "remaining_time": "0:17:28", "throughput": 1248.21, "total_tokens": 625160} +{"current_steps": 1560, "total_steps": 4810, "loss": 0.1381, "lr": 4.27321079209483e-06, "epoch": 1.6216216216216215, "percentage": 32.43, "elapsed_time": "0:08:21", "remaining_time": "0:17:24", "throughput": 1251.36, "total_tokens": 627144} +{"current_steps": 1565, "total_steps": 4810, "loss": 0.1634, "lr": 4.266804528371732e-06, "epoch": 1.6268191268191268, "percentage": 32.54, "elapsed_time": "0:08:21", "remaining_time": "0:17:19", "throughput": 1254.63, "total_tokens": 629192} +{"current_steps": 1570, "total_steps": 4810, "loss": 0.1174, "lr": 4.260375002350917e-06, "epoch": 1.632016632016632, "percentage": 32.64, "elapsed_time": "0:08:21", "remaining_time": "0:17:15", "throughput": 1257.91, "total_tokens": 631240} +{"current_steps": 1575, "total_steps": 4810, "loss": 0.2274, "lr": 4.253922298685525e-06, "epoch": 1.637214137214137, "percentage": 32.74, "elapsed_time": "0:08:22", "remaining_time": "0:17:11", "throughput": 1261.05, "total_tokens": 633224} +{"current_steps": 1580, "total_steps": 4810, "loss": 0.1367, "lr": 4.2474465023338586e-06, "epoch": 1.6424116424116424, "percentage": 32.85, "elapsed_time": "0:08:22", "remaining_time": "0:17:07", "throughput": 1264.19, "total_tokens": 635208} +{"current_steps": 1585, "total_steps": 4810, "loss": 0.1048, "lr": 4.2409476985582645e-06, "epoch": 1.6476091476091477, "percentage": 32.95, "elapsed_time": "0:08:22", "remaining_time": "0:17:03", "throughput": 1267.45, "total_tokens": 637256} +{"current_steps": 1590, "total_steps": 4810, "loss": 0.0156, "lr": 4.234425972924014e-06, "epoch": 1.6528066528066527, "percentage": 33.06, "elapsed_time": "0:08:23", "remaining_time": "0:16:58", "throughput": 1270.46, "total_tokens": 639176} +{"current_steps": 1595, "total_steps": 4810, "loss": 0.1551, "lr": 4.227881411298175e-06, "epoch": 1.658004158004158, "percentage": 33.16, "elapsed_time": "0:08:23", "remaining_time": "0:16:54", "throughput": 1273.71, "total_tokens": 641224} +{"current_steps": 1600, "total_steps": 4810, "loss": 0.1125, "lr": 4.221314099848481e-06, "epoch": 1.6632016632016633, "percentage": 33.26, "elapsed_time": "0:08:23", "remaining_time": "0:16:50", "throughput": 1276.7, "total_tokens": 643144} +{"current_steps": 1605, "total_steps": 4810, "loss": 0.1457, "lr": 4.214724125042195e-06, "epoch": 1.6683991683991684, "percentage": 33.37, "elapsed_time": "0:08:24", "remaining_time": "0:16:46", "throughput": 1279.44, "total_tokens": 644936} +{"current_steps": 1610, "total_steps": 4810, "loss": 0.1623, "lr": 4.208111573644975e-06, "epoch": 1.6735966735966736, "percentage": 33.47, "elapsed_time": "0:08:24", "remaining_time": "0:16:42", "throughput": 1282.68, "total_tokens": 646984} +{"current_steps": 1615, "total_steps": 4810, "loss": 0.2052, "lr": 4.2014765327197285e-06, "epoch": 1.678794178794179, "percentage": 33.58, "elapsed_time": "0:08:24", "remaining_time": "0:16:38", "throughput": 1285.92, "total_tokens": 649032} +{"current_steps": 1620, "total_steps": 4810, "loss": 0.2047, "lr": 4.194819089625466e-06, "epoch": 1.683991683991684, "percentage": 33.68, "elapsed_time": "0:08:25", "remaining_time": "0:16:34", "throughput": 1289.15, "total_tokens": 651080} +{"current_steps": 1625, "total_steps": 4810, "loss": 0.2123, "lr": 4.188139332016154e-06, "epoch": 1.689189189189189, "percentage": 33.78, "elapsed_time": "0:08:25", "remaining_time": "0:16:30", "throughput": 1292.13, "total_tokens": 653000} +{"current_steps": 1630, "total_steps": 4810, "loss": 0.2089, "lr": 4.181437347839559e-06, "epoch": 1.6943866943866945, "percentage": 33.89, "elapsed_time": "0:08:25", "remaining_time": "0:16:26", "throughput": 1295.11, "total_tokens": 654920} +{"current_steps": 1635, "total_steps": 4810, "loss": 0.1685, "lr": 4.174713225336087e-06, "epoch": 1.6995841995841996, "percentage": 33.99, "elapsed_time": "0:08:26", "remaining_time": "0:16:22", "throughput": 1298.2, "total_tokens": 656904} +{"current_steps": 1640, "total_steps": 4810, "loss": 0.105, "lr": 4.167967053037625e-06, "epoch": 1.7047817047817047, "percentage": 34.1, "elapsed_time": "0:08:26", "remaining_time": "0:16:18", "throughput": 1301.42, "total_tokens": 658952} +{"current_steps": 1645, "total_steps": 4810, "loss": 0.0899, "lr": 4.161198919766375e-06, "epoch": 1.70997920997921, "percentage": 34.2, "elapsed_time": "0:08:26", "remaining_time": "0:16:14", "throughput": 1304.39, "total_tokens": 660872} +{"current_steps": 1650, "total_steps": 4810, "loss": 0.2054, "lr": 4.154408914633685e-06, "epoch": 1.7151767151767152, "percentage": 34.3, "elapsed_time": "0:08:26", "remaining_time": "0:16:10", "throughput": 1307.47, "total_tokens": 662856} +{"current_steps": 1655, "total_steps": 4810, "loss": 0.2025, "lr": 4.147597127038873e-06, "epoch": 1.7203742203742203, "percentage": 34.41, "elapsed_time": "0:08:27", "remaining_time": "0:16:07", "throughput": 1310.67, "total_tokens": 664904} +{"current_steps": 1660, "total_steps": 4810, "loss": 0.141, "lr": 4.140763646668051e-06, "epoch": 1.7255717255717256, "percentage": 34.51, "elapsed_time": "0:08:27", "remaining_time": "0:16:03", "throughput": 1313.75, "total_tokens": 666888} +{"current_steps": 1665, "total_steps": 4810, "loss": 0.0252, "lr": 4.133908563492949e-06, "epoch": 1.7307692307692308, "percentage": 34.62, "elapsed_time": "0:08:27", "remaining_time": "0:15:59", "throughput": 1316.95, "total_tokens": 668936} +{"current_steps": 1670, "total_steps": 4810, "loss": 0.2066, "lr": 4.12703196776972e-06, "epoch": 1.735966735966736, "percentage": 34.72, "elapsed_time": "0:08:28", "remaining_time": "0:15:55", "throughput": 1319.89, "total_tokens": 670856} +{"current_steps": 1675, "total_steps": 4810, "loss": 0.3627, "lr": 4.120133950037763e-06, "epoch": 1.7411642411642412, "percentage": 34.82, "elapsed_time": "0:08:28", "remaining_time": "0:15:51", "throughput": 1322.95, "total_tokens": 672840} +{"current_steps": 1680, "total_steps": 4810, "loss": 0.2218, "lr": 4.113214601118523e-06, "epoch": 1.7463617463617465, "percentage": 34.93, "elapsed_time": "0:08:28", "remaining_time": "0:15:48", "throughput": 1326.01, "total_tokens": 674824} +{"current_steps": 1685, "total_steps": 4810, "loss": 0.1042, "lr": 4.106274012114302e-06, "epoch": 1.7515592515592515, "percentage": 35.03, "elapsed_time": "0:08:29", "remaining_time": "0:15:44", "throughput": 1329.06, "total_tokens": 676808} +{"current_steps": 1687, "total_steps": 4810, "eval_loss": 0.2114141583442688, "epoch": 1.7536382536382535, "percentage": 35.07, "elapsed_time": "0:08:30", "remaining_time": "0:15:44", "throughput": 1327.52, "total_tokens": 677576} +{"current_steps": 1690, "total_steps": 4810, "loss": 0.1712, "lr": 4.099312274407049e-06, "epoch": 1.7567567567567568, "percentage": 35.14, "elapsed_time": "0:10:08", "remaining_time": "0:18:42", "throughput": 1115.94, "total_tokens": 678728} +{"current_steps": 1695, "total_steps": 4810, "loss": 0.1031, "lr": 4.092329479657168e-06, "epoch": 1.761954261954262, "percentage": 35.24, "elapsed_time": "0:10:08", "remaining_time": "0:18:38", "throughput": 1118.71, "total_tokens": 680776} +{"current_steps": 1700, "total_steps": 4810, "loss": 0.1288, "lr": 4.085325719802307e-06, "epoch": 1.7671517671517671, "percentage": 35.34, "elapsed_time": "0:10:08", "remaining_time": "0:18:33", "throughput": 1121.79, "total_tokens": 683016} +{"current_steps": 1705, "total_steps": 4810, "loss": 0.0556, "lr": 4.0783010870561445e-06, "epoch": 1.7723492723492722, "percentage": 35.45, "elapsed_time": "0:10:09", "remaining_time": "0:18:29", "throughput": 1124.87, "total_tokens": 685256} +{"current_steps": 1710, "total_steps": 4810, "loss": 0.3125, "lr": 4.07125567390718e-06, "epoch": 1.7775467775467777, "percentage": 35.55, "elapsed_time": "0:10:09", "remaining_time": "0:18:24", "throughput": 1127.63, "total_tokens": 687304} +{"current_steps": 1715, "total_steps": 4810, "loss": 0.2158, "lr": 4.064189573117512e-06, "epoch": 1.7827442827442828, "percentage": 35.65, "elapsed_time": "0:10:09", "remaining_time": "0:18:20", "throughput": 1130.18, "total_tokens": 689224} +{"current_steps": 1720, "total_steps": 4810, "loss": 0.1701, "lr": 4.057102877721621e-06, "epoch": 1.7879417879417878, "percentage": 35.76, "elapsed_time": "0:10:10", "remaining_time": "0:18:16", "throughput": 1133.14, "total_tokens": 691400} +{"current_steps": 1725, "total_steps": 4810, "loss": 0.1154, "lr": 4.049995681025143e-06, "epoch": 1.793139293139293, "percentage": 35.86, "elapsed_time": "0:10:10", "remaining_time": "0:18:11", "throughput": 1135.68, "total_tokens": 693320} +{"current_steps": 1730, "total_steps": 4810, "loss": 0.1654, "lr": 4.0428680766036386e-06, "epoch": 1.7983367983367984, "percentage": 35.97, "elapsed_time": "0:10:10", "remaining_time": "0:18:07", "throughput": 1138.53, "total_tokens": 695432} +{"current_steps": 1735, "total_steps": 4810, "loss": 0.2169, "lr": 4.035720158301363e-06, "epoch": 1.8035343035343034, "percentage": 36.07, "elapsed_time": "0:10:11", "remaining_time": "0:18:03", "throughput": 1141.38, "total_tokens": 697544} +{"current_steps": 1740, "total_steps": 4810, "loss": 0.1438, "lr": 4.028552020230031e-06, "epoch": 1.8087318087318087, "percentage": 36.17, "elapsed_time": "0:10:11", "remaining_time": "0:17:58", "throughput": 1144.11, "total_tokens": 699592} +{"current_steps": 1745, "total_steps": 4810, "loss": 0.2247, "lr": 4.021363756767577e-06, "epoch": 1.813929313929314, "percentage": 36.28, "elapsed_time": "0:10:11", "remaining_time": "0:17:54", "throughput": 1146.74, "total_tokens": 701576} +{"current_steps": 1750, "total_steps": 4810, "loss": 0.2586, "lr": 4.014155462556913e-06, "epoch": 1.819126819126819, "percentage": 36.38, "elapsed_time": "0:10:12", "remaining_time": "0:17:50", "throughput": 1149.58, "total_tokens": 703688} +{"current_steps": 1755, "total_steps": 4810, "loss": 0.2187, "lr": 4.006927232504682e-06, "epoch": 1.8243243243243243, "percentage": 36.49, "elapsed_time": "0:10:12", "remaining_time": "0:17:46", "throughput": 1152.31, "total_tokens": 705736} +{"current_steps": 1760, "total_steps": 4810, "loss": 0.043, "lr": 3.999679161780006e-06, "epoch": 1.8295218295218296, "percentage": 36.59, "elapsed_time": "0:10:12", "remaining_time": "0:17:41", "throughput": 1154.94, "total_tokens": 707720} +{"current_steps": 1765, "total_steps": 4810, "loss": 0.08, "lr": 3.99241134581324e-06, "epoch": 1.8347193347193347, "percentage": 36.69, "elapsed_time": "0:10:13", "remaining_time": "0:17:37", "throughput": 1157.88, "total_tokens": 709896} +{"current_steps": 1770, "total_steps": 4810, "loss": 0.1669, "lr": 3.985123880294708e-06, "epoch": 1.83991683991684, "percentage": 36.8, "elapsed_time": "0:10:13", "remaining_time": "0:17:33", "throughput": 1160.61, "total_tokens": 711944} +{"current_steps": 1775, "total_steps": 4810, "loss": 0.1912, "lr": 3.977816861173446e-06, "epoch": 1.8451143451143452, "percentage": 36.9, "elapsed_time": "0:10:13", "remaining_time": "0:17:29", "throughput": 1163.33, "total_tokens": 713992} +{"current_steps": 1780, "total_steps": 4810, "loss": 0.1846, "lr": 3.970490384655939e-06, "epoch": 1.8503118503118503, "percentage": 37.01, "elapsed_time": "0:10:14", "remaining_time": "0:17:25", "throughput": 1165.95, "total_tokens": 715976} +{"current_steps": 1785, "total_steps": 4810, "loss": 0.105, "lr": 3.963144547204856e-06, "epoch": 1.8555093555093554, "percentage": 37.11, "elapsed_time": "0:10:14", "remaining_time": "0:17:21", "throughput": 1168.66, "total_tokens": 718024} +{"current_steps": 1790, "total_steps": 4810, "loss": 0.2342, "lr": 3.955779445537776e-06, "epoch": 1.8607068607068609, "percentage": 37.21, "elapsed_time": "0:10:14", "remaining_time": "0:17:17", "throughput": 1171.38, "total_tokens": 720072} +{"current_steps": 1795, "total_steps": 4810, "loss": 0.2314, "lr": 3.948395176625918e-06, "epoch": 1.865904365904366, "percentage": 37.32, "elapsed_time": "0:10:15", "remaining_time": "0:17:13", "throughput": 1174.09, "total_tokens": 722120} +{"current_steps": 1800, "total_steps": 4810, "loss": 0.1187, "lr": 3.940991837692861e-06, "epoch": 1.871101871101871, "percentage": 37.42, "elapsed_time": "0:10:15", "remaining_time": "0:17:09", "throughput": 1176.8, "total_tokens": 724168} +{"current_steps": 1805, "total_steps": 4810, "loss": 0.1292, "lr": 3.933569526213268e-06, "epoch": 1.8762993762993763, "percentage": 37.53, "elapsed_time": "0:10:15", "remaining_time": "0:17:05", "throughput": 1179.61, "total_tokens": 726280} +{"current_steps": 1810, "total_steps": 4810, "loss": 0.0843, "lr": 3.926128339911599e-06, "epoch": 1.8814968814968815, "percentage": 37.63, "elapsed_time": "0:10:16", "remaining_time": "0:17:01", "throughput": 1182.21, "total_tokens": 728264} +{"current_steps": 1815, "total_steps": 4810, "loss": 0.1791, "lr": 3.918668376760827e-06, "epoch": 1.8866943866943866, "percentage": 37.73, "elapsed_time": "0:10:16", "remaining_time": "0:16:57", "throughput": 1184.91, "total_tokens": 730312} +{"current_steps": 1820, "total_steps": 4810, "loss": 0.1365, "lr": 3.9111897349811455e-06, "epoch": 1.8918918918918919, "percentage": 37.84, "elapsed_time": "0:10:16", "remaining_time": "0:16:53", "throughput": 1187.51, "total_tokens": 732296} +{"current_steps": 1825, "total_steps": 4810, "loss": 0.1369, "lr": 3.903692513038677e-06, "epoch": 1.8970893970893972, "percentage": 37.94, "elapsed_time": "0:10:16", "remaining_time": "0:16:49", "throughput": 1189.79, "total_tokens": 734088} +{"current_steps": 1830, "total_steps": 4810, "loss": 0.2305, "lr": 3.896176809644178e-06, "epoch": 1.9022869022869022, "percentage": 38.05, "elapsed_time": "0:10:17", "remaining_time": "0:16:45", "throughput": 1192.37, "total_tokens": 736072} +{"current_steps": 1835, "total_steps": 4810, "loss": 0.2062, "lr": 3.8886427237517345e-06, "epoch": 1.9074844074844075, "percentage": 38.15, "elapsed_time": "0:10:17", "remaining_time": "0:16:41", "throughput": 1195.06, "total_tokens": 738120} +{"current_steps": 1840, "total_steps": 4810, "loss": 0.2077, "lr": 3.881090354557463e-06, "epoch": 1.9126819126819128, "percentage": 38.25, "elapsed_time": "0:10:17", "remaining_time": "0:16:37", "throughput": 1197.75, "total_tokens": 740168} +{"current_steps": 1845, "total_steps": 4810, "loss": 0.1425, "lr": 3.8735198014982066e-06, "epoch": 1.9178794178794178, "percentage": 38.36, "elapsed_time": "0:10:18", "remaining_time": "0:16:33", "throughput": 1200.54, "total_tokens": 742280} +{"current_steps": 1850, "total_steps": 4810, "loss": 0.0702, "lr": 3.865931164250219e-06, "epoch": 1.9230769230769231, "percentage": 38.46, "elapsed_time": "0:10:18", "remaining_time": "0:16:29", "throughput": 1203.22, "total_tokens": 744328} +{"current_steps": 1855, "total_steps": 4810, "loss": 0.1732, "lr": 3.858324542727859e-06, "epoch": 1.9282744282744284, "percentage": 38.57, "elapsed_time": "0:10:18", "remaining_time": "0:16:25", "throughput": 1206.0, "total_tokens": 746440} +{"current_steps": 1860, "total_steps": 4810, "loss": 0.1543, "lr": 3.8507000370822675e-06, "epoch": 1.9334719334719335, "percentage": 38.67, "elapsed_time": "0:10:19", "remaining_time": "0:16:22", "throughput": 1208.68, "total_tokens": 748488} +{"current_steps": 1865, "total_steps": 4810, "loss": 0.1298, "lr": 3.84305774770006e-06, "epoch": 1.9386694386694385, "percentage": 38.77, "elapsed_time": "0:10:19", "remaining_time": "0:16:18", "throughput": 1211.04, "total_tokens": 750344} +{"current_steps": 1870, "total_steps": 4810, "loss": 0.0507, "lr": 3.835397775201991e-06, "epoch": 1.943866943866944, "percentage": 38.88, "elapsed_time": "0:10:19", "remaining_time": "0:16:14", "throughput": 1213.61, "total_tokens": 752328} +{"current_steps": 1875, "total_steps": 4810, "loss": 0.2625, "lr": 3.827720220441642e-06, "epoch": 1.949064449064449, "percentage": 38.98, "elapsed_time": "0:10:20", "remaining_time": "0:16:10", "throughput": 1216.17, "total_tokens": 754312} +{"current_steps": 1880, "total_steps": 4810, "loss": 0.4145, "lr": 3.820025184504085e-06, "epoch": 1.9542619542619541, "percentage": 39.09, "elapsed_time": "0:10:20", "remaining_time": "0:16:07", "throughput": 1218.63, "total_tokens": 756232} +{"current_steps": 1885, "total_steps": 4810, "loss": 0.2626, "lr": 3.812312768704557e-06, "epoch": 1.9594594594594594, "percentage": 39.19, "elapsed_time": "0:10:20", "remaining_time": "0:16:03", "throughput": 1221.29, "total_tokens": 758280} +{"current_steps": 1890, "total_steps": 4810, "loss": 0.1128, "lr": 3.80458307458712e-06, "epoch": 1.9646569646569647, "percentage": 39.29, "elapsed_time": "0:10:21", "remaining_time": "0:15:59", "throughput": 1223.95, "total_tokens": 760328} +{"current_steps": 1895, "total_steps": 4810, "loss": 0.1213, "lr": 3.7968362039233315e-06, "epoch": 1.9698544698544698, "percentage": 39.4, "elapsed_time": "0:10:21", "remaining_time": "0:15:56", "throughput": 1226.4, "total_tokens": 762248} +{"current_steps": 1900, "total_steps": 4810, "loss": 0.077, "lr": 3.7890722587108985e-06, "epoch": 1.975051975051975, "percentage": 39.5, "elapsed_time": "0:10:21", "remaining_time": "0:15:52", "throughput": 1228.84, "total_tokens": 764168} +{"current_steps": 1905, "total_steps": 4810, "loss": 0.0655, "lr": 3.7812913411723377e-06, "epoch": 1.9802494802494803, "percentage": 39.6, "elapsed_time": "0:10:22", "remaining_time": "0:15:48", "throughput": 1231.49, "total_tokens": 766216} +{"current_steps": 1910, "total_steps": 4810, "loss": 0.0962, "lr": 3.773493553753628e-06, "epoch": 1.9854469854469854, "percentage": 39.71, "elapsed_time": "0:10:22", "remaining_time": "0:15:45", "throughput": 1234.14, "total_tokens": 768264} +{"current_steps": 1915, "total_steps": 4810, "loss": 0.0219, "lr": 3.7656789991228638e-06, "epoch": 1.9906444906444907, "percentage": 39.81, "elapsed_time": "0:10:22", "remaining_time": "0:15:41", "throughput": 1236.58, "total_tokens": 770184} +{"current_steps": 1920, "total_steps": 4810, "loss": 0.1279, "lr": 3.7578477801689e-06, "epoch": 1.995841995841996, "percentage": 39.92, "elapsed_time": "0:10:23", "remaining_time": "0:15:37", "throughput": 1239.12, "total_tokens": 772168} +{"current_steps": 1925, "total_steps": 4810, "loss": 0.096, "lr": 3.7500000000000005e-06, "epoch": 2.001039501039501, "percentage": 40.02, "elapsed_time": "0:10:23", "remaining_time": "0:15:34", "throughput": 1241.5, "total_tokens": 774160} +{"current_steps": 1928, "total_steps": 4810, "eval_loss": 0.3561875522136688, "epoch": 2.004158004158004, "percentage": 40.08, "elapsed_time": "0:10:24", "remaining_time": "0:15:33", "throughput": 1240.92, "total_tokens": 775312} +{"current_steps": 1930, "total_steps": 4810, "loss": 0.0698, "lr": 3.7421357619424793e-06, "epoch": 2.006237006237006, "percentage": 40.12, "elapsed_time": "0:11:14", "remaining_time": "0:16:46", "throughput": 1150.84, "total_tokens": 776144} +{"current_steps": 1935, "total_steps": 4810, "loss": 0.0941, "lr": 3.7342551695393375e-06, "epoch": 2.0114345114345116, "percentage": 40.23, "elapsed_time": "0:11:14", "remaining_time": "0:16:42", "throughput": 1153.24, "total_tokens": 778128} +{"current_steps": 1940, "total_steps": 4810, "loss": 0.0863, "lr": 3.7263583265489077e-06, "epoch": 2.0166320166320166, "percentage": 40.33, "elapsed_time": "0:11:15", "remaining_time": "0:16:38", "throughput": 1155.72, "total_tokens": 780176} +{"current_steps": 1945, "total_steps": 4810, "loss": 0.0572, "lr": 3.718445336943478e-06, "epoch": 2.0218295218295217, "percentage": 40.44, "elapsed_time": "0:11:15", "remaining_time": "0:16:34", "throughput": 1158.11, "total_tokens": 782160} +{"current_steps": 1950, "total_steps": 4810, "loss": 0.0675, "lr": 3.7105163049079305e-06, "epoch": 2.027027027027027, "percentage": 40.54, "elapsed_time": "0:11:15", "remaining_time": "0:16:31", "throughput": 1160.59, "total_tokens": 784208} +{"current_steps": 1955, "total_steps": 4810, "loss": 0.0002, "lr": 3.702571334838365e-06, "epoch": 2.0322245322245323, "percentage": 40.64, "elapsed_time": "0:11:16", "remaining_time": "0:16:27", "throughput": 1163.07, "total_tokens": 786256} +{"current_steps": 1960, "total_steps": 4810, "loss": 0.1288, "lr": 3.6946105313407287e-06, "epoch": 2.0374220374220373, "percentage": 40.75, "elapsed_time": "0:11:16", "remaining_time": "0:16:23", "throughput": 1165.46, "total_tokens": 788240} +{"current_steps": 1965, "total_steps": 4810, "loss": 0.1179, "lr": 3.6866339992294347e-06, "epoch": 2.042619542619543, "percentage": 40.85, "elapsed_time": "0:11:16", "remaining_time": "0:16:19", "throughput": 1167.93, "total_tokens": 790288} +{"current_steps": 1970, "total_steps": 4810, "loss": 0.0768, "lr": 3.678641843525986e-06, "epoch": 2.047817047817048, "percentage": 40.96, "elapsed_time": "0:11:16", "remaining_time": "0:16:15", "throughput": 1170.31, "total_tokens": 792272} +{"current_steps": 1975, "total_steps": 4810, "loss": 0.0297, "lr": 3.670634169457587e-06, "epoch": 2.053014553014553, "percentage": 41.06, "elapsed_time": "0:11:17", "remaining_time": "0:16:12", "throughput": 1172.88, "total_tokens": 794384} +{"current_steps": 1980, "total_steps": 4810, "loss": 0.1305, "lr": 3.662611082455766e-06, "epoch": 2.0582120582120584, "percentage": 41.16, "elapsed_time": "0:11:17", "remaining_time": "0:16:08", "throughput": 1175.25, "total_tokens": 796368} +{"current_steps": 1985, "total_steps": 4810, "loss": 0.0029, "lr": 3.6545726881549792e-06, "epoch": 2.0634095634095635, "percentage": 41.27, "elapsed_time": "0:11:17", "remaining_time": "0:16:04", "throughput": 1177.81, "total_tokens": 798480} +{"current_steps": 1990, "total_steps": 4810, "loss": 0.0937, "lr": 3.6465190923912275e-06, "epoch": 2.0686070686070686, "percentage": 41.37, "elapsed_time": "0:11:18", "remaining_time": "0:16:01", "throughput": 1180.28, "total_tokens": 800528} +{"current_steps": 1995, "total_steps": 4810, "loss": 0.1904, "lr": 3.6384504012006544e-06, "epoch": 2.0738045738045736, "percentage": 41.48, "elapsed_time": "0:11:18", "remaining_time": "0:15:57", "throughput": 1183.02, "total_tokens": 802768} +{"current_steps": 2000, "total_steps": 4810, "loss": 0.1647, "lr": 3.6303667208181576e-06, "epoch": 2.079002079002079, "percentage": 41.58, "elapsed_time": "0:11:18", "remaining_time": "0:15:53", "throughput": 1185.38, "total_tokens": 804752} +{"current_steps": 2005, "total_steps": 4810, "loss": 0.0649, "lr": 3.622268157675986e-06, "epoch": 2.084199584199584, "percentage": 41.68, "elapsed_time": "0:11:19", "remaining_time": "0:15:50", "throughput": 1187.65, "total_tokens": 806672} +{"current_steps": 2010, "total_steps": 4810, "loss": 0.0186, "lr": 3.614154818402339e-06, "epoch": 2.0893970893970892, "percentage": 41.79, "elapsed_time": "0:11:19", "remaining_time": "0:15:46", "throughput": 1190.01, "total_tokens": 808656} +{"current_steps": 2015, "total_steps": 4810, "loss": 0.0494, "lr": 3.6060268098199656e-06, "epoch": 2.0945945945945947, "percentage": 41.89, "elapsed_time": "0:11:19", "remaining_time": "0:15:43", "throughput": 1192.37, "total_tokens": 810640} +{"current_steps": 2020, "total_steps": 4810, "loss": 0.0657, "lr": 3.5978842389447523e-06, "epoch": 2.0997920997921, "percentage": 42.0, "elapsed_time": "0:11:20", "remaining_time": "0:15:39", "throughput": 1194.82, "total_tokens": 812688} +{"current_steps": 2025, "total_steps": 4810, "loss": 0.0206, "lr": 3.5897272129843198e-06, "epoch": 2.104989604989605, "percentage": 42.1, "elapsed_time": "0:11:20", "remaining_time": "0:15:35", "throughput": 1197.36, "total_tokens": 814800} +{"current_steps": 2030, "total_steps": 4810, "loss": 0.0252, "lr": 3.5815558393366064e-06, "epoch": 2.1101871101871104, "percentage": 42.2, "elapsed_time": "0:11:20", "remaining_time": "0:15:32", "throughput": 1199.9, "total_tokens": 816912} +{"current_steps": 2035, "total_steps": 4810, "loss": 0.1156, "lr": 3.57337022558846e-06, "epoch": 2.1153846153846154, "percentage": 42.31, "elapsed_time": "0:11:21", "remaining_time": "0:15:28", "throughput": 1202.24, "total_tokens": 818896} +{"current_steps": 2040, "total_steps": 4810, "loss": 0.0855, "lr": 3.5651704795142137e-06, "epoch": 2.1205821205821205, "percentage": 42.41, "elapsed_time": "0:11:21", "remaining_time": "0:15:25", "throughput": 1204.59, "total_tokens": 820880} +{"current_steps": 2045, "total_steps": 4810, "loss": 0.1594, "lr": 3.5569567090742763e-06, "epoch": 2.125779625779626, "percentage": 42.52, "elapsed_time": "0:11:21", "remaining_time": "0:15:21", "throughput": 1206.94, "total_tokens": 822864} +{"current_steps": 2050, "total_steps": 4810, "loss": 0.0265, "lr": 3.548729022413701e-06, "epoch": 2.130977130977131, "percentage": 42.62, "elapsed_time": "0:11:22", "remaining_time": "0:15:18", "throughput": 1209.56, "total_tokens": 825040} +{"current_steps": 2055, "total_steps": 4810, "loss": 0.0995, "lr": 3.5404875278607693e-06, "epoch": 2.136174636174636, "percentage": 42.72, "elapsed_time": "0:11:22", "remaining_time": "0:15:14", "throughput": 1211.9, "total_tokens": 827024} +{"current_steps": 2060, "total_steps": 4810, "loss": 0.072, "lr": 3.5322323339255602e-06, "epoch": 2.141372141372141, "percentage": 42.83, "elapsed_time": "0:11:22", "remaining_time": "0:15:11", "throughput": 1214.42, "total_tokens": 829136} +{"current_steps": 2065, "total_steps": 4810, "loss": 0.0483, "lr": 3.5239635492985248e-06, "epoch": 2.1465696465696467, "percentage": 42.93, "elapsed_time": "0:11:23", "remaining_time": "0:15:07", "throughput": 1216.85, "total_tokens": 831184} +{"current_steps": 2070, "total_steps": 4810, "loss": 0.0007, "lr": 3.5156812828490507e-06, "epoch": 2.1517671517671517, "percentage": 43.04, "elapsed_time": "0:11:23", "remaining_time": "0:15:04", "throughput": 1219.18, "total_tokens": 833168} +{"current_steps": 2075, "total_steps": 4810, "loss": 0.0685, "lr": 3.5073856436240335e-06, "epoch": 2.156964656964657, "percentage": 43.14, "elapsed_time": "0:11:23", "remaining_time": "0:15:01", "throughput": 1221.61, "total_tokens": 835216} +{"current_steps": 2080, "total_steps": 4810, "loss": 0.0004, "lr": 3.4990767408464383e-06, "epoch": 2.1621621621621623, "percentage": 43.24, "elapsed_time": "0:11:24", "remaining_time": "0:14:57", "throughput": 1223.84, "total_tokens": 837136} +{"current_steps": 2085, "total_steps": 4810, "loss": 0.1832, "lr": 3.4907546839138627e-06, "epoch": 2.1673596673596673, "percentage": 43.35, "elapsed_time": "0:11:24", "remaining_time": "0:14:54", "throughput": 1226.17, "total_tokens": 839120} +{"current_steps": 2090, "total_steps": 4810, "loss": 0.0608, "lr": 3.4824195823970954e-06, "epoch": 2.1725571725571724, "percentage": 43.45, "elapsed_time": "0:11:24", "remaining_time": "0:14:51", "throughput": 1228.49, "total_tokens": 841104} +{"current_steps": 2095, "total_steps": 4810, "loss": 0.0894, "lr": 3.4740715460386732e-06, "epoch": 2.177754677754678, "percentage": 43.56, "elapsed_time": "0:11:24", "remaining_time": "0:14:47", "throughput": 1230.91, "total_tokens": 843152} +{"current_steps": 2100, "total_steps": 4810, "loss": 0.0972, "lr": 3.46571068475144e-06, "epoch": 2.182952182952183, "percentage": 43.66, "elapsed_time": "0:11:25", "remaining_time": "0:14:44", "throughput": 1233.22, "total_tokens": 845136} +{"current_steps": 2105, "total_steps": 4810, "loss": 0.1887, "lr": 3.457337108617094e-06, "epoch": 2.188149688149688, "percentage": 43.76, "elapsed_time": "0:11:25", "remaining_time": "0:14:41", "throughput": 1235.47, "total_tokens": 847120} +{"current_steps": 2110, "total_steps": 4810, "loss": 0.2052, "lr": 3.4489509278847415e-06, "epoch": 2.1933471933471935, "percentage": 43.87, "elapsed_time": "0:11:25", "remaining_time": "0:14:37", "throughput": 1237.87, "total_tokens": 849168} +{"current_steps": 2115, "total_steps": 4810, "loss": 0.0731, "lr": 3.440552252969446e-06, "epoch": 2.1985446985446986, "percentage": 43.97, "elapsed_time": "0:11:26", "remaining_time": "0:14:34", "throughput": 1240.18, "total_tokens": 851152} +{"current_steps": 2120, "total_steps": 4810, "loss": 0.0078, "lr": 3.432141194450772e-06, "epoch": 2.2037422037422036, "percentage": 44.07, "elapsed_time": "0:11:26", "remaining_time": "0:14:31", "throughput": 1242.29, "total_tokens": 853008} +{"current_steps": 2125, "total_steps": 4810, "loss": 0.0651, "lr": 3.4237178630713312e-06, "epoch": 2.208939708939709, "percentage": 44.18, "elapsed_time": "0:11:26", "remaining_time": "0:14:27", "throughput": 1244.78, "total_tokens": 855120} +{"current_steps": 2130, "total_steps": 4810, "loss": 0.1599, "lr": 3.4152823697353237e-06, "epoch": 2.214137214137214, "percentage": 44.28, "elapsed_time": "0:11:27", "remaining_time": "0:14:24", "throughput": 1247.27, "total_tokens": 857232} +{"current_steps": 2135, "total_steps": 4810, "loss": 0.057, "lr": 3.4068348255070764e-06, "epoch": 2.2193347193347193, "percentage": 44.39, "elapsed_time": "0:11:27", "remaining_time": "0:14:21", "throughput": 1249.75, "total_tokens": 859344} +{"current_steps": 2140, "total_steps": 4810, "loss": 0.0868, "lr": 3.3983753416095844e-06, "epoch": 2.2245322245322248, "percentage": 44.49, "elapsed_time": "0:11:27", "remaining_time": "0:14:18", "throughput": 1252.05, "total_tokens": 861328} +{"current_steps": 2145, "total_steps": 4810, "loss": 0.2098, "lr": 3.3899040294230413e-06, "epoch": 2.22972972972973, "percentage": 44.59, "elapsed_time": "0:11:28", "remaining_time": "0:14:15", "throughput": 1254.45, "total_tokens": 863376} +{"current_steps": 2150, "total_steps": 4810, "loss": 0.0096, "lr": 3.381421000483378e-06, "epoch": 2.234927234927235, "percentage": 44.7, "elapsed_time": "0:11:28", "remaining_time": "0:14:11", "throughput": 1256.84, "total_tokens": 865424} +{"current_steps": 2155, "total_steps": 4810, "loss": 0.0981, "lr": 3.37292636648079e-06, "epoch": 2.24012474012474, "percentage": 44.8, "elapsed_time": "0:11:28", "remaining_time": "0:14:08", "throughput": 1259.22, "total_tokens": 867472} +{"current_steps": 2160, "total_steps": 4810, "loss": 0.1542, "lr": 3.3644202392582703e-06, "epoch": 2.2453222453222454, "percentage": 44.91, "elapsed_time": "0:11:29", "remaining_time": "0:14:05", "throughput": 1261.7, "total_tokens": 869584} +{"current_steps": 2165, "total_steps": 4810, "loss": 0.0094, "lr": 3.3559027308101344e-06, "epoch": 2.2505197505197505, "percentage": 45.01, "elapsed_time": "0:11:29", "remaining_time": "0:14:02", "throughput": 1263.99, "total_tokens": 871568} +{"current_steps": 2169, "total_steps": 4810, "eval_loss": 0.30345332622528076, "epoch": 2.2546777546777546, "percentage": 45.09, "elapsed_time": "0:11:30", "remaining_time": "0:14:01", "throughput": 1263.84, "total_tokens": 873104} +{"current_steps": 2170, "total_steps": 4810, "loss": 0.0945, "lr": 3.3473739532805464e-06, "epoch": 2.2557172557172556, "percentage": 45.11, "elapsed_time": "0:12:17", "remaining_time": "0:14:57", "throughput": 1184.44, "total_tokens": 873488} +{"current_steps": 2175, "total_steps": 4810, "loss": 0.1038, "lr": 3.3388340189620427e-06, "epoch": 2.260914760914761, "percentage": 45.22, "elapsed_time": "0:12:17", "remaining_time": "0:14:53", "throughput": 1186.61, "total_tokens": 875472} +{"current_steps": 2180, "total_steps": 4810, "loss": 0.0275, "lr": 3.3302830402940534e-06, "epoch": 2.266112266112266, "percentage": 45.32, "elapsed_time": "0:12:18", "remaining_time": "0:14:50", "throughput": 1188.7, "total_tokens": 877392} +{"current_steps": 2185, "total_steps": 4810, "loss": 0.1037, "lr": 3.3217211298614225e-06, "epoch": 2.271309771309771, "percentage": 45.43, "elapsed_time": "0:12:18", "remaining_time": "0:14:47", "throughput": 1191.04, "total_tokens": 879504} +{"current_steps": 2190, "total_steps": 4810, "loss": 0.0551, "lr": 3.313148400392925e-06, "epoch": 2.2765072765072767, "percentage": 45.53, "elapsed_time": "0:12:18", "remaining_time": "0:14:43", "throughput": 1193.04, "total_tokens": 881360} +{"current_steps": 2195, "total_steps": 4810, "loss": 0.094, "lr": 3.3045649647597814e-06, "epoch": 2.2817047817047817, "percentage": 45.63, "elapsed_time": "0:12:19", "remaining_time": "0:14:40", "throughput": 1195.12, "total_tokens": 883280} +{"current_steps": 2200, "total_steps": 4810, "loss": 0.0053, "lr": 3.2959709359741743e-06, "epoch": 2.286902286902287, "percentage": 45.74, "elapsed_time": "0:12:19", "remaining_time": "0:14:37", "throughput": 1197.37, "total_tokens": 885328} +{"current_steps": 2205, "total_steps": 4810, "loss": 0.0732, "lr": 3.2873664271877588e-06, "epoch": 2.2920997920997923, "percentage": 45.84, "elapsed_time": "0:12:19", "remaining_time": "0:14:33", "throughput": 1199.53, "total_tokens": 887312} +{"current_steps": 2210, "total_steps": 4810, "loss": 0.0574, "lr": 3.2787515516901717e-06, "epoch": 2.2972972972972974, "percentage": 45.95, "elapsed_time": "0:12:20", "remaining_time": "0:14:30", "throughput": 1201.69, "total_tokens": 889296} +{"current_steps": 2215, "total_steps": 4810, "loss": 0.0007, "lr": 3.2701264229075443e-06, "epoch": 2.3024948024948024, "percentage": 46.05, "elapsed_time": "0:12:20", "remaining_time": "0:14:27", "throughput": 1204.02, "total_tokens": 891408} +{"current_steps": 2220, "total_steps": 4810, "loss": 0.001, "lr": 3.261491154401001e-06, "epoch": 2.3076923076923075, "percentage": 46.15, "elapsed_time": "0:12:20", "remaining_time": "0:14:24", "throughput": 1206.18, "total_tokens": 893392} +{"current_steps": 2225, "total_steps": 4810, "loss": 0.0047, "lr": 3.2528458598651735e-06, "epoch": 2.312889812889813, "percentage": 46.26, "elapsed_time": "0:12:21", "remaining_time": "0:14:20", "throughput": 1208.42, "total_tokens": 895440} +{"current_steps": 2230, "total_steps": 4810, "loss": 0.1493, "lr": 3.2441906531266963e-06, "epoch": 2.318087318087318, "percentage": 46.36, "elapsed_time": "0:12:21", "remaining_time": "0:14:17", "throughput": 1210.83, "total_tokens": 897616} +{"current_steps": 2235, "total_steps": 4810, "loss": 0.0359, "lr": 3.2355256481427145e-06, "epoch": 2.323284823284823, "percentage": 46.47, "elapsed_time": "0:12:21", "remaining_time": "0:14:14", "throughput": 1212.89, "total_tokens": 899536} +{"current_steps": 2240, "total_steps": 4810, "loss": 0.0408, "lr": 3.2268509589993745e-06, "epoch": 2.3284823284823286, "percentage": 46.57, "elapsed_time": "0:12:21", "remaining_time": "0:14:11", "throughput": 1215.21, "total_tokens": 901648} +{"current_steps": 2245, "total_steps": 4810, "loss": 0.1054, "lr": 3.218166699910332e-06, "epoch": 2.3336798336798337, "percentage": 46.67, "elapsed_time": "0:12:22", "remaining_time": "0:14:08", "throughput": 1217.45, "total_tokens": 903696} +{"current_steps": 2250, "total_steps": 4810, "loss": 0.1455, "lr": 3.209472985215243e-06, "epoch": 2.3388773388773387, "percentage": 46.78, "elapsed_time": "0:12:22", "remaining_time": "0:14:04", "throughput": 1219.43, "total_tokens": 905552} +{"current_steps": 2255, "total_steps": 4810, "loss": 0.0118, "lr": 3.2007699293782557e-06, "epoch": 2.3440748440748442, "percentage": 46.88, "elapsed_time": "0:12:22", "remaining_time": "0:14:01", "throughput": 1221.49, "total_tokens": 907472} +{"current_steps": 2260, "total_steps": 4810, "loss": 0.1043, "lr": 3.1920576469865115e-06, "epoch": 2.3492723492723493, "percentage": 46.99, "elapsed_time": "0:12:23", "remaining_time": "0:13:58", "throughput": 1223.8, "total_tokens": 909584} +{"current_steps": 2265, "total_steps": 4810, "loss": 0.0544, "lr": 3.183336252748627e-06, "epoch": 2.3544698544698544, "percentage": 47.09, "elapsed_time": "0:12:23", "remaining_time": "0:13:55", "throughput": 1226.03, "total_tokens": 911632} +{"current_steps": 2270, "total_steps": 4810, "loss": 0.0396, "lr": 3.1746058614931918e-06, "epoch": 2.35966735966736, "percentage": 47.19, "elapsed_time": "0:12:23", "remaining_time": "0:13:52", "throughput": 1228.17, "total_tokens": 913616} +{"current_steps": 2275, "total_steps": 4810, "loss": 0.0559, "lr": 3.16586658816725e-06, "epoch": 2.364864864864865, "percentage": 47.3, "elapsed_time": "0:12:24", "remaining_time": "0:13:49", "throughput": 1230.47, "total_tokens": 915728} +{"current_steps": 2280, "total_steps": 4810, "loss": 0.1154, "lr": 3.157118547834793e-06, "epoch": 2.37006237006237, "percentage": 47.4, "elapsed_time": "0:12:24", "remaining_time": "0:13:46", "throughput": 1232.7, "total_tokens": 917776} +{"current_steps": 2285, "total_steps": 4810, "loss": 0.1803, "lr": 3.1483618556752373e-06, "epoch": 2.375259875259875, "percentage": 47.51, "elapsed_time": "0:12:24", "remaining_time": "0:13:43", "throughput": 1235.08, "total_tokens": 919952} +{"current_steps": 2290, "total_steps": 4810, "loss": 0.0648, "lr": 3.139596626981916e-06, "epoch": 2.3804573804573805, "percentage": 47.61, "elapsed_time": "0:12:25", "remaining_time": "0:13:40", "throughput": 1237.12, "total_tokens": 921872} +{"current_steps": 2295, "total_steps": 4810, "loss": 0.1079, "lr": 3.1308229771605546e-06, "epoch": 2.3856548856548856, "percentage": 47.71, "elapsed_time": "0:12:25", "remaining_time": "0:13:36", "throughput": 1239.22, "total_tokens": 923856} +{"current_steps": 2300, "total_steps": 4810, "loss": 0.1516, "lr": 3.1220410217277546e-06, "epoch": 2.390852390852391, "percentage": 47.82, "elapsed_time": "0:12:25", "remaining_time": "0:13:33", "throughput": 1241.51, "total_tokens": 925968} +{"current_steps": 2305, "total_steps": 4810, "loss": 0.0496, "lr": 3.1132508763094715e-06, "epoch": 2.396049896049896, "percentage": 47.92, "elapsed_time": "0:12:26", "remaining_time": "0:13:30", "throughput": 1243.55, "total_tokens": 927888} +{"current_steps": 2310, "total_steps": 4810, "loss": 0.0691, "lr": 3.1044526566394924e-06, "epoch": 2.401247401247401, "percentage": 48.02, "elapsed_time": "0:12:26", "remaining_time": "0:13:27", "throughput": 1245.57, "total_tokens": 929808} +{"current_steps": 2315, "total_steps": 4810, "loss": 0.0009, "lr": 3.0956464785579125e-06, "epoch": 2.4064449064449063, "percentage": 48.13, "elapsed_time": "0:12:26", "remaining_time": "0:13:24", "throughput": 1247.61, "total_tokens": 931728} +{"current_steps": 2320, "total_steps": 4810, "loss": 0.0309, "lr": 3.0868324580096113e-06, "epoch": 2.4116424116424118, "percentage": 48.23, "elapsed_time": "0:12:27", "remaining_time": "0:13:21", "throughput": 1249.9, "total_tokens": 933840} +{"current_steps": 2325, "total_steps": 4810, "loss": 0.1115, "lr": 3.078010711042723e-06, "epoch": 2.416839916839917, "percentage": 48.34, "elapsed_time": "0:12:27", "remaining_time": "0:13:18", "throughput": 1252.02, "total_tokens": 935824} +{"current_steps": 2330, "total_steps": 4810, "loss": 0.043, "lr": 3.069181353807111e-06, "epoch": 2.422037422037422, "percentage": 48.44, "elapsed_time": "0:12:27", "remaining_time": "0:13:15", "throughput": 1254.22, "total_tokens": 937872} +{"current_steps": 2335, "total_steps": 4810, "loss": 0.098, "lr": 3.0603445025528377e-06, "epoch": 2.4272349272349274, "percentage": 48.54, "elapsed_time": "0:12:28", "remaining_time": "0:13:12", "throughput": 1256.5, "total_tokens": 939984} +{"current_steps": 2340, "total_steps": 4810, "loss": 0.0482, "lr": 3.051500273628633e-06, "epoch": 2.4324324324324325, "percentage": 48.65, "elapsed_time": "0:12:28", "remaining_time": "0:13:09", "throughput": 1258.61, "total_tokens": 941968} +{"current_steps": 2345, "total_steps": 4810, "loss": 0.0265, "lr": 3.042648783480366e-06, "epoch": 2.4376299376299375, "percentage": 48.75, "elapsed_time": "0:12:28", "remaining_time": "0:13:07", "throughput": 1260.72, "total_tokens": 943952} +{"current_steps": 2350, "total_steps": 4810, "loss": 0.0727, "lr": 3.0337901486495073e-06, "epoch": 2.442827442827443, "percentage": 48.86, "elapsed_time": "0:12:29", "remaining_time": "0:13:04", "throughput": 1262.74, "total_tokens": 945872} +{"current_steps": 2355, "total_steps": 4810, "loss": 0.1045, "lr": 3.0249244857715977e-06, "epoch": 2.448024948024948, "percentage": 48.96, "elapsed_time": "0:12:29", "remaining_time": "0:13:01", "throughput": 1264.85, "total_tokens": 947856} +{"current_steps": 2360, "total_steps": 4810, "loss": 0.0835, "lr": 3.01605191157471e-06, "epoch": 2.453222453222453, "percentage": 49.06, "elapsed_time": "0:12:29", "remaining_time": "0:12:58", "throughput": 1266.95, "total_tokens": 949840} +{"current_steps": 2365, "total_steps": 4810, "loss": 0.0307, "lr": 3.0071725428779152e-06, "epoch": 2.4584199584199586, "percentage": 49.17, "elapsed_time": "0:12:30", "remaining_time": "0:12:55", "throughput": 1268.97, "total_tokens": 951760} +{"current_steps": 2370, "total_steps": 4810, "loss": 0.0294, "lr": 2.9982864965897423e-06, "epoch": 2.4636174636174637, "percentage": 49.27, "elapsed_time": "0:12:30", "remaining_time": "0:12:52", "throughput": 1270.99, "total_tokens": 953680} +{"current_steps": 2375, "total_steps": 4810, "loss": 0.0349, "lr": 2.9893938897066392e-06, "epoch": 2.4688149688149688, "percentage": 49.38, "elapsed_time": "0:12:30", "remaining_time": "0:12:49", "throughput": 1273.0, "total_tokens": 955600} +{"current_steps": 2380, "total_steps": 4810, "loss": 0.2071, "lr": 2.9804948393114325e-06, "epoch": 2.474012474012474, "percentage": 49.48, "elapsed_time": "0:12:30", "remaining_time": "0:12:46", "throughput": 1274.93, "total_tokens": 957456} +{"current_steps": 2385, "total_steps": 4810, "loss": 0.0055, "lr": 2.9715894625717868e-06, "epoch": 2.4792099792099793, "percentage": 49.58, "elapsed_time": "0:12:31", "remaining_time": "0:12:43", "throughput": 1277.1, "total_tokens": 959504} +{"current_steps": 2390, "total_steps": 4810, "loss": 0.0277, "lr": 2.9626778767386604e-06, "epoch": 2.4844074844074844, "percentage": 49.69, "elapsed_time": "0:12:31", "remaining_time": "0:12:41", "throughput": 1279.18, "total_tokens": 961488} +{"current_steps": 2395, "total_steps": 4810, "loss": 0.1288, "lr": 2.953760199144764e-06, "epoch": 2.4896049896049894, "percentage": 49.79, "elapsed_time": "0:12:31", "remaining_time": "0:12:38", "throughput": 1281.19, "total_tokens": 963408} +{"current_steps": 2400, "total_steps": 4810, "loss": 0.0595, "lr": 2.9448365472030116e-06, "epoch": 2.494802494802495, "percentage": 49.9, "elapsed_time": "0:12:32", "remaining_time": "0:12:35", "throughput": 1283.27, "total_tokens": 965392} +{"current_steps": 2405, "total_steps": 4810, "loss": 0.0664, "lr": 2.935907038404981e-06, "epoch": 2.5, "percentage": 50.0, "elapsed_time": "0:12:32", "remaining_time": "0:12:32", "throughput": 1285.44, "total_tokens": 967440} +{"current_steps": 2410, "total_steps": 4810, "loss": 0.0894, "lr": 2.9269717903193603e-06, "epoch": 2.505197505197505, "percentage": 50.1, "elapsed_time": "0:12:32", "remaining_time": "0:12:29", "throughput": 1287.43, "total_tokens": 969360} +{"current_steps": 2410, "total_steps": 4810, "eval_loss": 0.3648892641067505, "epoch": 2.505197505197505, "percentage": 50.1, "elapsed_time": "0:12:34", "remaining_time": "0:12:30", "throughput": 1285.59, "total_tokens": 969360} +{"current_steps": 2415, "total_steps": 4810, "loss": 0.0082, "lr": 2.918030920590403e-06, "epoch": 2.51039501039501, "percentage": 50.21, "elapsed_time": "0:13:11", "remaining_time": "0:13:05", "throughput": 1226.85, "total_tokens": 971472} +{"current_steps": 2420, "total_steps": 4810, "loss": 0.0006, "lr": 2.9090845469363804e-06, "epoch": 2.5155925155925156, "percentage": 50.31, "elapsed_time": "0:13:12", "remaining_time": "0:13:02", "throughput": 1228.86, "total_tokens": 973456} +{"current_steps": 2425, "total_steps": 4810, "loss": 0.0004, "lr": 2.9001327871480296e-06, "epoch": 2.5207900207900207, "percentage": 50.42, "elapsed_time": "0:13:12", "remaining_time": "0:12:59", "throughput": 1230.94, "total_tokens": 975504} +{"current_steps": 2430, "total_steps": 4810, "loss": 0.0019, "lr": 2.8911757590870028e-06, "epoch": 2.525987525987526, "percentage": 50.52, "elapsed_time": "0:13:12", "remaining_time": "0:12:56", "throughput": 1233.03, "total_tokens": 977552} +{"current_steps": 2435, "total_steps": 4810, "loss": 0.1355, "lr": 2.8822135806843156e-06, "epoch": 2.5311850311850312, "percentage": 50.62, "elapsed_time": "0:13:13", "remaining_time": "0:12:53", "throughput": 1235.03, "total_tokens": 979536} +{"current_steps": 2440, "total_steps": 4810, "loss": 0.084, "lr": 2.873246369938797e-06, "epoch": 2.5363825363825363, "percentage": 50.73, "elapsed_time": "0:13:13", "remaining_time": "0:12:50", "throughput": 1237.11, "total_tokens": 981584} +{"current_steps": 2445, "total_steps": 4810, "loss": 0.0365, "lr": 2.8642742449155287e-06, "epoch": 2.5415800415800414, "percentage": 50.83, "elapsed_time": "0:13:13", "remaining_time": "0:12:47", "throughput": 1239.19, "total_tokens": 983632} +{"current_steps": 2450, "total_steps": 4810, "loss": 0.1776, "lr": 2.855297323744301e-06, "epoch": 2.546777546777547, "percentage": 50.94, "elapsed_time": "0:13:14", "remaining_time": "0:12:44", "throughput": 1241.27, "total_tokens": 985680} +{"current_steps": 2455, "total_steps": 4810, "loss": 0.0731, "lr": 2.8463157246180465e-06, "epoch": 2.551975051975052, "percentage": 51.04, "elapsed_time": "0:13:14", "remaining_time": "0:12:42", "throughput": 1243.26, "total_tokens": 987664} +{"current_steps": 2460, "total_steps": 4810, "loss": 0.0002, "lr": 2.8373295657912947e-06, "epoch": 2.5571725571725574, "percentage": 51.14, "elapsed_time": "0:13:14", "remaining_time": "0:12:39", "throughput": 1245.26, "total_tokens": 989648} +{"current_steps": 2465, "total_steps": 4810, "loss": 0.0005, "lr": 2.828338965578603e-06, "epoch": 2.5623700623700625, "percentage": 51.25, "elapsed_time": "0:13:15", "remaining_time": "0:12:36", "throughput": 1247.33, "total_tokens": 991696} +{"current_steps": 2470, "total_steps": 4810, "loss": 0.2142, "lr": 2.8193440423530117e-06, "epoch": 2.5675675675675675, "percentage": 51.35, "elapsed_time": "0:13:15", "remaining_time": "0:12:33", "throughput": 1249.24, "total_tokens": 993616} +{"current_steps": 2475, "total_steps": 4810, "loss": 0.0503, "lr": 2.810344914544475e-06, "epoch": 2.5727650727650726, "percentage": 51.46, "elapsed_time": "0:13:15", "remaining_time": "0:12:30", "throughput": 1251.3, "total_tokens": 995664} +{"current_steps": 2480, "total_steps": 4810, "loss": 0.1017, "lr": 2.8013417006383078e-06, "epoch": 2.577962577962578, "percentage": 51.56, "elapsed_time": "0:13:16", "remaining_time": "0:12:27", "throughput": 1253.29, "total_tokens": 997648} +{"current_steps": 2485, "total_steps": 4810, "loss": 0.0796, "lr": 2.792334519173624e-06, "epoch": 2.583160083160083, "percentage": 51.66, "elapsed_time": "0:13:16", "remaining_time": "0:12:25", "throughput": 1255.35, "total_tokens": 999696} +{"current_steps": 2490, "total_steps": 4810, "loss": 0.1002, "lr": 2.7833234887417745e-06, "epoch": 2.5883575883575882, "percentage": 51.77, "elapsed_time": "0:13:16", "remaining_time": "0:12:22", "throughput": 1257.33, "total_tokens": 1001680} +{"current_steps": 2495, "total_steps": 4810, "loss": 0.0836, "lr": 2.774308727984787e-06, "epoch": 2.5935550935550937, "percentage": 51.87, "elapsed_time": "0:13:16", "remaining_time": "0:12:19", "throughput": 1259.39, "total_tokens": 1003728} +{"current_steps": 2500, "total_steps": 4810, "loss": 0.0495, "lr": 2.7652903555938047e-06, "epoch": 2.598752598752599, "percentage": 51.98, "elapsed_time": "0:13:17", "remaining_time": "0:12:16", "throughput": 1261.21, "total_tokens": 1005584} +{"current_steps": 2505, "total_steps": 4810, "loss": 0.061, "lr": 2.756268490307524e-06, "epoch": 2.603950103950104, "percentage": 52.08, "elapsed_time": "0:13:17", "remaining_time": "0:12:13", "throughput": 1263.34, "total_tokens": 1007696} +{"current_steps": 2510, "total_steps": 4810, "loss": 0.2945, "lr": 2.747243250910625e-06, "epoch": 2.609147609147609, "percentage": 52.18, "elapsed_time": "0:13:17", "remaining_time": "0:12:11", "throughput": 1265.32, "total_tokens": 1009680} +{"current_steps": 2515, "total_steps": 4810, "loss": 0.0414, "lr": 2.7382147562322175e-06, "epoch": 2.6143451143451144, "percentage": 52.29, "elapsed_time": "0:13:18", "remaining_time": "0:12:08", "throughput": 1267.38, "total_tokens": 1011728} +{"current_steps": 2520, "total_steps": 4810, "loss": 0.0023, "lr": 2.729183125144269e-06, "epoch": 2.6195426195426195, "percentage": 52.39, "elapsed_time": "0:13:18", "remaining_time": "0:12:05", "throughput": 1269.51, "total_tokens": 1013840} +{"current_steps": 2525, "total_steps": 4810, "loss": 0.1403, "lr": 2.7201484765600426e-06, "epoch": 2.624740124740125, "percentage": 52.49, "elapsed_time": "0:13:18", "remaining_time": "0:12:02", "throughput": 1271.48, "total_tokens": 1015824} +{"current_steps": 2530, "total_steps": 4810, "loss": 0.1792, "lr": 2.71111092943253e-06, "epoch": 2.62993762993763, "percentage": 52.6, "elapsed_time": "0:13:19", "remaining_time": "0:12:00", "throughput": 1273.38, "total_tokens": 1017744} +{"current_steps": 2535, "total_steps": 4810, "loss": 0.0616, "lr": 2.702070602752887e-06, "epoch": 2.635135135135135, "percentage": 52.7, "elapsed_time": "0:13:19", "remaining_time": "0:11:57", "throughput": 1275.35, "total_tokens": 1019728} +{"current_steps": 2540, "total_steps": 4810, "loss": 0.0836, "lr": 2.693027615548864e-06, "epoch": 2.64033264033264, "percentage": 52.81, "elapsed_time": "0:13:19", "remaining_time": "0:11:54", "throughput": 1277.48, "total_tokens": 1021840} +{"current_steps": 2545, "total_steps": 4810, "loss": 0.0909, "lr": 2.6839820868832433e-06, "epoch": 2.6455301455301456, "percentage": 52.91, "elapsed_time": "0:13:20", "remaining_time": "0:11:52", "throughput": 1279.44, "total_tokens": 1023824} +{"current_steps": 2550, "total_steps": 4810, "loss": 0.0143, "lr": 2.6749341358522675e-06, "epoch": 2.6507276507276507, "percentage": 53.01, "elapsed_time": "0:13:20", "remaining_time": "0:11:49", "throughput": 1281.17, "total_tokens": 1025616} +{"current_steps": 2555, "total_steps": 4810, "loss": 0.0105, "lr": 2.665883881584072e-06, "epoch": 2.6559251559251558, "percentage": 53.12, "elapsed_time": "0:13:20", "remaining_time": "0:11:46", "throughput": 1283.22, "total_tokens": 1027664} +{"current_steps": 2560, "total_steps": 4810, "loss": 0.0167, "lr": 2.6568314432371183e-06, "epoch": 2.6611226611226613, "percentage": 53.22, "elapsed_time": "0:13:21", "remaining_time": "0:11:44", "throughput": 1285.18, "total_tokens": 1029648} +{"current_steps": 2565, "total_steps": 4810, "loss": 0.0354, "lr": 2.647776939998625e-06, "epoch": 2.6663201663201663, "percentage": 53.33, "elapsed_time": "0:13:21", "remaining_time": "0:11:41", "throughput": 1287.15, "total_tokens": 1031632} +{"current_steps": 2570, "total_steps": 4810, "loss": 0.0416, "lr": 2.6387204910829954e-06, "epoch": 2.6715176715176714, "percentage": 53.43, "elapsed_time": "0:13:21", "remaining_time": "0:11:38", "throughput": 1288.95, "total_tokens": 1033488} +{"current_steps": 2575, "total_steps": 4810, "loss": 0.0011, "lr": 2.629662215730253e-06, "epoch": 2.6767151767151764, "percentage": 53.53, "elapsed_time": "0:13:22", "remaining_time": "0:11:36", "throughput": 1290.99, "total_tokens": 1035536} +{"current_steps": 2580, "total_steps": 4810, "loss": 0.0636, "lr": 2.620602233204467e-06, "epoch": 2.681912681912682, "percentage": 53.64, "elapsed_time": "0:13:22", "remaining_time": "0:11:33", "throughput": 1293.02, "total_tokens": 1037584} +{"current_steps": 2585, "total_steps": 4810, "loss": 0.1506, "lr": 2.6115406627921823e-06, "epoch": 2.687110187110187, "percentage": 53.74, "elapsed_time": "0:13:22", "remaining_time": "0:11:30", "throughput": 1294.98, "total_tokens": 1039568} +{"current_steps": 2590, "total_steps": 4810, "loss": 0.0269, "lr": 2.6024776238008543e-06, "epoch": 2.6923076923076925, "percentage": 53.85, "elapsed_time": "0:13:23", "remaining_time": "0:11:28", "throughput": 1297.01, "total_tokens": 1041616} +{"current_steps": 2595, "total_steps": 4810, "loss": 0.1038, "lr": 2.5934132355572713e-06, "epoch": 2.6975051975051976, "percentage": 53.95, "elapsed_time": "0:13:23", "remaining_time": "0:11:25", "throughput": 1299.04, "total_tokens": 1043664} +{"current_steps": 2600, "total_steps": 4810, "loss": 0.159, "lr": 2.5843476174059874e-06, "epoch": 2.7027027027027026, "percentage": 54.05, "elapsed_time": "0:13:23", "remaining_time": "0:11:23", "throughput": 1300.84, "total_tokens": 1045520} +{"current_steps": 2605, "total_steps": 4810, "loss": 0.1412, "lr": 2.575280888707748e-06, "epoch": 2.7079002079002077, "percentage": 54.16, "elapsed_time": "0:13:24", "remaining_time": "0:11:20", "throughput": 1302.63, "total_tokens": 1047376} +{"current_steps": 2610, "total_steps": 4810, "loss": 0.0029, "lr": 2.5662131688379244e-06, "epoch": 2.713097713097713, "percentage": 54.26, "elapsed_time": "0:13:24", "remaining_time": "0:11:18", "throughput": 1304.57, "total_tokens": 1049360} +{"current_steps": 2615, "total_steps": 4810, "loss": 0.054, "lr": 2.557144577184933e-06, "epoch": 2.7182952182952183, "percentage": 54.37, "elapsed_time": "0:13:24", "remaining_time": "0:11:15", "throughput": 1306.52, "total_tokens": 1051344} +{"current_steps": 2620, "total_steps": 4810, "loss": 0.0051, "lr": 2.5480752331486742e-06, "epoch": 2.7234927234927238, "percentage": 54.47, "elapsed_time": "0:13:25", "remaining_time": "0:11:12", "throughput": 1308.39, "total_tokens": 1053264} +{"current_steps": 2625, "total_steps": 4810, "loss": 0.0494, "lr": 2.539005256138948e-06, "epoch": 2.728690228690229, "percentage": 54.57, "elapsed_time": "0:13:25", "remaining_time": "0:11:10", "throughput": 1310.33, "total_tokens": 1055248} +{"current_steps": 2630, "total_steps": 4810, "loss": 0.0155, "lr": 2.529934765573893e-06, "epoch": 2.733887733887734, "percentage": 54.68, "elapsed_time": "0:13:25", "remaining_time": "0:11:07", "throughput": 1312.12, "total_tokens": 1057104} +{"current_steps": 2635, "total_steps": 4810, "loss": 0.0379, "lr": 2.520863880878408e-06, "epoch": 2.739085239085239, "percentage": 54.78, "elapsed_time": "0:13:25", "remaining_time": "0:11:05", "throughput": 1313.98, "total_tokens": 1059024} +{"current_steps": 2640, "total_steps": 4810, "loss": 0.2379, "lr": 2.511792721482581e-06, "epoch": 2.7442827442827444, "percentage": 54.89, "elapsed_time": "0:13:26", "remaining_time": "0:11:02", "throughput": 1315.84, "total_tokens": 1060944} +{"current_steps": 2645, "total_steps": 4810, "loss": 0.038, "lr": 2.502721406820116e-06, "epoch": 2.7494802494802495, "percentage": 54.99, "elapsed_time": "0:13:26", "remaining_time": "0:11:00", "throughput": 1317.86, "total_tokens": 1062992} +{"current_steps": 2650, "total_steps": 4810, "loss": 0.0705, "lr": 2.493650056326763e-06, "epoch": 2.7546777546777546, "percentage": 55.09, "elapsed_time": "0:13:26", "remaining_time": "0:10:57", "throughput": 1319.64, "total_tokens": 1064848} +{"current_steps": 2651, "total_steps": 4810, "eval_loss": 0.306118369102478, "epoch": 2.7557172557172556, "percentage": 55.11, "elapsed_time": "0:13:28", "remaining_time": "0:10:58", "throughput": 1318.26, "total_tokens": 1065232} +{"current_steps": 2655, "total_steps": 4810, "loss": 0.2106, "lr": 2.4845787894387427e-06, "epoch": 2.75987525987526, "percentage": 55.2, "elapsed_time": "0:15:04", "remaining_time": "0:12:14", "throughput": 1179.21, "total_tokens": 1066832} +{"current_steps": 2660, "total_steps": 4810, "loss": 0.0032, "lr": 2.4755077255911746e-06, "epoch": 2.765072765072765, "percentage": 55.3, "elapsed_time": "0:15:05", "remaining_time": "0:12:11", "throughput": 1181.05, "total_tokens": 1068880} +{"current_steps": 2665, "total_steps": 4810, "loss": 0.151, "lr": 2.466436984216507e-06, "epoch": 2.77027027027027, "percentage": 55.41, "elapsed_time": "0:15:05", "remaining_time": "0:12:08", "throughput": 1182.82, "total_tokens": 1070864} +{"current_steps": 2670, "total_steps": 4810, "loss": 0.1102, "lr": 2.4573666847429383e-06, "epoch": 2.7754677754677752, "percentage": 55.51, "elapsed_time": "0:15:05", "remaining_time": "0:12:05", "throughput": 1184.59, "total_tokens": 1072848} +{"current_steps": 2675, "total_steps": 4810, "loss": 0.0628, "lr": 2.4482969465928545e-06, "epoch": 2.7806652806652807, "percentage": 55.61, "elapsed_time": "0:15:05", "remaining_time": "0:12:03", "throughput": 1186.36, "total_tokens": 1074832} +{"current_steps": 2680, "total_steps": 4810, "loss": 0.002, "lr": 2.4392278891812457e-06, "epoch": 2.785862785862786, "percentage": 55.72, "elapsed_time": "0:15:06", "remaining_time": "0:12:00", "throughput": 1188.27, "total_tokens": 1076944} +{"current_steps": 2685, "total_steps": 4810, "loss": 0.0233, "lr": 2.430159631914141e-06, "epoch": 2.7910602910602913, "percentage": 55.82, "elapsed_time": "0:15:06", "remaining_time": "0:11:57", "throughput": 1189.89, "total_tokens": 1078800} +{"current_steps": 2690, "total_steps": 4810, "loss": 0.1463, "lr": 2.421092294187037e-06, "epoch": 2.7962577962577964, "percentage": 55.93, "elapsed_time": "0:15:06", "remaining_time": "0:11:54", "throughput": 1191.8, "total_tokens": 1080912} +{"current_steps": 2695, "total_steps": 4810, "loss": 0.0068, "lr": 2.41202599538332e-06, "epoch": 2.8014553014553014, "percentage": 56.03, "elapsed_time": "0:15:07", "remaining_time": "0:11:52", "throughput": 1193.63, "total_tokens": 1082960} +{"current_steps": 2700, "total_steps": 4810, "loss": 0.0591, "lr": 2.402960854872697e-06, "epoch": 2.8066528066528065, "percentage": 56.13, "elapsed_time": "0:15:07", "remaining_time": "0:11:49", "throughput": 1195.46, "total_tokens": 1085008} +{"current_steps": 2705, "total_steps": 4810, "loss": 0.0729, "lr": 2.39389699200963e-06, "epoch": 2.811850311850312, "percentage": 56.24, "elapsed_time": "0:15:07", "remaining_time": "0:11:46", "throughput": 1197.43, "total_tokens": 1087184} +{"current_steps": 2710, "total_steps": 4810, "loss": 0.0013, "lr": 2.3848345261317523e-06, "epoch": 2.817047817047817, "percentage": 56.34, "elapsed_time": "0:15:08", "remaining_time": "0:11:43", "throughput": 1199.12, "total_tokens": 1089104} +{"current_steps": 2715, "total_steps": 4810, "loss": 0.1587, "lr": 2.3757735765583083e-06, "epoch": 2.822245322245322, "percentage": 56.44, "elapsed_time": "0:15:08", "remaining_time": "0:11:41", "throughput": 1200.81, "total_tokens": 1091024} +{"current_steps": 2720, "total_steps": 4810, "loss": 0.0685, "lr": 2.3667142625885774e-06, "epoch": 2.8274428274428276, "percentage": 56.55, "elapsed_time": "0:15:08", "remaining_time": "0:11:38", "throughput": 1202.56, "total_tokens": 1093008} +{"current_steps": 2725, "total_steps": 4810, "loss": 0.0005, "lr": 2.357656703500303e-06, "epoch": 2.8326403326403327, "percentage": 56.65, "elapsed_time": "0:15:09", "remaining_time": "0:11:35", "throughput": 1204.32, "total_tokens": 1094992} +{"current_steps": 2730, "total_steps": 4810, "loss": 0.0003, "lr": 2.3486010185481247e-06, "epoch": 2.8378378378378377, "percentage": 56.76, "elapsed_time": "0:15:09", "remaining_time": "0:11:32", "throughput": 1206.15, "total_tokens": 1097040} +{"current_steps": 2735, "total_steps": 4810, "loss": 0.1532, "lr": 2.3395473269620055e-06, "epoch": 2.8430353430353428, "percentage": 56.86, "elapsed_time": "0:15:09", "remaining_time": "0:11:30", "throughput": 1207.82, "total_tokens": 1098960} +{"current_steps": 2740, "total_steps": 4810, "loss": 0.0005, "lr": 2.330495747945665e-06, "epoch": 2.8482328482328483, "percentage": 56.96, "elapsed_time": "0:15:10", "remaining_time": "0:11:27", "throughput": 1209.85, "total_tokens": 1101200} +{"current_steps": 2745, "total_steps": 4810, "loss": 0.1635, "lr": 2.321446400675005e-06, "epoch": 2.8534303534303533, "percentage": 57.07, "elapsed_time": "0:15:10", "remaining_time": "0:11:24", "throughput": 1211.53, "total_tokens": 1103120} +{"current_steps": 2750, "total_steps": 4810, "loss": 0.0648, "lr": 2.3123994042965454e-06, "epoch": 2.858627858627859, "percentage": 57.17, "elapsed_time": "0:15:10", "remaining_time": "0:11:22", "throughput": 1213.35, "total_tokens": 1105168} +{"current_steps": 2755, "total_steps": 4810, "loss": 0.0463, "lr": 2.3033548779258535e-06, "epoch": 2.863825363825364, "percentage": 57.28, "elapsed_time": "0:15:11", "remaining_time": "0:11:19", "throughput": 1215.1, "total_tokens": 1107152} +{"current_steps": 2760, "total_steps": 4810, "loss": 0.2765, "lr": 2.2943129406459754e-06, "epoch": 2.869022869022869, "percentage": 57.38, "elapsed_time": "0:15:11", "remaining_time": "0:11:17", "throughput": 1216.91, "total_tokens": 1109200} +{"current_steps": 2765, "total_steps": 4810, "loss": 0.2216, "lr": 2.2852737115058684e-06, "epoch": 2.874220374220374, "percentage": 57.48, "elapsed_time": "0:15:11", "remaining_time": "0:11:14", "throughput": 1218.72, "total_tokens": 1111248} +{"current_steps": 2770, "total_steps": 4810, "loss": 0.1188, "lr": 2.2762373095188344e-06, "epoch": 2.8794178794178795, "percentage": 57.59, "elapsed_time": "0:15:12", "remaining_time": "0:11:11", "throughput": 1220.47, "total_tokens": 1113232} +{"current_steps": 2775, "total_steps": 4810, "loss": 0.0557, "lr": 2.2672038536609487e-06, "epoch": 2.8846153846153846, "percentage": 57.69, "elapsed_time": "0:15:12", "remaining_time": "0:11:09", "throughput": 1222.21, "total_tokens": 1115216} +{"current_steps": 2780, "total_steps": 4810, "loss": 0.0011, "lr": 2.2581734628695034e-06, "epoch": 2.88981288981289, "percentage": 57.8, "elapsed_time": "0:15:12", "remaining_time": "0:11:06", "throughput": 1224.02, "total_tokens": 1117264} +{"current_steps": 2785, "total_steps": 4810, "loss": 0.1068, "lr": 2.2491462560414287e-06, "epoch": 2.895010395010395, "percentage": 57.9, "elapsed_time": "0:15:13", "remaining_time": "0:11:03", "throughput": 1225.9, "total_tokens": 1119376} +{"current_steps": 2790, "total_steps": 4810, "loss": 0.1178, "lr": 2.2401223520317363e-06, "epoch": 2.9002079002079, "percentage": 58.0, "elapsed_time": "0:15:13", "remaining_time": "0:11:01", "throughput": 1227.71, "total_tokens": 1121424} +{"current_steps": 2795, "total_steps": 4810, "loss": 0.0582, "lr": 2.2311018696519532e-06, "epoch": 2.9054054054054053, "percentage": 58.11, "elapsed_time": "0:15:13", "remaining_time": "0:10:58", "throughput": 1229.52, "total_tokens": 1123472} +{"current_steps": 2800, "total_steps": 4810, "loss": 0.0007, "lr": 2.2220849276685533e-06, "epoch": 2.9106029106029108, "percentage": 58.21, "elapsed_time": "0:15:14", "remaining_time": "0:10:56", "throughput": 1231.4, "total_tokens": 1125584} +{"current_steps": 2805, "total_steps": 4810, "loss": 0.0783, "lr": 2.2130716448014e-06, "epoch": 2.915800415800416, "percentage": 58.32, "elapsed_time": "0:15:14", "remaining_time": "0:10:53", "throughput": 1233.13, "total_tokens": 1127568} +{"current_steps": 2810, "total_steps": 4810, "loss": 0.0946, "lr": 2.2040621397221762e-06, "epoch": 2.920997920997921, "percentage": 58.42, "elapsed_time": "0:15:14", "remaining_time": "0:10:51", "throughput": 1234.87, "total_tokens": 1129552} +{"current_steps": 2815, "total_steps": 4810, "loss": 0.0011, "lr": 2.1950565310528264e-06, "epoch": 2.9261954261954264, "percentage": 58.52, "elapsed_time": "0:15:15", "remaining_time": "0:10:48", "throughput": 1236.54, "total_tokens": 1131472} +{"current_steps": 2820, "total_steps": 4810, "loss": 0.0005, "lr": 2.186054937363996e-06, "epoch": 2.9313929313929314, "percentage": 58.63, "elapsed_time": "0:15:15", "remaining_time": "0:10:45", "throughput": 1238.21, "total_tokens": 1133392} +{"current_steps": 2825, "total_steps": 4810, "loss": 0.0004, "lr": 2.1770574771734644e-06, "epoch": 2.9365904365904365, "percentage": 58.73, "elapsed_time": "0:15:15", "remaining_time": "0:10:43", "throughput": 1240.01, "total_tokens": 1135440} +{"current_steps": 2830, "total_steps": 4810, "loss": 0.0037, "lr": 2.168064268944591e-06, "epoch": 2.9417879417879416, "percentage": 58.84, "elapsed_time": "0:15:15", "remaining_time": "0:10:40", "throughput": 1241.74, "total_tokens": 1137424} +{"current_steps": 2835, "total_steps": 4810, "loss": 0.018, "lr": 2.1590754310847513e-06, "epoch": 2.946985446985447, "percentage": 58.94, "elapsed_time": "0:15:16", "remaining_time": "0:10:38", "throughput": 1243.48, "total_tokens": 1139408} +{"current_steps": 2840, "total_steps": 4810, "loss": 0.1722, "lr": 2.150091081943777e-06, "epoch": 2.952182952182952, "percentage": 59.04, "elapsed_time": "0:15:16", "remaining_time": "0:10:35", "throughput": 1245.28, "total_tokens": 1141456} +{"current_steps": 2845, "total_steps": 4810, "loss": 0.1002, "lr": 2.141111339812405e-06, "epoch": 2.9573804573804576, "percentage": 59.15, "elapsed_time": "0:15:16", "remaining_time": "0:10:33", "throughput": 1247.01, "total_tokens": 1143440} +{"current_steps": 2850, "total_steps": 4810, "loss": 0.0783, "lr": 2.1321363229207097e-06, "epoch": 2.9625779625779627, "percentage": 59.25, "elapsed_time": "0:15:17", "remaining_time": "0:10:30", "throughput": 1248.67, "total_tokens": 1145360} +{"current_steps": 2855, "total_steps": 4810, "loss": 0.1061, "lr": 2.123166149436556e-06, "epoch": 2.9677754677754677, "percentage": 59.36, "elapsed_time": "0:15:17", "remaining_time": "0:10:28", "throughput": 1250.33, "total_tokens": 1147280} +{"current_steps": 2860, "total_steps": 4810, "loss": 0.1705, "lr": 2.114200937464035e-06, "epoch": 2.972972972972973, "percentage": 59.46, "elapsed_time": "0:15:17", "remaining_time": "0:10:25", "throughput": 1251.99, "total_tokens": 1149200} +{"current_steps": 2865, "total_steps": 4810, "loss": 0.003, "lr": 2.1052408050419153e-06, "epoch": 2.9781704781704783, "percentage": 59.56, "elapsed_time": "0:15:18", "remaining_time": "0:10:23", "throughput": 1253.71, "total_tokens": 1151184} +{"current_steps": 2870, "total_steps": 4810, "loss": 0.0952, "lr": 2.0962858701420867e-06, "epoch": 2.9833679833679834, "percentage": 59.67, "elapsed_time": "0:15:18", "remaining_time": "0:10:20", "throughput": 1255.51, "total_tokens": 1153232} +{"current_steps": 2875, "total_steps": 4810, "loss": 0.1992, "lr": 2.087336250668006e-06, "epoch": 2.9885654885654884, "percentage": 59.77, "elapsed_time": "0:15:18", "remaining_time": "0:10:18", "throughput": 1257.23, "total_tokens": 1155216} +{"current_steps": 2880, "total_steps": 4810, "loss": 0.1408, "lr": 2.0783920644531443e-06, "epoch": 2.993762993762994, "percentage": 59.88, "elapsed_time": "0:15:19", "remaining_time": "0:10:15", "throughput": 1259.02, "total_tokens": 1157264} +{"current_steps": 2885, "total_steps": 4810, "loss": 0.2101, "lr": 2.069453429259439e-06, "epoch": 2.998960498960499, "percentage": 59.98, "elapsed_time": "0:15:19", "remaining_time": "0:10:13", "throughput": 1260.81, "total_tokens": 1159312} +{"current_steps": 2890, "total_steps": 4810, "loss": 0.0016, "lr": 2.06052046277574e-06, "epoch": 3.004158004158004, "percentage": 60.08, "elapsed_time": "0:15:20", "remaining_time": "0:10:11", "throughput": 1261.75, "total_tokens": 1161248} +{"current_steps": 2892, "total_steps": 4810, "eval_loss": 0.2698093056678772, "epoch": 3.006237006237006, "percentage": 60.12, "elapsed_time": "0:15:21", "remaining_time": "0:10:11", "throughput": 1261.01, "total_tokens": 1162016} +{"current_steps": 2895, "total_steps": 4810, "loss": 0.0012, "lr": 2.051593282616262e-06, "epoch": 3.0093555093555096, "percentage": 60.19, "elapsed_time": "0:16:08", "remaining_time": "0:10:40", "throughput": 1201.34, "total_tokens": 1163168} +{"current_steps": 2900, "total_steps": 4810, "loss": 0.0559, "lr": 2.0426720063190335e-06, "epoch": 3.0145530145530146, "percentage": 60.29, "elapsed_time": "0:16:08", "remaining_time": "0:10:37", "throughput": 1202.92, "total_tokens": 1165088} +{"current_steps": 2905, "total_steps": 4810, "loss": 0.0012, "lr": 2.0337567513443518e-06, "epoch": 3.0197505197505197, "percentage": 60.4, "elapsed_time": "0:16:08", "remaining_time": "0:10:35", "throughput": 1204.64, "total_tokens": 1167136} +{"current_steps": 2910, "total_steps": 4810, "loss": 0.046, "lr": 2.0248476350732368e-06, "epoch": 3.024948024948025, "percentage": 60.5, "elapsed_time": "0:16:09", "remaining_time": "0:10:32", "throughput": 1206.28, "total_tokens": 1169120} +{"current_steps": 2915, "total_steps": 4810, "loss": 0.0235, "lr": 2.0159447748058803e-06, "epoch": 3.0301455301455302, "percentage": 60.6, "elapsed_time": "0:16:09", "remaining_time": "0:10:30", "throughput": 1207.87, "total_tokens": 1171040} +{"current_steps": 2920, "total_steps": 4810, "loss": 0.1135, "lr": 2.007048287760113e-06, "epoch": 3.0353430353430353, "percentage": 60.71, "elapsed_time": "0:16:09", "remaining_time": "0:10:27", "throughput": 1209.51, "total_tokens": 1173024} +{"current_steps": 2925, "total_steps": 4810, "loss": 0.0007, "lr": 1.998158291069845e-06, "epoch": 3.0405405405405403, "percentage": 60.81, "elapsed_time": "0:16:10", "remaining_time": "0:10:25", "throughput": 1211.09, "total_tokens": 1174944} +{"current_steps": 2930, "total_steps": 4810, "loss": 0.009, "lr": 1.989274901783538e-06, "epoch": 3.045738045738046, "percentage": 60.91, "elapsed_time": "0:16:10", "remaining_time": "0:10:22", "throughput": 1212.86, "total_tokens": 1177056} +{"current_steps": 2935, "total_steps": 4810, "loss": 0.0004, "lr": 1.9803982368626582e-06, "epoch": 3.050935550935551, "percentage": 61.02, "elapsed_time": "0:16:10", "remaining_time": "0:10:20", "throughput": 1214.44, "total_tokens": 1178976} +{"current_steps": 2940, "total_steps": 4810, "loss": 0.0007, "lr": 1.9715284131801353e-06, "epoch": 3.056133056133056, "percentage": 61.12, "elapsed_time": "0:16:11", "remaining_time": "0:10:17", "throughput": 1216.15, "total_tokens": 1181024} +{"current_steps": 2945, "total_steps": 4810, "loss": 0.0003, "lr": 1.9626655475188237e-06, "epoch": 3.0613305613305615, "percentage": 61.23, "elapsed_time": "0:16:11", "remaining_time": "0:10:15", "throughput": 1217.79, "total_tokens": 1183008} +{"current_steps": 2950, "total_steps": 4810, "loss": 0.0003, "lr": 1.953809756569971e-06, "epoch": 3.0665280665280665, "percentage": 61.33, "elapsed_time": "0:16:11", "remaining_time": "0:10:12", "throughput": 1219.49, "total_tokens": 1185056} +{"current_steps": 2955, "total_steps": 4810, "loss": 0.0623, "lr": 1.9449611569316716e-06, "epoch": 3.0717255717255716, "percentage": 61.43, "elapsed_time": "0:16:12", "remaining_time": "0:10:10", "throughput": 1221.07, "total_tokens": 1186976} +{"current_steps": 2960, "total_steps": 4810, "loss": 0.1065, "lr": 1.936119865107341e-06, "epoch": 3.076923076923077, "percentage": 61.54, "elapsed_time": "0:16:12", "remaining_time": "0:10:07", "throughput": 1222.7, "total_tokens": 1188960} +{"current_steps": 2965, "total_steps": 4810, "loss": 0.0002, "lr": 1.9272859975041757e-06, "epoch": 3.082120582120582, "percentage": 61.64, "elapsed_time": "0:16:12", "remaining_time": "0:10:05", "throughput": 1224.34, "total_tokens": 1190944} +{"current_steps": 2970, "total_steps": 4810, "loss": 0.0381, "lr": 1.918459670431622e-06, "epoch": 3.087318087318087, "percentage": 61.75, "elapsed_time": "0:16:13", "remaining_time": "0:10:02", "throughput": 1225.98, "total_tokens": 1192928} +{"current_steps": 2975, "total_steps": 4810, "loss": 0.0045, "lr": 1.9096410000998478e-06, "epoch": 3.0925155925155927, "percentage": 61.85, "elapsed_time": "0:16:13", "remaining_time": "0:10:00", "throughput": 1227.55, "total_tokens": 1194848} +{"current_steps": 2980, "total_steps": 4810, "loss": 0.0019, "lr": 1.9008301026182064e-06, "epoch": 3.0977130977130978, "percentage": 61.95, "elapsed_time": "0:16:13", "remaining_time": "0:09:57", "throughput": 1229.12, "total_tokens": 1196768} +{"current_steps": 2985, "total_steps": 4810, "loss": 0.0002, "lr": 1.892027093993716e-06, "epoch": 3.102910602910603, "percentage": 62.06, "elapsed_time": "0:16:14", "remaining_time": "0:09:55", "throughput": 1230.68, "total_tokens": 1198688} +{"current_steps": 2990, "total_steps": 4810, "loss": 0.0002, "lr": 1.883232090129523e-06, "epoch": 3.108108108108108, "percentage": 62.16, "elapsed_time": "0:16:14", "remaining_time": "0:09:53", "throughput": 1232.31, "total_tokens": 1200672} +{"current_steps": 2995, "total_steps": 4810, "loss": 0.0713, "lr": 1.8744452068233826e-06, "epoch": 3.1133056133056134, "percentage": 62.27, "elapsed_time": "0:16:14", "remaining_time": "0:09:50", "throughput": 1234.01, "total_tokens": 1202720} +{"current_steps": 3000, "total_steps": 4810, "loss": 0.0002, "lr": 1.8656665597661334e-06, "epoch": 3.1185031185031185, "percentage": 62.37, "elapsed_time": "0:16:14", "remaining_time": "0:09:48", "throughput": 1235.7, "total_tokens": 1204768} +{"current_steps": 3005, "total_steps": 4810, "loss": 0.0001, "lr": 1.8568962645401702e-06, "epoch": 3.1237006237006235, "percentage": 62.47, "elapsed_time": "0:16:15", "remaining_time": "0:09:45", "throughput": 1237.53, "total_tokens": 1206944} +{"current_steps": 3010, "total_steps": 4810, "loss": 0.095, "lr": 1.8481344366179284e-06, "epoch": 3.128898128898129, "percentage": 62.58, "elapsed_time": "0:16:15", "remaining_time": "0:09:43", "throughput": 1239.28, "total_tokens": 1209056} +{"current_steps": 3015, "total_steps": 4810, "loss": 0.0002, "lr": 1.8393811913603583e-06, "epoch": 3.134095634095634, "percentage": 62.68, "elapsed_time": "0:16:15", "remaining_time": "0:09:41", "throughput": 1240.84, "total_tokens": 1210976} +{"current_steps": 3020, "total_steps": 4810, "loss": 0.0002, "lr": 1.8306366440154067e-06, "epoch": 3.139293139293139, "percentage": 62.79, "elapsed_time": "0:16:16", "remaining_time": "0:09:38", "throughput": 1242.53, "total_tokens": 1213024} +{"current_steps": 3025, "total_steps": 4810, "loss": 0.0302, "lr": 1.8219009097165042e-06, "epoch": 3.1444906444906446, "percentage": 62.89, "elapsed_time": "0:16:16", "remaining_time": "0:09:36", "throughput": 1244.29, "total_tokens": 1215136} +{"current_steps": 3030, "total_steps": 4810, "loss": 0.0004, "lr": 1.8131741034810436e-06, "epoch": 3.1496881496881497, "percentage": 62.99, "elapsed_time": "0:16:16", "remaining_time": "0:09:33", "throughput": 1245.85, "total_tokens": 1217056} +{"current_steps": 3035, "total_steps": 4810, "loss": 0.0003, "lr": 1.8044563402088686e-06, "epoch": 3.1548856548856548, "percentage": 63.1, "elapsed_time": "0:16:17", "remaining_time": "0:09:31", "throughput": 1247.6, "total_tokens": 1219168} +{"current_steps": 3040, "total_steps": 4810, "loss": 0.0002, "lr": 1.7957477346807622e-06, "epoch": 3.1600831600831603, "percentage": 63.2, "elapsed_time": "0:16:17", "remaining_time": "0:09:29", "throughput": 1249.16, "total_tokens": 1221088} +{"current_steps": 3045, "total_steps": 4810, "loss": 0.0002, "lr": 1.7870484015569306e-06, "epoch": 3.1652806652806653, "percentage": 63.31, "elapsed_time": "0:16:17", "remaining_time": "0:09:26", "throughput": 1250.97, "total_tokens": 1223264} +{"current_steps": 3050, "total_steps": 4810, "loss": 0.0002, "lr": 1.7783584553755007e-06, "epoch": 3.1704781704781704, "percentage": 63.41, "elapsed_time": "0:16:18", "remaining_time": "0:09:24", "throughput": 1252.78, "total_tokens": 1225440} +{"current_steps": 3055, "total_steps": 4810, "loss": 0.0864, "lr": 1.769678010551003e-06, "epoch": 3.175675675675676, "percentage": 63.51, "elapsed_time": "0:16:18", "remaining_time": "0:09:22", "throughput": 1254.4, "total_tokens": 1227424} +{"current_steps": 3060, "total_steps": 4810, "loss": 0.0793, "lr": 1.7610071813728741e-06, "epoch": 3.180873180873181, "percentage": 63.62, "elapsed_time": "0:16:18", "remaining_time": "0:09:19", "throughput": 1255.95, "total_tokens": 1229344} +{"current_steps": 3065, "total_steps": 4810, "loss": 0.0974, "lr": 1.7523460820039466e-06, "epoch": 3.186070686070686, "percentage": 63.72, "elapsed_time": "0:16:19", "remaining_time": "0:09:17", "throughput": 1257.7, "total_tokens": 1231456} +{"current_steps": 3070, "total_steps": 4810, "loss": 0.0003, "lr": 1.7436948264789465e-06, "epoch": 3.1912681912681915, "percentage": 63.83, "elapsed_time": "0:16:19", "remaining_time": "0:09:15", "throughput": 1259.31, "total_tokens": 1233440} +{"current_steps": 3075, "total_steps": 4810, "loss": 0.0779, "lr": 1.7350535287029957e-06, "epoch": 3.1964656964656966, "percentage": 63.93, "elapsed_time": "0:16:19", "remaining_time": "0:09:12", "throughput": 1261.05, "total_tokens": 1235552} +{"current_steps": 3080, "total_steps": 4810, "loss": 0.152, "lr": 1.7264223024501064e-06, "epoch": 3.2016632016632016, "percentage": 64.03, "elapsed_time": "0:16:20", "remaining_time": "0:09:10", "throughput": 1262.67, "total_tokens": 1237536} +{"current_steps": 3085, "total_steps": 4810, "loss": 0.0004, "lr": 1.717801261361685e-06, "epoch": 3.2068607068607067, "percentage": 64.14, "elapsed_time": "0:16:20", "remaining_time": "0:09:08", "throughput": 1264.34, "total_tokens": 1239584} +{"current_steps": 3090, "total_steps": 4810, "loss": 0.0013, "lr": 1.7091905189450425e-06, "epoch": 3.212058212058212, "percentage": 64.24, "elapsed_time": "0:16:20", "remaining_time": "0:09:05", "throughput": 1265.89, "total_tokens": 1241504} +{"current_steps": 3095, "total_steps": 4810, "loss": 0.0375, "lr": 1.700590188571887e-06, "epoch": 3.2172557172557172, "percentage": 64.35, "elapsed_time": "0:16:21", "remaining_time": "0:09:03", "throughput": 1267.56, "total_tokens": 1243552} +{"current_steps": 3100, "total_steps": 4810, "loss": 0.0002, "lr": 1.6920003834768438e-06, "epoch": 3.2224532224532223, "percentage": 64.45, "elapsed_time": "0:16:21", "remaining_time": "0:09:01", "throughput": 1269.23, "total_tokens": 1245600} +{"current_steps": 3105, "total_steps": 4810, "loss": 0.0002, "lr": 1.6834212167559578e-06, "epoch": 3.227650727650728, "percentage": 64.55, "elapsed_time": "0:16:21", "remaining_time": "0:08:59", "throughput": 1270.97, "total_tokens": 1247712} +{"current_steps": 3110, "total_steps": 4810, "loss": 0.031, "lr": 1.6748528013652032e-06, "epoch": 3.232848232848233, "percentage": 64.66, "elapsed_time": "0:16:22", "remaining_time": "0:08:56", "throughput": 1272.57, "total_tokens": 1249696} +{"current_steps": 3115, "total_steps": 4810, "loss": 0.0647, "lr": 1.6662952501190032e-06, "epoch": 3.238045738045738, "percentage": 64.76, "elapsed_time": "0:16:22", "remaining_time": "0:08:54", "throughput": 1274.31, "total_tokens": 1251808} +{"current_steps": 3120, "total_steps": 4810, "loss": 0.0462, "lr": 1.6577486756887376e-06, "epoch": 3.2432432432432434, "percentage": 64.86, "elapsed_time": "0:16:22", "remaining_time": "0:08:52", "throughput": 1275.85, "total_tokens": 1253728} +{"current_steps": 3125, "total_steps": 4810, "loss": 0.0289, "lr": 1.6492131906012608e-06, "epoch": 3.2484407484407485, "percentage": 64.97, "elapsed_time": "0:16:22", "remaining_time": "0:08:50", "throughput": 1277.58, "total_tokens": 1255840} +{"current_steps": 3130, "total_steps": 4810, "loss": 0.0469, "lr": 1.640688907237425e-06, "epoch": 3.2536382536382535, "percentage": 65.07, "elapsed_time": "0:16:23", "remaining_time": "0:08:47", "throughput": 1279.24, "total_tokens": 1257888} +{"current_steps": 3133, "total_steps": 4810, "eval_loss": 0.36025160551071167, "epoch": 3.2567567567567566, "percentage": 65.14, "elapsed_time": "0:16:24", "remaining_time": "0:08:47", "throughput": 1278.91, "total_tokens": 1259168} +{"current_steps": 3135, "total_steps": 4810, "loss": 0.0767, "lr": 1.632175937830594e-06, "epoch": 3.258835758835759, "percentage": 65.18, "elapsed_time": "0:16:58", "remaining_time": "0:09:04", "throughput": 1237.0, "total_tokens": 1259936} +{"current_steps": 3140, "total_steps": 4810, "loss": 0.0504, "lr": 1.6236743944651703e-06, "epoch": 3.264033264033264, "percentage": 65.28, "elapsed_time": "0:16:58", "remaining_time": "0:09:01", "throughput": 1238.74, "total_tokens": 1262112} +{"current_steps": 3145, "total_steps": 4810, "loss": 0.0185, "lr": 1.6151843890751172e-06, "epoch": 3.269230769230769, "percentage": 65.38, "elapsed_time": "0:16:59", "remaining_time": "0:08:59", "throughput": 1240.11, "total_tokens": 1263904} +{"current_steps": 3150, "total_steps": 4810, "loss": 0.0131, "lr": 1.6067060334424836e-06, "epoch": 3.274428274428274, "percentage": 65.49, "elapsed_time": "0:16:59", "remaining_time": "0:08:57", "throughput": 1241.73, "total_tokens": 1265952} +{"current_steps": 3155, "total_steps": 4810, "loss": 0.0002, "lr": 1.5982394391959382e-06, "epoch": 3.2796257796257797, "percentage": 65.59, "elapsed_time": "0:16:59", "remaining_time": "0:08:54", "throughput": 1243.21, "total_tokens": 1267872} +{"current_steps": 3160, "total_steps": 4810, "loss": 0.0937, "lr": 1.5897847178092902e-06, "epoch": 3.284823284823285, "percentage": 65.7, "elapsed_time": "0:17:00", "remaining_time": "0:08:52", "throughput": 1244.71, "total_tokens": 1269792} +{"current_steps": 3165, "total_steps": 4810, "loss": 0.0014, "lr": 1.5813419806000329e-06, "epoch": 3.29002079002079, "percentage": 65.8, "elapsed_time": "0:17:00", "remaining_time": "0:08:50", "throughput": 1246.26, "total_tokens": 1271776} +{"current_steps": 3170, "total_steps": 4810, "loss": 0.0785, "lr": 1.5729113387278675e-06, "epoch": 3.2952182952182953, "percentage": 65.9, "elapsed_time": "0:17:00", "remaining_time": "0:08:48", "throughput": 1247.81, "total_tokens": 1273760} +{"current_steps": 3175, "total_steps": 4810, "loss": 0.1213, "lr": 1.5644929031932455e-06, "epoch": 3.3004158004158004, "percentage": 66.01, "elapsed_time": "0:17:01", "remaining_time": "0:08:45", "throughput": 1249.42, "total_tokens": 1275808} +{"current_steps": 3180, "total_steps": 4810, "loss": 0.0576, "lr": 1.556086784835908e-06, "epoch": 3.3056133056133055, "percentage": 66.11, "elapsed_time": "0:17:01", "remaining_time": "0:08:43", "throughput": 1250.97, "total_tokens": 1277792} +{"current_steps": 3185, "total_steps": 4810, "loss": 0.0004, "lr": 1.547693094333421e-06, "epoch": 3.310810810810811, "percentage": 66.22, "elapsed_time": "0:17:01", "remaining_time": "0:08:41", "throughput": 1252.52, "total_tokens": 1279776} +{"current_steps": 3190, "total_steps": 4810, "loss": 0.1482, "lr": 1.5393119421997252e-06, "epoch": 3.316008316008316, "percentage": 66.32, "elapsed_time": "0:17:02", "remaining_time": "0:08:39", "throughput": 1254.07, "total_tokens": 1281760} +{"current_steps": 3195, "total_steps": 4810, "loss": 0.0042, "lr": 1.5309434387836737e-06, "epoch": 3.321205821205821, "percentage": 66.42, "elapsed_time": "0:17:02", "remaining_time": "0:08:36", "throughput": 1255.61, "total_tokens": 1283744} +{"current_steps": 3200, "total_steps": 4810, "loss": 0.0005, "lr": 1.5225876942675844e-06, "epoch": 3.3264033264033266, "percentage": 66.53, "elapsed_time": "0:17:02", "remaining_time": "0:08:34", "throughput": 1257.22, "total_tokens": 1285792} +{"current_steps": 3205, "total_steps": 4810, "loss": 0.0525, "lr": 1.514244818665788e-06, "epoch": 3.3316008316008316, "percentage": 66.63, "elapsed_time": "0:17:03", "remaining_time": "0:08:32", "throughput": 1258.77, "total_tokens": 1287776} +{"current_steps": 3210, "total_steps": 4810, "loss": 0.0002, "lr": 1.505914921823178e-06, "epoch": 3.3367983367983367, "percentage": 66.74, "elapsed_time": "0:17:03", "remaining_time": "0:08:30", "throughput": 1260.25, "total_tokens": 1289696} +{"current_steps": 3215, "total_steps": 4810, "loss": 0.0006, "lr": 1.497598113413766e-06, "epoch": 3.3419958419958418, "percentage": 66.84, "elapsed_time": "0:17:03", "remaining_time": "0:08:27", "throughput": 1261.79, "total_tokens": 1291680} +{"current_steps": 3220, "total_steps": 4810, "loss": 0.0003, "lr": 1.489294502939238e-06, "epoch": 3.3471933471933473, "percentage": 66.94, "elapsed_time": "0:17:04", "remaining_time": "0:08:25", "throughput": 1263.21, "total_tokens": 1293536} +{"current_steps": 3225, "total_steps": 4810, "loss": 0.0003, "lr": 1.4810041997275094e-06, "epoch": 3.3523908523908523, "percentage": 67.05, "elapsed_time": "0:17:04", "remaining_time": "0:08:23", "throughput": 1264.94, "total_tokens": 1295712} +{"current_steps": 3230, "total_steps": 4810, "loss": 0.0008, "lr": 1.4727273129312918e-06, "epoch": 3.357588357588358, "percentage": 67.15, "elapsed_time": "0:17:04", "remaining_time": "0:08:21", "throughput": 1266.54, "total_tokens": 1297760} +{"current_steps": 3235, "total_steps": 4810, "loss": 0.0001, "lr": 1.4644639515266484e-06, "epoch": 3.362785862785863, "percentage": 67.26, "elapsed_time": "0:17:04", "remaining_time": "0:08:19", "throughput": 1268.14, "total_tokens": 1299808} +{"current_steps": 3240, "total_steps": 4810, "loss": 0.0002, "lr": 1.4562142243115646e-06, "epoch": 3.367983367983368, "percentage": 67.36, "elapsed_time": "0:17:05", "remaining_time": "0:08:16", "throughput": 1269.8, "total_tokens": 1301920} +{"current_steps": 3245, "total_steps": 4810, "loss": 0.0054, "lr": 1.4479782399045152e-06, "epoch": 3.373180873180873, "percentage": 67.46, "elapsed_time": "0:17:05", "remaining_time": "0:08:14", "throughput": 1271.33, "total_tokens": 1303904} +{"current_steps": 3250, "total_steps": 4810, "loss": 0.0424, "lr": 1.43975610674303e-06, "epoch": 3.3783783783783785, "percentage": 67.57, "elapsed_time": "0:17:05", "remaining_time": "0:08:12", "throughput": 1272.87, "total_tokens": 1305888} +{"current_steps": 3255, "total_steps": 4810, "loss": 0.1061, "lr": 1.4315479330822711e-06, "epoch": 3.3835758835758836, "percentage": 67.67, "elapsed_time": "0:17:06", "remaining_time": "0:08:10", "throughput": 1274.59, "total_tokens": 1308064} +{"current_steps": 3260, "total_steps": 4810, "loss": 0.0016, "lr": 1.4233538269936042e-06, "epoch": 3.3887733887733886, "percentage": 67.78, "elapsed_time": "0:17:06", "remaining_time": "0:08:08", "throughput": 1276.12, "total_tokens": 1310048} +{"current_steps": 3265, "total_steps": 4810, "loss": 0.1162, "lr": 1.415173896363178e-06, "epoch": 3.393970893970894, "percentage": 67.88, "elapsed_time": "0:17:06", "remaining_time": "0:08:05", "throughput": 1277.59, "total_tokens": 1311968} +{"current_steps": 3270, "total_steps": 4810, "loss": 0.0003, "lr": 1.4070082488905034e-06, "epoch": 3.399168399168399, "percentage": 67.98, "elapsed_time": "0:17:07", "remaining_time": "0:08:03", "throughput": 1279.06, "total_tokens": 1313888} +{"current_steps": 3275, "total_steps": 4810, "loss": 0.0648, "lr": 1.3988569920870315e-06, "epoch": 3.4043659043659042, "percentage": 68.09, "elapsed_time": "0:17:07", "remaining_time": "0:08:01", "throughput": 1280.78, "total_tokens": 1316064} +{"current_steps": 3280, "total_steps": 4810, "loss": 0.0011, "lr": 1.3907202332747454e-06, "epoch": 3.4095634095634098, "percentage": 68.19, "elapsed_time": "0:17:07", "remaining_time": "0:07:59", "throughput": 1282.37, "total_tokens": 1318112} +{"current_steps": 3285, "total_steps": 4810, "loss": 0.0003, "lr": 1.3825980795847401e-06, "epoch": 3.414760914760915, "percentage": 68.3, "elapsed_time": "0:17:08", "remaining_time": "0:07:57", "throughput": 1283.78, "total_tokens": 1319968} +{"current_steps": 3290, "total_steps": 4810, "loss": 0.038, "lr": 1.3744906379558165e-06, "epoch": 3.41995841995842, "percentage": 68.4, "elapsed_time": "0:17:08", "remaining_time": "0:07:55", "throughput": 1285.37, "total_tokens": 1322016} +{"current_steps": 3295, "total_steps": 4810, "loss": 0.0009, "lr": 1.3663980151330734e-06, "epoch": 3.4251559251559254, "percentage": 68.5, "elapsed_time": "0:17:08", "remaining_time": "0:07:53", "throughput": 1286.83, "total_tokens": 1323936} +{"current_steps": 3300, "total_steps": 4810, "loss": 0.0241, "lr": 1.358320317666496e-06, "epoch": 3.4303534303534304, "percentage": 68.61, "elapsed_time": "0:17:09", "remaining_time": "0:07:50", "throughput": 1288.36, "total_tokens": 1325920} +{"current_steps": 3305, "total_steps": 4810, "loss": 0.0668, "lr": 1.350257651909562e-06, "epoch": 3.4355509355509355, "percentage": 68.71, "elapsed_time": "0:17:09", "remaining_time": "0:07:48", "throughput": 1289.82, "total_tokens": 1327840} +{"current_steps": 3310, "total_steps": 4810, "loss": 0.0001, "lr": 1.3422101240178365e-06, "epoch": 3.4407484407484406, "percentage": 68.81, "elapsed_time": "0:17:09", "remaining_time": "0:07:46", "throughput": 1291.29, "total_tokens": 1329760} +{"current_steps": 3315, "total_steps": 4810, "loss": 0.0002, "lr": 1.3341778399475714e-06, "epoch": 3.445945945945946, "percentage": 68.92, "elapsed_time": "0:17:10", "remaining_time": "0:07:44", "throughput": 1292.81, "total_tokens": 1331744} +{"current_steps": 3320, "total_steps": 4810, "loss": 0.0278, "lr": 1.3261609054543178e-06, "epoch": 3.451143451143451, "percentage": 69.02, "elapsed_time": "0:17:10", "remaining_time": "0:07:42", "throughput": 1294.4, "total_tokens": 1333792} +{"current_steps": 3325, "total_steps": 4810, "loss": 0.0412, "lr": 1.3181594260915263e-06, "epoch": 3.456340956340956, "percentage": 69.13, "elapsed_time": "0:17:10", "remaining_time": "0:07:40", "throughput": 1295.92, "total_tokens": 1335776} +{"current_steps": 3330, "total_steps": 4810, "loss": 0.0003, "lr": 1.3101735072091624e-06, "epoch": 3.4615384615384617, "percentage": 69.23, "elapsed_time": "0:17:11", "remaining_time": "0:07:38", "throughput": 1297.5, "total_tokens": 1337824} +{"current_steps": 3335, "total_steps": 4810, "loss": 0.0311, "lr": 1.3022032539523177e-06, "epoch": 3.4667359667359667, "percentage": 69.33, "elapsed_time": "0:17:11", "remaining_time": "0:07:36", "throughput": 1299.08, "total_tokens": 1339872} +{"current_steps": 3340, "total_steps": 4810, "loss": 0.0937, "lr": 1.2942487712598234e-06, "epoch": 3.471933471933472, "percentage": 69.44, "elapsed_time": "0:17:11", "remaining_time": "0:07:34", "throughput": 1300.66, "total_tokens": 1341920} +{"current_steps": 3345, "total_steps": 4810, "loss": 0.0176, "lr": 1.2863101638628716e-06, "epoch": 3.4771309771309773, "percentage": 69.54, "elapsed_time": "0:17:12", "remaining_time": "0:07:31", "throughput": 1302.18, "total_tokens": 1343904} +{"current_steps": 3350, "total_steps": 4810, "loss": 0.0738, "lr": 1.2783875362836373e-06, "epoch": 3.4823284823284824, "percentage": 69.65, "elapsed_time": "0:17:12", "remaining_time": "0:07:29", "throughput": 1303.76, "total_tokens": 1345952} +{"current_steps": 3355, "total_steps": 4810, "loss": 0.0394, "lr": 1.2704809928338957e-06, "epoch": 3.4875259875259874, "percentage": 69.75, "elapsed_time": "0:17:12", "remaining_time": "0:07:27", "throughput": 1305.46, "total_tokens": 1348128} +{"current_steps": 3360, "total_steps": 4810, "loss": 0.0012, "lr": 1.2625906376136582e-06, "epoch": 3.492723492723493, "percentage": 69.85, "elapsed_time": "0:17:13", "remaining_time": "0:07:25", "throughput": 1306.92, "total_tokens": 1350048} +{"current_steps": 3365, "total_steps": 4810, "loss": 0.1121, "lr": 1.2547165745097927e-06, "epoch": 3.497920997920998, "percentage": 69.96, "elapsed_time": "0:17:13", "remaining_time": "0:07:23", "throughput": 1308.37, "total_tokens": 1351968} +{"current_steps": 3370, "total_steps": 4810, "loss": 0.0682, "lr": 1.2468589071946632e-06, "epoch": 3.503118503118503, "percentage": 70.06, "elapsed_time": "0:17:13", "remaining_time": "0:07:21", "throughput": 1309.88, "total_tokens": 1353952} +{"current_steps": 3374, "total_steps": 4810, "eval_loss": 0.4127735495567322, "epoch": 3.507276507276507, "percentage": 70.15, "elapsed_time": "0:17:14", "remaining_time": "0:07:20", "throughput": 1309.81, "total_tokens": 1355552} +{"current_steps": 3375, "total_steps": 4810, "loss": 0.0726, "lr": 1.2390177391247616e-06, "epoch": 3.508316008316008, "percentage": 70.17, "elapsed_time": "0:18:52", "remaining_time": "0:08:01", "throughput": 1197.82, "total_tokens": 1356000} +{"current_steps": 3380, "total_steps": 4810, "loss": 0.1161, "lr": 1.2311931735393417e-06, "epoch": 3.5135135135135136, "percentage": 70.27, "elapsed_time": "0:18:52", "remaining_time": "0:07:59", "throughput": 1199.23, "total_tokens": 1357984} +{"current_steps": 3385, "total_steps": 4810, "loss": 0.0002, "lr": 1.2233853134590698e-06, "epoch": 3.5187110187110187, "percentage": 70.37, "elapsed_time": "0:18:52", "remaining_time": "0:07:56", "throughput": 1200.59, "total_tokens": 1359904} +{"current_steps": 3390, "total_steps": 4810, "loss": 0.0385, "lr": 1.2155942616846562e-06, "epoch": 3.523908523908524, "percentage": 70.48, "elapsed_time": "0:18:53", "remaining_time": "0:07:54", "throughput": 1202.06, "total_tokens": 1361952} +{"current_steps": 3395, "total_steps": 4810, "loss": 0.1318, "lr": 1.2078201207955122e-06, "epoch": 3.529106029106029, "percentage": 70.58, "elapsed_time": "0:18:53", "remaining_time": "0:07:52", "throughput": 1203.52, "total_tokens": 1364000} +{"current_steps": 3400, "total_steps": 4810, "loss": 0.0008, "lr": 1.2000629931483947e-06, "epoch": 3.5343035343035343, "percentage": 70.69, "elapsed_time": "0:18:53", "remaining_time": "0:07:50", "throughput": 1205.05, "total_tokens": 1366112} +{"current_steps": 3405, "total_steps": 4810, "loss": 0.0016, "lr": 1.1923229808760565e-06, "epoch": 3.5395010395010393, "percentage": 70.79, "elapsed_time": "0:18:53", "remaining_time": "0:07:47", "throughput": 1206.45, "total_tokens": 1368096} +{"current_steps": 3410, "total_steps": 4810, "loss": 0.0661, "lr": 1.1846001858859054e-06, "epoch": 3.544698544698545, "percentage": 70.89, "elapsed_time": "0:18:54", "remaining_time": "0:07:45", "throughput": 1207.97, "total_tokens": 1370208} +{"current_steps": 3415, "total_steps": 4810, "loss": 0.0004, "lr": 1.1768947098586628e-06, "epoch": 3.54989604989605, "percentage": 71.0, "elapsed_time": "0:18:54", "remaining_time": "0:07:43", "throughput": 1209.37, "total_tokens": 1372192} +{"current_steps": 3420, "total_steps": 4810, "loss": 0.0171, "lr": 1.1692066542470202e-06, "epoch": 3.555093555093555, "percentage": 71.1, "elapsed_time": "0:18:54", "remaining_time": "0:07:41", "throughput": 1210.83, "total_tokens": 1374240} +{"current_steps": 3425, "total_steps": 4810, "loss": 0.0003, "lr": 1.1615361202743088e-06, "epoch": 3.5602910602910605, "percentage": 71.21, "elapsed_time": "0:18:55", "remaining_time": "0:07:39", "throughput": 1212.18, "total_tokens": 1376160} +{"current_steps": 3430, "total_steps": 4810, "loss": 0.0008, "lr": 1.1538832089331628e-06, "epoch": 3.5654885654885655, "percentage": 71.31, "elapsed_time": "0:18:55", "remaining_time": "0:07:36", "throughput": 1213.64, "total_tokens": 1378208} +{"current_steps": 3435, "total_steps": 4810, "loss": 0.0007, "lr": 1.1462480209841928e-06, "epoch": 3.5706860706860706, "percentage": 71.41, "elapsed_time": "0:18:55", "remaining_time": "0:07:34", "throughput": 1215.04, "total_tokens": 1380192} +{"current_steps": 3440, "total_steps": 4810, "loss": 0.0491, "lr": 1.1386306569546578e-06, "epoch": 3.5758835758835756, "percentage": 71.52, "elapsed_time": "0:18:56", "remaining_time": "0:07:32", "throughput": 1216.61, "total_tokens": 1382368} +{"current_steps": 3445, "total_steps": 4810, "loss": 0.0002, "lr": 1.1310312171371394e-06, "epoch": 3.581081081081081, "percentage": 71.62, "elapsed_time": "0:18:56", "remaining_time": "0:07:30", "throughput": 1218.23, "total_tokens": 1384608} +{"current_steps": 3450, "total_steps": 4810, "loss": 0.1426, "lr": 1.123449801588226e-06, "epoch": 3.586278586278586, "percentage": 71.73, "elapsed_time": "0:18:56", "remaining_time": "0:07:28", "throughput": 1219.63, "total_tokens": 1386592} +{"current_steps": 3455, "total_steps": 4810, "loss": 0.098, "lr": 1.1158865101271906e-06, "epoch": 3.5914760914760917, "percentage": 71.83, "elapsed_time": "0:18:57", "remaining_time": "0:07:26", "throughput": 1220.91, "total_tokens": 1388448} +{"current_steps": 3460, "total_steps": 4810, "loss": 0.0001, "lr": 1.1083414423346807e-06, "epoch": 3.5966735966735968, "percentage": 71.93, "elapsed_time": "0:18:57", "remaining_time": "0:07:23", "throughput": 1222.41, "total_tokens": 1390560} +{"current_steps": 3465, "total_steps": 4810, "loss": 0.0977, "lr": 1.100814697551406e-06, "epoch": 3.601871101871102, "percentage": 72.04, "elapsed_time": "0:18:57", "remaining_time": "0:07:21", "throughput": 1223.98, "total_tokens": 1392736} +{"current_steps": 3470, "total_steps": 4810, "loss": 0.1036, "lr": 1.0933063748768254e-06, "epoch": 3.607068607068607, "percentage": 72.14, "elapsed_time": "0:18:58", "remaining_time": "0:07:19", "throughput": 1225.37, "total_tokens": 1394720} +{"current_steps": 3475, "total_steps": 4810, "loss": 0.0001, "lr": 1.0858165731678514e-06, "epoch": 3.6122661122661124, "percentage": 72.25, "elapsed_time": "0:18:58", "remaining_time": "0:07:17", "throughput": 1226.71, "total_tokens": 1396640} +{"current_steps": 3480, "total_steps": 4810, "loss": 0.0528, "lr": 1.0783453910375423e-06, "epoch": 3.6174636174636174, "percentage": 72.35, "elapsed_time": "0:18:58", "remaining_time": "0:07:15", "throughput": 1228.22, "total_tokens": 1398752} +{"current_steps": 3485, "total_steps": 4810, "loss": 0.0787, "lr": 1.0708929268538034e-06, "epoch": 3.6226611226611225, "percentage": 72.45, "elapsed_time": "0:18:59", "remaining_time": "0:07:13", "throughput": 1229.66, "total_tokens": 1400800} +{"current_steps": 3490, "total_steps": 4810, "loss": 0.0007, "lr": 1.0634592787380964e-06, "epoch": 3.627858627858628, "percentage": 72.56, "elapsed_time": "0:18:59", "remaining_time": "0:07:10", "throughput": 1231.0, "total_tokens": 1402720} +{"current_steps": 3495, "total_steps": 4810, "loss": 0.0827, "lr": 1.0560445445641423e-06, "epoch": 3.633056133056133, "percentage": 72.66, "elapsed_time": "0:18:59", "remaining_time": "0:07:08", "throughput": 1232.39, "total_tokens": 1404704} +{"current_steps": 3500, "total_steps": 4810, "loss": 0.0002, "lr": 1.048648821956637e-06, "epoch": 3.638253638253638, "percentage": 72.77, "elapsed_time": "0:19:00", "remaining_time": "0:07:06", "throughput": 1233.66, "total_tokens": 1406560} +{"current_steps": 3505, "total_steps": 4810, "loss": 0.0586, "lr": 1.0412722082899647e-06, "epoch": 3.643451143451143, "percentage": 72.87, "elapsed_time": "0:19:00", "remaining_time": "0:07:04", "throughput": 1235.05, "total_tokens": 1408544} +{"current_steps": 3510, "total_steps": 4810, "loss": 0.0003, "lr": 1.033914800686912e-06, "epoch": 3.6486486486486487, "percentage": 72.97, "elapsed_time": "0:19:00", "remaining_time": "0:07:02", "throughput": 1236.38, "total_tokens": 1410464} +{"current_steps": 3515, "total_steps": 4810, "loss": 0.0001, "lr": 1.0265766960173964e-06, "epoch": 3.6538461538461537, "percentage": 73.08, "elapsed_time": "0:19:01", "remaining_time": "0:07:00", "throughput": 1237.76, "total_tokens": 1412448} +{"current_steps": 3520, "total_steps": 4810, "loss": 0.042, "lr": 1.019257990897185e-06, "epoch": 3.6590436590436592, "percentage": 73.18, "elapsed_time": "0:19:01", "remaining_time": "0:06:58", "throughput": 1239.37, "total_tokens": 1414688} +{"current_steps": 3525, "total_steps": 4810, "loss": 0.0036, "lr": 1.0119587816866258e-06, "epoch": 3.6642411642411643, "percentage": 73.28, "elapsed_time": "0:19:01", "remaining_time": "0:06:56", "throughput": 1240.76, "total_tokens": 1416672} +{"current_steps": 3530, "total_steps": 4810, "loss": 0.0002, "lr": 1.0046791644893757e-06, "epoch": 3.6694386694386694, "percentage": 73.39, "elapsed_time": "0:19:02", "remaining_time": "0:06:54", "throughput": 1242.09, "total_tokens": 1418592} +{"current_steps": 3535, "total_steps": 4810, "loss": 0.0004, "lr": 9.97419235151137e-07, "epoch": 3.6746361746361744, "percentage": 73.49, "elapsed_time": "0:19:02", "remaining_time": "0:06:52", "throughput": 1243.48, "total_tokens": 1420576} +{"current_steps": 3540, "total_steps": 4810, "loss": 0.0005, "lr": 9.901790892583973e-07, "epoch": 3.67983367983368, "percentage": 73.6, "elapsed_time": "0:19:02", "remaining_time": "0:06:49", "throughput": 1244.86, "total_tokens": 1422560} +{"current_steps": 3545, "total_steps": 4810, "loss": 0.0001, "lr": 9.829588221371694e-07, "epoch": 3.685031185031185, "percentage": 73.7, "elapsed_time": "0:19:03", "remaining_time": "0:06:47", "throughput": 1246.3, "total_tokens": 1424608} +{"current_steps": 3550, "total_steps": 4810, "loss": 0.0002, "lr": 9.757585288517329e-07, "epoch": 3.6902286902286905, "percentage": 73.8, "elapsed_time": "0:19:03", "remaining_time": "0:06:45", "throughput": 1247.85, "total_tokens": 1426784} +{"current_steps": 3555, "total_steps": 4810, "loss": 0.0507, "lr": 9.6857830420339e-07, "epoch": 3.6954261954261955, "percentage": 73.91, "elapsed_time": "0:19:03", "remaining_time": "0:06:43", "throughput": 1249.35, "total_tokens": 1428896} +{"current_steps": 3560, "total_steps": 4810, "loss": 0.0001, "lr": 9.614182427292076e-07, "epoch": 3.7006237006237006, "percentage": 74.01, "elapsed_time": "0:19:04", "remaining_time": "0:06:41", "throughput": 1250.73, "total_tokens": 1430880} +{"current_steps": 3565, "total_steps": 4810, "loss": 0.0706, "lr": 9.54278438700785e-07, "epoch": 3.7058212058212057, "percentage": 74.12, "elapsed_time": "0:19:04", "remaining_time": "0:06:39", "throughput": 1252.11, "total_tokens": 1432864} +{"current_steps": 3570, "total_steps": 4810, "loss": 0.0558, "lr": 9.471589861229999e-07, "epoch": 3.711018711018711, "percentage": 74.22, "elapsed_time": "0:19:04", "remaining_time": "0:06:37", "throughput": 1253.55, "total_tokens": 1434912} +{"current_steps": 3575, "total_steps": 4810, "loss": 0.0451, "lr": 9.400599787327774e-07, "epoch": 3.7162162162162162, "percentage": 74.32, "elapsed_time": "0:19:05", "remaining_time": "0:06:35", "throughput": 1254.87, "total_tokens": 1436832} +{"current_steps": 3580, "total_steps": 4810, "loss": 0.0456, "lr": 9.329815099978567e-07, "epoch": 3.7214137214137213, "percentage": 74.43, "elapsed_time": "0:19:05", "remaining_time": "0:06:33", "throughput": 1256.19, "total_tokens": 1438752} +{"current_steps": 3585, "total_steps": 4810, "loss": 0.0002, "lr": 9.259236731155583e-07, "epoch": 3.726611226611227, "percentage": 74.53, "elapsed_time": "0:19:05", "remaining_time": "0:06:31", "throughput": 1257.52, "total_tokens": 1440672} +{"current_steps": 3590, "total_steps": 4810, "loss": 0.0311, "lr": 9.188865610115572e-07, "epoch": 3.731808731808732, "percentage": 74.64, "elapsed_time": "0:19:05", "remaining_time": "0:06:29", "throughput": 1259.0, "total_tokens": 1442784} +{"current_steps": 3595, "total_steps": 4810, "loss": 0.0596, "lr": 9.118702663386583e-07, "epoch": 3.737006237006237, "percentage": 74.74, "elapsed_time": "0:19:06", "remaining_time": "0:06:27", "throughput": 1260.55, "total_tokens": 1444960} +{"current_steps": 3600, "total_steps": 4810, "loss": 0.0648, "lr": 9.048748814755783e-07, "epoch": 3.742203742203742, "percentage": 74.84, "elapsed_time": "0:19:06", "remaining_time": "0:06:25", "throughput": 1261.87, "total_tokens": 1446880} +{"current_steps": 3605, "total_steps": 4810, "loss": 0.0394, "lr": 8.979004985257294e-07, "epoch": 3.7474012474012475, "percentage": 74.95, "elapsed_time": "0:19:06", "remaining_time": "0:06:23", "throughput": 1263.35, "total_tokens": 1448992} +{"current_steps": 3610, "total_steps": 4810, "loss": 0.0295, "lr": 8.909472093160066e-07, "epoch": 3.7525987525987525, "percentage": 75.05, "elapsed_time": "0:19:07", "remaining_time": "0:06:21", "throughput": 1264.73, "total_tokens": 1450976} +{"current_steps": 3615, "total_steps": 4810, "loss": 0.0128, "lr": 8.840151053955773e-07, "epoch": 3.757796257796258, "percentage": 75.16, "elapsed_time": "0:19:07", "remaining_time": "0:06:19", "throughput": 1266.21, "total_tokens": 1453088} +{"current_steps": 3615, "total_steps": 4810, "eval_loss": 0.36968719959259033, "epoch": 3.757796257796258, "percentage": 75.16, "elapsed_time": "0:19:08", "remaining_time": "0:06:19", "throughput": 1264.85, "total_tokens": 1453088} +{"current_steps": 3620, "total_steps": 4810, "loss": 0.0014, "lr": 8.771042780346767e-07, "epoch": 3.762993762993763, "percentage": 75.26, "elapsed_time": "0:19:51", "remaining_time": "0:06:31", "throughput": 1221.67, "total_tokens": 1455136} +{"current_steps": 3625, "total_steps": 4810, "loss": 0.1087, "lr": 8.702148182234043e-07, "epoch": 3.768191268191268, "percentage": 75.36, "elapsed_time": "0:19:51", "remaining_time": "0:06:29", "throughput": 1223.01, "total_tokens": 1457120} +{"current_steps": 3630, "total_steps": 4810, "loss": 0.0001, "lr": 8.633468166705336e-07, "epoch": 3.773388773388773, "percentage": 75.47, "elapsed_time": "0:19:51", "remaining_time": "0:06:27", "throughput": 1224.39, "total_tokens": 1459168} +{"current_steps": 3635, "total_steps": 4810, "loss": 0.0061, "lr": 8.565003638023065e-07, "epoch": 3.7785862785862787, "percentage": 75.57, "elapsed_time": "0:19:52", "remaining_time": "0:06:25", "throughput": 1225.73, "total_tokens": 1461152} +{"current_steps": 3640, "total_steps": 4810, "loss": 0.0002, "lr": 8.496755497612491e-07, "epoch": 3.7837837837837838, "percentage": 75.68, "elapsed_time": "0:19:52", "remaining_time": "0:06:23", "throughput": 1227.06, "total_tokens": 1463136} +{"current_steps": 3645, "total_steps": 4810, "loss": 0.0001, "lr": 8.42872464404986e-07, "epoch": 3.788981288981289, "percentage": 75.78, "elapsed_time": "0:19:52", "remaining_time": "0:06:21", "throughput": 1228.39, "total_tokens": 1465120} +{"current_steps": 3650, "total_steps": 4810, "loss": 0.0322, "lr": 8.360911973050537e-07, "epoch": 3.7941787941787943, "percentage": 75.88, "elapsed_time": "0:19:53", "remaining_time": "0:06:19", "throughput": 1229.72, "total_tokens": 1467104} +{"current_steps": 3655, "total_steps": 4810, "loss": 0.0004, "lr": 8.29331837745724e-07, "epoch": 3.7993762993762994, "percentage": 75.99, "elapsed_time": "0:19:53", "remaining_time": "0:06:17", "throughput": 1231.1, "total_tokens": 1469152} +{"current_steps": 3660, "total_steps": 4810, "loss": 0.1215, "lr": 8.225944747228257e-07, "epoch": 3.8045738045738045, "percentage": 76.09, "elapsed_time": "0:19:53", "remaining_time": "0:06:15", "throughput": 1232.54, "total_tokens": 1471264} +{"current_steps": 3665, "total_steps": 4810, "loss": 0.0868, "lr": 8.158791969425739e-07, "epoch": 3.8097713097713095, "percentage": 76.2, "elapsed_time": "0:19:54", "remaining_time": "0:06:13", "throughput": 1233.86, "total_tokens": 1473248} +{"current_steps": 3670, "total_steps": 4810, "loss": 0.0009, "lr": 8.091860928204048e-07, "epoch": 3.814968814968815, "percentage": 76.3, "elapsed_time": "0:19:54", "remaining_time": "0:06:10", "throughput": 1235.3, "total_tokens": 1475360} +{"current_steps": 3675, "total_steps": 4810, "loss": 0.0001, "lr": 8.025152504798078e-07, "epoch": 3.82016632016632, "percentage": 76.4, "elapsed_time": "0:19:54", "remaining_time": "0:06:08", "throughput": 1236.73, "total_tokens": 1477472} +{"current_steps": 3680, "total_steps": 4810, "loss": 0.0912, "lr": 7.958667577511684e-07, "epoch": 3.8253638253638256, "percentage": 76.51, "elapsed_time": "0:19:54", "remaining_time": "0:06:06", "throughput": 1237.95, "total_tokens": 1479328} +{"current_steps": 3685, "total_steps": 4810, "loss": 0.0447, "lr": 7.892407021706064e-07, "epoch": 3.8305613305613306, "percentage": 76.61, "elapsed_time": "0:19:55", "remaining_time": "0:06:04", "throughput": 1239.22, "total_tokens": 1481248} +{"current_steps": 3690, "total_steps": 4810, "loss": 0.0001, "lr": 7.826371709788314e-07, "epoch": 3.8357588357588357, "percentage": 76.72, "elapsed_time": "0:19:55", "remaining_time": "0:06:02", "throughput": 1240.49, "total_tokens": 1483168} +{"current_steps": 3695, "total_steps": 4810, "loss": 0.0007, "lr": 7.760562511199881e-07, "epoch": 3.8409563409563408, "percentage": 76.82, "elapsed_time": "0:19:55", "remaining_time": "0:06:00", "throughput": 1241.81, "total_tokens": 1485152} +{"current_steps": 3700, "total_steps": 4810, "loss": 0.0407, "lr": 7.694980292405122e-07, "epoch": 3.8461538461538463, "percentage": 76.92, "elapsed_time": "0:19:56", "remaining_time": "0:05:58", "throughput": 1243.19, "total_tokens": 1487200} +{"current_steps": 3705, "total_steps": 4810, "loss": 0.0294, "lr": 7.629625916879932e-07, "epoch": 3.8513513513513513, "percentage": 77.03, "elapsed_time": "0:19:56", "remaining_time": "0:05:56", "throughput": 1244.51, "total_tokens": 1489184} +{"current_steps": 3710, "total_steps": 4810, "loss": 0.0046, "lr": 7.564500245100326e-07, "epoch": 3.856548856548857, "percentage": 77.13, "elapsed_time": "0:19:56", "remaining_time": "0:05:54", "throughput": 1245.83, "total_tokens": 1491168} +{"current_steps": 3715, "total_steps": 4810, "loss": 0.0001, "lr": 7.49960413453115e-07, "epoch": 3.861746361746362, "percentage": 77.23, "elapsed_time": "0:19:57", "remaining_time": "0:05:52", "throughput": 1247.21, "total_tokens": 1493216} +{"current_steps": 3720, "total_steps": 4810, "loss": 0.0738, "lr": 7.434938439614781e-07, "epoch": 3.866943866943867, "percentage": 77.34, "elapsed_time": "0:19:57", "remaining_time": "0:05:50", "throughput": 1248.53, "total_tokens": 1495200} +{"current_steps": 3725, "total_steps": 4810, "loss": 0.0047, "lr": 7.370504011759855e-07, "epoch": 3.872141372141372, "percentage": 77.44, "elapsed_time": "0:19:57", "remaining_time": "0:05:48", "throughput": 1249.85, "total_tokens": 1497184} +{"current_steps": 3730, "total_steps": 4810, "loss": 0.0633, "lr": 7.306301699330065e-07, "epoch": 3.8773388773388775, "percentage": 77.55, "elapsed_time": "0:19:58", "remaining_time": "0:05:46", "throughput": 1251.06, "total_tokens": 1499040} +{"current_steps": 3735, "total_steps": 4810, "loss": 0.0354, "lr": 7.242332347633052e-07, "epoch": 3.8825363825363826, "percentage": 77.65, "elapsed_time": "0:19:58", "remaining_time": "0:05:44", "throughput": 1252.38, "total_tokens": 1501024} +{"current_steps": 3740, "total_steps": 4810, "loss": 0.042, "lr": 7.17859679890916e-07, "epoch": 3.8877338877338876, "percentage": 77.75, "elapsed_time": "0:19:58", "remaining_time": "0:05:42", "throughput": 1253.74, "total_tokens": 1503072} +{"current_steps": 3745, "total_steps": 4810, "loss": 0.0002, "lr": 7.115095892320456e-07, "epoch": 3.892931392931393, "percentage": 77.86, "elapsed_time": "0:19:59", "remaining_time": "0:05:41", "throughput": 1255.22, "total_tokens": 1505248} +{"current_steps": 3750, "total_steps": 4810, "loss": 0.0084, "lr": 7.051830463939605e-07, "epoch": 3.898128898128898, "percentage": 77.96, "elapsed_time": "0:19:59", "remaining_time": "0:05:39", "throughput": 1256.59, "total_tokens": 1507296} +{"current_steps": 3755, "total_steps": 4810, "loss": 0.0226, "lr": 6.988801346738911e-07, "epoch": 3.9033264033264032, "percentage": 78.07, "elapsed_time": "0:19:59", "remaining_time": "0:05:37", "throughput": 1257.95, "total_tokens": 1509344} +{"current_steps": 3760, "total_steps": 4810, "loss": 0.0001, "lr": 6.926009370579334e-07, "epoch": 3.9085239085239083, "percentage": 78.17, "elapsed_time": "0:20:00", "remaining_time": "0:05:35", "throughput": 1259.37, "total_tokens": 1511456} +{"current_steps": 3765, "total_steps": 4810, "loss": 0.0235, "lr": 6.863455362199542e-07, "epoch": 3.913721413721414, "percentage": 78.27, "elapsed_time": "0:20:00", "remaining_time": "0:05:33", "throughput": 1260.69, "total_tokens": 1513440} +{"current_steps": 3770, "total_steps": 4810, "loss": 0.0001, "lr": 6.801140145205071e-07, "epoch": 3.918918918918919, "percentage": 78.38, "elapsed_time": "0:20:00", "remaining_time": "0:05:31", "throughput": 1262.05, "total_tokens": 1515488} +{"current_steps": 3775, "total_steps": 4810, "loss": 0.0065, "lr": 6.739064540057425e-07, "epoch": 3.9241164241164244, "percentage": 78.48, "elapsed_time": "0:20:01", "remaining_time": "0:05:29", "throughput": 1263.31, "total_tokens": 1517408} +{"current_steps": 3780, "total_steps": 4810, "loss": 0.0335, "lr": 6.677229364063329e-07, "epoch": 3.9293139293139294, "percentage": 78.59, "elapsed_time": "0:20:01", "remaining_time": "0:05:27", "throughput": 1264.62, "total_tokens": 1519392} +{"current_steps": 3785, "total_steps": 4810, "loss": 0.0001, "lr": 6.615635431363943e-07, "epoch": 3.9345114345114345, "percentage": 78.69, "elapsed_time": "0:20:01", "remaining_time": "0:05:25", "throughput": 1265.98, "total_tokens": 1521440} +{"current_steps": 3790, "total_steps": 4810, "loss": 0.0844, "lr": 6.554283552924118e-07, "epoch": 3.9397089397089395, "percentage": 78.79, "elapsed_time": "0:20:02", "remaining_time": "0:05:23", "throughput": 1267.35, "total_tokens": 1523488} +{"current_steps": 3795, "total_steps": 4810, "loss": 0.0001, "lr": 6.493174536521768e-07, "epoch": 3.944906444906445, "percentage": 78.9, "elapsed_time": "0:20:02", "remaining_time": "0:05:21", "throughput": 1268.76, "total_tokens": 1525600} +{"current_steps": 3800, "total_steps": 4810, "loss": 0.0715, "lr": 6.43230918673721e-07, "epoch": 3.95010395010395, "percentage": 79.0, "elapsed_time": "0:20:02", "remaining_time": "0:05:19", "throughput": 1270.07, "total_tokens": 1527584} +{"current_steps": 3805, "total_steps": 4810, "loss": 0.0002, "lr": 6.371688304942544e-07, "epoch": 3.955301455301455, "percentage": 79.11, "elapsed_time": "0:20:03", "remaining_time": "0:05:17", "throughput": 1271.32, "total_tokens": 1529504} +{"current_steps": 3810, "total_steps": 4810, "loss": 0.0805, "lr": 6.311312689291166e-07, "epoch": 3.9604989604989607, "percentage": 79.21, "elapsed_time": "0:20:03", "remaining_time": "0:05:15", "throughput": 1272.58, "total_tokens": 1531424} +{"current_steps": 3815, "total_steps": 4810, "loss": 0.0, "lr": 6.251183134707183e-07, "epoch": 3.9656964656964657, "percentage": 79.31, "elapsed_time": "0:20:03", "remaining_time": "0:05:13", "throughput": 1273.88, "total_tokens": 1533408} +{"current_steps": 3820, "total_steps": 4810, "loss": 0.1432, "lr": 6.191300432875017e-07, "epoch": 3.970893970893971, "percentage": 79.42, "elapsed_time": "0:20:04", "remaining_time": "0:05:12", "throughput": 1275.19, "total_tokens": 1535392} +{"current_steps": 3825, "total_steps": 4810, "loss": 0.0178, "lr": 6.13166537222894e-07, "epoch": 3.976091476091476, "percentage": 79.52, "elapsed_time": "0:20:04", "remaining_time": "0:05:10", "throughput": 1276.45, "total_tokens": 1537312} +{"current_steps": 3830, "total_steps": 4810, "loss": 0.0611, "lr": 6.072278737942691e-07, "epoch": 3.9812889812889813, "percentage": 79.63, "elapsed_time": "0:20:04", "remaining_time": "0:05:08", "throughput": 1277.8, "total_tokens": 1539360} +{"current_steps": 3835, "total_steps": 4810, "loss": 0.0019, "lr": 6.013141311919168e-07, "epoch": 3.9864864864864864, "percentage": 79.73, "elapsed_time": "0:20:05", "remaining_time": "0:05:06", "throughput": 1279.06, "total_tokens": 1541280} +{"current_steps": 3840, "total_steps": 4810, "loss": 0.0644, "lr": 5.954253872780102e-07, "epoch": 3.991683991683992, "percentage": 79.83, "elapsed_time": "0:20:05", "remaining_time": "0:05:04", "throughput": 1280.26, "total_tokens": 1543136} +{"current_steps": 3845, "total_steps": 4810, "loss": 0.1091, "lr": 5.895617195855827e-07, "epoch": 3.996881496881497, "percentage": 79.94, "elapsed_time": "0:20:05", "remaining_time": "0:05:02", "throughput": 1281.57, "total_tokens": 1545120} +{"current_steps": 3850, "total_steps": 4810, "loss": 0.0001, "lr": 5.837232053175065e-07, "epoch": 4.002079002079002, "percentage": 80.04, "elapsed_time": "0:20:06", "remaining_time": "0:05:00", "throughput": 1282.73, "total_tokens": 1547056} +{"current_steps": 3855, "total_steps": 4810, "loss": 0.0238, "lr": 5.77909921345475e-07, "epoch": 4.007276507276507, "percentage": 80.15, "elapsed_time": "0:20:06", "remaining_time": "0:04:58", "throughput": 1283.97, "total_tokens": 1548976} +{"current_steps": 3856, "total_steps": 4810, "eval_loss": 0.3716074526309967, "epoch": 4.008316008316008, "percentage": 80.17, "elapsed_time": "0:20:07", "remaining_time": "0:04:58", "throughput": 1283.1, "total_tokens": 1549360} +{"current_steps": 3860, "total_steps": 4810, "loss": 0.0133, "lr": 5.721219442089925e-07, "epoch": 4.012474012474012, "percentage": 80.25, "elapsed_time": "0:20:35", "remaining_time": "0:05:04", "throughput": 1255.6, "total_tokens": 1550960} +{"current_steps": 3865, "total_steps": 4810, "loss": 0.011, "lr": 5.663593501143663e-07, "epoch": 4.017671517671518, "percentage": 80.35, "elapsed_time": "0:20:35", "remaining_time": "0:05:02", "throughput": 1256.87, "total_tokens": 1552944} +{"current_steps": 3870, "total_steps": 4810, "loss": 0.0378, "lr": 5.606222149337004e-07, "epoch": 4.022869022869023, "percentage": 80.46, "elapsed_time": "0:20:35", "remaining_time": "0:05:00", "throughput": 1258.2, "total_tokens": 1554992} +{"current_steps": 3875, "total_steps": 4810, "loss": 0.0001, "lr": 5.549106142039018e-07, "epoch": 4.028066528066528, "percentage": 80.56, "elapsed_time": "0:20:36", "remaining_time": "0:04:58", "throughput": 1259.57, "total_tokens": 1557104} +{"current_steps": 3880, "total_steps": 4810, "loss": 0.0008, "lr": 5.492246231256798e-07, "epoch": 4.033264033264033, "percentage": 80.67, "elapsed_time": "0:20:36", "remaining_time": "0:04:56", "throughput": 1260.84, "total_tokens": 1559088} +{"current_steps": 3885, "total_steps": 4810, "loss": 0.0001, "lr": 5.435643165625615e-07, "epoch": 4.038461538461538, "percentage": 80.77, "elapsed_time": "0:20:36", "remaining_time": "0:04:54", "throughput": 1262.07, "total_tokens": 1561008} +{"current_steps": 3890, "total_steps": 4810, "loss": 0.0007, "lr": 5.379297690399035e-07, "epoch": 4.043659043659043, "percentage": 80.87, "elapsed_time": "0:20:37", "remaining_time": "0:04:52", "throughput": 1263.4, "total_tokens": 1563056} +{"current_steps": 3895, "total_steps": 4810, "loss": 0.0001, "lr": 5.323210547439089e-07, "epoch": 4.048856548856548, "percentage": 80.98, "elapsed_time": "0:20:37", "remaining_time": "0:04:50", "throughput": 1264.67, "total_tokens": 1565040} +{"current_steps": 3900, "total_steps": 4810, "loss": 0.0001, "lr": 5.267382475206548e-07, "epoch": 4.054054054054054, "percentage": 81.08, "elapsed_time": "0:20:37", "remaining_time": "0:04:48", "throughput": 1265.94, "total_tokens": 1567024} +{"current_steps": 3905, "total_steps": 4810, "loss": 0.0003, "lr": 5.21181420875117e-07, "epoch": 4.0592515592515594, "percentage": 81.19, "elapsed_time": "0:20:38", "remaining_time": "0:04:46", "throughput": 1267.31, "total_tokens": 1569136} +{"current_steps": 3910, "total_steps": 4810, "loss": 0.0001, "lr": 5.15650647970202e-07, "epoch": 4.0644490644490645, "percentage": 81.29, "elapsed_time": "0:20:38", "remaining_time": "0:04:45", "throughput": 1268.58, "total_tokens": 1571120} +{"current_steps": 3915, "total_steps": 4810, "loss": 0.0001, "lr": 5.101460016257858e-07, "epoch": 4.06964656964657, "percentage": 81.39, "elapsed_time": "0:20:38", "remaining_time": "0:04:43", "throughput": 1269.8, "total_tokens": 1573040} +{"current_steps": 3920, "total_steps": 4810, "loss": 0.0005, "lr": 5.046675543177531e-07, "epoch": 4.074844074844075, "percentage": 81.5, "elapsed_time": "0:20:39", "remaining_time": "0:04:41", "throughput": 1270.96, "total_tokens": 1574896} +{"current_steps": 3925, "total_steps": 4810, "loss": 0.0001, "lr": 4.992153781770448e-07, "epoch": 4.08004158004158, "percentage": 81.6, "elapsed_time": "0:20:39", "remaining_time": "0:04:39", "throughput": 1272.23, "total_tokens": 1576880} +{"current_steps": 3930, "total_steps": 4810, "loss": 0.0001, "lr": 4.937895449887076e-07, "epoch": 4.085239085239086, "percentage": 81.7, "elapsed_time": "0:20:39", "remaining_time": "0:04:37", "throughput": 1273.49, "total_tokens": 1578864} +{"current_steps": 3935, "total_steps": 4810, "loss": 0.0, "lr": 4.883901261909466e-07, "epoch": 4.090436590436591, "percentage": 81.81, "elapsed_time": "0:20:40", "remaining_time": "0:04:35", "throughput": 1274.76, "total_tokens": 1580848} +{"current_steps": 3940, "total_steps": 4810, "loss": 0.0001, "lr": 4.830171928741901e-07, "epoch": 4.095634095634096, "percentage": 81.91, "elapsed_time": "0:20:40", "remaining_time": "0:04:33", "throughput": 1275.92, "total_tokens": 1582704} +{"current_steps": 3945, "total_steps": 4810, "loss": 0.0008, "lr": 4.776708157801463e-07, "epoch": 4.100831600831601, "percentage": 82.02, "elapsed_time": "0:20:40", "remaining_time": "0:04:32", "throughput": 1277.29, "total_tokens": 1584816} +{"current_steps": 3950, "total_steps": 4810, "loss": 0.0387, "lr": 4.723510653008809e-07, "epoch": 4.106029106029106, "percentage": 82.12, "elapsed_time": "0:20:41", "remaining_time": "0:04:30", "throughput": 1278.56, "total_tokens": 1586800} +{"current_steps": 3955, "total_steps": 4810, "loss": 0.081, "lr": 4.6705801147788136e-07, "epoch": 4.111226611226611, "percentage": 82.22, "elapsed_time": "0:20:41", "remaining_time": "0:04:28", "throughput": 1279.77, "total_tokens": 1588720} +{"current_steps": 3960, "total_steps": 4810, "loss": 0.0001, "lr": 4.617917240011394e-07, "epoch": 4.116424116424117, "percentage": 82.33, "elapsed_time": "0:20:41", "remaining_time": "0:04:26", "throughput": 1280.93, "total_tokens": 1590576} +{"current_steps": 3965, "total_steps": 4810, "loss": 0.0001, "lr": 4.5655227220823355e-07, "epoch": 4.121621621621622, "percentage": 82.43, "elapsed_time": "0:20:42", "remaining_time": "0:04:24", "throughput": 1282.15, "total_tokens": 1592496} +{"current_steps": 3970, "total_steps": 4810, "loss": 0.0123, "lr": 4.513397250834159e-07, "epoch": 4.126819126819127, "percentage": 82.54, "elapsed_time": "0:20:42", "remaining_time": "0:04:22", "throughput": 1283.46, "total_tokens": 1594544} +{"current_steps": 3975, "total_steps": 4810, "loss": 0.0007, "lr": 4.461541512567011e-07, "epoch": 4.132016632016632, "percentage": 82.64, "elapsed_time": "0:20:42", "remaining_time": "0:04:21", "throughput": 1284.62, "total_tokens": 1596400} +{"current_steps": 3980, "total_steps": 4810, "loss": 0.0585, "lr": 4.409956190029674e-07, "epoch": 4.137214137214137, "percentage": 82.74, "elapsed_time": "0:20:43", "remaining_time": "0:04:19", "throughput": 1285.83, "total_tokens": 1598320} +{"current_steps": 3985, "total_steps": 4810, "loss": 0.0202, "lr": 4.358641962410537e-07, "epoch": 4.142411642411642, "percentage": 82.85, "elapsed_time": "0:20:43", "remaining_time": "0:04:17", "throughput": 1287.14, "total_tokens": 1600368} +{"current_steps": 3990, "total_steps": 4810, "loss": 0.0, "lr": 4.3075995053286716e-07, "epoch": 4.147609147609147, "percentage": 82.95, "elapsed_time": "0:20:43", "remaining_time": "0:04:15", "throughput": 1288.4, "total_tokens": 1602352} +{"current_steps": 3995, "total_steps": 4810, "loss": 0.0002, "lr": 4.2568294908249486e-07, "epoch": 4.152806652806653, "percentage": 83.06, "elapsed_time": "0:20:43", "remaining_time": "0:04:13", "throughput": 1289.66, "total_tokens": 1604336} +{"current_steps": 4000, "total_steps": 4810, "loss": 0.0, "lr": 4.2063325873531485e-07, "epoch": 4.158004158004158, "percentage": 83.16, "elapsed_time": "0:20:44", "remaining_time": "0:04:11", "throughput": 1290.87, "total_tokens": 1606256} +{"current_steps": 4005, "total_steps": 4810, "loss": 0.0001, "lr": 4.156109459771215e-07, "epoch": 4.163201663201663, "percentage": 83.26, "elapsed_time": "0:20:44", "remaining_time": "0:04:10", "throughput": 1292.18, "total_tokens": 1608304} +{"current_steps": 4010, "total_steps": 4810, "loss": 0.0001, "lr": 4.106160769332443e-07, "epoch": 4.168399168399168, "percentage": 83.37, "elapsed_time": "0:20:44", "remaining_time": "0:04:08", "throughput": 1293.59, "total_tokens": 1610480} +{"current_steps": 4015, "total_steps": 4810, "loss": 0.0382, "lr": 4.056487173676843e-07, "epoch": 4.173596673596673, "percentage": 83.47, "elapsed_time": "0:20:45", "remaining_time": "0:04:06", "throughput": 1294.89, "total_tokens": 1612528} +{"current_steps": 4020, "total_steps": 4810, "loss": 0.0001, "lr": 4.0070893268224055e-07, "epoch": 4.1787941787941785, "percentage": 83.58, "elapsed_time": "0:20:45", "remaining_time": "0:04:04", "throughput": 1296.2, "total_tokens": 1614576} +{"current_steps": 4025, "total_steps": 4810, "loss": 0.0, "lr": 3.9579678791565323e-07, "epoch": 4.183991683991684, "percentage": 83.68, "elapsed_time": "0:20:45", "remaining_time": "0:04:02", "throughput": 1297.51, "total_tokens": 1616624} +{"current_steps": 4030, "total_steps": 4810, "loss": 0.0378, "lr": 3.9091234774274873e-07, "epoch": 4.1891891891891895, "percentage": 83.78, "elapsed_time": "0:20:46", "remaining_time": "0:04:01", "throughput": 1298.81, "total_tokens": 1618672} +{"current_steps": 4035, "total_steps": 4810, "loss": 0.0029, "lr": 3.8605567647358426e-07, "epoch": 4.1943866943866945, "percentage": 83.89, "elapsed_time": "0:20:46", "remaining_time": "0:03:59", "throughput": 1300.17, "total_tokens": 1620784} +{"current_steps": 4040, "total_steps": 4810, "loss": 0.0002, "lr": 3.812268380526046e-07, "epoch": 4.1995841995842, "percentage": 83.99, "elapsed_time": "0:20:46", "remaining_time": "0:03:57", "throughput": 1301.42, "total_tokens": 1622768} +{"current_steps": 4045, "total_steps": 4810, "loss": 0.0001, "lr": 3.764258960577971e-07, "epoch": 4.204781704781705, "percentage": 84.1, "elapsed_time": "0:20:47", "remaining_time": "0:03:55", "throughput": 1302.62, "total_tokens": 1624688} +{"current_steps": 4050, "total_steps": 4810, "loss": 0.0004, "lr": 3.7165291369985616e-07, "epoch": 4.20997920997921, "percentage": 84.2, "elapsed_time": "0:20:47", "remaining_time": "0:03:54", "throughput": 1303.87, "total_tokens": 1626672} +{"current_steps": 4055, "total_steps": 4810, "loss": 0.0001, "lr": 3.6690795382135184e-07, "epoch": 4.215176715176715, "percentage": 84.3, "elapsed_time": "0:20:47", "remaining_time": "0:03:52", "throughput": 1305.28, "total_tokens": 1628848} +{"current_steps": 4060, "total_steps": 4810, "loss": 0.0001, "lr": 3.6219107889590154e-07, "epoch": 4.220374220374221, "percentage": 84.41, "elapsed_time": "0:20:48", "remaining_time": "0:03:50", "throughput": 1306.53, "total_tokens": 1630832} +{"current_steps": 4065, "total_steps": 4810, "loss": 0.0007, "lr": 3.575023510273462e-07, "epoch": 4.225571725571726, "percentage": 84.51, "elapsed_time": "0:20:48", "remaining_time": "0:03:48", "throughput": 1307.83, "total_tokens": 1632880} +{"current_steps": 4070, "total_steps": 4810, "loss": 0.0001, "lr": 3.528418319489349e-07, "epoch": 4.230769230769231, "percentage": 84.62, "elapsed_time": "0:20:48", "remaining_time": "0:03:47", "throughput": 1309.18, "total_tokens": 1634992} +{"current_steps": 4075, "total_steps": 4810, "loss": 0.0001, "lr": 3.48209583022511e-07, "epoch": 4.235966735966736, "percentage": 84.72, "elapsed_time": "0:20:49", "remaining_time": "0:03:45", "throughput": 1310.38, "total_tokens": 1636912} +{"current_steps": 4080, "total_steps": 4810, "loss": 0.0, "lr": 3.436056652377043e-07, "epoch": 4.241164241164241, "percentage": 84.82, "elapsed_time": "0:20:49", "remaining_time": "0:03:43", "throughput": 1311.58, "total_tokens": 1638832} +{"current_steps": 4085, "total_steps": 4810, "loss": 0.056, "lr": 3.3903013921112753e-07, "epoch": 4.246361746361746, "percentage": 84.93, "elapsed_time": "0:20:49", "remaining_time": "0:03:41", "throughput": 1313.02, "total_tokens": 1641072} +{"current_steps": 4090, "total_steps": 4810, "loss": 0.0001, "lr": 3.3448306518557795e-07, "epoch": 4.251559251559252, "percentage": 85.03, "elapsed_time": "0:20:50", "remaining_time": "0:03:40", "throughput": 1314.22, "total_tokens": 1642992} +{"current_steps": 4095, "total_steps": 4810, "loss": 0.0, "lr": 3.299645030292467e-07, "epoch": 4.256756756756757, "percentage": 85.14, "elapsed_time": "0:20:50", "remaining_time": "0:03:38", "throughput": 1315.52, "total_tokens": 1645040} +{"current_steps": 4097, "total_steps": 4810, "eval_loss": 0.4492134153842926, "epoch": 4.258835758835759, "percentage": 85.18, "elapsed_time": "0:20:51", "remaining_time": "0:03:37", "throughput": 1314.93, "total_tokens": 1645808} +{"current_steps": 4100, "total_steps": 4810, "loss": 0.0, "lr": 3.254745122349279e-07, "epoch": 4.261954261954262, "percentage": 85.24, "elapsed_time": "0:21:37", "remaining_time": "0:03:44", "throughput": 1269.52, "total_tokens": 1647024} +{"current_steps": 4105, "total_steps": 4810, "loss": 0.0001, "lr": 3.2101315191923667e-07, "epoch": 4.267151767151767, "percentage": 85.34, "elapsed_time": "0:21:37", "remaining_time": "0:03:42", "throughput": 1270.74, "total_tokens": 1649008} +{"current_steps": 4110, "total_steps": 4810, "loss": 0.0003, "lr": 3.1658048082182926e-07, "epoch": 4.272349272349272, "percentage": 85.45, "elapsed_time": "0:21:37", "remaining_time": "0:03:41", "throughput": 1272.0, "total_tokens": 1651056} +{"current_steps": 4115, "total_steps": 4810, "loss": 0.0001, "lr": 3.1217655730463094e-07, "epoch": 4.277546777546777, "percentage": 85.55, "elapsed_time": "0:21:38", "remaining_time": "0:03:39", "throughput": 1273.27, "total_tokens": 1653104} +{"current_steps": 4120, "total_steps": 4810, "loss": 0.0001, "lr": 3.078014393510695e-07, "epoch": 4.282744282744282, "percentage": 85.65, "elapsed_time": "0:21:38", "remaining_time": "0:03:37", "throughput": 1274.67, "total_tokens": 1655344} +{"current_steps": 4125, "total_steps": 4810, "loss": 0.042, "lr": 3.0345518456530666e-07, "epoch": 4.287941787941788, "percentage": 85.76, "elapsed_time": "0:21:38", "remaining_time": "0:03:35", "throughput": 1275.94, "total_tokens": 1657392} +{"current_steps": 4130, "total_steps": 4810, "loss": 0.0002, "lr": 2.9913785017148563e-07, "epoch": 4.293139293139293, "percentage": 85.86, "elapsed_time": "0:21:39", "remaining_time": "0:03:33", "throughput": 1277.1, "total_tokens": 1659312} +{"current_steps": 4135, "total_steps": 4810, "loss": 0.0557, "lr": 2.9484949301297166e-07, "epoch": 4.298336798336798, "percentage": 85.97, "elapsed_time": "0:21:39", "remaining_time": "0:03:32", "throughput": 1278.41, "total_tokens": 1661424} +{"current_steps": 4140, "total_steps": 4810, "loss": 0.0239, "lr": 2.905901695516092e-07, "epoch": 4.303534303534303, "percentage": 86.07, "elapsed_time": "0:21:39", "remaining_time": "0:03:30", "throughput": 1279.62, "total_tokens": 1663408} +{"current_steps": 4145, "total_steps": 4810, "loss": 0.0001, "lr": 2.8635993586697555e-07, "epoch": 4.3087318087318085, "percentage": 86.17, "elapsed_time": "0:21:40", "remaining_time": "0:03:28", "throughput": 1280.77, "total_tokens": 1665328} +{"current_steps": 4150, "total_steps": 4810, "loss": 0.0001, "lr": 2.8215884765564197e-07, "epoch": 4.313929313929314, "percentage": 86.28, "elapsed_time": "0:21:40", "remaining_time": "0:03:26", "throughput": 1281.98, "total_tokens": 1667312} +{"current_steps": 4155, "total_steps": 4810, "loss": 0.0003, "lr": 2.779869602304416e-07, "epoch": 4.3191268191268195, "percentage": 86.38, "elapsed_time": "0:21:40", "remaining_time": "0:03:25", "throughput": 1283.19, "total_tokens": 1669296} +{"current_steps": 4160, "total_steps": 4810, "loss": 0.0003, "lr": 2.73844328519742e-07, "epoch": 4.324324324324325, "percentage": 86.49, "elapsed_time": "0:21:41", "remaining_time": "0:03:23", "throughput": 1284.39, "total_tokens": 1671280} +{"current_steps": 4165, "total_steps": 4810, "loss": 0.0002, "lr": 2.6973100706672e-07, "epoch": 4.32952182952183, "percentage": 86.59, "elapsed_time": "0:21:41", "remaining_time": "0:03:21", "throughput": 1285.74, "total_tokens": 1673456} +{"current_steps": 4170, "total_steps": 4810, "loss": 0.0001, "lr": 2.656470500286451e-07, "epoch": 4.334719334719335, "percentage": 86.69, "elapsed_time": "0:21:41", "remaining_time": "0:03:19", "throughput": 1286.99, "total_tokens": 1675504} +{"current_steps": 4175, "total_steps": 4810, "loss": 0.0, "lr": 2.615925111761647e-07, "epoch": 4.33991683991684, "percentage": 86.8, "elapsed_time": "0:21:42", "remaining_time": "0:03:18", "throughput": 1288.19, "total_tokens": 1677488} +{"current_steps": 4180, "total_steps": 4810, "loss": 0.0633, "lr": 2.575674438925974e-07, "epoch": 4.345114345114345, "percentage": 86.9, "elapsed_time": "0:21:42", "remaining_time": "0:03:16", "throughput": 1289.44, "total_tokens": 1679536} +{"current_steps": 4185, "total_steps": 4810, "loss": 0.0875, "lr": 2.535719011732321e-07, "epoch": 4.350311850311851, "percentage": 87.01, "elapsed_time": "0:21:42", "remaining_time": "0:03:14", "throughput": 1290.65, "total_tokens": 1681520} +{"current_steps": 4190, "total_steps": 4810, "loss": 0.0372, "lr": 2.4960593562462496e-07, "epoch": 4.355509355509356, "percentage": 87.11, "elapsed_time": "0:21:43", "remaining_time": "0:03:12", "throughput": 1291.9, "total_tokens": 1683568} +{"current_steps": 4195, "total_steps": 4810, "loss": 0.0001, "lr": 2.4566959946391246e-07, "epoch": 4.360706860706861, "percentage": 87.21, "elapsed_time": "0:21:43", "remaining_time": "0:03:11", "throughput": 1293.05, "total_tokens": 1685488} +{"current_steps": 4200, "total_steps": 4810, "loss": 0.0341, "lr": 2.4176294451811936e-07, "epoch": 4.365904365904366, "percentage": 87.32, "elapsed_time": "0:21:43", "remaining_time": "0:03:09", "throughput": 1294.2, "total_tokens": 1687408} +{"current_steps": 4205, "total_steps": 4810, "loss": 0.0001, "lr": 2.378860222234794e-07, "epoch": 4.371101871101871, "percentage": 87.42, "elapsed_time": "0:21:44", "remaining_time": "0:03:07", "throughput": 1295.49, "total_tokens": 1689520} +{"current_steps": 4210, "total_steps": 4810, "loss": 0.0003, "lr": 2.3403888362475784e-07, "epoch": 4.376299376299376, "percentage": 87.53, "elapsed_time": "0:21:44", "remaining_time": "0:03:05", "throughput": 1296.74, "total_tokens": 1691568} +{"current_steps": 4215, "total_steps": 4810, "loss": 0.0, "lr": 2.3022157937457628e-07, "epoch": 4.381496881496881, "percentage": 87.63, "elapsed_time": "0:21:44", "remaining_time": "0:03:04", "throughput": 1297.99, "total_tokens": 1693616} +{"current_steps": 4220, "total_steps": 4810, "loss": 0.0001, "lr": 2.2643415973275017e-07, "epoch": 4.386694386694387, "percentage": 87.73, "elapsed_time": "0:21:45", "remaining_time": "0:03:02", "throughput": 1299.19, "total_tokens": 1695600} +{"current_steps": 4225, "total_steps": 4810, "loss": 0.0001, "lr": 2.226766745656231e-07, "epoch": 4.391891891891892, "percentage": 87.84, "elapsed_time": "0:21:45", "remaining_time": "0:03:00", "throughput": 1300.39, "total_tokens": 1697584} +{"current_steps": 4230, "total_steps": 4810, "loss": 0.0001, "lr": 2.1894917334541355e-07, "epoch": 4.397089397089397, "percentage": 87.94, "elapsed_time": "0:21:45", "remaining_time": "0:02:59", "throughput": 1301.59, "total_tokens": 1699568} +{"current_steps": 4235, "total_steps": 4810, "loss": 0.0017, "lr": 2.15251705149562e-07, "epoch": 4.402286902286902, "percentage": 88.05, "elapsed_time": "0:21:46", "remaining_time": "0:02:57", "throughput": 1302.93, "total_tokens": 1701744} +{"current_steps": 4240, "total_steps": 4810, "loss": 0.0, "lr": 2.11584318660083e-07, "epoch": 4.407484407484407, "percentage": 88.15, "elapsed_time": "0:21:46", "remaining_time": "0:02:55", "throughput": 1304.03, "total_tokens": 1703600} +{"current_steps": 4245, "total_steps": 4810, "loss": 0.0613, "lr": 2.0794706216292815e-07, "epoch": 4.412681912681912, "percentage": 88.25, "elapsed_time": "0:21:46", "remaining_time": "0:02:53", "throughput": 1305.32, "total_tokens": 1705712} +{"current_steps": 4250, "total_steps": 4810, "loss": 0.0001, "lr": 2.043399835473475e-07, "epoch": 4.417879417879418, "percentage": 88.36, "elapsed_time": "0:21:47", "remaining_time": "0:02:52", "throughput": 1306.52, "total_tokens": 1707696} +{"current_steps": 4255, "total_steps": 4810, "loss": 0.0012, "lr": 2.0076313030525845e-07, "epoch": 4.423076923076923, "percentage": 88.46, "elapsed_time": "0:21:47", "remaining_time": "0:02:50", "throughput": 1307.76, "total_tokens": 1709744} +{"current_steps": 4260, "total_steps": 4810, "loss": 0.0001, "lr": 1.9721654953062412e-07, "epoch": 4.428274428274428, "percentage": 88.57, "elapsed_time": "0:21:47", "remaining_time": "0:02:48", "throughput": 1309.01, "total_tokens": 1711792} +{"current_steps": 4265, "total_steps": 4810, "loss": 0.0002, "lr": 1.937002879188285e-07, "epoch": 4.4334719334719335, "percentage": 88.67, "elapsed_time": "0:21:48", "remaining_time": "0:02:47", "throughput": 1310.3, "total_tokens": 1713904} +{"current_steps": 4270, "total_steps": 4810, "loss": 0.0, "lr": 1.9021439176606565e-07, "epoch": 4.4386694386694385, "percentage": 88.77, "elapsed_time": "0:21:48", "remaining_time": "0:02:45", "throughput": 1311.44, "total_tokens": 1715824} +{"current_steps": 4275, "total_steps": 4810, "loss": 0.0001, "lr": 1.8675890696872838e-07, "epoch": 4.443866943866944, "percentage": 88.88, "elapsed_time": "0:21:48", "remaining_time": "0:02:43", "throughput": 1312.64, "total_tokens": 1717808} +{"current_steps": 4280, "total_steps": 4810, "loss": 0.0326, "lr": 1.8333387902280314e-07, "epoch": 4.4490644490644495, "percentage": 88.98, "elapsed_time": "0:21:48", "remaining_time": "0:02:42", "throughput": 1313.88, "total_tokens": 1719856} +{"current_steps": 4285, "total_steps": 4810, "loss": 0.0001, "lr": 1.799393530232729e-07, "epoch": 4.454261954261955, "percentage": 89.09, "elapsed_time": "0:21:49", "remaining_time": "0:02:40", "throughput": 1315.02, "total_tokens": 1721776} +{"current_steps": 4290, "total_steps": 4810, "loss": 0.0001, "lr": 1.765753736635234e-07, "epoch": 4.45945945945946, "percentage": 89.19, "elapsed_time": "0:21:49", "remaining_time": "0:02:38", "throughput": 1316.11, "total_tokens": 1723632} +{"current_steps": 4295, "total_steps": 4810, "loss": 0.0001, "lr": 1.7324198523475111e-07, "epoch": 4.464656964656965, "percentage": 89.29, "elapsed_time": "0:21:49", "remaining_time": "0:02:37", "throughput": 1317.2, "total_tokens": 1725488} +{"current_steps": 4300, "total_steps": 4810, "loss": 0.0001, "lr": 1.6993923162538562e-07, "epoch": 4.46985446985447, "percentage": 89.4, "elapsed_time": "0:21:50", "remaining_time": "0:02:35", "throughput": 1318.49, "total_tokens": 1727600} +{"current_steps": 4305, "total_steps": 4810, "loss": 0.0462, "lr": 1.666671563205069e-07, "epoch": 4.475051975051975, "percentage": 89.5, "elapsed_time": "0:21:50", "remaining_time": "0:02:33", "throughput": 1319.77, "total_tokens": 1729712} +{"current_steps": 4310, "total_steps": 4810, "loss": 0.0, "lr": 1.6342580240127582e-07, "epoch": 4.48024948024948, "percentage": 89.6, "elapsed_time": "0:21:50", "remaining_time": "0:02:32", "throughput": 1320.96, "total_tokens": 1731696} +{"current_steps": 4315, "total_steps": 4810, "loss": 0.0169, "lr": 1.6021521254436678e-07, "epoch": 4.485446985446986, "percentage": 89.71, "elapsed_time": "0:21:51", "remaining_time": "0:02:30", "throughput": 1322.2, "total_tokens": 1733744} +{"current_steps": 4320, "total_steps": 4810, "loss": 0.0001, "lr": 1.5703542902140296e-07, "epoch": 4.490644490644491, "percentage": 89.81, "elapsed_time": "0:21:51", "remaining_time": "0:02:28", "throughput": 1323.38, "total_tokens": 1735728} +{"current_steps": 4325, "total_steps": 4810, "loss": 0.0001, "lr": 1.538864936984036e-07, "epoch": 4.495841995841996, "percentage": 89.92, "elapsed_time": "0:21:51", "remaining_time": "0:02:27", "throughput": 1324.61, "total_tokens": 1737776} +{"current_steps": 4330, "total_steps": 4810, "loss": 0.0313, "lr": 1.507684480352292e-07, "epoch": 4.501039501039501, "percentage": 90.02, "elapsed_time": "0:21:52", "remaining_time": "0:02:25", "throughput": 1325.85, "total_tokens": 1739824} +{"current_steps": 4335, "total_steps": 4810, "loss": 0.0202, "lr": 1.476813330850388e-07, "epoch": 4.506237006237006, "percentage": 90.12, "elapsed_time": "0:21:52", "remaining_time": "0:02:23", "throughput": 1326.98, "total_tokens": 1741744} +{"current_steps": 4338, "total_steps": 4810, "eval_loss": 0.43684616684913635, "epoch": 4.509355509355509, "percentage": 90.19, "elapsed_time": "0:21:53", "remaining_time": "0:02:22", "throughput": 1326.69, "total_tokens": 1742960} +{"current_steps": 4340, "total_steps": 4810, "loss": 0.0002, "lr": 1.4462518949374838e-07, "epoch": 4.511434511434511, "percentage": 90.23, "elapsed_time": "0:22:16", "remaining_time": "0:02:24", "throughput": 1304.41, "total_tokens": 1743728} +{"current_steps": 4345, "total_steps": 4810, "loss": 0.0723, "lr": 1.4160005749949328e-07, "epoch": 4.516632016632016, "percentage": 90.33, "elapsed_time": "0:22:17", "remaining_time": "0:02:23", "throughput": 1305.72, "total_tokens": 1745904} +{"current_steps": 4350, "total_steps": 4810, "loss": 0.0001, "lr": 1.386059769321027e-07, "epoch": 4.521829521829522, "percentage": 90.44, "elapsed_time": "0:22:17", "remaining_time": "0:02:21", "throughput": 1306.84, "total_tokens": 1747824} +{"current_steps": 4355, "total_steps": 4810, "loss": 0.0002, "lr": 1.3564298721257223e-07, "epoch": 4.527027027027027, "percentage": 90.54, "elapsed_time": "0:22:17", "remaining_time": "0:02:19", "throughput": 1308.05, "total_tokens": 1749872} +{"current_steps": 4360, "total_steps": 4810, "loss": 0.0, "lr": 1.32711127352545e-07, "epoch": 4.532224532224532, "percentage": 90.64, "elapsed_time": "0:22:18", "remaining_time": "0:02:18", "throughput": 1309.17, "total_tokens": 1751792} +{"current_steps": 4365, "total_steps": 4810, "loss": 0.0001, "lr": 1.2981043595380048e-07, "epoch": 4.537422037422037, "percentage": 90.75, "elapsed_time": "0:22:18", "remaining_time": "0:02:16", "throughput": 1310.33, "total_tokens": 1753776} +{"current_steps": 4370, "total_steps": 4810, "loss": 0.0, "lr": 1.269409512077427e-07, "epoch": 4.542619542619542, "percentage": 90.85, "elapsed_time": "0:22:18", "remaining_time": "0:02:14", "throughput": 1311.54, "total_tokens": 1755824} +{"current_steps": 4375, "total_steps": 4810, "loss": 0.0001, "lr": 1.241027108949e-07, "epoch": 4.547817047817047, "percentage": 90.96, "elapsed_time": "0:22:19", "remaining_time": "0:02:13", "throughput": 1312.85, "total_tokens": 1758000} +{"current_steps": 4380, "total_steps": 4810, "loss": 0.0006, "lr": 1.2129575238442715e-07, "epoch": 4.553014553014553, "percentage": 91.06, "elapsed_time": "0:22:19", "remaining_time": "0:02:11", "throughput": 1314.01, "total_tokens": 1759984} +{"current_steps": 4385, "total_steps": 4810, "loss": 0.0002, "lr": 1.1852011263361218e-07, "epoch": 4.558212058212058, "percentage": 91.16, "elapsed_time": "0:22:19", "remaining_time": "0:02:09", "throughput": 1315.18, "total_tokens": 1761968} +{"current_steps": 4390, "total_steps": 4810, "loss": 0.0, "lr": 1.1577582818739136e-07, "epoch": 4.5634095634095635, "percentage": 91.27, "elapsed_time": "0:22:20", "remaining_time": "0:02:08", "throughput": 1316.39, "total_tokens": 1764016} +{"current_steps": 4395, "total_steps": 4810, "loss": 0.0046, "lr": 1.1306293517786615e-07, "epoch": 4.5686070686070686, "percentage": 91.37, "elapsed_time": "0:22:20", "remaining_time": "0:02:06", "throughput": 1317.5, "total_tokens": 1765936} +{"current_steps": 4400, "total_steps": 4810, "loss": 0.0002, "lr": 1.1038146932383003e-07, "epoch": 4.573804573804574, "percentage": 91.48, "elapsed_time": "0:22:20", "remaining_time": "0:02:04", "throughput": 1318.71, "total_tokens": 1767984} +{"current_steps": 4405, "total_steps": 4810, "loss": 0.0266, "lr": 1.0773146593029637e-07, "epoch": 4.579002079002079, "percentage": 91.58, "elapsed_time": "0:22:21", "remaining_time": "0:02:03", "throughput": 1319.82, "total_tokens": 1769904} +{"current_steps": 4410, "total_steps": 4810, "loss": 0.0001, "lr": 1.0511295988803293e-07, "epoch": 4.584199584199585, "percentage": 91.68, "elapsed_time": "0:22:21", "remaining_time": "0:02:01", "throughput": 1320.97, "total_tokens": 1771888} +{"current_steps": 4415, "total_steps": 4810, "loss": 0.0027, "lr": 1.0252598567310451e-07, "epoch": 4.58939708939709, "percentage": 91.79, "elapsed_time": "0:22:21", "remaining_time": "0:02:00", "throughput": 1322.18, "total_tokens": 1773936} +{"current_steps": 4420, "total_steps": 4810, "loss": 0.0, "lr": 9.997057734641852e-08, "epoch": 4.594594594594595, "percentage": 91.89, "elapsed_time": "0:22:22", "remaining_time": "0:01:58", "throughput": 1323.38, "total_tokens": 1775984} +{"current_steps": 4425, "total_steps": 4810, "loss": 0.0, "lr": 9.744676855327484e-08, "epoch": 4.5997920997921, "percentage": 92.0, "elapsed_time": "0:22:22", "remaining_time": "0:01:56", "throughput": 1324.45, "total_tokens": 1777840} +{"current_steps": 4430, "total_steps": 4810, "loss": 0.0267, "lr": 9.495459252292505e-08, "epoch": 4.604989604989605, "percentage": 92.1, "elapsed_time": "0:22:22", "remaining_time": "0:01:55", "throughput": 1325.61, "total_tokens": 1779824} +{"current_steps": 4435, "total_steps": 4810, "loss": 0.0723, "lr": 9.249408206813332e-08, "epoch": 4.61018711018711, "percentage": 92.2, "elapsed_time": "0:22:22", "remaining_time": "0:01:53", "throughput": 1326.82, "total_tokens": 1781872} +{"current_steps": 4440, "total_steps": 4810, "loss": 0.0001, "lr": 9.00652695847451e-08, "epoch": 4.615384615384615, "percentage": 92.31, "elapsed_time": "0:22:23", "remaining_time": "0:01:51", "throughput": 1328.08, "total_tokens": 1783984} +{"current_steps": 4445, "total_steps": 4810, "loss": 0.0, "lr": 8.766818705126134e-08, "epoch": 4.620582120582121, "percentage": 92.41, "elapsed_time": "0:22:23", "remaining_time": "0:01:50", "throughput": 1329.28, "total_tokens": 1786032} +{"current_steps": 4450, "total_steps": 4810, "loss": 0.0058, "lr": 8.530286602841525e-08, "epoch": 4.625779625779626, "percentage": 92.52, "elapsed_time": "0:22:23", "remaining_time": "0:01:48", "throughput": 1330.44, "total_tokens": 1788016} +{"current_steps": 4455, "total_steps": 4810, "loss": 0.0001, "lr": 8.296933765875898e-08, "epoch": 4.630977130977131, "percentage": 92.62, "elapsed_time": "0:22:24", "remaining_time": "0:01:47", "throughput": 1331.65, "total_tokens": 1790064} +{"current_steps": 4460, "total_steps": 4810, "loss": 0.0003, "lr": 8.066763266625283e-08, "epoch": 4.636174636174636, "percentage": 92.72, "elapsed_time": "0:22:24", "remaining_time": "0:01:45", "throughput": 1332.76, "total_tokens": 1791984} +{"current_steps": 4465, "total_steps": 4810, "loss": 0.0321, "lr": 7.839778135586007e-08, "epoch": 4.641372141372141, "percentage": 92.83, "elapsed_time": "0:22:24", "remaining_time": "0:01:43", "throughput": 1333.87, "total_tokens": 1793904} +{"current_steps": 4470, "total_steps": 4810, "loss": 0.0003, "lr": 7.61598136131489e-08, "epoch": 4.646569646569646, "percentage": 92.93, "elapsed_time": "0:22:25", "remaining_time": "0:01:42", "throughput": 1335.02, "total_tokens": 1795888} +{"current_steps": 4475, "total_steps": 4810, "loss": 0.028, "lr": 7.3953758903898e-08, "epoch": 4.651767151767151, "percentage": 93.04, "elapsed_time": "0:22:25", "remaining_time": "0:01:40", "throughput": 1336.18, "total_tokens": 1797872} +{"current_steps": 4480, "total_steps": 4810, "loss": 0.0007, "lr": 7.177964627370999e-08, "epoch": 4.656964656964657, "percentage": 93.14, "elapsed_time": "0:22:25", "remaining_time": "0:01:39", "throughput": 1337.38, "total_tokens": 1799920} +{"current_steps": 4485, "total_steps": 4810, "loss": 0.0001, "lr": 6.963750434762745e-08, "epoch": 4.662162162162162, "percentage": 93.24, "elapsed_time": "0:22:26", "remaining_time": "0:01:37", "throughput": 1338.44, "total_tokens": 1801776} +{"current_steps": 4490, "total_steps": 4810, "loss": 0.0157, "lr": 6.752736132975696e-08, "epoch": 4.667359667359667, "percentage": 93.35, "elapsed_time": "0:22:26", "remaining_time": "0:01:35", "throughput": 1339.65, "total_tokens": 1803824} +{"current_steps": 4495, "total_steps": 4810, "loss": 0.0562, "lr": 6.544924500289789e-08, "epoch": 4.672557172557172, "percentage": 93.45, "elapsed_time": "0:22:26", "remaining_time": "0:01:34", "throughput": 1340.75, "total_tokens": 1805744} +{"current_steps": 4500, "total_steps": 4810, "loss": 0.0, "lr": 6.340318272817476e-08, "epoch": 4.6777546777546775, "percentage": 93.56, "elapsed_time": "0:22:27", "remaining_time": "0:01:32", "throughput": 1341.91, "total_tokens": 1807728} +{"current_steps": 4505, "total_steps": 4810, "loss": 0.0329, "lr": 6.138920144468124e-08, "epoch": 4.682952182952183, "percentage": 93.66, "elapsed_time": "0:22:27", "remaining_time": "0:01:31", "throughput": 1343.06, "total_tokens": 1809712} +{"current_steps": 4510, "total_steps": 4810, "loss": 0.1284, "lr": 5.940732766912011e-08, "epoch": 4.6881496881496885, "percentage": 93.76, "elapsed_time": "0:22:27", "remaining_time": "0:01:29", "throughput": 1344.17, "total_tokens": 1811632} +{"current_steps": 4515, "total_steps": 4810, "loss": 0.0, "lr": 5.745758749545749e-08, "epoch": 4.6933471933471935, "percentage": 93.87, "elapsed_time": "0:22:28", "remaining_time": "0:01:28", "throughput": 1345.27, "total_tokens": 1813552} +{"current_steps": 4520, "total_steps": 4810, "loss": 0.0, "lr": 5.554000659457881e-08, "epoch": 4.698544698544699, "percentage": 93.97, "elapsed_time": "0:22:28", "remaining_time": "0:01:26", "throughput": 1346.52, "total_tokens": 1815664} +{"current_steps": 4525, "total_steps": 4810, "loss": 0.0056, "lr": 5.365461021395096e-08, "epoch": 4.703742203742204, "percentage": 94.07, "elapsed_time": "0:22:28", "remaining_time": "0:01:24", "throughput": 1347.67, "total_tokens": 1817648} +{"current_steps": 4530, "total_steps": 4810, "loss": 0.0226, "lr": 5.1801423177288146e-08, "epoch": 4.708939708939709, "percentage": 94.18, "elapsed_time": "0:22:29", "remaining_time": "0:01:23", "throughput": 1348.87, "total_tokens": 1819696} +{"current_steps": 4535, "total_steps": 4810, "loss": 0.0, "lr": 4.998046988422767e-08, "epoch": 4.714137214137214, "percentage": 94.28, "elapsed_time": "0:22:29", "remaining_time": "0:01:21", "throughput": 1350.02, "total_tokens": 1821680} +{"current_steps": 4540, "total_steps": 4810, "loss": 0.0, "lr": 4.8191774310006045e-08, "epoch": 4.71933471933472, "percentage": 94.39, "elapsed_time": "0:22:29", "remaining_time": "0:01:20", "throughput": 1351.21, "total_tokens": 1823728} +{"current_steps": 4545, "total_steps": 4810, "loss": 0.0006, "lr": 4.6435360005145647e-08, "epoch": 4.724532224532225, "percentage": 94.49, "elapsed_time": "0:22:30", "remaining_time": "0:01:18", "throughput": 1352.36, "total_tokens": 1825712} +{"current_steps": 4550, "total_steps": 4810, "loss": 0.0258, "lr": 4.471125009514326e-08, "epoch": 4.72972972972973, "percentage": 94.59, "elapsed_time": "0:22:30", "remaining_time": "0:01:17", "throughput": 1353.55, "total_tokens": 1827760} +{"current_steps": 4555, "total_steps": 4810, "loss": 0.0, "lr": 4.30194672801662e-08, "epoch": 4.734927234927235, "percentage": 94.7, "elapsed_time": "0:22:30", "remaining_time": "0:01:15", "throughput": 1354.65, "total_tokens": 1829680} +{"current_steps": 4560, "total_steps": 4810, "loss": 0.0002, "lr": 4.136003383475251e-08, "epoch": 4.74012474012474, "percentage": 94.8, "elapsed_time": "0:22:30", "remaining_time": "0:01:14", "throughput": 1355.85, "total_tokens": 1831728} +{"current_steps": 4565, "total_steps": 4810, "loss": 0.0001, "lr": 3.9732971607519264e-08, "epoch": 4.745322245322245, "percentage": 94.91, "elapsed_time": "0:22:31", "remaining_time": "0:01:12", "throughput": 1356.94, "total_tokens": 1833648} +{"current_steps": 4570, "total_steps": 4810, "loss": 0.0, "lr": 3.813830202087338e-08, "epoch": 4.75051975051975, "percentage": 95.01, "elapsed_time": "0:22:31", "remaining_time": "0:01:10", "throughput": 1358.14, "total_tokens": 1835696} +{"current_steps": 4575, "total_steps": 4810, "loss": 0.0001, "lr": 3.6576046070730676e-08, "epoch": 4.755717255717256, "percentage": 95.11, "elapsed_time": "0:22:31", "remaining_time": "0:01:09", "throughput": 1359.37, "total_tokens": 1837808} +{"current_steps": 4579, "total_steps": 4810, "eval_loss": 0.4380520284175873, "epoch": 4.75987525987526, "percentage": 95.2, "elapsed_time": "0:22:33", "remaining_time": "0:01:08", "throughput": 1359.23, "total_tokens": 1839344} +{"current_steps": 4580, "total_steps": 4810, "loss": 0.0003, "lr": 3.504622432623811e-08, "epoch": 4.760914760914761, "percentage": 95.22, "elapsed_time": "0:23:11", "remaining_time": "0:01:09", "throughput": 1321.78, "total_tokens": 1839728} +{"current_steps": 4585, "total_steps": 4810, "loss": 0.002, "lr": 3.354885692950505e-08, "epoch": 4.766112266112266, "percentage": 95.32, "elapsed_time": "0:23:12", "remaining_time": "0:01:08", "throughput": 1322.94, "total_tokens": 1841776} +{"current_steps": 4590, "total_steps": 4810, "loss": 0.0001, "lr": 3.208396359533572e-08, "epoch": 4.771309771309771, "percentage": 95.43, "elapsed_time": "0:23:12", "remaining_time": "0:01:06", "throughput": 1324.02, "total_tokens": 1843696} +{"current_steps": 4595, "total_steps": 4810, "loss": 0.0002, "lr": 3.065156361097138e-08, "epoch": 4.776507276507276, "percentage": 95.53, "elapsed_time": "0:23:12", "remaining_time": "0:01:05", "throughput": 1325.18, "total_tokens": 1845744} +{"current_steps": 4600, "total_steps": 4810, "loss": 0.0009, "lr": 2.925167583583577e-08, "epoch": 4.781704781704782, "percentage": 95.63, "elapsed_time": "0:23:13", "remaining_time": "0:01:03", "throughput": 1326.34, "total_tokens": 1847792} +{"current_steps": 4605, "total_steps": 4810, "loss": 0.0712, "lr": 2.7884318701285883e-08, "epoch": 4.786902286902287, "percentage": 95.74, "elapsed_time": "0:23:13", "remaining_time": "0:01:02", "throughput": 1327.45, "total_tokens": 1849776} +{"current_steps": 4610, "total_steps": 4810, "loss": 0.0, "lr": 2.654951021037161e-08, "epoch": 4.792099792099792, "percentage": 95.84, "elapsed_time": "0:23:13", "remaining_time": "0:01:00", "throughput": 1328.75, "total_tokens": 1852016} +{"current_steps": 4615, "total_steps": 4810, "loss": 0.0001, "lr": 2.524726793759591e-08, "epoch": 4.797297297297297, "percentage": 95.95, "elapsed_time": "0:23:14", "remaining_time": "0:00:58", "throughput": 1329.91, "total_tokens": 1854064} +{"current_steps": 4620, "total_steps": 4810, "loss": 0.0002, "lr": 2.3977609028686123e-08, "epoch": 4.802494802494802, "percentage": 96.05, "elapsed_time": "0:23:14", "remaining_time": "0:00:57", "throughput": 1331.07, "total_tokens": 1856112} +{"current_steps": 4625, "total_steps": 4810, "loss": 0.0, "lr": 2.2740550200365528e-08, "epoch": 4.8076923076923075, "percentage": 96.15, "elapsed_time": "0:23:14", "remaining_time": "0:00:55", "throughput": 1332.18, "total_tokens": 1858096} +{"current_steps": 4630, "total_steps": 4810, "loss": 0.0, "lr": 2.153610774013548e-08, "epoch": 4.8128898128898125, "percentage": 96.26, "elapsed_time": "0:23:15", "remaining_time": "0:00:54", "throughput": 1333.43, "total_tokens": 1860272} +{"current_steps": 4635, "total_steps": 4810, "loss": 0.0001, "lr": 2.0364297506060005e-08, "epoch": 4.8180873180873185, "percentage": 96.36, "elapsed_time": "0:23:15", "remaining_time": "0:00:52", "throughput": 1334.55, "total_tokens": 1862256} +{"current_steps": 4640, "total_steps": 4810, "loss": 0.0, "lr": 1.922513492655653e-08, "epoch": 4.8232848232848236, "percentage": 96.47, "elapsed_time": "0:23:15", "remaining_time": "0:00:51", "throughput": 1335.71, "total_tokens": 1864304} +{"current_steps": 4645, "total_steps": 4810, "loss": 0.0, "lr": 1.8118635000194395e-08, "epoch": 4.828482328482329, "percentage": 96.57, "elapsed_time": "0:23:16", "remaining_time": "0:00:49", "throughput": 1336.77, "total_tokens": 1866224} +{"current_steps": 4650, "total_steps": 4810, "loss": 0.0002, "lr": 1.704481229549526e-08, "epoch": 4.833679833679834, "percentage": 96.67, "elapsed_time": "0:23:16", "remaining_time": "0:00:48", "throughput": 1337.97, "total_tokens": 1868336} +{"current_steps": 4655, "total_steps": 4810, "loss": 0.0001, "lr": 1.6003680950742728e-08, "epoch": 4.838877338877339, "percentage": 96.78, "elapsed_time": "0:23:16", "remaining_time": "0:00:46", "throughput": 1339.18, "total_tokens": 1870448} +{"current_steps": 4660, "total_steps": 4810, "loss": 0.0076, "lr": 1.499525467379581e-08, "epoch": 4.844074844074844, "percentage": 96.88, "elapsed_time": "0:23:17", "remaining_time": "0:00:44", "throughput": 1340.24, "total_tokens": 1872368} +{"current_steps": 4665, "total_steps": 4810, "loss": 0.0001, "lr": 1.4019546741908252e-08, "epoch": 4.849272349272349, "percentage": 96.99, "elapsed_time": "0:23:17", "remaining_time": "0:00:43", "throughput": 1341.44, "total_tokens": 1874480} +{"current_steps": 4670, "total_steps": 4810, "loss": 0.0214, "lr": 1.3076570001553934e-08, "epoch": 4.854469854469855, "percentage": 97.09, "elapsed_time": "0:23:17", "remaining_time": "0:00:41", "throughput": 1342.55, "total_tokens": 1876464} +{"current_steps": 4675, "total_steps": 4810, "loss": 0.0, "lr": 1.216633686825841e-08, "epoch": 4.85966735966736, "percentage": 97.19, "elapsed_time": "0:23:18", "remaining_time": "0:00:40", "throughput": 1343.66, "total_tokens": 1878448} +{"current_steps": 4680, "total_steps": 4810, "loss": 0.0426, "lr": 1.1288859326433477e-08, "epoch": 4.864864864864865, "percentage": 97.3, "elapsed_time": "0:23:18", "remaining_time": "0:00:38", "throughput": 1344.77, "total_tokens": 1880432} +{"current_steps": 4685, "total_steps": 4810, "loss": 0.0598, "lr": 1.0444148929221466e-08, "epoch": 4.87006237006237, "percentage": 97.4, "elapsed_time": "0:23:18", "remaining_time": "0:00:37", "throughput": 1345.96, "total_tokens": 1882544} +{"current_steps": 4690, "total_steps": 4810, "loss": 0.0001, "lr": 9.632216798342032e-09, "epoch": 4.875259875259875, "percentage": 97.51, "elapsed_time": "0:23:18", "remaining_time": "0:00:35", "throughput": 1347.07, "total_tokens": 1884528} +{"current_steps": 4695, "total_steps": 4810, "loss": 0.0, "lr": 8.853073623946163e-09, "epoch": 4.88045738045738, "percentage": 97.61, "elapsed_time": "0:23:19", "remaining_time": "0:00:34", "throughput": 1348.27, "total_tokens": 1886640} +{"current_steps": 4700, "total_steps": 4810, "loss": 0.0369, "lr": 8.106729664475178e-09, "epoch": 4.885654885654886, "percentage": 97.71, "elapsed_time": "0:23:19", "remaining_time": "0:00:32", "throughput": 1349.42, "total_tokens": 1888688} +{"current_steps": 4705, "total_steps": 4810, "loss": 0.0001, "lr": 7.3931947465252786e-09, "epoch": 4.890852390852391, "percentage": 97.82, "elapsed_time": "0:23:19", "remaining_time": "0:00:31", "throughput": 1350.57, "total_tokens": 1890736} +{"current_steps": 4710, "total_steps": 4810, "loss": 0.0, "lr": 6.7124782647196015e-09, "epoch": 4.896049896049896, "percentage": 97.92, "elapsed_time": "0:23:20", "remaining_time": "0:00:29", "throughput": 1351.67, "total_tokens": 1892720} +{"current_steps": 4715, "total_steps": 4810, "loss": 0.0, "lr": 6.064589181582481e-09, "epoch": 4.901247401247401, "percentage": 98.02, "elapsed_time": "0:23:20", "remaining_time": "0:00:28", "throughput": 1352.77, "total_tokens": 1894704} +{"current_steps": 4720, "total_steps": 4810, "loss": 0.0287, "lr": 5.4495360274231526e-09, "epoch": 4.906444906444906, "percentage": 98.13, "elapsed_time": "0:23:20", "remaining_time": "0:00:26", "throughput": 1353.83, "total_tokens": 1896624} +{"current_steps": 4725, "total_steps": 4810, "loss": 0.0307, "lr": 4.867326900223068e-09, "epoch": 4.911642411642411, "percentage": 98.23, "elapsed_time": "0:23:21", "remaining_time": "0:00:25", "throughput": 1354.89, "total_tokens": 1898544} +{"current_steps": 4730, "total_steps": 4810, "loss": 0.0353, "lr": 4.317969465527927e-09, "epoch": 4.916839916839917, "percentage": 98.34, "elapsed_time": "0:23:21", "remaining_time": "0:00:23", "throughput": 1356.04, "total_tokens": 1900592} +{"current_steps": 4735, "total_steps": 4810, "loss": 0.0287, "lr": 3.801470956348863e-09, "epoch": 4.922037422037422, "percentage": 98.44, "elapsed_time": "0:23:21", "remaining_time": "0:00:22", "throughput": 1357.14, "total_tokens": 1902576} +{"current_steps": 4740, "total_steps": 4810, "loss": 0.0001, "lr": 3.3178381730661345e-09, "epoch": 4.927234927234927, "percentage": 98.54, "elapsed_time": "0:23:22", "remaining_time": "0:00:20", "throughput": 1358.29, "total_tokens": 1904624} +{"current_steps": 4745, "total_steps": 4810, "loss": 0.0, "lr": 2.8670774833386427e-09, "epoch": 4.9324324324324325, "percentage": 98.65, "elapsed_time": "0:23:22", "remaining_time": "0:00:19", "throughput": 1359.48, "total_tokens": 1906736} +{"current_steps": 4750, "total_steps": 4810, "loss": 0.0, "lr": 2.449194822022327e-09, "epoch": 4.9376299376299375, "percentage": 98.75, "elapsed_time": "0:23:22", "remaining_time": "0:00:17", "throughput": 1360.49, "total_tokens": 1908592} +{"current_steps": 4755, "total_steps": 4810, "loss": 0.0006, "lr": 2.064195691089954e-09, "epoch": 4.942827442827443, "percentage": 98.86, "elapsed_time": "0:23:23", "remaining_time": "0:00:16", "throughput": 1361.59, "total_tokens": 1910576} +{"current_steps": 4760, "total_steps": 4810, "loss": 0.0, "lr": 1.7120851595597842e-09, "epoch": 4.948024948024948, "percentage": 98.96, "elapsed_time": "0:23:23", "remaining_time": "0:00:14", "throughput": 1362.74, "total_tokens": 1912624} +{"current_steps": 4765, "total_steps": 4810, "loss": 0.0283, "lr": 1.3928678634289595e-09, "epoch": 4.953222453222454, "percentage": 99.06, "elapsed_time": "0:23:23", "remaining_time": "0:00:13", "throughput": 1363.84, "total_tokens": 1914608} +{"current_steps": 4770, "total_steps": 4810, "loss": 0.0004, "lr": 1.1065480056110521e-09, "epoch": 4.958419958419959, "percentage": 99.17, "elapsed_time": "0:23:24", "remaining_time": "0:00:11", "throughput": 1364.94, "total_tokens": 1916592} +{"current_steps": 4775, "total_steps": 4810, "loss": 0.0001, "lr": 8.531293558824983e-10, "epoch": 4.963617463617464, "percentage": 99.27, "elapsed_time": "0:23:24", "remaining_time": "0:00:10", "throughput": 1366.13, "total_tokens": 1918704} +{"current_steps": 4780, "total_steps": 4810, "loss": 0.0001, "lr": 6.326152508320804e-10, "epoch": 4.968814968814969, "percentage": 99.38, "elapsed_time": "0:23:24", "remaining_time": "0:00:08", "throughput": 1367.18, "total_tokens": 1920624} +{"current_steps": 4785, "total_steps": 4810, "loss": 0.0013, "lr": 4.450085938170756e-10, "epoch": 4.974012474012474, "percentage": 99.48, "elapsed_time": "0:23:25", "remaining_time": "0:00:07", "throughput": 1368.19, "total_tokens": 1922480} +{"current_steps": 4790, "total_steps": 4810, "loss": 0.0, "lr": 2.903118549252293e-10, "epoch": 4.979209979209979, "percentage": 99.58, "elapsed_time": "0:23:25", "remaining_time": "0:00:05", "throughput": 1369.28, "total_tokens": 1924464} +{"current_steps": 4795, "total_steps": 4810, "loss": 0.0001, "lr": 1.6852707094172637e-10, "epoch": 4.984407484407484, "percentage": 99.69, "elapsed_time": "0:23:25", "remaining_time": "0:00:04", "throughput": 1370.38, "total_tokens": 1926448} +{"current_steps": 4800, "total_steps": 4810, "loss": 0.0002, "lr": 7.965584532282356e-11, "epoch": 4.98960498960499, "percentage": 99.79, "elapsed_time": "0:23:26", "remaining_time": "0:00:02", "throughput": 1371.56, "total_tokens": 1928560} +{"current_steps": 4805, "total_steps": 4810, "loss": 0.0177, "lr": 2.3699348174754943e-11, "epoch": 4.994802494802495, "percentage": 99.9, "elapsed_time": "0:23:26", "remaining_time": "0:00:01", "throughput": 1372.65, "total_tokens": 1930544} +{"current_steps": 4810, "total_steps": 4810, "loss": 0.0001, "lr": 6.583162381890162e-13, "epoch": 5.0, "percentage": 100.0, "elapsed_time": "0:23:26", "remaining_time": "0:00:00", "throughput": 1373.76, "total_tokens": 1932608} +{"current_steps": 4810, "total_steps": 4810, "epoch": 5.0, "percentage": 100.0, "elapsed_time": "0:23:50", "remaining_time": "0:00:00", "throughput": 1351.3, "total_tokens": 1932608} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..7d79161 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,7911 @@ +{ + "best_global_step": 964, + "best_metric": 0.17627178132534027, + "best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_cola_42_1776331560/checkpoint-964", + "epoch": 5.0, + "eval_steps": 241, + "global_step": 4810, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005197505197505198, + "grad_norm": 440.98797607421875, + "learning_rate": 4.158004158004159e-08, + "loss": 1.2917, + "num_input_tokens_seen": 2048, + "step": 5 + }, + { + "epoch": 0.010395010395010396, + "grad_norm": 396.0676574707031, + "learning_rate": 9.355509355509357e-08, + "loss": 1.2491, + "num_input_tokens_seen": 4224, + "step": 10 + }, + { + "epoch": 0.015592515592515593, + "grad_norm": 480.88055419921875, + "learning_rate": 1.4553014553014554e-07, + "loss": 1.3117, + "num_input_tokens_seen": 6272, + "step": 15 + }, + { + "epoch": 0.02079002079002079, + "grad_norm": 408.1519775390625, + "learning_rate": 1.9750519750519752e-07, + "loss": 1.1366, + "num_input_tokens_seen": 8384, + "step": 20 + }, + { + "epoch": 0.02598752598752599, + "grad_norm": 239.34207153320312, + "learning_rate": 2.494802494802495e-07, + "loss": 0.7792, + "num_input_tokens_seen": 10496, + "step": 25 + }, + { + "epoch": 0.031185031185031187, + "grad_norm": 36.67585754394531, + "learning_rate": 3.014553014553015e-07, + "loss": 0.523, + "num_input_tokens_seen": 12544, + "step": 30 + }, + { + "epoch": 0.036382536382536385, + "grad_norm": 53.71092987060547, + "learning_rate": 3.534303534303535e-07, + "loss": 0.3194, + "num_input_tokens_seen": 14528, + "step": 35 + }, + { + "epoch": 0.04158004158004158, + "grad_norm": 72.29695129394531, + "learning_rate": 4.0540540540540546e-07, + "loss": 0.3482, + "num_input_tokens_seen": 16576, + "step": 40 + }, + { + "epoch": 0.04677754677754678, + "grad_norm": 158.2495880126953, + "learning_rate": 4.5738045738045745e-07, + "loss": 0.377, + "num_input_tokens_seen": 18560, + "step": 45 + }, + { + "epoch": 0.05197505197505198, + "grad_norm": 47.052833557128906, + "learning_rate": 5.093555093555094e-07, + "loss": 0.2695, + "num_input_tokens_seen": 20608, + "step": 50 + }, + { + "epoch": 0.057172557172557176, + "grad_norm": 29.388397216796875, + "learning_rate": 5.613305613305614e-07, + "loss": 0.2431, + "num_input_tokens_seen": 22656, + "step": 55 + }, + { + "epoch": 0.062370062370062374, + "grad_norm": 94.9759292602539, + "learning_rate": 6.133056133056134e-07, + "loss": 0.3162, + "num_input_tokens_seen": 24640, + "step": 60 + }, + { + "epoch": 0.06756756756756757, + "grad_norm": 108.12734985351562, + "learning_rate": 6.652806652806654e-07, + "loss": 0.3171, + "num_input_tokens_seen": 26752, + "step": 65 + }, + { + "epoch": 0.07276507276507277, + "grad_norm": 64.33253479003906, + "learning_rate": 7.172557172557173e-07, + "loss": 0.4943, + "num_input_tokens_seen": 28608, + "step": 70 + }, + { + "epoch": 0.07796257796257797, + "grad_norm": 85.49552154541016, + "learning_rate": 7.692307692307694e-07, + "loss": 0.3067, + "num_input_tokens_seen": 30912, + "step": 75 + }, + { + "epoch": 0.08316008316008316, + "grad_norm": 71.20551300048828, + "learning_rate": 8.212058212058213e-07, + "loss": 0.5316, + "num_input_tokens_seen": 32896, + "step": 80 + }, + { + "epoch": 0.08835758835758836, + "grad_norm": 8.865546226501465, + "learning_rate": 8.731808731808733e-07, + "loss": 0.3228, + "num_input_tokens_seen": 34816, + "step": 85 + }, + { + "epoch": 0.09355509355509356, + "grad_norm": 39.09188461303711, + "learning_rate": 9.251559251559253e-07, + "loss": 0.3339, + "num_input_tokens_seen": 36736, + "step": 90 + }, + { + "epoch": 0.09875259875259876, + "grad_norm": 8.73692512512207, + "learning_rate": 9.771309771309773e-07, + "loss": 0.2951, + "num_input_tokens_seen": 38720, + "step": 95 + }, + { + "epoch": 0.10395010395010396, + "grad_norm": 16.94306182861328, + "learning_rate": 1.0291060291060292e-06, + "loss": 0.2279, + "num_input_tokens_seen": 40640, + "step": 100 + }, + { + "epoch": 0.10914760914760915, + "grad_norm": 23.53331756591797, + "learning_rate": 1.0810810810810812e-06, + "loss": 0.2562, + "num_input_tokens_seen": 42688, + "step": 105 + }, + { + "epoch": 0.11434511434511435, + "grad_norm": 104.1171646118164, + "learning_rate": 1.1330561330561333e-06, + "loss": 0.2794, + "num_input_tokens_seen": 44544, + "step": 110 + }, + { + "epoch": 0.11954261954261955, + "grad_norm": 18.66437530517578, + "learning_rate": 1.1850311850311852e-06, + "loss": 0.2511, + "num_input_tokens_seen": 46400, + "step": 115 + }, + { + "epoch": 0.12474012474012475, + "grad_norm": 52.965763092041016, + "learning_rate": 1.2370062370062372e-06, + "loss": 0.2503, + "num_input_tokens_seen": 48448, + "step": 120 + }, + { + "epoch": 0.12993762993762994, + "grad_norm": 38.35150146484375, + "learning_rate": 1.288981288981289e-06, + "loss": 0.3088, + "num_input_tokens_seen": 50496, + "step": 125 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 67.04692840576172, + "learning_rate": 1.340956340956341e-06, + "loss": 0.2409, + "num_input_tokens_seen": 52416, + "step": 130 + }, + { + "epoch": 0.14033264033264034, + "grad_norm": 44.75727844238281, + "learning_rate": 1.3929313929313932e-06, + "loss": 0.2575, + "num_input_tokens_seen": 54464, + "step": 135 + }, + { + "epoch": 0.14553014553014554, + "grad_norm": 21.614540100097656, + "learning_rate": 1.4449064449064451e-06, + "loss": 0.2283, + "num_input_tokens_seen": 56448, + "step": 140 + }, + { + "epoch": 0.15072765072765074, + "grad_norm": 66.04798126220703, + "learning_rate": 1.496881496881497e-06, + "loss": 0.2004, + "num_input_tokens_seen": 58368, + "step": 145 + }, + { + "epoch": 0.15592515592515593, + "grad_norm": 141.60061645507812, + "learning_rate": 1.548856548856549e-06, + "loss": 0.3166, + "num_input_tokens_seen": 60544, + "step": 150 + }, + { + "epoch": 0.16112266112266113, + "grad_norm": 102.77252960205078, + "learning_rate": 1.6008316008316011e-06, + "loss": 0.3419, + "num_input_tokens_seen": 62592, + "step": 155 + }, + { + "epoch": 0.16632016632016633, + "grad_norm": 21.740901947021484, + "learning_rate": 1.652806652806653e-06, + "loss": 0.2605, + "num_input_tokens_seen": 64576, + "step": 160 + }, + { + "epoch": 0.17151767151767153, + "grad_norm": 58.935909271240234, + "learning_rate": 1.704781704781705e-06, + "loss": 0.2617, + "num_input_tokens_seen": 66688, + "step": 165 + }, + { + "epoch": 0.17671517671517672, + "grad_norm": 14.498135566711426, + "learning_rate": 1.756756756756757e-06, + "loss": 0.2505, + "num_input_tokens_seen": 68544, + "step": 170 + }, + { + "epoch": 0.18191268191268192, + "grad_norm": 55.772621154785156, + "learning_rate": 1.808731808731809e-06, + "loss": 0.2672, + "num_input_tokens_seen": 70592, + "step": 175 + }, + { + "epoch": 0.18711018711018712, + "grad_norm": 37.71332931518555, + "learning_rate": 1.860706860706861e-06, + "loss": 0.2488, + "num_input_tokens_seen": 72576, + "step": 180 + }, + { + "epoch": 0.19230769230769232, + "grad_norm": 14.720597267150879, + "learning_rate": 1.912681912681913e-06, + "loss": 0.1863, + "num_input_tokens_seen": 74624, + "step": 185 + }, + { + "epoch": 0.19750519750519752, + "grad_norm": 25.971519470214844, + "learning_rate": 1.964656964656965e-06, + "loss": 0.1639, + "num_input_tokens_seen": 76608, + "step": 190 + }, + { + "epoch": 0.20270270270270271, + "grad_norm": 135.57969665527344, + "learning_rate": 2.016632016632017e-06, + "loss": 0.3799, + "num_input_tokens_seen": 78720, + "step": 195 + }, + { + "epoch": 0.2079002079002079, + "grad_norm": 85.43441009521484, + "learning_rate": 2.0686070686070687e-06, + "loss": 0.455, + "num_input_tokens_seen": 81152, + "step": 200 + }, + { + "epoch": 0.2130977130977131, + "grad_norm": 21.497081756591797, + "learning_rate": 2.120582120582121e-06, + "loss": 0.2303, + "num_input_tokens_seen": 83200, + "step": 205 + }, + { + "epoch": 0.2182952182952183, + "grad_norm": 43.83966064453125, + "learning_rate": 2.172557172557173e-06, + "loss": 0.2985, + "num_input_tokens_seen": 85184, + "step": 210 + }, + { + "epoch": 0.2234927234927235, + "grad_norm": 21.950963973999023, + "learning_rate": 2.2245322245322247e-06, + "loss": 0.1462, + "num_input_tokens_seen": 87232, + "step": 215 + }, + { + "epoch": 0.2286902286902287, + "grad_norm": 16.27555274963379, + "learning_rate": 2.276507276507277e-06, + "loss": 0.2323, + "num_input_tokens_seen": 89152, + "step": 220 + }, + { + "epoch": 0.2338877338877339, + "grad_norm": 20.64867401123047, + "learning_rate": 2.3284823284823286e-06, + "loss": 0.3453, + "num_input_tokens_seen": 91328, + "step": 225 + }, + { + "epoch": 0.2390852390852391, + "grad_norm": 17.483060836791992, + "learning_rate": 2.3804573804573807e-06, + "loss": 0.1965, + "num_input_tokens_seen": 93312, + "step": 230 + }, + { + "epoch": 0.2442827442827443, + "grad_norm": 18.55425262451172, + "learning_rate": 2.432432432432433e-06, + "loss": 0.186, + "num_input_tokens_seen": 95296, + "step": 235 + }, + { + "epoch": 0.2494802494802495, + "grad_norm": 59.2343635559082, + "learning_rate": 2.4844074844074846e-06, + "loss": 0.2021, + "num_input_tokens_seen": 97216, + "step": 240 + }, + { + "epoch": 0.2505197505197505, + "eval_loss": 0.2780425250530243, + "eval_runtime": 1.0326, + "eval_samples_per_second": 829.001, + "eval_steps_per_second": 103.625, + "num_input_tokens_seen": 97664, + "step": 241 + }, + { + "epoch": 0.25467775467775466, + "grad_norm": 32.85165023803711, + "learning_rate": 2.5363825363825367e-06, + "loss": 0.2316, + "num_input_tokens_seen": 99264, + "step": 245 + }, + { + "epoch": 0.2598752598752599, + "grad_norm": 52.35283660888672, + "learning_rate": 2.5883575883575885e-06, + "loss": 0.2509, + "num_input_tokens_seen": 101184, + "step": 250 + }, + { + "epoch": 0.26507276507276506, + "grad_norm": 18.137977600097656, + "learning_rate": 2.6403326403326406e-06, + "loss": 0.1696, + "num_input_tokens_seen": 103296, + "step": 255 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 35.994441986083984, + "learning_rate": 2.6923076923076923e-06, + "loss": 0.3451, + "num_input_tokens_seen": 105344, + "step": 260 + }, + { + "epoch": 0.27546777546777546, + "grad_norm": 18.656810760498047, + "learning_rate": 2.7442827442827445e-06, + "loss": 0.1851, + "num_input_tokens_seen": 107392, + "step": 265 + }, + { + "epoch": 0.2806652806652807, + "grad_norm": 21.467487335205078, + "learning_rate": 2.796257796257796e-06, + "loss": 0.2177, + "num_input_tokens_seen": 109440, + "step": 270 + }, + { + "epoch": 0.28586278586278585, + "grad_norm": 61.47985076904297, + "learning_rate": 2.8482328482328488e-06, + "loss": 0.2767, + "num_input_tokens_seen": 111424, + "step": 275 + }, + { + "epoch": 0.2910602910602911, + "grad_norm": 38.212928771972656, + "learning_rate": 2.9002079002079005e-06, + "loss": 0.3802, + "num_input_tokens_seen": 113408, + "step": 280 + }, + { + "epoch": 0.29625779625779625, + "grad_norm": 29.204410552978516, + "learning_rate": 2.9521829521829526e-06, + "loss": 0.2173, + "num_input_tokens_seen": 115392, + "step": 285 + }, + { + "epoch": 0.30145530145530147, + "grad_norm": 38.67171859741211, + "learning_rate": 3.0041580041580043e-06, + "loss": 0.2155, + "num_input_tokens_seen": 117440, + "step": 290 + }, + { + "epoch": 0.30665280665280664, + "grad_norm": 17.80721664428711, + "learning_rate": 3.0561330561330565e-06, + "loss": 0.2116, + "num_input_tokens_seen": 119424, + "step": 295 + }, + { + "epoch": 0.31185031185031187, + "grad_norm": 30.9665470123291, + "learning_rate": 3.1081081081081082e-06, + "loss": 0.2158, + "num_input_tokens_seen": 121344, + "step": 300 + }, + { + "epoch": 0.31704781704781704, + "grad_norm": 24.32500648498535, + "learning_rate": 3.1600831600831604e-06, + "loss": 0.2048, + "num_input_tokens_seen": 123264, + "step": 305 + }, + { + "epoch": 0.32224532224532226, + "grad_norm": 24.350160598754883, + "learning_rate": 3.212058212058212e-06, + "loss": 0.2194, + "num_input_tokens_seen": 125184, + "step": 310 + }, + { + "epoch": 0.32744282744282743, + "grad_norm": 17.5281982421875, + "learning_rate": 3.2640332640332646e-06, + "loss": 0.1868, + "num_input_tokens_seen": 127296, + "step": 315 + }, + { + "epoch": 0.33264033264033266, + "grad_norm": 30.43378257751465, + "learning_rate": 3.3160083160083164e-06, + "loss": 0.217, + "num_input_tokens_seen": 129408, + "step": 320 + }, + { + "epoch": 0.33783783783783783, + "grad_norm": 7.501856327056885, + "learning_rate": 3.3679833679833685e-06, + "loss": 0.179, + "num_input_tokens_seen": 131520, + "step": 325 + }, + { + "epoch": 0.34303534303534305, + "grad_norm": 37.26128005981445, + "learning_rate": 3.4199584199584202e-06, + "loss": 0.2437, + "num_input_tokens_seen": 133568, + "step": 330 + }, + { + "epoch": 0.3482328482328482, + "grad_norm": 50.59195327758789, + "learning_rate": 3.4719334719334724e-06, + "loss": 0.1981, + "num_input_tokens_seen": 135616, + "step": 335 + }, + { + "epoch": 0.35343035343035345, + "grad_norm": 27.73749542236328, + "learning_rate": 3.523908523908524e-06, + "loss": 0.3892, + "num_input_tokens_seen": 137664, + "step": 340 + }, + { + "epoch": 0.3586278586278586, + "grad_norm": 25.262042999267578, + "learning_rate": 3.5758835758835762e-06, + "loss": 0.1327, + "num_input_tokens_seen": 139584, + "step": 345 + }, + { + "epoch": 0.36382536382536385, + "grad_norm": 33.850399017333984, + "learning_rate": 3.627858627858628e-06, + "loss": 0.2165, + "num_input_tokens_seen": 141504, + "step": 350 + }, + { + "epoch": 0.369022869022869, + "grad_norm": 40.24353790283203, + "learning_rate": 3.6798336798336805e-06, + "loss": 0.2907, + "num_input_tokens_seen": 143552, + "step": 355 + }, + { + "epoch": 0.37422037422037424, + "grad_norm": 26.471458435058594, + "learning_rate": 3.7318087318087322e-06, + "loss": 0.3012, + "num_input_tokens_seen": 145536, + "step": 360 + }, + { + "epoch": 0.3794178794178794, + "grad_norm": 24.082931518554688, + "learning_rate": 3.7837837837837844e-06, + "loss": 0.2293, + "num_input_tokens_seen": 147456, + "step": 365 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 17.22445297241211, + "learning_rate": 3.835758835758836e-06, + "loss": 0.1782, + "num_input_tokens_seen": 149440, + "step": 370 + }, + { + "epoch": 0.3898128898128898, + "grad_norm": 28.956668853759766, + "learning_rate": 3.887733887733889e-06, + "loss": 0.3696, + "num_input_tokens_seen": 151360, + "step": 375 + }, + { + "epoch": 0.39501039501039503, + "grad_norm": 33.45877456665039, + "learning_rate": 3.9397089397089396e-06, + "loss": 0.309, + "num_input_tokens_seen": 153344, + "step": 380 + }, + { + "epoch": 0.4002079002079002, + "grad_norm": 5.932255744934082, + "learning_rate": 3.991683991683992e-06, + "loss": 0.2413, + "num_input_tokens_seen": 155264, + "step": 385 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 21.102771759033203, + "learning_rate": 4.043659043659044e-06, + "loss": 0.3064, + "num_input_tokens_seen": 157248, + "step": 390 + }, + { + "epoch": 0.4106029106029106, + "grad_norm": 4.685667991638184, + "learning_rate": 4.095634095634096e-06, + "loss": 0.2798, + "num_input_tokens_seen": 159296, + "step": 395 + }, + { + "epoch": 0.4158004158004158, + "grad_norm": 17.0113525390625, + "learning_rate": 4.147609147609148e-06, + "loss": 0.3488, + "num_input_tokens_seen": 161344, + "step": 400 + }, + { + "epoch": 0.420997920997921, + "grad_norm": 6.919645309448242, + "learning_rate": 4.1995841995842e-06, + "loss": 0.2072, + "num_input_tokens_seen": 163328, + "step": 405 + }, + { + "epoch": 0.4261954261954262, + "grad_norm": 69.12760925292969, + "learning_rate": 4.2515592515592516e-06, + "loss": 0.1704, + "num_input_tokens_seen": 165312, + "step": 410 + }, + { + "epoch": 0.4313929313929314, + "grad_norm": 5.609197616577148, + "learning_rate": 4.303534303534304e-06, + "loss": 0.0573, + "num_input_tokens_seen": 167360, + "step": 415 + }, + { + "epoch": 0.4365904365904366, + "grad_norm": 99.32202911376953, + "learning_rate": 4.355509355509356e-06, + "loss": 0.9576, + "num_input_tokens_seen": 169344, + "step": 420 + }, + { + "epoch": 0.4417879417879418, + "grad_norm": 10.58239459991455, + "learning_rate": 4.4074844074844084e-06, + "loss": 0.3222, + "num_input_tokens_seen": 171456, + "step": 425 + }, + { + "epoch": 0.446985446985447, + "grad_norm": 20.188488006591797, + "learning_rate": 4.45945945945946e-06, + "loss": 0.3442, + "num_input_tokens_seen": 173568, + "step": 430 + }, + { + "epoch": 0.4521829521829522, + "grad_norm": 8.674958229064941, + "learning_rate": 4.511434511434512e-06, + "loss": 0.1851, + "num_input_tokens_seen": 175552, + "step": 435 + }, + { + "epoch": 0.4573804573804574, + "grad_norm": 22.08028793334961, + "learning_rate": 4.563409563409564e-06, + "loss": 0.2573, + "num_input_tokens_seen": 177536, + "step": 440 + }, + { + "epoch": 0.4625779625779626, + "grad_norm": 11.568997383117676, + "learning_rate": 4.615384615384616e-06, + "loss": 0.2972, + "num_input_tokens_seen": 179584, + "step": 445 + }, + { + "epoch": 0.4677754677754678, + "grad_norm": 6.849438190460205, + "learning_rate": 4.667359667359668e-06, + "loss": 0.2247, + "num_input_tokens_seen": 181568, + "step": 450 + }, + { + "epoch": 0.47297297297297297, + "grad_norm": 3.9055252075195312, + "learning_rate": 4.71933471933472e-06, + "loss": 0.2355, + "num_input_tokens_seen": 183552, + "step": 455 + }, + { + "epoch": 0.4781704781704782, + "grad_norm": 20.351293563842773, + "learning_rate": 4.771309771309771e-06, + "loss": 0.1821, + "num_input_tokens_seen": 185600, + "step": 460 + }, + { + "epoch": 0.48336798336798337, + "grad_norm": 21.34255599975586, + "learning_rate": 4.823284823284824e-06, + "loss": 0.1938, + "num_input_tokens_seen": 187584, + "step": 465 + }, + { + "epoch": 0.4885654885654886, + "grad_norm": 28.844085693359375, + "learning_rate": 4.875259875259876e-06, + "loss": 0.2747, + "num_input_tokens_seen": 189568, + "step": 470 + }, + { + "epoch": 0.49376299376299376, + "grad_norm": 14.666620254516602, + "learning_rate": 4.927234927234928e-06, + "loss": 0.2394, + "num_input_tokens_seen": 191680, + "step": 475 + }, + { + "epoch": 0.498960498960499, + "grad_norm": 23.078649520874023, + "learning_rate": 4.97920997920998e-06, + "loss": 0.2402, + "num_input_tokens_seen": 193728, + "step": 480 + }, + { + "epoch": 0.501039501039501, + "eval_loss": 0.20022711157798767, + "eval_runtime": 1.0474, + "eval_samples_per_second": 817.231, + "eval_steps_per_second": 102.154, + "num_input_tokens_seen": 194560, + "step": 482 + }, + { + "epoch": 0.5041580041580042, + "grad_norm": 49.961700439453125, + "learning_rate": 4.999994075155936e-06, + "loss": 0.1873, + "num_input_tokens_seen": 195776, + "step": 485 + }, + { + "epoch": 0.5093555093555093, + "grad_norm": 21.11049461364746, + "learning_rate": 4.999957867877242e-06, + "loss": 0.1905, + "num_input_tokens_seen": 197696, + "step": 490 + }, + { + "epoch": 0.5145530145530145, + "grad_norm": 40.248802185058594, + "learning_rate": 4.999888745376028e-06, + "loss": 0.1952, + "num_input_tokens_seen": 199680, + "step": 495 + }, + { + "epoch": 0.5197505197505198, + "grad_norm": 25.25174903869629, + "learning_rate": 4.999786708562382e-06, + "loss": 0.2149, + "num_input_tokens_seen": 201792, + "step": 500 + }, + { + "epoch": 0.524948024948025, + "grad_norm": 30.329490661621094, + "learning_rate": 4.999651758779753e-06, + "loss": 0.2066, + "num_input_tokens_seen": 203840, + "step": 505 + }, + { + "epoch": 0.5301455301455301, + "grad_norm": 23.636180877685547, + "learning_rate": 4.999483897804933e-06, + "loss": 0.2161, + "num_input_tokens_seen": 205824, + "step": 510 + }, + { + "epoch": 0.5353430353430353, + "grad_norm": 29.035306930541992, + "learning_rate": 4.999283127848029e-06, + "loss": 0.1777, + "num_input_tokens_seen": 207936, + "step": 515 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 21.316884994506836, + "learning_rate": 4.999049451552443e-06, + "loss": 0.1931, + "num_input_tokens_seen": 209984, + "step": 520 + }, + { + "epoch": 0.5457380457380457, + "grad_norm": 39.675086975097656, + "learning_rate": 4.998782871994828e-06, + "loss": 0.3235, + "num_input_tokens_seen": 212096, + "step": 525 + }, + { + "epoch": 0.5509355509355509, + "grad_norm": 20.291854858398438, + "learning_rate": 4.998483392685055e-06, + "loss": 0.2083, + "num_input_tokens_seen": 214080, + "step": 530 + }, + { + "epoch": 0.5561330561330561, + "grad_norm": 11.547039985656738, + "learning_rate": 4.9981510175661606e-06, + "loss": 0.2592, + "num_input_tokens_seen": 216128, + "step": 535 + }, + { + "epoch": 0.5613305613305614, + "grad_norm": 14.435676574707031, + "learning_rate": 4.9977857510143e-06, + "loss": 0.2199, + "num_input_tokens_seen": 218176, + "step": 540 + }, + { + "epoch": 0.5665280665280665, + "grad_norm": 11.747395515441895, + "learning_rate": 4.997387597838684e-06, + "loss": 0.1414, + "num_input_tokens_seen": 220096, + "step": 545 + }, + { + "epoch": 0.5717255717255717, + "grad_norm": 39.84230422973633, + "learning_rate": 4.996956563281524e-06, + "loss": 0.1874, + "num_input_tokens_seen": 222080, + "step": 550 + }, + { + "epoch": 0.5769230769230769, + "grad_norm": 41.40126419067383, + "learning_rate": 4.996492653017953e-06, + "loss": 0.2643, + "num_input_tokens_seen": 224000, + "step": 555 + }, + { + "epoch": 0.5821205821205822, + "grad_norm": 26.15458106994629, + "learning_rate": 4.995995873155958e-06, + "loss": 0.2975, + "num_input_tokens_seen": 225984, + "step": 560 + }, + { + "epoch": 0.5873180873180873, + "grad_norm": 17.151378631591797, + "learning_rate": 4.995466230236298e-06, + "loss": 0.1955, + "num_input_tokens_seen": 227840, + "step": 565 + }, + { + "epoch": 0.5925155925155925, + "grad_norm": 15.602777481079102, + "learning_rate": 4.994903731232415e-06, + "loss": 0.2476, + "num_input_tokens_seen": 229824, + "step": 570 + }, + { + "epoch": 0.5977130977130977, + "grad_norm": 6.400282859802246, + "learning_rate": 4.994308383550347e-06, + "loss": 0.213, + "num_input_tokens_seen": 231872, + "step": 575 + }, + { + "epoch": 0.6029106029106029, + "grad_norm": 21.05705451965332, + "learning_rate": 4.993680195028626e-06, + "loss": 0.2039, + "num_input_tokens_seen": 233920, + "step": 580 + }, + { + "epoch": 0.6081081081081081, + "grad_norm": 20.211143493652344, + "learning_rate": 4.993019173938178e-06, + "loss": 0.2036, + "num_input_tokens_seen": 235840, + "step": 585 + }, + { + "epoch": 0.6133056133056133, + "grad_norm": 7.714716911315918, + "learning_rate": 4.992325328982212e-06, + "loss": 0.2111, + "num_input_tokens_seen": 238016, + "step": 590 + }, + { + "epoch": 0.6185031185031185, + "grad_norm": 11.061738967895508, + "learning_rate": 4.991598669296105e-06, + "loss": 0.1706, + "num_input_tokens_seen": 240064, + "step": 595 + }, + { + "epoch": 0.6237006237006237, + "grad_norm": 37.05807113647461, + "learning_rate": 4.990839204447287e-06, + "loss": 0.2236, + "num_input_tokens_seen": 242048, + "step": 600 + }, + { + "epoch": 0.6288981288981289, + "grad_norm": 23.512836456298828, + "learning_rate": 4.990046944435105e-06, + "loss": 0.1908, + "num_input_tokens_seen": 243968, + "step": 605 + }, + { + "epoch": 0.6340956340956341, + "grad_norm": 26.345306396484375, + "learning_rate": 4.989221899690704e-06, + "loss": 0.2409, + "num_input_tokens_seen": 246016, + "step": 610 + }, + { + "epoch": 0.6392931392931392, + "grad_norm": 8.184889793395996, + "learning_rate": 4.988364081076877e-06, + "loss": 0.2135, + "num_input_tokens_seen": 248000, + "step": 615 + }, + { + "epoch": 0.6444906444906445, + "grad_norm": 8.331661224365234, + "learning_rate": 4.987473499887932e-06, + "loss": 0.203, + "num_input_tokens_seen": 250048, + "step": 620 + }, + { + "epoch": 0.6496881496881497, + "grad_norm": 19.97974967956543, + "learning_rate": 4.986550167849538e-06, + "loss": 0.1867, + "num_input_tokens_seen": 252096, + "step": 625 + }, + { + "epoch": 0.6548856548856549, + "grad_norm": 15.157999992370605, + "learning_rate": 4.9855940971185705e-06, + "loss": 0.1162, + "num_input_tokens_seen": 254144, + "step": 630 + }, + { + "epoch": 0.66008316008316, + "grad_norm": 9.33337116241455, + "learning_rate": 4.984605300282955e-06, + "loss": 0.2562, + "num_input_tokens_seen": 256128, + "step": 635 + }, + { + "epoch": 0.6652806652806653, + "grad_norm": 28.826885223388672, + "learning_rate": 4.983583790361497e-06, + "loss": 0.1389, + "num_input_tokens_seen": 258048, + "step": 640 + }, + { + "epoch": 0.6704781704781705, + "grad_norm": 48.085391998291016, + "learning_rate": 4.982529580803714e-06, + "loss": 0.3054, + "num_input_tokens_seen": 260352, + "step": 645 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 24.728063583374023, + "learning_rate": 4.981442685489659e-06, + "loss": 0.2884, + "num_input_tokens_seen": 262272, + "step": 650 + }, + { + "epoch": 0.6808731808731808, + "grad_norm": 24.10839080810547, + "learning_rate": 4.9803231187297305e-06, + "loss": 0.1599, + "num_input_tokens_seen": 264320, + "step": 655 + }, + { + "epoch": 0.6860706860706861, + "grad_norm": 10.08352279663086, + "learning_rate": 4.979170895264494e-06, + "loss": 0.1946, + "num_input_tokens_seen": 266240, + "step": 660 + }, + { + "epoch": 0.6912681912681913, + "grad_norm": 17.471120834350586, + "learning_rate": 4.977986030264483e-06, + "loss": 0.2128, + "num_input_tokens_seen": 268224, + "step": 665 + }, + { + "epoch": 0.6964656964656964, + "grad_norm": 19.734243392944336, + "learning_rate": 4.9767685393299946e-06, + "loss": 0.2326, + "num_input_tokens_seen": 270272, + "step": 670 + }, + { + "epoch": 0.7016632016632016, + "grad_norm": 8.745848655700684, + "learning_rate": 4.975518438490897e-06, + "loss": 0.2276, + "num_input_tokens_seen": 272256, + "step": 675 + }, + { + "epoch": 0.7068607068607069, + "grad_norm": 24.683629989624023, + "learning_rate": 4.974235744206405e-06, + "loss": 0.1786, + "num_input_tokens_seen": 274240, + "step": 680 + }, + { + "epoch": 0.7120582120582121, + "grad_norm": 32.86091232299805, + "learning_rate": 4.972920473364869e-06, + "loss": 0.1923, + "num_input_tokens_seen": 276288, + "step": 685 + }, + { + "epoch": 0.7172557172557172, + "grad_norm": 13.548423767089844, + "learning_rate": 4.971572643283557e-06, + "loss": 0.1661, + "num_input_tokens_seen": 278272, + "step": 690 + }, + { + "epoch": 0.7224532224532224, + "grad_norm": 31.974199295043945, + "learning_rate": 4.970192271708416e-06, + "loss": 0.1867, + "num_input_tokens_seen": 280384, + "step": 695 + }, + { + "epoch": 0.7276507276507277, + "grad_norm": 16.395275115966797, + "learning_rate": 4.968779376813849e-06, + "loss": 0.3333, + "num_input_tokens_seen": 282368, + "step": 700 + }, + { + "epoch": 0.7328482328482329, + "grad_norm": 12.498151779174805, + "learning_rate": 4.967333977202469e-06, + "loss": 0.1327, + "num_input_tokens_seen": 284416, + "step": 705 + }, + { + "epoch": 0.738045738045738, + "grad_norm": 70.73739624023438, + "learning_rate": 4.965856091904855e-06, + "loss": 0.2235, + "num_input_tokens_seen": 286464, + "step": 710 + }, + { + "epoch": 0.7432432432432432, + "grad_norm": 11.769681930541992, + "learning_rate": 4.964345740379307e-06, + "loss": 0.3413, + "num_input_tokens_seen": 288448, + "step": 715 + }, + { + "epoch": 0.7484407484407485, + "grad_norm": 9.002899169921875, + "learning_rate": 4.962802942511582e-06, + "loss": 0.1906, + "num_input_tokens_seen": 290496, + "step": 720 + }, + { + "epoch": 0.7515592515592515, + "eval_loss": 0.20943090319633484, + "eval_runtime": 1.0284, + "eval_samples_per_second": 832.355, + "eval_steps_per_second": 104.044, + "num_input_tokens_seen": 291712, + "step": 723 + }, + { + "epoch": 0.7536382536382537, + "grad_norm": 30.15701675415039, + "learning_rate": 4.961227718614634e-06, + "loss": 0.2576, + "num_input_tokens_seen": 292480, + "step": 725 + }, + { + "epoch": 0.7588357588357588, + "grad_norm": 16.669214248657227, + "learning_rate": 4.959620089428354e-06, + "loss": 0.2352, + "num_input_tokens_seen": 294464, + "step": 730 + }, + { + "epoch": 0.764033264033264, + "grad_norm": 24.175790786743164, + "learning_rate": 4.957980076119285e-06, + "loss": 0.2617, + "num_input_tokens_seen": 296448, + "step": 735 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 14.268982887268066, + "learning_rate": 4.956307700280354e-06, + "loss": 0.2079, + "num_input_tokens_seen": 298432, + "step": 740 + }, + { + "epoch": 0.7744282744282744, + "grad_norm": 6.003777980804443, + "learning_rate": 4.954602983930581e-06, + "loss": 0.2712, + "num_input_tokens_seen": 300480, + "step": 745 + }, + { + "epoch": 0.7796257796257796, + "grad_norm": 9.822731018066406, + "learning_rate": 4.95286594951479e-06, + "loss": 0.2211, + "num_input_tokens_seen": 302400, + "step": 750 + }, + { + "epoch": 0.7848232848232848, + "grad_norm": 13.10158920288086, + "learning_rate": 4.951096619903317e-06, + "loss": 0.2161, + "num_input_tokens_seen": 304320, + "step": 755 + }, + { + "epoch": 0.7900207900207901, + "grad_norm": 6.7825775146484375, + "learning_rate": 4.949295018391707e-06, + "loss": 0.1828, + "num_input_tokens_seen": 306240, + "step": 760 + }, + { + "epoch": 0.7952182952182952, + "grad_norm": 16.962614059448242, + "learning_rate": 4.9474611687004025e-06, + "loss": 0.2155, + "num_input_tokens_seen": 308032, + "step": 765 + }, + { + "epoch": 0.8004158004158004, + "grad_norm": 11.7578125, + "learning_rate": 4.945595094974442e-06, + "loss": 0.2009, + "num_input_tokens_seen": 309952, + "step": 770 + }, + { + "epoch": 0.8056133056133056, + "grad_norm": 11.759556770324707, + "learning_rate": 4.94369682178313e-06, + "loss": 0.1813, + "num_input_tokens_seen": 311936, + "step": 775 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 10.483379364013672, + "learning_rate": 4.941766374119724e-06, + "loss": 0.1603, + "num_input_tokens_seen": 313920, + "step": 780 + }, + { + "epoch": 0.816008316008316, + "grad_norm": 33.7660026550293, + "learning_rate": 4.939803777401096e-06, + "loss": 0.2613, + "num_input_tokens_seen": 315968, + "step": 785 + }, + { + "epoch": 0.8212058212058212, + "grad_norm": 12.025551795959473, + "learning_rate": 4.937809057467404e-06, + "loss": 0.2641, + "num_input_tokens_seen": 317952, + "step": 790 + }, + { + "epoch": 0.8264033264033264, + "grad_norm": 14.894133567810059, + "learning_rate": 4.935782240581753e-06, + "loss": 0.1934, + "num_input_tokens_seen": 319872, + "step": 795 + }, + { + "epoch": 0.8316008316008316, + "grad_norm": 7.2731733322143555, + "learning_rate": 4.933723353429842e-06, + "loss": 0.2498, + "num_input_tokens_seen": 321856, + "step": 800 + }, + { + "epoch": 0.8367983367983368, + "grad_norm": 7.900448799133301, + "learning_rate": 4.931632423119621e-06, + "loss": 0.1671, + "num_input_tokens_seen": 323968, + "step": 805 + }, + { + "epoch": 0.841995841995842, + "grad_norm": 13.05286693572998, + "learning_rate": 4.929509477180929e-06, + "loss": 0.2092, + "num_input_tokens_seen": 325952, + "step": 810 + }, + { + "epoch": 0.8471933471933472, + "grad_norm": 0.7964070439338684, + "learning_rate": 4.927354543565131e-06, + "loss": 0.0581, + "num_input_tokens_seen": 328000, + "step": 815 + }, + { + "epoch": 0.8523908523908524, + "grad_norm": 68.79032135009766, + "learning_rate": 4.925167650644752e-06, + "loss": 0.1592, + "num_input_tokens_seen": 329984, + "step": 820 + }, + { + "epoch": 0.8575883575883576, + "grad_norm": 15.014650344848633, + "learning_rate": 4.922948827213107e-06, + "loss": 0.4462, + "num_input_tokens_seen": 331904, + "step": 825 + }, + { + "epoch": 0.8627858627858628, + "grad_norm": 11.443034172058105, + "learning_rate": 4.920698102483913e-06, + "loss": 0.4518, + "num_input_tokens_seen": 333888, + "step": 830 + }, + { + "epoch": 0.867983367983368, + "grad_norm": 61.09624481201172, + "learning_rate": 4.9184155060909115e-06, + "loss": 0.2671, + "num_input_tokens_seen": 335872, + "step": 835 + }, + { + "epoch": 0.8731808731808732, + "grad_norm": 69.81952667236328, + "learning_rate": 4.916101068087477e-06, + "loss": 0.3681, + "num_input_tokens_seen": 337856, + "step": 840 + }, + { + "epoch": 0.8783783783783784, + "grad_norm": 26.779844284057617, + "learning_rate": 4.9137548189462185e-06, + "loss": 0.2011, + "num_input_tokens_seen": 339776, + "step": 845 + }, + { + "epoch": 0.8835758835758836, + "grad_norm": 14.02595043182373, + "learning_rate": 4.911376789558584e-06, + "loss": 0.1852, + "num_input_tokens_seen": 341760, + "step": 850 + }, + { + "epoch": 0.8887733887733887, + "grad_norm": 13.688316345214844, + "learning_rate": 4.908967011234446e-06, + "loss": 0.3553, + "num_input_tokens_seen": 343680, + "step": 855 + }, + { + "epoch": 0.893970893970894, + "grad_norm": 12.0100679397583, + "learning_rate": 4.9065255157016955e-06, + "loss": 0.2092, + "num_input_tokens_seen": 345600, + "step": 860 + }, + { + "epoch": 0.8991683991683992, + "grad_norm": 13.758508682250977, + "learning_rate": 4.904052335105822e-06, + "loss": 0.2165, + "num_input_tokens_seen": 347520, + "step": 865 + }, + { + "epoch": 0.9043659043659044, + "grad_norm": 21.069822311401367, + "learning_rate": 4.90154750200949e-06, + "loss": 0.1773, + "num_input_tokens_seen": 349568, + "step": 870 + }, + { + "epoch": 0.9095634095634095, + "grad_norm": 12.611119270324707, + "learning_rate": 4.899011049392111e-06, + "loss": 0.1146, + "num_input_tokens_seen": 351552, + "step": 875 + }, + { + "epoch": 0.9147609147609148, + "grad_norm": 10.34527587890625, + "learning_rate": 4.896443010649408e-06, + "loss": 0.1213, + "num_input_tokens_seen": 353472, + "step": 880 + }, + { + "epoch": 0.91995841995842, + "grad_norm": 7.383549690246582, + "learning_rate": 4.893843419592977e-06, + "loss": 0.123, + "num_input_tokens_seen": 355392, + "step": 885 + }, + { + "epoch": 0.9251559251559252, + "grad_norm": 36.267250061035156, + "learning_rate": 4.891212310449845e-06, + "loss": 0.1794, + "num_input_tokens_seen": 357440, + "step": 890 + }, + { + "epoch": 0.9303534303534303, + "grad_norm": 21.83590316772461, + "learning_rate": 4.88854971786201e-06, + "loss": 0.1822, + "num_input_tokens_seen": 359488, + "step": 895 + }, + { + "epoch": 0.9355509355509356, + "grad_norm": 74.82781982421875, + "learning_rate": 4.885855676885995e-06, + "loss": 0.282, + "num_input_tokens_seen": 361408, + "step": 900 + }, + { + "epoch": 0.9407484407484408, + "grad_norm": 27.140975952148438, + "learning_rate": 4.88313022299238e-06, + "loss": 0.1931, + "num_input_tokens_seen": 363392, + "step": 905 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 45.93625259399414, + "learning_rate": 4.880373392065339e-06, + "loss": 0.318, + "num_input_tokens_seen": 365440, + "step": 910 + }, + { + "epoch": 0.9511434511434511, + "grad_norm": 22.00739860534668, + "learning_rate": 4.877585220402167e-06, + "loss": 0.1793, + "num_input_tokens_seen": 367616, + "step": 915 + }, + { + "epoch": 0.9563409563409564, + "grad_norm": 20.41562843322754, + "learning_rate": 4.874765744712796e-06, + "loss": 0.1164, + "num_input_tokens_seen": 369600, + "step": 920 + }, + { + "epoch": 0.9615384615384616, + "grad_norm": 30.4830379486084, + "learning_rate": 4.8719150021193206e-06, + "loss": 0.2515, + "num_input_tokens_seen": 371520, + "step": 925 + }, + { + "epoch": 0.9667359667359667, + "grad_norm": 19.38152503967285, + "learning_rate": 4.869033030155504e-06, + "loss": 0.3492, + "num_input_tokens_seen": 373568, + "step": 930 + }, + { + "epoch": 0.9719334719334719, + "grad_norm": 14.191635131835938, + "learning_rate": 4.866119866766286e-06, + "loss": 0.1902, + "num_input_tokens_seen": 375488, + "step": 935 + }, + { + "epoch": 0.9771309771309772, + "grad_norm": 24.351335525512695, + "learning_rate": 4.86317555030728e-06, + "loss": 0.2238, + "num_input_tokens_seen": 377728, + "step": 940 + }, + { + "epoch": 0.9823284823284824, + "grad_norm": 7.04970121383667, + "learning_rate": 4.860200119544273e-06, + "loss": 0.11, + "num_input_tokens_seen": 379840, + "step": 945 + }, + { + "epoch": 0.9875259875259875, + "grad_norm": 40.61119079589844, + "learning_rate": 4.857193613652711e-06, + "loss": 0.2154, + "num_input_tokens_seen": 381760, + "step": 950 + }, + { + "epoch": 0.9927234927234927, + "grad_norm": 18.396310806274414, + "learning_rate": 4.854156072217185e-06, + "loss": 0.1666, + "num_input_tokens_seen": 383808, + "step": 955 + }, + { + "epoch": 0.997920997920998, + "grad_norm": 8.14262866973877, + "learning_rate": 4.851087535230911e-06, + "loss": 0.2397, + "num_input_tokens_seen": 385856, + "step": 960 + }, + { + "epoch": 1.002079002079002, + "eval_loss": 0.17627178132534027, + "eval_runtime": 1.0408, + "eval_samples_per_second": 822.411, + "eval_steps_per_second": 102.801, + "num_input_tokens_seen": 387464, + "step": 964 + }, + { + "epoch": 1.003118503118503, + "grad_norm": 17.4815731048584, + "learning_rate": 4.8479880430952e-06, + "loss": 0.176, + "num_input_tokens_seen": 387848, + "step": 965 + }, + { + "epoch": 1.0083160083160083, + "grad_norm": 3.335278272628784, + "learning_rate": 4.844857636618928e-06, + "loss": 0.0833, + "num_input_tokens_seen": 389640, + "step": 970 + }, + { + "epoch": 1.0135135135135136, + "grad_norm": 37.46596908569336, + "learning_rate": 4.841696357018003e-06, + "loss": 0.1134, + "num_input_tokens_seen": 391624, + "step": 975 + }, + { + "epoch": 1.0187110187110187, + "grad_norm": 12.097825050354004, + "learning_rate": 4.838504245914812e-06, + "loss": 0.0776, + "num_input_tokens_seen": 393672, + "step": 980 + }, + { + "epoch": 1.023908523908524, + "grad_norm": 0.35555675625801086, + "learning_rate": 4.835281345337684e-06, + "loss": 0.0266, + "num_input_tokens_seen": 395784, + "step": 985 + }, + { + "epoch": 1.0291060291060292, + "grad_norm": 99.47309112548828, + "learning_rate": 4.832027697720329e-06, + "loss": 0.2075, + "num_input_tokens_seen": 397768, + "step": 990 + }, + { + "epoch": 1.0343035343035343, + "grad_norm": 65.3523178100586, + "learning_rate": 4.828743345901285e-06, + "loss": 0.4063, + "num_input_tokens_seen": 399816, + "step": 995 + }, + { + "epoch": 1.0395010395010396, + "grad_norm": 1.6000525951385498, + "learning_rate": 4.825428333123346e-06, + "loss": 0.1017, + "num_input_tokens_seen": 401928, + "step": 1000 + }, + { + "epoch": 1.0446985446985446, + "grad_norm": 5.073741436004639, + "learning_rate": 4.822082703033003e-06, + "loss": 0.0338, + "num_input_tokens_seen": 403912, + "step": 1005 + }, + { + "epoch": 1.04989604989605, + "grad_norm": 31.460180282592773, + "learning_rate": 4.818706499679862e-06, + "loss": 0.1392, + "num_input_tokens_seen": 405832, + "step": 1010 + }, + { + "epoch": 1.0550935550935552, + "grad_norm": 13.538461685180664, + "learning_rate": 4.815299767516065e-06, + "loss": 0.1168, + "num_input_tokens_seen": 407880, + "step": 1015 + }, + { + "epoch": 1.0602910602910602, + "grad_norm": 48.505496978759766, + "learning_rate": 4.811862551395707e-06, + "loss": 0.1001, + "num_input_tokens_seen": 410120, + "step": 1020 + }, + { + "epoch": 1.0654885654885655, + "grad_norm": 25.246902465820312, + "learning_rate": 4.808394896574246e-06, + "loss": 0.0944, + "num_input_tokens_seen": 412168, + "step": 1025 + }, + { + "epoch": 1.0706860706860706, + "grad_norm": 40.07487487792969, + "learning_rate": 4.8048968487079e-06, + "loss": 0.1433, + "num_input_tokens_seen": 414472, + "step": 1030 + }, + { + "epoch": 1.0758835758835759, + "grad_norm": 77.1120834350586, + "learning_rate": 4.801368453853057e-06, + "loss": 0.3131, + "num_input_tokens_seen": 416520, + "step": 1035 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 22.368253707885742, + "learning_rate": 4.79780975846566e-06, + "loss": 0.171, + "num_input_tokens_seen": 418568, + "step": 1040 + }, + { + "epoch": 1.0862785862785862, + "grad_norm": 13.560877799987793, + "learning_rate": 4.7942208094006e-06, + "loss": 0.1287, + "num_input_tokens_seen": 420488, + "step": 1045 + }, + { + "epoch": 1.0914760914760915, + "grad_norm": 31.480270385742188, + "learning_rate": 4.790601653911094e-06, + "loss": 0.1098, + "num_input_tokens_seen": 422472, + "step": 1050 + }, + { + "epoch": 1.0966735966735968, + "grad_norm": 34.814151763916016, + "learning_rate": 4.786952339648071e-06, + "loss": 0.297, + "num_input_tokens_seen": 424456, + "step": 1055 + }, + { + "epoch": 1.1018711018711018, + "grad_norm": 38.07038116455078, + "learning_rate": 4.783272914659535e-06, + "loss": 0.3308, + "num_input_tokens_seen": 426568, + "step": 1060 + }, + { + "epoch": 1.107068607068607, + "grad_norm": 114.09796905517578, + "learning_rate": 4.77956342738994e-06, + "loss": 0.1061, + "num_input_tokens_seen": 428552, + "step": 1065 + }, + { + "epoch": 1.1122661122661124, + "grad_norm": 26.920055389404297, + "learning_rate": 4.775823926679549e-06, + "loss": 0.0996, + "num_input_tokens_seen": 430472, + "step": 1070 + }, + { + "epoch": 1.1174636174636174, + "grad_norm": 20.18915557861328, + "learning_rate": 4.77205446176379e-06, + "loss": 0.1315, + "num_input_tokens_seen": 432328, + "step": 1075 + }, + { + "epoch": 1.1226611226611227, + "grad_norm": 146.2836151123047, + "learning_rate": 4.768255082272612e-06, + "loss": 0.2841, + "num_input_tokens_seen": 434440, + "step": 1080 + }, + { + "epoch": 1.1278586278586278, + "grad_norm": 140.84530639648438, + "learning_rate": 4.764425838229823e-06, + "loss": 0.0783, + "num_input_tokens_seen": 436488, + "step": 1085 + }, + { + "epoch": 1.133056133056133, + "grad_norm": 28.586977005004883, + "learning_rate": 4.760566780052445e-06, + "loss": 0.346, + "num_input_tokens_seen": 438472, + "step": 1090 + }, + { + "epoch": 1.1382536382536383, + "grad_norm": 42.30632400512695, + "learning_rate": 4.756677958550035e-06, + "loss": 0.4155, + "num_input_tokens_seen": 440456, + "step": 1095 + }, + { + "epoch": 1.1434511434511434, + "grad_norm": 41.02433395385742, + "learning_rate": 4.752759424924026e-06, + "loss": 0.1236, + "num_input_tokens_seen": 442440, + "step": 1100 + }, + { + "epoch": 1.1486486486486487, + "grad_norm": 22.878646850585938, + "learning_rate": 4.7488112307670515e-06, + "loss": 0.099, + "num_input_tokens_seen": 444424, + "step": 1105 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 18.822031021118164, + "learning_rate": 4.7448334280622624e-06, + "loss": 0.1891, + "num_input_tokens_seen": 446280, + "step": 1110 + }, + { + "epoch": 1.159043659043659, + "grad_norm": 26.573184967041016, + "learning_rate": 4.740826069182645e-06, + "loss": 0.1802, + "num_input_tokens_seen": 448264, + "step": 1115 + }, + { + "epoch": 1.1642411642411643, + "grad_norm": 14.406500816345215, + "learning_rate": 4.736789206890332e-06, + "loss": 0.2325, + "num_input_tokens_seen": 450376, + "step": 1120 + }, + { + "epoch": 1.1694386694386694, + "grad_norm": 2.926020860671997, + "learning_rate": 4.732722894335909e-06, + "loss": 0.1142, + "num_input_tokens_seen": 452552, + "step": 1125 + }, + { + "epoch": 1.1746361746361746, + "grad_norm": 13.685401916503906, + "learning_rate": 4.728627185057711e-06, + "loss": 0.1432, + "num_input_tokens_seen": 454600, + "step": 1130 + }, + { + "epoch": 1.17983367983368, + "grad_norm": 37.38111877441406, + "learning_rate": 4.724502132981119e-06, + "loss": 0.1061, + "num_input_tokens_seen": 456648, + "step": 1135 + }, + { + "epoch": 1.185031185031185, + "grad_norm": 28.67649269104004, + "learning_rate": 4.720347792417851e-06, + "loss": 0.078, + "num_input_tokens_seen": 458632, + "step": 1140 + }, + { + "epoch": 1.1902286902286903, + "grad_norm": 83.61138153076172, + "learning_rate": 4.716164218065246e-06, + "loss": 0.1068, + "num_input_tokens_seen": 460680, + "step": 1145 + }, + { + "epoch": 1.1954261954261955, + "grad_norm": 14.42905330657959, + "learning_rate": 4.711951465005548e-06, + "loss": 0.2177, + "num_input_tokens_seen": 462728, + "step": 1150 + }, + { + "epoch": 1.2006237006237006, + "grad_norm": 9.12094783782959, + "learning_rate": 4.707709588705169e-06, + "loss": 0.058, + "num_input_tokens_seen": 464776, + "step": 1155 + }, + { + "epoch": 1.2058212058212059, + "grad_norm": 33.16276550292969, + "learning_rate": 4.7034386450139735e-06, + "loss": 0.3544, + "num_input_tokens_seen": 466696, + "step": 1160 + }, + { + "epoch": 1.211018711018711, + "grad_norm": 29.38105010986328, + "learning_rate": 4.699138690164533e-06, + "loss": 0.1744, + "num_input_tokens_seen": 468616, + "step": 1165 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 41.46559143066406, + "learning_rate": 4.694809780771391e-06, + "loss": 0.1842, + "num_input_tokens_seen": 470728, + "step": 1170 + }, + { + "epoch": 1.2214137214137215, + "grad_norm": 15.253453254699707, + "learning_rate": 4.690451973830314e-06, + "loss": 0.1067, + "num_input_tokens_seen": 472776, + "step": 1175 + }, + { + "epoch": 1.2266112266112266, + "grad_norm": 32.71086120605469, + "learning_rate": 4.6860653267175425e-06, + "loss": 0.177, + "num_input_tokens_seen": 474824, + "step": 1180 + }, + { + "epoch": 1.2318087318087318, + "grad_norm": 14.900812149047852, + "learning_rate": 4.681649897189036e-06, + "loss": 0.2562, + "num_input_tokens_seen": 476744, + "step": 1185 + }, + { + "epoch": 1.237006237006237, + "grad_norm": 11.960613250732422, + "learning_rate": 4.677205743379714e-06, + "loss": 0.053, + "num_input_tokens_seen": 478856, + "step": 1190 + }, + { + "epoch": 1.2422037422037422, + "grad_norm": 33.26080322265625, + "learning_rate": 4.672732923802685e-06, + "loss": 0.1686, + "num_input_tokens_seen": 480776, + "step": 1195 + }, + { + "epoch": 1.2474012474012475, + "grad_norm": 13.924117088317871, + "learning_rate": 4.6682314973484844e-06, + "loss": 0.0292, + "num_input_tokens_seen": 482952, + "step": 1200 + }, + { + "epoch": 1.2525987525987525, + "grad_norm": 18.63437271118164, + "learning_rate": 4.663701523284291e-06, + "loss": 0.0622, + "num_input_tokens_seen": 485192, + "step": 1205 + }, + { + "epoch": 1.2525987525987525, + "eval_loss": 0.26757940649986267, + "eval_runtime": 1.0561, + "eval_samples_per_second": 810.538, + "eval_steps_per_second": 101.317, + "num_input_tokens_seen": 485192, + "step": 1205 + }, + { + "epoch": 1.2577962577962578, + "grad_norm": 0.11255653202533722, + "learning_rate": 4.659143061253152e-06, + "loss": 0.1299, + "num_input_tokens_seen": 487112, + "step": 1210 + }, + { + "epoch": 1.262993762993763, + "grad_norm": 38.90354537963867, + "learning_rate": 4.654556171273196e-06, + "loss": 0.2685, + "num_input_tokens_seen": 489160, + "step": 1215 + }, + { + "epoch": 1.2681912681912682, + "grad_norm": 0.6540949940681458, + "learning_rate": 4.649940913736841e-06, + "loss": 0.2017, + "num_input_tokens_seen": 491080, + "step": 1220 + }, + { + "epoch": 1.2733887733887734, + "grad_norm": 0.8767725825309753, + "learning_rate": 4.645297349410005e-06, + "loss": 0.0607, + "num_input_tokens_seen": 493064, + "step": 1225 + }, + { + "epoch": 1.2785862785862787, + "grad_norm": 1.3416281938552856, + "learning_rate": 4.640625539431298e-06, + "loss": 0.1537, + "num_input_tokens_seen": 494984, + "step": 1230 + }, + { + "epoch": 1.2837837837837838, + "grad_norm": 11.072347640991211, + "learning_rate": 4.635925545311224e-06, + "loss": 0.2946, + "num_input_tokens_seen": 496968, + "step": 1235 + }, + { + "epoch": 1.288981288981289, + "grad_norm": 5.309337615966797, + "learning_rate": 4.631197428931365e-06, + "loss": 0.0799, + "num_input_tokens_seen": 498824, + "step": 1240 + }, + { + "epoch": 1.2941787941787941, + "grad_norm": 42.28898239135742, + "learning_rate": 4.626441252543572e-06, + "loss": 0.0804, + "num_input_tokens_seen": 500808, + "step": 1245 + }, + { + "epoch": 1.2993762993762994, + "grad_norm": 99.69770812988281, + "learning_rate": 4.621657078769143e-06, + "loss": 0.251, + "num_input_tokens_seen": 502856, + "step": 1250 + }, + { + "epoch": 1.3045738045738045, + "grad_norm": 48.22378921508789, + "learning_rate": 4.616844970597996e-06, + "loss": 0.0856, + "num_input_tokens_seen": 504712, + "step": 1255 + }, + { + "epoch": 1.3097713097713097, + "grad_norm": 22.802143096923828, + "learning_rate": 4.612004991387843e-06, + "loss": 0.3719, + "num_input_tokens_seen": 506696, + "step": 1260 + }, + { + "epoch": 1.314968814968815, + "grad_norm": 20.26570701599121, + "learning_rate": 4.607137204863356e-06, + "loss": 0.0936, + "num_input_tokens_seen": 508680, + "step": 1265 + }, + { + "epoch": 1.32016632016632, + "grad_norm": 4.629741191864014, + "learning_rate": 4.602241675115326e-06, + "loss": 0.1072, + "num_input_tokens_seen": 510728, + "step": 1270 + }, + { + "epoch": 1.3253638253638254, + "grad_norm": 2.296597957611084, + "learning_rate": 4.597318466599819e-06, + "loss": 0.0841, + "num_input_tokens_seen": 512712, + "step": 1275 + }, + { + "epoch": 1.3305613305613306, + "grad_norm": 0.3281061351299286, + "learning_rate": 4.592367644137329e-06, + "loss": 0.1067, + "num_input_tokens_seen": 514696, + "step": 1280 + }, + { + "epoch": 1.3357588357588357, + "grad_norm": 28.257986068725586, + "learning_rate": 4.587389272911923e-06, + "loss": 0.1895, + "num_input_tokens_seen": 516808, + "step": 1285 + }, + { + "epoch": 1.340956340956341, + "grad_norm": 49.04169464111328, + "learning_rate": 4.582383418470386e-06, + "loss": 0.2118, + "num_input_tokens_seen": 518792, + "step": 1290 + }, + { + "epoch": 1.3461538461538463, + "grad_norm": 75.7163314819336, + "learning_rate": 4.5773501467213525e-06, + "loss": 0.1325, + "num_input_tokens_seen": 520840, + "step": 1295 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 0.3487839698791504, + "learning_rate": 4.572289523934444e-06, + "loss": 0.0526, + "num_input_tokens_seen": 522760, + "step": 1300 + }, + { + "epoch": 1.3565488565488566, + "grad_norm": 21.736961364746094, + "learning_rate": 4.567201616739393e-06, + "loss": 0.2152, + "num_input_tokens_seen": 524872, + "step": 1305 + }, + { + "epoch": 1.3617463617463619, + "grad_norm": 66.38067626953125, + "learning_rate": 4.562086492125167e-06, + "loss": 0.1978, + "num_input_tokens_seen": 526920, + "step": 1310 + }, + { + "epoch": 1.366943866943867, + "grad_norm": 11.493204116821289, + "learning_rate": 4.5569442174390885e-06, + "loss": 0.1374, + "num_input_tokens_seen": 528968, + "step": 1315 + }, + { + "epoch": 1.3721413721413722, + "grad_norm": 0.7820735573768616, + "learning_rate": 4.551774860385944e-06, + "loss": 0.0818, + "num_input_tokens_seen": 530888, + "step": 1320 + }, + { + "epoch": 1.3773388773388773, + "grad_norm": 0.8318536281585693, + "learning_rate": 4.546578489027095e-06, + "loss": 0.1644, + "num_input_tokens_seen": 532872, + "step": 1325 + }, + { + "epoch": 1.3825363825363826, + "grad_norm": 10.547067642211914, + "learning_rate": 4.541355171779582e-06, + "loss": 0.118, + "num_input_tokens_seen": 534920, + "step": 1330 + }, + { + "epoch": 1.3877338877338876, + "grad_norm": 1.9730658531188965, + "learning_rate": 4.536104977415225e-06, + "loss": 0.0039, + "num_input_tokens_seen": 536840, + "step": 1335 + }, + { + "epoch": 1.392931392931393, + "grad_norm": 21.502456665039062, + "learning_rate": 4.530827975059715e-06, + "loss": 0.3705, + "num_input_tokens_seen": 538760, + "step": 1340 + }, + { + "epoch": 1.3981288981288982, + "grad_norm": 0.3489514887332916, + "learning_rate": 4.525524234191705e-06, + "loss": 0.2364, + "num_input_tokens_seen": 540680, + "step": 1345 + }, + { + "epoch": 1.4033264033264032, + "grad_norm": 44.12948989868164, + "learning_rate": 4.520193824641898e-06, + "loss": 0.1405, + "num_input_tokens_seen": 542664, + "step": 1350 + }, + { + "epoch": 1.4085239085239085, + "grad_norm": 91.0547103881836, + "learning_rate": 4.51483681659212e-06, + "loss": 0.1596, + "num_input_tokens_seen": 544712, + "step": 1355 + }, + { + "epoch": 1.4137214137214138, + "grad_norm": 16.955787658691406, + "learning_rate": 4.5094532805744075e-06, + "loss": 0.2662, + "num_input_tokens_seen": 546824, + "step": 1360 + }, + { + "epoch": 1.4189189189189189, + "grad_norm": 24.00728416442871, + "learning_rate": 4.504043287470068e-06, + "loss": 0.0791, + "num_input_tokens_seen": 548936, + "step": 1365 + }, + { + "epoch": 1.4241164241164241, + "grad_norm": 1.3119056224822998, + "learning_rate": 4.498606908508754e-06, + "loss": 0.1218, + "num_input_tokens_seen": 550920, + "step": 1370 + }, + { + "epoch": 1.4293139293139294, + "grad_norm": 0.5111730694770813, + "learning_rate": 4.493144215267519e-06, + "loss": 0.0307, + "num_input_tokens_seen": 552904, + "step": 1375 + }, + { + "epoch": 1.4345114345114345, + "grad_norm": 5.568216800689697, + "learning_rate": 4.4876552796698814e-06, + "loss": 0.1616, + "num_input_tokens_seen": 554824, + "step": 1380 + }, + { + "epoch": 1.4397089397089398, + "grad_norm": 21.439517974853516, + "learning_rate": 4.482140173984875e-06, + "loss": 0.214, + "num_input_tokens_seen": 556872, + "step": 1385 + }, + { + "epoch": 1.444906444906445, + "grad_norm": 4.337334156036377, + "learning_rate": 4.476598970826093e-06, + "loss": 0.1453, + "num_input_tokens_seen": 558984, + "step": 1390 + }, + { + "epoch": 1.45010395010395, + "grad_norm": 15.975934982299805, + "learning_rate": 4.471031743150744e-06, + "loss": 0.2061, + "num_input_tokens_seen": 560968, + "step": 1395 + }, + { + "epoch": 1.4553014553014554, + "grad_norm": 65.18135070800781, + "learning_rate": 4.465438564258673e-06, + "loss": 0.2358, + "num_input_tokens_seen": 562952, + "step": 1400 + }, + { + "epoch": 1.4604989604989604, + "grad_norm": 0.4641796052455902, + "learning_rate": 4.459819507791415e-06, + "loss": 0.0357, + "num_input_tokens_seen": 565064, + "step": 1405 + }, + { + "epoch": 1.4656964656964657, + "grad_norm": 0.4620436728000641, + "learning_rate": 4.454174647731213e-06, + "loss": 0.1194, + "num_input_tokens_seen": 567112, + "step": 1410 + }, + { + "epoch": 1.4708939708939708, + "grad_norm": 0.40077999234199524, + "learning_rate": 4.448504058400052e-06, + "loss": 0.2261, + "num_input_tokens_seen": 569160, + "step": 1415 + }, + { + "epoch": 1.476091476091476, + "grad_norm": 31.147504806518555, + "learning_rate": 4.4428078144586715e-06, + "loss": 0.1794, + "num_input_tokens_seen": 571336, + "step": 1420 + }, + { + "epoch": 1.4812889812889813, + "grad_norm": 60.33259582519531, + "learning_rate": 4.437085990905591e-06, + "loss": 0.2622, + "num_input_tokens_seen": 573384, + "step": 1425 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 42.84424591064453, + "learning_rate": 4.431338663076119e-06, + "loss": 0.1625, + "num_input_tokens_seen": 575304, + "step": 1430 + }, + { + "epoch": 1.4916839916839917, + "grad_norm": 1.8637181520462036, + "learning_rate": 4.42556590664136e-06, + "loss": 0.0647, + "num_input_tokens_seen": 577160, + "step": 1435 + }, + { + "epoch": 1.496881496881497, + "grad_norm": 34.14229965209961, + "learning_rate": 4.41976779760722e-06, + "loss": 0.11, + "num_input_tokens_seen": 579208, + "step": 1440 + }, + { + "epoch": 1.502079002079002, + "grad_norm": 48.55718994140625, + "learning_rate": 4.413944412313405e-06, + "loss": 0.0911, + "num_input_tokens_seen": 581256, + "step": 1445 + }, + { + "epoch": 1.503118503118503, + "eval_loss": 0.3145564794540405, + "eval_runtime": 1.1073, + "eval_samples_per_second": 773.075, + "eval_steps_per_second": 96.634, + "num_input_tokens_seen": 581704, + "step": 1446 + }, + { + "epoch": 1.5072765072765073, + "grad_norm": 0.2164192944765091, + "learning_rate": 4.408095827432416e-06, + "loss": 0.1191, + "num_input_tokens_seen": 583304, + "step": 1450 + }, + { + "epoch": 1.5124740124740126, + "grad_norm": 50.20052719116211, + "learning_rate": 4.40222211996854e-06, + "loss": 0.3479, + "num_input_tokens_seen": 585224, + "step": 1455 + }, + { + "epoch": 1.5176715176715176, + "grad_norm": 31.401309967041016, + "learning_rate": 4.396323367256836e-06, + "loss": 0.2617, + "num_input_tokens_seen": 587272, + "step": 1460 + }, + { + "epoch": 1.5228690228690227, + "grad_norm": 32.85145568847656, + "learning_rate": 4.390399646962117e-06, + "loss": 0.1985, + "num_input_tokens_seen": 589320, + "step": 1465 + }, + { + "epoch": 1.5280665280665282, + "grad_norm": 13.456771850585938, + "learning_rate": 4.384451037077924e-06, + "loss": 0.1369, + "num_input_tokens_seen": 591304, + "step": 1470 + }, + { + "epoch": 1.5332640332640333, + "grad_norm": 1.2188056707382202, + "learning_rate": 4.378477615925506e-06, + "loss": 0.1433, + "num_input_tokens_seen": 593224, + "step": 1475 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 28.065401077270508, + "learning_rate": 4.372479462152781e-06, + "loss": 0.1273, + "num_input_tokens_seen": 595336, + "step": 1480 + }, + { + "epoch": 1.5436590436590436, + "grad_norm": 21.94362449645996, + "learning_rate": 4.366456654733308e-06, + "loss": 0.2715, + "num_input_tokens_seen": 597256, + "step": 1485 + }, + { + "epoch": 1.5488565488565489, + "grad_norm": 11.613262176513672, + "learning_rate": 4.360409272965242e-06, + "loss": 0.1859, + "num_input_tokens_seen": 599304, + "step": 1490 + }, + { + "epoch": 1.554054054054054, + "grad_norm": 3.856565237045288, + "learning_rate": 4.354337396470291e-06, + "loss": 0.0745, + "num_input_tokens_seen": 601288, + "step": 1495 + }, + { + "epoch": 1.5592515592515592, + "grad_norm": 52.13639831542969, + "learning_rate": 4.348241105192668e-06, + "loss": 0.1641, + "num_input_tokens_seen": 603272, + "step": 1500 + }, + { + "epoch": 1.5644490644490645, + "grad_norm": 4.8234357833862305, + "learning_rate": 4.34212047939804e-06, + "loss": 0.1365, + "num_input_tokens_seen": 605256, + "step": 1505 + }, + { + "epoch": 1.5696465696465696, + "grad_norm": 17.75581169128418, + "learning_rate": 4.335975599672469e-06, + "loss": 0.0868, + "num_input_tokens_seen": 607304, + "step": 1510 + }, + { + "epoch": 1.5748440748440748, + "grad_norm": 6.615152835845947, + "learning_rate": 4.329806546921354e-06, + "loss": 0.1281, + "num_input_tokens_seen": 609224, + "step": 1515 + }, + { + "epoch": 1.5800415800415801, + "grad_norm": 29.3903865814209, + "learning_rate": 4.3236134023683565e-06, + "loss": 0.0465, + "num_input_tokens_seen": 611336, + "step": 1520 + }, + { + "epoch": 1.5852390852390852, + "grad_norm": 66.3274154663086, + "learning_rate": 4.3173962475543475e-06, + "loss": 0.1156, + "num_input_tokens_seen": 613320, + "step": 1525 + }, + { + "epoch": 1.5904365904365905, + "grad_norm": 45.9361686706543, + "learning_rate": 4.311155164336318e-06, + "loss": 0.2405, + "num_input_tokens_seen": 615176, + "step": 1530 + }, + { + "epoch": 1.5956340956340958, + "grad_norm": 15.004530906677246, + "learning_rate": 4.3048902348863116e-06, + "loss": 0.1673, + "num_input_tokens_seen": 617224, + "step": 1535 + }, + { + "epoch": 1.6008316008316008, + "grad_norm": 46.72109603881836, + "learning_rate": 4.298601541690336e-06, + "loss": 0.1683, + "num_input_tokens_seen": 619208, + "step": 1540 + }, + { + "epoch": 1.6060291060291059, + "grad_norm": 25.378122329711914, + "learning_rate": 4.292289167547281e-06, + "loss": 0.221, + "num_input_tokens_seen": 621192, + "step": 1545 + }, + { + "epoch": 1.6112266112266114, + "grad_norm": 15.957310676574707, + "learning_rate": 4.285953195567827e-06, + "loss": 0.1458, + "num_input_tokens_seen": 623176, + "step": 1550 + }, + { + "epoch": 1.6164241164241164, + "grad_norm": 20.942054748535156, + "learning_rate": 4.279593709173352e-06, + "loss": 0.246, + "num_input_tokens_seen": 625160, + "step": 1555 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 0.6662601232528687, + "learning_rate": 4.27321079209483e-06, + "loss": 0.1381, + "num_input_tokens_seen": 627144, + "step": 1560 + }, + { + "epoch": 1.6268191268191268, + "grad_norm": 24.978178024291992, + "learning_rate": 4.266804528371732e-06, + "loss": 0.1634, + "num_input_tokens_seen": 629192, + "step": 1565 + }, + { + "epoch": 1.632016632016632, + "grad_norm": 7.083024024963379, + "learning_rate": 4.260375002350917e-06, + "loss": 0.1174, + "num_input_tokens_seen": 631240, + "step": 1570 + }, + { + "epoch": 1.637214137214137, + "grad_norm": 20.857555389404297, + "learning_rate": 4.253922298685525e-06, + "loss": 0.2274, + "num_input_tokens_seen": 633224, + "step": 1575 + }, + { + "epoch": 1.6424116424116424, + "grad_norm": 46.30590057373047, + "learning_rate": 4.2474465023338586e-06, + "loss": 0.1367, + "num_input_tokens_seen": 635208, + "step": 1580 + }, + { + "epoch": 1.6476091476091477, + "grad_norm": 0.8250938057899475, + "learning_rate": 4.2409476985582645e-06, + "loss": 0.1048, + "num_input_tokens_seen": 637256, + "step": 1585 + }, + { + "epoch": 1.6528066528066527, + "grad_norm": 2.617978572845459, + "learning_rate": 4.234425972924014e-06, + "loss": 0.0156, + "num_input_tokens_seen": 639176, + "step": 1590 + }, + { + "epoch": 1.658004158004158, + "grad_norm": 0.5611851811408997, + "learning_rate": 4.227881411298175e-06, + "loss": 0.1551, + "num_input_tokens_seen": 641224, + "step": 1595 + }, + { + "epoch": 1.6632016632016633, + "grad_norm": 0.4693892002105713, + "learning_rate": 4.221314099848481e-06, + "loss": 0.1125, + "num_input_tokens_seen": 643144, + "step": 1600 + }, + { + "epoch": 1.6683991683991684, + "grad_norm": 38.350101470947266, + "learning_rate": 4.214724125042195e-06, + "loss": 0.1457, + "num_input_tokens_seen": 644936, + "step": 1605 + }, + { + "epoch": 1.6735966735966736, + "grad_norm": 36.42043685913086, + "learning_rate": 4.208111573644975e-06, + "loss": 0.1623, + "num_input_tokens_seen": 646984, + "step": 1610 + }, + { + "epoch": 1.678794178794179, + "grad_norm": 0.15917447209358215, + "learning_rate": 4.2014765327197285e-06, + "loss": 0.2052, + "num_input_tokens_seen": 649032, + "step": 1615 + }, + { + "epoch": 1.683991683991684, + "grad_norm": 25.833524703979492, + "learning_rate": 4.194819089625466e-06, + "loss": 0.2047, + "num_input_tokens_seen": 651080, + "step": 1620 + }, + { + "epoch": 1.689189189189189, + "grad_norm": 13.379853248596191, + "learning_rate": 4.188139332016154e-06, + "loss": 0.2123, + "num_input_tokens_seen": 653000, + "step": 1625 + }, + { + "epoch": 1.6943866943866945, + "grad_norm": 10.135590553283691, + "learning_rate": 4.181437347839559e-06, + "loss": 0.2089, + "num_input_tokens_seen": 654920, + "step": 1630 + }, + { + "epoch": 1.6995841995841996, + "grad_norm": 3.057936191558838, + "learning_rate": 4.174713225336087e-06, + "loss": 0.1685, + "num_input_tokens_seen": 656904, + "step": 1635 + }, + { + "epoch": 1.7047817047817047, + "grad_norm": 29.033493041992188, + "learning_rate": 4.167967053037625e-06, + "loss": 0.105, + "num_input_tokens_seen": 658952, + "step": 1640 + }, + { + "epoch": 1.70997920997921, + "grad_norm": 0.9139642715454102, + "learning_rate": 4.161198919766375e-06, + "loss": 0.0899, + "num_input_tokens_seen": 660872, + "step": 1645 + }, + { + "epoch": 1.7151767151767152, + "grad_norm": 37.07249069213867, + "learning_rate": 4.154408914633685e-06, + "loss": 0.2054, + "num_input_tokens_seen": 662856, + "step": 1650 + }, + { + "epoch": 1.7203742203742203, + "grad_norm": 6.607808589935303, + "learning_rate": 4.147597127038873e-06, + "loss": 0.2025, + "num_input_tokens_seen": 664904, + "step": 1655 + }, + { + "epoch": 1.7255717255717256, + "grad_norm": 20.834936141967773, + "learning_rate": 4.140763646668051e-06, + "loss": 0.141, + "num_input_tokens_seen": 666888, + "step": 1660 + }, + { + "epoch": 1.7307692307692308, + "grad_norm": 8.577420234680176, + "learning_rate": 4.133908563492949e-06, + "loss": 0.0252, + "num_input_tokens_seen": 668936, + "step": 1665 + }, + { + "epoch": 1.735966735966736, + "grad_norm": 86.40946960449219, + "learning_rate": 4.12703196776972e-06, + "loss": 0.2066, + "num_input_tokens_seen": 670856, + "step": 1670 + }, + { + "epoch": 1.7411642411642412, + "grad_norm": 30.79783058166504, + "learning_rate": 4.120133950037763e-06, + "loss": 0.3627, + "num_input_tokens_seen": 672840, + "step": 1675 + }, + { + "epoch": 1.7463617463617465, + "grad_norm": 40.20522689819336, + "learning_rate": 4.113214601118523e-06, + "loss": 0.2218, + "num_input_tokens_seen": 674824, + "step": 1680 + }, + { + "epoch": 1.7515592515592515, + "grad_norm": 26.033588409423828, + "learning_rate": 4.106274012114302e-06, + "loss": 0.1042, + "num_input_tokens_seen": 676808, + "step": 1685 + }, + { + "epoch": 1.7536382536382535, + "eval_loss": 0.2114141583442688, + "eval_runtime": 1.0685, + "eval_samples_per_second": 801.127, + "eval_steps_per_second": 100.141, + "num_input_tokens_seen": 677576, + "step": 1687 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 20.88821792602539, + "learning_rate": 4.099312274407049e-06, + "loss": 0.1712, + "num_input_tokens_seen": 678728, + "step": 1690 + }, + { + "epoch": 1.761954261954262, + "grad_norm": 36.20766067504883, + "learning_rate": 4.092329479657168e-06, + "loss": 0.1031, + "num_input_tokens_seen": 680776, + "step": 1695 + }, + { + "epoch": 1.7671517671517671, + "grad_norm": 1.624861478805542, + "learning_rate": 4.085325719802307e-06, + "loss": 0.1288, + "num_input_tokens_seen": 683016, + "step": 1700 + }, + { + "epoch": 1.7723492723492722, + "grad_norm": 5.351211071014404, + "learning_rate": 4.0783010870561445e-06, + "loss": 0.0556, + "num_input_tokens_seen": 685256, + "step": 1705 + }, + { + "epoch": 1.7775467775467777, + "grad_norm": 45.75218963623047, + "learning_rate": 4.07125567390718e-06, + "loss": 0.3125, + "num_input_tokens_seen": 687304, + "step": 1710 + }, + { + "epoch": 1.7827442827442828, + "grad_norm": 26.084054946899414, + "learning_rate": 4.064189573117512e-06, + "loss": 0.2158, + "num_input_tokens_seen": 689224, + "step": 1715 + }, + { + "epoch": 1.7879417879417878, + "grad_norm": 1.3422380685806274, + "learning_rate": 4.057102877721621e-06, + "loss": 0.1701, + "num_input_tokens_seen": 691400, + "step": 1720 + }, + { + "epoch": 1.793139293139293, + "grad_norm": 14.056601524353027, + "learning_rate": 4.049995681025143e-06, + "loss": 0.1154, + "num_input_tokens_seen": 693320, + "step": 1725 + }, + { + "epoch": 1.7983367983367984, + "grad_norm": 18.775554656982422, + "learning_rate": 4.0428680766036386e-06, + "loss": 0.1654, + "num_input_tokens_seen": 695432, + "step": 1730 + }, + { + "epoch": 1.8035343035343034, + "grad_norm": 18.788192749023438, + "learning_rate": 4.035720158301363e-06, + "loss": 0.2169, + "num_input_tokens_seen": 697544, + "step": 1735 + }, + { + "epoch": 1.8087318087318087, + "grad_norm": 15.670763969421387, + "learning_rate": 4.028552020230031e-06, + "loss": 0.1438, + "num_input_tokens_seen": 699592, + "step": 1740 + }, + { + "epoch": 1.813929313929314, + "grad_norm": 27.88116455078125, + "learning_rate": 4.021363756767577e-06, + "loss": 0.2247, + "num_input_tokens_seen": 701576, + "step": 1745 + }, + { + "epoch": 1.819126819126819, + "grad_norm": 17.78391456604004, + "learning_rate": 4.014155462556913e-06, + "loss": 0.2586, + "num_input_tokens_seen": 703688, + "step": 1750 + }, + { + "epoch": 1.8243243243243243, + "grad_norm": 16.56380844116211, + "learning_rate": 4.006927232504682e-06, + "loss": 0.2187, + "num_input_tokens_seen": 705736, + "step": 1755 + }, + { + "epoch": 1.8295218295218296, + "grad_norm": 6.663086414337158, + "learning_rate": 3.999679161780006e-06, + "loss": 0.043, + "num_input_tokens_seen": 707720, + "step": 1760 + }, + { + "epoch": 1.8347193347193347, + "grad_norm": 0.7404634952545166, + "learning_rate": 3.99241134581324e-06, + "loss": 0.08, + "num_input_tokens_seen": 709896, + "step": 1765 + }, + { + "epoch": 1.83991683991684, + "grad_norm": 39.46183395385742, + "learning_rate": 3.985123880294708e-06, + "loss": 0.1669, + "num_input_tokens_seen": 711944, + "step": 1770 + }, + { + "epoch": 1.8451143451143452, + "grad_norm": 7.113590717315674, + "learning_rate": 3.977816861173446e-06, + "loss": 0.1912, + "num_input_tokens_seen": 713992, + "step": 1775 + }, + { + "epoch": 1.8503118503118503, + "grad_norm": 48.9863395690918, + "learning_rate": 3.970490384655939e-06, + "loss": 0.1846, + "num_input_tokens_seen": 715976, + "step": 1780 + }, + { + "epoch": 1.8555093555093554, + "grad_norm": 31.94883918762207, + "learning_rate": 3.963144547204856e-06, + "loss": 0.105, + "num_input_tokens_seen": 718024, + "step": 1785 + }, + { + "epoch": 1.8607068607068609, + "grad_norm": 11.180656433105469, + "learning_rate": 3.955779445537776e-06, + "loss": 0.2342, + "num_input_tokens_seen": 720072, + "step": 1790 + }, + { + "epoch": 1.865904365904366, + "grad_norm": 5.974298000335693, + "learning_rate": 3.948395176625918e-06, + "loss": 0.2314, + "num_input_tokens_seen": 722120, + "step": 1795 + }, + { + "epoch": 1.871101871101871, + "grad_norm": 3.235103130340576, + "learning_rate": 3.940991837692861e-06, + "loss": 0.1187, + "num_input_tokens_seen": 724168, + "step": 1800 + }, + { + "epoch": 1.8762993762993763, + "grad_norm": 25.299224853515625, + "learning_rate": 3.933569526213268e-06, + "loss": 0.1292, + "num_input_tokens_seen": 726280, + "step": 1805 + }, + { + "epoch": 1.8814968814968815, + "grad_norm": 2.7856338024139404, + "learning_rate": 3.926128339911599e-06, + "loss": 0.0843, + "num_input_tokens_seen": 728264, + "step": 1810 + }, + { + "epoch": 1.8866943866943866, + "grad_norm": 0.9576424360275269, + "learning_rate": 3.918668376760827e-06, + "loss": 0.1791, + "num_input_tokens_seen": 730312, + "step": 1815 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 55.718650817871094, + "learning_rate": 3.9111897349811455e-06, + "loss": 0.1365, + "num_input_tokens_seen": 732296, + "step": 1820 + }, + { + "epoch": 1.8970893970893972, + "grad_norm": 20.79973793029785, + "learning_rate": 3.903692513038677e-06, + "loss": 0.1369, + "num_input_tokens_seen": 734088, + "step": 1825 + }, + { + "epoch": 1.9022869022869022, + "grad_norm": 51.692867279052734, + "learning_rate": 3.896176809644178e-06, + "loss": 0.2305, + "num_input_tokens_seen": 736072, + "step": 1830 + }, + { + "epoch": 1.9074844074844075, + "grad_norm": 3.8833014965057373, + "learning_rate": 3.8886427237517345e-06, + "loss": 0.2062, + "num_input_tokens_seen": 738120, + "step": 1835 + }, + { + "epoch": 1.9126819126819128, + "grad_norm": 26.46794891357422, + "learning_rate": 3.881090354557463e-06, + "loss": 0.2077, + "num_input_tokens_seen": 740168, + "step": 1840 + }, + { + "epoch": 1.9178794178794178, + "grad_norm": 11.211897850036621, + "learning_rate": 3.8735198014982066e-06, + "loss": 0.1425, + "num_input_tokens_seen": 742280, + "step": 1845 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 26.17072296142578, + "learning_rate": 3.865931164250219e-06, + "loss": 0.0702, + "num_input_tokens_seen": 744328, + "step": 1850 + }, + { + "epoch": 1.9282744282744284, + "grad_norm": 13.180768966674805, + "learning_rate": 3.858324542727859e-06, + "loss": 0.1732, + "num_input_tokens_seen": 746440, + "step": 1855 + }, + { + "epoch": 1.9334719334719335, + "grad_norm": 41.96379470825195, + "learning_rate": 3.8507000370822675e-06, + "loss": 0.1543, + "num_input_tokens_seen": 748488, + "step": 1860 + }, + { + "epoch": 1.9386694386694385, + "grad_norm": 14.179240226745605, + "learning_rate": 3.84305774770006e-06, + "loss": 0.1298, + "num_input_tokens_seen": 750344, + "step": 1865 + }, + { + "epoch": 1.943866943866944, + "grad_norm": 10.15986442565918, + "learning_rate": 3.835397775201991e-06, + "loss": 0.0507, + "num_input_tokens_seen": 752328, + "step": 1870 + }, + { + "epoch": 1.949064449064449, + "grad_norm": 14.734403610229492, + "learning_rate": 3.827720220441642e-06, + "loss": 0.2625, + "num_input_tokens_seen": 754312, + "step": 1875 + }, + { + "epoch": 1.9542619542619541, + "grad_norm": 38.59032440185547, + "learning_rate": 3.820025184504085e-06, + "loss": 0.4145, + "num_input_tokens_seen": 756232, + "step": 1880 + }, + { + "epoch": 1.9594594594594594, + "grad_norm": 22.36400604248047, + "learning_rate": 3.812312768704557e-06, + "loss": 0.2626, + "num_input_tokens_seen": 758280, + "step": 1885 + }, + { + "epoch": 1.9646569646569647, + "grad_norm": 5.247732162475586, + "learning_rate": 3.80458307458712e-06, + "loss": 0.1128, + "num_input_tokens_seen": 760328, + "step": 1890 + }, + { + "epoch": 1.9698544698544698, + "grad_norm": 14.368209838867188, + "learning_rate": 3.7968362039233315e-06, + "loss": 0.1213, + "num_input_tokens_seen": 762248, + "step": 1895 + }, + { + "epoch": 1.975051975051975, + "grad_norm": 22.700754165649414, + "learning_rate": 3.7890722587108985e-06, + "loss": 0.077, + "num_input_tokens_seen": 764168, + "step": 1900 + }, + { + "epoch": 1.9802494802494803, + "grad_norm": 1.1314526796340942, + "learning_rate": 3.7812913411723377e-06, + "loss": 0.0655, + "num_input_tokens_seen": 766216, + "step": 1905 + }, + { + "epoch": 1.9854469854469854, + "grad_norm": 22.53055763244629, + "learning_rate": 3.773493553753628e-06, + "loss": 0.0962, + "num_input_tokens_seen": 768264, + "step": 1910 + }, + { + "epoch": 1.9906444906444907, + "grad_norm": 19.964323043823242, + "learning_rate": 3.7656789991228638e-06, + "loss": 0.0219, + "num_input_tokens_seen": 770184, + "step": 1915 + }, + { + "epoch": 1.995841995841996, + "grad_norm": 3.517256259918213, + "learning_rate": 3.7578477801689e-06, + "loss": 0.1279, + "num_input_tokens_seen": 772168, + "step": 1920 + }, + { + "epoch": 2.001039501039501, + "grad_norm": 0.14674918353557587, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.096, + "num_input_tokens_seen": 774160, + "step": 1925 + }, + { + "epoch": 2.004158004158004, + "eval_loss": 0.3561875522136688, + "eval_runtime": 1.0449, + "eval_samples_per_second": 819.204, + "eval_steps_per_second": 102.401, + "num_input_tokens_seen": 775312, + "step": 1928 + }, + { + "epoch": 2.006237006237006, + "grad_norm": 1.6722761392593384, + "learning_rate": 3.7421357619424793e-06, + "loss": 0.0698, + "num_input_tokens_seen": 776144, + "step": 1930 + }, + { + "epoch": 2.0114345114345116, + "grad_norm": 0.06727920472621918, + "learning_rate": 3.7342551695393375e-06, + "loss": 0.0941, + "num_input_tokens_seen": 778128, + "step": 1935 + }, + { + "epoch": 2.0166320166320166, + "grad_norm": 0.06038059666752815, + "learning_rate": 3.7263583265489077e-06, + "loss": 0.0863, + "num_input_tokens_seen": 780176, + "step": 1940 + }, + { + "epoch": 2.0218295218295217, + "grad_norm": 52.84983444213867, + "learning_rate": 3.718445336943478e-06, + "loss": 0.0572, + "num_input_tokens_seen": 782160, + "step": 1945 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 35.81016540527344, + "learning_rate": 3.7105163049079305e-06, + "loss": 0.0675, + "num_input_tokens_seen": 784208, + "step": 1950 + }, + { + "epoch": 2.0322245322245323, + "grad_norm": 0.14884886145591736, + "learning_rate": 3.702571334838365e-06, + "loss": 0.0002, + "num_input_tokens_seen": 786256, + "step": 1955 + }, + { + "epoch": 2.0374220374220373, + "grad_norm": 0.03602315112948418, + "learning_rate": 3.6946105313407287e-06, + "loss": 0.1288, + "num_input_tokens_seen": 788240, + "step": 1960 + }, + { + "epoch": 2.042619542619543, + "grad_norm": 152.0932159423828, + "learning_rate": 3.6866339992294347e-06, + "loss": 0.1179, + "num_input_tokens_seen": 790288, + "step": 1965 + }, + { + "epoch": 2.047817047817048, + "grad_norm": 0.03992204740643501, + "learning_rate": 3.678641843525986e-06, + "loss": 0.0768, + "num_input_tokens_seen": 792272, + "step": 1970 + }, + { + "epoch": 2.053014553014553, + "grad_norm": 0.021054543554782867, + "learning_rate": 3.670634169457587e-06, + "loss": 0.0297, + "num_input_tokens_seen": 794384, + "step": 1975 + }, + { + "epoch": 2.0582120582120584, + "grad_norm": 0.09833300858736038, + "learning_rate": 3.662611082455766e-06, + "loss": 0.1305, + "num_input_tokens_seen": 796368, + "step": 1980 + }, + { + "epoch": 2.0634095634095635, + "grad_norm": 1.8203060626983643, + "learning_rate": 3.6545726881549792e-06, + "loss": 0.0029, + "num_input_tokens_seen": 798480, + "step": 1985 + }, + { + "epoch": 2.0686070686070686, + "grad_norm": 13.759748458862305, + "learning_rate": 3.6465190923912275e-06, + "loss": 0.0937, + "num_input_tokens_seen": 800528, + "step": 1990 + }, + { + "epoch": 2.0738045738045736, + "grad_norm": 0.23287345468997955, + "learning_rate": 3.6384504012006544e-06, + "loss": 0.1904, + "num_input_tokens_seen": 802768, + "step": 1995 + }, + { + "epoch": 2.079002079002079, + "grad_norm": 15.67501163482666, + "learning_rate": 3.6303667208181576e-06, + "loss": 0.1647, + "num_input_tokens_seen": 804752, + "step": 2000 + }, + { + "epoch": 2.084199584199584, + "grad_norm": 0.5168598294258118, + "learning_rate": 3.622268157675986e-06, + "loss": 0.0649, + "num_input_tokens_seen": 806672, + "step": 2005 + }, + { + "epoch": 2.0893970893970892, + "grad_norm": 0.3060367703437805, + "learning_rate": 3.614154818402339e-06, + "loss": 0.0186, + "num_input_tokens_seen": 808656, + "step": 2010 + }, + { + "epoch": 2.0945945945945947, + "grad_norm": 0.8559133410453796, + "learning_rate": 3.6060268098199656e-06, + "loss": 0.0494, + "num_input_tokens_seen": 810640, + "step": 2015 + }, + { + "epoch": 2.0997920997921, + "grad_norm": 0.40390461683273315, + "learning_rate": 3.5978842389447523e-06, + "loss": 0.0657, + "num_input_tokens_seen": 812688, + "step": 2020 + }, + { + "epoch": 2.104989604989605, + "grad_norm": 0.16403798758983612, + "learning_rate": 3.5897272129843198e-06, + "loss": 0.0206, + "num_input_tokens_seen": 814800, + "step": 2025 + }, + { + "epoch": 2.1101871101871104, + "grad_norm": 0.8833001255989075, + "learning_rate": 3.5815558393366064e-06, + "loss": 0.0252, + "num_input_tokens_seen": 816912, + "step": 2030 + }, + { + "epoch": 2.1153846153846154, + "grad_norm": 0.044099193066358566, + "learning_rate": 3.57337022558846e-06, + "loss": 0.1156, + "num_input_tokens_seen": 818896, + "step": 2035 + }, + { + "epoch": 2.1205821205821205, + "grad_norm": 20.8973445892334, + "learning_rate": 3.5651704795142137e-06, + "loss": 0.0855, + "num_input_tokens_seen": 820880, + "step": 2040 + }, + { + "epoch": 2.125779625779626, + "grad_norm": 21.765148162841797, + "learning_rate": 3.5569567090742763e-06, + "loss": 0.1594, + "num_input_tokens_seen": 822864, + "step": 2045 + }, + { + "epoch": 2.130977130977131, + "grad_norm": 2.817866325378418, + "learning_rate": 3.548729022413701e-06, + "loss": 0.0265, + "num_input_tokens_seen": 825040, + "step": 2050 + }, + { + "epoch": 2.136174636174636, + "grad_norm": 0.0856303721666336, + "learning_rate": 3.5404875278607693e-06, + "loss": 0.0995, + "num_input_tokens_seen": 827024, + "step": 2055 + }, + { + "epoch": 2.141372141372141, + "grad_norm": 0.09817512333393097, + "learning_rate": 3.5322323339255602e-06, + "loss": 0.072, + "num_input_tokens_seen": 829136, + "step": 2060 + }, + { + "epoch": 2.1465696465696467, + "grad_norm": 4.946967601776123, + "learning_rate": 3.5239635492985248e-06, + "loss": 0.0483, + "num_input_tokens_seen": 831184, + "step": 2065 + }, + { + "epoch": 2.1517671517671517, + "grad_norm": 0.04570393264293671, + "learning_rate": 3.5156812828490507e-06, + "loss": 0.0007, + "num_input_tokens_seen": 833168, + "step": 2070 + }, + { + "epoch": 2.156964656964657, + "grad_norm": 0.031534019857645035, + "learning_rate": 3.5073856436240335e-06, + "loss": 0.0685, + "num_input_tokens_seen": 835216, + "step": 2075 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.12049651145935059, + "learning_rate": 3.4990767408464383e-06, + "loss": 0.0004, + "num_input_tokens_seen": 837136, + "step": 2080 + }, + { + "epoch": 2.1673596673596673, + "grad_norm": 0.0549314022064209, + "learning_rate": 3.4907546839138627e-06, + "loss": 0.1832, + "num_input_tokens_seen": 839120, + "step": 2085 + }, + { + "epoch": 2.1725571725571724, + "grad_norm": 0.11109847575426102, + "learning_rate": 3.4824195823970954e-06, + "loss": 0.0608, + "num_input_tokens_seen": 841104, + "step": 2090 + }, + { + "epoch": 2.177754677754678, + "grad_norm": 9.288878440856934, + "learning_rate": 3.4740715460386732e-06, + "loss": 0.0894, + "num_input_tokens_seen": 843152, + "step": 2095 + }, + { + "epoch": 2.182952182952183, + "grad_norm": 0.1733572781085968, + "learning_rate": 3.46571068475144e-06, + "loss": 0.0972, + "num_input_tokens_seen": 845136, + "step": 2100 + }, + { + "epoch": 2.188149688149688, + "grad_norm": 0.4034070670604706, + "learning_rate": 3.457337108617094e-06, + "loss": 0.1887, + "num_input_tokens_seen": 847120, + "step": 2105 + }, + { + "epoch": 2.1933471933471935, + "grad_norm": 26.97930908203125, + "learning_rate": 3.4489509278847415e-06, + "loss": 0.2052, + "num_input_tokens_seen": 849168, + "step": 2110 + }, + { + "epoch": 2.1985446985446986, + "grad_norm": 74.03186798095703, + "learning_rate": 3.440552252969446e-06, + "loss": 0.0731, + "num_input_tokens_seen": 851152, + "step": 2115 + }, + { + "epoch": 2.2037422037422036, + "grad_norm": 3.1255908012390137, + "learning_rate": 3.432141194450772e-06, + "loss": 0.0078, + "num_input_tokens_seen": 853008, + "step": 2120 + }, + { + "epoch": 2.208939708939709, + "grad_norm": 0.36267173290252686, + "learning_rate": 3.4237178630713312e-06, + "loss": 0.0651, + "num_input_tokens_seen": 855120, + "step": 2125 + }, + { + "epoch": 2.214137214137214, + "grad_norm": 0.01146312803030014, + "learning_rate": 3.4152823697353237e-06, + "loss": 0.1599, + "num_input_tokens_seen": 857232, + "step": 2130 + }, + { + "epoch": 2.2193347193347193, + "grad_norm": 28.126773834228516, + "learning_rate": 3.4068348255070764e-06, + "loss": 0.057, + "num_input_tokens_seen": 859344, + "step": 2135 + }, + { + "epoch": 2.2245322245322248, + "grad_norm": 15.560206413269043, + "learning_rate": 3.3983753416095844e-06, + "loss": 0.0868, + "num_input_tokens_seen": 861328, + "step": 2140 + }, + { + "epoch": 2.22972972972973, + "grad_norm": 13.887256622314453, + "learning_rate": 3.3899040294230413e-06, + "loss": 0.2098, + "num_input_tokens_seen": 863376, + "step": 2145 + }, + { + "epoch": 2.234927234927235, + "grad_norm": 0.028023365885019302, + "learning_rate": 3.381421000483378e-06, + "loss": 0.0096, + "num_input_tokens_seen": 865424, + "step": 2150 + }, + { + "epoch": 2.24012474012474, + "grad_norm": 32.4400520324707, + "learning_rate": 3.37292636648079e-06, + "loss": 0.0981, + "num_input_tokens_seen": 867472, + "step": 2155 + }, + { + "epoch": 2.2453222453222454, + "grad_norm": 72.8097915649414, + "learning_rate": 3.3644202392582703e-06, + "loss": 0.1542, + "num_input_tokens_seen": 869584, + "step": 2160 + }, + { + "epoch": 2.2505197505197505, + "grad_norm": 1.8866627216339111, + "learning_rate": 3.3559027308101344e-06, + "loss": 0.0094, + "num_input_tokens_seen": 871568, + "step": 2165 + }, + { + "epoch": 2.2546777546777546, + "eval_loss": 0.30345332622528076, + "eval_runtime": 1.0686, + "eval_samples_per_second": 801.043, + "eval_steps_per_second": 100.13, + "num_input_tokens_seen": 873104, + "step": 2169 + }, + { + "epoch": 2.2557172557172556, + "grad_norm": 12.88599967956543, + "learning_rate": 3.3473739532805464e-06, + "loss": 0.0945, + "num_input_tokens_seen": 873488, + "step": 2170 + }, + { + "epoch": 2.260914760914761, + "grad_norm": 2.817438840866089, + "learning_rate": 3.3388340189620427e-06, + "loss": 0.1038, + "num_input_tokens_seen": 875472, + "step": 2175 + }, + { + "epoch": 2.266112266112266, + "grad_norm": 0.07076973468065262, + "learning_rate": 3.3302830402940534e-06, + "loss": 0.0275, + "num_input_tokens_seen": 877392, + "step": 2180 + }, + { + "epoch": 2.271309771309771, + "grad_norm": 0.515289306640625, + "learning_rate": 3.3217211298614225e-06, + "loss": 0.1037, + "num_input_tokens_seen": 879504, + "step": 2185 + }, + { + "epoch": 2.2765072765072767, + "grad_norm": 4.857708930969238, + "learning_rate": 3.313148400392925e-06, + "loss": 0.0551, + "num_input_tokens_seen": 881360, + "step": 2190 + }, + { + "epoch": 2.2817047817047817, + "grad_norm": 0.10649969428777695, + "learning_rate": 3.3045649647597814e-06, + "loss": 0.094, + "num_input_tokens_seen": 883280, + "step": 2195 + }, + { + "epoch": 2.286902286902287, + "grad_norm": 0.1783861368894577, + "learning_rate": 3.2959709359741743e-06, + "loss": 0.0053, + "num_input_tokens_seen": 885328, + "step": 2200 + }, + { + "epoch": 2.2920997920997923, + "grad_norm": 23.481298446655273, + "learning_rate": 3.2873664271877588e-06, + "loss": 0.0732, + "num_input_tokens_seen": 887312, + "step": 2205 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 54.74378967285156, + "learning_rate": 3.2787515516901717e-06, + "loss": 0.0574, + "num_input_tokens_seen": 889296, + "step": 2210 + }, + { + "epoch": 2.3024948024948024, + "grad_norm": 0.0618121400475502, + "learning_rate": 3.2701264229075443e-06, + "loss": 0.0007, + "num_input_tokens_seen": 891408, + "step": 2215 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.05290354788303375, + "learning_rate": 3.261491154401001e-06, + "loss": 0.001, + "num_input_tokens_seen": 893392, + "step": 2220 + }, + { + "epoch": 2.312889812889813, + "grad_norm": 21.230501174926758, + "learning_rate": 3.2528458598651735e-06, + "loss": 0.0047, + "num_input_tokens_seen": 895440, + "step": 2225 + }, + { + "epoch": 2.318087318087318, + "grad_norm": 2.141026258468628, + "learning_rate": 3.2441906531266963e-06, + "loss": 0.1493, + "num_input_tokens_seen": 897616, + "step": 2230 + }, + { + "epoch": 2.323284823284823, + "grad_norm": 35.35129165649414, + "learning_rate": 3.2355256481427145e-06, + "loss": 0.0359, + "num_input_tokens_seen": 899536, + "step": 2235 + }, + { + "epoch": 2.3284823284823286, + "grad_norm": 0.017625411972403526, + "learning_rate": 3.2268509589993745e-06, + "loss": 0.0408, + "num_input_tokens_seen": 901648, + "step": 2240 + }, + { + "epoch": 2.3336798336798337, + "grad_norm": 3.886178731918335, + "learning_rate": 3.218166699910332e-06, + "loss": 0.1054, + "num_input_tokens_seen": 903696, + "step": 2245 + }, + { + "epoch": 2.3388773388773387, + "grad_norm": 27.935588836669922, + "learning_rate": 3.209472985215243e-06, + "loss": 0.1455, + "num_input_tokens_seen": 905552, + "step": 2250 + }, + { + "epoch": 2.3440748440748442, + "grad_norm": 35.09407043457031, + "learning_rate": 3.2007699293782557e-06, + "loss": 0.0118, + "num_input_tokens_seen": 907472, + "step": 2255 + }, + { + "epoch": 2.3492723492723493, + "grad_norm": 58.58681869506836, + "learning_rate": 3.1920576469865115e-06, + "loss": 0.1043, + "num_input_tokens_seen": 909584, + "step": 2260 + }, + { + "epoch": 2.3544698544698544, + "grad_norm": 45.3444938659668, + "learning_rate": 3.183336252748627e-06, + "loss": 0.0544, + "num_input_tokens_seen": 911632, + "step": 2265 + }, + { + "epoch": 2.35966735966736, + "grad_norm": 0.2568526566028595, + "learning_rate": 3.1746058614931918e-06, + "loss": 0.0396, + "num_input_tokens_seen": 913616, + "step": 2270 + }, + { + "epoch": 2.364864864864865, + "grad_norm": 55.77798080444336, + "learning_rate": 3.16586658816725e-06, + "loss": 0.0559, + "num_input_tokens_seen": 915728, + "step": 2275 + }, + { + "epoch": 2.37006237006237, + "grad_norm": 0.06812699884176254, + "learning_rate": 3.157118547834793e-06, + "loss": 0.1154, + "num_input_tokens_seen": 917776, + "step": 2280 + }, + { + "epoch": 2.375259875259875, + "grad_norm": 0.2111903578042984, + "learning_rate": 3.1483618556752373e-06, + "loss": 0.1803, + "num_input_tokens_seen": 919952, + "step": 2285 + }, + { + "epoch": 2.3804573804573805, + "grad_norm": 0.02562599442899227, + "learning_rate": 3.139596626981916e-06, + "loss": 0.0648, + "num_input_tokens_seen": 921872, + "step": 2290 + }, + { + "epoch": 2.3856548856548856, + "grad_norm": 0.2788306176662445, + "learning_rate": 3.1308229771605546e-06, + "loss": 0.1079, + "num_input_tokens_seen": 923856, + "step": 2295 + }, + { + "epoch": 2.390852390852391, + "grad_norm": 10.78072738647461, + "learning_rate": 3.1220410217277546e-06, + "loss": 0.1516, + "num_input_tokens_seen": 925968, + "step": 2300 + }, + { + "epoch": 2.396049896049896, + "grad_norm": 3.1442511081695557, + "learning_rate": 3.1132508763094715e-06, + "loss": 0.0496, + "num_input_tokens_seen": 927888, + "step": 2305 + }, + { + "epoch": 2.401247401247401, + "grad_norm": 0.10060002654790878, + "learning_rate": 3.1044526566394924e-06, + "loss": 0.0691, + "num_input_tokens_seen": 929808, + "step": 2310 + }, + { + "epoch": 2.4064449064449063, + "grad_norm": 0.07174642384052277, + "learning_rate": 3.0956464785579125e-06, + "loss": 0.0009, + "num_input_tokens_seen": 931728, + "step": 2315 + }, + { + "epoch": 2.4116424116424118, + "grad_norm": 0.3574046790599823, + "learning_rate": 3.0868324580096113e-06, + "loss": 0.0309, + "num_input_tokens_seen": 933840, + "step": 2320 + }, + { + "epoch": 2.416839916839917, + "grad_norm": 0.8769842982292175, + "learning_rate": 3.078010711042723e-06, + "loss": 0.1115, + "num_input_tokens_seen": 935824, + "step": 2325 + }, + { + "epoch": 2.422037422037422, + "grad_norm": 0.044372253119945526, + "learning_rate": 3.069181353807111e-06, + "loss": 0.043, + "num_input_tokens_seen": 937872, + "step": 2330 + }, + { + "epoch": 2.4272349272349274, + "grad_norm": 0.3668450713157654, + "learning_rate": 3.0603445025528377e-06, + "loss": 0.098, + "num_input_tokens_seen": 939984, + "step": 2335 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 0.3943478465080261, + "learning_rate": 3.051500273628633e-06, + "loss": 0.0482, + "num_input_tokens_seen": 941968, + "step": 2340 + }, + { + "epoch": 2.4376299376299375, + "grad_norm": 40.15293502807617, + "learning_rate": 3.042648783480366e-06, + "loss": 0.0265, + "num_input_tokens_seen": 943952, + "step": 2345 + }, + { + "epoch": 2.442827442827443, + "grad_norm": 0.6149891018867493, + "learning_rate": 3.0337901486495073e-06, + "loss": 0.0727, + "num_input_tokens_seen": 945872, + "step": 2350 + }, + { + "epoch": 2.448024948024948, + "grad_norm": 0.031426846981048584, + "learning_rate": 3.0249244857715977e-06, + "loss": 0.1045, + "num_input_tokens_seen": 947856, + "step": 2355 + }, + { + "epoch": 2.453222453222453, + "grad_norm": 0.029239047318696976, + "learning_rate": 3.01605191157471e-06, + "loss": 0.0835, + "num_input_tokens_seen": 949840, + "step": 2360 + }, + { + "epoch": 2.4584199584199586, + "grad_norm": 43.117408752441406, + "learning_rate": 3.0071725428779152e-06, + "loss": 0.0307, + "num_input_tokens_seen": 951760, + "step": 2365 + }, + { + "epoch": 2.4636174636174637, + "grad_norm": 5.0166168212890625, + "learning_rate": 2.9982864965897423e-06, + "loss": 0.0294, + "num_input_tokens_seen": 953680, + "step": 2370 + }, + { + "epoch": 2.4688149688149688, + "grad_norm": 0.023427043110132217, + "learning_rate": 2.9893938897066392e-06, + "loss": 0.0349, + "num_input_tokens_seen": 955600, + "step": 2375 + }, + { + "epoch": 2.474012474012474, + "grad_norm": 0.23864233493804932, + "learning_rate": 2.9804948393114325e-06, + "loss": 0.2071, + "num_input_tokens_seen": 957456, + "step": 2380 + }, + { + "epoch": 2.4792099792099793, + "grad_norm": 0.4926133453845978, + "learning_rate": 2.9715894625717868e-06, + "loss": 0.0055, + "num_input_tokens_seen": 959504, + "step": 2385 + }, + { + "epoch": 2.4844074844074844, + "grad_norm": 0.32122310996055603, + "learning_rate": 2.9626778767386604e-06, + "loss": 0.0277, + "num_input_tokens_seen": 961488, + "step": 2390 + }, + { + "epoch": 2.4896049896049894, + "grad_norm": 0.13178545236587524, + "learning_rate": 2.953760199144764e-06, + "loss": 0.1288, + "num_input_tokens_seen": 963408, + "step": 2395 + }, + { + "epoch": 2.494802494802495, + "grad_norm": 0.05135256052017212, + "learning_rate": 2.9448365472030116e-06, + "loss": 0.0595, + "num_input_tokens_seen": 965392, + "step": 2400 + }, + { + "epoch": 2.5, + "grad_norm": 0.09723832458257675, + "learning_rate": 2.935907038404981e-06, + "loss": 0.0664, + "num_input_tokens_seen": 967440, + "step": 2405 + }, + { + "epoch": 2.505197505197505, + "grad_norm": 90.10528564453125, + "learning_rate": 2.9269717903193603e-06, + "loss": 0.0894, + "num_input_tokens_seen": 969360, + "step": 2410 + }, + { + "epoch": 2.505197505197505, + "eval_loss": 0.3648892641067505, + "eval_runtime": 1.0725, + "eval_samples_per_second": 798.114, + "eval_steps_per_second": 99.764, + "num_input_tokens_seen": 969360, + "step": 2410 + }, + { + "epoch": 2.51039501039501, + "grad_norm": 116.97930908203125, + "learning_rate": 2.918030920590403e-06, + "loss": 0.0082, + "num_input_tokens_seen": 971472, + "step": 2415 + }, + { + "epoch": 2.5155925155925156, + "grad_norm": 0.011047163046896458, + "learning_rate": 2.9090845469363804e-06, + "loss": 0.0006, + "num_input_tokens_seen": 973456, + "step": 2420 + }, + { + "epoch": 2.5207900207900207, + "grad_norm": 0.1614125818014145, + "learning_rate": 2.9001327871480296e-06, + "loss": 0.0004, + "num_input_tokens_seen": 975504, + "step": 2425 + }, + { + "epoch": 2.525987525987526, + "grad_norm": 0.01074185874313116, + "learning_rate": 2.8911757590870028e-06, + "loss": 0.0019, + "num_input_tokens_seen": 977552, + "step": 2430 + }, + { + "epoch": 2.5311850311850312, + "grad_norm": 173.61000061035156, + "learning_rate": 2.8822135806843156e-06, + "loss": 0.1355, + "num_input_tokens_seen": 979536, + "step": 2435 + }, + { + "epoch": 2.5363825363825363, + "grad_norm": 0.009233055636286736, + "learning_rate": 2.873246369938797e-06, + "loss": 0.084, + "num_input_tokens_seen": 981584, + "step": 2440 + }, + { + "epoch": 2.5415800415800414, + "grad_norm": 3.7363264560699463, + "learning_rate": 2.8642742449155287e-06, + "loss": 0.0365, + "num_input_tokens_seen": 983632, + "step": 2445 + }, + { + "epoch": 2.546777546777547, + "grad_norm": 13.669214248657227, + "learning_rate": 2.855297323744301e-06, + "loss": 0.1776, + "num_input_tokens_seen": 985680, + "step": 2450 + }, + { + "epoch": 2.551975051975052, + "grad_norm": 17.678695678710938, + "learning_rate": 2.8463157246180465e-06, + "loss": 0.0731, + "num_input_tokens_seen": 987664, + "step": 2455 + }, + { + "epoch": 2.5571725571725574, + "grad_norm": 0.007595015689730644, + "learning_rate": 2.8373295657912947e-06, + "loss": 0.0002, + "num_input_tokens_seen": 989648, + "step": 2460 + }, + { + "epoch": 2.5623700623700625, + "grad_norm": 0.08272235840559006, + "learning_rate": 2.828338965578603e-06, + "loss": 0.0005, + "num_input_tokens_seen": 991696, + "step": 2465 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 14.857572555541992, + "learning_rate": 2.8193440423530117e-06, + "loss": 0.2142, + "num_input_tokens_seen": 993616, + "step": 2470 + }, + { + "epoch": 2.5727650727650726, + "grad_norm": 155.0998077392578, + "learning_rate": 2.810344914544475e-06, + "loss": 0.0503, + "num_input_tokens_seen": 995664, + "step": 2475 + }, + { + "epoch": 2.577962577962578, + "grad_norm": 0.12371411919593811, + "learning_rate": 2.8013417006383078e-06, + "loss": 0.1017, + "num_input_tokens_seen": 997648, + "step": 2480 + }, + { + "epoch": 2.583160083160083, + "grad_norm": 0.05877931788563728, + "learning_rate": 2.792334519173624e-06, + "loss": 0.0796, + "num_input_tokens_seen": 999696, + "step": 2485 + }, + { + "epoch": 2.5883575883575882, + "grad_norm": 0.09183234721422195, + "learning_rate": 2.7833234887417745e-06, + "loss": 0.1002, + "num_input_tokens_seen": 1001680, + "step": 2490 + }, + { + "epoch": 2.5935550935550937, + "grad_norm": 0.1903308629989624, + "learning_rate": 2.774308727984787e-06, + "loss": 0.0836, + "num_input_tokens_seen": 1003728, + "step": 2495 + }, + { + "epoch": 2.598752598752599, + "grad_norm": 0.0603751540184021, + "learning_rate": 2.7652903555938047e-06, + "loss": 0.0495, + "num_input_tokens_seen": 1005584, + "step": 2500 + }, + { + "epoch": 2.603950103950104, + "grad_norm": 0.11998436599969864, + "learning_rate": 2.756268490307524e-06, + "loss": 0.061, + "num_input_tokens_seen": 1007696, + "step": 2505 + }, + { + "epoch": 2.609147609147609, + "grad_norm": 15.387810707092285, + "learning_rate": 2.747243250910625e-06, + "loss": 0.2945, + "num_input_tokens_seen": 1009680, + "step": 2510 + }, + { + "epoch": 2.6143451143451144, + "grad_norm": 0.8963765501976013, + "learning_rate": 2.7382147562322175e-06, + "loss": 0.0414, + "num_input_tokens_seen": 1011728, + "step": 2515 + }, + { + "epoch": 2.6195426195426195, + "grad_norm": 0.18006020784378052, + "learning_rate": 2.729183125144269e-06, + "loss": 0.0023, + "num_input_tokens_seen": 1013840, + "step": 2520 + }, + { + "epoch": 2.624740124740125, + "grad_norm": 33.4202880859375, + "learning_rate": 2.7201484765600426e-06, + "loss": 0.1403, + "num_input_tokens_seen": 1015824, + "step": 2525 + }, + { + "epoch": 2.62993762993763, + "grad_norm": 0.056018222123384476, + "learning_rate": 2.71111092943253e-06, + "loss": 0.1792, + "num_input_tokens_seen": 1017744, + "step": 2530 + }, + { + "epoch": 2.635135135135135, + "grad_norm": 0.27936848998069763, + "learning_rate": 2.702070602752887e-06, + "loss": 0.0616, + "num_input_tokens_seen": 1019728, + "step": 2535 + }, + { + "epoch": 2.64033264033264, + "grad_norm": 0.3187178075313568, + "learning_rate": 2.693027615548864e-06, + "loss": 0.0836, + "num_input_tokens_seen": 1021840, + "step": 2540 + }, + { + "epoch": 2.6455301455301456, + "grad_norm": 18.821897506713867, + "learning_rate": 2.6839820868832433e-06, + "loss": 0.0909, + "num_input_tokens_seen": 1023824, + "step": 2545 + }, + { + "epoch": 2.6507276507276507, + "grad_norm": 0.6045968532562256, + "learning_rate": 2.6749341358522675e-06, + "loss": 0.0143, + "num_input_tokens_seen": 1025616, + "step": 2550 + }, + { + "epoch": 2.6559251559251558, + "grad_norm": 0.18768300116062164, + "learning_rate": 2.665883881584072e-06, + "loss": 0.0105, + "num_input_tokens_seen": 1027664, + "step": 2555 + }, + { + "epoch": 2.6611226611226613, + "grad_norm": 0.04695185646414757, + "learning_rate": 2.6568314432371183e-06, + "loss": 0.0167, + "num_input_tokens_seen": 1029648, + "step": 2560 + }, + { + "epoch": 2.6663201663201663, + "grad_norm": 0.04115242138504982, + "learning_rate": 2.647776939998625e-06, + "loss": 0.0354, + "num_input_tokens_seen": 1031632, + "step": 2565 + }, + { + "epoch": 2.6715176715176714, + "grad_norm": 0.029054885730147362, + "learning_rate": 2.6387204910829954e-06, + "loss": 0.0416, + "num_input_tokens_seen": 1033488, + "step": 2570 + }, + { + "epoch": 2.6767151767151764, + "grad_norm": 12.681103706359863, + "learning_rate": 2.629662215730253e-06, + "loss": 0.0011, + "num_input_tokens_seen": 1035536, + "step": 2575 + }, + { + "epoch": 2.681912681912682, + "grad_norm": 0.039936427026987076, + "learning_rate": 2.620602233204467e-06, + "loss": 0.0636, + "num_input_tokens_seen": 1037584, + "step": 2580 + }, + { + "epoch": 2.687110187110187, + "grad_norm": 17.539627075195312, + "learning_rate": 2.6115406627921823e-06, + "loss": 0.1506, + "num_input_tokens_seen": 1039568, + "step": 2585 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 0.6394942998886108, + "learning_rate": 2.6024776238008543e-06, + "loss": 0.0269, + "num_input_tokens_seen": 1041616, + "step": 2590 + }, + { + "epoch": 2.6975051975051976, + "grad_norm": 0.01985405571758747, + "learning_rate": 2.5934132355572713e-06, + "loss": 0.1038, + "num_input_tokens_seen": 1043664, + "step": 2595 + }, + { + "epoch": 2.7027027027027026, + "grad_norm": 12.785253524780273, + "learning_rate": 2.5843476174059874e-06, + "loss": 0.159, + "num_input_tokens_seen": 1045520, + "step": 2600 + }, + { + "epoch": 2.7079002079002077, + "grad_norm": 0.04270019382238388, + "learning_rate": 2.575280888707748e-06, + "loss": 0.1412, + "num_input_tokens_seen": 1047376, + "step": 2605 + }, + { + "epoch": 2.713097713097713, + "grad_norm": 7.434192657470703, + "learning_rate": 2.5662131688379244e-06, + "loss": 0.0029, + "num_input_tokens_seen": 1049360, + "step": 2610 + }, + { + "epoch": 2.7182952182952183, + "grad_norm": 0.07113603502511978, + "learning_rate": 2.557144577184933e-06, + "loss": 0.054, + "num_input_tokens_seen": 1051344, + "step": 2615 + }, + { + "epoch": 2.7234927234927238, + "grad_norm": 0.07835633307695389, + "learning_rate": 2.5480752331486742e-06, + "loss": 0.0051, + "num_input_tokens_seen": 1053264, + "step": 2620 + }, + { + "epoch": 2.728690228690229, + "grad_norm": 0.1012062355875969, + "learning_rate": 2.539005256138948e-06, + "loss": 0.0494, + "num_input_tokens_seen": 1055248, + "step": 2625 + }, + { + "epoch": 2.733887733887734, + "grad_norm": 87.71424865722656, + "learning_rate": 2.529934765573893e-06, + "loss": 0.0155, + "num_input_tokens_seen": 1057104, + "step": 2630 + }, + { + "epoch": 2.739085239085239, + "grad_norm": 0.09936363995075226, + "learning_rate": 2.520863880878408e-06, + "loss": 0.0379, + "num_input_tokens_seen": 1059024, + "step": 2635 + }, + { + "epoch": 2.7442827442827444, + "grad_norm": 1.4544926881790161, + "learning_rate": 2.511792721482581e-06, + "loss": 0.2379, + "num_input_tokens_seen": 1060944, + "step": 2640 + }, + { + "epoch": 2.7494802494802495, + "grad_norm": 1.8390294313430786, + "learning_rate": 2.502721406820116e-06, + "loss": 0.038, + "num_input_tokens_seen": 1062992, + "step": 2645 + }, + { + "epoch": 2.7546777546777546, + "grad_norm": 0.23238161206245422, + "learning_rate": 2.493650056326763e-06, + "loss": 0.0705, + "num_input_tokens_seen": 1064848, + "step": 2650 + }, + { + "epoch": 2.7557172557172556, + "eval_loss": 0.306118369102478, + "eval_runtime": 1.0992, + "eval_samples_per_second": 778.717, + "eval_steps_per_second": 97.34, + "num_input_tokens_seen": 1065232, + "step": 2651 + }, + { + "epoch": 2.75987525987526, + "grad_norm": 1.4501862525939941, + "learning_rate": 2.4845787894387427e-06, + "loss": 0.2106, + "num_input_tokens_seen": 1066832, + "step": 2655 + }, + { + "epoch": 2.765072765072765, + "grad_norm": 0.20231160521507263, + "learning_rate": 2.4755077255911746e-06, + "loss": 0.0032, + "num_input_tokens_seen": 1068880, + "step": 2660 + }, + { + "epoch": 2.77027027027027, + "grad_norm": 12.596285820007324, + "learning_rate": 2.466436984216507e-06, + "loss": 0.151, + "num_input_tokens_seen": 1070864, + "step": 2665 + }, + { + "epoch": 2.7754677754677752, + "grad_norm": 2.4909775257110596, + "learning_rate": 2.4573666847429383e-06, + "loss": 0.1102, + "num_input_tokens_seen": 1072848, + "step": 2670 + }, + { + "epoch": 2.7806652806652807, + "grad_norm": 0.3123326301574707, + "learning_rate": 2.4482969465928545e-06, + "loss": 0.0628, + "num_input_tokens_seen": 1074832, + "step": 2675 + }, + { + "epoch": 2.785862785862786, + "grad_norm": 0.03955717012286186, + "learning_rate": 2.4392278891812457e-06, + "loss": 0.002, + "num_input_tokens_seen": 1076944, + "step": 2680 + }, + { + "epoch": 2.7910602910602913, + "grad_norm": 1.0874260663986206, + "learning_rate": 2.430159631914141e-06, + "loss": 0.0233, + "num_input_tokens_seen": 1078800, + "step": 2685 + }, + { + "epoch": 2.7962577962577964, + "grad_norm": 0.6165662407875061, + "learning_rate": 2.421092294187037e-06, + "loss": 0.1463, + "num_input_tokens_seen": 1080912, + "step": 2690 + }, + { + "epoch": 2.8014553014553014, + "grad_norm": 0.12875588238239288, + "learning_rate": 2.41202599538332e-06, + "loss": 0.0068, + "num_input_tokens_seen": 1082960, + "step": 2695 + }, + { + "epoch": 2.8066528066528065, + "grad_norm": 0.024786395952105522, + "learning_rate": 2.402960854872697e-06, + "loss": 0.0591, + "num_input_tokens_seen": 1085008, + "step": 2700 + }, + { + "epoch": 2.811850311850312, + "grad_norm": 0.05379832535982132, + "learning_rate": 2.39389699200963e-06, + "loss": 0.0729, + "num_input_tokens_seen": 1087184, + "step": 2705 + }, + { + "epoch": 2.817047817047817, + "grad_norm": 0.04001461714506149, + "learning_rate": 2.3848345261317523e-06, + "loss": 0.0013, + "num_input_tokens_seen": 1089104, + "step": 2710 + }, + { + "epoch": 2.822245322245322, + "grad_norm": 0.09780512005090714, + "learning_rate": 2.3757735765583083e-06, + "loss": 0.1587, + "num_input_tokens_seen": 1091024, + "step": 2715 + }, + { + "epoch": 2.8274428274428276, + "grad_norm": 0.06699176877737045, + "learning_rate": 2.3667142625885774e-06, + "loss": 0.0685, + "num_input_tokens_seen": 1093008, + "step": 2720 + }, + { + "epoch": 2.8326403326403327, + "grad_norm": 0.03752860054373741, + "learning_rate": 2.357656703500303e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1094992, + "step": 2725 + }, + { + "epoch": 2.8378378378378377, + "grad_norm": 0.09200336039066315, + "learning_rate": 2.3486010185481247e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1097040, + "step": 2730 + }, + { + "epoch": 2.8430353430353428, + "grad_norm": 0.3943188786506653, + "learning_rate": 2.3395473269620055e-06, + "loss": 0.1532, + "num_input_tokens_seen": 1098960, + "step": 2735 + }, + { + "epoch": 2.8482328482328483, + "grad_norm": 0.03474006429314613, + "learning_rate": 2.330495747945665e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1101200, + "step": 2740 + }, + { + "epoch": 2.8534303534303533, + "grad_norm": 0.25783851742744446, + "learning_rate": 2.321446400675005e-06, + "loss": 0.1635, + "num_input_tokens_seen": 1103120, + "step": 2745 + }, + { + "epoch": 2.858627858627859, + "grad_norm": 27.41282081604004, + "learning_rate": 2.3123994042965454e-06, + "loss": 0.0648, + "num_input_tokens_seen": 1105168, + "step": 2750 + }, + { + "epoch": 2.863825363825364, + "grad_norm": 0.015091204084455967, + "learning_rate": 2.3033548779258535e-06, + "loss": 0.0463, + "num_input_tokens_seen": 1107152, + "step": 2755 + }, + { + "epoch": 2.869022869022869, + "grad_norm": 48.32891845703125, + "learning_rate": 2.2943129406459754e-06, + "loss": 0.2765, + "num_input_tokens_seen": 1109200, + "step": 2760 + }, + { + "epoch": 2.874220374220374, + "grad_norm": 0.029947001487016678, + "learning_rate": 2.2852737115058684e-06, + "loss": 0.2216, + "num_input_tokens_seen": 1111248, + "step": 2765 + }, + { + "epoch": 2.8794178794178795, + "grad_norm": 54.1898307800293, + "learning_rate": 2.2762373095188344e-06, + "loss": 0.1188, + "num_input_tokens_seen": 1113232, + "step": 2770 + }, + { + "epoch": 2.8846153846153846, + "grad_norm": 10.942605972290039, + "learning_rate": 2.2672038536609487e-06, + "loss": 0.0557, + "num_input_tokens_seen": 1115216, + "step": 2775 + }, + { + "epoch": 2.88981288981289, + "grad_norm": 0.1424887627363205, + "learning_rate": 2.2581734628695034e-06, + "loss": 0.0011, + "num_input_tokens_seen": 1117264, + "step": 2780 + }, + { + "epoch": 2.895010395010395, + "grad_norm": 16.02339744567871, + "learning_rate": 2.2491462560414287e-06, + "loss": 0.1068, + "num_input_tokens_seen": 1119376, + "step": 2785 + }, + { + "epoch": 2.9002079002079, + "grad_norm": 1.1276624202728271, + "learning_rate": 2.2401223520317363e-06, + "loss": 0.1178, + "num_input_tokens_seen": 1121424, + "step": 2790 + }, + { + "epoch": 2.9054054054054053, + "grad_norm": 13.925151824951172, + "learning_rate": 2.2311018696519532e-06, + "loss": 0.0582, + "num_input_tokens_seen": 1123472, + "step": 2795 + }, + { + "epoch": 2.9106029106029108, + "grad_norm": 0.04868851974606514, + "learning_rate": 2.2220849276685533e-06, + "loss": 0.0007, + "num_input_tokens_seen": 1125584, + "step": 2800 + }, + { + "epoch": 2.915800415800416, + "grad_norm": 0.11241783946752548, + "learning_rate": 2.2130716448014e-06, + "loss": 0.0783, + "num_input_tokens_seen": 1127568, + "step": 2805 + }, + { + "epoch": 2.920997920997921, + "grad_norm": 0.37742850184440613, + "learning_rate": 2.2040621397221762e-06, + "loss": 0.0946, + "num_input_tokens_seen": 1129552, + "step": 2810 + }, + { + "epoch": 2.9261954261954264, + "grad_norm": 0.31173890829086304, + "learning_rate": 2.1950565310528264e-06, + "loss": 0.0011, + "num_input_tokens_seen": 1131472, + "step": 2815 + }, + { + "epoch": 2.9313929313929314, + "grad_norm": 0.06812157481908798, + "learning_rate": 2.186054937363996e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1133392, + "step": 2820 + }, + { + "epoch": 2.9365904365904365, + "grad_norm": 0.7941020727157593, + "learning_rate": 2.1770574771734644e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1135440, + "step": 2825 + }, + { + "epoch": 2.9417879417879416, + "grad_norm": 23.382444381713867, + "learning_rate": 2.168064268944591e-06, + "loss": 0.0037, + "num_input_tokens_seen": 1137424, + "step": 2830 + }, + { + "epoch": 2.946985446985447, + "grad_norm": 0.1391032636165619, + "learning_rate": 2.1590754310847513e-06, + "loss": 0.018, + "num_input_tokens_seen": 1139408, + "step": 2835 + }, + { + "epoch": 2.952182952182952, + "grad_norm": 0.01357136107981205, + "learning_rate": 2.150091081943777e-06, + "loss": 0.1722, + "num_input_tokens_seen": 1141456, + "step": 2840 + }, + { + "epoch": 2.9573804573804576, + "grad_norm": 0.020888514816761017, + "learning_rate": 2.141111339812405e-06, + "loss": 0.1002, + "num_input_tokens_seen": 1143440, + "step": 2845 + }, + { + "epoch": 2.9625779625779627, + "grad_norm": 0.11883700639009476, + "learning_rate": 2.1321363229207097e-06, + "loss": 0.0783, + "num_input_tokens_seen": 1145360, + "step": 2850 + }, + { + "epoch": 2.9677754677754677, + "grad_norm": 0.4169588088989258, + "learning_rate": 2.123166149436556e-06, + "loss": 0.1061, + "num_input_tokens_seen": 1147280, + "step": 2855 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 0.14435216784477234, + "learning_rate": 2.114200937464035e-06, + "loss": 0.1705, + "num_input_tokens_seen": 1149200, + "step": 2860 + }, + { + "epoch": 2.9781704781704783, + "grad_norm": 0.27636605501174927, + "learning_rate": 2.1052408050419153e-06, + "loss": 0.003, + "num_input_tokens_seen": 1151184, + "step": 2865 + }, + { + "epoch": 2.9833679833679834, + "grad_norm": 0.32042670249938965, + "learning_rate": 2.0962858701420867e-06, + "loss": 0.0952, + "num_input_tokens_seen": 1153232, + "step": 2870 + }, + { + "epoch": 2.9885654885654884, + "grad_norm": 12.104057312011719, + "learning_rate": 2.087336250668006e-06, + "loss": 0.1992, + "num_input_tokens_seen": 1155216, + "step": 2875 + }, + { + "epoch": 2.993762993762994, + "grad_norm": 0.12431977689266205, + "learning_rate": 2.0783920644531443e-06, + "loss": 0.1408, + "num_input_tokens_seen": 1157264, + "step": 2880 + }, + { + "epoch": 2.998960498960499, + "grad_norm": 0.302804559469223, + "learning_rate": 2.069453429259439e-06, + "loss": 0.2101, + "num_input_tokens_seen": 1159312, + "step": 2885 + }, + { + "epoch": 3.004158004158004, + "grad_norm": 0.18154755234718323, + "learning_rate": 2.06052046277574e-06, + "loss": 0.0016, + "num_input_tokens_seen": 1161248, + "step": 2890 + }, + { + "epoch": 3.006237006237006, + "eval_loss": 0.2698093056678772, + "eval_runtime": 1.0525, + "eval_samples_per_second": 813.316, + "eval_steps_per_second": 101.664, + "num_input_tokens_seen": 1162016, + "step": 2892 + }, + { + "epoch": 3.0093555093555096, + "grad_norm": 0.132065549492836, + "learning_rate": 2.051593282616262e-06, + "loss": 0.0012, + "num_input_tokens_seen": 1163168, + "step": 2895 + }, + { + "epoch": 3.0145530145530146, + "grad_norm": 0.12736886739730835, + "learning_rate": 2.0426720063190335e-06, + "loss": 0.0559, + "num_input_tokens_seen": 1165088, + "step": 2900 + }, + { + "epoch": 3.0197505197505197, + "grad_norm": 0.15903866291046143, + "learning_rate": 2.0337567513443518e-06, + "loss": 0.0012, + "num_input_tokens_seen": 1167136, + "step": 2905 + }, + { + "epoch": 3.024948024948025, + "grad_norm": 0.06871409714221954, + "learning_rate": 2.0248476350732368e-06, + "loss": 0.046, + "num_input_tokens_seen": 1169120, + "step": 2910 + }, + { + "epoch": 3.0301455301455302, + "grad_norm": 6.423719882965088, + "learning_rate": 2.0159447748058803e-06, + "loss": 0.0235, + "num_input_tokens_seen": 1171040, + "step": 2915 + }, + { + "epoch": 3.0353430353430353, + "grad_norm": 0.08531547337770462, + "learning_rate": 2.007048287760113e-06, + "loss": 0.1135, + "num_input_tokens_seen": 1173024, + "step": 2920 + }, + { + "epoch": 3.0405405405405403, + "grad_norm": 0.07908215373754501, + "learning_rate": 1.998158291069845e-06, + "loss": 0.0007, + "num_input_tokens_seen": 1174944, + "step": 2925 + }, + { + "epoch": 3.045738045738046, + "grad_norm": 15.975863456726074, + "learning_rate": 1.989274901783538e-06, + "loss": 0.009, + "num_input_tokens_seen": 1177056, + "step": 2930 + }, + { + "epoch": 3.050935550935551, + "grad_norm": 0.03592640534043312, + "learning_rate": 1.9803982368626582e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1178976, + "step": 2935 + }, + { + "epoch": 3.056133056133056, + "grad_norm": 0.09219586849212646, + "learning_rate": 1.9715284131801353e-06, + "loss": 0.0007, + "num_input_tokens_seen": 1181024, + "step": 2940 + }, + { + "epoch": 3.0613305613305615, + "grad_norm": 0.022503485903143883, + "learning_rate": 1.9626655475188237e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1183008, + "step": 2945 + }, + { + "epoch": 3.0665280665280665, + "grad_norm": 0.01857338473200798, + "learning_rate": 1.953809756569971e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1185056, + "step": 2950 + }, + { + "epoch": 3.0717255717255716, + "grad_norm": 0.03196537122130394, + "learning_rate": 1.9449611569316716e-06, + "loss": 0.0623, + "num_input_tokens_seen": 1186976, + "step": 2955 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 0.016185106709599495, + "learning_rate": 1.936119865107341e-06, + "loss": 0.1065, + "num_input_tokens_seen": 1188960, + "step": 2960 + }, + { + "epoch": 3.082120582120582, + "grad_norm": 0.03204691782593727, + "learning_rate": 1.9272859975041757e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1190944, + "step": 2965 + }, + { + "epoch": 3.087318087318087, + "grad_norm": 0.03665002062916756, + "learning_rate": 1.918459670431622e-06, + "loss": 0.0381, + "num_input_tokens_seen": 1192928, + "step": 2970 + }, + { + "epoch": 3.0925155925155927, + "grad_norm": 0.01817925274372101, + "learning_rate": 1.9096410000998478e-06, + "loss": 0.0045, + "num_input_tokens_seen": 1194848, + "step": 2975 + }, + { + "epoch": 3.0977130977130978, + "grad_norm": 0.28329595923423767, + "learning_rate": 1.9008301026182064e-06, + "loss": 0.0019, + "num_input_tokens_seen": 1196768, + "step": 2980 + }, + { + "epoch": 3.102910602910603, + "grad_norm": 0.016043463721871376, + "learning_rate": 1.892027093993716e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1198688, + "step": 2985 + }, + { + "epoch": 3.108108108108108, + "grad_norm": 0.028100663796067238, + "learning_rate": 1.883232090129523e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1200672, + "step": 2990 + }, + { + "epoch": 3.1133056133056134, + "grad_norm": 0.06433451920747757, + "learning_rate": 1.8744452068233826e-06, + "loss": 0.0713, + "num_input_tokens_seen": 1202720, + "step": 2995 + }, + { + "epoch": 3.1185031185031185, + "grad_norm": 0.017661597579717636, + "learning_rate": 1.8656665597661334e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1204768, + "step": 3000 + }, + { + "epoch": 3.1237006237006235, + "grad_norm": 0.02003006637096405, + "learning_rate": 1.8568962645401702e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1206944, + "step": 3005 + }, + { + "epoch": 3.128898128898129, + "grad_norm": 0.017040062695741653, + "learning_rate": 1.8481344366179284e-06, + "loss": 0.095, + "num_input_tokens_seen": 1209056, + "step": 3010 + }, + { + "epoch": 3.134095634095634, + "grad_norm": 0.030846811830997467, + "learning_rate": 1.8393811913603583e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1210976, + "step": 3015 + }, + { + "epoch": 3.139293139293139, + "grad_norm": 0.029971648007631302, + "learning_rate": 1.8306366440154067e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1213024, + "step": 3020 + }, + { + "epoch": 3.1444906444906446, + "grad_norm": 0.033417243510484695, + "learning_rate": 1.8219009097165042e-06, + "loss": 0.0302, + "num_input_tokens_seen": 1215136, + "step": 3025 + }, + { + "epoch": 3.1496881496881497, + "grad_norm": 0.3327726423740387, + "learning_rate": 1.8131741034810436e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1217056, + "step": 3030 + }, + { + "epoch": 3.1548856548856548, + "grad_norm": 0.030325112864375114, + "learning_rate": 1.8044563402088686e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1219168, + "step": 3035 + }, + { + "epoch": 3.1600831600831603, + "grad_norm": 0.0346427820622921, + "learning_rate": 1.7957477346807622e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1221088, + "step": 3040 + }, + { + "epoch": 3.1652806652806653, + "grad_norm": 0.013777323067188263, + "learning_rate": 1.7870484015569306e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1223264, + "step": 3045 + }, + { + "epoch": 3.1704781704781704, + "grad_norm": 0.006741571240127087, + "learning_rate": 1.7783584553755007e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1225440, + "step": 3050 + }, + { + "epoch": 3.175675675675676, + "grad_norm": 0.1481812596321106, + "learning_rate": 1.769678010551003e-06, + "loss": 0.0864, + "num_input_tokens_seen": 1227424, + "step": 3055 + }, + { + "epoch": 3.180873180873181, + "grad_norm": 0.15245255827903748, + "learning_rate": 1.7610071813728741e-06, + "loss": 0.0793, + "num_input_tokens_seen": 1229344, + "step": 3060 + }, + { + "epoch": 3.186070686070686, + "grad_norm": 6.337021827697754, + "learning_rate": 1.7523460820039466e-06, + "loss": 0.0974, + "num_input_tokens_seen": 1231456, + "step": 3065 + }, + { + "epoch": 3.1912681912681915, + "grad_norm": 0.27218034863471985, + "learning_rate": 1.7436948264789465e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1233440, + "step": 3070 + }, + { + "epoch": 3.1964656964656966, + "grad_norm": 0.02221021242439747, + "learning_rate": 1.7350535287029957e-06, + "loss": 0.0779, + "num_input_tokens_seen": 1235552, + "step": 3075 + }, + { + "epoch": 3.2016632016632016, + "grad_norm": 0.04011327400803566, + "learning_rate": 1.7264223024501064e-06, + "loss": 0.152, + "num_input_tokens_seen": 1237536, + "step": 3080 + }, + { + "epoch": 3.2068607068607067, + "grad_norm": 0.02025497704744339, + "learning_rate": 1.717801261361685e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1239584, + "step": 3085 + }, + { + "epoch": 3.212058212058212, + "grad_norm": 1.1895182132720947, + "learning_rate": 1.7091905189450425e-06, + "loss": 0.0013, + "num_input_tokens_seen": 1241504, + "step": 3090 + }, + { + "epoch": 3.2172557172557172, + "grad_norm": 251.4841766357422, + "learning_rate": 1.700590188571887e-06, + "loss": 0.0375, + "num_input_tokens_seen": 1243552, + "step": 3095 + }, + { + "epoch": 3.2224532224532223, + "grad_norm": 0.022575953975319862, + "learning_rate": 1.6920003834768438e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1245600, + "step": 3100 + }, + { + "epoch": 3.227650727650728, + "grad_norm": 0.019538020715117455, + "learning_rate": 1.6834212167559578e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1247712, + "step": 3105 + }, + { + "epoch": 3.232848232848233, + "grad_norm": 102.0543212890625, + "learning_rate": 1.6748528013652032e-06, + "loss": 0.031, + "num_input_tokens_seen": 1249696, + "step": 3110 + }, + { + "epoch": 3.238045738045738, + "grad_norm": 0.1734342724084854, + "learning_rate": 1.6662952501190032e-06, + "loss": 0.0647, + "num_input_tokens_seen": 1251808, + "step": 3115 + }, + { + "epoch": 3.2432432432432434, + "grad_norm": 106.23870086669922, + "learning_rate": 1.6577486756887376e-06, + "loss": 0.0462, + "num_input_tokens_seen": 1253728, + "step": 3120 + }, + { + "epoch": 3.2484407484407485, + "grad_norm": 0.021452955901622772, + "learning_rate": 1.6492131906012608e-06, + "loss": 0.0289, + "num_input_tokens_seen": 1255840, + "step": 3125 + }, + { + "epoch": 3.2536382536382535, + "grad_norm": 0.07759949564933777, + "learning_rate": 1.640688907237425e-06, + "loss": 0.0469, + "num_input_tokens_seen": 1257888, + "step": 3130 + }, + { + "epoch": 3.2567567567567566, + "eval_loss": 0.36025160551071167, + "eval_runtime": 1.086, + "eval_samples_per_second": 788.217, + "eval_steps_per_second": 98.527, + "num_input_tokens_seen": 1259168, + "step": 3133 + }, + { + "epoch": 3.258835758835759, + "grad_norm": 17.87605857849121, + "learning_rate": 1.632175937830594e-06, + "loss": 0.0767, + "num_input_tokens_seen": 1259936, + "step": 3135 + }, + { + "epoch": 3.264033264033264, + "grad_norm": 0.020596951246261597, + "learning_rate": 1.6236743944651703e-06, + "loss": 0.0504, + "num_input_tokens_seen": 1262112, + "step": 3140 + }, + { + "epoch": 3.269230769230769, + "grad_norm": 20.961088180541992, + "learning_rate": 1.6151843890751172e-06, + "loss": 0.0185, + "num_input_tokens_seen": 1263904, + "step": 3145 + }, + { + "epoch": 3.274428274428274, + "grad_norm": 0.048750557005405426, + "learning_rate": 1.6067060334424836e-06, + "loss": 0.0131, + "num_input_tokens_seen": 1265952, + "step": 3150 + }, + { + "epoch": 3.2796257796257797, + "grad_norm": 0.010466455481946468, + "learning_rate": 1.5982394391959382e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1267872, + "step": 3155 + }, + { + "epoch": 3.284823284823285, + "grad_norm": 70.06320190429688, + "learning_rate": 1.5897847178092902e-06, + "loss": 0.0937, + "num_input_tokens_seen": 1269792, + "step": 3160 + }, + { + "epoch": 3.29002079002079, + "grad_norm": 1.3579819202423096, + "learning_rate": 1.5813419806000329e-06, + "loss": 0.0014, + "num_input_tokens_seen": 1271776, + "step": 3165 + }, + { + "epoch": 3.2952182952182953, + "grad_norm": 0.01165761612355709, + "learning_rate": 1.5729113387278675e-06, + "loss": 0.0785, + "num_input_tokens_seen": 1273760, + "step": 3170 + }, + { + "epoch": 3.3004158004158004, + "grad_norm": 0.04458131268620491, + "learning_rate": 1.5644929031932455e-06, + "loss": 0.1213, + "num_input_tokens_seen": 1275808, + "step": 3175 + }, + { + "epoch": 3.3056133056133055, + "grad_norm": 0.052004504948854446, + "learning_rate": 1.556086784835908e-06, + "loss": 0.0576, + "num_input_tokens_seen": 1277792, + "step": 3180 + }, + { + "epoch": 3.310810810810811, + "grad_norm": 0.08415602892637253, + "learning_rate": 1.547693094333421e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1279776, + "step": 3185 + }, + { + "epoch": 3.316008316008316, + "grad_norm": 38.53419494628906, + "learning_rate": 1.5393119421997252e-06, + "loss": 0.1482, + "num_input_tokens_seen": 1281760, + "step": 3190 + }, + { + "epoch": 3.321205821205821, + "grad_norm": 0.08246491849422455, + "learning_rate": 1.5309434387836737e-06, + "loss": 0.0042, + "num_input_tokens_seen": 1283744, + "step": 3195 + }, + { + "epoch": 3.3264033264033266, + "grad_norm": 0.12744662165641785, + "learning_rate": 1.5225876942675844e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1285792, + "step": 3200 + }, + { + "epoch": 3.3316008316008316, + "grad_norm": 19.751220703125, + "learning_rate": 1.514244818665788e-06, + "loss": 0.0525, + "num_input_tokens_seen": 1287776, + "step": 3205 + }, + { + "epoch": 3.3367983367983367, + "grad_norm": 0.04199739545583725, + "learning_rate": 1.505914921823178e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1289696, + "step": 3210 + }, + { + "epoch": 3.3419958419958418, + "grad_norm": 0.19491150975227356, + "learning_rate": 1.497598113413766e-06, + "loss": 0.0006, + "num_input_tokens_seen": 1291680, + "step": 3215 + }, + { + "epoch": 3.3471933471933473, + "grad_norm": 0.019337935373187065, + "learning_rate": 1.489294502939238e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1293536, + "step": 3220 + }, + { + "epoch": 3.3523908523908523, + "grad_norm": 0.01680462807416916, + "learning_rate": 1.4810041997275094e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1295712, + "step": 3225 + }, + { + "epoch": 3.357588357588358, + "grad_norm": 0.025287525728344917, + "learning_rate": 1.4727273129312918e-06, + "loss": 0.0008, + "num_input_tokens_seen": 1297760, + "step": 3230 + }, + { + "epoch": 3.362785862785863, + "grad_norm": 0.015904569998383522, + "learning_rate": 1.4644639515266484e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1299808, + "step": 3235 + }, + { + "epoch": 3.367983367983368, + "grad_norm": 0.02150922454893589, + "learning_rate": 1.4562142243115646e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1301920, + "step": 3240 + }, + { + "epoch": 3.373180873180873, + "grad_norm": 0.014152280054986477, + "learning_rate": 1.4479782399045152e-06, + "loss": 0.0054, + "num_input_tokens_seen": 1303904, + "step": 3245 + }, + { + "epoch": 3.3783783783783785, + "grad_norm": 0.002619291888549924, + "learning_rate": 1.43975610674303e-06, + "loss": 0.0424, + "num_input_tokens_seen": 1305888, + "step": 3250 + }, + { + "epoch": 3.3835758835758836, + "grad_norm": 16.205442428588867, + "learning_rate": 1.4315479330822711e-06, + "loss": 0.1061, + "num_input_tokens_seen": 1308064, + "step": 3255 + }, + { + "epoch": 3.3887733887733886, + "grad_norm": 0.017430748790502548, + "learning_rate": 1.4233538269936042e-06, + "loss": 0.0016, + "num_input_tokens_seen": 1310048, + "step": 3260 + }, + { + "epoch": 3.393970893970894, + "grad_norm": 0.022695308551192284, + "learning_rate": 1.415173896363178e-06, + "loss": 0.1162, + "num_input_tokens_seen": 1311968, + "step": 3265 + }, + { + "epoch": 3.399168399168399, + "grad_norm": 0.009034757502377033, + "learning_rate": 1.4070082488905034e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1313888, + "step": 3270 + }, + { + "epoch": 3.4043659043659042, + "grad_norm": 0.014441024512052536, + "learning_rate": 1.3988569920870315e-06, + "loss": 0.0648, + "num_input_tokens_seen": 1316064, + "step": 3275 + }, + { + "epoch": 3.4095634095634098, + "grad_norm": 0.019770730286836624, + "learning_rate": 1.3907202332747454e-06, + "loss": 0.0011, + "num_input_tokens_seen": 1318112, + "step": 3280 + }, + { + "epoch": 3.414760914760915, + "grad_norm": 0.18654315173625946, + "learning_rate": 1.3825980795847401e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1319968, + "step": 3285 + }, + { + "epoch": 3.41995841995842, + "grad_norm": 0.016684433445334435, + "learning_rate": 1.3744906379558165e-06, + "loss": 0.038, + "num_input_tokens_seen": 1322016, + "step": 3290 + }, + { + "epoch": 3.4251559251559254, + "grad_norm": 0.012927900068461895, + "learning_rate": 1.3663980151330734e-06, + "loss": 0.0009, + "num_input_tokens_seen": 1323936, + "step": 3295 + }, + { + "epoch": 3.4303534303534304, + "grad_norm": 0.034205105155706406, + "learning_rate": 1.358320317666496e-06, + "loss": 0.0241, + "num_input_tokens_seen": 1325920, + "step": 3300 + }, + { + "epoch": 3.4355509355509355, + "grad_norm": 0.016045430675148964, + "learning_rate": 1.350257651909562e-06, + "loss": 0.0668, + "num_input_tokens_seen": 1327840, + "step": 3305 + }, + { + "epoch": 3.4407484407484406, + "grad_norm": 0.01693640649318695, + "learning_rate": 1.3422101240178365e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1329760, + "step": 3310 + }, + { + "epoch": 3.445945945945946, + "grad_norm": 0.12216309458017349, + "learning_rate": 1.3341778399475714e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1331744, + "step": 3315 + }, + { + "epoch": 3.451143451143451, + "grad_norm": 72.96051025390625, + "learning_rate": 1.3261609054543178e-06, + "loss": 0.0278, + "num_input_tokens_seen": 1333792, + "step": 3320 + }, + { + "epoch": 3.456340956340956, + "grad_norm": 0.03497939929366112, + "learning_rate": 1.3181594260915263e-06, + "loss": 0.0412, + "num_input_tokens_seen": 1335776, + "step": 3325 + }, + { + "epoch": 3.4615384615384617, + "grad_norm": 0.28867605328559875, + "learning_rate": 1.3101735072091624e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1337824, + "step": 3330 + }, + { + "epoch": 3.4667359667359667, + "grad_norm": 0.010723100043833256, + "learning_rate": 1.3022032539523177e-06, + "loss": 0.0311, + "num_input_tokens_seen": 1339872, + "step": 3335 + }, + { + "epoch": 3.471933471933472, + "grad_norm": 0.06669893115758896, + "learning_rate": 1.2942487712598234e-06, + "loss": 0.0937, + "num_input_tokens_seen": 1341920, + "step": 3340 + }, + { + "epoch": 3.4771309771309773, + "grad_norm": 41.53193664550781, + "learning_rate": 1.2863101638628716e-06, + "loss": 0.0176, + "num_input_tokens_seen": 1343904, + "step": 3345 + }, + { + "epoch": 3.4823284823284824, + "grad_norm": 0.002587628783658147, + "learning_rate": 1.2783875362836373e-06, + "loss": 0.0738, + "num_input_tokens_seen": 1345952, + "step": 3350 + }, + { + "epoch": 3.4875259875259874, + "grad_norm": 0.01352632511407137, + "learning_rate": 1.2704809928338957e-06, + "loss": 0.0394, + "num_input_tokens_seen": 1348128, + "step": 3355 + }, + { + "epoch": 3.492723492723493, + "grad_norm": 0.01383188832551241, + "learning_rate": 1.2625906376136582e-06, + "loss": 0.0012, + "num_input_tokens_seen": 1350048, + "step": 3360 + }, + { + "epoch": 3.497920997920998, + "grad_norm": 0.03481636196374893, + "learning_rate": 1.2547165745097927e-06, + "loss": 0.1121, + "num_input_tokens_seen": 1351968, + "step": 3365 + }, + { + "epoch": 3.503118503118503, + "grad_norm": 0.007815233431756496, + "learning_rate": 1.2468589071946632e-06, + "loss": 0.0682, + "num_input_tokens_seen": 1353952, + "step": 3370 + }, + { + "epoch": 3.507276507276507, + "eval_loss": 0.4127735495567322, + "eval_runtime": 1.0486, + "eval_samples_per_second": 816.351, + "eval_steps_per_second": 102.044, + "num_input_tokens_seen": 1355552, + "step": 3374 + }, + { + "epoch": 3.508316008316008, + "grad_norm": 0.027879195287823677, + "learning_rate": 1.2390177391247616e-06, + "loss": 0.0726, + "num_input_tokens_seen": 1356000, + "step": 3375 + }, + { + "epoch": 3.5135135135135136, + "grad_norm": 26.8095760345459, + "learning_rate": 1.2311931735393417e-06, + "loss": 0.1161, + "num_input_tokens_seen": 1357984, + "step": 3380 + }, + { + "epoch": 3.5187110187110187, + "grad_norm": 0.03682544827461243, + "learning_rate": 1.2233853134590698e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1359904, + "step": 3385 + }, + { + "epoch": 3.523908523908524, + "grad_norm": 0.015444116666913033, + "learning_rate": 1.2155942616846562e-06, + "loss": 0.0385, + "num_input_tokens_seen": 1361952, + "step": 3390 + }, + { + "epoch": 3.529106029106029, + "grad_norm": 0.0405961312353611, + "learning_rate": 1.2078201207955122e-06, + "loss": 0.1318, + "num_input_tokens_seen": 1364000, + "step": 3395 + }, + { + "epoch": 3.5343035343035343, + "grad_norm": 0.05229797586798668, + "learning_rate": 1.2000629931483947e-06, + "loss": 0.0008, + "num_input_tokens_seen": 1366112, + "step": 3400 + }, + { + "epoch": 3.5395010395010393, + "grad_norm": 0.09235959500074387, + "learning_rate": 1.1923229808760565e-06, + "loss": 0.0016, + "num_input_tokens_seen": 1368096, + "step": 3405 + }, + { + "epoch": 3.544698544698545, + "grad_norm": 0.044506847858428955, + "learning_rate": 1.1846001858859054e-06, + "loss": 0.0661, + "num_input_tokens_seen": 1370208, + "step": 3410 + }, + { + "epoch": 3.54989604989605, + "grad_norm": 0.27768710255622864, + "learning_rate": 1.1768947098586628e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1372192, + "step": 3415 + }, + { + "epoch": 3.555093555093555, + "grad_norm": 0.05268951505422592, + "learning_rate": 1.1692066542470202e-06, + "loss": 0.0171, + "num_input_tokens_seen": 1374240, + "step": 3420 + }, + { + "epoch": 3.5602910602910605, + "grad_norm": 0.04955555871129036, + "learning_rate": 1.1615361202743088e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1376160, + "step": 3425 + }, + { + "epoch": 3.5654885654885655, + "grad_norm": 0.020622428506612778, + "learning_rate": 1.1538832089331628e-06, + "loss": 0.0008, + "num_input_tokens_seen": 1378208, + "step": 3430 + }, + { + "epoch": 3.5706860706860706, + "grad_norm": 0.04335801303386688, + "learning_rate": 1.1462480209841928e-06, + "loss": 0.0007, + "num_input_tokens_seen": 1380192, + "step": 3435 + }, + { + "epoch": 3.5758835758835756, + "grad_norm": 0.053323231637477875, + "learning_rate": 1.1386306569546578e-06, + "loss": 0.0491, + "num_input_tokens_seen": 1382368, + "step": 3440 + }, + { + "epoch": 3.581081081081081, + "grad_norm": 0.09630803763866425, + "learning_rate": 1.1310312171371394e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1384608, + "step": 3445 + }, + { + "epoch": 3.586278586278586, + "grad_norm": 0.24890074133872986, + "learning_rate": 1.123449801588226e-06, + "loss": 0.1426, + "num_input_tokens_seen": 1386592, + "step": 3450 + }, + { + "epoch": 3.5914760914760917, + "grad_norm": 51.86346435546875, + "learning_rate": 1.1158865101271906e-06, + "loss": 0.098, + "num_input_tokens_seen": 1388448, + "step": 3455 + }, + { + "epoch": 3.5966735966735968, + "grad_norm": 0.017590023577213287, + "learning_rate": 1.1083414423346807e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1390560, + "step": 3460 + }, + { + "epoch": 3.601871101871102, + "grad_norm": 0.01884053274989128, + "learning_rate": 1.100814697551406e-06, + "loss": 0.0977, + "num_input_tokens_seen": 1392736, + "step": 3465 + }, + { + "epoch": 3.607068607068607, + "grad_norm": 173.05203247070312, + "learning_rate": 1.0933063748768254e-06, + "loss": 0.1036, + "num_input_tokens_seen": 1394720, + "step": 3470 + }, + { + "epoch": 3.6122661122661124, + "grad_norm": 0.04371850937604904, + "learning_rate": 1.0858165731678514e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1396640, + "step": 3475 + }, + { + "epoch": 3.6174636174636174, + "grad_norm": 78.75630187988281, + "learning_rate": 1.0783453910375423e-06, + "loss": 0.0528, + "num_input_tokens_seen": 1398752, + "step": 3480 + }, + { + "epoch": 3.6226611226611225, + "grad_norm": 17.215036392211914, + "learning_rate": 1.0708929268538034e-06, + "loss": 0.0787, + "num_input_tokens_seen": 1400800, + "step": 3485 + }, + { + "epoch": 3.627858627858628, + "grad_norm": 0.05456389859318733, + "learning_rate": 1.0634592787380964e-06, + "loss": 0.0007, + "num_input_tokens_seen": 1402720, + "step": 3490 + }, + { + "epoch": 3.633056133056133, + "grad_norm": 0.06369329243898392, + "learning_rate": 1.0560445445641423e-06, + "loss": 0.0827, + "num_input_tokens_seen": 1404704, + "step": 3495 + }, + { + "epoch": 3.638253638253638, + "grad_norm": 0.02703475020825863, + "learning_rate": 1.048648821956637e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1406560, + "step": 3500 + }, + { + "epoch": 3.643451143451143, + "grad_norm": 0.0234812144190073, + "learning_rate": 1.0412722082899647e-06, + "loss": 0.0586, + "num_input_tokens_seen": 1408544, + "step": 3505 + }, + { + "epoch": 3.6486486486486487, + "grad_norm": 0.03321904316544533, + "learning_rate": 1.033914800686912e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1410464, + "step": 3510 + }, + { + "epoch": 3.6538461538461537, + "grad_norm": 0.021713286638259888, + "learning_rate": 1.0265766960173964e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1412448, + "step": 3515 + }, + { + "epoch": 3.6590436590436592, + "grad_norm": 19.148372650146484, + "learning_rate": 1.019257990897185e-06, + "loss": 0.042, + "num_input_tokens_seen": 1414688, + "step": 3520 + }, + { + "epoch": 3.6642411642411643, + "grad_norm": 13.719977378845215, + "learning_rate": 1.0119587816866258e-06, + "loss": 0.0036, + "num_input_tokens_seen": 1416672, + "step": 3525 + }, + { + "epoch": 3.6694386694386694, + "grad_norm": 0.012155055068433285, + "learning_rate": 1.0046791644893757e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1418592, + "step": 3530 + }, + { + "epoch": 3.6746361746361744, + "grad_norm": 0.015267685987055302, + "learning_rate": 9.97419235151137e-07, + "loss": 0.0004, + "num_input_tokens_seen": 1420576, + "step": 3535 + }, + { + "epoch": 3.67983367983368, + "grad_norm": 0.4185558259487152, + "learning_rate": 9.901790892583973e-07, + "loss": 0.0005, + "num_input_tokens_seen": 1422560, + "step": 3540 + }, + { + "epoch": 3.685031185031185, + "grad_norm": 0.01660173013806343, + "learning_rate": 9.829588221371694e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1424608, + "step": 3545 + }, + { + "epoch": 3.6902286902286905, + "grad_norm": 0.06823495030403137, + "learning_rate": 9.757585288517329e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1426784, + "step": 3550 + }, + { + "epoch": 3.6954261954261955, + "grad_norm": 0.010435913689434528, + "learning_rate": 9.6857830420339e-07, + "loss": 0.0507, + "num_input_tokens_seen": 1428896, + "step": 3555 + }, + { + "epoch": 3.7006237006237006, + "grad_norm": 0.03763195872306824, + "learning_rate": 9.614182427292076e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1430880, + "step": 3560 + }, + { + "epoch": 3.7058212058212057, + "grad_norm": 0.07442791014909744, + "learning_rate": 9.54278438700785e-07, + "loss": 0.0706, + "num_input_tokens_seen": 1432864, + "step": 3565 + }, + { + "epoch": 3.711018711018711, + "grad_norm": 13.558998107910156, + "learning_rate": 9.471589861229999e-07, + "loss": 0.0558, + "num_input_tokens_seen": 1434912, + "step": 3570 + }, + { + "epoch": 3.7162162162162162, + "grad_norm": 0.03634670376777649, + "learning_rate": 9.400599787327774e-07, + "loss": 0.0451, + "num_input_tokens_seen": 1436832, + "step": 3575 + }, + { + "epoch": 3.7214137214137213, + "grad_norm": 0.015272362157702446, + "learning_rate": 9.329815099978567e-07, + "loss": 0.0456, + "num_input_tokens_seen": 1438752, + "step": 3580 + }, + { + "epoch": 3.726611226611227, + "grad_norm": 0.06222844123840332, + "learning_rate": 9.259236731155583e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1440672, + "step": 3585 + }, + { + "epoch": 3.731808731808732, + "grad_norm": 0.31334197521209717, + "learning_rate": 9.188865610115572e-07, + "loss": 0.0311, + "num_input_tokens_seen": 1442784, + "step": 3590 + }, + { + "epoch": 3.737006237006237, + "grad_norm": 51.054107666015625, + "learning_rate": 9.118702663386583e-07, + "loss": 0.0596, + "num_input_tokens_seen": 1444960, + "step": 3595 + }, + { + "epoch": 3.742203742203742, + "grad_norm": 33.01020431518555, + "learning_rate": 9.048748814755783e-07, + "loss": 0.0648, + "num_input_tokens_seen": 1446880, + "step": 3600 + }, + { + "epoch": 3.7474012474012475, + "grad_norm": 0.032987091690301895, + "learning_rate": 8.979004985257294e-07, + "loss": 0.0394, + "num_input_tokens_seen": 1448992, + "step": 3605 + }, + { + "epoch": 3.7525987525987525, + "grad_norm": 0.09595970064401627, + "learning_rate": 8.909472093160066e-07, + "loss": 0.0295, + "num_input_tokens_seen": 1450976, + "step": 3610 + }, + { + "epoch": 3.757796257796258, + "grad_norm": 1.557525396347046, + "learning_rate": 8.840151053955773e-07, + "loss": 0.0128, + "num_input_tokens_seen": 1453088, + "step": 3615 + }, + { + "epoch": 3.757796257796258, + "eval_loss": 0.36968719959259033, + "eval_runtime": 1.2334, + "eval_samples_per_second": 694.025, + "eval_steps_per_second": 86.753, + "num_input_tokens_seen": 1453088, + "step": 3615 + }, + { + "epoch": 3.762993762993763, + "grad_norm": 0.020010627806186676, + "learning_rate": 8.771042780346767e-07, + "loss": 0.0014, + "num_input_tokens_seen": 1455136, + "step": 3620 + }, + { + "epoch": 3.768191268191268, + "grad_norm": 12.859967231750488, + "learning_rate": 8.702148182234043e-07, + "loss": 0.1087, + "num_input_tokens_seen": 1457120, + "step": 3625 + }, + { + "epoch": 3.773388773388773, + "grad_norm": 0.03449089452624321, + "learning_rate": 8.633468166705336e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1459168, + "step": 3630 + }, + { + "epoch": 3.7785862785862787, + "grad_norm": 0.013074683956801891, + "learning_rate": 8.565003638023065e-07, + "loss": 0.0061, + "num_input_tokens_seen": 1461152, + "step": 3635 + }, + { + "epoch": 3.7837837837837838, + "grad_norm": 0.00507075572386384, + "learning_rate": 8.496755497612491e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1463136, + "step": 3640 + }, + { + "epoch": 3.788981288981289, + "grad_norm": 0.010262695141136646, + "learning_rate": 8.42872464404986e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1465120, + "step": 3645 + }, + { + "epoch": 3.7941787941787943, + "grad_norm": 4.041860103607178, + "learning_rate": 8.360911973050537e-07, + "loss": 0.0322, + "num_input_tokens_seen": 1467104, + "step": 3650 + }, + { + "epoch": 3.7993762993762994, + "grad_norm": 0.005001334939152002, + "learning_rate": 8.29331837745724e-07, + "loss": 0.0004, + "num_input_tokens_seen": 1469152, + "step": 3655 + }, + { + "epoch": 3.8045738045738045, + "grad_norm": 17.126569747924805, + "learning_rate": 8.225944747228257e-07, + "loss": 0.1215, + "num_input_tokens_seen": 1471264, + "step": 3660 + }, + { + "epoch": 3.8097713097713095, + "grad_norm": 0.0037782315630465746, + "learning_rate": 8.158791969425739e-07, + "loss": 0.0868, + "num_input_tokens_seen": 1473248, + "step": 3665 + }, + { + "epoch": 3.814968814968815, + "grad_norm": 0.027992993593215942, + "learning_rate": 8.091860928204048e-07, + "loss": 0.0009, + "num_input_tokens_seen": 1475360, + "step": 3670 + }, + { + "epoch": 3.82016632016632, + "grad_norm": 0.006942141801118851, + "learning_rate": 8.025152504798078e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1477472, + "step": 3675 + }, + { + "epoch": 3.8253638253638256, + "grad_norm": 19.416587829589844, + "learning_rate": 7.958667577511684e-07, + "loss": 0.0912, + "num_input_tokens_seen": 1479328, + "step": 3680 + }, + { + "epoch": 3.8305613305613306, + "grad_norm": 0.010084366425871849, + "learning_rate": 7.892407021706064e-07, + "loss": 0.0447, + "num_input_tokens_seen": 1481248, + "step": 3685 + }, + { + "epoch": 3.8357588357588357, + "grad_norm": 0.02589116431772709, + "learning_rate": 7.826371709788314e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1483168, + "step": 3690 + }, + { + "epoch": 3.8409563409563408, + "grad_norm": 0.12098560482263565, + "learning_rate": 7.760562511199881e-07, + "loss": 0.0007, + "num_input_tokens_seen": 1485152, + "step": 3695 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 0.0336734913289547, + "learning_rate": 7.694980292405122e-07, + "loss": 0.0407, + "num_input_tokens_seen": 1487200, + "step": 3700 + }, + { + "epoch": 3.8513513513513513, + "grad_norm": 0.08973251283168793, + "learning_rate": 7.629625916879932e-07, + "loss": 0.0294, + "num_input_tokens_seen": 1489184, + "step": 3705 + }, + { + "epoch": 3.856548856548857, + "grad_norm": 17.128236770629883, + "learning_rate": 7.564500245100326e-07, + "loss": 0.0046, + "num_input_tokens_seen": 1491168, + "step": 3710 + }, + { + "epoch": 3.861746361746362, + "grad_norm": 0.03917059302330017, + "learning_rate": 7.49960413453115e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1493216, + "step": 3715 + }, + { + "epoch": 3.866943866943867, + "grad_norm": 0.022577917203307152, + "learning_rate": 7.434938439614781e-07, + "loss": 0.0738, + "num_input_tokens_seen": 1495200, + "step": 3720 + }, + { + "epoch": 3.872141372141372, + "grad_norm": 0.04259275645017624, + "learning_rate": 7.370504011759855e-07, + "loss": 0.0047, + "num_input_tokens_seen": 1497184, + "step": 3725 + }, + { + "epoch": 3.8773388773388775, + "grad_norm": 39.698997497558594, + "learning_rate": 7.306301699330065e-07, + "loss": 0.0633, + "num_input_tokens_seen": 1499040, + "step": 3730 + }, + { + "epoch": 3.8825363825363826, + "grad_norm": 21.861370086669922, + "learning_rate": 7.242332347633052e-07, + "loss": 0.0354, + "num_input_tokens_seen": 1501024, + "step": 3735 + }, + { + "epoch": 3.8877338877338876, + "grad_norm": 0.0236463975161314, + "learning_rate": 7.17859679890916e-07, + "loss": 0.042, + "num_input_tokens_seen": 1503072, + "step": 3740 + }, + { + "epoch": 3.892931392931393, + "grad_norm": 0.09350544959306717, + "learning_rate": 7.115095892320456e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1505248, + "step": 3745 + }, + { + "epoch": 3.898128898128898, + "grad_norm": 0.004034217447042465, + "learning_rate": 7.051830463939605e-07, + "loss": 0.0084, + "num_input_tokens_seen": 1507296, + "step": 3750 + }, + { + "epoch": 3.9033264033264032, + "grad_norm": 0.026631083339452744, + "learning_rate": 6.988801346738911e-07, + "loss": 0.0226, + "num_input_tokens_seen": 1509344, + "step": 3755 + }, + { + "epoch": 3.9085239085239083, + "grad_norm": 0.008157435804605484, + "learning_rate": 6.926009370579334e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1511456, + "step": 3760 + }, + { + "epoch": 3.913721413721414, + "grad_norm": 72.86700439453125, + "learning_rate": 6.863455362199542e-07, + "loss": 0.0235, + "num_input_tokens_seen": 1513440, + "step": 3765 + }, + { + "epoch": 3.918918918918919, + "grad_norm": 0.05969979614019394, + "learning_rate": 6.801140145205071e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1515488, + "step": 3770 + }, + { + "epoch": 3.9241164241164244, + "grad_norm": 4.924336910247803, + "learning_rate": 6.739064540057425e-07, + "loss": 0.0065, + "num_input_tokens_seen": 1517408, + "step": 3775 + }, + { + "epoch": 3.9293139293139294, + "grad_norm": 0.07060942053794861, + "learning_rate": 6.677229364063329e-07, + "loss": 0.0335, + "num_input_tokens_seen": 1519392, + "step": 3780 + }, + { + "epoch": 3.9345114345114345, + "grad_norm": 0.025277776643633842, + "learning_rate": 6.615635431363943e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1521440, + "step": 3785 + }, + { + "epoch": 3.9397089397089395, + "grad_norm": 22.37493896484375, + "learning_rate": 6.554283552924118e-07, + "loss": 0.0844, + "num_input_tokens_seen": 1523488, + "step": 3790 + }, + { + "epoch": 3.944906444906445, + "grad_norm": 0.008414591662585735, + "learning_rate": 6.493174536521768e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1525600, + "step": 3795 + }, + { + "epoch": 3.95010395010395, + "grad_norm": 4.057095527648926, + "learning_rate": 6.43230918673721e-07, + "loss": 0.0715, + "num_input_tokens_seen": 1527584, + "step": 3800 + }, + { + "epoch": 3.955301455301455, + "grad_norm": 0.2397640198469162, + "learning_rate": 6.371688304942544e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1529504, + "step": 3805 + }, + { + "epoch": 3.9604989604989607, + "grad_norm": 0.024253297597169876, + "learning_rate": 6.311312689291166e-07, + "loss": 0.0805, + "num_input_tokens_seen": 1531424, + "step": 3810 + }, + { + "epoch": 3.9656964656964657, + "grad_norm": 0.006427168846130371, + "learning_rate": 6.251183134707183e-07, + "loss": 0.0, + "num_input_tokens_seen": 1533408, + "step": 3815 + }, + { + "epoch": 3.970893970893971, + "grad_norm": 22.389490127563477, + "learning_rate": 6.191300432875017e-07, + "loss": 0.1432, + "num_input_tokens_seen": 1535392, + "step": 3820 + }, + { + "epoch": 3.976091476091476, + "grad_norm": 42.83168029785156, + "learning_rate": 6.13166537222894e-07, + "loss": 0.0178, + "num_input_tokens_seen": 1537312, + "step": 3825 + }, + { + "epoch": 3.9812889812889813, + "grad_norm": 34.80426788330078, + "learning_rate": 6.072278737942691e-07, + "loss": 0.0611, + "num_input_tokens_seen": 1539360, + "step": 3830 + }, + { + "epoch": 3.9864864864864864, + "grad_norm": 0.005531808827072382, + "learning_rate": 6.013141311919168e-07, + "loss": 0.0019, + "num_input_tokens_seen": 1541280, + "step": 3835 + }, + { + "epoch": 3.991683991683992, + "grad_norm": 0.09399595111608505, + "learning_rate": 5.954253872780102e-07, + "loss": 0.0644, + "num_input_tokens_seen": 1543136, + "step": 3840 + }, + { + "epoch": 3.996881496881497, + "grad_norm": 0.004355916753411293, + "learning_rate": 5.895617195855827e-07, + "loss": 0.1091, + "num_input_tokens_seen": 1545120, + "step": 3845 + }, + { + "epoch": 4.002079002079002, + "grad_norm": 0.013024209067225456, + "learning_rate": 5.837232053175065e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1547056, + "step": 3850 + }, + { + "epoch": 4.007276507276507, + "grad_norm": 0.05919545143842697, + "learning_rate": 5.77909921345475e-07, + "loss": 0.0238, + "num_input_tokens_seen": 1548976, + "step": 3855 + }, + { + "epoch": 4.008316008316008, + "eval_loss": 0.3716074526309967, + "eval_runtime": 1.0785, + "eval_samples_per_second": 793.715, + "eval_steps_per_second": 99.214, + "num_input_tokens_seen": 1549360, + "step": 3856 + }, + { + "epoch": 4.012474012474012, + "grad_norm": 0.22275064885616302, + "learning_rate": 5.721219442089925e-07, + "loss": 0.0133, + "num_input_tokens_seen": 1550960, + "step": 3860 + }, + { + "epoch": 4.017671517671518, + "grad_norm": 11.842212677001953, + "learning_rate": 5.663593501143663e-07, + "loss": 0.011, + "num_input_tokens_seen": 1552944, + "step": 3865 + }, + { + "epoch": 4.022869022869023, + "grad_norm": 0.035551466047763824, + "learning_rate": 5.606222149337004e-07, + "loss": 0.0378, + "num_input_tokens_seen": 1554992, + "step": 3870 + }, + { + "epoch": 4.028066528066528, + "grad_norm": 0.21466241776943207, + "learning_rate": 5.549106142039018e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1557104, + "step": 3875 + }, + { + "epoch": 4.033264033264033, + "grad_norm": 0.010968453250825405, + "learning_rate": 5.492246231256798e-07, + "loss": 0.0008, + "num_input_tokens_seen": 1559088, + "step": 3880 + }, + { + "epoch": 4.038461538461538, + "grad_norm": 0.0740390494465828, + "learning_rate": 5.435643165625615e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1561008, + "step": 3885 + }, + { + "epoch": 4.043659043659043, + "grad_norm": 0.03413901478052139, + "learning_rate": 5.379297690399035e-07, + "loss": 0.0007, + "num_input_tokens_seen": 1563056, + "step": 3890 + }, + { + "epoch": 4.048856548856548, + "grad_norm": 0.023828689008951187, + "learning_rate": 5.323210547439089e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1565040, + "step": 3895 + }, + { + "epoch": 4.054054054054054, + "grad_norm": 0.02368989959359169, + "learning_rate": 5.267382475206548e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1567024, + "step": 3900 + }, + { + "epoch": 4.0592515592515594, + "grad_norm": 0.1620592474937439, + "learning_rate": 5.21181420875117e-07, + "loss": 0.0003, + "num_input_tokens_seen": 1569136, + "step": 3905 + }, + { + "epoch": 4.0644490644490645, + "grad_norm": 0.013055311515927315, + "learning_rate": 5.15650647970202e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1571120, + "step": 3910 + }, + { + "epoch": 4.06964656964657, + "grad_norm": 0.005612197332084179, + "learning_rate": 5.101460016257858e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1573040, + "step": 3915 + }, + { + "epoch": 4.074844074844075, + "grad_norm": 0.016595976427197456, + "learning_rate": 5.046675543177531e-07, + "loss": 0.0005, + "num_input_tokens_seen": 1574896, + "step": 3920 + }, + { + "epoch": 4.08004158004158, + "grad_norm": 0.05645221471786499, + "learning_rate": 4.992153781770448e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1576880, + "step": 3925 + }, + { + "epoch": 4.085239085239086, + "grad_norm": 0.02893124334514141, + "learning_rate": 4.937895449887076e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1578864, + "step": 3930 + }, + { + "epoch": 4.090436590436591, + "grad_norm": 0.010248606093227863, + "learning_rate": 4.883901261909466e-07, + "loss": 0.0, + "num_input_tokens_seen": 1580848, + "step": 3935 + }, + { + "epoch": 4.095634095634096, + "grad_norm": 0.019447464495897293, + "learning_rate": 4.830171928741901e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1582704, + "step": 3940 + }, + { + "epoch": 4.100831600831601, + "grad_norm": 0.15405897796154022, + "learning_rate": 4.776708157801463e-07, + "loss": 0.0008, + "num_input_tokens_seen": 1584816, + "step": 3945 + }, + { + "epoch": 4.106029106029106, + "grad_norm": 8.753682136535645, + "learning_rate": 4.723510653008809e-07, + "loss": 0.0387, + "num_input_tokens_seen": 1586800, + "step": 3950 + }, + { + "epoch": 4.111226611226611, + "grad_norm": 0.06123171001672745, + "learning_rate": 4.6705801147788136e-07, + "loss": 0.081, + "num_input_tokens_seen": 1588720, + "step": 3955 + }, + { + "epoch": 4.116424116424117, + "grad_norm": 0.004952425602823496, + "learning_rate": 4.617917240011394e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1590576, + "step": 3960 + }, + { + "epoch": 4.121621621621622, + "grad_norm": 0.00792229175567627, + "learning_rate": 4.5655227220823355e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1592496, + "step": 3965 + }, + { + "epoch": 4.126819126819127, + "grad_norm": 0.013923810794949532, + "learning_rate": 4.513397250834159e-07, + "loss": 0.0123, + "num_input_tokens_seen": 1594544, + "step": 3970 + }, + { + "epoch": 4.132016632016632, + "grad_norm": 0.029175899922847748, + "learning_rate": 4.461541512567011e-07, + "loss": 0.0007, + "num_input_tokens_seen": 1596400, + "step": 3975 + }, + { + "epoch": 4.137214137214137, + "grad_norm": 0.04299869015812874, + "learning_rate": 4.409956190029674e-07, + "loss": 0.0585, + "num_input_tokens_seen": 1598320, + "step": 3980 + }, + { + "epoch": 4.142411642411642, + "grad_norm": 36.72762680053711, + "learning_rate": 4.358641962410537e-07, + "loss": 0.0202, + "num_input_tokens_seen": 1600368, + "step": 3985 + }, + { + "epoch": 4.147609147609147, + "grad_norm": 0.005658295005559921, + "learning_rate": 4.3075995053286716e-07, + "loss": 0.0, + "num_input_tokens_seen": 1602352, + "step": 3990 + }, + { + "epoch": 4.152806652806653, + "grad_norm": 0.00978625938296318, + "learning_rate": 4.2568294908249486e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1604336, + "step": 3995 + }, + { + "epoch": 4.158004158004158, + "grad_norm": 0.005897314287722111, + "learning_rate": 4.2063325873531485e-07, + "loss": 0.0, + "num_input_tokens_seen": 1606256, + "step": 4000 + }, + { + "epoch": 4.163201663201663, + "grad_norm": 0.059251993894577026, + "learning_rate": 4.156109459771215e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1608304, + "step": 4005 + }, + { + "epoch": 4.168399168399168, + "grad_norm": 0.004152240231633186, + "learning_rate": 4.106160769332443e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1610480, + "step": 4010 + }, + { + "epoch": 4.173596673596673, + "grad_norm": 0.047246526926755905, + "learning_rate": 4.056487173676843e-07, + "loss": 0.0382, + "num_input_tokens_seen": 1612528, + "step": 4015 + }, + { + "epoch": 4.1787941787941785, + "grad_norm": 0.026120582595467567, + "learning_rate": 4.0070893268224055e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1614576, + "step": 4020 + }, + { + "epoch": 4.183991683991684, + "grad_norm": 0.012839434668421745, + "learning_rate": 3.9579678791565323e-07, + "loss": 0.0, + "num_input_tokens_seen": 1616624, + "step": 4025 + }, + { + "epoch": 4.1891891891891895, + "grad_norm": 16.232559204101562, + "learning_rate": 3.9091234774274873e-07, + "loss": 0.0378, + "num_input_tokens_seen": 1618672, + "step": 4030 + }, + { + "epoch": 4.1943866943866945, + "grad_norm": 0.0076831188052892685, + "learning_rate": 3.8605567647358426e-07, + "loss": 0.0029, + "num_input_tokens_seen": 1620784, + "step": 4035 + }, + { + "epoch": 4.1995841995842, + "grad_norm": 0.009812161326408386, + "learning_rate": 3.812268380526046e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1622768, + "step": 4040 + }, + { + "epoch": 4.204781704781705, + "grad_norm": 0.12099117040634155, + "learning_rate": 3.764258960577971e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1624688, + "step": 4045 + }, + { + "epoch": 4.20997920997921, + "grad_norm": 0.005353657063096762, + "learning_rate": 3.7165291369985616e-07, + "loss": 0.0004, + "num_input_tokens_seen": 1626672, + "step": 4050 + }, + { + "epoch": 4.215176715176715, + "grad_norm": 0.001504407380707562, + "learning_rate": 3.6690795382135184e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1628848, + "step": 4055 + }, + { + "epoch": 4.220374220374221, + "grad_norm": 0.009774814359843731, + "learning_rate": 3.6219107889590154e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1630832, + "step": 4060 + }, + { + "epoch": 4.225571725571726, + "grad_norm": 0.00985631812363863, + "learning_rate": 3.575023510273462e-07, + "loss": 0.0007, + "num_input_tokens_seen": 1632880, + "step": 4065 + }, + { + "epoch": 4.230769230769231, + "grad_norm": 0.01718440279364586, + "learning_rate": 3.528418319489349e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1634992, + "step": 4070 + }, + { + "epoch": 4.235966735966736, + "grad_norm": 0.021337008103728294, + "learning_rate": 3.48209583022511e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1636912, + "step": 4075 + }, + { + "epoch": 4.241164241164241, + "grad_norm": 0.03264433145523071, + "learning_rate": 3.436056652377043e-07, + "loss": 0.0, + "num_input_tokens_seen": 1638832, + "step": 4080 + }, + { + "epoch": 4.246361746361746, + "grad_norm": 0.028791099786758423, + "learning_rate": 3.3903013921112753e-07, + "loss": 0.056, + "num_input_tokens_seen": 1641072, + "step": 4085 + }, + { + "epoch": 4.251559251559252, + "grad_norm": 0.00902112852782011, + "learning_rate": 3.3448306518557795e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1642992, + "step": 4090 + }, + { + "epoch": 4.256756756756757, + "grad_norm": 0.0031842426396906376, + "learning_rate": 3.299645030292467e-07, + "loss": 0.0, + "num_input_tokens_seen": 1645040, + "step": 4095 + }, + { + "epoch": 4.258835758835759, + "eval_loss": 0.4492134153842926, + "eval_runtime": 1.0401, + "eval_samples_per_second": 823.003, + "eval_steps_per_second": 102.875, + "num_input_tokens_seen": 1645808, + "step": 4097 + }, + { + "epoch": 4.261954261954262, + "grad_norm": 0.008271156810224056, + "learning_rate": 3.254745122349279e-07, + "loss": 0.0, + "num_input_tokens_seen": 1647024, + "step": 4100 + }, + { + "epoch": 4.267151767151767, + "grad_norm": 0.009126213379204273, + "learning_rate": 3.2101315191923667e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1649008, + "step": 4105 + }, + { + "epoch": 4.272349272349272, + "grad_norm": 0.008243863470852375, + "learning_rate": 3.1658048082182926e-07, + "loss": 0.0003, + "num_input_tokens_seen": 1651056, + "step": 4110 + }, + { + "epoch": 4.277546777546777, + "grad_norm": 0.016346026211977005, + "learning_rate": 3.1217655730463094e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1653104, + "step": 4115 + }, + { + "epoch": 4.282744282744282, + "grad_norm": 0.014476928859949112, + "learning_rate": 3.078014393510695e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1655344, + "step": 4120 + }, + { + "epoch": 4.287941787941788, + "grad_norm": 0.00862564891576767, + "learning_rate": 3.0345518456530666e-07, + "loss": 0.042, + "num_input_tokens_seen": 1657392, + "step": 4125 + }, + { + "epoch": 4.293139293139293, + "grad_norm": 0.011305141262710094, + "learning_rate": 2.9913785017148563e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1659312, + "step": 4130 + }, + { + "epoch": 4.298336798336798, + "grad_norm": 17.00044822692871, + "learning_rate": 2.9484949301297166e-07, + "loss": 0.0557, + "num_input_tokens_seen": 1661424, + "step": 4135 + }, + { + "epoch": 4.303534303534303, + "grad_norm": 0.0021855896338820457, + "learning_rate": 2.905901695516092e-07, + "loss": 0.0239, + "num_input_tokens_seen": 1663408, + "step": 4140 + }, + { + "epoch": 4.3087318087318085, + "grad_norm": 0.005250105168670416, + "learning_rate": 2.8635993586697555e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1665328, + "step": 4145 + }, + { + "epoch": 4.313929313929314, + "grad_norm": 0.02172735519707203, + "learning_rate": 2.8215884765564197e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1667312, + "step": 4150 + }, + { + "epoch": 4.3191268191268195, + "grad_norm": 0.3306088447570801, + "learning_rate": 2.779869602304416e-07, + "loss": 0.0003, + "num_input_tokens_seen": 1669296, + "step": 4155 + }, + { + "epoch": 4.324324324324325, + "grad_norm": 0.0034492157865315676, + "learning_rate": 2.73844328519742e-07, + "loss": 0.0003, + "num_input_tokens_seen": 1671280, + "step": 4160 + }, + { + "epoch": 4.32952182952183, + "grad_norm": 0.3147349953651428, + "learning_rate": 2.6973100706672e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1673456, + "step": 4165 + }, + { + "epoch": 4.334719334719335, + "grad_norm": 0.0011071843327954412, + "learning_rate": 2.656470500286451e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1675504, + "step": 4170 + }, + { + "epoch": 4.33991683991684, + "grad_norm": 0.00639357278123498, + "learning_rate": 2.615925111761647e-07, + "loss": 0.0, + "num_input_tokens_seen": 1677488, + "step": 4175 + }, + { + "epoch": 4.345114345114345, + "grad_norm": 0.00608447939157486, + "learning_rate": 2.575674438925974e-07, + "loss": 0.0633, + "num_input_tokens_seen": 1679536, + "step": 4180 + }, + { + "epoch": 4.350311850311851, + "grad_norm": 84.28992462158203, + "learning_rate": 2.535719011732321e-07, + "loss": 0.0875, + "num_input_tokens_seen": 1681520, + "step": 4185 + }, + { + "epoch": 4.355509355509356, + "grad_norm": 13.161194801330566, + "learning_rate": 2.4960593562462496e-07, + "loss": 0.0372, + "num_input_tokens_seen": 1683568, + "step": 4190 + }, + { + "epoch": 4.360706860706861, + "grad_norm": 0.006262065842747688, + "learning_rate": 2.4566959946391246e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1685488, + "step": 4195 + }, + { + "epoch": 4.365904365904366, + "grad_norm": 0.010419082827866077, + "learning_rate": 2.4176294451811936e-07, + "loss": 0.0341, + "num_input_tokens_seen": 1687408, + "step": 4200 + }, + { + "epoch": 4.371101871101871, + "grad_norm": 0.0028410113882273436, + "learning_rate": 2.378860222234794e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1689520, + "step": 4205 + }, + { + "epoch": 4.376299376299376, + "grad_norm": 0.00360031402669847, + "learning_rate": 2.3403888362475784e-07, + "loss": 0.0003, + "num_input_tokens_seen": 1691568, + "step": 4210 + }, + { + "epoch": 4.381496881496881, + "grad_norm": 0.006181271746754646, + "learning_rate": 2.3022157937457628e-07, + "loss": 0.0, + "num_input_tokens_seen": 1693616, + "step": 4215 + }, + { + "epoch": 4.386694386694387, + "grad_norm": 0.014252823777496815, + "learning_rate": 2.2643415973275017e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1695600, + "step": 4220 + }, + { + "epoch": 4.391891891891892, + "grad_norm": 0.008665296249091625, + "learning_rate": 2.226766745656231e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1697584, + "step": 4225 + }, + { + "epoch": 4.397089397089397, + "grad_norm": 0.004776356276124716, + "learning_rate": 2.1894917334541355e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1699568, + "step": 4230 + }, + { + "epoch": 4.402286902286902, + "grad_norm": 0.01509240921586752, + "learning_rate": 2.15251705149562e-07, + "loss": 0.0017, + "num_input_tokens_seen": 1701744, + "step": 4235 + }, + { + "epoch": 4.407484407484407, + "grad_norm": 0.002179246162995696, + "learning_rate": 2.11584318660083e-07, + "loss": 0.0, + "num_input_tokens_seen": 1703600, + "step": 4240 + }, + { + "epoch": 4.412681912681912, + "grad_norm": 0.01842692494392395, + "learning_rate": 2.0794706216292815e-07, + "loss": 0.0613, + "num_input_tokens_seen": 1705712, + "step": 4245 + }, + { + "epoch": 4.417879417879418, + "grad_norm": 0.007841149345040321, + "learning_rate": 2.043399835473475e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1707696, + "step": 4250 + }, + { + "epoch": 4.423076923076923, + "grad_norm": 0.006627705413848162, + "learning_rate": 2.0076313030525845e-07, + "loss": 0.0012, + "num_input_tokens_seen": 1709744, + "step": 4255 + }, + { + "epoch": 4.428274428274428, + "grad_norm": 0.0027992126997560263, + "learning_rate": 1.9721654953062412e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1711792, + "step": 4260 + }, + { + "epoch": 4.4334719334719335, + "grad_norm": 0.02479691430926323, + "learning_rate": 1.937002879188285e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1713904, + "step": 4265 + }, + { + "epoch": 4.4386694386694385, + "grad_norm": 0.011448011733591557, + "learning_rate": 1.9021439176606565e-07, + "loss": 0.0, + "num_input_tokens_seen": 1715824, + "step": 4270 + }, + { + "epoch": 4.443866943866944, + "grad_norm": 0.01309084240347147, + "learning_rate": 1.8675890696872838e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1717808, + "step": 4275 + }, + { + "epoch": 4.4490644490644495, + "grad_norm": 16.534465789794922, + "learning_rate": 1.8333387902280314e-07, + "loss": 0.0326, + "num_input_tokens_seen": 1719856, + "step": 4280 + }, + { + "epoch": 4.454261954261955, + "grad_norm": 0.006772617343813181, + "learning_rate": 1.799393530232729e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1721776, + "step": 4285 + }, + { + "epoch": 4.45945945945946, + "grad_norm": 0.04006092995405197, + "learning_rate": 1.765753736635234e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1723632, + "step": 4290 + }, + { + "epoch": 4.464656964656965, + "grad_norm": 0.004494254942983389, + "learning_rate": 1.7324198523475111e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1725488, + "step": 4295 + }, + { + "epoch": 4.46985446985447, + "grad_norm": 0.005892970599234104, + "learning_rate": 1.6993923162538562e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1727600, + "step": 4300 + }, + { + "epoch": 4.475051975051975, + "grad_norm": 0.016387267038226128, + "learning_rate": 1.666671563205069e-07, + "loss": 0.0462, + "num_input_tokens_seen": 1729712, + "step": 4305 + }, + { + "epoch": 4.48024948024948, + "grad_norm": 0.003626425750553608, + "learning_rate": 1.6342580240127582e-07, + "loss": 0.0, + "num_input_tokens_seen": 1731696, + "step": 4310 + }, + { + "epoch": 4.485446985446986, + "grad_norm": 0.050125960260629654, + "learning_rate": 1.6021521254436678e-07, + "loss": 0.0169, + "num_input_tokens_seen": 1733744, + "step": 4315 + }, + { + "epoch": 4.490644490644491, + "grad_norm": 0.010477319359779358, + "learning_rate": 1.5703542902140296e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1735728, + "step": 4320 + }, + { + "epoch": 4.495841995841996, + "grad_norm": 0.18304939568042755, + "learning_rate": 1.538864936984036e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1737776, + "step": 4325 + }, + { + "epoch": 4.501039501039501, + "grad_norm": 0.0033877205569297075, + "learning_rate": 1.507684480352292e-07, + "loss": 0.0313, + "num_input_tokens_seen": 1739824, + "step": 4330 + }, + { + "epoch": 4.506237006237006, + "grad_norm": 0.005890438798815012, + "learning_rate": 1.476813330850388e-07, + "loss": 0.0202, + "num_input_tokens_seen": 1741744, + "step": 4335 + }, + { + "epoch": 4.509355509355509, + "eval_loss": 0.43684616684913635, + "eval_runtime": 1.0364, + "eval_samples_per_second": 825.951, + "eval_steps_per_second": 103.244, + "num_input_tokens_seen": 1742960, + "step": 4338 + }, + { + "epoch": 4.511434511434511, + "grad_norm": 0.004618450067937374, + "learning_rate": 1.4462518949374838e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1743728, + "step": 4340 + }, + { + "epoch": 4.516632016632016, + "grad_norm": 15.636531829833984, + "learning_rate": 1.4160005749949328e-07, + "loss": 0.0723, + "num_input_tokens_seen": 1745904, + "step": 4345 + }, + { + "epoch": 4.521829521829522, + "grad_norm": 0.009265006519854069, + "learning_rate": 1.386059769321027e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1747824, + "step": 4350 + }, + { + "epoch": 4.527027027027027, + "grad_norm": 0.14768032729625702, + "learning_rate": 1.3564298721257223e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1749872, + "step": 4355 + }, + { + "epoch": 4.532224532224532, + "grad_norm": 0.004393266513943672, + "learning_rate": 1.32711127352545e-07, + "loss": 0.0, + "num_input_tokens_seen": 1751792, + "step": 4360 + }, + { + "epoch": 4.537422037422037, + "grad_norm": 0.0018293843604624271, + "learning_rate": 1.2981043595380048e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1753776, + "step": 4365 + }, + { + "epoch": 4.542619542619542, + "grad_norm": 0.010307732038199902, + "learning_rate": 1.269409512077427e-07, + "loss": 0.0, + "num_input_tokens_seen": 1755824, + "step": 4370 + }, + { + "epoch": 4.547817047817047, + "grad_norm": 0.02882198989391327, + "learning_rate": 1.241027108949e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1758000, + "step": 4375 + }, + { + "epoch": 4.553014553014553, + "grad_norm": 0.02132793888449669, + "learning_rate": 1.2129575238442715e-07, + "loss": 0.0006, + "num_input_tokens_seen": 1759984, + "step": 4380 + }, + { + "epoch": 4.558212058212058, + "grad_norm": 0.014733387157320976, + "learning_rate": 1.1852011263361218e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1761968, + "step": 4385 + }, + { + "epoch": 4.5634095634095635, + "grad_norm": 0.009524204768240452, + "learning_rate": 1.1577582818739136e-07, + "loss": 0.0, + "num_input_tokens_seen": 1764016, + "step": 4390 + }, + { + "epoch": 4.5686070686070686, + "grad_norm": 17.770906448364258, + "learning_rate": 1.1306293517786615e-07, + "loss": 0.0046, + "num_input_tokens_seen": 1765936, + "step": 4395 + }, + { + "epoch": 4.573804573804574, + "grad_norm": 0.0056666964665055275, + "learning_rate": 1.1038146932383003e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1767984, + "step": 4400 + }, + { + "epoch": 4.579002079002079, + "grad_norm": 0.2879636287689209, + "learning_rate": 1.0773146593029637e-07, + "loss": 0.0266, + "num_input_tokens_seen": 1769904, + "step": 4405 + }, + { + "epoch": 4.584199584199585, + "grad_norm": 0.01593073643743992, + "learning_rate": 1.0511295988803293e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1771888, + "step": 4410 + }, + { + "epoch": 4.58939708939709, + "grad_norm": 0.005392360966652632, + "learning_rate": 1.0252598567310451e-07, + "loss": 0.0027, + "num_input_tokens_seen": 1773936, + "step": 4415 + }, + { + "epoch": 4.594594594594595, + "grad_norm": 0.00791078433394432, + "learning_rate": 9.997057734641852e-08, + "loss": 0.0, + "num_input_tokens_seen": 1775984, + "step": 4420 + }, + { + "epoch": 4.5997920997921, + "grad_norm": 0.006835015490651131, + "learning_rate": 9.744676855327484e-08, + "loss": 0.0, + "num_input_tokens_seen": 1777840, + "step": 4425 + }, + { + "epoch": 4.604989604989605, + "grad_norm": 11.176219940185547, + "learning_rate": 9.495459252292505e-08, + "loss": 0.0267, + "num_input_tokens_seen": 1779824, + "step": 4430 + }, + { + "epoch": 4.61018711018711, + "grad_norm": 0.013272907584905624, + "learning_rate": 9.249408206813332e-08, + "loss": 0.0723, + "num_input_tokens_seen": 1781872, + "step": 4435 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 0.012476145289838314, + "learning_rate": 9.00652695847451e-08, + "loss": 0.0001, + "num_input_tokens_seen": 1783984, + "step": 4440 + }, + { + "epoch": 4.620582120582121, + "grad_norm": 0.02734716795384884, + "learning_rate": 8.766818705126134e-08, + "loss": 0.0, + "num_input_tokens_seen": 1786032, + "step": 4445 + }, + { + "epoch": 4.625779625779626, + "grad_norm": 0.0005791023722849786, + "learning_rate": 8.530286602841525e-08, + "loss": 0.0058, + "num_input_tokens_seen": 1788016, + "step": 4450 + }, + { + "epoch": 4.630977130977131, + "grad_norm": 0.0094565125182271, + "learning_rate": 8.296933765875898e-08, + "loss": 0.0001, + "num_input_tokens_seen": 1790064, + "step": 4455 + }, + { + "epoch": 4.636174636174636, + "grad_norm": 0.0023999966215342283, + "learning_rate": 8.066763266625283e-08, + "loss": 0.0003, + "num_input_tokens_seen": 1791984, + "step": 4460 + }, + { + "epoch": 4.641372141372141, + "grad_norm": 0.004338675644248724, + "learning_rate": 7.839778135586007e-08, + "loss": 0.0321, + "num_input_tokens_seen": 1793904, + "step": 4465 + }, + { + "epoch": 4.646569646569646, + "grad_norm": 0.7023778557777405, + "learning_rate": 7.61598136131489e-08, + "loss": 0.0003, + "num_input_tokens_seen": 1795888, + "step": 4470 + }, + { + "epoch": 4.651767151767151, + "grad_norm": 0.019446399062871933, + "learning_rate": 7.3953758903898e-08, + "loss": 0.028, + "num_input_tokens_seen": 1797872, + "step": 4475 + }, + { + "epoch": 4.656964656964657, + "grad_norm": 0.06941288709640503, + "learning_rate": 7.177964627370999e-08, + "loss": 0.0007, + "num_input_tokens_seen": 1799920, + "step": 4480 + }, + { + "epoch": 4.662162162162162, + "grad_norm": 0.009321698918938637, + "learning_rate": 6.963750434762745e-08, + "loss": 0.0001, + "num_input_tokens_seen": 1801776, + "step": 4485 + }, + { + "epoch": 4.667359667359667, + "grad_norm": 61.291290283203125, + "learning_rate": 6.752736132975696e-08, + "loss": 0.0157, + "num_input_tokens_seen": 1803824, + "step": 4490 + }, + { + "epoch": 4.672557172557172, + "grad_norm": 0.003786651650443673, + "learning_rate": 6.544924500289789e-08, + "loss": 0.0562, + "num_input_tokens_seen": 1805744, + "step": 4495 + }, + { + "epoch": 4.6777546777546775, + "grad_norm": 0.01198617834597826, + "learning_rate": 6.340318272817476e-08, + "loss": 0.0, + "num_input_tokens_seen": 1807728, + "step": 4500 + }, + { + "epoch": 4.682952182952183, + "grad_norm": 0.024431385099887848, + "learning_rate": 6.138920144468124e-08, + "loss": 0.0329, + "num_input_tokens_seen": 1809712, + "step": 4505 + }, + { + "epoch": 4.6881496881496885, + "grad_norm": 0.011199146509170532, + "learning_rate": 5.940732766912011e-08, + "loss": 0.1284, + "num_input_tokens_seen": 1811632, + "step": 4510 + }, + { + "epoch": 4.6933471933471935, + "grad_norm": 0.015243390575051308, + "learning_rate": 5.745758749545749e-08, + "loss": 0.0, + "num_input_tokens_seen": 1813552, + "step": 4515 + }, + { + "epoch": 4.698544698544699, + "grad_norm": 0.02215772680938244, + "learning_rate": 5.554000659457881e-08, + "loss": 0.0, + "num_input_tokens_seen": 1815664, + "step": 4520 + }, + { + "epoch": 4.703742203742204, + "grad_norm": 0.07075236737728119, + "learning_rate": 5.365461021395096e-08, + "loss": 0.0056, + "num_input_tokens_seen": 1817648, + "step": 4525 + }, + { + "epoch": 4.708939708939709, + "grad_norm": 0.6212006211280823, + "learning_rate": 5.1801423177288146e-08, + "loss": 0.0226, + "num_input_tokens_seen": 1819696, + "step": 4530 + }, + { + "epoch": 4.714137214137214, + "grad_norm": 0.021660171449184418, + "learning_rate": 4.998046988422767e-08, + "loss": 0.0, + "num_input_tokens_seen": 1821680, + "step": 4535 + }, + { + "epoch": 4.71933471933472, + "grad_norm": 0.003982728812843561, + "learning_rate": 4.8191774310006045e-08, + "loss": 0.0, + "num_input_tokens_seen": 1823728, + "step": 4540 + }, + { + "epoch": 4.724532224532225, + "grad_norm": 0.005566044710576534, + "learning_rate": 4.6435360005145647e-08, + "loss": 0.0006, + "num_input_tokens_seen": 1825712, + "step": 4545 + }, + { + "epoch": 4.72972972972973, + "grad_norm": 0.006296331528574228, + "learning_rate": 4.471125009514326e-08, + "loss": 0.0258, + "num_input_tokens_seen": 1827760, + "step": 4550 + }, + { + "epoch": 4.734927234927235, + "grad_norm": 0.007365924771875143, + "learning_rate": 4.30194672801662e-08, + "loss": 0.0, + "num_input_tokens_seen": 1829680, + "step": 4555 + }, + { + "epoch": 4.74012474012474, + "grad_norm": 0.032850153744220734, + "learning_rate": 4.136003383475251e-08, + "loss": 0.0002, + "num_input_tokens_seen": 1831728, + "step": 4560 + }, + { + "epoch": 4.745322245322245, + "grad_norm": 0.048938535153865814, + "learning_rate": 3.9732971607519264e-08, + "loss": 0.0001, + "num_input_tokens_seen": 1833648, + "step": 4565 + }, + { + "epoch": 4.75051975051975, + "grad_norm": 0.010438877157866955, + "learning_rate": 3.813830202087338e-08, + "loss": 0.0, + "num_input_tokens_seen": 1835696, + "step": 4570 + }, + { + "epoch": 4.755717255717256, + "grad_norm": 0.24429504573345184, + "learning_rate": 3.6576046070730676e-08, + "loss": 0.0001, + "num_input_tokens_seen": 1837808, + "step": 4575 + }, + { + "epoch": 4.75987525987526, + "eval_loss": 0.4380520284175873, + "eval_runtime": 1.0491, + "eval_samples_per_second": 815.908, + "eval_steps_per_second": 101.988, + "num_input_tokens_seen": 1839344, + "step": 4579 + }, + { + "epoch": 4.760914760914761, + "grad_norm": 0.021253783255815506, + "learning_rate": 3.504622432623811e-08, + "loss": 0.0003, + "num_input_tokens_seen": 1839728, + "step": 4580 + }, + { + "epoch": 4.766112266112266, + "grad_norm": 0.007059005554765463, + "learning_rate": 3.354885692950505e-08, + "loss": 0.002, + "num_input_tokens_seen": 1841776, + "step": 4585 + }, + { + "epoch": 4.771309771309771, + "grad_norm": 0.0066725509241223335, + "learning_rate": 3.208396359533572e-08, + "loss": 0.0001, + "num_input_tokens_seen": 1843696, + "step": 4590 + }, + { + "epoch": 4.776507276507276, + "grad_norm": 0.006126644089818001, + "learning_rate": 3.065156361097138e-08, + "loss": 0.0002, + "num_input_tokens_seen": 1845744, + "step": 4595 + }, + { + "epoch": 4.781704781704782, + "grad_norm": 5.130758285522461, + "learning_rate": 2.925167583583577e-08, + "loss": 0.0009, + "num_input_tokens_seen": 1847792, + "step": 4600 + }, + { + "epoch": 4.786902286902287, + "grad_norm": 0.009116302244365215, + "learning_rate": 2.7884318701285883e-08, + "loss": 0.0712, + "num_input_tokens_seen": 1849776, + "step": 4605 + }, + { + "epoch": 4.792099792099792, + "grad_norm": 0.005589164327830076, + "learning_rate": 2.654951021037161e-08, + "loss": 0.0, + "num_input_tokens_seen": 1852016, + "step": 4610 + }, + { + "epoch": 4.797297297297297, + "grad_norm": 0.0037636614870280027, + "learning_rate": 2.524726793759591e-08, + "loss": 0.0001, + "num_input_tokens_seen": 1854064, + "step": 4615 + }, + { + "epoch": 4.802494802494802, + "grad_norm": 0.012889928184449673, + "learning_rate": 2.3977609028686123e-08, + "loss": 0.0002, + "num_input_tokens_seen": 1856112, + "step": 4620 + }, + { + "epoch": 4.8076923076923075, + "grad_norm": 0.0022313897497951984, + "learning_rate": 2.2740550200365528e-08, + "loss": 0.0, + "num_input_tokens_seen": 1858096, + "step": 4625 + }, + { + "epoch": 4.8128898128898125, + "grad_norm": 0.004886255133897066, + "learning_rate": 2.153610774013548e-08, + "loss": 0.0, + "num_input_tokens_seen": 1860272, + "step": 4630 + }, + { + "epoch": 4.8180873180873185, + "grad_norm": 0.004527249839156866, + "learning_rate": 2.0364297506060005e-08, + "loss": 0.0001, + "num_input_tokens_seen": 1862256, + "step": 4635 + }, + { + "epoch": 4.8232848232848236, + "grad_norm": 0.005999819375574589, + "learning_rate": 1.922513492655653e-08, + "loss": 0.0, + "num_input_tokens_seen": 1864304, + "step": 4640 + }, + { + "epoch": 4.828482328482329, + "grad_norm": 0.003096930915489793, + "learning_rate": 1.8118635000194395e-08, + "loss": 0.0, + "num_input_tokens_seen": 1866224, + "step": 4645 + }, + { + "epoch": 4.833679833679834, + "grad_norm": 0.011734005995094776, + "learning_rate": 1.704481229549526e-08, + "loss": 0.0002, + "num_input_tokens_seen": 1868336, + "step": 4650 + }, + { + "epoch": 4.838877338877339, + "grad_norm": 0.005439637694507837, + "learning_rate": 1.6003680950742728e-08, + "loss": 0.0001, + "num_input_tokens_seen": 1870448, + "step": 4655 + }, + { + "epoch": 4.844074844074844, + "grad_norm": 21.41458511352539, + "learning_rate": 1.499525467379581e-08, + "loss": 0.0076, + "num_input_tokens_seen": 1872368, + "step": 4660 + }, + { + "epoch": 4.849272349272349, + "grad_norm": 0.007195206359028816, + "learning_rate": 1.4019546741908252e-08, + "loss": 0.0001, + "num_input_tokens_seen": 1874480, + "step": 4665 + }, + { + "epoch": 4.854469854469855, + "grad_norm": 0.019606366753578186, + "learning_rate": 1.3076570001553934e-08, + "loss": 0.0214, + "num_input_tokens_seen": 1876464, + "step": 4670 + }, + { + "epoch": 4.85966735966736, + "grad_norm": 0.021399203687906265, + "learning_rate": 1.216633686825841e-08, + "loss": 0.0, + "num_input_tokens_seen": 1878448, + "step": 4675 + }, + { + "epoch": 4.864864864864865, + "grad_norm": 0.008349803276360035, + "learning_rate": 1.1288859326433477e-08, + "loss": 0.0426, + "num_input_tokens_seen": 1880432, + "step": 4680 + }, + { + "epoch": 4.87006237006237, + "grad_norm": 0.00672512361779809, + "learning_rate": 1.0444148929221466e-08, + "loss": 0.0598, + "num_input_tokens_seen": 1882544, + "step": 4685 + }, + { + "epoch": 4.875259875259875, + "grad_norm": 0.009454301558434963, + "learning_rate": 9.632216798342032e-09, + "loss": 0.0001, + "num_input_tokens_seen": 1884528, + "step": 4690 + }, + { + "epoch": 4.88045738045738, + "grad_norm": 0.001981085864827037, + "learning_rate": 8.853073623946163e-09, + "loss": 0.0, + "num_input_tokens_seen": 1886640, + "step": 4695 + }, + { + "epoch": 4.885654885654886, + "grad_norm": 13.079166412353516, + "learning_rate": 8.106729664475178e-09, + "loss": 0.0369, + "num_input_tokens_seen": 1888688, + "step": 4700 + }, + { + "epoch": 4.890852390852391, + "grad_norm": 0.020751064643263817, + "learning_rate": 7.3931947465252786e-09, + "loss": 0.0001, + "num_input_tokens_seen": 1890736, + "step": 4705 + }, + { + "epoch": 4.896049896049896, + "grad_norm": 0.0023189696948975325, + "learning_rate": 6.7124782647196015e-09, + "loss": 0.0, + "num_input_tokens_seen": 1892720, + "step": 4710 + }, + { + "epoch": 4.901247401247401, + "grad_norm": 0.010008195415139198, + "learning_rate": 6.064589181582481e-09, + "loss": 0.0, + "num_input_tokens_seen": 1894704, + "step": 4715 + }, + { + "epoch": 4.906444906444906, + "grad_norm": 0.011259862221777439, + "learning_rate": 5.4495360274231526e-09, + "loss": 0.0287, + "num_input_tokens_seen": 1896624, + "step": 4720 + }, + { + "epoch": 4.911642411642411, + "grad_norm": 0.006877740379422903, + "learning_rate": 4.867326900223068e-09, + "loss": 0.0307, + "num_input_tokens_seen": 1898544, + "step": 4725 + }, + { + "epoch": 4.916839916839917, + "grad_norm": 0.0025101625360548496, + "learning_rate": 4.317969465527927e-09, + "loss": 0.0353, + "num_input_tokens_seen": 1900592, + "step": 4730 + }, + { + "epoch": 4.922037422037422, + "grad_norm": 0.022444335743784904, + "learning_rate": 3.801470956348863e-09, + "loss": 0.0287, + "num_input_tokens_seen": 1902576, + "step": 4735 + }, + { + "epoch": 4.927234927234927, + "grad_norm": 0.0029787139501422644, + "learning_rate": 3.3178381730661345e-09, + "loss": 0.0001, + "num_input_tokens_seen": 1904624, + "step": 4740 + }, + { + "epoch": 4.9324324324324325, + "grad_norm": 0.0010304702445864677, + "learning_rate": 2.8670774833386427e-09, + "loss": 0.0, + "num_input_tokens_seen": 1906736, + "step": 4745 + }, + { + "epoch": 4.9376299376299375, + "grad_norm": 0.0025558616034686565, + "learning_rate": 2.449194822022327e-09, + "loss": 0.0, + "num_input_tokens_seen": 1908592, + "step": 4750 + }, + { + "epoch": 4.942827442827443, + "grad_norm": 0.019764816388487816, + "learning_rate": 2.064195691089954e-09, + "loss": 0.0006, + "num_input_tokens_seen": 1910576, + "step": 4755 + }, + { + "epoch": 4.948024948024948, + "grad_norm": 0.004834398627281189, + "learning_rate": 1.7120851595597842e-09, + "loss": 0.0, + "num_input_tokens_seen": 1912624, + "step": 4760 + }, + { + "epoch": 4.953222453222454, + "grad_norm": 0.03484058007597923, + "learning_rate": 1.3928678634289595e-09, + "loss": 0.0283, + "num_input_tokens_seen": 1914608, + "step": 4765 + }, + { + "epoch": 4.958419958419959, + "grad_norm": 0.008615722879767418, + "learning_rate": 1.1065480056110521e-09, + "loss": 0.0004, + "num_input_tokens_seen": 1916592, + "step": 4770 + }, + { + "epoch": 4.963617463617464, + "grad_norm": 0.16332073509693146, + "learning_rate": 8.531293558824983e-10, + "loss": 0.0001, + "num_input_tokens_seen": 1918704, + "step": 4775 + }, + { + "epoch": 4.968814968814969, + "grad_norm": 0.0005217403522692621, + "learning_rate": 6.326152508320804e-10, + "loss": 0.0001, + "num_input_tokens_seen": 1920624, + "step": 4780 + }, + { + "epoch": 4.974012474012474, + "grad_norm": 1.4390920400619507, + "learning_rate": 4.450085938170756e-10, + "loss": 0.0013, + "num_input_tokens_seen": 1922480, + "step": 4785 + }, + { + "epoch": 4.979209979209979, + "grad_norm": 0.018787242472171783, + "learning_rate": 2.903118549252293e-10, + "loss": 0.0, + "num_input_tokens_seen": 1924464, + "step": 4790 + }, + { + "epoch": 4.984407484407484, + "grad_norm": 0.14635036885738373, + "learning_rate": 1.6852707094172637e-10, + "loss": 0.0001, + "num_input_tokens_seen": 1926448, + "step": 4795 + }, + { + "epoch": 4.98960498960499, + "grad_norm": 0.04664904624223709, + "learning_rate": 7.965584532282356e-11, + "loss": 0.0002, + "num_input_tokens_seen": 1928560, + "step": 4800 + }, + { + "epoch": 4.994802494802495, + "grad_norm": 0.024172263219952583, + "learning_rate": 2.3699348174754943e-11, + "loss": 0.0177, + "num_input_tokens_seen": 1930544, + "step": 4805 + }, + { + "epoch": 5.0, + "grad_norm": 0.0018674664897844195, + "learning_rate": 6.583162381890162e-13, + "loss": 0.0001, + "num_input_tokens_seen": 1932608, + "step": 4810 + }, + { + "epoch": 5.0, + "num_input_tokens_seen": 1932608, + "step": 4810, + "total_flos": 1.1284259767320576e+16, + "train_loss": 0.10950150515592155, + "train_runtime": 1431.7139, + "train_samples_per_second": 26.873, + "train_steps_per_second": 3.36 + } + ], + "logging_steps": 5, + "max_steps": 4810, + "num_input_tokens_seen": 1932608, + "num_train_epochs": 5, + "save_steps": 241, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1284259767320576e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..c71cfbf --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b12b06bd22ce85658655e793c63bc30ae7d311bfa6170cf93b640e9bd448e704 +size 6289 diff --git a/training_eval_loss.png b/training_eval_loss.png new file mode 100644 index 0000000..2a076a6 Binary files /dev/null and b/training_eval_loss.png differ diff --git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000..ad4b767 Binary files /dev/null and b/training_loss.png differ