commit 7ceef409c62c6009ee89fe80bd663a6b7dc6dad5 Author: ModelHub XC Date: Sun May 3 10:20:24 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: rbelanec/train_mrpc_42_1776331557 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..158b00a --- /dev/null +++ b/README.md @@ -0,0 +1,81 @@ +--- +library_name: transformers +license: llama3.2 +base_model: meta-llama/Llama-3.2-1B-Instruct +tags: +- peft-factory +- full +- llama-factory +- generated_from_trainer +model-index: +- name: train_mrpc_42_1776331557 + results: [] +--- + + + +# train_mrpc_42_1776331557 + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on the mrpc dataset. +It achieves the following results on the evaluation set: +- Loss: 0.1084 +- Num Input Tokens Seen: 1780000 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 5e-06 +- train_batch_size: 8 +- eval_batch_size: 8 +- seed: 42 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 5 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | Input Tokens Seen | +|:-------------:|:------:|:----:|:---------------:|:-----------------:| +| 0.1552 | 0.2518 | 104 | 0.1485 | 89600 | +| 0.2178 | 0.5036 | 208 | 0.1320 | 178688 | +| 0.1165 | 0.7554 | 312 | 0.1130 | 267968 | +| 0.1193 | 1.0073 | 416 | 0.1084 | 357488 | +| 0.0685 | 1.2591 | 520 | 0.1903 | 446896 | +| 0.0801 | 1.5109 | 624 | 0.1982 | 536176 | +| 0.2066 | 1.7627 | 728 | 0.1449 | 626992 | +| 0.0011 | 2.0145 | 832 | 0.2068 | 716344 | +| 0.0059 | 2.2663 | 936 | 0.2691 | 806712 | +| 0.0756 | 2.5182 | 1040 | 0.2895 | 895736 | +| 0.0001 | 2.7700 | 1144 | 0.2260 | 985592 | +| 0.0 | 3.0218 | 1248 | 0.2253 | 1074624 | +| 0.0 | 3.2736 | 1352 | 0.2578 | 1164544 | +| 0.0 | 3.5254 | 1456 | 0.2580 | 1253248 | +| 0.0 | 3.7772 | 1560 | 0.2703 | 1344000 | +| 0.0 | 4.0291 | 1664 | 0.2502 | 1432880 | +| 0.0001 | 4.2809 | 1768 | 0.2504 | 1522544 | +| 0.0 | 4.5327 | 1872 | 0.2489 | 1611760 | +| 0.0 | 4.7845 | 1976 | 0.2508 | 1702832 | + + +### Framework versions + +- Transformers 4.51.3 +- Pytorch 2.10.0+cu128 +- Datasets 4.0.0 +- Tokenizers 0.21.4 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..eb25305 --- /dev/null +++ b/all_results.json @@ -0,0 +1,13 @@ +{ + "epoch": 5.0, + "eval_loss": 0.10842076689004898, + "eval_runtime": 0.6289, + "eval_samples_per_second": 583.581, + "eval_steps_per_second": 73.146, + "num_input_tokens_seen": 1780000, + "total_flos": 1.039320047616e+16, + "train_loss": 0.06261659951716346, + "train_runtime": 1141.6604, + "train_samples_per_second": 14.457, + "train_steps_per_second": 1.809 +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..5a2b93f --- /dev/null +++ b/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/eval_results.json b/eval_results.json new file mode 100644 index 0000000..f0e6938 --- /dev/null +++ b/eval_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 5.0, + "eval_loss": 0.10842076689004898, + "eval_runtime": 0.6289, + "eval_samples_per_second": 583.581, + "eval_steps_per_second": 73.146, + "num_input_tokens_seen": 1780000 +} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..2b8ae57 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.3" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..f2b8cbf --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b7af7ab14e0309187e05387160a099eb1b33ea1c3a9f9af496fbb6393ec06a7 +size 4943274328 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..14daf45 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..ddc3ce0 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2069 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/train.yaml b/train.yaml new file mode 100644 index 0000000..7d3d0c5 --- /dev/null +++ b/train.yaml @@ -0,0 +1,55 @@ +seed: 42 + +### model +model_name_or_path: meta-llama/Llama-3.2-1B-Instruct +trust_remote_code: true +flash_attn: auto +use_cache: false + +### method +stage: sft +do_train: true +finetuning_type: full + +### dataset +dataset: mrpc +template: llama3 +cutoff_len: 2048 +overwrite_cache: true +preprocessing_num_workers: 4 +dataloader_num_workers: 4 +packing: false + +### output +output_dir: saves_bts_preliminary/base/llama-3.2-1b-instruct/train_mrpc_42_1776331557 +logging_steps: 5 +save_steps: 0.05 +overwrite_output_dir: true +save_only_model: false +plot_loss: true +include_num_input_tokens_seen: true +push_to_hub: true +push_to_hub_organization: rbelanec +load_best_model_at_end: true +save_total_limit: 1 + +### train +per_device_train_batch_size: 8 +learning_rate: 5.0e-6 +num_train_epochs: 5 +weight_decay: 1.0e-5 +lr_scheduler_type: cosine +bf16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null +warmup_ratio: 0.1 +optim: adamw_torch +report_to: +- wandb +run_name: base_llama-3.2-1b-instruct_train_mrpc_42_1776331557 + +### eval +per_device_eval_batch_size: 8 +eval_strategy: steps +eval_steps: 0.05 +val_size: 0.1 \ No newline at end of file diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..ea93519 --- /dev/null +++ b/train_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 5.0, + "num_input_tokens_seen": 1780000, + "total_flos": 1.039320047616e+16, + "train_loss": 0.06261659951716346, + "train_runtime": 1141.6604, + "train_samples_per_second": 14.457, + "train_steps_per_second": 1.809 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000..3827129 --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,433 @@ +{"current_steps": 5, "total_steps": 2065, "loss": 0.81, "lr": 9.661835748792271e-08, "epoch": 0.012106537530266344, "percentage": 0.24, "elapsed_time": "0:00:00", "remaining_time": "0:04:33", "throughput": 6549.56, "total_tokens": 4352} +{"current_steps": 10, "total_steps": 2065, "loss": 0.8, "lr": 2.173913043478261e-07, "epoch": 0.024213075060532687, "percentage": 0.48, "elapsed_time": "0:00:01", "remaining_time": "0:03:35", "throughput": 8373.14, "total_tokens": 8768} +{"current_steps": 15, "total_steps": 2065, "loss": 0.6335, "lr": 3.3816425120772945e-07, "epoch": 0.03631961259079903, "percentage": 0.73, "elapsed_time": "0:00:01", "remaining_time": "0:03:13", "throughput": 9173.69, "total_tokens": 12992} +{"current_steps": 20, "total_steps": 2065, "loss": 0.3717, "lr": 4.5893719806763294e-07, "epoch": 0.048426150121065374, "percentage": 0.97, "elapsed_time": "0:00:01", "remaining_time": "0:03:02", "throughput": 9704.71, "total_tokens": 17344} +{"current_steps": 25, "total_steps": 2065, "loss": 0.2438, "lr": 5.797101449275363e-07, "epoch": 0.06053268765133172, "percentage": 1.21, "elapsed_time": "0:00:02", "remaining_time": "0:02:56", "throughput": 10054.34, "total_tokens": 21696} +{"current_steps": 30, "total_steps": 2065, "loss": 0.2523, "lr": 7.004830917874397e-07, "epoch": 0.07263922518159806, "percentage": 1.45, "elapsed_time": "0:00:02", "remaining_time": "0:02:51", "throughput": 10328.87, "total_tokens": 26112} +{"current_steps": 35, "total_steps": 2065, "loss": 0.2223, "lr": 8.212560386473431e-07, "epoch": 0.0847457627118644, "percentage": 1.69, "elapsed_time": "0:00:02", "remaining_time": "0:02:47", "throughput": 10436.42, "total_tokens": 30208} +{"current_steps": 40, "total_steps": 2065, "loss": 0.2184, "lr": 9.420289855072465e-07, "epoch": 0.09685230024213075, "percentage": 1.94, "elapsed_time": "0:00:03", "remaining_time": "0:02:45", "throughput": 10618.68, "total_tokens": 34688} +{"current_steps": 45, "total_steps": 2065, "loss": 0.2163, "lr": 1.0628019323671499e-06, "epoch": 0.1089588377723971, "percentage": 2.18, "elapsed_time": "0:00:03", "remaining_time": "0:02:42", "throughput": 10689.52, "total_tokens": 38784} +{"current_steps": 50, "total_steps": 2065, "loss": 0.2198, "lr": 1.1835748792270531e-06, "epoch": 0.12106537530266344, "percentage": 2.42, "elapsed_time": "0:00:04", "remaining_time": "0:02:41", "throughput": 10794.69, "total_tokens": 43200} +{"current_steps": 55, "total_steps": 2065, "loss": 0.2224, "lr": 1.3043478260869566e-06, "epoch": 0.13317191283292978, "percentage": 2.66, "elapsed_time": "0:00:04", "remaining_time": "0:02:39", "throughput": 10823.52, "total_tokens": 47296} +{"current_steps": 60, "total_steps": 2065, "loss": 0.2272, "lr": 1.42512077294686e-06, "epoch": 0.14527845036319612, "percentage": 2.91, "elapsed_time": "0:00:04", "remaining_time": "0:02:38", "throughput": 10914.4, "total_tokens": 51712} +{"current_steps": 65, "total_steps": 2065, "loss": 0.1665, "lr": 1.5458937198067634e-06, "epoch": 0.15738498789346247, "percentage": 3.15, "elapsed_time": "0:00:05", "remaining_time": "0:02:37", "throughput": 10944.07, "total_tokens": 55872} +{"current_steps": 70, "total_steps": 2065, "loss": 0.1835, "lr": 1.6666666666666667e-06, "epoch": 0.1694915254237288, "percentage": 3.39, "elapsed_time": "0:00:05", "remaining_time": "0:02:35", "throughput": 10943.0, "total_tokens": 59840} +{"current_steps": 75, "total_steps": 2065, "loss": 0.1898, "lr": 1.7874396135265702e-06, "epoch": 0.18159806295399517, "percentage": 3.63, "elapsed_time": "0:00:05", "remaining_time": "0:02:34", "throughput": 10968.19, "total_tokens": 64000} +{"current_steps": 80, "total_steps": 2065, "loss": 0.2149, "lr": 1.9082125603864736e-06, "epoch": 0.1937046004842615, "percentage": 3.87, "elapsed_time": "0:00:06", "remaining_time": "0:02:33", "throughput": 11018.77, "total_tokens": 68352} +{"current_steps": 85, "total_steps": 2065, "loss": 0.1519, "lr": 2.028985507246377e-06, "epoch": 0.20581113801452786, "percentage": 4.12, "elapsed_time": "0:00:06", "remaining_time": "0:02:33", "throughput": 11068.81, "total_tokens": 72768} +{"current_steps": 90, "total_steps": 2065, "loss": 0.1468, "lr": 2.1497584541062806e-06, "epoch": 0.2179176755447942, "percentage": 4.36, "elapsed_time": "0:00:06", "remaining_time": "0:02:32", "throughput": 11103.7, "total_tokens": 77120} +{"current_steps": 95, "total_steps": 2065, "loss": 0.2277, "lr": 2.270531400966184e-06, "epoch": 0.23002421307506055, "percentage": 4.6, "elapsed_time": "0:00:07", "remaining_time": "0:02:32", "throughput": 11131.81, "total_tokens": 81664} +{"current_steps": 100, "total_steps": 2065, "loss": 0.1552, "lr": 2.391304347826087e-06, "epoch": 0.24213075060532688, "percentage": 4.84, "elapsed_time": "0:00:07", "remaining_time": "0:02:31", "throughput": 11169.26, "total_tokens": 86080} +{"current_steps": 104, "total_steps": 2065, "eval_loss": 0.1484687179327011, "epoch": 0.25181598062953997, "percentage": 5.04, "elapsed_time": "0:00:08", "remaining_time": "0:02:41", "throughput": 10430.75, "total_tokens": 89600} +{"current_steps": 105, "total_steps": 2065, "loss": 0.1673, "lr": 2.5120772946859904e-06, "epoch": 0.2542372881355932, "percentage": 5.08, "elapsed_time": "0:01:19", "remaining_time": "0:24:48", "throughput": 1133.88, "total_tokens": 90432} +{"current_steps": 110, "total_steps": 2065, "loss": 0.1694, "lr": 2.632850241545894e-06, "epoch": 0.26634382566585957, "percentage": 5.33, "elapsed_time": "0:01:20", "remaining_time": "0:23:43", "throughput": 1179.82, "total_tokens": 94528} +{"current_steps": 115, "total_steps": 2065, "loss": 0.1627, "lr": 2.7536231884057974e-06, "epoch": 0.2784503631961259, "percentage": 5.57, "elapsed_time": "0:01:20", "remaining_time": "0:22:44", "throughput": 1227.67, "total_tokens": 98816} +{"current_steps": 120, "total_steps": 2065, "loss": 0.2205, "lr": 2.8743961352657007e-06, "epoch": 0.29055690072639223, "percentage": 5.81, "elapsed_time": "0:01:20", "remaining_time": "0:21:50", "throughput": 1275.15, "total_tokens": 103104} +{"current_steps": 125, "total_steps": 2065, "loss": 0.1841, "lr": 2.995169082125604e-06, "epoch": 0.3026634382566586, "percentage": 6.05, "elapsed_time": "0:01:21", "remaining_time": "0:21:00", "throughput": 1321.41, "total_tokens": 107328} +{"current_steps": 130, "total_steps": 2065, "loss": 0.1779, "lr": 3.1159420289855073e-06, "epoch": 0.31476997578692495, "percentage": 6.3, "elapsed_time": "0:01:21", "remaining_time": "0:20:14", "throughput": 1366.46, "total_tokens": 111488} +{"current_steps": 135, "total_steps": 2065, "loss": 0.158, "lr": 3.236714975845411e-06, "epoch": 0.3268765133171913, "percentage": 6.54, "elapsed_time": "0:01:21", "remaining_time": "0:19:31", "throughput": 1414.94, "total_tokens": 115968} +{"current_steps": 140, "total_steps": 2065, "loss": 0.2089, "lr": 3.3574879227053142e-06, "epoch": 0.3389830508474576, "percentage": 6.78, "elapsed_time": "0:01:22", "remaining_time": "0:18:51", "throughput": 1459.93, "total_tokens": 120192} +{"current_steps": 145, "total_steps": 2065, "loss": 0.1007, "lr": 3.4782608695652175e-06, "epoch": 0.35108958837772397, "percentage": 7.02, "elapsed_time": "0:01:22", "remaining_time": "0:18:14", "throughput": 1504.53, "total_tokens": 124416} +{"current_steps": 150, "total_steps": 2065, "loss": 0.2718, "lr": 3.5990338164251208e-06, "epoch": 0.36319612590799033, "percentage": 7.26, "elapsed_time": "0:01:23", "remaining_time": "0:17:40", "throughput": 1550.99, "total_tokens": 128832} +{"current_steps": 155, "total_steps": 2065, "loss": 0.3704, "lr": 3.7198067632850245e-06, "epoch": 0.37530266343825663, "percentage": 7.51, "elapsed_time": "0:01:23", "remaining_time": "0:17:08", "throughput": 1594.12, "total_tokens": 132992} +{"current_steps": 160, "total_steps": 2065, "loss": 0.1945, "lr": 3.840579710144928e-06, "epoch": 0.387409200968523, "percentage": 7.75, "elapsed_time": "0:01:23", "remaining_time": "0:16:37", "throughput": 1638.27, "total_tokens": 137280} +{"current_steps": 165, "total_steps": 2065, "loss": 0.253, "lr": 3.961352657004831e-06, "epoch": 0.39951573849878935, "percentage": 7.99, "elapsed_time": "0:01:24", "remaining_time": "0:16:09", "throughput": 1682.08, "total_tokens": 141568} +{"current_steps": 170, "total_steps": 2065, "loss": 0.1622, "lr": 4.082125603864734e-06, "epoch": 0.4116222760290557, "percentage": 8.23, "elapsed_time": "0:01:24", "remaining_time": "0:15:42", "throughput": 1727.0, "total_tokens": 145984} +{"current_steps": 175, "total_steps": 2065, "loss": 0.1974, "lr": 4.202898550724638e-06, "epoch": 0.423728813559322, "percentage": 8.47, "elapsed_time": "0:01:24", "remaining_time": "0:15:16", "throughput": 1768.6, "total_tokens": 150144} +{"current_steps": 180, "total_steps": 2065, "loss": 0.2191, "lr": 4.323671497584541e-06, "epoch": 0.4358353510895884, "percentage": 8.72, "elapsed_time": "0:01:25", "remaining_time": "0:14:52", "throughput": 1813.49, "total_tokens": 154624} +{"current_steps": 185, "total_steps": 2065, "loss": 0.2192, "lr": 4.444444444444444e-06, "epoch": 0.44794188861985473, "percentage": 8.96, "elapsed_time": "0:01:25", "remaining_time": "0:14:30", "throughput": 1854.32, "total_tokens": 158784} +{"current_steps": 190, "total_steps": 2065, "loss": 0.1887, "lr": 4.565217391304348e-06, "epoch": 0.4600484261501211, "percentage": 9.2, "elapsed_time": "0:01:25", "remaining_time": "0:14:08", "throughput": 1896.22, "total_tokens": 163072} +{"current_steps": 195, "total_steps": 2065, "loss": 0.1951, "lr": 4.6859903381642516e-06, "epoch": 0.4721549636803874, "percentage": 9.44, "elapsed_time": "0:01:26", "remaining_time": "0:13:48", "throughput": 1934.98, "total_tokens": 167104} +{"current_steps": 200, "total_steps": 2065, "loss": 0.1486, "lr": 4.806763285024155e-06, "epoch": 0.48426150121065376, "percentage": 9.69, "elapsed_time": "0:01:26", "remaining_time": "0:13:28", "throughput": 1976.89, "total_tokens": 171456} +{"current_steps": 205, "total_steps": 2065, "loss": 0.2178, "lr": 4.927536231884059e-06, "epoch": 0.4963680387409201, "percentage": 9.93, "elapsed_time": "0:01:27", "remaining_time": "0:13:10", "throughput": 2018.51, "total_tokens": 175808} +{"current_steps": 208, "total_steps": 2065, "eval_loss": 0.1319892704486847, "epoch": 0.5036319612590799, "percentage": 10.07, "elapsed_time": "0:01:27", "remaining_time": "0:13:04", "throughput": 2032.59, "total_tokens": 178688} +{"current_steps": 210, "total_steps": 2065, "loss": 0.1052, "lr": 4.999985705205496e-06, "epoch": 0.5084745762711864, "percentage": 10.17, "elapsed_time": "0:02:03", "remaining_time": "0:18:13", "throughput": 1456.18, "total_tokens": 180224} +{"current_steps": 215, "total_steps": 2065, "loss": 0.1655, "lr": 4.999824890644693e-06, "epoch": 0.5205811138014528, "percentage": 10.41, "elapsed_time": "0:02:04", "remaining_time": "0:17:48", "throughput": 1487.91, "total_tokens": 184704} +{"current_steps": 220, "total_steps": 2065, "loss": 0.3684, "lr": 4.999485404562269e-06, "epoch": 0.5326876513317191, "percentage": 10.65, "elapsed_time": "0:02:04", "remaining_time": "0:17:24", "throughput": 1519.45, "total_tokens": 189184} +{"current_steps": 225, "total_steps": 2065, "loss": 0.1527, "lr": 4.998967271222521e-06, "epoch": 0.5447941888619855, "percentage": 10.9, "elapsed_time": "0:02:04", "remaining_time": "0:17:01", "throughput": 1549.8, "total_tokens": 193536} +{"current_steps": 230, "total_steps": 2065, "loss": 0.1238, "lr": 4.998270527658311e-06, "epoch": 0.5569007263922519, "percentage": 11.14, "elapsed_time": "0:02:05", "remaining_time": "0:16:39", "throughput": 1579.99, "total_tokens": 197888} +{"current_steps": 235, "total_steps": 2065, "loss": 0.2147, "lr": 4.997395223668422e-06, "epoch": 0.5690072639225182, "percentage": 11.38, "elapsed_time": "0:02:05", "remaining_time": "0:16:18", "throughput": 1608.93, "total_tokens": 202112} +{"current_steps": 240, "total_steps": 2065, "loss": 0.1162, "lr": 4.996341421813993e-06, "epoch": 0.5811138014527845, "percentage": 11.62, "elapsed_time": "0:02:05", "remaining_time": "0:15:58", "throughput": 1639.23, "total_tokens": 206528} +{"current_steps": 245, "total_steps": 2065, "loss": 0.1311, "lr": 4.995109197414051e-06, "epoch": 0.5932203389830508, "percentage": 11.86, "elapsed_time": "0:02:06", "remaining_time": "0:15:38", "throughput": 1669.38, "total_tokens": 210944} +{"current_steps": 250, "total_steps": 2065, "loss": 0.1437, "lr": 4.9936986385401305e-06, "epoch": 0.6053268765133172, "percentage": 12.11, "elapsed_time": "0:02:06", "remaining_time": "0:15:20", "throughput": 1697.32, "total_tokens": 215104} +{"current_steps": 255, "total_steps": 2065, "loss": 0.1597, "lr": 4.992109846009972e-06, "epoch": 0.6174334140435835, "percentage": 12.35, "elapsed_time": "0:02:07", "remaining_time": "0:15:02", "throughput": 1725.62, "total_tokens": 219328} +{"current_steps": 260, "total_steps": 2065, "loss": 0.1878, "lr": 4.990342933380321e-06, "epoch": 0.6295399515738499, "percentage": 12.59, "elapsed_time": "0:02:07", "remaining_time": "0:14:44", "throughput": 1754.71, "total_tokens": 223680} +{"current_steps": 265, "total_steps": 2065, "loss": 0.1445, "lr": 4.988398026938811e-06, "epoch": 0.6416464891041163, "percentage": 12.83, "elapsed_time": "0:02:07", "remaining_time": "0:14:28", "throughput": 1782.69, "total_tokens": 227904} +{"current_steps": 270, "total_steps": 2065, "loss": 0.0992, "lr": 4.986275265694935e-06, "epoch": 0.6537530266343826, "percentage": 13.08, "elapsed_time": "0:02:08", "remaining_time": "0:14:12", "throughput": 1808.96, "total_tokens": 231936} +{"current_steps": 275, "total_steps": 2065, "loss": 0.0608, "lr": 4.983974801370115e-06, "epoch": 0.6658595641646489, "percentage": 13.32, "elapsed_time": "0:02:08", "remaining_time": "0:13:57", "throughput": 1836.54, "total_tokens": 236160} +{"current_steps": 280, "total_steps": 2065, "loss": 0.2262, "lr": 4.981496798386849e-06, "epoch": 0.6779661016949152, "percentage": 13.56, "elapsed_time": "0:02:08", "remaining_time": "0:13:42", "throughput": 1863.56, "total_tokens": 240320} +{"current_steps": 285, "total_steps": 2065, "loss": 0.1165, "lr": 4.9788414338569715e-06, "epoch": 0.6900726392251816, "percentage": 13.8, "elapsed_time": "0:02:09", "remaining_time": "0:13:27", "throughput": 1892.7, "total_tokens": 244800} +{"current_steps": 290, "total_steps": 2065, "loss": 0.2377, "lr": 4.9760088975689815e-06, "epoch": 0.7021791767554479, "percentage": 14.04, "elapsed_time": "0:02:09", "remaining_time": "0:13:13", "throughput": 1920.82, "total_tokens": 249152} +{"current_steps": 295, "total_steps": 2065, "loss": 0.1377, "lr": 4.972999391974488e-06, "epoch": 0.7142857142857143, "percentage": 14.29, "elapsed_time": "0:02:10", "remaining_time": "0:13:00", "throughput": 1947.89, "total_tokens": 253376} +{"current_steps": 300, "total_steps": 2065, "loss": 0.19, "lr": 4.969813132173735e-06, "epoch": 0.7263922518159807, "percentage": 14.53, "elapsed_time": "0:02:10", "remaining_time": "0:12:47", "throughput": 1975.26, "total_tokens": 257664} +{"current_steps": 305, "total_steps": 2065, "loss": 0.1146, "lr": 4.966450345900229e-06, "epoch": 0.738498789346247, "percentage": 14.77, "elapsed_time": "0:02:10", "remaining_time": "0:12:34", "throughput": 2002.92, "total_tokens": 262016} +{"current_steps": 310, "total_steps": 2065, "loss": 0.1165, "lr": 4.962911273504461e-06, "epoch": 0.7506053268765133, "percentage": 15.01, "elapsed_time": "0:02:11", "remaining_time": "0:12:22", "throughput": 2030.98, "total_tokens": 266432} +{"current_steps": 312, "total_steps": 2065, "eval_loss": 0.11303775012493134, "epoch": 0.7554479418886199, "percentage": 15.11, "elapsed_time": "0:02:13", "remaining_time": "0:12:27", "throughput": 2014.4, "total_tokens": 267968} +{"current_steps": 315, "total_steps": 2065, "loss": 0.181, "lr": 4.959196167936729e-06, "epoch": 0.7627118644067796, "percentage": 15.25, "elapsed_time": "0:03:02", "remaining_time": "0:16:54", "throughput": 1480.79, "total_tokens": 270464} +{"current_steps": 320, "total_steps": 2065, "loss": 0.0946, "lr": 4.955305294729056e-06, "epoch": 0.774818401937046, "percentage": 15.5, "elapsed_time": "0:03:03", "remaining_time": "0:16:38", "throughput": 1500.88, "total_tokens": 274688} +{"current_steps": 325, "total_steps": 2065, "loss": 0.1293, "lr": 4.9512389319762165e-06, "epoch": 0.7869249394673123, "percentage": 15.74, "elapsed_time": "0:03:03", "remaining_time": "0:16:21", "throughput": 1520.53, "total_tokens": 278848} +{"current_steps": 330, "total_steps": 2065, "loss": 0.124, "lr": 4.946997370315857e-06, "epoch": 0.7990314769975787, "percentage": 15.98, "elapsed_time": "0:03:03", "remaining_time": "0:16:06", "throughput": 1540.78, "total_tokens": 283136} +{"current_steps": 335, "total_steps": 2065, "loss": 0.1767, "lr": 4.9425809129077204e-06, "epoch": 0.8111380145278451, "percentage": 16.22, "elapsed_time": "0:03:04", "remaining_time": "0:15:50", "throughput": 1562.31, "total_tokens": 287680} +{"current_steps": 340, "total_steps": 2065, "loss": 0.0811, "lr": 4.937989875411986e-06, "epoch": 0.8232445520581114, "percentage": 16.46, "elapsed_time": "0:03:04", "remaining_time": "0:15:36", "throughput": 1583.75, "total_tokens": 292224} +{"current_steps": 345, "total_steps": 2065, "loss": 0.1567, "lr": 4.933224585966696e-06, "epoch": 0.8353510895883777, "percentage": 16.71, "elapsed_time": "0:03:04", "remaining_time": "0:15:21", "throughput": 1603.4, "total_tokens": 296448} +{"current_steps": 350, "total_steps": 2065, "loss": 0.1363, "lr": 4.928285385164316e-06, "epoch": 0.847457627118644, "percentage": 16.95, "elapsed_time": "0:03:05", "remaining_time": "0:15:07", "throughput": 1623.29, "total_tokens": 300736} +{"current_steps": 355, "total_steps": 2065, "loss": 0.1348, "lr": 4.92317262602738e-06, "epoch": 0.8595641646489104, "percentage": 17.19, "elapsed_time": "0:03:05", "remaining_time": "0:14:54", "throughput": 1642.76, "total_tokens": 304960} +{"current_steps": 360, "total_steps": 2065, "loss": 0.1694, "lr": 4.917886673983267e-06, "epoch": 0.8716707021791767, "percentage": 17.43, "elapsed_time": "0:03:06", "remaining_time": "0:14:40", "throughput": 1662.16, "total_tokens": 309184} +{"current_steps": 365, "total_steps": 2065, "loss": 0.1352, "lr": 4.912427906838079e-06, "epoch": 0.8837772397094431, "percentage": 17.68, "elapsed_time": "0:03:06", "remaining_time": "0:14:28", "throughput": 1681.51, "total_tokens": 313408} +{"current_steps": 370, "total_steps": 2065, "loss": 0.0933, "lr": 4.906796714749635e-06, "epoch": 0.8958837772397095, "percentage": 17.92, "elapsed_time": "0:03:06", "remaining_time": "0:14:15", "throughput": 1702.11, "total_tokens": 317888} +{"current_steps": 375, "total_steps": 2065, "loss": 0.1488, "lr": 4.900993500199591e-06, "epoch": 0.9079903147699758, "percentage": 18.16, "elapsed_time": "0:03:07", "remaining_time": "0:14:03", "throughput": 1720.96, "total_tokens": 322048} +{"current_steps": 380, "total_steps": 2065, "loss": 0.087, "lr": 4.895018677964669e-06, "epoch": 0.9200968523002422, "percentage": 18.4, "elapsed_time": "0:03:07", "remaining_time": "0:13:51", "throughput": 1741.7, "total_tokens": 326592} +{"current_steps": 385, "total_steps": 2065, "loss": 0.1017, "lr": 4.888872675087012e-06, "epoch": 0.9322033898305084, "percentage": 18.64, "elapsed_time": "0:03:07", "remaining_time": "0:13:39", "throughput": 1761.07, "total_tokens": 330880} +{"current_steps": 390, "total_steps": 2065, "loss": 0.1105, "lr": 4.882555930843664e-06, "epoch": 0.9443099273607748, "percentage": 18.89, "elapsed_time": "0:03:08", "remaining_time": "0:13:28", "throughput": 1780.03, "total_tokens": 335104} +{"current_steps": 395, "total_steps": 2065, "loss": 0.1437, "lr": 4.876068896715171e-06, "epoch": 0.9564164648910412, "percentage": 19.13, "elapsed_time": "0:03:08", "remaining_time": "0:13:17", "throughput": 1799.27, "total_tokens": 339392} +{"current_steps": 400, "total_steps": 2065, "loss": 0.146, "lr": 4.8694120363533105e-06, "epoch": 0.9685230024213075, "percentage": 19.37, "elapsed_time": "0:03:09", "remaining_time": "0:13:06", "throughput": 1818.74, "total_tokens": 343744} +{"current_steps": 405, "total_steps": 2065, "loss": 0.0985, "lr": 4.862585825547957e-06, "epoch": 0.9806295399515739, "percentage": 19.61, "elapsed_time": "0:03:09", "remaining_time": "0:12:56", "throughput": 1838.45, "total_tokens": 348160} +{"current_steps": 410, "total_steps": 2065, "loss": 0.116, "lr": 4.855590752193075e-06, "epoch": 0.9927360774818402, "percentage": 19.85, "elapsed_time": "0:03:09", "remaining_time": "0:12:45", "throughput": 1857.49, "total_tokens": 352448} +{"current_steps": 415, "total_steps": 2065, "loss": 0.1193, "lr": 4.848427316251843e-06, "epoch": 1.0048426150121066, "percentage": 20.1, "elapsed_time": "0:03:10", "remaining_time": "0:12:36", "throughput": 1875.08, "total_tokens": 356656} +{"current_steps": 416, "total_steps": 2065, "eval_loss": 0.10842076689004898, "epoch": 1.0072639225181599, "percentage": 20.15, "elapsed_time": "0:03:10", "remaining_time": "0:12:36", "throughput": 1872.78, "total_tokens": 357488} +{"current_steps": 420, "total_steps": 2065, "loss": 0.073, "lr": 4.841096029720921e-06, "epoch": 1.0169491525423728, "percentage": 20.34, "elapsed_time": "0:04:43", "remaining_time": "0:18:31", "throughput": 1271.96, "total_tokens": 360880} +{"current_steps": 425, "total_steps": 2065, "loss": 0.0535, "lr": 4.833597416593861e-06, "epoch": 1.0290556900726393, "percentage": 20.58, "elapsed_time": "0:04:44", "remaining_time": "0:18:16", "throughput": 1285.18, "total_tokens": 365104} +{"current_steps": 430, "total_steps": 2065, "loss": 0.1458, "lr": 4.825932012823652e-06, "epoch": 1.0411622276029056, "percentage": 20.82, "elapsed_time": "0:04:44", "remaining_time": "0:18:01", "throughput": 1299.92, "total_tokens": 369776} +{"current_steps": 435, "total_steps": 2065, "loss": 0.1602, "lr": 4.818100366284408e-06, "epoch": 1.053268765133172, "percentage": 21.07, "elapsed_time": "0:04:44", "remaining_time": "0:17:47", "throughput": 1313.06, "total_tokens": 374000} +{"current_steps": 440, "total_steps": 2065, "loss": 0.2577, "lr": 4.81010303673222e-06, "epoch": 1.0653753026634383, "percentage": 21.31, "elapsed_time": "0:04:45", "remaining_time": "0:17:33", "throughput": 1325.75, "total_tokens": 378096} +{"current_steps": 445, "total_steps": 2065, "loss": 0.0566, "lr": 4.80194059576514e-06, "epoch": 1.0774818401937045, "percentage": 21.55, "elapsed_time": "0:04:45", "remaining_time": "0:17:19", "throughput": 1338.63, "total_tokens": 382256} +{"current_steps": 450, "total_steps": 2065, "loss": 0.1761, "lr": 4.793613626782331e-06, "epoch": 1.089588377723971, "percentage": 21.79, "elapsed_time": "0:04:45", "remaining_time": "0:17:06", "throughput": 1352.35, "total_tokens": 386672} +{"current_steps": 455, "total_steps": 2065, "loss": 0.0591, "lr": 4.785122724942367e-06, "epoch": 1.1016949152542372, "percentage": 22.03, "elapsed_time": "0:04:46", "remaining_time": "0:16:53", "throughput": 1365.58, "total_tokens": 390960} +{"current_steps": 460, "total_steps": 2065, "loss": 0.0952, "lr": 4.7764684971206974e-06, "epoch": 1.1138014527845037, "percentage": 22.28, "elapsed_time": "0:04:46", "remaining_time": "0:16:40", "throughput": 1379.44, "total_tokens": 395440} +{"current_steps": 465, "total_steps": 2065, "loss": 0.0664, "lr": 4.767651561866269e-06, "epoch": 1.12590799031477, "percentage": 22.52, "elapsed_time": "0:04:47", "remaining_time": "0:16:27", "throughput": 1392.16, "total_tokens": 399600} +{"current_steps": 470, "total_steps": 2065, "loss": 0.1001, "lr": 4.758672549357316e-06, "epoch": 1.1380145278450362, "percentage": 22.76, "elapsed_time": "0:04:47", "remaining_time": "0:16:15", "throughput": 1405.3, "total_tokens": 403888} +{"current_steps": 475, "total_steps": 2065, "loss": 0.2506, "lr": 4.7495321013563225e-06, "epoch": 1.1501210653753027, "percentage": 23.0, "elapsed_time": "0:04:47", "remaining_time": "0:16:03", "throughput": 1418.41, "total_tokens": 408176} +{"current_steps": 480, "total_steps": 2065, "loss": 0.044, "lr": 4.740230871164148e-06, "epoch": 1.162227602905569, "percentage": 23.24, "elapsed_time": "0:04:48", "remaining_time": "0:15:51", "throughput": 1430.62, "total_tokens": 412208} +{"current_steps": 485, "total_steps": 2065, "loss": 0.1472, "lr": 4.730769523573337e-06, "epoch": 1.1743341404358354, "percentage": 23.49, "elapsed_time": "0:04:48", "remaining_time": "0:15:39", "throughput": 1444.09, "total_tokens": 416624} +{"current_steps": 490, "total_steps": 2065, "loss": 0.1661, "lr": 4.721148734820605e-06, "epoch": 1.1864406779661016, "percentage": 23.73, "elapsed_time": "0:04:48", "remaining_time": "0:15:28", "throughput": 1457.53, "total_tokens": 421040} +{"current_steps": 495, "total_steps": 2065, "loss": 0.094, "lr": 4.711369192538503e-06, "epoch": 1.1985472154963681, "percentage": 23.97, "elapsed_time": "0:04:49", "remaining_time": "0:15:17", "throughput": 1469.85, "total_tokens": 425136} +{"current_steps": 500, "total_steps": 2065, "loss": 0.1282, "lr": 4.701431595706269e-06, "epoch": 1.2106537530266344, "percentage": 24.21, "elapsed_time": "0:04:49", "remaining_time": "0:15:06", "throughput": 1483.63, "total_tokens": 429680} +{"current_steps": 505, "total_steps": 2065, "loss": 0.0874, "lr": 4.691336654599873e-06, "epoch": 1.2227602905569008, "percentage": 24.46, "elapsed_time": "0:04:49", "remaining_time": "0:14:55", "throughput": 1497.39, "total_tokens": 434224} +{"current_steps": 510, "total_steps": 2065, "loss": 0.0403, "lr": 4.6810850907412486e-06, "epoch": 1.234866828087167, "percentage": 24.7, "elapsed_time": "0:04:50", "remaining_time": "0:14:45", "throughput": 1509.6, "total_tokens": 438320} +{"current_steps": 515, "total_steps": 2065, "loss": 0.0227, "lr": 4.6706776368467236e-06, "epoch": 1.2469733656174333, "percentage": 24.94, "elapsed_time": "0:04:50", "remaining_time": "0:14:34", "throughput": 1522.66, "total_tokens": 442672} +{"current_steps": 520, "total_steps": 2065, "loss": 0.0685, "lr": 4.6601150367746485e-06, "epoch": 1.2590799031476998, "percentage": 25.18, "elapsed_time": "0:04:51", "remaining_time": "0:14:24", "throughput": 1535.25, "total_tokens": 446896} +{"current_steps": 520, "total_steps": 2065, "eval_loss": 0.19028596580028534, "epoch": 1.2590799031476998, "percentage": 25.18, "elapsed_time": "0:04:51", "remaining_time": "0:14:27", "throughput": 1531.47, "total_tokens": 446896} +{"current_steps": 525, "total_steps": 2065, "loss": 0.1008, "lr": 4.649398045472235e-06, "epoch": 1.271186440677966, "percentage": 25.42, "elapsed_time": "0:05:25", "remaining_time": "0:15:55", "throughput": 1385.14, "total_tokens": 451312} +{"current_steps": 530, "total_steps": 2065, "loss": 0.3076, "lr": 4.638527428921592e-06, "epoch": 1.2832929782082325, "percentage": 25.67, "elapsed_time": "0:05:26", "remaining_time": "0:15:44", "throughput": 1396.15, "total_tokens": 455408} +{"current_steps": 535, "total_steps": 2065, "loss": 0.0462, "lr": 4.627503964084981e-06, "epoch": 1.2953995157384988, "percentage": 25.91, "elapsed_time": "0:05:26", "remaining_time": "0:15:33", "throughput": 1408.87, "total_tokens": 460080} +{"current_steps": 540, "total_steps": 2065, "loss": 0.0124, "lr": 4.616328438849284e-06, "epoch": 1.307506053268765, "percentage": 26.15, "elapsed_time": "0:05:26", "remaining_time": "0:15:23", "throughput": 1420.78, "total_tokens": 464496} +{"current_steps": 545, "total_steps": 2065, "loss": 0.1408, "lr": 4.605001651969686e-06, "epoch": 1.3196125907990315, "percentage": 26.39, "elapsed_time": "0:05:27", "remaining_time": "0:15:12", "throughput": 1432.09, "total_tokens": 468720} +{"current_steps": 550, "total_steps": 2065, "loss": 0.115, "lr": 4.5935244130125925e-06, "epoch": 1.331719128329298, "percentage": 26.63, "elapsed_time": "0:05:27", "remaining_time": "0:15:02", "throughput": 1444.33, "total_tokens": 473264} +{"current_steps": 555, "total_steps": 2065, "loss": 0.0061, "lr": 4.581897542297761e-06, "epoch": 1.3438256658595642, "percentage": 26.88, "elapsed_time": "0:05:28", "remaining_time": "0:14:52", "throughput": 1455.77, "total_tokens": 477552} +{"current_steps": 560, "total_steps": 2065, "loss": 0.0843, "lr": 4.570121870839671e-06, "epoch": 1.3559322033898304, "percentage": 27.12, "elapsed_time": "0:05:28", "remaining_time": "0:14:42", "throughput": 1467.77, "total_tokens": 482032} +{"current_steps": 565, "total_steps": 2065, "loss": 0.0764, "lr": 4.558198240288131e-06, "epoch": 1.368038740920097, "percentage": 27.36, "elapsed_time": "0:05:28", "remaining_time": "0:14:32", "throughput": 1479.35, "total_tokens": 486384} +{"current_steps": 570, "total_steps": 2065, "loss": 0.1836, "lr": 4.5461275028681186e-06, "epoch": 1.3801452784503632, "percentage": 27.6, "elapsed_time": "0:05:29", "remaining_time": "0:14:23", "throughput": 1490.72, "total_tokens": 490672} +{"current_steps": 575, "total_steps": 2065, "loss": 0.1097, "lr": 4.533910521318872e-06, "epoch": 1.3922518159806296, "percentage": 27.85, "elapsed_time": "0:05:29", "remaining_time": "0:14:13", "throughput": 1502.06, "total_tokens": 494960} +{"current_steps": 580, "total_steps": 2065, "loss": 0.1144, "lr": 4.521548168832227e-06, "epoch": 1.4043583535108959, "percentage": 28.09, "elapsed_time": "0:05:29", "remaining_time": "0:14:04", "throughput": 1512.99, "total_tokens": 499120} +{"current_steps": 585, "total_steps": 2065, "loss": 0.0169, "lr": 4.509041328990204e-06, "epoch": 1.4164648910411621, "percentage": 28.33, "elapsed_time": "0:05:30", "remaining_time": "0:13:55", "throughput": 1524.3, "total_tokens": 503408} +{"current_steps": 590, "total_steps": 2065, "loss": 0.0424, "lr": 4.496390895701858e-06, "epoch": 1.4285714285714286, "percentage": 28.57, "elapsed_time": "0:05:30", "remaining_time": "0:13:46", "throughput": 1534.45, "total_tokens": 507312} +{"current_steps": 595, "total_steps": 2065, "loss": 0.053, "lr": 4.483597773139387e-06, "epoch": 1.4406779661016949, "percentage": 28.81, "elapsed_time": "0:05:30", "remaining_time": "0:13:37", "throughput": 1545.71, "total_tokens": 511600} +{"current_steps": 600, "total_steps": 2065, "loss": 0.0615, "lr": 4.470662875673506e-06, "epoch": 1.4527845036319613, "percentage": 29.06, "elapsed_time": "0:05:31", "remaining_time": "0:13:29", "throughput": 1556.94, "total_tokens": 515888} +{"current_steps": 605, "total_steps": 2065, "loss": 0.2071, "lr": 4.4575871278080964e-06, "epoch": 1.4648910411622276, "percentage": 29.3, "elapsed_time": "0:05:31", "remaining_time": "0:13:20", "throughput": 1567.38, "total_tokens": 519920} +{"current_steps": 610, "total_steps": 2065, "loss": 0.0688, "lr": 4.444371464114126e-06, "epoch": 1.4769975786924938, "percentage": 29.54, "elapsed_time": "0:05:32", "remaining_time": "0:13:12", "throughput": 1578.93, "total_tokens": 524336} +{"current_steps": 615, "total_steps": 2065, "loss": 0.071, "lr": 4.431016829162851e-06, "epoch": 1.4891041162227603, "percentage": 29.78, "elapsed_time": "0:05:32", "remaining_time": "0:13:03", "throughput": 1589.7, "total_tokens": 528496} +{"current_steps": 620, "total_steps": 2065, "loss": 0.0801, "lr": 4.417524177458309e-06, "epoch": 1.5012106537530268, "percentage": 30.02, "elapsed_time": "0:05:32", "remaining_time": "0:12:55", "throughput": 1600.84, "total_tokens": 532784} +{"current_steps": 624, "total_steps": 2065, "eval_loss": 0.1981746405363083, "epoch": 1.5108958837772397, "percentage": 30.22, "elapsed_time": "0:05:33", "remaining_time": "0:12:50", "throughput": 1606.65, "total_tokens": 536176} +{"current_steps": 625, "total_steps": 2065, "loss": 0.0258, "lr": 4.403894473369092e-06, "epoch": 1.513317191283293, "percentage": 30.27, "elapsed_time": "0:06:02", "remaining_time": "0:13:54", "throughput": 1483.2, "total_tokens": 537136} +{"current_steps": 630, "total_steps": 2065, "loss": 0.199, "lr": 4.390128691059423e-06, "epoch": 1.5254237288135593, "percentage": 30.51, "elapsed_time": "0:06:02", "remaining_time": "0:13:45", "throughput": 1493.87, "total_tokens": 541552} +{"current_steps": 635, "total_steps": 2065, "loss": 0.1964, "lr": 4.376227814419524e-06, "epoch": 1.5375302663438255, "percentage": 30.75, "elapsed_time": "0:06:02", "remaining_time": "0:13:37", "throughput": 1503.66, "total_tokens": 545648} +{"current_steps": 640, "total_steps": 2065, "loss": 0.06, "lr": 4.3621928369952995e-06, "epoch": 1.549636803874092, "percentage": 30.99, "elapsed_time": "0:06:03", "remaining_time": "0:13:28", "throughput": 1514.8, "total_tokens": 550256} +{"current_steps": 645, "total_steps": 2065, "loss": 0.1114, "lr": 4.348024761917321e-06, "epoch": 1.5617433414043584, "percentage": 31.23, "elapsed_time": "0:06:03", "remaining_time": "0:13:20", "throughput": 1526.09, "total_tokens": 554928} +{"current_steps": 650, "total_steps": 2065, "loss": 0.0725, "lr": 4.333724601829132e-06, "epoch": 1.5738498789346247, "percentage": 31.48, "elapsed_time": "0:06:03", "remaining_time": "0:13:12", "throughput": 1536.66, "total_tokens": 559344} +{"current_steps": 655, "total_steps": 2065, "loss": 0.1308, "lr": 4.319293378814868e-06, "epoch": 1.585956416464891, "percentage": 31.72, "elapsed_time": "0:06:04", "remaining_time": "0:13:04", "throughput": 1547.23, "total_tokens": 563760} +{"current_steps": 660, "total_steps": 2065, "loss": 0.0653, "lr": 4.3047321243262065e-06, "epoch": 1.5980629539951574, "percentage": 31.96, "elapsed_time": "0:06:04", "remaining_time": "0:12:56", "throughput": 1557.6, "total_tokens": 568112} +{"current_steps": 665, "total_steps": 2065, "loss": 0.006, "lr": 4.290041879108641e-06, "epoch": 1.6101694915254239, "percentage": 32.2, "elapsed_time": "0:06:05", "remaining_time": "0:12:48", "throughput": 1567.94, "total_tokens": 572464} +{"current_steps": 670, "total_steps": 2065, "loss": 0.0771, "lr": 4.275223693127103e-06, "epoch": 1.6222760290556901, "percentage": 32.45, "elapsed_time": "0:06:05", "remaining_time": "0:12:40", "throughput": 1578.09, "total_tokens": 576752} +{"current_steps": 675, "total_steps": 2065, "loss": 0.034, "lr": 4.260278625490911e-06, "epoch": 1.6343825665859564, "percentage": 32.69, "elapsed_time": "0:06:05", "remaining_time": "0:12:33", "throughput": 1588.04, "total_tokens": 580976} +{"current_steps": 680, "total_steps": 2065, "loss": 0.1429, "lr": 4.245207744378075e-06, "epoch": 1.6464891041162226, "percentage": 32.93, "elapsed_time": "0:06:06", "remaining_time": "0:12:25", "throughput": 1598.15, "total_tokens": 585264} +{"current_steps": 685, "total_steps": 2065, "loss": 0.0664, "lr": 4.2300121269589475e-06, "epoch": 1.658595641646489, "percentage": 33.17, "elapsed_time": "0:06:06", "remaining_time": "0:12:18", "throughput": 1608.74, "total_tokens": 589744} +{"current_steps": 690, "total_steps": 2065, "loss": 0.0792, "lr": 4.2146928593192375e-06, "epoch": 1.6707021791767556, "percentage": 33.41, "elapsed_time": "0:06:06", "remaining_time": "0:12:11", "throughput": 1618.65, "total_tokens": 593968} +{"current_steps": 695, "total_steps": 2065, "loss": 0.1061, "lr": 4.19925103638238e-06, "epoch": 1.6828087167070218, "percentage": 33.66, "elapsed_time": "0:06:07", "remaining_time": "0:12:04", "throughput": 1628.7, "total_tokens": 598256} +{"current_steps": 700, "total_steps": 2065, "loss": 0.0958, "lr": 4.183687761831282e-06, "epoch": 1.694915254237288, "percentage": 33.9, "elapsed_time": "0:06:07", "remaining_time": "0:11:56", "throughput": 1638.9, "total_tokens": 602608} +{"current_steps": 705, "total_steps": 2065, "loss": 0.1234, "lr": 4.168004148029435e-06, "epoch": 1.7070217917675545, "percentage": 34.14, "elapsed_time": "0:06:08", "remaining_time": "0:11:50", "throughput": 1649.42, "total_tokens": 607088} +{"current_steps": 710, "total_steps": 2065, "loss": 0.1094, "lr": 4.152201315941414e-06, "epoch": 1.7191283292978208, "percentage": 34.38, "elapsed_time": "0:06:08", "remaining_time": "0:11:43", "throughput": 1659.06, "total_tokens": 611248} +{"current_steps": 715, "total_steps": 2065, "loss": 0.1047, "lr": 4.136280395052754e-06, "epoch": 1.7312348668280872, "percentage": 34.62, "elapsed_time": "0:06:08", "remaining_time": "0:11:36", "throughput": 1669.02, "total_tokens": 615536} +{"current_steps": 720, "total_steps": 2065, "loss": 0.0341, "lr": 4.120242523289223e-06, "epoch": 1.7433414043583535, "percentage": 34.87, "elapsed_time": "0:06:09", "remaining_time": "0:11:29", "throughput": 1679.29, "total_tokens": 619952} +{"current_steps": 725, "total_steps": 2065, "loss": 0.2066, "lr": 4.104088846935493e-06, "epoch": 1.7554479418886197, "percentage": 35.11, "elapsed_time": "0:06:09", "remaining_time": "0:11:23", "throughput": 1689.56, "total_tokens": 624368} +{"current_steps": 728, "total_steps": 2065, "eval_loss": 0.14485575258731842, "epoch": 1.7627118644067796, "percentage": 35.25, "elapsed_time": "0:06:10", "remaining_time": "0:11:20", "throughput": 1692.87, "total_tokens": 626992} +{"current_steps": 730, "total_steps": 2065, "loss": 0.0104, "lr": 4.087820520553205e-06, "epoch": 1.7675544794188862, "percentage": 35.35, "elapsed_time": "0:07:04", "remaining_time": "0:12:56", "throughput": 1480.95, "total_tokens": 628720} +{"current_steps": 735, "total_steps": 2065, "loss": 0.0572, "lr": 4.071438706898457e-06, "epoch": 1.7796610169491527, "percentage": 35.59, "elapsed_time": "0:07:04", "remaining_time": "0:12:48", "throughput": 1489.76, "total_tokens": 633008} +{"current_steps": 740, "total_steps": 2065, "loss": 0.1222, "lr": 4.0549445768386895e-06, "epoch": 1.791767554479419, "percentage": 35.84, "elapsed_time": "0:07:05", "remaining_time": "0:12:41", "throughput": 1498.69, "total_tokens": 637360} +{"current_steps": 745, "total_steps": 2065, "loss": 0.1171, "lr": 4.038339309269002e-06, "epoch": 1.8038740920096852, "percentage": 36.08, "elapsed_time": "0:07:05", "remaining_time": "0:12:34", "throughput": 1507.47, "total_tokens": 641648} +{"current_steps": 750, "total_steps": 2065, "loss": 0.1638, "lr": 4.021624091027895e-06, "epoch": 1.8159806295399514, "percentage": 36.32, "elapsed_time": "0:07:06", "remaining_time": "0:12:26", "throughput": 1515.36, "total_tokens": 645552} +{"current_steps": 755, "total_steps": 2065, "loss": 0.1092, "lr": 4.00480011681244e-06, "epoch": 1.828087167070218, "percentage": 36.56, "elapsed_time": "0:07:06", "remaining_time": "0:12:19", "throughput": 1524.26, "total_tokens": 649904} +{"current_steps": 760, "total_steps": 2065, "loss": 0.1118, "lr": 3.987868589092894e-06, "epoch": 1.8401937046004844, "percentage": 36.8, "elapsed_time": "0:07:06", "remaining_time": "0:12:12", "throughput": 1532.85, "total_tokens": 654128} +{"current_steps": 765, "total_steps": 2065, "loss": 0.1015, "lr": 3.970830718026746e-06, "epoch": 1.8523002421307506, "percentage": 37.05, "elapsed_time": "0:07:07", "remaining_time": "0:12:05", "throughput": 1542.13, "total_tokens": 658672} +{"current_steps": 770, "total_steps": 2065, "loss": 0.1207, "lr": 3.9536877213722335e-06, "epoch": 1.8644067796610169, "percentage": 37.29, "elapsed_time": "0:07:07", "remaining_time": "0:11:58", "throughput": 1551.12, "total_tokens": 663088} +{"current_steps": 775, "total_steps": 2065, "loss": 0.083, "lr": 3.936440824401299e-06, "epoch": 1.8765133171912833, "percentage": 37.53, "elapsed_time": "0:07:07", "remaining_time": "0:11:52", "throughput": 1559.95, "total_tokens": 667440} +{"current_steps": 780, "total_steps": 2065, "loss": 0.0249, "lr": 3.919091259812013e-06, "epoch": 1.8886198547215496, "percentage": 37.77, "elapsed_time": "0:07:08", "remaining_time": "0:11:45", "throughput": 1568.76, "total_tokens": 671792} +{"current_steps": 785, "total_steps": 2065, "loss": 0.0425, "lr": 3.901640267640475e-06, "epoch": 1.900726392251816, "percentage": 38.01, "elapsed_time": "0:07:08", "remaining_time": "0:11:38", "throughput": 1578.01, "total_tokens": 676336} +{"current_steps": 790, "total_steps": 2065, "loss": 0.0402, "lr": 3.884089095172181e-06, "epoch": 1.9128329297820823, "percentage": 38.26, "elapsed_time": "0:07:08", "remaining_time": "0:11:32", "throughput": 1586.65, "total_tokens": 680624} +{"current_steps": 795, "total_steps": 2065, "loss": 0.0155, "lr": 3.866438996852873e-06, "epoch": 1.9249394673123486, "percentage": 38.5, "elapsed_time": "0:07:09", "remaining_time": "0:11:25", "throughput": 1595.56, "total_tokens": 685040} +{"current_steps": 800, "total_steps": 2065, "loss": 0.0372, "lr": 3.848691234198879e-06, "epoch": 1.937046004842615, "percentage": 38.74, "elapsed_time": "0:07:09", "remaining_time": "0:11:19", "throughput": 1604.31, "total_tokens": 689392} +{"current_steps": 805, "total_steps": 2065, "loss": 0.1257, "lr": 3.830847075706957e-06, "epoch": 1.9491525423728815, "percentage": 38.98, "elapsed_time": "0:07:10", "remaining_time": "0:11:13", "throughput": 1612.62, "total_tokens": 693552} +{"current_steps": 810, "total_steps": 2065, "loss": 0.0454, "lr": 3.812907796763616e-06, "epoch": 1.9612590799031477, "percentage": 39.23, "elapsed_time": "0:07:10", "remaining_time": "0:11:06", "throughput": 1621.64, "total_tokens": 698032} +{"current_steps": 815, "total_steps": 2065, "loss": 0.2136, "lr": 3.794874679553975e-06, "epoch": 1.973365617433414, "percentage": 39.47, "elapsed_time": "0:07:10", "remaining_time": "0:11:00", "throughput": 1629.5, "total_tokens": 702000} +{"current_steps": 820, "total_steps": 2065, "loss": 0.1643, "lr": 3.7767490129701057e-06, "epoch": 1.9854721549636802, "percentage": 39.71, "elapsed_time": "0:07:11", "remaining_time": "0:10:54", "throughput": 1637.76, "total_tokens": 706160} +{"current_steps": 825, "total_steps": 2065, "loss": 0.0475, "lr": 3.7585320925189246e-06, "epoch": 1.9975786924939467, "percentage": 39.95, "elapsed_time": "0:07:11", "remaining_time": "0:10:48", "throughput": 1647.02, "total_tokens": 710768} +{"current_steps": 830, "total_steps": 2065, "loss": 0.0011, "lr": 3.7402252202295876e-06, "epoch": 2.009685230024213, "percentage": 40.19, "elapsed_time": "0:07:12", "remaining_time": "0:10:42", "throughput": 1654.4, "total_tokens": 714744} +{"current_steps": 832, "total_steps": 2065, "eval_loss": 0.2067757099866867, "epoch": 2.0145278450363198, "percentage": 40.29, "elapsed_time": "0:07:12", "remaining_time": "0:10:41", "throughput": 1655.23, "total_tokens": 716344} +{"current_steps": 835, "total_steps": 2065, "loss": 0.0057, "lr": 3.7218297045604362e-06, "epoch": 2.0217917675544794, "percentage": 40.44, "elapsed_time": "0:08:22", "remaining_time": "0:12:20", "throughput": 1429.44, "total_tokens": 718776} +{"current_steps": 840, "total_steps": 2065, "loss": 0.0114, "lr": 3.703346860305473e-06, "epoch": 2.0338983050847457, "percentage": 40.68, "elapsed_time": "0:08:23", "remaining_time": "0:12:13", "throughput": 1436.3, "total_tokens": 722744} +{"current_steps": 845, "total_steps": 2065, "loss": 0.0047, "lr": 3.6847780085003908e-06, "epoch": 2.046004842615012, "percentage": 40.92, "elapsed_time": "0:08:23", "remaining_time": "0:12:07", "throughput": 1444.0, "total_tokens": 727160} +{"current_steps": 850, "total_steps": 2065, "loss": 0.0867, "lr": 3.666124476328155e-06, "epoch": 2.0581113801452786, "percentage": 41.16, "elapsed_time": "0:08:23", "remaining_time": "0:12:00", "throughput": 1451.7, "total_tokens": 731576} +{"current_steps": 855, "total_steps": 2065, "loss": 0.0084, "lr": 3.647387597024139e-06, "epoch": 2.070217917675545, "percentage": 41.4, "elapsed_time": "0:08:24", "remaining_time": "0:11:53", "throughput": 1459.77, "total_tokens": 736184} +{"current_steps": 860, "total_steps": 2065, "loss": 0.0011, "lr": 3.6285687097808396e-06, "epoch": 2.082324455205811, "percentage": 41.65, "elapsed_time": "0:08:24", "remaining_time": "0:11:47", "throughput": 1467.19, "total_tokens": 740472} +{"current_steps": 865, "total_steps": 2065, "loss": 0.0528, "lr": 3.609669159652158e-06, "epoch": 2.0944309927360774, "percentage": 41.89, "elapsed_time": "0:08:25", "remaining_time": "0:11:40", "throughput": 1474.6, "total_tokens": 744760} +{"current_steps": 870, "total_steps": 2065, "loss": 0.0003, "lr": 3.5906902974572623e-06, "epoch": 2.106537530266344, "percentage": 42.13, "elapsed_time": "0:08:25", "remaining_time": "0:11:34", "throughput": 1482.26, "total_tokens": 749176} +{"current_steps": 875, "total_steps": 2065, "loss": 0.0329, "lr": 3.5716334796840403e-06, "epoch": 2.1186440677966103, "percentage": 42.37, "elapsed_time": "0:08:25", "remaining_time": "0:11:27", "throughput": 1489.79, "total_tokens": 753528} +{"current_steps": 880, "total_steps": 2065, "loss": 0.0022, "lr": 3.5525000683921467e-06, "epoch": 2.1307506053268765, "percentage": 42.62, "elapsed_time": "0:08:26", "remaining_time": "0:11:21", "throughput": 1496.92, "total_tokens": 757688} +{"current_steps": 885, "total_steps": 2065, "loss": 0.0268, "lr": 3.533291431115653e-06, "epoch": 2.142857142857143, "percentage": 42.86, "elapsed_time": "0:08:26", "remaining_time": "0:11:15", "throughput": 1504.42, "total_tokens": 762040} +{"current_steps": 890, "total_steps": 2065, "loss": 0.0746, "lr": 3.514008940765304e-06, "epoch": 2.154963680387409, "percentage": 43.1, "elapsed_time": "0:08:26", "remaining_time": "0:11:09", "throughput": 1511.54, "total_tokens": 766200} +{"current_steps": 895, "total_steps": 2065, "loss": 0.0202, "lr": 3.494653975530388e-06, "epoch": 2.1670702179176757, "percentage": 43.34, "elapsed_time": "0:08:27", "remaining_time": "0:11:03", "throughput": 1519.26, "total_tokens": 770680} +{"current_steps": 900, "total_steps": 2065, "loss": 0.0023, "lr": 3.475227918780239e-06, "epoch": 2.179176755447942, "percentage": 43.58, "elapsed_time": "0:08:27", "remaining_time": "0:10:57", "throughput": 1526.36, "total_tokens": 774840} +{"current_steps": 905, "total_steps": 2065, "loss": 0.0001, "lr": 3.455732158965356e-06, "epoch": 2.1912832929782082, "percentage": 43.83, "elapsed_time": "0:08:28", "remaining_time": "0:10:51", "throughput": 1533.82, "total_tokens": 779192} +{"current_steps": 910, "total_steps": 2065, "loss": 0.0001, "lr": 3.436168089518168e-06, "epoch": 2.2033898305084745, "percentage": 44.07, "elapsed_time": "0:08:28", "remaining_time": "0:10:45", "throughput": 1541.38, "total_tokens": 783608} +{"current_steps": 915, "total_steps": 2065, "loss": 0.0365, "lr": 3.4165371087534428e-06, "epoch": 2.2154963680387407, "percentage": 44.31, "elapsed_time": "0:08:28", "remaining_time": "0:10:39", "throughput": 1549.06, "total_tokens": 788088} +{"current_steps": 920, "total_steps": 2065, "loss": 0.0, "lr": 3.396840619768338e-06, "epoch": 2.2276029055690074, "percentage": 44.55, "elapsed_time": "0:08:29", "remaining_time": "0:10:33", "throughput": 1556.72, "total_tokens": 792568} +{"current_steps": 925, "total_steps": 2065, "loss": 0.0001, "lr": 3.377080030342125e-06, "epoch": 2.2397094430992737, "percentage": 44.79, "elapsed_time": "0:08:29", "remaining_time": "0:10:27", "throughput": 1564.63, "total_tokens": 797176} +{"current_steps": 930, "total_steps": 2065, "loss": 0.0038, "lr": 3.3572567528355614e-06, "epoch": 2.25181598062954, "percentage": 45.04, "elapsed_time": "0:08:29", "remaining_time": "0:10:22", "throughput": 1571.78, "total_tokens": 801400} +{"current_steps": 935, "total_steps": 2065, "loss": 0.0059, "lr": 3.3373722040899515e-06, "epoch": 2.263922518159806, "percentage": 45.28, "elapsed_time": "0:08:30", "remaining_time": "0:10:16", "throughput": 1579.54, "total_tokens": 805944} +{"current_steps": 936, "total_steps": 2065, "eval_loss": 0.26913806796073914, "epoch": 2.2663438256658597, "percentage": 45.33, "elapsed_time": "0:08:30", "remaining_time": "0:10:16", "throughput": 1578.75, "total_tokens": 806712} +{"current_steps": 940, "total_steps": 2065, "loss": 0.0006, "lr": 3.3174278053258753e-06, "epoch": 2.2760290556900724, "percentage": 45.52, "elapsed_time": "0:09:14", "remaining_time": "0:11:03", "throughput": 1460.35, "total_tokens": 810040} +{"current_steps": 945, "total_steps": 2065, "loss": 0.0482, "lr": 3.2974249820416094e-06, "epoch": 2.288135593220339, "percentage": 45.76, "elapsed_time": "0:09:15", "remaining_time": "0:10:57", "throughput": 1467.22, "total_tokens": 814392} +{"current_steps": 950, "total_steps": 2065, "loss": 0.0175, "lr": 3.2773651639112432e-06, "epoch": 2.3002421307506054, "percentage": 46.0, "elapsed_time": "0:09:15", "remaining_time": "0:10:51", "throughput": 1474.29, "total_tokens": 818872} +{"current_steps": 955, "total_steps": 2065, "loss": 0.0039, "lr": 3.2572497846824922e-06, "epoch": 2.3123486682808716, "percentage": 46.25, "elapsed_time": "0:09:15", "remaining_time": "0:10:46", "throughput": 1480.92, "total_tokens": 823096} +{"current_steps": 960, "total_steps": 2065, "loss": 0.0549, "lr": 3.2370802820742273e-06, "epoch": 2.324455205811138, "percentage": 46.49, "elapsed_time": "0:09:16", "remaining_time": "0:10:40", "throughput": 1487.2, "total_tokens": 827128} +{"current_steps": 965, "total_steps": 2065, "loss": 0.0011, "lr": 3.2168580976737105e-06, "epoch": 2.3365617433414045, "percentage": 46.73, "elapsed_time": "0:09:16", "remaining_time": "0:10:34", "throughput": 1493.7, "total_tokens": 831288} +{"current_steps": 970, "total_steps": 2065, "loss": 0.0202, "lr": 3.1965846768335625e-06, "epoch": 2.348668280871671, "percentage": 46.97, "elapsed_time": "0:09:16", "remaining_time": "0:10:28", "throughput": 1500.52, "total_tokens": 835640} +{"current_steps": 975, "total_steps": 2065, "loss": 0.0019, "lr": 3.176261468568457e-06, "epoch": 2.360774818401937, "percentage": 47.22, "elapsed_time": "0:09:17", "remaining_time": "0:10:22", "throughput": 1506.89, "total_tokens": 839736} +{"current_steps": 980, "total_steps": 2065, "loss": 0.0363, "lr": 3.155889925451557e-06, "epoch": 2.3728813559322033, "percentage": 47.46, "elapsed_time": "0:09:17", "remaining_time": "0:10:17", "throughput": 1513.58, "total_tokens": 844024} +{"current_steps": 985, "total_steps": 2065, "loss": 0.0001, "lr": 3.1354715035106892e-06, "epoch": 2.38498789346247, "percentage": 47.7, "elapsed_time": "0:09:18", "remaining_time": "0:10:11", "throughput": 1520.15, "total_tokens": 848248} +{"current_steps": 990, "total_steps": 2065, "loss": 0.0, "lr": 3.115007662124282e-06, "epoch": 2.3970944309927362, "percentage": 47.94, "elapsed_time": "0:09:18", "remaining_time": "0:10:06", "throughput": 1526.71, "total_tokens": 852472} +{"current_steps": 995, "total_steps": 2065, "loss": 0.0006, "lr": 3.0944998639170544e-06, "epoch": 2.4092009685230025, "percentage": 48.18, "elapsed_time": "0:09:18", "remaining_time": "0:10:00", "throughput": 1533.49, "total_tokens": 856824} +{"current_steps": 1000, "total_steps": 2065, "loss": 0.0018, "lr": 3.0739495746554785e-06, "epoch": 2.4213075060532687, "percentage": 48.43, "elapsed_time": "0:09:19", "remaining_time": "0:09:55", "throughput": 1539.92, "total_tokens": 860984} +{"current_steps": 1005, "total_steps": 2065, "loss": 0.068, "lr": 3.0533582631430153e-06, "epoch": 2.433414043583535, "percentage": 48.67, "elapsed_time": "0:09:19", "remaining_time": "0:09:50", "throughput": 1546.57, "total_tokens": 865272} +{"current_steps": 1010, "total_steps": 2065, "loss": 0.0395, "lr": 3.0327274011151355e-06, "epoch": 2.4455205811138017, "percentage": 48.91, "elapsed_time": "0:09:19", "remaining_time": "0:09:44", "throughput": 1553.21, "total_tokens": 869560} +{"current_steps": 1015, "total_steps": 2065, "loss": 0.0, "lr": 3.012058463134126e-06, "epoch": 2.457627118644068, "percentage": 49.15, "elapsed_time": "0:09:20", "remaining_time": "0:09:39", "throughput": 1560.06, "total_tokens": 873976} +{"current_steps": 1020, "total_steps": 2065, "loss": 0.0, "lr": 2.991352926483702e-06, "epoch": 2.469733656174334, "percentage": 49.39, "elapsed_time": "0:09:20", "remaining_time": "0:09:34", "throughput": 1566.57, "total_tokens": 878200} +{"current_steps": 1025, "total_steps": 2065, "loss": 0.0008, "lr": 2.9706122710634166e-06, "epoch": 2.4818401937046004, "percentage": 49.64, "elapsed_time": "0:09:20", "remaining_time": "0:09:29", "throughput": 1573.86, "total_tokens": 882872} +{"current_steps": 1030, "total_steps": 2065, "loss": 0.0, "lr": 2.949837979282889e-06, "epoch": 2.4939467312348667, "percentage": 49.88, "elapsed_time": "0:09:21", "remaining_time": "0:09:24", "throughput": 1580.35, "total_tokens": 887096} +{"current_steps": 1035, "total_steps": 2065, "loss": 0.0032, "lr": 2.9290315359558504e-06, "epoch": 2.5060532687651333, "percentage": 50.12, "elapsed_time": "0:09:21", "remaining_time": "0:09:18", "throughput": 1587.28, "total_tokens": 891576} +{"current_steps": 1040, "total_steps": 2065, "loss": 0.0756, "lr": 2.908194428194019e-06, "epoch": 2.5181598062953996, "percentage": 50.36, "elapsed_time": "0:09:22", "remaining_time": "0:09:13", "throughput": 1593.64, "total_tokens": 895736} +{"current_steps": 1040, "total_steps": 2065, "eval_loss": 0.28947436809539795, "epoch": 2.5181598062953996, "percentage": 50.36, "elapsed_time": "0:09:22", "remaining_time": "0:09:14", "throughput": 1591.83, "total_tokens": 895736} +{"current_steps": 1045, "total_steps": 2065, "loss": 0.0001, "lr": 2.88732814530081e-06, "epoch": 2.530266343825666, "percentage": 50.61, "elapsed_time": "0:09:56", "remaining_time": "0:09:42", "throughput": 1508.91, "total_tokens": 900024} +{"current_steps": 1050, "total_steps": 2065, "loss": 0.0128, "lr": 2.8664341786648932e-06, "epoch": 2.542372881355932, "percentage": 50.85, "elapsed_time": "0:09:56", "remaining_time": "0:09:36", "throughput": 1515.38, "total_tokens": 904440} +{"current_steps": 1055, "total_steps": 2065, "loss": 0.0001, "lr": 2.845514021653595e-06, "epoch": 2.5544794188861983, "percentage": 51.09, "elapsed_time": "0:09:57", "remaining_time": "0:09:31", "throughput": 1521.62, "total_tokens": 908728} +{"current_steps": 1060, "total_steps": 2065, "loss": 0.0443, "lr": 2.8245691695061605e-06, "epoch": 2.566585956416465, "percentage": 51.33, "elapsed_time": "0:09:57", "remaining_time": "0:09:26", "throughput": 1527.86, "total_tokens": 913016} +{"current_steps": 1065, "total_steps": 2065, "loss": 0.0032, "lr": 2.8036011192268863e-06, "epoch": 2.5786924939467313, "percentage": 51.57, "elapsed_time": "0:09:57", "remaining_time": "0:09:21", "throughput": 1534.09, "total_tokens": 917304} +{"current_steps": 1070, "total_steps": 2065, "loss": 0.0001, "lr": 2.7826113694781254e-06, "epoch": 2.5907990314769975, "percentage": 51.82, "elapsed_time": "0:09:58", "remaining_time": "0:09:16", "throughput": 1540.21, "total_tokens": 921528} +{"current_steps": 1075, "total_steps": 2065, "loss": 0.0, "lr": 2.7616014204731683e-06, "epoch": 2.6029055690072638, "percentage": 52.06, "elapsed_time": "0:09:58", "remaining_time": "0:09:11", "throughput": 1546.64, "total_tokens": 925944} +{"current_steps": 1080, "total_steps": 2065, "loss": 0.0001, "lr": 2.7405727738690193e-06, "epoch": 2.61501210653753, "percentage": 52.3, "elapsed_time": "0:09:59", "remaining_time": "0:09:06", "throughput": 1553.68, "total_tokens": 930744} +{"current_steps": 1085, "total_steps": 2065, "loss": 0.0725, "lr": 2.7195269326590685e-06, "epoch": 2.6271186440677967, "percentage": 52.54, "elapsed_time": "0:09:59", "remaining_time": "0:09:01", "throughput": 1560.39, "total_tokens": 935352} +{"current_steps": 1090, "total_steps": 2065, "loss": 0.0295, "lr": 2.698465401065667e-06, "epoch": 2.639225181598063, "percentage": 52.78, "elapsed_time": "0:09:59", "remaining_time": "0:08:56", "throughput": 1566.57, "total_tokens": 939640} +{"current_steps": 1095, "total_steps": 2065, "loss": 0.0001, "lr": 2.6773896844326126e-06, "epoch": 2.651331719128329, "percentage": 53.03, "elapsed_time": "0:10:00", "remaining_time": "0:08:51", "throughput": 1572.33, "total_tokens": 943672} +{"current_steps": 1100, "total_steps": 2065, "loss": 0.0001, "lr": 2.656301289117561e-06, "epoch": 2.663438256658596, "percentage": 53.27, "elapsed_time": "0:10:00", "remaining_time": "0:08:46", "throughput": 1578.1, "total_tokens": 947704} +{"current_steps": 1105, "total_steps": 2065, "loss": 0.0196, "lr": 2.6352017223843584e-06, "epoch": 2.6755447941888617, "percentage": 53.51, "elapsed_time": "0:10:00", "remaining_time": "0:08:42", "throughput": 1584.16, "total_tokens": 951928} +{"current_steps": 1110, "total_steps": 2065, "loss": 0.0294, "lr": 2.6140924922953125e-06, "epoch": 2.6876513317191284, "percentage": 53.75, "elapsed_time": "0:10:01", "remaining_time": "0:08:37", "throughput": 1590.32, "total_tokens": 956216} +{"current_steps": 1115, "total_steps": 2065, "loss": 0.0001, "lr": 2.592975107603406e-06, "epoch": 2.6997578692493946, "percentage": 54.0, "elapsed_time": "0:10:01", "remaining_time": "0:08:32", "throughput": 1596.47, "total_tokens": 960504} +{"current_steps": 1120, "total_steps": 2065, "loss": 0.0135, "lr": 2.571851077644461e-06, "epoch": 2.711864406779661, "percentage": 54.24, "elapsed_time": "0:10:02", "remaining_time": "0:08:27", "throughput": 1603.03, "total_tokens": 965048} +{"current_steps": 1125, "total_steps": 2065, "loss": 0.0001, "lr": 2.55072191222926e-06, "epoch": 2.7239709443099276, "percentage": 54.48, "elapsed_time": "0:10:02", "remaining_time": "0:08:23", "throughput": 1608.96, "total_tokens": 969208} +{"current_steps": 1130, "total_steps": 2065, "loss": 0.0991, "lr": 2.5295891215356362e-06, "epoch": 2.736077481840194, "percentage": 54.72, "elapsed_time": "0:10:02", "remaining_time": "0:08:18", "throughput": 1615.29, "total_tokens": 973624} +{"current_steps": 1135, "total_steps": 2065, "loss": 0.0064, "lr": 2.5084542160005338e-06, "epoch": 2.74818401937046, "percentage": 54.96, "elapsed_time": "0:10:03", "remaining_time": "0:08:14", "throughput": 1621.51, "total_tokens": 977976} +{"current_steps": 1140, "total_steps": 2065, "loss": 0.0001, "lr": 2.4873187062120515e-06, "epoch": 2.7602905569007263, "percentage": 55.21, "elapsed_time": "0:10:03", "remaining_time": "0:08:09", "throughput": 1627.52, "total_tokens": 982200} +{"current_steps": 1144, "total_steps": 2065, "eval_loss": 0.22601255774497986, "epoch": 2.7699757869249395, "percentage": 55.4, "elapsed_time": "0:10:04", "remaining_time": "0:08:06", "throughput": 1630.57, "total_tokens": 985592} +{"current_steps": 1145, "total_steps": 2065, "loss": 0.0002, "lr": 2.4661841028014786e-06, "epoch": 2.7723970944309926, "percentage": 55.45, "elapsed_time": "0:11:04", "remaining_time": "0:08:53", "throughput": 1485.47, "total_tokens": 986488} +{"current_steps": 1150, "total_steps": 2065, "loss": 0.0002, "lr": 2.445051916335321e-06, "epoch": 2.7845036319612593, "percentage": 55.69, "elapsed_time": "0:11:04", "remaining_time": "0:08:48", "throughput": 1490.64, "total_tokens": 990456} +{"current_steps": 1155, "total_steps": 2065, "loss": 0.0766, "lr": 2.4239236572073354e-06, "epoch": 2.7966101694915255, "percentage": 55.93, "elapsed_time": "0:11:04", "remaining_time": "0:08:43", "throughput": 1496.25, "total_tokens": 994744} +{"current_steps": 1160, "total_steps": 2065, "loss": 0.0501, "lr": 2.4028008355305817e-06, "epoch": 2.8087167070217918, "percentage": 56.17, "elapsed_time": "0:11:05", "remaining_time": "0:08:38", "throughput": 1502.06, "total_tokens": 999160} +{"current_steps": 1165, "total_steps": 2065, "loss": 0.0289, "lr": 2.3816849610294784e-06, "epoch": 2.820823244552058, "percentage": 56.42, "elapsed_time": "0:11:05", "remaining_time": "0:08:34", "throughput": 1507.39, "total_tokens": 1003256} +{"current_steps": 1170, "total_steps": 2065, "loss": 0.0884, "lr": 2.3605775429319115e-06, "epoch": 2.8329297820823243, "percentage": 56.66, "elapsed_time": "0:11:05", "remaining_time": "0:08:29", "throughput": 1512.89, "total_tokens": 1007480} +{"current_steps": 1175, "total_steps": 2065, "loss": 0.0004, "lr": 2.3394800898613536e-06, "epoch": 2.845036319612591, "percentage": 56.9, "elapsed_time": "0:11:06", "remaining_time": "0:08:24", "throughput": 1518.65, "total_tokens": 1011896} +{"current_steps": 1180, "total_steps": 2065, "loss": 0.0004, "lr": 2.318394109729041e-06, "epoch": 2.857142857142857, "percentage": 57.14, "elapsed_time": "0:11:06", "remaining_time": "0:08:20", "throughput": 1523.96, "total_tokens": 1015992} +{"current_steps": 1185, "total_steps": 2065, "loss": 0.003, "lr": 2.297321109626198e-06, "epoch": 2.8692493946731235, "percentage": 57.38, "elapsed_time": "0:11:07", "remaining_time": "0:08:15", "throughput": 1529.74, "total_tokens": 1020408} +{"current_steps": 1190, "total_steps": 2065, "loss": 0.0003, "lr": 2.27626259571632e-06, "epoch": 2.8813559322033897, "percentage": 57.63, "elapsed_time": "0:11:07", "remaining_time": "0:08:10", "throughput": 1535.77, "total_tokens": 1025016} +{"current_steps": 1195, "total_steps": 2065, "loss": 0.0571, "lr": 2.2552200731275215e-06, "epoch": 2.893462469733656, "percentage": 57.87, "elapsed_time": "0:11:07", "remaining_time": "0:08:06", "throughput": 1541.44, "total_tokens": 1029368} +{"current_steps": 1200, "total_steps": 2065, "loss": 0.0007, "lr": 2.2341950458449576e-06, "epoch": 2.9055690072639226, "percentage": 58.11, "elapsed_time": "0:11:08", "remaining_time": "0:08:01", "throughput": 1546.91, "total_tokens": 1033592} +{"current_steps": 1205, "total_steps": 2065, "loss": 0.0001, "lr": 2.2131890166033333e-06, "epoch": 2.917675544794189, "percentage": 58.35, "elapsed_time": "0:11:08", "remaining_time": "0:07:57", "throughput": 1552.19, "total_tokens": 1037688} +{"current_steps": 1210, "total_steps": 2065, "loss": 0.0136, "lr": 2.1922034867794923e-06, "epoch": 2.929782082324455, "percentage": 58.6, "elapsed_time": "0:11:08", "remaining_time": "0:07:52", "throughput": 1557.65, "total_tokens": 1041912} +{"current_steps": 1215, "total_steps": 2065, "loss": 0.0003, "lr": 2.171239956285115e-06, "epoch": 2.9418886198547214, "percentage": 58.84, "elapsed_time": "0:11:09", "remaining_time": "0:07:48", "throughput": 1563.48, "total_tokens": 1046392} +{"current_steps": 1220, "total_steps": 2065, "loss": 0.0001, "lr": 2.150299923459505e-06, "epoch": 2.9539951573849876, "percentage": 59.08, "elapsed_time": "0:11:09", "remaining_time": "0:07:43", "throughput": 1568.93, "total_tokens": 1050616} +{"current_steps": 1225, "total_steps": 2065, "loss": 0.0001, "lr": 2.1293848849625065e-06, "epoch": 2.9661016949152543, "percentage": 59.32, "elapsed_time": "0:11:10", "remaining_time": "0:07:39", "throughput": 1574.38, "total_tokens": 1054840} +{"current_steps": 1230, "total_steps": 2065, "loss": 0.0001, "lr": 2.108496335667527e-06, "epoch": 2.9782082324455206, "percentage": 59.56, "elapsed_time": "0:11:10", "remaining_time": "0:07:35", "throughput": 1579.63, "total_tokens": 1058936} +{"current_steps": 1235, "total_steps": 2065, "loss": 0.0001, "lr": 2.0876357685546942e-06, "epoch": 2.990314769975787, "percentage": 59.81, "elapsed_time": "0:11:10", "remaining_time": "0:07:30", "throughput": 1585.26, "total_tokens": 1063288} +{"current_steps": 1240, "total_steps": 2065, "loss": 0.0, "lr": 2.0668046746041497e-06, "epoch": 3.002421307506053, "percentage": 60.05, "elapsed_time": "0:11:11", "remaining_time": "0:07:26", "throughput": 1590.19, "total_tokens": 1067392} +{"current_steps": 1245, "total_steps": 2065, "loss": 0.0, "lr": 2.0460045426894816e-06, "epoch": 3.0145278450363198, "percentage": 60.29, "elapsed_time": "0:11:11", "remaining_time": "0:07:22", "throughput": 1595.98, "total_tokens": 1071872} +{"current_steps": 1248, "total_steps": 2065, "eval_loss": 0.22526989877223969, "epoch": 3.0217917675544794, "percentage": 60.44, "elapsed_time": "0:11:12", "remaining_time": "0:07:20", "throughput": 1598.06, "total_tokens": 1074624} +{"current_steps": 1250, "total_steps": 2065, "loss": 0.0, "lr": 2.0252368594713083e-06, "epoch": 3.026634382566586, "percentage": 60.53, "elapsed_time": "0:11:53", "remaining_time": "0:07:45", "throughput": 1507.92, "total_tokens": 1076416} +{"current_steps": 1255, "total_steps": 2065, "loss": 0.0024, "lr": 2.004503109291023e-06, "epoch": 3.0387409200968523, "percentage": 60.77, "elapsed_time": "0:11:54", "remaining_time": "0:07:40", "throughput": 1512.89, "total_tokens": 1080512} +{"current_steps": 1260, "total_steps": 2065, "loss": 0.0, "lr": 1.9838047740647024e-06, "epoch": 3.0508474576271185, "percentage": 61.02, "elapsed_time": "0:11:54", "remaining_time": "0:07:36", "throughput": 1517.85, "total_tokens": 1084608} +{"current_steps": 1265, "total_steps": 2065, "loss": 0.0, "lr": 1.9631433331771886e-06, "epoch": 3.062953995157385, "percentage": 61.26, "elapsed_time": "0:11:54", "remaining_time": "0:07:32", "throughput": 1523.24, "total_tokens": 1089024} +{"current_steps": 1270, "total_steps": 2065, "loss": 0.0, "lr": 1.942520263376351e-06, "epoch": 3.0750605326876514, "percentage": 61.5, "elapsed_time": "0:11:55", "remaining_time": "0:07:27", "throughput": 1528.54, "total_tokens": 1093376} +{"current_steps": 1275, "total_steps": 2065, "loss": 0.0, "lr": 1.921937038667539e-06, "epoch": 3.0871670702179177, "percentage": 61.74, "elapsed_time": "0:11:55", "remaining_time": "0:07:23", "throughput": 1533.84, "total_tokens": 1097728} +{"current_steps": 1280, "total_steps": 2065, "loss": 0.0831, "lr": 1.901395130208229e-06, "epoch": 3.099273607748184, "percentage": 61.99, "elapsed_time": "0:11:56", "remaining_time": "0:07:19", "throughput": 1538.86, "total_tokens": 1101888} +{"current_steps": 1285, "total_steps": 2065, "loss": 0.0, "lr": 1.880896006202876e-06, "epoch": 3.11138014527845, "percentage": 62.23, "elapsed_time": "0:11:56", "remaining_time": "0:07:14", "throughput": 1544.06, "total_tokens": 1106176} +{"current_steps": 1290, "total_steps": 2065, "loss": 0.0, "lr": 1.860441131797977e-06, "epoch": 3.123486682808717, "percentage": 62.47, "elapsed_time": "0:11:56", "remaining_time": "0:07:10", "throughput": 1549.0, "total_tokens": 1110272} +{"current_steps": 1295, "total_steps": 2065, "loss": 0.0001, "lr": 1.8400319689773474e-06, "epoch": 3.135593220338983, "percentage": 62.71, "elapsed_time": "0:11:57", "remaining_time": "0:07:06", "throughput": 1554.09, "total_tokens": 1114496} +{"current_steps": 1300, "total_steps": 2065, "loss": 0.0001, "lr": 1.8196699764576316e-06, "epoch": 3.1476997578692494, "percentage": 62.95, "elapsed_time": "0:11:57", "remaining_time": "0:07:02", "throughput": 1559.27, "total_tokens": 1118784} +{"current_steps": 1305, "total_steps": 2065, "loss": 0.0001, "lr": 1.7993566095840442e-06, "epoch": 3.1598062953995156, "percentage": 63.2, "elapsed_time": "0:11:57", "remaining_time": "0:06:58", "throughput": 1564.36, "total_tokens": 1123008} +{"current_steps": 1310, "total_steps": 2065, "loss": 0.0001, "lr": 1.7790933202263437e-06, "epoch": 3.171912832929782, "percentage": 63.44, "elapsed_time": "0:11:58", "remaining_time": "0:06:53", "throughput": 1569.7, "total_tokens": 1127424} +{"current_steps": 1315, "total_steps": 2065, "loss": 0.0001, "lr": 1.7588815566750728e-06, "epoch": 3.1840193704600486, "percentage": 63.68, "elapsed_time": "0:11:58", "remaining_time": "0:06:49", "throughput": 1575.04, "total_tokens": 1131840} +{"current_steps": 1320, "total_steps": 2065, "loss": 0.0001, "lr": 1.7387227635380362e-06, "epoch": 3.196125907990315, "percentage": 63.92, "elapsed_time": "0:11:58", "remaining_time": "0:06:45", "throughput": 1580.28, "total_tokens": 1136192} +{"current_steps": 1325, "total_steps": 2065, "loss": 0.0001, "lr": 1.7186183816370522e-06, "epoch": 3.208232445520581, "percentage": 64.16, "elapsed_time": "0:11:59", "remaining_time": "0:06:41", "throughput": 1585.52, "total_tokens": 1140544} +{"current_steps": 1330, "total_steps": 2065, "loss": 0.0001, "lr": 1.6985698479049703e-06, "epoch": 3.2203389830508473, "percentage": 64.41, "elapsed_time": "0:11:59", "remaining_time": "0:06:37", "throughput": 1591.27, "total_tokens": 1145280} +{"current_steps": 1335, "total_steps": 2065, "loss": 0.0039, "lr": 1.6785785952829718e-06, "epoch": 3.232445520581114, "percentage": 64.65, "elapsed_time": "0:12:00", "remaining_time": "0:06:33", "throughput": 1596.85, "total_tokens": 1149888} +{"current_steps": 1340, "total_steps": 2065, "loss": 0.0, "lr": 1.6586460526181476e-06, "epoch": 3.2445520581113803, "percentage": 64.89, "elapsed_time": "0:12:00", "remaining_time": "0:06:29", "throughput": 1601.65, "total_tokens": 1153920} +{"current_steps": 1345, "total_steps": 2065, "loss": 0.0, "lr": 1.6387736445613772e-06, "epoch": 3.2566585956416465, "percentage": 65.13, "elapsed_time": "0:12:00", "remaining_time": "0:06:25", "throughput": 1607.3, "total_tokens": 1158592} +{"current_steps": 1350, "total_steps": 2065, "loss": 0.0, "lr": 1.618962791465501e-06, "epoch": 3.2687651331719128, "percentage": 65.38, "elapsed_time": "0:12:01", "remaining_time": "0:06:21", "throughput": 1612.33, "total_tokens": 1162816} +{"current_steps": 1352, "total_steps": 2065, "eval_loss": 0.25782492756843567, "epoch": 3.2736077481840193, "percentage": 65.47, "elapsed_time": "0:12:01", "remaining_time": "0:06:20", "throughput": 1613.0, "total_tokens": 1164544} +{"current_steps": 1355, "total_steps": 2065, "loss": 0.0002, "lr": 1.599214909283805e-06, "epoch": 3.280871670702179, "percentage": 65.62, "elapsed_time": "0:12:51", "remaining_time": "0:06:44", "throughput": 1513.19, "total_tokens": 1167232} +{"current_steps": 1360, "total_steps": 2065, "loss": 0.0, "lr": 1.579531409468815e-06, "epoch": 3.2929782082324457, "percentage": 65.86, "elapsed_time": "0:12:51", "remaining_time": "0:06:40", "throughput": 1518.18, "total_tokens": 1171648} +{"current_steps": 1365, "total_steps": 2065, "loss": 0.0, "lr": 1.5599136988714186e-06, "epoch": 3.305084745762712, "percentage": 66.1, "elapsed_time": "0:12:52", "remaining_time": "0:06:35", "throughput": 1522.86, "total_tokens": 1175808} +{"current_steps": 1370, "total_steps": 2065, "loss": 0.0, "lr": 1.5403631796403085e-06, "epoch": 3.317191283292978, "percentage": 66.34, "elapsed_time": "0:12:52", "remaining_time": "0:06:31", "throughput": 1527.84, "total_tokens": 1180224} +{"current_steps": 1375, "total_steps": 2065, "loss": 0.0, "lr": 1.5208812491217669e-06, "epoch": 3.3292978208232444, "percentage": 66.59, "elapsed_time": "0:12:52", "remaining_time": "0:06:27", "throughput": 1532.9, "total_tokens": 1184704} +{"current_steps": 1380, "total_steps": 2065, "loss": 0.053, "lr": 1.5014692997597962e-06, "epoch": 3.341404358353511, "percentage": 66.83, "elapsed_time": "0:12:53", "remaining_time": "0:06:23", "throughput": 1537.72, "total_tokens": 1188992} +{"current_steps": 1385, "total_steps": 2065, "loss": 0.0, "lr": 1.4821287189965865e-06, "epoch": 3.3535108958837774, "percentage": 67.07, "elapsed_time": "0:12:53", "remaining_time": "0:06:19", "throughput": 1542.69, "total_tokens": 1193408} +{"current_steps": 1390, "total_steps": 2065, "loss": 0.0002, "lr": 1.4628608891733626e-06, "epoch": 3.3656174334140436, "percentage": 67.31, "elapsed_time": "0:12:53", "remaining_time": "0:06:15", "throughput": 1547.58, "total_tokens": 1197760} +{"current_steps": 1395, "total_steps": 2065, "loss": 0.0, "lr": 1.443667187431572e-06, "epoch": 3.37772397094431, "percentage": 67.55, "elapsed_time": "0:12:54", "remaining_time": "0:06:11", "throughput": 1552.06, "total_tokens": 1201792} +{"current_steps": 1400, "total_steps": 2065, "loss": 0.0, "lr": 1.4245489856144633e-06, "epoch": 3.389830508474576, "percentage": 67.8, "elapsed_time": "0:12:54", "remaining_time": "0:06:07", "throughput": 1556.54, "total_tokens": 1205824} +{"current_steps": 1405, "total_steps": 2065, "loss": 0.0, "lr": 1.4055076501690313e-06, "epoch": 3.401937046004843, "percentage": 68.04, "elapsed_time": "0:12:55", "remaining_time": "0:06:04", "throughput": 1561.49, "total_tokens": 1210240} +{"current_steps": 1410, "total_steps": 2065, "loss": 0.0, "lr": 1.3865445420483524e-06, "epoch": 3.414043583535109, "percentage": 68.28, "elapsed_time": "0:12:55", "remaining_time": "0:06:00", "throughput": 1566.2, "total_tokens": 1214464} +{"current_steps": 1415, "total_steps": 2065, "loss": 0.0005, "lr": 1.367661016614315e-06, "epoch": 3.4261501210653753, "percentage": 68.52, "elapsed_time": "0:12:55", "remaining_time": "0:05:56", "throughput": 1570.98, "total_tokens": 1218752} +{"current_steps": 1420, "total_steps": 2065, "loss": 0.0, "lr": 1.348858423540744e-06, "epoch": 3.4382566585956416, "percentage": 68.77, "elapsed_time": "0:12:56", "remaining_time": "0:05:52", "throughput": 1575.91, "total_tokens": 1223168} +{"current_steps": 1425, "total_steps": 2065, "loss": 0.0, "lr": 1.3301381067169367e-06, "epoch": 3.450363196125908, "percentage": 69.01, "elapsed_time": "0:12:56", "remaining_time": "0:05:48", "throughput": 1580.53, "total_tokens": 1227328} +{"current_steps": 1430, "total_steps": 2065, "loss": 0.0, "lr": 1.3115014041516088e-06, "epoch": 3.4624697336561745, "percentage": 69.25, "elapsed_time": "0:12:56", "remaining_time": "0:05:44", "throughput": 1584.98, "total_tokens": 1231360} +{"current_steps": 1435, "total_steps": 2065, "loss": 0.0, "lr": 1.2929496478772635e-06, "epoch": 3.4745762711864407, "percentage": 69.49, "elapsed_time": "0:12:57", "remaining_time": "0:05:41", "throughput": 1589.51, "total_tokens": 1235456} +{"current_steps": 1440, "total_steps": 2065, "loss": 0.0, "lr": 1.2744841638549843e-06, "epoch": 3.486682808716707, "percentage": 69.73, "elapsed_time": "0:12:57", "remaining_time": "0:05:37", "throughput": 1594.1, "total_tokens": 1239616} +{"current_steps": 1445, "total_steps": 2065, "loss": 0.0, "lr": 1.2561062718796663e-06, "epoch": 3.4987893462469732, "percentage": 69.98, "elapsed_time": "0:12:57", "remaining_time": "0:05:33", "throughput": 1598.94, "total_tokens": 1243968} +{"current_steps": 1450, "total_steps": 2065, "loss": 0.0001, "lr": 1.2378172854856831e-06, "epoch": 3.5108958837772395, "percentage": 70.22, "elapsed_time": "0:12:58", "remaining_time": "0:05:30", "throughput": 1603.53, "total_tokens": 1248128} +{"current_steps": 1455, "total_steps": 2065, "loss": 0.0, "lr": 1.2196185118530063e-06, "epoch": 3.523002421307506, "percentage": 70.46, "elapsed_time": "0:12:58", "remaining_time": "0:05:26", "throughput": 1608.12, "total_tokens": 1252288} +{"current_steps": 1456, "total_steps": 2065, "eval_loss": 0.2580437958240509, "epoch": 3.5254237288135593, "percentage": 70.51, "elapsed_time": "0:12:59", "remaining_time": "0:05:26", "throughput": 1607.92, "total_tokens": 1253248} +{"current_steps": 1460, "total_steps": 2065, "loss": 0.0, "lr": 1.2015112517137744e-06, "epoch": 3.5351089588377724, "percentage": 70.7, "elapsed_time": "0:13:42", "remaining_time": "0:05:40", "throughput": 1528.3, "total_tokens": 1256640} +{"current_steps": 1465, "total_steps": 2065, "loss": 0.0, "lr": 1.183496799259326e-06, "epoch": 3.5472154963680387, "percentage": 70.94, "elapsed_time": "0:13:42", "remaining_time": "0:05:36", "throughput": 1533.43, "total_tokens": 1261440} +{"current_steps": 1470, "total_steps": 2065, "loss": 0.0, "lr": 1.165576442047699e-06, "epoch": 3.559322033898305, "percentage": 71.19, "elapsed_time": "0:13:42", "remaining_time": "0:05:33", "throughput": 1537.88, "total_tokens": 1265664} +{"current_steps": 1475, "total_steps": 2065, "loss": 0.0, "lr": 1.147751460911604e-06, "epoch": 3.571428571428571, "percentage": 71.43, "elapsed_time": "0:13:43", "remaining_time": "0:05:29", "throughput": 1542.47, "total_tokens": 1270016} +{"current_steps": 1480, "total_steps": 2065, "loss": 0.0, "lr": 1.1300231298668786e-06, "epoch": 3.583535108958838, "percentage": 71.67, "elapsed_time": "0:13:43", "remaining_time": "0:05:25", "throughput": 1547.29, "total_tokens": 1274560} +{"current_steps": 1485, "total_steps": 2065, "loss": 0.0, "lr": 1.112392716021429e-06, "epoch": 3.595641646489104, "percentage": 71.91, "elapsed_time": "0:13:44", "remaining_time": "0:05:21", "throughput": 1551.96, "total_tokens": 1278976} +{"current_steps": 1490, "total_steps": 2065, "loss": 0.0, "lr": 1.0948614794846668e-06, "epoch": 3.6077481840193704, "percentage": 72.15, "elapsed_time": "0:13:44", "remaining_time": "0:05:18", "throughput": 1556.39, "total_tokens": 1283200} +{"current_steps": 1495, "total_steps": 2065, "loss": 0.0, "lr": 1.0774306732774414e-06, "epoch": 3.619854721549637, "percentage": 72.4, "elapsed_time": "0:13:44", "remaining_time": "0:05:14", "throughput": 1560.67, "total_tokens": 1287296} +{"current_steps": 1500, "total_steps": 2065, "loss": 0.0, "lr": 1.0601015432424818e-06, "epoch": 3.6319612590799033, "percentage": 72.64, "elapsed_time": "0:13:45", "remaining_time": "0:05:10", "throughput": 1565.32, "total_tokens": 1291712} +{"current_steps": 1505, "total_steps": 2065, "loss": 0.0328, "lr": 1.0428753279553561e-06, "epoch": 3.6440677966101696, "percentage": 72.88, "elapsed_time": "0:13:45", "remaining_time": "0:05:07", "throughput": 1569.73, "total_tokens": 1295936} +{"current_steps": 1510, "total_steps": 2065, "loss": 0.0527, "lr": 1.0257532586359422e-06, "epoch": 3.656174334140436, "percentage": 73.12, "elapsed_time": "0:13:45", "remaining_time": "0:05:03", "throughput": 1574.67, "total_tokens": 1300608} +{"current_steps": 1515, "total_steps": 2065, "loss": 0.0, "lr": 1.008736559060429e-06, "epoch": 3.668280871670702, "percentage": 73.37, "elapsed_time": "0:13:46", "remaining_time": "0:04:59", "throughput": 1579.31, "total_tokens": 1305024} +{"current_steps": 1520, "total_steps": 2065, "loss": 0.0001, "lr": 9.918264454738504e-07, "epoch": 3.6803874092009687, "percentage": 73.61, "elapsed_time": "0:13:46", "remaining_time": "0:04:56", "throughput": 1583.86, "total_tokens": 1309376} +{"current_steps": 1525, "total_steps": 2065, "loss": 0.0001, "lr": 9.750241265031529e-07, "epoch": 3.692493946731235, "percentage": 73.85, "elapsed_time": "0:13:47", "remaining_time": "0:04:52", "throughput": 1588.34, "total_tokens": 1313664} +{"current_steps": 1530, "total_steps": 2065, "loss": 0.0001, "lr": 9.583308030708135e-07, "epoch": 3.7046004842615012, "percentage": 74.09, "elapsed_time": "0:13:47", "remaining_time": "0:04:49", "throughput": 1592.97, "total_tokens": 1318080} +{"current_steps": 1535, "total_steps": 2065, "loss": 0.0001, "lr": 9.417476683090007e-07, "epoch": 3.7167070217917675, "percentage": 74.33, "elapsed_time": "0:13:47", "remaining_time": "0:04:45", "throughput": 1597.51, "total_tokens": 1322432} +{"current_steps": 1540, "total_steps": 2065, "loss": 0.0003, "lr": 9.252759074743034e-07, "epoch": 3.7288135593220337, "percentage": 74.58, "elapsed_time": "0:13:48", "remaining_time": "0:04:42", "throughput": 1602.13, "total_tokens": 1326848} +{"current_steps": 1545, "total_steps": 2065, "loss": 0.0, "lr": 9.08916697863014e-07, "epoch": 3.7409200968523004, "percentage": 74.82, "elapsed_time": "0:13:48", "remaining_time": "0:04:38", "throughput": 1606.82, "total_tokens": 1331328} +{"current_steps": 1550, "total_steps": 2065, "loss": 0.0, "lr": 8.926712087269801e-07, "epoch": 3.7530266343825667, "percentage": 75.06, "elapsed_time": "0:13:48", "remaining_time": "0:04:35", "throughput": 1611.05, "total_tokens": 1335424} +{"current_steps": 1555, "total_steps": 2065, "loss": 0.0, "lr": 8.765406011900368e-07, "epoch": 3.765133171912833, "percentage": 75.3, "elapsed_time": "0:13:49", "remaining_time": "0:04:31", "throughput": 1615.51, "total_tokens": 1339712} +{"current_steps": 1560, "total_steps": 2065, "loss": 0.0, "lr": 8.605260281650152e-07, "epoch": 3.777239709443099, "percentage": 75.54, "elapsed_time": "0:13:49", "remaining_time": "0:04:28", "throughput": 1619.96, "total_tokens": 1344000} +{"current_steps": 1560, "total_steps": 2065, "eval_loss": 0.2703007757663727, "epoch": 3.777239709443099, "percentage": 75.54, "elapsed_time": "0:13:50", "remaining_time": "0:04:28", "throughput": 1618.71, "total_tokens": 1344000} +{"current_steps": 1565, "total_steps": 2065, "loss": 0.0, "lr": 8.44628634271342e-07, "epoch": 3.7893462469733654, "percentage": 75.79, "elapsed_time": "0:14:28", "remaining_time": "0:04:37", "throughput": 1551.82, "total_tokens": 1348224} +{"current_steps": 1570, "total_steps": 2065, "loss": 0.0017, "lr": 8.288495557532241e-07, "epoch": 3.801452784503632, "percentage": 76.03, "elapsed_time": "0:14:29", "remaining_time": "0:04:34", "throughput": 1556.17, "total_tokens": 1352576} +{"current_steps": 1575, "total_steps": 2065, "loss": 0.0616, "lr": 8.131899203984464e-07, "epoch": 3.8135593220338984, "percentage": 76.27, "elapsed_time": "0:14:29", "remaining_time": "0:04:30", "throughput": 1560.44, "total_tokens": 1356864} +{"current_steps": 1580, "total_steps": 2065, "loss": 0.0, "lr": 7.976508474577549e-07, "epoch": 3.8256658595641646, "percentage": 76.51, "elapsed_time": "0:14:29", "remaining_time": "0:04:27", "throughput": 1564.7, "total_tokens": 1361152} +{"current_steps": 1585, "total_steps": 2065, "loss": 0.0, "lr": 7.822334475648655e-07, "epoch": 3.837772397094431, "percentage": 76.76, "elapsed_time": "0:14:30", "remaining_time": "0:04:23", "throughput": 1568.9, "total_tokens": 1365376} +{"current_steps": 1590, "total_steps": 2065, "loss": 0.0, "lr": 7.66938822657081e-07, "epoch": 3.849878934624697, "percentage": 77.0, "elapsed_time": "0:14:30", "remaining_time": "0:04:20", "throughput": 1573.23, "total_tokens": 1369728} +{"current_steps": 1595, "total_steps": 2065, "loss": 0.0, "lr": 7.517680658965328e-07, "epoch": 3.861985472154964, "percentage": 77.24, "elapsed_time": "0:14:31", "remaining_time": "0:04:16", "throughput": 1577.63, "total_tokens": 1374144} +{"current_steps": 1600, "total_steps": 2065, "loss": 0.0, "lr": 7.367222615920477e-07, "epoch": 3.87409200968523, "percentage": 77.48, "elapsed_time": "0:14:31", "remaining_time": "0:04:13", "throughput": 1581.81, "total_tokens": 1378368} +{"current_steps": 1605, "total_steps": 2065, "loss": 0.0, "lr": 7.21802485121649e-07, "epoch": 3.8861985472154963, "percentage": 77.72, "elapsed_time": "0:14:31", "remaining_time": "0:04:09", "throughput": 1585.85, "total_tokens": 1382464} +{"current_steps": 1610, "total_steps": 2065, "loss": 0.0, "lr": 7.070098028556949e-07, "epoch": 3.898305084745763, "percentage": 77.97, "elapsed_time": "0:14:32", "remaining_time": "0:04:06", "throughput": 1590.24, "total_tokens": 1386880} +{"current_steps": 1615, "total_steps": 2065, "loss": 0.0, "lr": 6.923452720806612e-07, "epoch": 3.910411622276029, "percentage": 78.21, "elapsed_time": "0:14:32", "remaining_time": "0:04:03", "throughput": 1594.63, "total_tokens": 1391296} +{"current_steps": 1620, "total_steps": 2065, "loss": 0.0, "lr": 6.778099409235739e-07, "epoch": 3.9225181598062955, "percentage": 78.45, "elapsed_time": "0:14:32", "remaining_time": "0:03:59", "throughput": 1598.72, "total_tokens": 1395456} +{"current_steps": 1625, "total_steps": 2065, "loss": 0.0, "lr": 6.634048482770946e-07, "epoch": 3.9346246973365617, "percentage": 78.69, "elapsed_time": "0:14:33", "remaining_time": "0:03:56", "throughput": 1602.82, "total_tokens": 1399616} +{"current_steps": 1630, "total_steps": 2065, "loss": 0.0, "lr": 6.491310237252679e-07, "epoch": 3.946731234866828, "percentage": 78.93, "elapsed_time": "0:14:33", "remaining_time": "0:03:53", "throughput": 1606.83, "total_tokens": 1403712} +{"current_steps": 1635, "total_steps": 2065, "loss": 0.0, "lr": 6.349894874699345e-07, "epoch": 3.9588377723970947, "percentage": 79.18, "elapsed_time": "0:14:33", "remaining_time": "0:03:49", "throughput": 1611.21, "total_tokens": 1408128} +{"current_steps": 1640, "total_steps": 2065, "loss": 0.0, "lr": 6.209812502578113e-07, "epoch": 3.970944309927361, "percentage": 79.42, "elapsed_time": "0:14:34", "remaining_time": "0:03:46", "throughput": 1615.5, "total_tokens": 1412480} +{"current_steps": 1645, "total_steps": 2065, "loss": 0.0, "lr": 6.071073133082492e-07, "epoch": 3.983050847457627, "percentage": 79.66, "elapsed_time": "0:14:34", "remaining_time": "0:03:43", "throughput": 1619.65, "total_tokens": 1416704} +{"current_steps": 1650, "total_steps": 2065, "loss": 0.0, "lr": 5.933686682416759e-07, "epoch": 3.9951573849878934, "percentage": 79.9, "elapsed_time": "0:14:35", "remaining_time": "0:03:40", "throughput": 1624.02, "total_tokens": 1421120} +{"current_steps": 1655, "total_steps": 2065, "loss": 0.0, "lr": 5.797662970087184e-07, "epoch": 4.00726392251816, "percentage": 80.15, "elapsed_time": "0:14:35", "remaining_time": "0:03:36", "throughput": 1627.44, "total_tokens": 1424944} +{"current_steps": 1660, "total_steps": 2065, "loss": 0.0, "lr": 5.663011718200201e-07, "epoch": 4.019370460048426, "percentage": 80.39, "elapsed_time": "0:14:35", "remaining_time": "0:03:33", "throughput": 1631.72, "total_tokens": 1429296} +{"current_steps": 1664, "total_steps": 2065, "eval_loss": 0.2501881718635559, "epoch": 4.0290556900726395, "percentage": 80.58, "elapsed_time": "0:14:36", "remaining_time": "0:03:31", "throughput": 1634.11, "total_tokens": 1432880} +{"current_steps": 1665, "total_steps": 2065, "loss": 0.0, "lr": 5.529742550767545e-07, "epoch": 4.031476997578692, "percentage": 80.63, "elapsed_time": "0:15:11", "remaining_time": "0:03:38", "throughput": 1572.94, "total_tokens": 1433776} +{"current_steps": 1670, "total_steps": 2065, "loss": 0.0, "lr": 5.397864993018367e-07, "epoch": 4.043583535108959, "percentage": 80.87, "elapsed_time": "0:15:11", "remaining_time": "0:03:35", "throughput": 1576.93, "total_tokens": 1438000} +{"current_steps": 1675, "total_steps": 2065, "loss": 0.0, "lr": 5.267388470718449e-07, "epoch": 4.0556900726392255, "percentage": 81.11, "elapsed_time": "0:15:12", "remaining_time": "0:03:32", "throughput": 1581.05, "total_tokens": 1442352} +{"current_steps": 1680, "total_steps": 2065, "loss": 0.0, "lr": 5.138322309496504e-07, "epoch": 4.067796610169491, "percentage": 81.36, "elapsed_time": "0:15:12", "remaining_time": "0:03:29", "throughput": 1585.18, "total_tokens": 1446704} +{"current_steps": 1685, "total_steps": 2065, "loss": 0.0, "lr": 5.010675734177631e-07, "epoch": 4.079903147699758, "percentage": 81.6, "elapsed_time": "0:15:13", "remaining_time": "0:03:25", "throughput": 1589.1, "total_tokens": 1450864} +{"current_steps": 1690, "total_steps": 2065, "loss": 0.0, "lr": 4.884457868124001e-07, "epoch": 4.092009685230024, "percentage": 81.84, "elapsed_time": "0:15:13", "remaining_time": "0:03:22", "throughput": 1593.08, "total_tokens": 1455088} +{"current_steps": 1695, "total_steps": 2065, "loss": 0.0051, "lr": 4.759677732582782e-07, "epoch": 4.1041162227602905, "percentage": 82.08, "elapsed_time": "0:15:13", "remaining_time": "0:03:19", "throughput": 1597.13, "total_tokens": 1459376} +{"current_steps": 1700, "total_steps": 2065, "loss": 0.0, "lr": 4.6363442460413215e-07, "epoch": 4.116222760290557, "percentage": 82.32, "elapsed_time": "0:15:14", "remaining_time": "0:03:16", "throughput": 1601.11, "total_tokens": 1463600} +{"current_steps": 1705, "total_steps": 2065, "loss": 0.0, "lr": 4.514466223589753e-07, "epoch": 4.128329297820823, "percentage": 82.57, "elapsed_time": "0:15:14", "remaining_time": "0:03:13", "throughput": 1605.35, "total_tokens": 1468080} +{"current_steps": 1710, "total_steps": 2065, "loss": 0.0253, "lr": 4.394052376290914e-07, "epoch": 4.14043583535109, "percentage": 82.81, "elapsed_time": "0:15:14", "remaining_time": "0:03:09", "throughput": 1609.67, "total_tokens": 1472624} +{"current_steps": 1715, "total_steps": 2065, "loss": 0.0, "lr": 4.2751113105577587e-07, "epoch": 4.1525423728813555, "percentage": 83.05, "elapsed_time": "0:15:15", "remaining_time": "0:03:06", "throughput": 1613.84, "total_tokens": 1477040} +{"current_steps": 1720, "total_steps": 2065, "loss": 0.0, "lr": 4.157651527538223e-07, "epoch": 4.164648910411622, "percentage": 83.29, "elapsed_time": "0:15:15", "remaining_time": "0:03:03", "throughput": 1617.87, "total_tokens": 1481328} +{"current_steps": 1725, "total_steps": 2065, "loss": 0.0, "lr": 4.041681422507604e-07, "epoch": 4.176755447941889, "percentage": 83.54, "elapsed_time": "0:15:15", "remaining_time": "0:03:00", "throughput": 1622.1, "total_tokens": 1485808} +{"current_steps": 1730, "total_steps": 2065, "loss": 0.0, "lr": 3.927209284268535e-07, "epoch": 4.188861985472155, "percentage": 83.78, "elapsed_time": "0:15:16", "remaining_time": "0:02:57", "throughput": 1626.19, "total_tokens": 1490160} +{"current_steps": 1735, "total_steps": 2065, "loss": 0.0, "lr": 3.8142432945585425e-07, "epoch": 4.200968523002421, "percentage": 84.02, "elapsed_time": "0:15:16", "remaining_time": "0:02:54", "throughput": 1630.28, "total_tokens": 1494512} +{"current_steps": 1740, "total_steps": 2065, "loss": 0.0, "lr": 3.702791527465274e-07, "epoch": 4.213075060532688, "percentage": 84.26, "elapsed_time": "0:15:17", "remaining_time": "0:02:51", "throughput": 1633.96, "total_tokens": 1498480} +{"current_steps": 1745, "total_steps": 2065, "loss": 0.0, "lr": 3.592861948849416e-07, "epoch": 4.225181598062954, "percentage": 84.5, "elapsed_time": "0:15:17", "remaining_time": "0:02:48", "throughput": 1637.97, "total_tokens": 1502768} +{"current_steps": 1750, "total_steps": 2065, "loss": 0.0, "lr": 3.484462415775333e-07, "epoch": 4.237288135593221, "percentage": 84.75, "elapsed_time": "0:15:17", "remaining_time": "0:02:45", "throughput": 1641.91, "total_tokens": 1506992} +{"current_steps": 1755, "total_steps": 2065, "loss": 0.0, "lr": 3.377600675949527e-07, "epoch": 4.249394673123486, "percentage": 84.99, "elapsed_time": "0:15:18", "remaining_time": "0:02:42", "throughput": 1646.12, "total_tokens": 1511472} +{"current_steps": 1760, "total_steps": 2065, "loss": 0.0, "lr": 3.272284367166825e-07, "epoch": 4.261501210653753, "percentage": 85.23, "elapsed_time": "0:15:18", "remaining_time": "0:02:39", "throughput": 1650.19, "total_tokens": 1515824} +{"current_steps": 1765, "total_steps": 2065, "loss": 0.0001, "lr": 3.1685210167645336e-07, "epoch": 4.27360774818402, "percentage": 85.47, "elapsed_time": "0:15:18", "remaining_time": "0:02:36", "throughput": 1654.26, "total_tokens": 1520176} +{"current_steps": 1768, "total_steps": 2065, "eval_loss": 0.25040701031684875, "epoch": 4.280871670702179, "percentage": 85.62, "elapsed_time": "0:15:21", "remaining_time": "0:02:34", "throughput": 1652.2, "total_tokens": 1522544} +{"current_steps": 1770, "total_steps": 2065, "loss": 0.0018, "lr": 3.066318041084398e-07, "epoch": 4.285714285714286, "percentage": 85.71, "elapsed_time": "0:16:08", "remaining_time": "0:02:41", "throughput": 1573.81, "total_tokens": 1524336} +{"current_steps": 1775, "total_steps": 2065, "loss": 0.0, "lr": 2.9656827449425495e-07, "epoch": 4.297820823244552, "percentage": 85.96, "elapsed_time": "0:16:08", "remaining_time": "0:02:38", "throughput": 1577.57, "total_tokens": 1528560} +{"current_steps": 1780, "total_steps": 2065, "loss": 0.026, "lr": 2.86662232110739e-07, "epoch": 4.309927360774818, "percentage": 86.2, "elapsed_time": "0:16:09", "remaining_time": "0:02:35", "throughput": 1581.27, "total_tokens": 1532720} +{"current_steps": 1785, "total_steps": 2065, "loss": 0.0, "lr": 2.769143849785513e-07, "epoch": 4.322033898305085, "percentage": 86.44, "elapsed_time": "0:16:09", "remaining_time": "0:02:32", "throughput": 1585.02, "total_tokens": 1536944} +{"current_steps": 1790, "total_steps": 2065, "loss": 0.0, "lr": 2.673254298115646e-07, "epoch": 4.3341404358353515, "percentage": 86.68, "elapsed_time": "0:16:10", "remaining_time": "0:02:29", "throughput": 1588.78, "total_tokens": 1541168} +{"current_steps": 1795, "total_steps": 2065, "loss": 0.0, "lr": 2.5789605196706675e-07, "epoch": 4.346246973365617, "percentage": 86.92, "elapsed_time": "0:16:10", "remaining_time": "0:02:25", "throughput": 1592.59, "total_tokens": 1545456} +{"current_steps": 1800, "total_steps": 2065, "loss": 0.0, "lr": 2.4862692539677907e-07, "epoch": 4.358353510895884, "percentage": 87.17, "elapsed_time": "0:16:10", "remaining_time": "0:02:22", "throughput": 1596.54, "total_tokens": 1549872} +{"current_steps": 1805, "total_steps": 2065, "loss": 0.0, "lr": 2.39518712598685e-07, "epoch": 4.37046004842615, "percentage": 87.41, "elapsed_time": "0:16:11", "remaining_time": "0:02:19", "throughput": 1600.47, "total_tokens": 1554288} +{"current_steps": 1810, "total_steps": 2065, "loss": 0.0, "lr": 2.3057206456967908e-07, "epoch": 4.3825665859564165, "percentage": 87.65, "elapsed_time": "0:16:11", "remaining_time": "0:02:16", "throughput": 1604.08, "total_tokens": 1558384} +{"current_steps": 1815, "total_steps": 2065, "loss": 0.0, "lr": 2.2178762075903747e-07, "epoch": 4.394673123486683, "percentage": 87.89, "elapsed_time": "0:16:11", "remaining_time": "0:02:13", "throughput": 1607.76, "total_tokens": 1562544} +{"current_steps": 1820, "total_steps": 2065, "loss": 0.0, "lr": 2.131660090227139e-07, "epoch": 4.406779661016949, "percentage": 88.14, "elapsed_time": "0:16:12", "remaining_time": "0:02:10", "throughput": 1611.94, "total_tokens": 1567216} +{"current_steps": 1825, "total_steps": 2065, "loss": 0.0, "lr": 2.0470784557846652e-07, "epoch": 4.418886198547216, "percentage": 88.38, "elapsed_time": "0:16:12", "remaining_time": "0:02:07", "throughput": 1615.8, "total_tokens": 1571568} +{"current_steps": 1830, "total_steps": 2065, "loss": 0.0, "lr": 1.9641373496181143e-07, "epoch": 4.4309927360774815, "percentage": 88.62, "elapsed_time": "0:16:12", "remaining_time": "0:02:04", "throughput": 1619.52, "total_tokens": 1575792} +{"current_steps": 1835, "total_steps": 2065, "loss": 0.0, "lr": 1.882842699828169e-07, "epoch": 4.443099273607748, "percentage": 88.86, "elapsed_time": "0:16:13", "remaining_time": "0:02:02", "throughput": 1623.31, "total_tokens": 1580080} +{"current_steps": 1840, "total_steps": 2065, "loss": 0.0, "lr": 1.8032003168373306e-07, "epoch": 4.455205811138015, "percentage": 89.1, "elapsed_time": "0:16:13", "remaining_time": "0:01:59", "throughput": 1626.84, "total_tokens": 1584112} +{"current_steps": 1845, "total_steps": 2065, "loss": 0.0, "lr": 1.7252158929746133e-07, "epoch": 4.467312348668281, "percentage": 89.35, "elapsed_time": "0:16:14", "remaining_time": "0:01:56", "throughput": 1630.63, "total_tokens": 1588400} +{"current_steps": 1850, "total_steps": 2065, "loss": 0.0, "lr": 1.6488950020686956e-07, "epoch": 4.479418886198547, "percentage": 89.59, "elapsed_time": "0:16:14", "remaining_time": "0:01:53", "throughput": 1634.54, "total_tokens": 1592816} +{"current_steps": 1855, "total_steps": 2065, "loss": 0.0, "lr": 1.5742430990495465e-07, "epoch": 4.491525423728813, "percentage": 89.83, "elapsed_time": "0:16:14", "remaining_time": "0:01:50", "throughput": 1638.51, "total_tokens": 1597296} +{"current_steps": 1860, "total_steps": 2065, "loss": 0.0184, "lr": 1.501265519558537e-07, "epoch": 4.50363196125908, "percentage": 90.07, "elapsed_time": "0:16:15", "remaining_time": "0:01:47", "throughput": 1642.35, "total_tokens": 1601648} +{"current_steps": 1865, "total_steps": 2065, "loss": 0.0, "lr": 1.4299674795670765e-07, "epoch": 4.5157384987893465, "percentage": 90.31, "elapsed_time": "0:16:15", "remaining_time": "0:01:44", "throughput": 1646.12, "total_tokens": 1605936} +{"current_steps": 1870, "total_steps": 2065, "loss": 0.0, "lr": 1.360354075003828e-07, "epoch": 4.527845036319612, "percentage": 90.56, "elapsed_time": "0:16:15", "remaining_time": "0:01:41", "throughput": 1649.77, "total_tokens": 1610096} +{"current_steps": 1872, "total_steps": 2065, "eval_loss": 0.2488991767168045, "epoch": 4.532687651331719, "percentage": 90.65, "elapsed_time": "0:16:16", "remaining_time": "0:01:40", "throughput": 1650.2, "total_tokens": 1611760} +{"current_steps": 1875, "total_steps": 2065, "loss": 0.0, "lr": 1.2924302813904582e-07, "epoch": 4.539951573849879, "percentage": 90.8, "elapsed_time": "0:17:29", "remaining_time": "0:01:46", "throughput": 1538.37, "total_tokens": 1614384} +{"current_steps": 1880, "total_steps": 2065, "loss": 0.0, "lr": 1.2262009534860368e-07, "epoch": 4.552058111380145, "percentage": 91.04, "elapsed_time": "0:17:29", "remaining_time": "0:01:43", "throughput": 1542.04, "total_tokens": 1618800} +{"current_steps": 1885, "total_steps": 2065, "loss": 0.0, "lr": 1.161670824940045e-07, "epoch": 4.5641646489104115, "percentage": 91.28, "elapsed_time": "0:17:30", "remaining_time": "0:01:40", "throughput": 1545.46, "total_tokens": 1622960} +{"current_steps": 1890, "total_steps": 2065, "loss": 0.0, "lr": 1.0988445079540389e-07, "epoch": 4.576271186440678, "percentage": 91.53, "elapsed_time": "0:17:30", "remaining_time": "0:01:37", "throughput": 1548.83, "total_tokens": 1627056} +{"current_steps": 1895, "total_steps": 2065, "loss": 0.0002, "lr": 1.0377264929520126e-07, "epoch": 4.588377723970944, "percentage": 91.77, "elapsed_time": "0:17:30", "remaining_time": "0:01:34", "throughput": 1552.42, "total_tokens": 1631408} +{"current_steps": 1900, "total_steps": 2065, "loss": 0.0, "lr": 9.783211482594285e-08, "epoch": 4.600484261501211, "percentage": 92.01, "elapsed_time": "0:17:31", "remaining_time": "0:01:31", "throughput": 1556.13, "total_tokens": 1635888} +{"current_steps": 1905, "total_steps": 2065, "loss": 0.0, "lr": 9.206327197910203e-08, "epoch": 4.6125907990314765, "percentage": 92.25, "elapsed_time": "0:17:31", "remaining_time": "0:01:28", "throughput": 1559.67, "total_tokens": 1640176} +{"current_steps": 1910, "total_steps": 2065, "loss": 0.0, "lr": 8.64665330747308e-08, "epoch": 4.624697336561743, "percentage": 92.49, "elapsed_time": "0:17:31", "remaining_time": "0:01:25", "throughput": 1563.26, "total_tokens": 1644528} +{"current_steps": 1915, "total_steps": 2065, "loss": 0.0, "lr": 8.104229813199111e-08, "epoch": 4.63680387409201, "percentage": 92.74, "elapsed_time": "0:17:32", "remaining_time": "0:01:22", "throughput": 1567.2, "total_tokens": 1649264} +{"current_steps": 1920, "total_steps": 2065, "loss": 0.0, "lr": 7.579095484056193e-08, "epoch": 4.648910411622276, "percentage": 92.98, "elapsed_time": "0:17:32", "remaining_time": "0:01:19", "throughput": 1570.96, "total_tokens": 1653808} +{"current_steps": 1925, "total_steps": 2065, "loss": 0.0, "lr": 7.071287853293141e-08, "epoch": 4.661016949152542, "percentage": 93.22, "elapsed_time": "0:17:33", "remaining_time": "0:01:16", "throughput": 1574.66, "total_tokens": 1658288} +{"current_steps": 1930, "total_steps": 2065, "loss": 0.0, "lr": 6.580843215757082e-08, "epoch": 4.673123486682809, "percentage": 93.46, "elapsed_time": "0:17:33", "remaining_time": "0:01:13", "throughput": 1578.18, "total_tokens": 1662576} +{"current_steps": 1935, "total_steps": 2065, "loss": 0.0, "lr": 6.107796625299117e-08, "epoch": 4.685230024213075, "percentage": 93.7, "elapsed_time": "0:17:33", "remaining_time": "0:01:10", "throughput": 1581.87, "total_tokens": 1667056} +{"current_steps": 1940, "total_steps": 2065, "loss": 0.0, "lr": 5.652181892269182e-08, "epoch": 4.697336561743342, "percentage": 93.95, "elapsed_time": "0:17:34", "remaining_time": "0:01:07", "throughput": 1585.56, "total_tokens": 1671536} +{"current_steps": 1945, "total_steps": 2065, "loss": 0.0, "lr": 5.214031581099149e-08, "epoch": 4.709443099273607, "percentage": 94.19, "elapsed_time": "0:17:34", "remaining_time": "0:01:05", "throughput": 1589.13, "total_tokens": 1675888} +{"current_steps": 1950, "total_steps": 2065, "loss": 0.0, "lr": 4.793377007975719e-08, "epoch": 4.721549636803874, "percentage": 94.43, "elapsed_time": "0:17:34", "remaining_time": "0:01:02", "throughput": 1592.64, "total_tokens": 1680176} +{"current_steps": 1955, "total_steps": 2065, "loss": 0.0, "lr": 4.3902482386018186e-08, "epoch": 4.733656174334141, "percentage": 94.67, "elapsed_time": "0:17:35", "remaining_time": "0:00:59", "throughput": 1596.08, "total_tokens": 1684400} +{"current_steps": 1960, "total_steps": 2065, "loss": 0.0357, "lr": 4.004674086047905e-08, "epoch": 4.745762711864407, "percentage": 94.92, "elapsed_time": "0:17:35", "remaining_time": "0:00:56", "throughput": 1599.7, "total_tokens": 1688816} +{"current_steps": 1965, "total_steps": 2065, "loss": 0.0, "lr": 3.636682108692502e-08, "epoch": 4.757869249394673, "percentage": 95.16, "elapsed_time": "0:17:36", "remaining_time": "0:00:53", "throughput": 1603.44, "total_tokens": 1693360} +{"current_steps": 1970, "total_steps": 2065, "loss": 0.0, "lr": 3.286298608252442e-08, "epoch": 4.76997578692494, "percentage": 95.4, "elapsed_time": "0:17:36", "remaining_time": "0:00:50", "throughput": 1606.87, "total_tokens": 1697584} +{"current_steps": 1975, "total_steps": 2065, "loss": 0.0, "lr": 2.953548627903202e-08, "epoch": 4.782082324455206, "percentage": 95.64, "elapsed_time": "0:17:36", "remaining_time": "0:00:48", "throughput": 1610.49, "total_tokens": 1702000} +{"current_steps": 1976, "total_steps": 2065, "eval_loss": 0.2507624924182892, "epoch": 4.784503631961259, "percentage": 95.69, "elapsed_time": "0:17:37", "remaining_time": "0:00:47", "throughput": 1610.21, "total_tokens": 1702832} +{"current_steps": 1980, "total_steps": 2065, "loss": 0.0, "lr": 2.6384559504886164e-08, "epoch": 4.7941888619854724, "percentage": 95.88, "elapsed_time": "0:18:14", "remaining_time": "0:00:46", "throughput": 1559.14, "total_tokens": 1706416} +{"current_steps": 1985, "total_steps": 2065, "loss": 0.0, "lr": 2.3410430968214825e-08, "epoch": 4.806295399515738, "percentage": 96.13, "elapsed_time": "0:18:14", "remaining_time": "0:00:44", "throughput": 1562.76, "total_tokens": 1710960} +{"current_steps": 1990, "total_steps": 2065, "loss": 0.0, "lr": 2.0613313240735457e-08, "epoch": 4.818401937046005, "percentage": 96.37, "elapsed_time": "0:18:15", "remaining_time": "0:00:41", "throughput": 1566.32, "total_tokens": 1715440} +{"current_steps": 1995, "total_steps": 2065, "loss": 0.0, "lr": 1.7993406242563238e-08, "epoch": 4.830508474576272, "percentage": 96.61, "elapsed_time": "0:18:15", "remaining_time": "0:00:38", "throughput": 1569.7, "total_tokens": 1719728} +{"current_steps": 2000, "total_steps": 2065, "loss": 0.0, "lr": 1.5550897227922522e-08, "epoch": 4.842615012106537, "percentage": 96.85, "elapsed_time": "0:18:15", "remaining_time": "0:00:35", "throughput": 1573.32, "total_tokens": 1724272} +{"current_steps": 2005, "total_steps": 2065, "loss": 0.0, "lr": 1.3285960771761696e-08, "epoch": 4.854721549636804, "percentage": 97.09, "elapsed_time": "0:18:16", "remaining_time": "0:00:32", "throughput": 1576.7, "total_tokens": 1728560} +{"current_steps": 2010, "total_steps": 2065, "loss": 0.0, "lr": 1.119875875727705e-08, "epoch": 4.86682808716707, "percentage": 97.34, "elapsed_time": "0:18:16", "remaining_time": "0:00:30", "throughput": 1580.31, "total_tokens": 1733104} +{"current_steps": 2015, "total_steps": 2065, "loss": 0.0, "lr": 9.289440364341484e-09, "epoch": 4.878934624697337, "percentage": 97.58, "elapsed_time": "0:18:17", "remaining_time": "0:00:27", "throughput": 1583.57, "total_tokens": 1737264} +{"current_steps": 2020, "total_steps": 2065, "loss": 0.0, "lr": 7.558142058842755e-09, "epoch": 4.891041162227603, "percentage": 97.82, "elapsed_time": "0:18:17", "remaining_time": "0:00:24", "throughput": 1586.83, "total_tokens": 1741424} +{"current_steps": 2025, "total_steps": 2065, "loss": 0.0, "lr": 6.004987582929056e-09, "epoch": 4.903147699757869, "percentage": 98.06, "elapsed_time": "0:18:17", "remaining_time": "0:00:21", "throughput": 1590.14, "total_tokens": 1745648} +{"current_steps": 2030, "total_steps": 2065, "loss": 0.0, "lr": 4.6300879461655404e-09, "epoch": 4.915254237288136, "percentage": 98.31, "elapsed_time": "0:18:18", "remaining_time": "0:00:18", "throughput": 1593.45, "total_tokens": 1749872} +{"current_steps": 2035, "total_steps": 2065, "loss": 0.0, "lr": 3.4335414175995506e-09, "epoch": 4.927360774818402, "percentage": 98.55, "elapsed_time": "0:18:18", "remaining_time": "0:00:16", "throughput": 1596.94, "total_tokens": 1754288} +{"current_steps": 2040, "total_steps": 2065, "loss": 0.0, "lr": 2.4154335187365207e-09, "epoch": 4.939467312348668, "percentage": 98.79, "elapsed_time": "0:18:18", "remaining_time": "0:00:13", "throughput": 1600.36, "total_tokens": 1758640} +{"current_steps": 2045, "total_steps": 2065, "loss": 0.0003, "lr": 1.575837017428472e-09, "epoch": 4.951573849878935, "percentage": 99.03, "elapsed_time": "0:18:19", "remaining_time": "0:00:10", "throughput": 1603.72, "total_tokens": 1762928} +{"current_steps": 2050, "total_steps": 2065, "loss": 0.0, "lr": 9.14811922672898e-10, "epoch": 4.963680387409201, "percentage": 99.27, "elapsed_time": "0:18:19", "remaining_time": "0:00:08", "throughput": 1607.19, "total_tokens": 1767344} +{"current_steps": 2055, "total_steps": 2065, "loss": 0.0, "lr": 4.3240548032230657e-10, "epoch": 4.9757869249394675, "percentage": 99.52, "elapsed_time": "0:18:20", "remaining_time": "0:00:05", "throughput": 1610.55, "total_tokens": 1771632} +{"current_steps": 2060, "total_steps": 2065, "loss": 0.0, "lr": 1.2865216970914253e-10, "epoch": 4.987893462469733, "percentage": 99.76, "elapsed_time": "0:18:20", "remaining_time": "0:00:02", "throughput": 1613.74, "total_tokens": 1775728} +{"current_steps": 2065, "total_steps": 2065, "loss": 0.0, "lr": 3.573701180537015e-12, "epoch": 5.0, "percentage": 100.0, "elapsed_time": "0:18:20", "remaining_time": "0:00:00", "throughput": 1617.05, "total_tokens": 1780000} +{"current_steps": 2065, "total_steps": 2065, "epoch": 5.0, "percentage": 100.0, "elapsed_time": "0:18:59", "remaining_time": "0:00:00", "throughput": 1561.49, "total_tokens": 1780000} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..43e33c1 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,3519 @@ +{ + "best_global_step": 416, + "best_metric": 0.10842076689004898, + "best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_mrpc_42_1776331557/checkpoint-416", + "epoch": 5.0, + "eval_steps": 104, + "global_step": 2065, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012106537530266344, + "grad_norm": 320.0541076660156, + "learning_rate": 9.661835748792271e-08, + "loss": 0.81, + "num_input_tokens_seen": 4352, + "step": 5 + }, + { + "epoch": 0.024213075060532687, + "grad_norm": 293.6126403808594, + "learning_rate": 2.173913043478261e-07, + "loss": 0.8, + "num_input_tokens_seen": 8768, + "step": 10 + }, + { + "epoch": 0.03631961259079903, + "grad_norm": 176.21604919433594, + "learning_rate": 3.3816425120772945e-07, + "loss": 0.6335, + "num_input_tokens_seen": 12992, + "step": 15 + }, + { + "epoch": 0.048426150121065374, + "grad_norm": 67.18666076660156, + "learning_rate": 4.5893719806763294e-07, + "loss": 0.3717, + "num_input_tokens_seen": 17344, + "step": 20 + }, + { + "epoch": 0.06053268765133172, + "grad_norm": 41.770694732666016, + "learning_rate": 5.797101449275363e-07, + "loss": 0.2438, + "num_input_tokens_seen": 21696, + "step": 25 + }, + { + "epoch": 0.07263922518159806, + "grad_norm": 54.93859100341797, + "learning_rate": 7.004830917874397e-07, + "loss": 0.2523, + "num_input_tokens_seen": 26112, + "step": 30 + }, + { + "epoch": 0.0847457627118644, + "grad_norm": 14.744933128356934, + "learning_rate": 8.212560386473431e-07, + "loss": 0.2223, + "num_input_tokens_seen": 30208, + "step": 35 + }, + { + "epoch": 0.09685230024213075, + "grad_norm": 6.022367477416992, + "learning_rate": 9.420289855072465e-07, + "loss": 0.2184, + "num_input_tokens_seen": 34688, + "step": 40 + }, + { + "epoch": 0.1089588377723971, + "grad_norm": 39.68272399902344, + "learning_rate": 1.0628019323671499e-06, + "loss": 0.2163, + "num_input_tokens_seen": 38784, + "step": 45 + }, + { + "epoch": 0.12106537530266344, + "grad_norm": 9.175082206726074, + "learning_rate": 1.1835748792270531e-06, + "loss": 0.2198, + "num_input_tokens_seen": 43200, + "step": 50 + }, + { + "epoch": 0.13317191283292978, + "grad_norm": 42.212303161621094, + "learning_rate": 1.3043478260869566e-06, + "loss": 0.2224, + "num_input_tokens_seen": 47296, + "step": 55 + }, + { + "epoch": 0.14527845036319612, + "grad_norm": 5.82745885848999, + "learning_rate": 1.42512077294686e-06, + "loss": 0.2272, + "num_input_tokens_seen": 51712, + "step": 60 + }, + { + "epoch": 0.15738498789346247, + "grad_norm": 18.959779739379883, + "learning_rate": 1.5458937198067634e-06, + "loss": 0.1665, + "num_input_tokens_seen": 55872, + "step": 65 + }, + { + "epoch": 0.1694915254237288, + "grad_norm": 54.883968353271484, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.1835, + "num_input_tokens_seen": 59840, + "step": 70 + }, + { + "epoch": 0.18159806295399517, + "grad_norm": 87.76690673828125, + "learning_rate": 1.7874396135265702e-06, + "loss": 0.1898, + "num_input_tokens_seen": 64000, + "step": 75 + }, + { + "epoch": 0.1937046004842615, + "grad_norm": 22.004396438598633, + "learning_rate": 1.9082125603864736e-06, + "loss": 0.2149, + "num_input_tokens_seen": 68352, + "step": 80 + }, + { + "epoch": 0.20581113801452786, + "grad_norm": 13.515002250671387, + "learning_rate": 2.028985507246377e-06, + "loss": 0.1519, + "num_input_tokens_seen": 72768, + "step": 85 + }, + { + "epoch": 0.2179176755447942, + "grad_norm": 22.01325798034668, + "learning_rate": 2.1497584541062806e-06, + "loss": 0.1468, + "num_input_tokens_seen": 77120, + "step": 90 + }, + { + "epoch": 0.23002421307506055, + "grad_norm": 24.60677146911621, + "learning_rate": 2.270531400966184e-06, + "loss": 0.2277, + "num_input_tokens_seen": 81664, + "step": 95 + }, + { + "epoch": 0.24213075060532688, + "grad_norm": 19.255220413208008, + "learning_rate": 2.391304347826087e-06, + "loss": 0.1552, + "num_input_tokens_seen": 86080, + "step": 100 + }, + { + "epoch": 0.25181598062953997, + "eval_loss": 0.1484687179327011, + "eval_runtime": 0.6174, + "eval_samples_per_second": 594.395, + "eval_steps_per_second": 74.502, + "num_input_tokens_seen": 89600, + "step": 104 + }, + { + "epoch": 0.2542372881355932, + "grad_norm": 51.306358337402344, + "learning_rate": 2.5120772946859904e-06, + "loss": 0.1673, + "num_input_tokens_seen": 90432, + "step": 105 + }, + { + "epoch": 0.26634382566585957, + "grad_norm": 20.150880813598633, + "learning_rate": 2.632850241545894e-06, + "loss": 0.1694, + "num_input_tokens_seen": 94528, + "step": 110 + }, + { + "epoch": 0.2784503631961259, + "grad_norm": 16.875028610229492, + "learning_rate": 2.7536231884057974e-06, + "loss": 0.1627, + "num_input_tokens_seen": 98816, + "step": 115 + }, + { + "epoch": 0.29055690072639223, + "grad_norm": 17.16132164001465, + "learning_rate": 2.8743961352657007e-06, + "loss": 0.2205, + "num_input_tokens_seen": 103104, + "step": 120 + }, + { + "epoch": 0.3026634382566586, + "grad_norm": 19.57474708557129, + "learning_rate": 2.995169082125604e-06, + "loss": 0.1841, + "num_input_tokens_seen": 107328, + "step": 125 + }, + { + "epoch": 0.31476997578692495, + "grad_norm": 17.67765998840332, + "learning_rate": 3.1159420289855073e-06, + "loss": 0.1779, + "num_input_tokens_seen": 111488, + "step": 130 + }, + { + "epoch": 0.3268765133171913, + "grad_norm": 21.58500099182129, + "learning_rate": 3.236714975845411e-06, + "loss": 0.158, + "num_input_tokens_seen": 115968, + "step": 135 + }, + { + "epoch": 0.3389830508474576, + "grad_norm": 41.02679443359375, + "learning_rate": 3.3574879227053142e-06, + "loss": 0.2089, + "num_input_tokens_seen": 120192, + "step": 140 + }, + { + "epoch": 0.35108958837772397, + "grad_norm": 13.272953987121582, + "learning_rate": 3.4782608695652175e-06, + "loss": 0.1007, + "num_input_tokens_seen": 124416, + "step": 145 + }, + { + "epoch": 0.36319612590799033, + "grad_norm": 58.32763671875, + "learning_rate": 3.5990338164251208e-06, + "loss": 0.2718, + "num_input_tokens_seen": 128832, + "step": 150 + }, + { + "epoch": 0.37530266343825663, + "grad_norm": 46.48350524902344, + "learning_rate": 3.7198067632850245e-06, + "loss": 0.3704, + "num_input_tokens_seen": 132992, + "step": 155 + }, + { + "epoch": 0.387409200968523, + "grad_norm": 13.923331260681152, + "learning_rate": 3.840579710144928e-06, + "loss": 0.1945, + "num_input_tokens_seen": 137280, + "step": 160 + }, + { + "epoch": 0.39951573849878935, + "grad_norm": 76.48035430908203, + "learning_rate": 3.961352657004831e-06, + "loss": 0.253, + "num_input_tokens_seen": 141568, + "step": 165 + }, + { + "epoch": 0.4116222760290557, + "grad_norm": 8.948328971862793, + "learning_rate": 4.082125603864734e-06, + "loss": 0.1622, + "num_input_tokens_seen": 145984, + "step": 170 + }, + { + "epoch": 0.423728813559322, + "grad_norm": 44.44621276855469, + "learning_rate": 4.202898550724638e-06, + "loss": 0.1974, + "num_input_tokens_seen": 150144, + "step": 175 + }, + { + "epoch": 0.4358353510895884, + "grad_norm": 12.341259956359863, + "learning_rate": 4.323671497584541e-06, + "loss": 0.2191, + "num_input_tokens_seen": 154624, + "step": 180 + }, + { + "epoch": 0.44794188861985473, + "grad_norm": 13.725370407104492, + "learning_rate": 4.444444444444444e-06, + "loss": 0.2192, + "num_input_tokens_seen": 158784, + "step": 185 + }, + { + "epoch": 0.4600484261501211, + "grad_norm": 9.8717622756958, + "learning_rate": 4.565217391304348e-06, + "loss": 0.1887, + "num_input_tokens_seen": 163072, + "step": 190 + }, + { + "epoch": 0.4721549636803874, + "grad_norm": 15.11948299407959, + "learning_rate": 4.6859903381642516e-06, + "loss": 0.1951, + "num_input_tokens_seen": 167104, + "step": 195 + }, + { + "epoch": 0.48426150121065376, + "grad_norm": 10.298319816589355, + "learning_rate": 4.806763285024155e-06, + "loss": 0.1486, + "num_input_tokens_seen": 171456, + "step": 200 + }, + { + "epoch": 0.4963680387409201, + "grad_norm": 22.512561798095703, + "learning_rate": 4.927536231884059e-06, + "loss": 0.2178, + "num_input_tokens_seen": 175808, + "step": 205 + }, + { + "epoch": 0.5036319612590799, + "eval_loss": 0.1319892704486847, + "eval_runtime": 0.6163, + "eval_samples_per_second": 595.512, + "eval_steps_per_second": 74.642, + "num_input_tokens_seen": 178688, + "step": 208 + }, + { + "epoch": 0.5084745762711864, + "grad_norm": 11.861852645874023, + "learning_rate": 4.999985705205496e-06, + "loss": 0.1052, + "num_input_tokens_seen": 180224, + "step": 210 + }, + { + "epoch": 0.5205811138014528, + "grad_norm": 15.08198070526123, + "learning_rate": 4.999824890644693e-06, + "loss": 0.1655, + "num_input_tokens_seen": 184704, + "step": 215 + }, + { + "epoch": 0.5326876513317191, + "grad_norm": 9.965168952941895, + "learning_rate": 4.999485404562269e-06, + "loss": 0.3684, + "num_input_tokens_seen": 189184, + "step": 220 + }, + { + "epoch": 0.5447941888619855, + "grad_norm": 7.275655269622803, + "learning_rate": 4.998967271222521e-06, + "loss": 0.1527, + "num_input_tokens_seen": 193536, + "step": 225 + }, + { + "epoch": 0.5569007263922519, + "grad_norm": 7.0880584716796875, + "learning_rate": 4.998270527658311e-06, + "loss": 0.1238, + "num_input_tokens_seen": 197888, + "step": 230 + }, + { + "epoch": 0.5690072639225182, + "grad_norm": 27.60887908935547, + "learning_rate": 4.997395223668422e-06, + "loss": 0.2147, + "num_input_tokens_seen": 202112, + "step": 235 + }, + { + "epoch": 0.5811138014527845, + "grad_norm": 43.02740478515625, + "learning_rate": 4.996341421813993e-06, + "loss": 0.1162, + "num_input_tokens_seen": 206528, + "step": 240 + }, + { + "epoch": 0.5932203389830508, + "grad_norm": 30.406055450439453, + "learning_rate": 4.995109197414051e-06, + "loss": 0.1311, + "num_input_tokens_seen": 210944, + "step": 245 + }, + { + "epoch": 0.6053268765133172, + "grad_norm": 14.91820240020752, + "learning_rate": 4.9936986385401305e-06, + "loss": 0.1437, + "num_input_tokens_seen": 215104, + "step": 250 + }, + { + "epoch": 0.6174334140435835, + "grad_norm": 20.09491729736328, + "learning_rate": 4.992109846009972e-06, + "loss": 0.1597, + "num_input_tokens_seen": 219328, + "step": 255 + }, + { + "epoch": 0.6295399515738499, + "grad_norm": 6.193624973297119, + "learning_rate": 4.990342933380321e-06, + "loss": 0.1878, + "num_input_tokens_seen": 223680, + "step": 260 + }, + { + "epoch": 0.6416464891041163, + "grad_norm": 6.540223121643066, + "learning_rate": 4.988398026938811e-06, + "loss": 0.1445, + "num_input_tokens_seen": 227904, + "step": 265 + }, + { + "epoch": 0.6537530266343826, + "grad_norm": 17.89214515686035, + "learning_rate": 4.986275265694935e-06, + "loss": 0.0992, + "num_input_tokens_seen": 231936, + "step": 270 + }, + { + "epoch": 0.6658595641646489, + "grad_norm": 0.7999329566955566, + "learning_rate": 4.983974801370115e-06, + "loss": 0.0608, + "num_input_tokens_seen": 236160, + "step": 275 + }, + { + "epoch": 0.6779661016949152, + "grad_norm": 36.78638458251953, + "learning_rate": 4.981496798386849e-06, + "loss": 0.2262, + "num_input_tokens_seen": 240320, + "step": 280 + }, + { + "epoch": 0.6900726392251816, + "grad_norm": 18.634634017944336, + "learning_rate": 4.9788414338569715e-06, + "loss": 0.1165, + "num_input_tokens_seen": 244800, + "step": 285 + }, + { + "epoch": 0.7021791767554479, + "grad_norm": 35.7069091796875, + "learning_rate": 4.9760088975689815e-06, + "loss": 0.2377, + "num_input_tokens_seen": 249152, + "step": 290 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 10.031001091003418, + "learning_rate": 4.972999391974488e-06, + "loss": 0.1377, + "num_input_tokens_seen": 253376, + "step": 295 + }, + { + "epoch": 0.7263922518159807, + "grad_norm": 7.3871612548828125, + "learning_rate": 4.969813132173735e-06, + "loss": 0.19, + "num_input_tokens_seen": 257664, + "step": 300 + }, + { + "epoch": 0.738498789346247, + "grad_norm": 16.09194564819336, + "learning_rate": 4.966450345900229e-06, + "loss": 0.1146, + "num_input_tokens_seen": 262016, + "step": 305 + }, + { + "epoch": 0.7506053268765133, + "grad_norm": 7.399415969848633, + "learning_rate": 4.962911273504461e-06, + "loss": 0.1165, + "num_input_tokens_seen": 266432, + "step": 310 + }, + { + "epoch": 0.7554479418886199, + "eval_loss": 0.11303775012493134, + "eval_runtime": 1.7273, + "eval_samples_per_second": 212.47, + "eval_steps_per_second": 26.631, + "num_input_tokens_seen": 267968, + "step": 312 + }, + { + "epoch": 0.7627118644067796, + "grad_norm": 11.618612289428711, + "learning_rate": 4.959196167936729e-06, + "loss": 0.181, + "num_input_tokens_seen": 270464, + "step": 315 + }, + { + "epoch": 0.774818401937046, + "grad_norm": 11.343527793884277, + "learning_rate": 4.955305294729056e-06, + "loss": 0.0946, + "num_input_tokens_seen": 274688, + "step": 320 + }, + { + "epoch": 0.7869249394673123, + "grad_norm": 9.781023025512695, + "learning_rate": 4.9512389319762165e-06, + "loss": 0.1293, + "num_input_tokens_seen": 278848, + "step": 325 + }, + { + "epoch": 0.7990314769975787, + "grad_norm": 23.389354705810547, + "learning_rate": 4.946997370315857e-06, + "loss": 0.124, + "num_input_tokens_seen": 283136, + "step": 330 + }, + { + "epoch": 0.8111380145278451, + "grad_norm": 21.524974822998047, + "learning_rate": 4.9425809129077204e-06, + "loss": 0.1767, + "num_input_tokens_seen": 287680, + "step": 335 + }, + { + "epoch": 0.8232445520581114, + "grad_norm": 12.489716529846191, + "learning_rate": 4.937989875411986e-06, + "loss": 0.0811, + "num_input_tokens_seen": 292224, + "step": 340 + }, + { + "epoch": 0.8353510895883777, + "grad_norm": 11.355611801147461, + "learning_rate": 4.933224585966696e-06, + "loss": 0.1567, + "num_input_tokens_seen": 296448, + "step": 345 + }, + { + "epoch": 0.847457627118644, + "grad_norm": 24.681901931762695, + "learning_rate": 4.928285385164316e-06, + "loss": 0.1363, + "num_input_tokens_seen": 300736, + "step": 350 + }, + { + "epoch": 0.8595641646489104, + "grad_norm": 8.876485824584961, + "learning_rate": 4.92317262602738e-06, + "loss": 0.1348, + "num_input_tokens_seen": 304960, + "step": 355 + }, + { + "epoch": 0.8716707021791767, + "grad_norm": 20.233808517456055, + "learning_rate": 4.917886673983267e-06, + "loss": 0.1694, + "num_input_tokens_seen": 309184, + "step": 360 + }, + { + "epoch": 0.8837772397094431, + "grad_norm": 20.60609245300293, + "learning_rate": 4.912427906838079e-06, + "loss": 0.1352, + "num_input_tokens_seen": 313408, + "step": 365 + }, + { + "epoch": 0.8958837772397095, + "grad_norm": 6.115711688995361, + "learning_rate": 4.906796714749635e-06, + "loss": 0.0933, + "num_input_tokens_seen": 317888, + "step": 370 + }, + { + "epoch": 0.9079903147699758, + "grad_norm": 18.18195152282715, + "learning_rate": 4.900993500199591e-06, + "loss": 0.1488, + "num_input_tokens_seen": 322048, + "step": 375 + }, + { + "epoch": 0.9200968523002422, + "grad_norm": 12.584397315979004, + "learning_rate": 4.895018677964669e-06, + "loss": 0.087, + "num_input_tokens_seen": 326592, + "step": 380 + }, + { + "epoch": 0.9322033898305084, + "grad_norm": 7.245577812194824, + "learning_rate": 4.888872675087012e-06, + "loss": 0.1017, + "num_input_tokens_seen": 330880, + "step": 385 + }, + { + "epoch": 0.9443099273607748, + "grad_norm": 51.712425231933594, + "learning_rate": 4.882555930843664e-06, + "loss": 0.1105, + "num_input_tokens_seen": 335104, + "step": 390 + }, + { + "epoch": 0.9564164648910412, + "grad_norm": 25.56722068786621, + "learning_rate": 4.876068896715171e-06, + "loss": 0.1437, + "num_input_tokens_seen": 339392, + "step": 395 + }, + { + "epoch": 0.9685230024213075, + "grad_norm": 20.534461975097656, + "learning_rate": 4.8694120363533105e-06, + "loss": 0.146, + "num_input_tokens_seen": 343744, + "step": 400 + }, + { + "epoch": 0.9806295399515739, + "grad_norm": 8.779641151428223, + "learning_rate": 4.862585825547957e-06, + "loss": 0.0985, + "num_input_tokens_seen": 348160, + "step": 405 + }, + { + "epoch": 0.9927360774818402, + "grad_norm": 17.242847442626953, + "learning_rate": 4.855590752193075e-06, + "loss": 0.116, + "num_input_tokens_seen": 352448, + "step": 410 + }, + { + "epoch": 1.0048426150121066, + "grad_norm": 13.249277114868164, + "learning_rate": 4.848427316251843e-06, + "loss": 0.1193, + "num_input_tokens_seen": 356656, + "step": 415 + }, + { + "epoch": 1.0072639225181599, + "eval_loss": 0.10842076689004898, + "eval_runtime": 0.63, + "eval_samples_per_second": 582.523, + "eval_steps_per_second": 73.014, + "num_input_tokens_seen": 357488, + "step": 416 + }, + { + "epoch": 1.0169491525423728, + "grad_norm": 1.1179414987564087, + "learning_rate": 4.841096029720921e-06, + "loss": 0.073, + "num_input_tokens_seen": 360880, + "step": 420 + }, + { + "epoch": 1.0290556900726393, + "grad_norm": 31.719369888305664, + "learning_rate": 4.833597416593861e-06, + "loss": 0.0535, + "num_input_tokens_seen": 365104, + "step": 425 + }, + { + "epoch": 1.0411622276029056, + "grad_norm": 48.02503204345703, + "learning_rate": 4.825932012823652e-06, + "loss": 0.1458, + "num_input_tokens_seen": 369776, + "step": 430 + }, + { + "epoch": 1.053268765133172, + "grad_norm": 97.04767608642578, + "learning_rate": 4.818100366284408e-06, + "loss": 0.1602, + "num_input_tokens_seen": 374000, + "step": 435 + }, + { + "epoch": 1.0653753026634383, + "grad_norm": 37.753238677978516, + "learning_rate": 4.81010303673222e-06, + "loss": 0.2577, + "num_input_tokens_seen": 378096, + "step": 440 + }, + { + "epoch": 1.0774818401937045, + "grad_norm": 0.17760241031646729, + "learning_rate": 4.80194059576514e-06, + "loss": 0.0566, + "num_input_tokens_seen": 382256, + "step": 445 + }, + { + "epoch": 1.089588377723971, + "grad_norm": 34.652015686035156, + "learning_rate": 4.793613626782331e-06, + "loss": 0.1761, + "num_input_tokens_seen": 386672, + "step": 450 + }, + { + "epoch": 1.1016949152542372, + "grad_norm": 28.04759407043457, + "learning_rate": 4.785122724942367e-06, + "loss": 0.0591, + "num_input_tokens_seen": 390960, + "step": 455 + }, + { + "epoch": 1.1138014527845037, + "grad_norm": 2.957566976547241, + "learning_rate": 4.7764684971206974e-06, + "loss": 0.0952, + "num_input_tokens_seen": 395440, + "step": 460 + }, + { + "epoch": 1.12590799031477, + "grad_norm": 44.540069580078125, + "learning_rate": 4.767651561866269e-06, + "loss": 0.0664, + "num_input_tokens_seen": 399600, + "step": 465 + }, + { + "epoch": 1.1380145278450362, + "grad_norm": 24.51837730407715, + "learning_rate": 4.758672549357316e-06, + "loss": 0.1001, + "num_input_tokens_seen": 403888, + "step": 470 + }, + { + "epoch": 1.1501210653753027, + "grad_norm": 40.098114013671875, + "learning_rate": 4.7495321013563225e-06, + "loss": 0.2506, + "num_input_tokens_seen": 408176, + "step": 475 + }, + { + "epoch": 1.162227602905569, + "grad_norm": 2.672497510910034, + "learning_rate": 4.740230871164148e-06, + "loss": 0.044, + "num_input_tokens_seen": 412208, + "step": 480 + }, + { + "epoch": 1.1743341404358354, + "grad_norm": 0.2532678544521332, + "learning_rate": 4.730769523573337e-06, + "loss": 0.1472, + "num_input_tokens_seen": 416624, + "step": 485 + }, + { + "epoch": 1.1864406779661016, + "grad_norm": 4.472592830657959, + "learning_rate": 4.721148734820605e-06, + "loss": 0.1661, + "num_input_tokens_seen": 421040, + "step": 490 + }, + { + "epoch": 1.1985472154963681, + "grad_norm": 37.826171875, + "learning_rate": 4.711369192538503e-06, + "loss": 0.094, + "num_input_tokens_seen": 425136, + "step": 495 + }, + { + "epoch": 1.2106537530266344, + "grad_norm": 14.548481941223145, + "learning_rate": 4.701431595706269e-06, + "loss": 0.1282, + "num_input_tokens_seen": 429680, + "step": 500 + }, + { + "epoch": 1.2227602905569008, + "grad_norm": 24.612041473388672, + "learning_rate": 4.691336654599873e-06, + "loss": 0.0874, + "num_input_tokens_seen": 434224, + "step": 505 + }, + { + "epoch": 1.234866828087167, + "grad_norm": 11.3072509765625, + "learning_rate": 4.6810850907412486e-06, + "loss": 0.0403, + "num_input_tokens_seen": 438320, + "step": 510 + }, + { + "epoch": 1.2469733656174333, + "grad_norm": 9.126791000366211, + "learning_rate": 4.6706776368467236e-06, + "loss": 0.0227, + "num_input_tokens_seen": 442672, + "step": 515 + }, + { + "epoch": 1.2590799031476998, + "grad_norm": 23.92775535583496, + "learning_rate": 4.6601150367746485e-06, + "loss": 0.0685, + "num_input_tokens_seen": 446896, + "step": 520 + }, + { + "epoch": 1.2590799031476998, + "eval_loss": 0.19028596580028534, + "eval_runtime": 0.7167, + "eval_samples_per_second": 512.081, + "eval_steps_per_second": 64.185, + "num_input_tokens_seen": 446896, + "step": 520 + }, + { + "epoch": 1.271186440677966, + "grad_norm": 43.593448638916016, + "learning_rate": 4.649398045472235e-06, + "loss": 0.1008, + "num_input_tokens_seen": 451312, + "step": 525 + }, + { + "epoch": 1.2832929782082325, + "grad_norm": 8.737812042236328, + "learning_rate": 4.638527428921592e-06, + "loss": 0.3076, + "num_input_tokens_seen": 455408, + "step": 530 + }, + { + "epoch": 1.2953995157384988, + "grad_norm": 2.4155948162078857, + "learning_rate": 4.627503964084981e-06, + "loss": 0.0462, + "num_input_tokens_seen": 460080, + "step": 535 + }, + { + "epoch": 1.307506053268765, + "grad_norm": 2.872014284133911, + "learning_rate": 4.616328438849284e-06, + "loss": 0.0124, + "num_input_tokens_seen": 464496, + "step": 540 + }, + { + "epoch": 1.3196125907990315, + "grad_norm": 34.14030456542969, + "learning_rate": 4.605001651969686e-06, + "loss": 0.1408, + "num_input_tokens_seen": 468720, + "step": 545 + }, + { + "epoch": 1.331719128329298, + "grad_norm": 59.56473922729492, + "learning_rate": 4.5935244130125925e-06, + "loss": 0.115, + "num_input_tokens_seen": 473264, + "step": 550 + }, + { + "epoch": 1.3438256658595642, + "grad_norm": 1.2305806875228882, + "learning_rate": 4.581897542297761e-06, + "loss": 0.0061, + "num_input_tokens_seen": 477552, + "step": 555 + }, + { + "epoch": 1.3559322033898304, + "grad_norm": 52.619632720947266, + "learning_rate": 4.570121870839671e-06, + "loss": 0.0843, + "num_input_tokens_seen": 482032, + "step": 560 + }, + { + "epoch": 1.368038740920097, + "grad_norm": 97.8170394897461, + "learning_rate": 4.558198240288131e-06, + "loss": 0.0764, + "num_input_tokens_seen": 486384, + "step": 565 + }, + { + "epoch": 1.3801452784503632, + "grad_norm": 35.3338737487793, + "learning_rate": 4.5461275028681186e-06, + "loss": 0.1836, + "num_input_tokens_seen": 490672, + "step": 570 + }, + { + "epoch": 1.3922518159806296, + "grad_norm": 27.42061996459961, + "learning_rate": 4.533910521318872e-06, + "loss": 0.1097, + "num_input_tokens_seen": 494960, + "step": 575 + }, + { + "epoch": 1.4043583535108959, + "grad_norm": 7.799497604370117, + "learning_rate": 4.521548168832227e-06, + "loss": 0.1144, + "num_input_tokens_seen": 499120, + "step": 580 + }, + { + "epoch": 1.4164648910411621, + "grad_norm": 11.070670127868652, + "learning_rate": 4.509041328990204e-06, + "loss": 0.0169, + "num_input_tokens_seen": 503408, + "step": 585 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 5.323161602020264, + "learning_rate": 4.496390895701858e-06, + "loss": 0.0424, + "num_input_tokens_seen": 507312, + "step": 590 + }, + { + "epoch": 1.4406779661016949, + "grad_norm": 26.43401527404785, + "learning_rate": 4.483597773139387e-06, + "loss": 0.053, + "num_input_tokens_seen": 511600, + "step": 595 + }, + { + "epoch": 1.4527845036319613, + "grad_norm": 0.05975044146180153, + "learning_rate": 4.470662875673506e-06, + "loss": 0.0615, + "num_input_tokens_seen": 515888, + "step": 600 + }, + { + "epoch": 1.4648910411622276, + "grad_norm": 52.62843322753906, + "learning_rate": 4.4575871278080964e-06, + "loss": 0.2071, + "num_input_tokens_seen": 519920, + "step": 605 + }, + { + "epoch": 1.4769975786924938, + "grad_norm": 10.014227867126465, + "learning_rate": 4.444371464114126e-06, + "loss": 0.0688, + "num_input_tokens_seen": 524336, + "step": 610 + }, + { + "epoch": 1.4891041162227603, + "grad_norm": 0.5611757636070251, + "learning_rate": 4.431016829162851e-06, + "loss": 0.071, + "num_input_tokens_seen": 528496, + "step": 615 + }, + { + "epoch": 1.5012106537530268, + "grad_norm": 0.10402300208806992, + "learning_rate": 4.417524177458309e-06, + "loss": 0.0801, + "num_input_tokens_seen": 532784, + "step": 620 + }, + { + "epoch": 1.5108958837772397, + "eval_loss": 0.1981746405363083, + "eval_runtime": 0.6398, + "eval_samples_per_second": 573.626, + "eval_steps_per_second": 71.899, + "num_input_tokens_seen": 536176, + "step": 624 + }, + { + "epoch": 1.513317191283293, + "grad_norm": 24.07758331298828, + "learning_rate": 4.403894473369092e-06, + "loss": 0.0258, + "num_input_tokens_seen": 537136, + "step": 625 + }, + { + "epoch": 1.5254237288135593, + "grad_norm": 48.458106994628906, + "learning_rate": 4.390128691059423e-06, + "loss": 0.199, + "num_input_tokens_seen": 541552, + "step": 630 + }, + { + "epoch": 1.5375302663438255, + "grad_norm": 8.228415489196777, + "learning_rate": 4.376227814419524e-06, + "loss": 0.1964, + "num_input_tokens_seen": 545648, + "step": 635 + }, + { + "epoch": 1.549636803874092, + "grad_norm": 12.28972339630127, + "learning_rate": 4.3621928369952995e-06, + "loss": 0.06, + "num_input_tokens_seen": 550256, + "step": 640 + }, + { + "epoch": 1.5617433414043584, + "grad_norm": 15.129716873168945, + "learning_rate": 4.348024761917321e-06, + "loss": 0.1114, + "num_input_tokens_seen": 554928, + "step": 645 + }, + { + "epoch": 1.5738498789346247, + "grad_norm": 0.27470338344573975, + "learning_rate": 4.333724601829132e-06, + "loss": 0.0725, + "num_input_tokens_seen": 559344, + "step": 650 + }, + { + "epoch": 1.585956416464891, + "grad_norm": 0.1773267388343811, + "learning_rate": 4.319293378814868e-06, + "loss": 0.1308, + "num_input_tokens_seen": 563760, + "step": 655 + }, + { + "epoch": 1.5980629539951574, + "grad_norm": 14.22355842590332, + "learning_rate": 4.3047321243262065e-06, + "loss": 0.0653, + "num_input_tokens_seen": 568112, + "step": 660 + }, + { + "epoch": 1.6101694915254239, + "grad_norm": 0.1500890851020813, + "learning_rate": 4.290041879108641e-06, + "loss": 0.006, + "num_input_tokens_seen": 572464, + "step": 665 + }, + { + "epoch": 1.6222760290556901, + "grad_norm": 30.997364044189453, + "learning_rate": 4.275223693127103e-06, + "loss": 0.0771, + "num_input_tokens_seen": 576752, + "step": 670 + }, + { + "epoch": 1.6343825665859564, + "grad_norm": 44.13563919067383, + "learning_rate": 4.260278625490911e-06, + "loss": 0.034, + "num_input_tokens_seen": 580976, + "step": 675 + }, + { + "epoch": 1.6464891041162226, + "grad_norm": 1.0693022012710571, + "learning_rate": 4.245207744378075e-06, + "loss": 0.1429, + "num_input_tokens_seen": 585264, + "step": 680 + }, + { + "epoch": 1.658595641646489, + "grad_norm": 32.29472351074219, + "learning_rate": 4.2300121269589475e-06, + "loss": 0.0664, + "num_input_tokens_seen": 589744, + "step": 685 + }, + { + "epoch": 1.6707021791767556, + "grad_norm": 40.11149597167969, + "learning_rate": 4.2146928593192375e-06, + "loss": 0.0792, + "num_input_tokens_seen": 593968, + "step": 690 + }, + { + "epoch": 1.6828087167070218, + "grad_norm": 43.619258880615234, + "learning_rate": 4.19925103638238e-06, + "loss": 0.1061, + "num_input_tokens_seen": 598256, + "step": 695 + }, + { + "epoch": 1.694915254237288, + "grad_norm": 0.6466928720474243, + "learning_rate": 4.183687761831282e-06, + "loss": 0.0958, + "num_input_tokens_seen": 602608, + "step": 700 + }, + { + "epoch": 1.7070217917675545, + "grad_norm": 27.355270385742188, + "learning_rate": 4.168004148029435e-06, + "loss": 0.1234, + "num_input_tokens_seen": 607088, + "step": 705 + }, + { + "epoch": 1.7191283292978208, + "grad_norm": 17.83440399169922, + "learning_rate": 4.152201315941414e-06, + "loss": 0.1094, + "num_input_tokens_seen": 611248, + "step": 710 + }, + { + "epoch": 1.7312348668280872, + "grad_norm": 1.9912621974945068, + "learning_rate": 4.136280395052754e-06, + "loss": 0.1047, + "num_input_tokens_seen": 615536, + "step": 715 + }, + { + "epoch": 1.7433414043583535, + "grad_norm": 7.247110843658447, + "learning_rate": 4.120242523289223e-06, + "loss": 0.0341, + "num_input_tokens_seen": 619952, + "step": 720 + }, + { + "epoch": 1.7554479418886197, + "grad_norm": 42.79171371459961, + "learning_rate": 4.104088846935493e-06, + "loss": 0.2066, + "num_input_tokens_seen": 624368, + "step": 725 + }, + { + "epoch": 1.7627118644067796, + "eval_loss": 0.14485575258731842, + "eval_runtime": 0.6316, + "eval_samples_per_second": 581.029, + "eval_steps_per_second": 72.826, + "num_input_tokens_seen": 626992, + "step": 728 + }, + { + "epoch": 1.7675544794188862, + "grad_norm": 15.863067626953125, + "learning_rate": 4.087820520553205e-06, + "loss": 0.0104, + "num_input_tokens_seen": 628720, + "step": 730 + }, + { + "epoch": 1.7796610169491527, + "grad_norm": 4.8767571449279785, + "learning_rate": 4.071438706898457e-06, + "loss": 0.0572, + "num_input_tokens_seen": 633008, + "step": 735 + }, + { + "epoch": 1.791767554479419, + "grad_norm": 8.256195068359375, + "learning_rate": 4.0549445768386895e-06, + "loss": 0.1222, + "num_input_tokens_seen": 637360, + "step": 740 + }, + { + "epoch": 1.8038740920096852, + "grad_norm": 40.003360748291016, + "learning_rate": 4.038339309269002e-06, + "loss": 0.1171, + "num_input_tokens_seen": 641648, + "step": 745 + }, + { + "epoch": 1.8159806295399514, + "grad_norm": 1.1402125358581543, + "learning_rate": 4.021624091027895e-06, + "loss": 0.1638, + "num_input_tokens_seen": 645552, + "step": 750 + }, + { + "epoch": 1.828087167070218, + "grad_norm": 19.136348724365234, + "learning_rate": 4.00480011681244e-06, + "loss": 0.1092, + "num_input_tokens_seen": 649904, + "step": 755 + }, + { + "epoch": 1.8401937046004844, + "grad_norm": 16.968360900878906, + "learning_rate": 3.987868589092894e-06, + "loss": 0.1118, + "num_input_tokens_seen": 654128, + "step": 760 + }, + { + "epoch": 1.8523002421307506, + "grad_norm": 15.944784164428711, + "learning_rate": 3.970830718026746e-06, + "loss": 0.1015, + "num_input_tokens_seen": 658672, + "step": 765 + }, + { + "epoch": 1.8644067796610169, + "grad_norm": 20.015836715698242, + "learning_rate": 3.9536877213722335e-06, + "loss": 0.1207, + "num_input_tokens_seen": 663088, + "step": 770 + }, + { + "epoch": 1.8765133171912833, + "grad_norm": 30.892627716064453, + "learning_rate": 3.936440824401299e-06, + "loss": 0.083, + "num_input_tokens_seen": 667440, + "step": 775 + }, + { + "epoch": 1.8886198547215496, + "grad_norm": 19.33950424194336, + "learning_rate": 3.919091259812013e-06, + "loss": 0.0249, + "num_input_tokens_seen": 671792, + "step": 780 + }, + { + "epoch": 1.900726392251816, + "grad_norm": 24.891815185546875, + "learning_rate": 3.901640267640475e-06, + "loss": 0.0425, + "num_input_tokens_seen": 676336, + "step": 785 + }, + { + "epoch": 1.9128329297820823, + "grad_norm": 28.49574089050293, + "learning_rate": 3.884089095172181e-06, + "loss": 0.0402, + "num_input_tokens_seen": 680624, + "step": 790 + }, + { + "epoch": 1.9249394673123486, + "grad_norm": 0.016377810388803482, + "learning_rate": 3.866438996852873e-06, + "loss": 0.0155, + "num_input_tokens_seen": 685040, + "step": 795 + }, + { + "epoch": 1.937046004842615, + "grad_norm": 0.16267594695091248, + "learning_rate": 3.848691234198879e-06, + "loss": 0.0372, + "num_input_tokens_seen": 689392, + "step": 800 + }, + { + "epoch": 1.9491525423728815, + "grad_norm": 1.1569898128509521, + "learning_rate": 3.830847075706957e-06, + "loss": 0.1257, + "num_input_tokens_seen": 693552, + "step": 805 + }, + { + "epoch": 1.9612590799031477, + "grad_norm": 1.0089043378829956, + "learning_rate": 3.812907796763616e-06, + "loss": 0.0454, + "num_input_tokens_seen": 698032, + "step": 810 + }, + { + "epoch": 1.973365617433414, + "grad_norm": 0.9869695901870728, + "learning_rate": 3.794874679553975e-06, + "loss": 0.2136, + "num_input_tokens_seen": 702000, + "step": 815 + }, + { + "epoch": 1.9854721549636802, + "grad_norm": 23.745838165283203, + "learning_rate": 3.7767490129701057e-06, + "loss": 0.1643, + "num_input_tokens_seen": 706160, + "step": 820 + }, + { + "epoch": 1.9975786924939467, + "grad_norm": 4.922607898712158, + "learning_rate": 3.7585320925189246e-06, + "loss": 0.0475, + "num_input_tokens_seen": 710768, + "step": 825 + }, + { + "epoch": 2.009685230024213, + "grad_norm": 0.3702712655067444, + "learning_rate": 3.7402252202295876e-06, + "loss": 0.0011, + "num_input_tokens_seen": 714744, + "step": 830 + }, + { + "epoch": 2.0145278450363198, + "eval_loss": 0.2067757099866867, + "eval_runtime": 0.634, + "eval_samples_per_second": 578.871, + "eval_steps_per_second": 72.556, + "num_input_tokens_seen": 716344, + "step": 832 + }, + { + "epoch": 2.0217917675544794, + "grad_norm": 66.86534118652344, + "learning_rate": 3.7218297045604362e-06, + "loss": 0.0057, + "num_input_tokens_seen": 718776, + "step": 835 + }, + { + "epoch": 2.0338983050847457, + "grad_norm": 0.06272957473993301, + "learning_rate": 3.703346860305473e-06, + "loss": 0.0114, + "num_input_tokens_seen": 722744, + "step": 840 + }, + { + "epoch": 2.046004842615012, + "grad_norm": 0.00682666152715683, + "learning_rate": 3.6847780085003908e-06, + "loss": 0.0047, + "num_input_tokens_seen": 727160, + "step": 845 + }, + { + "epoch": 2.0581113801452786, + "grad_norm": 60.80389404296875, + "learning_rate": 3.666124476328155e-06, + "loss": 0.0867, + "num_input_tokens_seen": 731576, + "step": 850 + }, + { + "epoch": 2.070217917675545, + "grad_norm": 15.216704368591309, + "learning_rate": 3.647387597024139e-06, + "loss": 0.0084, + "num_input_tokens_seen": 736184, + "step": 855 + }, + { + "epoch": 2.082324455205811, + "grad_norm": 0.1580037623643875, + "learning_rate": 3.6285687097808396e-06, + "loss": 0.0011, + "num_input_tokens_seen": 740472, + "step": 860 + }, + { + "epoch": 2.0944309927360774, + "grad_norm": 56.17728805541992, + "learning_rate": 3.609669159652158e-06, + "loss": 0.0528, + "num_input_tokens_seen": 744760, + "step": 865 + }, + { + "epoch": 2.106537530266344, + "grad_norm": 0.03734096884727478, + "learning_rate": 3.5906902974572623e-06, + "loss": 0.0003, + "num_input_tokens_seen": 749176, + "step": 870 + }, + { + "epoch": 2.1186440677966103, + "grad_norm": 0.013891610316932201, + "learning_rate": 3.5716334796840403e-06, + "loss": 0.0329, + "num_input_tokens_seen": 753528, + "step": 875 + }, + { + "epoch": 2.1307506053268765, + "grad_norm": 0.11342939734458923, + "learning_rate": 3.5525000683921467e-06, + "loss": 0.0022, + "num_input_tokens_seen": 757688, + "step": 880 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.012256304733455181, + "learning_rate": 3.533291431115653e-06, + "loss": 0.0268, + "num_input_tokens_seen": 762040, + "step": 885 + }, + { + "epoch": 2.154963680387409, + "grad_norm": 55.32374572753906, + "learning_rate": 3.514008940765304e-06, + "loss": 0.0746, + "num_input_tokens_seen": 766200, + "step": 890 + }, + { + "epoch": 2.1670702179176757, + "grad_norm": 0.006180486176162958, + "learning_rate": 3.494653975530388e-06, + "loss": 0.0202, + "num_input_tokens_seen": 770680, + "step": 895 + }, + { + "epoch": 2.179176755447942, + "grad_norm": 0.8855127096176147, + "learning_rate": 3.475227918780239e-06, + "loss": 0.0023, + "num_input_tokens_seen": 774840, + "step": 900 + }, + { + "epoch": 2.1912832929782082, + "grad_norm": 0.0062964423559606075, + "learning_rate": 3.455732158965356e-06, + "loss": 0.0001, + "num_input_tokens_seen": 779192, + "step": 905 + }, + { + "epoch": 2.2033898305084745, + "grad_norm": 0.005455203354358673, + "learning_rate": 3.436168089518168e-06, + "loss": 0.0001, + "num_input_tokens_seen": 783608, + "step": 910 + }, + { + "epoch": 2.2154963680387407, + "grad_norm": 40.38529968261719, + "learning_rate": 3.4165371087534428e-06, + "loss": 0.0365, + "num_input_tokens_seen": 788088, + "step": 915 + }, + { + "epoch": 2.2276029055690074, + "grad_norm": 0.01571381278336048, + "learning_rate": 3.396840619768338e-06, + "loss": 0.0, + "num_input_tokens_seen": 792568, + "step": 920 + }, + { + "epoch": 2.2397094430992737, + "grad_norm": 0.002371912356466055, + "learning_rate": 3.377080030342125e-06, + "loss": 0.0001, + "num_input_tokens_seen": 797176, + "step": 925 + }, + { + "epoch": 2.25181598062954, + "grad_norm": 31.288312911987305, + "learning_rate": 3.3572567528355614e-06, + "loss": 0.0038, + "num_input_tokens_seen": 801400, + "step": 930 + }, + { + "epoch": 2.263922518159806, + "grad_norm": 0.009536500088870525, + "learning_rate": 3.3373722040899515e-06, + "loss": 0.0059, + "num_input_tokens_seen": 805944, + "step": 935 + }, + { + "epoch": 2.2663438256658597, + "eval_loss": 0.26913806796073914, + "eval_runtime": 0.697, + "eval_samples_per_second": 526.575, + "eval_steps_per_second": 66.001, + "num_input_tokens_seen": 806712, + "step": 936 + }, + { + "epoch": 2.2760290556900724, + "grad_norm": 0.0007474619778804481, + "learning_rate": 3.3174278053258753e-06, + "loss": 0.0006, + "num_input_tokens_seen": 810040, + "step": 940 + }, + { + "epoch": 2.288135593220339, + "grad_norm": 42.942989349365234, + "learning_rate": 3.2974249820416094e-06, + "loss": 0.0482, + "num_input_tokens_seen": 814392, + "step": 945 + }, + { + "epoch": 2.3002421307506054, + "grad_norm": 62.229331970214844, + "learning_rate": 3.2773651639112432e-06, + "loss": 0.0175, + "num_input_tokens_seen": 818872, + "step": 950 + }, + { + "epoch": 2.3123486682808716, + "grad_norm": 0.0007935139001347125, + "learning_rate": 3.2572497846824922e-06, + "loss": 0.0039, + "num_input_tokens_seen": 823096, + "step": 955 + }, + { + "epoch": 2.324455205811138, + "grad_norm": 2.6152658462524414, + "learning_rate": 3.2370802820742273e-06, + "loss": 0.0549, + "num_input_tokens_seen": 827128, + "step": 960 + }, + { + "epoch": 2.3365617433414045, + "grad_norm": 0.9258336424827576, + "learning_rate": 3.2168580976737105e-06, + "loss": 0.0011, + "num_input_tokens_seen": 831288, + "step": 965 + }, + { + "epoch": 2.348668280871671, + "grad_norm": 0.28666022419929504, + "learning_rate": 3.1965846768335625e-06, + "loss": 0.0202, + "num_input_tokens_seen": 835640, + "step": 970 + }, + { + "epoch": 2.360774818401937, + "grad_norm": 0.0008573018712922931, + "learning_rate": 3.176261468568457e-06, + "loss": 0.0019, + "num_input_tokens_seen": 839736, + "step": 975 + }, + { + "epoch": 2.3728813559322033, + "grad_norm": 55.564476013183594, + "learning_rate": 3.155889925451557e-06, + "loss": 0.0363, + "num_input_tokens_seen": 844024, + "step": 980 + }, + { + "epoch": 2.38498789346247, + "grad_norm": 0.025824718177318573, + "learning_rate": 3.1354715035106892e-06, + "loss": 0.0001, + "num_input_tokens_seen": 848248, + "step": 985 + }, + { + "epoch": 2.3970944309927362, + "grad_norm": 0.0016786637715995312, + "learning_rate": 3.115007662124282e-06, + "loss": 0.0, + "num_input_tokens_seen": 852472, + "step": 990 + }, + { + "epoch": 2.4092009685230025, + "grad_norm": 0.014527814462780952, + "learning_rate": 3.0944998639170544e-06, + "loss": 0.0006, + "num_input_tokens_seen": 856824, + "step": 995 + }, + { + "epoch": 2.4213075060532687, + "grad_norm": 4.990849018096924, + "learning_rate": 3.0739495746554785e-06, + "loss": 0.0018, + "num_input_tokens_seen": 860984, + "step": 1000 + }, + { + "epoch": 2.433414043583535, + "grad_norm": 0.5683162808418274, + "learning_rate": 3.0533582631430153e-06, + "loss": 0.068, + "num_input_tokens_seen": 865272, + "step": 1005 + }, + { + "epoch": 2.4455205811138017, + "grad_norm": 5.13648796081543, + "learning_rate": 3.0327274011151355e-06, + "loss": 0.0395, + "num_input_tokens_seen": 869560, + "step": 1010 + }, + { + "epoch": 2.457627118644068, + "grad_norm": 0.0019914067815989256, + "learning_rate": 3.012058463134126e-06, + "loss": 0.0, + "num_input_tokens_seen": 873976, + "step": 1015 + }, + { + "epoch": 2.469733656174334, + "grad_norm": 0.0011213916586712003, + "learning_rate": 2.991352926483702e-06, + "loss": 0.0, + "num_input_tokens_seen": 878200, + "step": 1020 + }, + { + "epoch": 2.4818401937046004, + "grad_norm": 0.0030807037837803364, + "learning_rate": 2.9706122710634166e-06, + "loss": 0.0008, + "num_input_tokens_seen": 882872, + "step": 1025 + }, + { + "epoch": 2.4939467312348667, + "grad_norm": 0.0031619234941899776, + "learning_rate": 2.949837979282889e-06, + "loss": 0.0, + "num_input_tokens_seen": 887096, + "step": 1030 + }, + { + "epoch": 2.5060532687651333, + "grad_norm": 29.275707244873047, + "learning_rate": 2.9290315359558504e-06, + "loss": 0.0032, + "num_input_tokens_seen": 891576, + "step": 1035 + }, + { + "epoch": 2.5181598062953996, + "grad_norm": 0.013175534084439278, + "learning_rate": 2.908194428194019e-06, + "loss": 0.0756, + "num_input_tokens_seen": 895736, + "step": 1040 + }, + { + "epoch": 2.5181598062953996, + "eval_loss": 0.28947436809539795, + "eval_runtime": 0.636, + "eval_samples_per_second": 577.029, + "eval_steps_per_second": 72.325, + "num_input_tokens_seen": 895736, + "step": 1040 + }, + { + "epoch": 2.530266343825666, + "grad_norm": 0.002881130203604698, + "learning_rate": 2.88732814530081e-06, + "loss": 0.0001, + "num_input_tokens_seen": 900024, + "step": 1045 + }, + { + "epoch": 2.542372881355932, + "grad_norm": 122.53758239746094, + "learning_rate": 2.8664341786648932e-06, + "loss": 0.0128, + "num_input_tokens_seen": 904440, + "step": 1050 + }, + { + "epoch": 2.5544794188861983, + "grad_norm": 0.0015194405568763614, + "learning_rate": 2.845514021653595e-06, + "loss": 0.0001, + "num_input_tokens_seen": 908728, + "step": 1055 + }, + { + "epoch": 2.566585956416465, + "grad_norm": 0.13878583908081055, + "learning_rate": 2.8245691695061605e-06, + "loss": 0.0443, + "num_input_tokens_seen": 913016, + "step": 1060 + }, + { + "epoch": 2.5786924939467313, + "grad_norm": 26.11018180847168, + "learning_rate": 2.8036011192268863e-06, + "loss": 0.0032, + "num_input_tokens_seen": 917304, + "step": 1065 + }, + { + "epoch": 2.5907990314769975, + "grad_norm": 0.003500137245282531, + "learning_rate": 2.7826113694781254e-06, + "loss": 0.0001, + "num_input_tokens_seen": 921528, + "step": 1070 + }, + { + "epoch": 2.6029055690072638, + "grad_norm": 0.00933838915079832, + "learning_rate": 2.7616014204731683e-06, + "loss": 0.0, + "num_input_tokens_seen": 925944, + "step": 1075 + }, + { + "epoch": 2.61501210653753, + "grad_norm": 0.0023421638179570436, + "learning_rate": 2.7405727738690193e-06, + "loss": 0.0001, + "num_input_tokens_seen": 930744, + "step": 1080 + }, + { + "epoch": 2.6271186440677967, + "grad_norm": 0.009054594673216343, + "learning_rate": 2.7195269326590685e-06, + "loss": 0.0725, + "num_input_tokens_seen": 935352, + "step": 1085 + }, + { + "epoch": 2.639225181598063, + "grad_norm": 0.02030082233250141, + "learning_rate": 2.698465401065667e-06, + "loss": 0.0295, + "num_input_tokens_seen": 939640, + "step": 1090 + }, + { + "epoch": 2.651331719128329, + "grad_norm": 0.27530986070632935, + "learning_rate": 2.6773896844326126e-06, + "loss": 0.0001, + "num_input_tokens_seen": 943672, + "step": 1095 + }, + { + "epoch": 2.663438256658596, + "grad_norm": 0.03071773052215576, + "learning_rate": 2.656301289117561e-06, + "loss": 0.0001, + "num_input_tokens_seen": 947704, + "step": 1100 + }, + { + "epoch": 2.6755447941888617, + "grad_norm": 0.014886329881846905, + "learning_rate": 2.6352017223843584e-06, + "loss": 0.0196, + "num_input_tokens_seen": 951928, + "step": 1105 + }, + { + "epoch": 2.6876513317191284, + "grad_norm": 0.06265253573656082, + "learning_rate": 2.6140924922953125e-06, + "loss": 0.0294, + "num_input_tokens_seen": 956216, + "step": 1110 + }, + { + "epoch": 2.6997578692493946, + "grad_norm": 0.04291946068406105, + "learning_rate": 2.592975107603406e-06, + "loss": 0.0001, + "num_input_tokens_seen": 960504, + "step": 1115 + }, + { + "epoch": 2.711864406779661, + "grad_norm": 0.01645965874195099, + "learning_rate": 2.571851077644461e-06, + "loss": 0.0135, + "num_input_tokens_seen": 965048, + "step": 1120 + }, + { + "epoch": 2.7239709443099276, + "grad_norm": 0.007254968397319317, + "learning_rate": 2.55072191222926e-06, + "loss": 0.0001, + "num_input_tokens_seen": 969208, + "step": 1125 + }, + { + "epoch": 2.736077481840194, + "grad_norm": 0.114701047539711, + "learning_rate": 2.5295891215356362e-06, + "loss": 0.0991, + "num_input_tokens_seen": 973624, + "step": 1130 + }, + { + "epoch": 2.74818401937046, + "grad_norm": 0.0054718079045414925, + "learning_rate": 2.5084542160005338e-06, + "loss": 0.0064, + "num_input_tokens_seen": 977976, + "step": 1135 + }, + { + "epoch": 2.7602905569007263, + "grad_norm": 0.28143173456192017, + "learning_rate": 2.4873187062120515e-06, + "loss": 0.0001, + "num_input_tokens_seen": 982200, + "step": 1140 + }, + { + "epoch": 2.7699757869249395, + "eval_loss": 0.22601255774497986, + "eval_runtime": 0.6803, + "eval_samples_per_second": 539.434, + "eval_steps_per_second": 67.613, + "num_input_tokens_seen": 985592, + "step": 1144 + }, + { + "epoch": 2.7723970944309926, + "grad_norm": 0.02263481356203556, + "learning_rate": 2.4661841028014786e-06, + "loss": 0.0002, + "num_input_tokens_seen": 986488, + "step": 1145 + }, + { + "epoch": 2.7845036319612593, + "grad_norm": 0.00884742010384798, + "learning_rate": 2.445051916335321e-06, + "loss": 0.0002, + "num_input_tokens_seen": 990456, + "step": 1150 + }, + { + "epoch": 2.7966101694915255, + "grad_norm": 0.019946428015828133, + "learning_rate": 2.4239236572073354e-06, + "loss": 0.0766, + "num_input_tokens_seen": 994744, + "step": 1155 + }, + { + "epoch": 2.8087167070217918, + "grad_norm": 0.010249280370771885, + "learning_rate": 2.4028008355305817e-06, + "loss": 0.0501, + "num_input_tokens_seen": 999160, + "step": 1160 + }, + { + "epoch": 2.820823244552058, + "grad_norm": 56.294708251953125, + "learning_rate": 2.3816849610294784e-06, + "loss": 0.0289, + "num_input_tokens_seen": 1003256, + "step": 1165 + }, + { + "epoch": 2.8329297820823243, + "grad_norm": 0.007834532298147678, + "learning_rate": 2.3605775429319115e-06, + "loss": 0.0884, + "num_input_tokens_seen": 1007480, + "step": 1170 + }, + { + "epoch": 2.845036319612591, + "grad_norm": 0.02283744513988495, + "learning_rate": 2.3394800898613536e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1011896, + "step": 1175 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.05437995120882988, + "learning_rate": 2.318394109729041e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1015992, + "step": 1180 + }, + { + "epoch": 2.8692493946731235, + "grad_norm": 0.41270506381988525, + "learning_rate": 2.297321109626198e-06, + "loss": 0.003, + "num_input_tokens_seen": 1020408, + "step": 1185 + }, + { + "epoch": 2.8813559322033897, + "grad_norm": 0.04989304393529892, + "learning_rate": 2.27626259571632e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1025016, + "step": 1190 + }, + { + "epoch": 2.893462469733656, + "grad_norm": 0.09522224217653275, + "learning_rate": 2.2552200731275215e-06, + "loss": 0.0571, + "num_input_tokens_seen": 1029368, + "step": 1195 + }, + { + "epoch": 2.9055690072639226, + "grad_norm": 3.600926160812378, + "learning_rate": 2.2341950458449576e-06, + "loss": 0.0007, + "num_input_tokens_seen": 1033592, + "step": 1200 + }, + { + "epoch": 2.917675544794189, + "grad_norm": 0.005034204572439194, + "learning_rate": 2.2131890166033333e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1037688, + "step": 1205 + }, + { + "epoch": 2.929782082324455, + "grad_norm": 0.01115792989730835, + "learning_rate": 2.1922034867794923e-06, + "loss": 0.0136, + "num_input_tokens_seen": 1041912, + "step": 1210 + }, + { + "epoch": 2.9418886198547214, + "grad_norm": 0.01456889882683754, + "learning_rate": 2.171239956285115e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1046392, + "step": 1215 + }, + { + "epoch": 2.9539951573849876, + "grad_norm": 0.009599031880497932, + "learning_rate": 2.150299923459505e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1050616, + "step": 1220 + }, + { + "epoch": 2.9661016949152543, + "grad_norm": 0.04260379076004028, + "learning_rate": 2.1293848849625065e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1054840, + "step": 1225 + }, + { + "epoch": 2.9782082324455206, + "grad_norm": 0.006233742460608482, + "learning_rate": 2.108496335667527e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1058936, + "step": 1230 + }, + { + "epoch": 2.990314769975787, + "grad_norm": 0.005781834479421377, + "learning_rate": 2.0876357685546942e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1063288, + "step": 1235 + }, + { + "epoch": 3.002421307506053, + "grad_norm": 0.004296495113521814, + "learning_rate": 2.0668046746041497e-06, + "loss": 0.0, + "num_input_tokens_seen": 1067392, + "step": 1240 + }, + { + "epoch": 3.0145278450363198, + "grad_norm": 0.05696773901581764, + "learning_rate": 2.0460045426894816e-06, + "loss": 0.0, + "num_input_tokens_seen": 1071872, + "step": 1245 + }, + { + "epoch": 3.0217917675544794, + "eval_loss": 0.22526989877223969, + "eval_runtime": 0.6509, + "eval_samples_per_second": 563.818, + "eval_steps_per_second": 70.669, + "num_input_tokens_seen": 1074624, + "step": 1248 + }, + { + "epoch": 3.026634382566586, + "grad_norm": 0.033577512949705124, + "learning_rate": 2.0252368594713083e-06, + "loss": 0.0, + "num_input_tokens_seen": 1076416, + "step": 1250 + }, + { + "epoch": 3.0387409200968523, + "grad_norm": 8.000839233398438, + "learning_rate": 2.004503109291023e-06, + "loss": 0.0024, + "num_input_tokens_seen": 1080512, + "step": 1255 + }, + { + "epoch": 3.0508474576271185, + "grad_norm": 0.0029002963565289974, + "learning_rate": 1.9838047740647024e-06, + "loss": 0.0, + "num_input_tokens_seen": 1084608, + "step": 1260 + }, + { + "epoch": 3.062953995157385, + "grad_norm": 0.0019197538495063782, + "learning_rate": 1.9631433331771886e-06, + "loss": 0.0, + "num_input_tokens_seen": 1089024, + "step": 1265 + }, + { + "epoch": 3.0750605326876514, + "grad_norm": 0.1197233721613884, + "learning_rate": 1.942520263376351e-06, + "loss": 0.0, + "num_input_tokens_seen": 1093376, + "step": 1270 + }, + { + "epoch": 3.0871670702179177, + "grad_norm": 0.005993293132632971, + "learning_rate": 1.921937038667539e-06, + "loss": 0.0, + "num_input_tokens_seen": 1097728, + "step": 1275 + }, + { + "epoch": 3.099273607748184, + "grad_norm": 8.694519996643066, + "learning_rate": 1.901395130208229e-06, + "loss": 0.0831, + "num_input_tokens_seen": 1101888, + "step": 1280 + }, + { + "epoch": 3.11138014527845, + "grad_norm": 0.007167529780417681, + "learning_rate": 1.880896006202876e-06, + "loss": 0.0, + "num_input_tokens_seen": 1106176, + "step": 1285 + }, + { + "epoch": 3.123486682808717, + "grad_norm": 0.014599725604057312, + "learning_rate": 1.860441131797977e-06, + "loss": 0.0, + "num_input_tokens_seen": 1110272, + "step": 1290 + }, + { + "epoch": 3.135593220338983, + "grad_norm": 0.016075119376182556, + "learning_rate": 1.8400319689773474e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1114496, + "step": 1295 + }, + { + "epoch": 3.1476997578692494, + "grad_norm": 0.04632039740681648, + "learning_rate": 1.8196699764576316e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1118784, + "step": 1300 + }, + { + "epoch": 3.1598062953995156, + "grad_norm": 0.014973443932831287, + "learning_rate": 1.7993566095840442e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1123008, + "step": 1305 + }, + { + "epoch": 3.171912832929782, + "grad_norm": 0.017391176894307137, + "learning_rate": 1.7790933202263437e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1127424, + "step": 1310 + }, + { + "epoch": 3.1840193704600486, + "grad_norm": 0.0033686573151499033, + "learning_rate": 1.7588815566750728e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1131840, + "step": 1315 + }, + { + "epoch": 3.196125907990315, + "grad_norm": 0.012609965167939663, + "learning_rate": 1.7387227635380362e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1136192, + "step": 1320 + }, + { + "epoch": 3.208232445520581, + "grad_norm": 0.12501056492328644, + "learning_rate": 1.7186183816370522e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1140544, + "step": 1325 + }, + { + "epoch": 3.2203389830508473, + "grad_norm": 0.0041845571249723434, + "learning_rate": 1.6985698479049703e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1145280, + "step": 1330 + }, + { + "epoch": 3.232445520581114, + "grad_norm": 0.0181956198066473, + "learning_rate": 1.6785785952829718e-06, + "loss": 0.0039, + "num_input_tokens_seen": 1149888, + "step": 1335 + }, + { + "epoch": 3.2445520581113803, + "grad_norm": 0.005230342503637075, + "learning_rate": 1.6586460526181476e-06, + "loss": 0.0, + "num_input_tokens_seen": 1153920, + "step": 1340 + }, + { + "epoch": 3.2566585956416465, + "grad_norm": 0.0063026342540979385, + "learning_rate": 1.6387736445613772e-06, + "loss": 0.0, + "num_input_tokens_seen": 1158592, + "step": 1345 + }, + { + "epoch": 3.2687651331719128, + "grad_norm": 0.004648419097065926, + "learning_rate": 1.618962791465501e-06, + "loss": 0.0, + "num_input_tokens_seen": 1162816, + "step": 1350 + }, + { + "epoch": 3.2736077481840193, + "eval_loss": 0.25782492756843567, + "eval_runtime": 0.6532, + "eval_samples_per_second": 561.843, + "eval_steps_per_second": 70.422, + "num_input_tokens_seen": 1164544, + "step": 1352 + }, + { + "epoch": 3.280871670702179, + "grad_norm": 0.003744626184925437, + "learning_rate": 1.599214909283805e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1167232, + "step": 1355 + }, + { + "epoch": 3.2929782082324457, + "grad_norm": 0.002472149208188057, + "learning_rate": 1.579531409468815e-06, + "loss": 0.0, + "num_input_tokens_seen": 1171648, + "step": 1360 + }, + { + "epoch": 3.305084745762712, + "grad_norm": 0.00250844843685627, + "learning_rate": 1.5599136988714186e-06, + "loss": 0.0, + "num_input_tokens_seen": 1175808, + "step": 1365 + }, + { + "epoch": 3.317191283292978, + "grad_norm": 0.0014152796939015388, + "learning_rate": 1.5403631796403085e-06, + "loss": 0.0, + "num_input_tokens_seen": 1180224, + "step": 1370 + }, + { + "epoch": 3.3292978208232444, + "grad_norm": 0.00422420259565115, + "learning_rate": 1.5208812491217669e-06, + "loss": 0.0, + "num_input_tokens_seen": 1184704, + "step": 1375 + }, + { + "epoch": 3.341404358353511, + "grad_norm": 0.004841359332203865, + "learning_rate": 1.5014692997597962e-06, + "loss": 0.053, + "num_input_tokens_seen": 1188992, + "step": 1380 + }, + { + "epoch": 3.3535108958837774, + "grad_norm": 0.0019620037637650967, + "learning_rate": 1.4821287189965865e-06, + "loss": 0.0, + "num_input_tokens_seen": 1193408, + "step": 1385 + }, + { + "epoch": 3.3656174334140436, + "grad_norm": 0.0014666810166090727, + "learning_rate": 1.4628608891733626e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1197760, + "step": 1390 + }, + { + "epoch": 3.37772397094431, + "grad_norm": 0.0015612264396622777, + "learning_rate": 1.443667187431572e-06, + "loss": 0.0, + "num_input_tokens_seen": 1201792, + "step": 1395 + }, + { + "epoch": 3.389830508474576, + "grad_norm": 0.03145941346883774, + "learning_rate": 1.4245489856144633e-06, + "loss": 0.0, + "num_input_tokens_seen": 1205824, + "step": 1400 + }, + { + "epoch": 3.401937046004843, + "grad_norm": 0.0026610263157635927, + "learning_rate": 1.4055076501690313e-06, + "loss": 0.0, + "num_input_tokens_seen": 1210240, + "step": 1405 + }, + { + "epoch": 3.414043583535109, + "grad_norm": 0.002824948402121663, + "learning_rate": 1.3865445420483524e-06, + "loss": 0.0, + "num_input_tokens_seen": 1214464, + "step": 1410 + }, + { + "epoch": 3.4261501210653753, + "grad_norm": 3.925107002258301, + "learning_rate": 1.367661016614315e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1218752, + "step": 1415 + }, + { + "epoch": 3.4382566585956416, + "grad_norm": 0.0029422210063785315, + "learning_rate": 1.348858423540744e-06, + "loss": 0.0, + "num_input_tokens_seen": 1223168, + "step": 1420 + }, + { + "epoch": 3.450363196125908, + "grad_norm": 0.00631983857601881, + "learning_rate": 1.3301381067169367e-06, + "loss": 0.0, + "num_input_tokens_seen": 1227328, + "step": 1425 + }, + { + "epoch": 3.4624697336561745, + "grad_norm": 0.0015673706075176597, + "learning_rate": 1.3115014041516088e-06, + "loss": 0.0, + "num_input_tokens_seen": 1231360, + "step": 1430 + }, + { + "epoch": 3.4745762711864407, + "grad_norm": 0.001485335873439908, + "learning_rate": 1.2929496478772635e-06, + "loss": 0.0, + "num_input_tokens_seen": 1235456, + "step": 1435 + }, + { + "epoch": 3.486682808716707, + "grad_norm": 0.001463928259909153, + "learning_rate": 1.2744841638549843e-06, + "loss": 0.0, + "num_input_tokens_seen": 1239616, + "step": 1440 + }, + { + "epoch": 3.4987893462469732, + "grad_norm": 0.013075617142021656, + "learning_rate": 1.2561062718796663e-06, + "loss": 0.0, + "num_input_tokens_seen": 1243968, + "step": 1445 + }, + { + "epoch": 3.5108958837772395, + "grad_norm": 0.00566933723166585, + "learning_rate": 1.2378172854856831e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1248128, + "step": 1450 + }, + { + "epoch": 3.523002421307506, + "grad_norm": 0.004368732683360577, + "learning_rate": 1.2196185118530063e-06, + "loss": 0.0, + "num_input_tokens_seen": 1252288, + "step": 1455 + }, + { + "epoch": 3.5254237288135593, + "eval_loss": 0.2580437958240509, + "eval_runtime": 0.6456, + "eval_samples_per_second": 568.425, + "eval_steps_per_second": 71.247, + "num_input_tokens_seen": 1253248, + "step": 1456 + }, + { + "epoch": 3.5351089588377724, + "grad_norm": 0.0046086618676781654, + "learning_rate": 1.2015112517137744e-06, + "loss": 0.0, + "num_input_tokens_seen": 1256640, + "step": 1460 + }, + { + "epoch": 3.5472154963680387, + "grad_norm": 0.0021092540118843317, + "learning_rate": 1.183496799259326e-06, + "loss": 0.0, + "num_input_tokens_seen": 1261440, + "step": 1465 + }, + { + "epoch": 3.559322033898305, + "grad_norm": 0.0011951872147619724, + "learning_rate": 1.165576442047699e-06, + "loss": 0.0, + "num_input_tokens_seen": 1265664, + "step": 1470 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 0.003473962191492319, + "learning_rate": 1.147751460911604e-06, + "loss": 0.0, + "num_input_tokens_seen": 1270016, + "step": 1475 + }, + { + "epoch": 3.583535108958838, + "grad_norm": 0.0040196748450398445, + "learning_rate": 1.1300231298668786e-06, + "loss": 0.0, + "num_input_tokens_seen": 1274560, + "step": 1480 + }, + { + "epoch": 3.595641646489104, + "grad_norm": 0.0007644465658813715, + "learning_rate": 1.112392716021429e-06, + "loss": 0.0, + "num_input_tokens_seen": 1278976, + "step": 1485 + }, + { + "epoch": 3.6077481840193704, + "grad_norm": 0.00214450154453516, + "learning_rate": 1.0948614794846668e-06, + "loss": 0.0, + "num_input_tokens_seen": 1283200, + "step": 1490 + }, + { + "epoch": 3.619854721549637, + "grad_norm": 0.004525753669440746, + "learning_rate": 1.0774306732774414e-06, + "loss": 0.0, + "num_input_tokens_seen": 1287296, + "step": 1495 + }, + { + "epoch": 3.6319612590799033, + "grad_norm": 0.0008026693249121308, + "learning_rate": 1.0601015432424818e-06, + "loss": 0.0, + "num_input_tokens_seen": 1291712, + "step": 1500 + }, + { + "epoch": 3.6440677966101696, + "grad_norm": 0.0020796298049390316, + "learning_rate": 1.0428753279553561e-06, + "loss": 0.0328, + "num_input_tokens_seen": 1295936, + "step": 1505 + }, + { + "epoch": 3.656174334140436, + "grad_norm": 16.17505645751953, + "learning_rate": 1.0257532586359422e-06, + "loss": 0.0527, + "num_input_tokens_seen": 1300608, + "step": 1510 + }, + { + "epoch": 3.668280871670702, + "grad_norm": 0.005263039376586676, + "learning_rate": 1.008736559060429e-06, + "loss": 0.0, + "num_input_tokens_seen": 1305024, + "step": 1515 + }, + { + "epoch": 3.6803874092009687, + "grad_norm": 0.015520356595516205, + "learning_rate": 9.918264454738504e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1309376, + "step": 1520 + }, + { + "epoch": 3.692493946731235, + "grad_norm": 0.01658783107995987, + "learning_rate": 9.750241265031529e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1313664, + "step": 1525 + }, + { + "epoch": 3.7046004842615012, + "grad_norm": 0.0006883519235998392, + "learning_rate": 9.583308030708135e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1318080, + "step": 1530 + }, + { + "epoch": 3.7167070217917675, + "grad_norm": 0.0012563606724143028, + "learning_rate": 9.417476683090007e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1322432, + "step": 1535 + }, + { + "epoch": 3.7288135593220337, + "grad_norm": 0.0474010594189167, + "learning_rate": 9.252759074743034e-07, + "loss": 0.0003, + "num_input_tokens_seen": 1326848, + "step": 1540 + }, + { + "epoch": 3.7409200968523004, + "grad_norm": 0.012124676257371902, + "learning_rate": 9.08916697863014e-07, + "loss": 0.0, + "num_input_tokens_seen": 1331328, + "step": 1545 + }, + { + "epoch": 3.7530266343825667, + "grad_norm": 0.0076986453495919704, + "learning_rate": 8.926712087269801e-07, + "loss": 0.0, + "num_input_tokens_seen": 1335424, + "step": 1550 + }, + { + "epoch": 3.765133171912833, + "grad_norm": 0.004324712324887514, + "learning_rate": 8.765406011900368e-07, + "loss": 0.0, + "num_input_tokens_seen": 1339712, + "step": 1555 + }, + { + "epoch": 3.777239709443099, + "grad_norm": 0.007034031208604574, + "learning_rate": 8.605260281650152e-07, + "loss": 0.0, + "num_input_tokens_seen": 1344000, + "step": 1560 + }, + { + "epoch": 3.777239709443099, + "eval_loss": 0.2703007757663727, + "eval_runtime": 0.6408, + "eval_samples_per_second": 572.737, + "eval_steps_per_second": 71.787, + "num_input_tokens_seen": 1344000, + "step": 1560 + }, + { + "epoch": 3.7893462469733654, + "grad_norm": 0.00895662046968937, + "learning_rate": 8.44628634271342e-07, + "loss": 0.0, + "num_input_tokens_seen": 1348224, + "step": 1565 + }, + { + "epoch": 3.801452784503632, + "grad_norm": 0.002478554379194975, + "learning_rate": 8.288495557532241e-07, + "loss": 0.0017, + "num_input_tokens_seen": 1352576, + "step": 1570 + }, + { + "epoch": 3.8135593220338984, + "grad_norm": 11.973031997680664, + "learning_rate": 8.131899203984464e-07, + "loss": 0.0616, + "num_input_tokens_seen": 1356864, + "step": 1575 + }, + { + "epoch": 3.8256658595641646, + "grad_norm": 0.0017213321989402175, + "learning_rate": 7.976508474577549e-07, + "loss": 0.0, + "num_input_tokens_seen": 1361152, + "step": 1580 + }, + { + "epoch": 3.837772397094431, + "grad_norm": 0.0008722307975403965, + "learning_rate": 7.822334475648655e-07, + "loss": 0.0, + "num_input_tokens_seen": 1365376, + "step": 1585 + }, + { + "epoch": 3.849878934624697, + "grad_norm": 0.0017557705286890268, + "learning_rate": 7.66938822657081e-07, + "loss": 0.0, + "num_input_tokens_seen": 1369728, + "step": 1590 + }, + { + "epoch": 3.861985472154964, + "grad_norm": 0.0016538921045139432, + "learning_rate": 7.517680658965328e-07, + "loss": 0.0, + "num_input_tokens_seen": 1374144, + "step": 1595 + }, + { + "epoch": 3.87409200968523, + "grad_norm": 0.0021663156803697348, + "learning_rate": 7.367222615920477e-07, + "loss": 0.0, + "num_input_tokens_seen": 1378368, + "step": 1600 + }, + { + "epoch": 3.8861985472154963, + "grad_norm": 0.001977034378796816, + "learning_rate": 7.21802485121649e-07, + "loss": 0.0, + "num_input_tokens_seen": 1382464, + "step": 1605 + }, + { + "epoch": 3.898305084745763, + "grad_norm": 0.012750508263707161, + "learning_rate": 7.070098028556949e-07, + "loss": 0.0, + "num_input_tokens_seen": 1386880, + "step": 1610 + }, + { + "epoch": 3.910411622276029, + "grad_norm": 0.004628063179552555, + "learning_rate": 6.923452720806612e-07, + "loss": 0.0, + "num_input_tokens_seen": 1391296, + "step": 1615 + }, + { + "epoch": 3.9225181598062955, + "grad_norm": 0.0018564671045169234, + "learning_rate": 6.778099409235739e-07, + "loss": 0.0, + "num_input_tokens_seen": 1395456, + "step": 1620 + }, + { + "epoch": 3.9346246973365617, + "grad_norm": 0.0023303565103560686, + "learning_rate": 6.634048482770946e-07, + "loss": 0.0, + "num_input_tokens_seen": 1399616, + "step": 1625 + }, + { + "epoch": 3.946731234866828, + "grad_norm": 0.0011153841624036431, + "learning_rate": 6.491310237252679e-07, + "loss": 0.0, + "num_input_tokens_seen": 1403712, + "step": 1630 + }, + { + "epoch": 3.9588377723970947, + "grad_norm": 0.0012228480773046613, + "learning_rate": 6.349894874699345e-07, + "loss": 0.0, + "num_input_tokens_seen": 1408128, + "step": 1635 + }, + { + "epoch": 3.970944309927361, + "grad_norm": 0.0019976331386715174, + "learning_rate": 6.209812502578113e-07, + "loss": 0.0, + "num_input_tokens_seen": 1412480, + "step": 1640 + }, + { + "epoch": 3.983050847457627, + "grad_norm": 0.0029911224264651537, + "learning_rate": 6.071073133082492e-07, + "loss": 0.0, + "num_input_tokens_seen": 1416704, + "step": 1645 + }, + { + "epoch": 3.9951573849878934, + "grad_norm": 0.0036039084661751986, + "learning_rate": 5.933686682416759e-07, + "loss": 0.0, + "num_input_tokens_seen": 1421120, + "step": 1650 + }, + { + "epoch": 4.00726392251816, + "grad_norm": 0.00185173109639436, + "learning_rate": 5.797662970087184e-07, + "loss": 0.0, + "num_input_tokens_seen": 1424944, + "step": 1655 + }, + { + "epoch": 4.019370460048426, + "grad_norm": 0.002781663788482547, + "learning_rate": 5.663011718200201e-07, + "loss": 0.0, + "num_input_tokens_seen": 1429296, + "step": 1660 + }, + { + "epoch": 4.0290556900726395, + "eval_loss": 0.2501881718635559, + "eval_runtime": 0.6431, + "eval_samples_per_second": 570.66, + "eval_steps_per_second": 71.527, + "num_input_tokens_seen": 1432880, + "step": 1664 + }, + { + "epoch": 4.031476997578692, + "grad_norm": 0.0033902269788086414, + "learning_rate": 5.529742550767545e-07, + "loss": 0.0, + "num_input_tokens_seen": 1433776, + "step": 1665 + }, + { + "epoch": 4.043583535108959, + "grad_norm": 0.005406382493674755, + "learning_rate": 5.397864993018367e-07, + "loss": 0.0, + "num_input_tokens_seen": 1438000, + "step": 1670 + }, + { + "epoch": 4.0556900726392255, + "grad_norm": 0.0016813945258036256, + "learning_rate": 5.267388470718449e-07, + "loss": 0.0, + "num_input_tokens_seen": 1442352, + "step": 1675 + }, + { + "epoch": 4.067796610169491, + "grad_norm": 0.0014291841071099043, + "learning_rate": 5.138322309496504e-07, + "loss": 0.0, + "num_input_tokens_seen": 1446704, + "step": 1680 + }, + { + "epoch": 4.079903147699758, + "grad_norm": 0.0015732025494799018, + "learning_rate": 5.010675734177631e-07, + "loss": 0.0, + "num_input_tokens_seen": 1450864, + "step": 1685 + }, + { + "epoch": 4.092009685230024, + "grad_norm": 0.0017645714106038213, + "learning_rate": 4.884457868124001e-07, + "loss": 0.0, + "num_input_tokens_seen": 1455088, + "step": 1690 + }, + { + "epoch": 4.1041162227602905, + "grad_norm": 0.0008963189902715385, + "learning_rate": 4.759677732582782e-07, + "loss": 0.0051, + "num_input_tokens_seen": 1459376, + "step": 1695 + }, + { + "epoch": 4.116222760290557, + "grad_norm": 0.005383517127484083, + "learning_rate": 4.6363442460413215e-07, + "loss": 0.0, + "num_input_tokens_seen": 1463600, + "step": 1700 + }, + { + "epoch": 4.128329297820823, + "grad_norm": 0.001712340977974236, + "learning_rate": 4.514466223589753e-07, + "loss": 0.0, + "num_input_tokens_seen": 1468080, + "step": 1705 + }, + { + "epoch": 4.14043583535109, + "grad_norm": 0.0034400331787765026, + "learning_rate": 4.394052376290914e-07, + "loss": 0.0253, + "num_input_tokens_seen": 1472624, + "step": 1710 + }, + { + "epoch": 4.1525423728813555, + "grad_norm": 0.0054069641046226025, + "learning_rate": 4.2751113105577587e-07, + "loss": 0.0, + "num_input_tokens_seen": 1477040, + "step": 1715 + }, + { + "epoch": 4.164648910411622, + "grad_norm": 0.01111331395804882, + "learning_rate": 4.157651527538223e-07, + "loss": 0.0, + "num_input_tokens_seen": 1481328, + "step": 1720 + }, + { + "epoch": 4.176755447941889, + "grad_norm": 0.0013017026940360665, + "learning_rate": 4.041681422507604e-07, + "loss": 0.0, + "num_input_tokens_seen": 1485808, + "step": 1725 + }, + { + "epoch": 4.188861985472155, + "grad_norm": 0.002490431070327759, + "learning_rate": 3.927209284268535e-07, + "loss": 0.0, + "num_input_tokens_seen": 1490160, + "step": 1730 + }, + { + "epoch": 4.200968523002421, + "grad_norm": 0.008939598686993122, + "learning_rate": 3.8142432945585425e-07, + "loss": 0.0, + "num_input_tokens_seen": 1494512, + "step": 1735 + }, + { + "epoch": 4.213075060532688, + "grad_norm": 0.002455121139064431, + "learning_rate": 3.702791527465274e-07, + "loss": 0.0, + "num_input_tokens_seen": 1498480, + "step": 1740 + }, + { + "epoch": 4.225181598062954, + "grad_norm": 0.0009606365929357708, + "learning_rate": 3.592861948849416e-07, + "loss": 0.0, + "num_input_tokens_seen": 1502768, + "step": 1745 + }, + { + "epoch": 4.237288135593221, + "grad_norm": 0.0018587886588647962, + "learning_rate": 3.484462415775333e-07, + "loss": 0.0, + "num_input_tokens_seen": 1506992, + "step": 1750 + }, + { + "epoch": 4.249394673123486, + "grad_norm": 0.003451196476817131, + "learning_rate": 3.377600675949527e-07, + "loss": 0.0, + "num_input_tokens_seen": 1511472, + "step": 1755 + }, + { + "epoch": 4.261501210653753, + "grad_norm": 0.002900092862546444, + "learning_rate": 3.272284367166825e-07, + "loss": 0.0, + "num_input_tokens_seen": 1515824, + "step": 1760 + }, + { + "epoch": 4.27360774818402, + "grad_norm": 0.005786838009953499, + "learning_rate": 3.1685210167645336e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1520176, + "step": 1765 + }, + { + "epoch": 4.280871670702179, + "eval_loss": 0.25040701031684875, + "eval_runtime": 2.3855, + "eval_samples_per_second": 153.845, + "eval_steps_per_second": 19.283, + "num_input_tokens_seen": 1522544, + "step": 1768 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.0017267963849008083, + "learning_rate": 3.066318041084398e-07, + "loss": 0.0018, + "num_input_tokens_seen": 1524336, + "step": 1770 + }, + { + "epoch": 4.297820823244552, + "grad_norm": 0.011876060627400875, + "learning_rate": 2.9656827449425495e-07, + "loss": 0.0, + "num_input_tokens_seen": 1528560, + "step": 1775 + }, + { + "epoch": 4.309927360774818, + "grad_norm": 0.005988541524857283, + "learning_rate": 2.86662232110739e-07, + "loss": 0.026, + "num_input_tokens_seen": 1532720, + "step": 1780 + }, + { + "epoch": 4.322033898305085, + "grad_norm": 0.003772641299292445, + "learning_rate": 2.769143849785513e-07, + "loss": 0.0, + "num_input_tokens_seen": 1536944, + "step": 1785 + }, + { + "epoch": 4.3341404358353515, + "grad_norm": 0.0012762263650074601, + "learning_rate": 2.673254298115646e-07, + "loss": 0.0, + "num_input_tokens_seen": 1541168, + "step": 1790 + }, + { + "epoch": 4.346246973365617, + "grad_norm": 0.002060825005173683, + "learning_rate": 2.5789605196706675e-07, + "loss": 0.0, + "num_input_tokens_seen": 1545456, + "step": 1795 + }, + { + "epoch": 4.358353510895884, + "grad_norm": 0.002298081526532769, + "learning_rate": 2.4862692539677907e-07, + "loss": 0.0, + "num_input_tokens_seen": 1549872, + "step": 1800 + }, + { + "epoch": 4.37046004842615, + "grad_norm": 0.0016389107331633568, + "learning_rate": 2.39518712598685e-07, + "loss": 0.0, + "num_input_tokens_seen": 1554288, + "step": 1805 + }, + { + "epoch": 4.3825665859564165, + "grad_norm": 0.0016518625197932124, + "learning_rate": 2.3057206456967908e-07, + "loss": 0.0, + "num_input_tokens_seen": 1558384, + "step": 1810 + }, + { + "epoch": 4.394673123486683, + "grad_norm": 0.003522375365719199, + "learning_rate": 2.2178762075903747e-07, + "loss": 0.0, + "num_input_tokens_seen": 1562544, + "step": 1815 + }, + { + "epoch": 4.406779661016949, + "grad_norm": 0.001208233181387186, + "learning_rate": 2.131660090227139e-07, + "loss": 0.0, + "num_input_tokens_seen": 1567216, + "step": 1820 + }, + { + "epoch": 4.418886198547216, + "grad_norm": 0.0012071267701685429, + "learning_rate": 2.0470784557846652e-07, + "loss": 0.0, + "num_input_tokens_seen": 1571568, + "step": 1825 + }, + { + "epoch": 4.4309927360774815, + "grad_norm": 0.0016566209960728884, + "learning_rate": 1.9641373496181143e-07, + "loss": 0.0, + "num_input_tokens_seen": 1575792, + "step": 1830 + }, + { + "epoch": 4.443099273607748, + "grad_norm": 0.002269514137879014, + "learning_rate": 1.882842699828169e-07, + "loss": 0.0, + "num_input_tokens_seen": 1580080, + "step": 1835 + }, + { + "epoch": 4.455205811138015, + "grad_norm": 0.0023223496973514557, + "learning_rate": 1.8032003168373306e-07, + "loss": 0.0, + "num_input_tokens_seen": 1584112, + "step": 1840 + }, + { + "epoch": 4.467312348668281, + "grad_norm": 0.001878439332358539, + "learning_rate": 1.7252158929746133e-07, + "loss": 0.0, + "num_input_tokens_seen": 1588400, + "step": 1845 + }, + { + "epoch": 4.479418886198547, + "grad_norm": 0.009905189275741577, + "learning_rate": 1.6488950020686956e-07, + "loss": 0.0, + "num_input_tokens_seen": 1592816, + "step": 1850 + }, + { + "epoch": 4.491525423728813, + "grad_norm": 0.0024043757002800703, + "learning_rate": 1.5742430990495465e-07, + "loss": 0.0, + "num_input_tokens_seen": 1597296, + "step": 1855 + }, + { + "epoch": 4.50363196125908, + "grad_norm": 0.0012564613716676831, + "learning_rate": 1.501265519558537e-07, + "loss": 0.0184, + "num_input_tokens_seen": 1601648, + "step": 1860 + }, + { + "epoch": 4.5157384987893465, + "grad_norm": 0.014631015248596668, + "learning_rate": 1.4299674795670765e-07, + "loss": 0.0, + "num_input_tokens_seen": 1605936, + "step": 1865 + }, + { + "epoch": 4.527845036319612, + "grad_norm": 0.002841898240149021, + "learning_rate": 1.360354075003828e-07, + "loss": 0.0, + "num_input_tokens_seen": 1610096, + "step": 1870 + }, + { + "epoch": 4.532687651331719, + "eval_loss": 0.2488991767168045, + "eval_runtime": 0.6331, + "eval_samples_per_second": 579.694, + "eval_steps_per_second": 72.659, + "num_input_tokens_seen": 1611760, + "step": 1872 + }, + { + "epoch": 4.539951573849879, + "grad_norm": 0.008398376405239105, + "learning_rate": 1.2924302813904582e-07, + "loss": 0.0, + "num_input_tokens_seen": 1614384, + "step": 1875 + }, + { + "epoch": 4.552058111380145, + "grad_norm": 0.0019534530583769083, + "learning_rate": 1.2262009534860368e-07, + "loss": 0.0, + "num_input_tokens_seen": 1618800, + "step": 1880 + }, + { + "epoch": 4.5641646489104115, + "grad_norm": 0.0013132602907717228, + "learning_rate": 1.161670824940045e-07, + "loss": 0.0, + "num_input_tokens_seen": 1622960, + "step": 1885 + }, + { + "epoch": 4.576271186440678, + "grad_norm": 0.0015446435427293181, + "learning_rate": 1.0988445079540389e-07, + "loss": 0.0, + "num_input_tokens_seen": 1627056, + "step": 1890 + }, + { + "epoch": 4.588377723970944, + "grad_norm": 0.0025299135595560074, + "learning_rate": 1.0377264929520126e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1631408, + "step": 1895 + }, + { + "epoch": 4.600484261501211, + "grad_norm": 0.0017427592538297176, + "learning_rate": 9.783211482594285e-08, + "loss": 0.0, + "num_input_tokens_seen": 1635888, + "step": 1900 + }, + { + "epoch": 4.6125907990314765, + "grad_norm": 0.002218902576714754, + "learning_rate": 9.206327197910203e-08, + "loss": 0.0, + "num_input_tokens_seen": 1640176, + "step": 1905 + }, + { + "epoch": 4.624697336561743, + "grad_norm": 0.004275280050933361, + "learning_rate": 8.64665330747308e-08, + "loss": 0.0, + "num_input_tokens_seen": 1644528, + "step": 1910 + }, + { + "epoch": 4.63680387409201, + "grad_norm": 0.00501580024138093, + "learning_rate": 8.104229813199111e-08, + "loss": 0.0, + "num_input_tokens_seen": 1649264, + "step": 1915 + }, + { + "epoch": 4.648910411622276, + "grad_norm": 0.004407305270433426, + "learning_rate": 7.579095484056193e-08, + "loss": 0.0, + "num_input_tokens_seen": 1653808, + "step": 1920 + }, + { + "epoch": 4.661016949152542, + "grad_norm": 0.001284220372326672, + "learning_rate": 7.071287853293141e-08, + "loss": 0.0, + "num_input_tokens_seen": 1658288, + "step": 1925 + }, + { + "epoch": 4.673123486682809, + "grad_norm": 0.0011293090647086501, + "learning_rate": 6.580843215757082e-08, + "loss": 0.0, + "num_input_tokens_seen": 1662576, + "step": 1930 + }, + { + "epoch": 4.685230024213075, + "grad_norm": 0.007353964261710644, + "learning_rate": 6.107796625299117e-08, + "loss": 0.0, + "num_input_tokens_seen": 1667056, + "step": 1935 + }, + { + "epoch": 4.697336561743342, + "grad_norm": 0.0041248914785683155, + "learning_rate": 5.652181892269182e-08, + "loss": 0.0, + "num_input_tokens_seen": 1671536, + "step": 1940 + }, + { + "epoch": 4.709443099273607, + "grad_norm": 0.0036760459188371897, + "learning_rate": 5.214031581099149e-08, + "loss": 0.0, + "num_input_tokens_seen": 1675888, + "step": 1945 + }, + { + "epoch": 4.721549636803874, + "grad_norm": 0.0022840022575110197, + "learning_rate": 4.793377007975719e-08, + "loss": 0.0, + "num_input_tokens_seen": 1680176, + "step": 1950 + }, + { + "epoch": 4.733656174334141, + "grad_norm": 0.004901141859591007, + "learning_rate": 4.3902482386018186e-08, + "loss": 0.0, + "num_input_tokens_seen": 1684400, + "step": 1955 + }, + { + "epoch": 4.745762711864407, + "grad_norm": 0.004255094565451145, + "learning_rate": 4.004674086047905e-08, + "loss": 0.0357, + "num_input_tokens_seen": 1688816, + "step": 1960 + }, + { + "epoch": 4.757869249394673, + "grad_norm": 0.0026743041817098856, + "learning_rate": 3.636682108692502e-08, + "loss": 0.0, + "num_input_tokens_seen": 1693360, + "step": 1965 + }, + { + "epoch": 4.76997578692494, + "grad_norm": 0.0023194768000394106, + "learning_rate": 3.286298608252442e-08, + "loss": 0.0, + "num_input_tokens_seen": 1697584, + "step": 1970 + }, + { + "epoch": 4.782082324455206, + "grad_norm": 0.003945828415453434, + "learning_rate": 2.953548627903202e-08, + "loss": 0.0, + "num_input_tokens_seen": 1702000, + "step": 1975 + }, + { + "epoch": 4.784503631961259, + "eval_loss": 0.2507624924182892, + "eval_runtime": 0.6499, + "eval_samples_per_second": 564.699, + "eval_steps_per_second": 70.78, + "num_input_tokens_seen": 1702832, + "step": 1976 + }, + { + "epoch": 4.7941888619854724, + "grad_norm": 0.0016053339932113886, + "learning_rate": 2.6384559504886164e-08, + "loss": 0.0, + "num_input_tokens_seen": 1706416, + "step": 1980 + }, + { + "epoch": 4.806295399515738, + "grad_norm": 0.0013666612794622779, + "learning_rate": 2.3410430968214825e-08, + "loss": 0.0, + "num_input_tokens_seen": 1710960, + "step": 1985 + }, + { + "epoch": 4.818401937046005, + "grad_norm": 0.0037024938501417637, + "learning_rate": 2.0613313240735457e-08, + "loss": 0.0, + "num_input_tokens_seen": 1715440, + "step": 1990 + }, + { + "epoch": 4.830508474576272, + "grad_norm": 0.0014027768047526479, + "learning_rate": 1.7993406242563238e-08, + "loss": 0.0, + "num_input_tokens_seen": 1719728, + "step": 1995 + }, + { + "epoch": 4.842615012106537, + "grad_norm": 0.0023848332930356264, + "learning_rate": 1.5550897227922522e-08, + "loss": 0.0, + "num_input_tokens_seen": 1724272, + "step": 2000 + }, + { + "epoch": 4.854721549636804, + "grad_norm": 0.001933931838721037, + "learning_rate": 1.3285960771761696e-08, + "loss": 0.0, + "num_input_tokens_seen": 1728560, + "step": 2005 + }, + { + "epoch": 4.86682808716707, + "grad_norm": 0.0011232432443648577, + "learning_rate": 1.119875875727705e-08, + "loss": 0.0, + "num_input_tokens_seen": 1733104, + "step": 2010 + }, + { + "epoch": 4.878934624697337, + "grad_norm": 0.0012837464455515146, + "learning_rate": 9.289440364341484e-09, + "loss": 0.0, + "num_input_tokens_seen": 1737264, + "step": 2015 + }, + { + "epoch": 4.891041162227603, + "grad_norm": 0.003382457885891199, + "learning_rate": 7.558142058842755e-09, + "loss": 0.0, + "num_input_tokens_seen": 1741424, + "step": 2020 + }, + { + "epoch": 4.903147699757869, + "grad_norm": 0.002113591879606247, + "learning_rate": 6.004987582929056e-09, + "loss": 0.0, + "num_input_tokens_seen": 1745648, + "step": 2025 + }, + { + "epoch": 4.915254237288136, + "grad_norm": 0.0015381674747914076, + "learning_rate": 4.6300879461655404e-09, + "loss": 0.0, + "num_input_tokens_seen": 1749872, + "step": 2030 + }, + { + "epoch": 4.927360774818402, + "grad_norm": 0.0024200750049203634, + "learning_rate": 3.4335414175995506e-09, + "loss": 0.0, + "num_input_tokens_seen": 1754288, + "step": 2035 + }, + { + "epoch": 4.939467312348668, + "grad_norm": 0.0022603883408010006, + "learning_rate": 2.4154335187365207e-09, + "loss": 0.0, + "num_input_tokens_seen": 1758640, + "step": 2040 + }, + { + "epoch": 4.951573849878935, + "grad_norm": 0.001337168039754033, + "learning_rate": 1.575837017428472e-09, + "loss": 0.0003, + "num_input_tokens_seen": 1762928, + "step": 2045 + }, + { + "epoch": 4.963680387409201, + "grad_norm": 0.002117044758051634, + "learning_rate": 9.14811922672898e-10, + "loss": 0.0, + "num_input_tokens_seen": 1767344, + "step": 2050 + }, + { + "epoch": 4.9757869249394675, + "grad_norm": 0.000865236681420356, + "learning_rate": 4.3240548032230657e-10, + "loss": 0.0, + "num_input_tokens_seen": 1771632, + "step": 2055 + }, + { + "epoch": 4.987893462469733, + "grad_norm": 0.001379348454065621, + "learning_rate": 1.2865216970914253e-10, + "loss": 0.0, + "num_input_tokens_seen": 1775728, + "step": 2060 + }, + { + "epoch": 5.0, + "grad_norm": 0.0014177365228533745, + "learning_rate": 3.573701180537015e-12, + "loss": 0.0, + "num_input_tokens_seen": 1780000, + "step": 2065 + }, + { + "epoch": 5.0, + "num_input_tokens_seen": 1780000, + "step": 2065, + "total_flos": 1.039320047616e+16, + "train_loss": 0.06261659951716346, + "train_runtime": 1141.6604, + "train_samples_per_second": 14.457, + "train_steps_per_second": 1.809 + } + ], + "logging_steps": 5, + "max_steps": 2065, + "num_input_tokens_seen": 1780000, + "num_train_epochs": 5, + "save_steps": 104, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.039320047616e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..291c207 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38da5c91afc8307ef30418b0b9cb84db7da16d9c341336f894d4ab8cd5bf8fe0 +size 6289 diff --git a/training_eval_loss.png b/training_eval_loss.png new file mode 100644 index 0000000..c5bc3dc Binary files /dev/null and b/training_eval_loss.png differ diff --git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000..d4d39c0 Binary files /dev/null and b/training_loss.png differ