commit d79f33c2aeeb0280af659724e9a8b838679e22c8 Author: ModelHub XC Date: Fri Apr 24 07:55:06 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: rbelanec/train_mrpc_42_1774791061 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..60347e6 --- /dev/null +++ b/README.md @@ -0,0 +1,81 @@ +--- +library_name: transformers +license: llama3.2 +base_model: meta-llama/Llama-3.2-1B-Instruct +tags: +- peft-factory +- full +- llama-factory +- generated_from_trainer +model-index: +- name: train_mrpc_42_1774791061 + results: [] +--- + + + +# train_mrpc_42_1774791061 + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on the mrpc dataset. +It achieves the following results on the evaluation set: +- Loss: 0.1740 +- Num Input Tokens Seen: 1780000 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 5e-05 +- train_batch_size: 8 +- eval_batch_size: 8 +- seed: 42 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 5 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | Input Tokens Seen | +|:-------------:|:------:|:----:|:---------------:|:-----------------:| +| 0.1681 | 0.2518 | 104 | 0.1740 | 89600 | +| 0.2715 | 0.5036 | 208 | 0.2312 | 178688 | +| 0.2276 | 0.7554 | 312 | 0.2285 | 267968 | +| 0.5572 | 1.0073 | 416 | 0.2625 | 357488 | +| 0.1881 | 1.2591 | 520 | 0.1977 | 446896 | +| 0.1809 | 1.5109 | 624 | 0.1926 | 536176 | +| 0.1949 | 1.7627 | 728 | 0.1982 | 626992 | +| 0.256 | 2.0145 | 832 | 0.1935 | 716344 | +| 0.1601 | 2.2663 | 936 | 0.3867 | 806712 | +| 0.1768 | 2.5182 | 1040 | 0.1944 | 895736 | +| 0.1964 | 2.7700 | 1144 | 0.1932 | 985592 | +| 0.1436 | 3.0218 | 1248 | 0.2053 | 1074624 | +| 0.2252 | 3.2736 | 1352 | 0.2092 | 1164544 | +| 0.1328 | 3.5254 | 1456 | 0.3492 | 1253248 | +| 0.1842 | 3.7772 | 1560 | 0.2190 | 1344000 | +| 0.0897 | 4.0291 | 1664 | 0.2532 | 1432880 | +| 0.0337 | 4.2809 | 1768 | 0.4315 | 1522544 | +| 0.126 | 4.5327 | 1872 | 0.4220 | 1611760 | +| 0.0336 | 4.7845 | 1976 | 0.4348 | 1702832 | + + +### Framework versions + +- Transformers 4.51.3 +- Pytorch 2.10.0+cu128 +- Datasets 4.0.0 +- Tokenizers 0.21.4 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..de16a78 --- /dev/null +++ b/all_results.json @@ -0,0 +1,13 @@ +{ + "epoch": 5.0, + "eval_loss": 0.17402823269367218, + "eval_runtime": 0.661, + "eval_samples_per_second": 555.2, + "eval_steps_per_second": 69.589, + "num_input_tokens_seen": 1780000, + "total_flos": 1.039320047616e+16, + "train_loss": 0.16683834154997698, + "train_runtime": 1017.6301, + "train_samples_per_second": 16.219, + "train_steps_per_second": 2.029 +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..5a2b93f --- /dev/null +++ b/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/eval_results.json b/eval_results.json new file mode 100644 index 0000000..02c223a --- /dev/null +++ b/eval_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 5.0, + "eval_loss": 0.17402823269367218, + "eval_runtime": 0.661, + "eval_samples_per_second": 555.2, + "eval_steps_per_second": 69.589, + "num_input_tokens_seen": 1780000 +} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..2b8ae57 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.3" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..a57c502 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26fb6f579372bc8fc0e9c8f9cfe415d614ca68aae079c31813435015e7b0afd2 +size 4943274328 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..14daf45 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..ddc3ce0 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2069 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/train.yaml b/train.yaml new file mode 100644 index 0000000..e4afdbf --- /dev/null +++ b/train.yaml @@ -0,0 +1,55 @@ +seed: 42 + +### model +model_name_or_path: meta-llama/Llama-3.2-1B-Instruct +trust_remote_code: true +flash_attn: auto +use_cache: false + +### method +stage: sft +do_train: true +finetuning_type: full + +### dataset +dataset: mrpc +template: llama3 +cutoff_len: 2048 +overwrite_cache: true +preprocessing_num_workers: 4 +dataloader_num_workers: 4 +packing: false + +### output +output_dir: saves_bts_preliminary/base/llama-3.2-1b-instruct/train_mrpc_42_1774791061 +logging_steps: 5 +save_steps: 0.05 +overwrite_output_dir: true +save_only_model: false +plot_loss: true +include_num_input_tokens_seen: true +push_to_hub: true +push_to_hub_organization: rbelanec +load_best_model_at_end: true +save_total_limit: 1 + +### train +per_device_train_batch_size: 8 +learning_rate: 5.0e-5 +num_train_epochs: 5 +weight_decay: 1.0e-5 +lr_scheduler_type: cosine +bf16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null +warmup_ratio: 0.1 +optim: adamw_torch +report_to: +- wandb +run_name: base_llama-3.2-1b-instruct_train_mrpc_42_1774791061 + +### eval +per_device_eval_batch_size: 8 +eval_strategy: steps +eval_steps: 0.05 +val_size: 0.1 \ No newline at end of file diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..ac224f8 --- /dev/null +++ b/train_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 5.0, + "num_input_tokens_seen": 1780000, + "total_flos": 1.039320047616e+16, + "train_loss": 0.16683834154997698, + "train_runtime": 1017.6301, + "train_samples_per_second": 16.219, + "train_steps_per_second": 2.029 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000..3733a06 --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,433 @@ +{"current_steps": 5, "total_steps": 2065, "loss": 0.7681, "lr": 9.66183574879227e-07, "epoch": 0.012106537530266344, "percentage": 0.24, "elapsed_time": "0:00:01", "remaining_time": "0:11:52", "throughput": 2515.41, "total_tokens": 4352} +{"current_steps": 10, "total_steps": 2065, "loss": 0.3056, "lr": 2.173913043478261e-06, "epoch": 0.024213075060532687, "percentage": 0.48, "elapsed_time": "0:00:02", "remaining_time": "0:07:14", "throughput": 4142.22, "total_tokens": 8768} +{"current_steps": 15, "total_steps": 2065, "loss": 0.183, "lr": 3.3816425120772947e-06, "epoch": 0.03631961259079903, "percentage": 0.73, "elapsed_time": "0:00:02", "remaining_time": "0:05:40", "throughput": 5217.14, "total_tokens": 12992} +{"current_steps": 20, "total_steps": 2065, "loss": 0.4041, "lr": 4.589371980676329e-06, "epoch": 0.048426150121065374, "percentage": 0.97, "elapsed_time": "0:00:02", "remaining_time": "0:04:53", "throughput": 6052.37, "total_tokens": 17344} +{"current_steps": 25, "total_steps": 2065, "loss": 0.4147, "lr": 5.797101449275362e-06, "epoch": 0.06053268765133172, "percentage": 1.21, "elapsed_time": "0:00:03", "remaining_time": "0:04:24", "throughput": 6694.63, "total_tokens": 21696} +{"current_steps": 30, "total_steps": 2065, "loss": 0.2132, "lr": 7.004830917874397e-06, "epoch": 0.07263922518159806, "percentage": 1.45, "elapsed_time": "0:00:03", "remaining_time": "0:04:05", "throughput": 7220.73, "total_tokens": 26112} +{"current_steps": 35, "total_steps": 2065, "loss": 0.2587, "lr": 8.212560386473431e-06, "epoch": 0.0847457627118644, "percentage": 1.69, "elapsed_time": "0:00:03", "remaining_time": "0:03:51", "throughput": 7573.75, "total_tokens": 30208} +{"current_steps": 40, "total_steps": 2065, "loss": 0.2076, "lr": 9.420289855072464e-06, "epoch": 0.09685230024213075, "percentage": 1.94, "elapsed_time": "0:00:04", "remaining_time": "0:03:40", "throughput": 7947.29, "total_tokens": 34688} +{"current_steps": 45, "total_steps": 2065, "loss": 0.1842, "lr": 1.0628019323671499e-05, "epoch": 0.1089588377723971, "percentage": 2.18, "elapsed_time": "0:00:04", "remaining_time": "0:03:32", "throughput": 8196.68, "total_tokens": 38784} +{"current_steps": 50, "total_steps": 2065, "loss": 0.3012, "lr": 1.1835748792270531e-05, "epoch": 0.12106537530266344, "percentage": 2.42, "elapsed_time": "0:00:05", "remaining_time": "0:03:25", "throughput": 8453.07, "total_tokens": 43200} +{"current_steps": 55, "total_steps": 2065, "loss": 0.1951, "lr": 1.3043478260869566e-05, "epoch": 0.13317191283292978, "percentage": 2.66, "elapsed_time": "0:00:05", "remaining_time": "0:03:20", "throughput": 8628.0, "total_tokens": 47296} +{"current_steps": 60, "total_steps": 2065, "loss": 0.2332, "lr": 1.4251207729468599e-05, "epoch": 0.14527845036319612, "percentage": 2.91, "elapsed_time": "0:00:05", "remaining_time": "0:03:15", "throughput": 8833.5, "total_tokens": 51712} +{"current_steps": 65, "total_steps": 2065, "loss": 0.2049, "lr": 1.5458937198067633e-05, "epoch": 0.15738498789346247, "percentage": 3.15, "elapsed_time": "0:00:06", "remaining_time": "0:03:11", "throughput": 8977.04, "total_tokens": 55872} +{"current_steps": 70, "total_steps": 2065, "loss": 0.2103, "lr": 1.6666666666666667e-05, "epoch": 0.1694915254237288, "percentage": 3.39, "elapsed_time": "0:00:06", "remaining_time": "0:03:07", "throughput": 9080.05, "total_tokens": 59840} +{"current_steps": 75, "total_steps": 2065, "loss": 0.3072, "lr": 1.78743961352657e-05, "epoch": 0.18159806295399517, "percentage": 3.63, "elapsed_time": "0:00:06", "remaining_time": "0:03:04", "throughput": 9195.06, "total_tokens": 64000} +{"current_steps": 80, "total_steps": 2065, "loss": 0.3841, "lr": 1.9082125603864733e-05, "epoch": 0.1937046004842615, "percentage": 3.87, "elapsed_time": "0:00:07", "remaining_time": "0:03:01", "throughput": 9322.35, "total_tokens": 68352} +{"current_steps": 85, "total_steps": 2065, "loss": 0.232, "lr": 2.028985507246377e-05, "epoch": 0.20581113801452786, "percentage": 4.12, "elapsed_time": "0:00:07", "remaining_time": "0:02:59", "throughput": 9442.75, "total_tokens": 72768} +{"current_steps": 90, "total_steps": 2065, "loss": 0.2474, "lr": 2.1497584541062805e-05, "epoch": 0.2179176755447942, "percentage": 4.36, "elapsed_time": "0:00:08", "remaining_time": "0:02:57", "throughput": 9543.92, "total_tokens": 77120} +{"current_steps": 95, "total_steps": 2065, "loss": 0.1841, "lr": 2.2705314009661836e-05, "epoch": 0.23002421307506055, "percentage": 4.6, "elapsed_time": "0:00:08", "remaining_time": "0:02:55", "throughput": 9636.51, "total_tokens": 81664} +{"current_steps": 100, "total_steps": 2065, "loss": 0.1681, "lr": 2.391304347826087e-05, "epoch": 0.24213075060532688, "percentage": 4.84, "elapsed_time": "0:00:08", "remaining_time": "0:02:53", "throughput": 9727.12, "total_tokens": 86080} +{"current_steps": 104, "total_steps": 2065, "eval_loss": 0.17402823269367218, "epoch": 0.25181598062953997, "percentage": 5.04, "elapsed_time": "0:00:09", "remaining_time": "0:03:04", "throughput": 9181.88, "total_tokens": 89600} +{"current_steps": 105, "total_steps": 2065, "loss": 0.1488, "lr": 2.5120772946859905e-05, "epoch": 0.2542372881355932, "percentage": 5.08, "elapsed_time": "0:01:08", "remaining_time": "0:21:12", "throughput": 1326.98, "total_tokens": 90432} +{"current_steps": 110, "total_steps": 2065, "loss": 0.2051, "lr": 2.632850241545894e-05, "epoch": 0.26634382566585957, "percentage": 5.33, "elapsed_time": "0:01:08", "remaining_time": "0:20:17", "throughput": 1379.58, "total_tokens": 94528} +{"current_steps": 115, "total_steps": 2065, "loss": 0.16, "lr": 2.753623188405797e-05, "epoch": 0.2784503631961259, "percentage": 5.57, "elapsed_time": "0:01:08", "remaining_time": "0:19:28", "throughput": 1434.33, "total_tokens": 98816} +{"current_steps": 120, "total_steps": 2065, "loss": 0.205, "lr": 2.8743961352657005e-05, "epoch": 0.29055690072639223, "percentage": 5.81, "elapsed_time": "0:01:09", "remaining_time": "0:18:42", "throughput": 1488.59, "total_tokens": 103104} +{"current_steps": 125, "total_steps": 2065, "loss": 0.1846, "lr": 2.995169082125604e-05, "epoch": 0.3026634382566586, "percentage": 6.05, "elapsed_time": "0:01:09", "remaining_time": "0:18:00", "throughput": 1541.33, "total_tokens": 107328} +{"current_steps": 130, "total_steps": 2065, "loss": 0.2243, "lr": 3.1159420289855074e-05, "epoch": 0.31476997578692495, "percentage": 6.3, "elapsed_time": "0:01:10", "remaining_time": "0:17:21", "throughput": 1592.61, "total_tokens": 111488} +{"current_steps": 135, "total_steps": 2065, "loss": 0.2013, "lr": 3.236714975845411e-05, "epoch": 0.3268765133171913, "percentage": 6.54, "elapsed_time": "0:01:10", "remaining_time": "0:16:46", "throughput": 1647.78, "total_tokens": 115968} +{"current_steps": 140, "total_steps": 2065, "loss": 0.2278, "lr": 3.357487922705314e-05, "epoch": 0.3389830508474576, "percentage": 6.78, "elapsed_time": "0:01:10", "remaining_time": "0:16:12", "throughput": 1698.84, "total_tokens": 120192} +{"current_steps": 145, "total_steps": 2065, "loss": 0.1886, "lr": 3.478260869565218e-05, "epoch": 0.35108958837772397, "percentage": 7.02, "elapsed_time": "0:01:11", "remaining_time": "0:15:41", "throughput": 1749.39, "total_tokens": 124416} +{"current_steps": 150, "total_steps": 2065, "loss": 0.1635, "lr": 3.5990338164251205e-05, "epoch": 0.36319612590799033, "percentage": 7.26, "elapsed_time": "0:01:11", "remaining_time": "0:15:12", "throughput": 1801.99, "total_tokens": 128832} +{"current_steps": 155, "total_steps": 2065, "loss": 0.2118, "lr": 3.719806763285024e-05, "epoch": 0.37530266343825663, "percentage": 7.51, "elapsed_time": "0:01:11", "remaining_time": "0:14:45", "throughput": 1850.7, "total_tokens": 132992} +{"current_steps": 160, "total_steps": 2065, "loss": 0.3186, "lr": 3.8405797101449274e-05, "epoch": 0.387409200968523, "percentage": 7.75, "elapsed_time": "0:01:12", "remaining_time": "0:14:20", "throughput": 1900.5, "total_tokens": 137280} +{"current_steps": 165, "total_steps": 2065, "loss": 0.2002, "lr": 3.961352657004831e-05, "epoch": 0.39951573849878935, "percentage": 7.99, "elapsed_time": "0:01:12", "remaining_time": "0:13:56", "throughput": 1949.84, "total_tokens": 141568} +{"current_steps": 170, "total_steps": 2065, "loss": 0.1792, "lr": 4.082125603864734e-05, "epoch": 0.4116222760290557, "percentage": 8.23, "elapsed_time": "0:01:12", "remaining_time": "0:13:33", "throughput": 2000.41, "total_tokens": 145984} +{"current_steps": 175, "total_steps": 2065, "loss": 0.3197, "lr": 4.202898550724638e-05, "epoch": 0.423728813559322, "percentage": 8.47, "elapsed_time": "0:01:13", "remaining_time": "0:13:12", "throughput": 2047.08, "total_tokens": 150144} +{"current_steps": 180, "total_steps": 2065, "loss": 0.3561, "lr": 4.323671497584541e-05, "epoch": 0.4358353510895884, "percentage": 8.72, "elapsed_time": "0:01:13", "remaining_time": "0:12:52", "throughput": 2097.46, "total_tokens": 154624} +{"current_steps": 185, "total_steps": 2065, "loss": 0.373, "lr": 4.4444444444444447e-05, "epoch": 0.44794188861985473, "percentage": 8.96, "elapsed_time": "0:01:14", "remaining_time": "0:12:32", "throughput": 2143.12, "total_tokens": 158784} +{"current_steps": 190, "total_steps": 2065, "loss": 0.3924, "lr": 4.565217391304348e-05, "epoch": 0.4600484261501211, "percentage": 9.2, "elapsed_time": "0:01:14", "remaining_time": "0:12:14", "throughput": 2189.86, "total_tokens": 163072} +{"current_steps": 195, "total_steps": 2065, "loss": 0.2368, "lr": 4.6859903381642516e-05, "epoch": 0.4721549636803874, "percentage": 9.44, "elapsed_time": "0:01:14", "remaining_time": "0:11:57", "throughput": 2233.02, "total_tokens": 167104} +{"current_steps": 200, "total_steps": 2065, "loss": 0.4497, "lr": 4.806763285024155e-05, "epoch": 0.48426150121065376, "percentage": 9.69, "elapsed_time": "0:01:15", "remaining_time": "0:11:41", "throughput": 2279.73, "total_tokens": 171456} +{"current_steps": 205, "total_steps": 2065, "loss": 0.2715, "lr": 4.9275362318840584e-05, "epoch": 0.4963680387409201, "percentage": 9.93, "elapsed_time": "0:01:15", "remaining_time": "0:11:25", "throughput": 2326.06, "total_tokens": 175808} +{"current_steps": 208, "total_steps": 2065, "eval_loss": 0.23122040927410126, "epoch": 0.5036319612590799, "percentage": 10.07, "elapsed_time": "0:01:16", "remaining_time": "0:11:22", "throughput": 2338.42, "total_tokens": 178688} +{"current_steps": 210, "total_steps": 2065, "loss": 0.1981, "lr": 4.9999857052054956e-05, "epoch": 0.5084745762711864, "percentage": 10.17, "elapsed_time": "0:01:43", "remaining_time": "0:15:17", "throughput": 1735.73, "total_tokens": 180224} +{"current_steps": 215, "total_steps": 2065, "loss": 0.1989, "lr": 4.999824890644693e-05, "epoch": 0.5205811138014528, "percentage": 10.41, "elapsed_time": "0:01:44", "remaining_time": "0:14:56", "throughput": 1772.44, "total_tokens": 184704} +{"current_steps": 220, "total_steps": 2065, "loss": 0.2336, "lr": 4.9994854045622684e-05, "epoch": 0.5326876513317191, "percentage": 10.65, "elapsed_time": "0:01:44", "remaining_time": "0:14:37", "throughput": 1808.9, "total_tokens": 189184} +{"current_steps": 225, "total_steps": 2065, "loss": 0.1595, "lr": 4.9989672712225204e-05, "epoch": 0.5447941888619855, "percentage": 10.9, "elapsed_time": "0:01:44", "remaining_time": "0:14:18", "throughput": 1843.9, "total_tokens": 193536} +{"current_steps": 230, "total_steps": 2065, "loss": 0.2147, "lr": 4.998270527658311e-05, "epoch": 0.5569007263922519, "percentage": 11.14, "elapsed_time": "0:01:45", "remaining_time": "0:14:00", "throughput": 1878.72, "total_tokens": 197888} +{"current_steps": 235, "total_steps": 2065, "loss": 0.1959, "lr": 4.9973952236684216e-05, "epoch": 0.5690072639225182, "percentage": 11.38, "elapsed_time": "0:01:45", "remaining_time": "0:13:43", "throughput": 1912.06, "total_tokens": 202112} +{"current_steps": 240, "total_steps": 2065, "loss": 0.2085, "lr": 4.996341421813993e-05, "epoch": 0.5811138014527845, "percentage": 11.62, "elapsed_time": "0:01:46", "remaining_time": "0:13:26", "throughput": 1946.98, "total_tokens": 206528} +{"current_steps": 245, "total_steps": 2065, "loss": 0.2304, "lr": 4.9951091974140506e-05, "epoch": 0.5932203389830508, "percentage": 11.86, "elapsed_time": "0:01:46", "remaining_time": "0:13:12", "throughput": 1977.19, "total_tokens": 210944} +{"current_steps": 250, "total_steps": 2065, "loss": 0.2171, "lr": 4.99369863854013e-05, "epoch": 0.6053268765133172, "percentage": 12.11, "elapsed_time": "0:01:47", "remaining_time": "0:12:57", "throughput": 2009.17, "total_tokens": 215104} +{"current_steps": 255, "total_steps": 2065, "loss": 0.2458, "lr": 4.992109846009972e-05, "epoch": 0.6174334140435835, "percentage": 12.35, "elapsed_time": "0:01:47", "remaining_time": "0:12:42", "throughput": 2041.56, "total_tokens": 219328} +{"current_steps": 260, "total_steps": 2065, "loss": 0.219, "lr": 4.990342933380321e-05, "epoch": 0.6295399515738499, "percentage": 12.59, "elapsed_time": "0:01:47", "remaining_time": "0:12:28", "throughput": 2074.86, "total_tokens": 223680} +{"current_steps": 265, "total_steps": 2065, "loss": 0.3803, "lr": 4.9883980269388106e-05, "epoch": 0.6416464891041163, "percentage": 12.83, "elapsed_time": "0:01:48", "remaining_time": "0:12:14", "throughput": 2106.8, "total_tokens": 227904} +{"current_steps": 270, "total_steps": 2065, "loss": 0.3005, "lr": 4.986275265694935e-05, "epoch": 0.6537530266343826, "percentage": 13.08, "elapsed_time": "0:01:48", "remaining_time": "0:12:01", "throughput": 2136.82, "total_tokens": 231936} +{"current_steps": 275, "total_steps": 2065, "loss": 0.2954, "lr": 4.9839748013701145e-05, "epoch": 0.6658595641646489, "percentage": 13.32, "elapsed_time": "0:01:48", "remaining_time": "0:11:48", "throughput": 2168.34, "total_tokens": 236160} +{"current_steps": 280, "total_steps": 2065, "loss": 0.2924, "lr": 4.981496798386849e-05, "epoch": 0.6779661016949152, "percentage": 13.56, "elapsed_time": "0:01:49", "remaining_time": "0:11:36", "throughput": 2199.06, "total_tokens": 240320} +{"current_steps": 285, "total_steps": 2065, "loss": 0.1771, "lr": 4.978841433856971e-05, "epoch": 0.6900726392251816, "percentage": 13.8, "elapsed_time": "0:01:49", "remaining_time": "0:11:24", "throughput": 2232.26, "total_tokens": 244800} +{"current_steps": 290, "total_steps": 2065, "loss": 0.194, "lr": 4.976008897568981e-05, "epoch": 0.7021791767554479, "percentage": 14.04, "elapsed_time": "0:01:50", "remaining_time": "0:11:13", "throughput": 2264.28, "total_tokens": 249152} +{"current_steps": 295, "total_steps": 2065, "loss": 0.2064, "lr": 4.972999391974488e-05, "epoch": 0.7142857142857143, "percentage": 14.29, "elapsed_time": "0:01:50", "remaining_time": "0:11:02", "throughput": 2294.98, "total_tokens": 253376} +{"current_steps": 300, "total_steps": 2065, "loss": 0.2096, "lr": 4.969813132173735e-05, "epoch": 0.7263922518159807, "percentage": 14.53, "elapsed_time": "0:01:50", "remaining_time": "0:10:51", "throughput": 2326.0, "total_tokens": 257664} +{"current_steps": 305, "total_steps": 2065, "loss": 0.1712, "lr": 4.966450345900229e-05, "epoch": 0.738498789346247, "percentage": 14.77, "elapsed_time": "0:01:51", "remaining_time": "0:10:41", "throughput": 2357.33, "total_tokens": 262016} +{"current_steps": 310, "total_steps": 2065, "loss": 0.2276, "lr": 4.962911273504461e-05, "epoch": 0.7506053268765133, "percentage": 15.01, "elapsed_time": "0:01:51", "remaining_time": "0:10:31", "throughput": 2389.12, "total_tokens": 266432} +{"current_steps": 312, "total_steps": 2065, "eval_loss": 0.22853781282901764, "epoch": 0.7554479418886199, "percentage": 15.11, "elapsed_time": "0:01:53", "remaining_time": "0:10:40", "throughput": 2351.03, "total_tokens": 267968} +{"current_steps": 315, "total_steps": 2065, "loss": 0.2349, "lr": 4.9591961679367284e-05, "epoch": 0.7627118644067796, "percentage": 15.25, "elapsed_time": "0:02:22", "remaining_time": "0:13:12", "throughput": 1895.52, "total_tokens": 270464} +{"current_steps": 320, "total_steps": 2065, "loss": 0.2824, "lr": 4.955305294729056e-05, "epoch": 0.774818401937046, "percentage": 15.5, "elapsed_time": "0:02:23", "remaining_time": "0:13:00", "throughput": 1920.08, "total_tokens": 274688} +{"current_steps": 325, "total_steps": 2065, "loss": 0.3105, "lr": 4.951238931976216e-05, "epoch": 0.7869249394673123, "percentage": 15.74, "elapsed_time": "0:02:23", "remaining_time": "0:12:47", "throughput": 1944.08, "total_tokens": 278848} +{"current_steps": 330, "total_steps": 2065, "loss": 0.2667, "lr": 4.9469973703158565e-05, "epoch": 0.7990314769975787, "percentage": 15.98, "elapsed_time": "0:02:23", "remaining_time": "0:12:36", "throughput": 1968.81, "total_tokens": 283136} +{"current_steps": 335, "total_steps": 2065, "loss": 0.2213, "lr": 4.9425809129077204e-05, "epoch": 0.8111380145278451, "percentage": 16.22, "elapsed_time": "0:02:24", "remaining_time": "0:12:24", "throughput": 1995.13, "total_tokens": 287680} +{"current_steps": 340, "total_steps": 2065, "loss": 0.1887, "lr": 4.937989875411985e-05, "epoch": 0.8232445520581114, "percentage": 16.46, "elapsed_time": "0:02:24", "remaining_time": "0:12:13", "throughput": 2021.36, "total_tokens": 292224} +{"current_steps": 345, "total_steps": 2065, "loss": 0.2499, "lr": 4.933224585966696e-05, "epoch": 0.8353510895883777, "percentage": 16.71, "elapsed_time": "0:02:24", "remaining_time": "0:12:02", "throughput": 2045.3, "total_tokens": 296448} +{"current_steps": 350, "total_steps": 2065, "loss": 0.2431, "lr": 4.928285385164315e-05, "epoch": 0.847457627118644, "percentage": 16.95, "elapsed_time": "0:02:25", "remaining_time": "0:11:52", "throughput": 2069.54, "total_tokens": 300736} +{"current_steps": 355, "total_steps": 2065, "loss": 0.2588, "lr": 4.923172626027379e-05, "epoch": 0.8595641646489104, "percentage": 17.19, "elapsed_time": "0:02:25", "remaining_time": "0:11:41", "throughput": 2093.18, "total_tokens": 304960} +{"current_steps": 360, "total_steps": 2065, "loss": 0.2322, "lr": 4.917886673983267e-05, "epoch": 0.8716707021791767, "percentage": 17.43, "elapsed_time": "0:02:26", "remaining_time": "0:11:31", "throughput": 2116.72, "total_tokens": 309184} +{"current_steps": 365, "total_steps": 2065, "loss": 0.2314, "lr": 4.912427906838078e-05, "epoch": 0.8837772397094431, "percentage": 17.68, "elapsed_time": "0:02:26", "remaining_time": "0:11:22", "throughput": 2140.14, "total_tokens": 313408} +{"current_steps": 370, "total_steps": 2065, "loss": 0.1782, "lr": 4.906796714749635e-05, "epoch": 0.8958837772397095, "percentage": 17.92, "elapsed_time": "0:02:26", "remaining_time": "0:11:12", "throughput": 2165.16, "total_tokens": 317888} +{"current_steps": 375, "total_steps": 2065, "loss": 0.1873, "lr": 4.900993500199591e-05, "epoch": 0.9079903147699758, "percentage": 18.16, "elapsed_time": "0:02:27", "remaining_time": "0:11:03", "throughput": 2187.92, "total_tokens": 322048} +{"current_steps": 380, "total_steps": 2065, "loss": 0.1985, "lr": 4.895018677964669e-05, "epoch": 0.9200968523002422, "percentage": 18.4, "elapsed_time": "0:02:27", "remaining_time": "0:10:54", "throughput": 2212.99, "total_tokens": 326592} +{"current_steps": 385, "total_steps": 2065, "loss": 0.3036, "lr": 4.8888726750870126e-05, "epoch": 0.9322033898305084, "percentage": 18.64, "elapsed_time": "0:02:27", "remaining_time": "0:10:45", "throughput": 2236.34, "total_tokens": 330880} +{"current_steps": 390, "total_steps": 2065, "loss": 0.2224, "lr": 4.882555930843664e-05, "epoch": 0.9443099273607748, "percentage": 18.89, "elapsed_time": "0:02:28", "remaining_time": "0:10:37", "throughput": 2259.21, "total_tokens": 335104} +{"current_steps": 395, "total_steps": 2065, "loss": 0.1898, "lr": 4.87606889671517e-05, "epoch": 0.9564164648910412, "percentage": 19.13, "elapsed_time": "0:02:28", "remaining_time": "0:10:28", "throughput": 2282.36, "total_tokens": 339392} +{"current_steps": 400, "total_steps": 2065, "loss": 0.1663, "lr": 4.8694120363533104e-05, "epoch": 0.9685230024213075, "percentage": 19.37, "elapsed_time": "0:02:29", "remaining_time": "0:10:20", "throughput": 2305.78, "total_tokens": 343744} +{"current_steps": 405, "total_steps": 2065, "loss": 0.1954, "lr": 4.8625858255479574e-05, "epoch": 0.9806295399515739, "percentage": 19.61, "elapsed_time": "0:02:29", "remaining_time": "0:10:12", "throughput": 2329.48, "total_tokens": 348160} +{"current_steps": 410, "total_steps": 2065, "loss": 0.2606, "lr": 4.855590752193076e-05, "epoch": 0.9927360774818402, "percentage": 19.85, "elapsed_time": "0:02:29", "remaining_time": "0:10:04", "throughput": 2352.34, "total_tokens": 352448} +{"current_steps": 415, "total_steps": 2065, "loss": 0.5572, "lr": 4.848427316251842e-05, "epoch": 1.0048426150121066, "percentage": 20.1, "elapsed_time": "0:02:30", "remaining_time": "0:09:59", "throughput": 2365.4, "total_tokens": 356656} +{"current_steps": 416, "total_steps": 2065, "eval_loss": 0.2624819278717041, "epoch": 1.0072639225181599, "percentage": 20.15, "elapsed_time": "0:02:31", "remaining_time": "0:10:01", "throughput": 2356.64, "total_tokens": 357488} +{"current_steps": 420, "total_steps": 2065, "loss": 0.2346, "lr": 4.841096029720921e-05, "epoch": 1.0169491525423728, "percentage": 20.34, "elapsed_time": "0:03:34", "remaining_time": "0:14:00", "throughput": 1682.04, "total_tokens": 360880} +{"current_steps": 425, "total_steps": 2065, "loss": 0.1819, "lr": 4.8335974165938615e-05, "epoch": 1.0290556900726393, "percentage": 20.58, "elapsed_time": "0:03:34", "remaining_time": "0:13:49", "throughput": 1698.77, "total_tokens": 365104} +{"current_steps": 430, "total_steps": 2065, "loss": 0.1495, "lr": 4.825932012823652e-05, "epoch": 1.0411622276029056, "percentage": 20.82, "elapsed_time": "0:03:35", "remaining_time": "0:13:38", "throughput": 1717.5, "total_tokens": 369776} +{"current_steps": 435, "total_steps": 2065, "loss": 0.2583, "lr": 4.8181003662844074e-05, "epoch": 1.053268765133172, "percentage": 21.07, "elapsed_time": "0:03:35", "remaining_time": "0:13:28", "throughput": 1734.11, "total_tokens": 374000} +{"current_steps": 440, "total_steps": 2065, "loss": 0.2093, "lr": 4.8101030367322195e-05, "epoch": 1.0653753026634383, "percentage": 21.31, "elapsed_time": "0:03:36", "remaining_time": "0:13:17", "throughput": 1750.11, "total_tokens": 378096} +{"current_steps": 445, "total_steps": 2065, "loss": 0.1806, "lr": 4.8019405957651395e-05, "epoch": 1.0774818401937045, "percentage": 21.55, "elapsed_time": "0:03:36", "remaining_time": "0:13:07", "throughput": 1766.35, "total_tokens": 382256} +{"current_steps": 450, "total_steps": 2065, "loss": 0.3307, "lr": 4.793613626782331e-05, "epoch": 1.089588377723971, "percentage": 21.79, "elapsed_time": "0:03:36", "remaining_time": "0:12:58", "throughput": 1783.69, "total_tokens": 386672} +{"current_steps": 455, "total_steps": 2065, "loss": 0.2208, "lr": 4.785122724942367e-05, "epoch": 1.1016949152542372, "percentage": 22.03, "elapsed_time": "0:03:37", "remaining_time": "0:12:48", "throughput": 1800.37, "total_tokens": 390960} +{"current_steps": 460, "total_steps": 2065, "loss": 0.2978, "lr": 4.776468497120698e-05, "epoch": 1.1138014527845037, "percentage": 22.28, "elapsed_time": "0:03:37", "remaining_time": "0:12:38", "throughput": 1817.85, "total_tokens": 395440} +{"current_steps": 465, "total_steps": 2065, "loss": 0.2315, "lr": 4.7676515618662684e-05, "epoch": 1.12590799031477, "percentage": 22.52, "elapsed_time": "0:03:37", "remaining_time": "0:12:29", "throughput": 1833.82, "total_tokens": 399600} +{"current_steps": 470, "total_steps": 2065, "loss": 0.2236, "lr": 4.758672549357316e-05, "epoch": 1.1380145278450362, "percentage": 22.76, "elapsed_time": "0:03:38", "remaining_time": "0:12:20", "throughput": 1850.34, "total_tokens": 403888} +{"current_steps": 475, "total_steps": 2065, "loss": 0.1689, "lr": 4.749532101356322e-05, "epoch": 1.1501210653753027, "percentage": 23.0, "elapsed_time": "0:03:38", "remaining_time": "0:12:11", "throughput": 1866.81, "total_tokens": 408176} +{"current_steps": 480, "total_steps": 2065, "loss": 0.2012, "lr": 4.740230871164147e-05, "epoch": 1.162227602905569, "percentage": 23.24, "elapsed_time": "0:03:39", "remaining_time": "0:12:03", "throughput": 1882.08, "total_tokens": 412208} +{"current_steps": 485, "total_steps": 2065, "loss": 0.1816, "lr": 4.730769523573337e-05, "epoch": 1.1743341404358354, "percentage": 23.49, "elapsed_time": "0:03:39", "remaining_time": "0:11:54", "throughput": 1898.99, "total_tokens": 416624} +{"current_steps": 490, "total_steps": 2065, "loss": 0.2491, "lr": 4.7211487348206054e-05, "epoch": 1.1864406779661016, "percentage": 23.73, "elapsed_time": "0:03:39", "remaining_time": "0:11:46", "throughput": 1915.85, "total_tokens": 421040} +{"current_steps": 495, "total_steps": 2065, "loss": 0.203, "lr": 4.711369192538503e-05, "epoch": 1.1985472154963681, "percentage": 23.97, "elapsed_time": "0:03:40", "remaining_time": "0:11:38", "throughput": 1931.24, "total_tokens": 425136} +{"current_steps": 500, "total_steps": 2065, "loss": 0.4102, "lr": 4.7014315957062685e-05, "epoch": 1.2106537530266344, "percentage": 24.21, "elapsed_time": "0:03:40", "remaining_time": "0:11:30", "throughput": 1948.52, "total_tokens": 429680} +{"current_steps": 505, "total_steps": 2065, "loss": 0.2409, "lr": 4.691336654599873e-05, "epoch": 1.2227602905569008, "percentage": 24.46, "elapsed_time": "0:03:40", "remaining_time": "0:11:22", "throughput": 1965.78, "total_tokens": 434224} +{"current_steps": 510, "total_steps": 2065, "loss": 0.2191, "lr": 4.6810850907412484e-05, "epoch": 1.234866828087167, "percentage": 24.7, "elapsed_time": "0:03:41", "remaining_time": "0:11:14", "throughput": 1980.99, "total_tokens": 438320} +{"current_steps": 515, "total_steps": 2065, "loss": 0.1975, "lr": 4.670677636846723e-05, "epoch": 1.2469733656174333, "percentage": 24.94, "elapsed_time": "0:03:41", "remaining_time": "0:11:07", "throughput": 1997.3, "total_tokens": 442672} +{"current_steps": 520, "total_steps": 2065, "loss": 0.1881, "lr": 4.660115036774648e-05, "epoch": 1.2590799031476998, "percentage": 25.18, "elapsed_time": "0:03:42", "remaining_time": "0:10:59", "throughput": 2012.99, "total_tokens": 446896} +{"current_steps": 520, "total_steps": 2065, "eval_loss": 0.1976936012506485, "epoch": 1.2590799031476998, "percentage": 25.18, "elapsed_time": "0:03:42", "remaining_time": "0:11:01", "throughput": 2006.94, "total_tokens": 446896} +{"current_steps": 525, "total_steps": 2065, "loss": 0.2485, "lr": 4.6493980454722344e-05, "epoch": 1.271186440677966, "percentage": 25.42, "elapsed_time": "0:04:26", "remaining_time": "0:13:01", "throughput": 1694.99, "total_tokens": 451312} +{"current_steps": 530, "total_steps": 2065, "loss": 0.2053, "lr": 4.638527428921592e-05, "epoch": 1.2832929782082325, "percentage": 25.67, "elapsed_time": "0:04:26", "remaining_time": "0:12:52", "throughput": 1708.0, "total_tokens": 455408} +{"current_steps": 535, "total_steps": 2065, "loss": 0.1867, "lr": 4.627503964084981e-05, "epoch": 1.2953995157384988, "percentage": 25.91, "elapsed_time": "0:04:27", "remaining_time": "0:12:43", "throughput": 1723.07, "total_tokens": 460080} +{"current_steps": 540, "total_steps": 2065, "loss": 0.1674, "lr": 4.6163284388492835e-05, "epoch": 1.307506053268765, "percentage": 26.15, "elapsed_time": "0:04:27", "remaining_time": "0:12:35", "throughput": 1737.15, "total_tokens": 464496} +{"current_steps": 545, "total_steps": 2065, "loss": 0.2045, "lr": 4.605001651969686e-05, "epoch": 1.3196125907990315, "percentage": 26.39, "elapsed_time": "0:04:27", "remaining_time": "0:12:26", "throughput": 1750.5, "total_tokens": 468720} +{"current_steps": 550, "total_steps": 2065, "loss": 0.191, "lr": 4.593524413012592e-05, "epoch": 1.331719128329298, "percentage": 26.63, "elapsed_time": "0:04:28", "remaining_time": "0:12:18", "throughput": 1764.97, "total_tokens": 473264} +{"current_steps": 555, "total_steps": 2065, "loss": 0.1828, "lr": 4.5818975422977606e-05, "epoch": 1.3438256658595642, "percentage": 26.88, "elapsed_time": "0:04:28", "remaining_time": "0:12:10", "throughput": 1778.47, "total_tokens": 477552} +{"current_steps": 560, "total_steps": 2065, "loss": 0.1546, "lr": 4.570121870839671e-05, "epoch": 1.3559322033898304, "percentage": 27.12, "elapsed_time": "0:04:28", "remaining_time": "0:12:02", "throughput": 1792.64, "total_tokens": 482032} +{"current_steps": 565, "total_steps": 2065, "loss": 0.2025, "lr": 4.558198240288131e-05, "epoch": 1.368038740920097, "percentage": 27.36, "elapsed_time": "0:04:29", "remaining_time": "0:11:54", "throughput": 1806.28, "total_tokens": 486384} +{"current_steps": 570, "total_steps": 2065, "loss": 0.2413, "lr": 4.546127502868118e-05, "epoch": 1.3801452784503632, "percentage": 27.6, "elapsed_time": "0:04:29", "remaining_time": "0:11:47", "throughput": 1819.67, "total_tokens": 490672} +{"current_steps": 575, "total_steps": 2065, "loss": 0.2163, "lr": 4.5339105213188714e-05, "epoch": 1.3922518159806296, "percentage": 27.85, "elapsed_time": "0:04:30", "remaining_time": "0:11:39", "throughput": 1833.01, "total_tokens": 494960} +{"current_steps": 580, "total_steps": 2065, "loss": 0.3013, "lr": 4.521548168832227e-05, "epoch": 1.4043583535108959, "percentage": 28.09, "elapsed_time": "0:04:30", "remaining_time": "0:11:32", "throughput": 1845.85, "total_tokens": 499120} +{"current_steps": 585, "total_steps": 2065, "loss": 0.2324, "lr": 4.509041328990204e-05, "epoch": 1.4164648910411621, "percentage": 28.33, "elapsed_time": "0:04:30", "remaining_time": "0:11:25", "throughput": 1859.16, "total_tokens": 503408} +{"current_steps": 590, "total_steps": 2065, "loss": 0.1956, "lr": 4.4963908957018576e-05, "epoch": 1.4285714285714286, "percentage": 28.57, "elapsed_time": "0:04:31", "remaining_time": "0:11:17", "throughput": 1871.05, "total_tokens": 507312} +{"current_steps": 595, "total_steps": 2065, "loss": 0.2206, "lr": 4.483597773139386e-05, "epoch": 1.4406779661016949, "percentage": 28.81, "elapsed_time": "0:04:31", "remaining_time": "0:11:10", "throughput": 1884.29, "total_tokens": 511600} +{"current_steps": 600, "total_steps": 2065, "loss": 0.1973, "lr": 4.470662875673506e-05, "epoch": 1.4527845036319613, "percentage": 29.06, "elapsed_time": "0:04:31", "remaining_time": "0:11:03", "throughput": 1897.48, "total_tokens": 515888} +{"current_steps": 605, "total_steps": 2065, "loss": 0.1848, "lr": 4.457587127808096e-05, "epoch": 1.4648910411622276, "percentage": 29.3, "elapsed_time": "0:04:32", "remaining_time": "0:10:56", "throughput": 1909.72, "total_tokens": 519920} +{"current_steps": 610, "total_steps": 2065, "loss": 0.1922, "lr": 4.4443714641141255e-05, "epoch": 1.4769975786924938, "percentage": 29.54, "elapsed_time": "0:04:32", "remaining_time": "0:10:50", "throughput": 1923.28, "total_tokens": 524336} +{"current_steps": 615, "total_steps": 2065, "loss": 0.1922, "lr": 4.4310168291628504e-05, "epoch": 1.4891041162227603, "percentage": 29.78, "elapsed_time": "0:04:32", "remaining_time": "0:10:43", "throughput": 1935.9, "total_tokens": 528496} +{"current_steps": 620, "total_steps": 2065, "loss": 0.1809, "lr": 4.4175241774583084e-05, "epoch": 1.5012106537530268, "percentage": 30.02, "elapsed_time": "0:04:33", "remaining_time": "0:10:37", "throughput": 1948.96, "total_tokens": 532784} +{"current_steps": 624, "total_steps": 2065, "eval_loss": 0.19258780777454376, "epoch": 1.5108958837772397, "percentage": 30.22, "elapsed_time": "0:04:34", "remaining_time": "0:10:33", "throughput": 1954.71, "total_tokens": 536176} +{"current_steps": 625, "total_steps": 2065, "loss": 0.2205, "lr": 4.403894473369092e-05, "epoch": 1.513317191283293, "percentage": 30.27, "elapsed_time": "0:05:14", "remaining_time": "0:12:05", "throughput": 1706.31, "total_tokens": 537136} +{"current_steps": 630, "total_steps": 2065, "loss": 0.26, "lr": 4.390128691059423e-05, "epoch": 1.5254237288135593, "percentage": 30.51, "elapsed_time": "0:05:15", "remaining_time": "0:11:57", "throughput": 1718.3, "total_tokens": 541552} +{"current_steps": 635, "total_steps": 2065, "loss": 0.2678, "lr": 4.3762278144195236e-05, "epoch": 1.5375302663438255, "percentage": 30.75, "elapsed_time": "0:05:15", "remaining_time": "0:11:50", "throughput": 1729.27, "total_tokens": 545648} +{"current_steps": 640, "total_steps": 2065, "loss": 0.2246, "lr": 4.362192836995299e-05, "epoch": 1.549636803874092, "percentage": 30.99, "elapsed_time": "0:05:15", "remaining_time": "0:11:43", "throughput": 1741.78, "total_tokens": 550256} +{"current_steps": 645, "total_steps": 2065, "loss": 0.2397, "lr": 4.348024761917321e-05, "epoch": 1.5617433414043584, "percentage": 31.23, "elapsed_time": "0:05:16", "remaining_time": "0:11:36", "throughput": 1754.47, "total_tokens": 554928} +{"current_steps": 650, "total_steps": 2065, "loss": 0.2303, "lr": 4.333724601829132e-05, "epoch": 1.5738498789346247, "percentage": 31.48, "elapsed_time": "0:05:16", "remaining_time": "0:11:29", "throughput": 1766.33, "total_tokens": 559344} +{"current_steps": 655, "total_steps": 2065, "loss": 0.2178, "lr": 4.319293378814868e-05, "epoch": 1.585956416464891, "percentage": 31.72, "elapsed_time": "0:05:17", "remaining_time": "0:11:22", "throughput": 1778.17, "total_tokens": 563760} +{"current_steps": 660, "total_steps": 2065, "loss": 0.1945, "lr": 4.304732124326206e-05, "epoch": 1.5980629539951574, "percentage": 31.96, "elapsed_time": "0:05:17", "remaining_time": "0:11:15", "throughput": 1789.8, "total_tokens": 568112} +{"current_steps": 665, "total_steps": 2065, "loss": 0.1908, "lr": 4.2900418791086403e-05, "epoch": 1.6101694915254239, "percentage": 32.2, "elapsed_time": "0:05:17", "remaining_time": "0:11:09", "throughput": 1801.39, "total_tokens": 572464} +{"current_steps": 670, "total_steps": 2065, "loss": 0.2026, "lr": 4.275223693127103e-05, "epoch": 1.6222760290556901, "percentage": 32.45, "elapsed_time": "0:05:18", "remaining_time": "0:11:02", "throughput": 1812.76, "total_tokens": 576752} +{"current_steps": 675, "total_steps": 2065, "loss": 0.1959, "lr": 4.260278625490911e-05, "epoch": 1.6343825665859564, "percentage": 32.69, "elapsed_time": "0:05:18", "remaining_time": "0:10:55", "throughput": 1823.89, "total_tokens": 580976} +{"current_steps": 680, "total_steps": 2065, "loss": 0.2025, "lr": 4.2452077443780744e-05, "epoch": 1.6464891041162226, "percentage": 32.93, "elapsed_time": "0:05:18", "remaining_time": "0:10:49", "throughput": 1835.2, "total_tokens": 585264} +{"current_steps": 685, "total_steps": 2065, "loss": 0.1777, "lr": 4.2300121269589475e-05, "epoch": 1.658595641646489, "percentage": 33.17, "elapsed_time": "0:05:19", "remaining_time": "0:10:43", "throughput": 1847.07, "total_tokens": 589744} +{"current_steps": 690, "total_steps": 2065, "loss": 0.2142, "lr": 4.214692859319237e-05, "epoch": 1.6707021791767556, "percentage": 33.41, "elapsed_time": "0:05:19", "remaining_time": "0:10:37", "throughput": 1858.13, "total_tokens": 593968} +{"current_steps": 695, "total_steps": 2065, "loss": 0.2096, "lr": 4.19925103638238e-05, "epoch": 1.6828087167070218, "percentage": 33.66, "elapsed_time": "0:05:20", "remaining_time": "0:10:30", "throughput": 1869.38, "total_tokens": 598256} +{"current_steps": 700, "total_steps": 2065, "loss": 0.1881, "lr": 4.183687761831281e-05, "epoch": 1.694915254237288, "percentage": 33.9, "elapsed_time": "0:05:20", "remaining_time": "0:10:24", "throughput": 1880.79, "total_tokens": 602608} +{"current_steps": 705, "total_steps": 2065, "loss": 0.1678, "lr": 4.168004148029435e-05, "epoch": 1.7070217917675545, "percentage": 34.14, "elapsed_time": "0:05:20", "remaining_time": "0:10:18", "throughput": 1892.56, "total_tokens": 607088} +{"current_steps": 710, "total_steps": 2065, "loss": 0.243, "lr": 4.1522013159414144e-05, "epoch": 1.7191283292978208, "percentage": 34.38, "elapsed_time": "0:05:21", "remaining_time": "0:10:12", "throughput": 1903.34, "total_tokens": 611248} +{"current_steps": 715, "total_steps": 2065, "loss": 0.2024, "lr": 4.136280395052754e-05, "epoch": 1.7312348668280872, "percentage": 34.62, "elapsed_time": "0:05:21", "remaining_time": "0:10:07", "throughput": 1914.47, "total_tokens": 615536} +{"current_steps": 720, "total_steps": 2065, "loss": 0.1803, "lr": 4.120242523289223e-05, "epoch": 1.7433414043583535, "percentage": 34.87, "elapsed_time": "0:05:21", "remaining_time": "0:10:01", "throughput": 1925.97, "total_tokens": 619952} +{"current_steps": 725, "total_steps": 2065, "loss": 0.1949, "lr": 4.1040888469354925e-05, "epoch": 1.7554479418886197, "percentage": 35.11, "elapsed_time": "0:05:22", "remaining_time": "0:09:55", "throughput": 1937.45, "total_tokens": 624368} +{"current_steps": 728, "total_steps": 2065, "eval_loss": 0.19822187721729279, "epoch": 1.7627118644067796, "percentage": 35.25, "elapsed_time": "0:05:23", "remaining_time": "0:09:54", "throughput": 1937.69, "total_tokens": 626992} +{"current_steps": 730, "total_steps": 2065, "loss": 0.1935, "lr": 4.087820520553205e-05, "epoch": 1.7675544794188862, "percentage": 35.35, "elapsed_time": "0:06:23", "remaining_time": "0:11:40", "throughput": 1641.4, "total_tokens": 628720} +{"current_steps": 735, "total_steps": 2065, "loss": 0.1884, "lr": 4.0714387068984574e-05, "epoch": 1.7796610169491527, "percentage": 35.59, "elapsed_time": "0:06:23", "remaining_time": "0:11:33", "throughput": 1650.99, "total_tokens": 633008} +{"current_steps": 740, "total_steps": 2065, "loss": 0.2014, "lr": 4.05494457683869e-05, "epoch": 1.791767554479419, "percentage": 35.84, "elapsed_time": "0:06:23", "remaining_time": "0:11:27", "throughput": 1660.72, "total_tokens": 637360} +{"current_steps": 745, "total_steps": 2065, "loss": 0.2152, "lr": 4.038339309269002e-05, "epoch": 1.8038740920096852, "percentage": 36.08, "elapsed_time": "0:06:24", "remaining_time": "0:11:20", "throughput": 1670.27, "total_tokens": 641648} +{"current_steps": 750, "total_steps": 2065, "loss": 0.192, "lr": 4.021624091027895e-05, "epoch": 1.8159806295399514, "percentage": 36.32, "elapsed_time": "0:06:24", "remaining_time": "0:11:14", "throughput": 1678.85, "total_tokens": 645552} +{"current_steps": 755, "total_steps": 2065, "loss": 0.3049, "lr": 4.004800116812441e-05, "epoch": 1.828087167070218, "percentage": 36.56, "elapsed_time": "0:06:24", "remaining_time": "0:11:07", "throughput": 1688.54, "total_tokens": 649904} +{"current_steps": 760, "total_steps": 2065, "loss": 0.184, "lr": 3.987868589092893e-05, "epoch": 1.8401937046004844, "percentage": 36.8, "elapsed_time": "0:06:25", "remaining_time": "0:11:01", "throughput": 1697.87, "total_tokens": 654128} +{"current_steps": 765, "total_steps": 2065, "loss": 0.1914, "lr": 3.9708307180267456e-05, "epoch": 1.8523002421307506, "percentage": 37.05, "elapsed_time": "0:06:25", "remaining_time": "0:10:55", "throughput": 1707.98, "total_tokens": 658672} +{"current_steps": 770, "total_steps": 2065, "loss": 0.4553, "lr": 3.953687721372233e-05, "epoch": 1.8644067796610169, "percentage": 37.29, "elapsed_time": "0:06:26", "remaining_time": "0:10:49", "throughput": 1717.76, "total_tokens": 663088} +{"current_steps": 775, "total_steps": 2065, "loss": 0.1709, "lr": 3.936440824401299e-05, "epoch": 1.8765133171912833, "percentage": 37.53, "elapsed_time": "0:06:26", "remaining_time": "0:10:43", "throughput": 1727.35, "total_tokens": 667440} +{"current_steps": 780, "total_steps": 2065, "loss": 0.1831, "lr": 3.919091259812013e-05, "epoch": 1.8886198547215496, "percentage": 37.77, "elapsed_time": "0:06:26", "remaining_time": "0:10:37", "throughput": 1736.94, "total_tokens": 671792} +{"current_steps": 785, "total_steps": 2065, "loss": 0.2175, "lr": 3.9016402676404753e-05, "epoch": 1.900726392251816, "percentage": 38.01, "elapsed_time": "0:06:27", "remaining_time": "0:10:31", "throughput": 1746.98, "total_tokens": 676336} +{"current_steps": 790, "total_steps": 2065, "loss": 0.18, "lr": 3.884089095172181e-05, "epoch": 1.9128329297820823, "percentage": 38.26, "elapsed_time": "0:06:27", "remaining_time": "0:10:25", "throughput": 1756.37, "total_tokens": 680624} +{"current_steps": 795, "total_steps": 2065, "loss": 0.1914, "lr": 3.866438996852872e-05, "epoch": 1.9249394673123486, "percentage": 38.5, "elapsed_time": "0:06:27", "remaining_time": "0:10:19", "throughput": 1766.05, "total_tokens": 685040} +{"current_steps": 800, "total_steps": 2065, "loss": 0.1935, "lr": 3.848691234198879e-05, "epoch": 1.937046004842615, "percentage": 38.74, "elapsed_time": "0:06:28", "remaining_time": "0:10:13", "throughput": 1775.55, "total_tokens": 689392} +{"current_steps": 805, "total_steps": 2065, "loss": 0.2046, "lr": 3.830847075706956e-05, "epoch": 1.9491525423728815, "percentage": 38.98, "elapsed_time": "0:06:28", "remaining_time": "0:10:08", "throughput": 1784.57, "total_tokens": 693552} +{"current_steps": 810, "total_steps": 2065, "loss": 0.2291, "lr": 3.812907796763616e-05, "epoch": 1.9612590799031477, "percentage": 39.23, "elapsed_time": "0:06:29", "remaining_time": "0:10:02", "throughput": 1794.36, "total_tokens": 698032} +{"current_steps": 815, "total_steps": 2065, "loss": 0.1751, "lr": 3.7948746795539745e-05, "epoch": 1.973365617433414, "percentage": 39.47, "elapsed_time": "0:06:29", "remaining_time": "0:09:57", "throughput": 1802.88, "total_tokens": 702000} +{"current_steps": 820, "total_steps": 2065, "loss": 0.1795, "lr": 3.776749012970105e-05, "epoch": 1.9854721549636802, "percentage": 39.71, "elapsed_time": "0:06:29", "remaining_time": "0:09:51", "throughput": 1811.84, "total_tokens": 706160} +{"current_steps": 825, "total_steps": 2065, "loss": 0.1852, "lr": 3.758532092518924e-05, "epoch": 1.9975786924939467, "percentage": 39.95, "elapsed_time": "0:06:30", "remaining_time": "0:09:46", "throughput": 1821.89, "total_tokens": 710768} +{"current_steps": 830, "total_steps": 2065, "loss": 0.256, "lr": 3.740225220229587e-05, "epoch": 2.009685230024213, "percentage": 40.19, "elapsed_time": "0:06:30", "remaining_time": "0:09:41", "throughput": 1829.93, "total_tokens": 714744} +{"current_steps": 832, "total_steps": 2065, "eval_loss": 0.1934857964515686, "epoch": 2.0145278450363198, "percentage": 40.29, "elapsed_time": "0:06:31", "remaining_time": "0:09:39", "throughput": 1830.36, "total_tokens": 716344} +{"current_steps": 835, "total_steps": 2065, "loss": 0.1878, "lr": 3.721829704560436e-05, "epoch": 2.0217917675544794, "percentage": 40.44, "elapsed_time": "0:07:01", "remaining_time": "0:10:21", "throughput": 1704.63, "total_tokens": 718776} +{"current_steps": 840, "total_steps": 2065, "loss": 0.2215, "lr": 3.7033468603054725e-05, "epoch": 2.0338983050847457, "percentage": 40.68, "elapsed_time": "0:07:02", "remaining_time": "0:10:15", "throughput": 1712.55, "total_tokens": 722744} +{"current_steps": 845, "total_steps": 2065, "loss": 0.1657, "lr": 3.6847780085003905e-05, "epoch": 2.046004842615012, "percentage": 40.92, "elapsed_time": "0:07:02", "remaining_time": "0:10:09", "throughput": 1721.42, "total_tokens": 727160} +{"current_steps": 850, "total_steps": 2065, "loss": 0.1957, "lr": 3.666124476328155e-05, "epoch": 2.0581113801452786, "percentage": 41.16, "elapsed_time": "0:07:02", "remaining_time": "0:10:04", "throughput": 1730.33, "total_tokens": 731576} +{"current_steps": 855, "total_steps": 2065, "loss": 0.1881, "lr": 3.647387597024139e-05, "epoch": 2.070217917675545, "percentage": 41.4, "elapsed_time": "0:07:03", "remaining_time": "0:09:58", "throughput": 1739.66, "total_tokens": 736184} +{"current_steps": 860, "total_steps": 2065, "loss": 0.2041, "lr": 3.6285687097808394e-05, "epoch": 2.082324455205811, "percentage": 41.65, "elapsed_time": "0:07:03", "remaining_time": "0:09:53", "throughput": 1748.23, "total_tokens": 740472} +{"current_steps": 865, "total_steps": 2065, "loss": 0.213, "lr": 3.609669159652158e-05, "epoch": 2.0944309927360774, "percentage": 41.89, "elapsed_time": "0:07:03", "remaining_time": "0:09:48", "throughput": 1756.79, "total_tokens": 744760} +{"current_steps": 870, "total_steps": 2065, "loss": 0.1913, "lr": 3.590690297457262e-05, "epoch": 2.106537530266344, "percentage": 42.13, "elapsed_time": "0:07:04", "remaining_time": "0:09:42", "throughput": 1765.65, "total_tokens": 749176} +{"current_steps": 875, "total_steps": 2065, "loss": 0.1961, "lr": 3.57163347968404e-05, "epoch": 2.1186440677966103, "percentage": 42.37, "elapsed_time": "0:07:04", "remaining_time": "0:09:37", "throughput": 1774.36, "total_tokens": 753528} +{"current_steps": 880, "total_steps": 2065, "loss": 0.1981, "lr": 3.552500068392147e-05, "epoch": 2.1307506053268765, "percentage": 42.62, "elapsed_time": "0:07:05", "remaining_time": "0:09:32", "throughput": 1782.59, "total_tokens": 757688} +{"current_steps": 885, "total_steps": 2065, "loss": 0.2002, "lr": 3.533291431115653e-05, "epoch": 2.142857142857143, "percentage": 42.86, "elapsed_time": "0:07:05", "remaining_time": "0:09:27", "throughput": 1791.24, "total_tokens": 762040} +{"current_steps": 890, "total_steps": 2065, "loss": 0.1856, "lr": 3.514008940765304e-05, "epoch": 2.154963680387409, "percentage": 43.1, "elapsed_time": "0:07:05", "remaining_time": "0:09:22", "throughput": 1799.47, "total_tokens": 766200} +{"current_steps": 895, "total_steps": 2065, "loss": 0.2107, "lr": 3.494653975530388e-05, "epoch": 2.1670702179176757, "percentage": 43.34, "elapsed_time": "0:07:06", "remaining_time": "0:09:17", "throughput": 1808.39, "total_tokens": 770680} +{"current_steps": 900, "total_steps": 2065, "loss": 0.1771, "lr": 3.475227918780239e-05, "epoch": 2.179176755447942, "percentage": 43.58, "elapsed_time": "0:07:06", "remaining_time": "0:09:12", "throughput": 1816.58, "total_tokens": 774840} +{"current_steps": 905, "total_steps": 2065, "loss": 0.1924, "lr": 3.4557321589653556e-05, "epoch": 2.1912832929782082, "percentage": 43.83, "elapsed_time": "0:07:06", "remaining_time": "0:09:07", "throughput": 1825.19, "total_tokens": 779192} +{"current_steps": 910, "total_steps": 2065, "loss": 0.1687, "lr": 3.436168089518168e-05, "epoch": 2.2033898305084745, "percentage": 44.07, "elapsed_time": "0:07:07", "remaining_time": "0:09:02", "throughput": 1833.92, "total_tokens": 783608} +{"current_steps": 915, "total_steps": 2065, "loss": 0.1922, "lr": 3.416537108753443e-05, "epoch": 2.2154963680387407, "percentage": 44.31, "elapsed_time": "0:07:07", "remaining_time": "0:08:57", "throughput": 1842.79, "total_tokens": 788088} +{"current_steps": 920, "total_steps": 2065, "loss": 0.1721, "lr": 3.3968406197683376e-05, "epoch": 2.2276029055690074, "percentage": 44.55, "elapsed_time": "0:07:08", "remaining_time": "0:08:52", "throughput": 1851.61, "total_tokens": 792568} +{"current_steps": 925, "total_steps": 2065, "loss": 0.2058, "lr": 3.3770800303421254e-05, "epoch": 2.2397094430992737, "percentage": 44.79, "elapsed_time": "0:07:08", "remaining_time": "0:08:47", "throughput": 1860.74, "total_tokens": 797176} +{"current_steps": 930, "total_steps": 2065, "loss": 0.1925, "lr": 3.357256752835561e-05, "epoch": 2.25181598062954, "percentage": 45.04, "elapsed_time": "0:07:08", "remaining_time": "0:08:43", "throughput": 1868.97, "total_tokens": 801400} +{"current_steps": 935, "total_steps": 2065, "loss": 0.1601, "lr": 3.3373722040899517e-05, "epoch": 2.263922518159806, "percentage": 45.28, "elapsed_time": "0:07:09", "remaining_time": "0:08:38", "throughput": 1877.92, "total_tokens": 805944} +{"current_steps": 936, "total_steps": 2065, "eval_loss": 0.38670673966407776, "epoch": 2.2663438256658597, "percentage": 45.33, "elapsed_time": "0:07:11", "remaining_time": "0:08:40", "throughput": 1869.66, "total_tokens": 806712} +{"current_steps": 940, "total_steps": 2065, "loss": 0.9421, "lr": 3.317427805325875e-05, "epoch": 2.2760290556900724, "percentage": 45.52, "elapsed_time": "0:07:46", "remaining_time": "0:09:18", "throughput": 1736.71, "total_tokens": 810040} +{"current_steps": 945, "total_steps": 2065, "loss": 0.191, "lr": 3.297424982041609e-05, "epoch": 2.288135593220339, "percentage": 45.76, "elapsed_time": "0:07:46", "remaining_time": "0:09:13", "throughput": 1744.65, "total_tokens": 814392} +{"current_steps": 950, "total_steps": 2065, "loss": 0.1962, "lr": 3.277365163911243e-05, "epoch": 2.3002421307506054, "percentage": 46.0, "elapsed_time": "0:07:47", "remaining_time": "0:09:08", "throughput": 1752.84, "total_tokens": 818872} +{"current_steps": 955, "total_steps": 2065, "loss": 0.2261, "lr": 3.257249784682492e-05, "epoch": 2.3123486682808716, "percentage": 46.25, "elapsed_time": "0:07:47", "remaining_time": "0:09:03", "throughput": 1760.48, "total_tokens": 823096} +{"current_steps": 960, "total_steps": 2065, "loss": 0.1945, "lr": 3.2370802820742275e-05, "epoch": 2.324455205811138, "percentage": 46.49, "elapsed_time": "0:07:47", "remaining_time": "0:08:58", "throughput": 1767.72, "total_tokens": 827128} +{"current_steps": 965, "total_steps": 2065, "loss": 0.2272, "lr": 3.2168580976737104e-05, "epoch": 2.3365617433414045, "percentage": 46.73, "elapsed_time": "0:07:48", "remaining_time": "0:08:53", "throughput": 1775.21, "total_tokens": 831288} +{"current_steps": 970, "total_steps": 2065, "loss": 0.1824, "lr": 3.196584676833562e-05, "epoch": 2.348668280871671, "percentage": 46.97, "elapsed_time": "0:07:48", "remaining_time": "0:08:49", "throughput": 1783.09, "total_tokens": 835640} +{"current_steps": 975, "total_steps": 2065, "loss": 0.156, "lr": 3.1762614685684567e-05, "epoch": 2.360774818401937, "percentage": 47.22, "elapsed_time": "0:07:49", "remaining_time": "0:08:44", "throughput": 1790.43, "total_tokens": 839736} +{"current_steps": 980, "total_steps": 2065, "loss": 0.2199, "lr": 3.155889925451557e-05, "epoch": 2.3728813559322033, "percentage": 47.46, "elapsed_time": "0:07:49", "remaining_time": "0:08:39", "throughput": 1798.14, "total_tokens": 844024} +{"current_steps": 985, "total_steps": 2065, "loss": 0.1885, "lr": 3.1354715035106894e-05, "epoch": 2.38498789346247, "percentage": 47.7, "elapsed_time": "0:07:49", "remaining_time": "0:08:35", "throughput": 1805.72, "total_tokens": 848248} +{"current_steps": 990, "total_steps": 2065, "loss": 0.1645, "lr": 3.1150076621242816e-05, "epoch": 2.3970944309927362, "percentage": 47.94, "elapsed_time": "0:07:50", "remaining_time": "0:08:30", "throughput": 1813.28, "total_tokens": 852472} +{"current_steps": 995, "total_steps": 2065, "loss": 0.1747, "lr": 3.0944998639170544e-05, "epoch": 2.4092009685230025, "percentage": 48.18, "elapsed_time": "0:07:50", "remaining_time": "0:08:25", "throughput": 1821.09, "total_tokens": 856824} +{"current_steps": 1000, "total_steps": 2065, "loss": 0.1751, "lr": 3.073949574655479e-05, "epoch": 2.4213075060532687, "percentage": 48.43, "elapsed_time": "0:07:50", "remaining_time": "0:08:21", "throughput": 1828.5, "total_tokens": 860984} +{"current_steps": 1005, "total_steps": 2065, "loss": 0.1975, "lr": 3.053358263143015e-05, "epoch": 2.433414043583535, "percentage": 48.67, "elapsed_time": "0:07:51", "remaining_time": "0:08:17", "throughput": 1836.17, "total_tokens": 865272} +{"current_steps": 1010, "total_steps": 2065, "loss": 0.1765, "lr": 3.032727401115135e-05, "epoch": 2.4455205811138017, "percentage": 48.91, "elapsed_time": "0:07:51", "remaining_time": "0:08:12", "throughput": 1843.81, "total_tokens": 869560} +{"current_steps": 1015, "total_steps": 2065, "loss": 0.1624, "lr": 3.012058463134126e-05, "epoch": 2.457627118644068, "percentage": 49.15, "elapsed_time": "0:07:51", "remaining_time": "0:08:08", "throughput": 1851.71, "total_tokens": 873976} +{"current_steps": 1020, "total_steps": 2065, "loss": 0.2237, "lr": 2.991352926483702e-05, "epoch": 2.469733656174334, "percentage": 49.39, "elapsed_time": "0:07:52", "remaining_time": "0:08:03", "throughput": 1859.2, "total_tokens": 878200} +{"current_steps": 1025, "total_steps": 2065, "loss": 0.2024, "lr": 2.9706122710634165e-05, "epoch": 2.4818401937046004, "percentage": 49.64, "elapsed_time": "0:07:52", "remaining_time": "0:07:59", "throughput": 1867.61, "total_tokens": 882872} +{"current_steps": 1030, "total_steps": 2065, "loss": 0.2673, "lr": 2.949837979282889e-05, "epoch": 2.4939467312348667, "percentage": 49.88, "elapsed_time": "0:07:53", "remaining_time": "0:07:55", "throughput": 1875.07, "total_tokens": 887096} +{"current_steps": 1035, "total_steps": 2065, "loss": 0.2168, "lr": 2.92903153595585e-05, "epoch": 2.5060532687651333, "percentage": 50.12, "elapsed_time": "0:07:53", "remaining_time": "0:07:51", "throughput": 1883.05, "total_tokens": 891576} +{"current_steps": 1040, "total_steps": 2065, "loss": 0.1768, "lr": 2.908194428194019e-05, "epoch": 2.5181598062953996, "percentage": 50.36, "elapsed_time": "0:07:53", "remaining_time": "0:07:47", "throughput": 1890.36, "total_tokens": 895736} +{"current_steps": 1040, "total_steps": 2065, "eval_loss": 0.1943914145231247, "epoch": 2.5181598062953996, "percentage": 50.36, "elapsed_time": "0:07:54", "remaining_time": "0:07:47", "throughput": 1887.68, "total_tokens": 895736} +{"current_steps": 1045, "total_steps": 2065, "loss": 0.1555, "lr": 2.88732814530081e-05, "epoch": 2.530266343825666, "percentage": 50.61, "elapsed_time": "0:08:52", "remaining_time": "0:08:39", "throughput": 1691.76, "total_tokens": 900024} +{"current_steps": 1050, "total_steps": 2065, "loss": 0.1744, "lr": 2.866434178664893e-05, "epoch": 2.542372881355932, "percentage": 50.85, "elapsed_time": "0:08:52", "remaining_time": "0:08:34", "throughput": 1698.88, "total_tokens": 904440} +{"current_steps": 1055, "total_steps": 2065, "loss": 0.1842, "lr": 2.8455140216535947e-05, "epoch": 2.5544794188861983, "percentage": 51.09, "elapsed_time": "0:08:52", "remaining_time": "0:08:30", "throughput": 1705.74, "total_tokens": 908728} +{"current_steps": 1060, "total_steps": 2065, "loss": 0.2018, "lr": 2.8245691695061604e-05, "epoch": 2.566585956416465, "percentage": 51.33, "elapsed_time": "0:08:53", "remaining_time": "0:08:25", "throughput": 1712.59, "total_tokens": 913016} +{"current_steps": 1065, "total_steps": 2065, "loss": 0.2027, "lr": 2.8036011192268863e-05, "epoch": 2.5786924939467313, "percentage": 51.57, "elapsed_time": "0:08:53", "remaining_time": "0:08:20", "throughput": 1719.43, "total_tokens": 917304} +{"current_steps": 1070, "total_steps": 2065, "loss": 0.1984, "lr": 2.7826113694781252e-05, "epoch": 2.5907990314769975, "percentage": 51.82, "elapsed_time": "0:08:53", "remaining_time": "0:08:16", "throughput": 1726.15, "total_tokens": 921528} +{"current_steps": 1075, "total_steps": 2065, "loss": 0.1674, "lr": 2.761601420473168e-05, "epoch": 2.6029055690072638, "percentage": 52.06, "elapsed_time": "0:08:54", "remaining_time": "0:08:11", "throughput": 1733.21, "total_tokens": 925944} +{"current_steps": 1080, "total_steps": 2065, "loss": 0.1523, "lr": 2.740572773869019e-05, "epoch": 2.61501210653753, "percentage": 52.3, "elapsed_time": "0:08:54", "remaining_time": "0:08:07", "throughput": 1740.95, "total_tokens": 930744} +{"current_steps": 1085, "total_steps": 2065, "loss": 0.1263, "lr": 2.7195269326590682e-05, "epoch": 2.6271186440677967, "percentage": 52.54, "elapsed_time": "0:08:54", "remaining_time": "0:08:03", "throughput": 1748.34, "total_tokens": 935352} +{"current_steps": 1090, "total_steps": 2065, "loss": 0.1656, "lr": 2.6984654010656667e-05, "epoch": 2.639225181598063, "percentage": 52.78, "elapsed_time": "0:08:55", "remaining_time": "0:07:58", "throughput": 1755.13, "total_tokens": 939640} +{"current_steps": 1095, "total_steps": 2065, "loss": 0.2926, "lr": 2.6773896844326125e-05, "epoch": 2.651331719128329, "percentage": 53.03, "elapsed_time": "0:08:55", "remaining_time": "0:07:54", "throughput": 1761.45, "total_tokens": 943672} +{"current_steps": 1100, "total_steps": 2065, "loss": 0.1547, "lr": 2.656301289117561e-05, "epoch": 2.663438256658596, "percentage": 53.27, "elapsed_time": "0:08:56", "remaining_time": "0:07:50", "throughput": 1767.77, "total_tokens": 947704} +{"current_steps": 1105, "total_steps": 2065, "loss": 0.2428, "lr": 2.6352017223843585e-05, "epoch": 2.6755447941888617, "percentage": 53.51, "elapsed_time": "0:08:56", "remaining_time": "0:07:46", "throughput": 1774.42, "total_tokens": 951928} +{"current_steps": 1110, "total_steps": 2065, "loss": 0.1649, "lr": 2.6140924922953125e-05, "epoch": 2.6876513317191284, "percentage": 53.75, "elapsed_time": "0:08:56", "remaining_time": "0:07:41", "throughput": 1781.17, "total_tokens": 956216} +{"current_steps": 1115, "total_steps": 2065, "loss": 0.1597, "lr": 2.5929751076034058e-05, "epoch": 2.6997578692493946, "percentage": 54.0, "elapsed_time": "0:08:57", "remaining_time": "0:07:37", "throughput": 1787.91, "total_tokens": 960504} +{"current_steps": 1120, "total_steps": 2065, "loss": 0.1407, "lr": 2.571851077644461e-05, "epoch": 2.711864406779661, "percentage": 54.24, "elapsed_time": "0:08:57", "remaining_time": "0:07:33", "throughput": 1795.11, "total_tokens": 965048} +{"current_steps": 1125, "total_steps": 2065, "loss": 0.1667, "lr": 2.5507219122292598e-05, "epoch": 2.7239709443099276, "percentage": 54.48, "elapsed_time": "0:08:57", "remaining_time": "0:07:29", "throughput": 1801.6, "total_tokens": 969208} +{"current_steps": 1130, "total_steps": 2065, "loss": 0.1438, "lr": 2.529589121535636e-05, "epoch": 2.736077481840194, "percentage": 54.72, "elapsed_time": "0:08:58", "remaining_time": "0:07:25", "throughput": 1808.55, "total_tokens": 973624} +{"current_steps": 1135, "total_steps": 2065, "loss": 0.2294, "lr": 2.5084542160005335e-05, "epoch": 2.74818401937046, "percentage": 54.96, "elapsed_time": "0:08:58", "remaining_time": "0:07:21", "throughput": 1815.37, "total_tokens": 977976} +{"current_steps": 1140, "total_steps": 2065, "loss": 0.1964, "lr": 2.487318706212051e-05, "epoch": 2.7602905569007263, "percentage": 55.21, "elapsed_time": "0:08:59", "remaining_time": "0:07:17", "throughput": 1821.95, "total_tokens": 982200} +{"current_steps": 1144, "total_steps": 2065, "eval_loss": 0.19318054616451263, "epoch": 2.7699757869249395, "percentage": 55.4, "elapsed_time": "0:09:00", "remaining_time": "0:07:14", "throughput": 1825.11, "total_tokens": 985592} +{"current_steps": 1145, "total_steps": 2065, "loss": 0.203, "lr": 2.4661841028014785e-05, "epoch": 2.7723970944309926, "percentage": 55.45, "elapsed_time": "0:09:35", "remaining_time": "0:07:42", "throughput": 1714.45, "total_tokens": 986488} +{"current_steps": 1150, "total_steps": 2065, "loss": 0.1983, "lr": 2.445051916335321e-05, "epoch": 2.7845036319612593, "percentage": 55.69, "elapsed_time": "0:09:35", "remaining_time": "0:07:38", "throughput": 1720.25, "total_tokens": 990456} +{"current_steps": 1155, "total_steps": 2065, "loss": 0.1825, "lr": 2.4239236572073352e-05, "epoch": 2.7966101694915255, "percentage": 55.93, "elapsed_time": "0:09:36", "remaining_time": "0:07:33", "throughput": 1726.57, "total_tokens": 994744} +{"current_steps": 1160, "total_steps": 2065, "loss": 0.178, "lr": 2.4028008355305815e-05, "epoch": 2.8087167070217918, "percentage": 56.17, "elapsed_time": "0:09:36", "remaining_time": "0:07:29", "throughput": 1733.1, "total_tokens": 999160} +{"current_steps": 1165, "total_steps": 2065, "loss": 0.1709, "lr": 2.3816849610294783e-05, "epoch": 2.820823244552058, "percentage": 56.42, "elapsed_time": "0:09:36", "remaining_time": "0:07:25", "throughput": 1739.09, "total_tokens": 1003256} +{"current_steps": 1170, "total_steps": 2065, "loss": 0.1853, "lr": 2.3605775429319115e-05, "epoch": 2.8329297820823243, "percentage": 56.66, "elapsed_time": "0:09:37", "remaining_time": "0:07:21", "throughput": 1745.28, "total_tokens": 1007480} +{"current_steps": 1175, "total_steps": 2065, "loss": 0.1431, "lr": 2.3394800898613535e-05, "epoch": 2.845036319612591, "percentage": 56.9, "elapsed_time": "0:09:37", "remaining_time": "0:07:17", "throughput": 1751.79, "total_tokens": 1011896} +{"current_steps": 1180, "total_steps": 2065, "loss": 0.2253, "lr": 2.318394109729041e-05, "epoch": 2.857142857142857, "percentage": 57.14, "elapsed_time": "0:09:38", "remaining_time": "0:07:13", "throughput": 1757.76, "total_tokens": 1015992} +{"current_steps": 1185, "total_steps": 2065, "loss": 0.1686, "lr": 2.297321109626198e-05, "epoch": 2.8692493946731235, "percentage": 57.38, "elapsed_time": "0:09:38", "remaining_time": "0:07:09", "throughput": 1764.24, "total_tokens": 1020408} +{"current_steps": 1190, "total_steps": 2065, "loss": 0.1988, "lr": 2.27626259571632e-05, "epoch": 2.8813559322033897, "percentage": 57.63, "elapsed_time": "0:09:38", "remaining_time": "0:07:05", "throughput": 1771.03, "total_tokens": 1025016} +{"current_steps": 1195, "total_steps": 2065, "loss": 0.1682, "lr": 2.2552200731275213e-05, "epoch": 2.893462469733656, "percentage": 57.87, "elapsed_time": "0:09:39", "remaining_time": "0:07:01", "throughput": 1777.39, "total_tokens": 1029368} +{"current_steps": 1200, "total_steps": 2065, "loss": 0.1918, "lr": 2.2341950458449576e-05, "epoch": 2.9055690072639226, "percentage": 58.11, "elapsed_time": "0:09:39", "remaining_time": "0:06:57", "throughput": 1783.54, "total_tokens": 1033592} +{"current_steps": 1205, "total_steps": 2065, "loss": 0.2047, "lr": 2.213189016603333e-05, "epoch": 2.917675544794189, "percentage": 58.35, "elapsed_time": "0:09:39", "remaining_time": "0:06:53", "throughput": 1789.47, "total_tokens": 1037688} +{"current_steps": 1210, "total_steps": 2065, "loss": 0.1686, "lr": 2.1922034867794925e-05, "epoch": 2.929782082324455, "percentage": 58.6, "elapsed_time": "0:09:40", "remaining_time": "0:06:50", "throughput": 1795.6, "total_tokens": 1041912} +{"current_steps": 1215, "total_steps": 2065, "loss": 0.1663, "lr": 2.1712399562851147e-05, "epoch": 2.9418886198547214, "percentage": 58.84, "elapsed_time": "0:09:40", "remaining_time": "0:06:46", "throughput": 1802.14, "total_tokens": 1046392} +{"current_steps": 1220, "total_steps": 2065, "loss": 0.1158, "lr": 2.150299923459505e-05, "epoch": 2.9539951573849876, "percentage": 59.08, "elapsed_time": "0:09:41", "remaining_time": "0:06:42", "throughput": 1808.26, "total_tokens": 1050616} +{"current_steps": 1225, "total_steps": 2065, "loss": 0.1857, "lr": 2.1293848849625065e-05, "epoch": 2.9661016949152543, "percentage": 59.32, "elapsed_time": "0:09:41", "remaining_time": "0:06:38", "throughput": 1814.37, "total_tokens": 1054840} +{"current_steps": 1230, "total_steps": 2065, "loss": 0.2051, "lr": 2.108496335667527e-05, "epoch": 2.9782082324455206, "percentage": 59.56, "elapsed_time": "0:09:41", "remaining_time": "0:06:34", "throughput": 1820.25, "total_tokens": 1058936} +{"current_steps": 1235, "total_steps": 2065, "loss": 0.137, "lr": 2.0876357685546944e-05, "epoch": 2.990314769975787, "percentage": 59.81, "elapsed_time": "0:09:42", "remaining_time": "0:06:31", "throughput": 1826.56, "total_tokens": 1063288} +{"current_steps": 1240, "total_steps": 2065, "loss": 0.294, "lr": 2.06680467460415e-05, "epoch": 3.002421307506053, "percentage": 60.05, "elapsed_time": "0:09:42", "remaining_time": "0:06:27", "throughput": 1832.19, "total_tokens": 1067392} +{"current_steps": 1245, "total_steps": 2065, "loss": 0.1436, "lr": 2.0460045426894817e-05, "epoch": 3.0145278450363198, "percentage": 60.29, "elapsed_time": "0:09:42", "remaining_time": "0:06:23", "throughput": 1838.67, "total_tokens": 1071872} +{"current_steps": 1248, "total_steps": 2065, "eval_loss": 0.20527909696102142, "epoch": 3.0217917675544794, "percentage": 60.44, "elapsed_time": "0:09:43", "remaining_time": "0:06:22", "throughput": 1840.65, "total_tokens": 1074624} +{"current_steps": 1250, "total_steps": 2065, "loss": 0.1503, "lr": 2.0252368594713083e-05, "epoch": 3.026634382566586, "percentage": 60.53, "elapsed_time": "0:10:13", "remaining_time": "0:06:40", "throughput": 1753.59, "total_tokens": 1076416} +{"current_steps": 1255, "total_steps": 2065, "loss": 0.156, "lr": 2.004503109291023e-05, "epoch": 3.0387409200968523, "percentage": 60.77, "elapsed_time": "0:10:14", "remaining_time": "0:06:36", "throughput": 1759.2, "total_tokens": 1080512} +{"current_steps": 1260, "total_steps": 2065, "loss": 0.1971, "lr": 1.9838047740647026e-05, "epoch": 3.0508474576271185, "percentage": 61.02, "elapsed_time": "0:10:14", "remaining_time": "0:06:32", "throughput": 1764.79, "total_tokens": 1084608} +{"current_steps": 1265, "total_steps": 2065, "loss": 0.1813, "lr": 1.9631433331771886e-05, "epoch": 3.062953995157385, "percentage": 61.26, "elapsed_time": "0:10:14", "remaining_time": "0:06:28", "throughput": 1770.87, "total_tokens": 1089024} +{"current_steps": 1270, "total_steps": 2065, "loss": 0.133, "lr": 1.9425202633763513e-05, "epoch": 3.0750605326876514, "percentage": 61.5, "elapsed_time": "0:10:15", "remaining_time": "0:06:25", "throughput": 1776.84, "total_tokens": 1093376} +{"current_steps": 1275, "total_steps": 2065, "loss": 0.089, "lr": 1.9219370386675388e-05, "epoch": 3.0871670702179177, "percentage": 61.74, "elapsed_time": "0:10:15", "remaining_time": "0:06:21", "throughput": 1782.81, "total_tokens": 1097728} +{"current_steps": 1280, "total_steps": 2065, "loss": 0.2836, "lr": 1.901395130208229e-05, "epoch": 3.099273607748184, "percentage": 61.99, "elapsed_time": "0:10:16", "remaining_time": "0:06:17", "throughput": 1788.45, "total_tokens": 1101888} +{"current_steps": 1285, "total_steps": 2065, "loss": 0.1116, "lr": 1.880896006202876e-05, "epoch": 3.11138014527845, "percentage": 62.23, "elapsed_time": "0:10:16", "remaining_time": "0:06:14", "throughput": 1794.31, "total_tokens": 1106176} +{"current_steps": 1290, "total_steps": 2065, "loss": 0.1027, "lr": 1.860441131797977e-05, "epoch": 3.123486682808717, "percentage": 62.47, "elapsed_time": "0:10:16", "remaining_time": "0:06:10", "throughput": 1799.87, "total_tokens": 1110272} +{"current_steps": 1295, "total_steps": 2065, "loss": 0.1582, "lr": 1.8400319689773474e-05, "epoch": 3.135593220338983, "percentage": 62.71, "elapsed_time": "0:10:17", "remaining_time": "0:06:07", "throughput": 1805.6, "total_tokens": 1114496} +{"current_steps": 1300, "total_steps": 2065, "loss": 0.0408, "lr": 1.8196699764576318e-05, "epoch": 3.1476997578692494, "percentage": 62.95, "elapsed_time": "0:10:17", "remaining_time": "0:06:03", "throughput": 1811.43, "total_tokens": 1118784} +{"current_steps": 1305, "total_steps": 2065, "loss": 0.1234, "lr": 1.7993566095840443e-05, "epoch": 3.1598062953995156, "percentage": 63.2, "elapsed_time": "0:10:18", "remaining_time": "0:05:59", "throughput": 1817.16, "total_tokens": 1123008} +{"current_steps": 1310, "total_steps": 2065, "loss": 0.2236, "lr": 1.7790933202263434e-05, "epoch": 3.171912832929782, "percentage": 63.44, "elapsed_time": "0:10:18", "remaining_time": "0:05:56", "throughput": 1823.19, "total_tokens": 1127424} +{"current_steps": 1315, "total_steps": 2065, "loss": 0.1958, "lr": 1.758881556675073e-05, "epoch": 3.1840193704600486, "percentage": 63.68, "elapsed_time": "0:10:18", "remaining_time": "0:05:52", "throughput": 1829.2, "total_tokens": 1131840} +{"current_steps": 1320, "total_steps": 2065, "loss": 0.1238, "lr": 1.738722763538036e-05, "epoch": 3.196125907990315, "percentage": 63.92, "elapsed_time": "0:10:19", "remaining_time": "0:05:49", "throughput": 1835.11, "total_tokens": 1136192} +{"current_steps": 1325, "total_steps": 2065, "loss": 0.1027, "lr": 1.7186183816370522e-05, "epoch": 3.208232445520581, "percentage": 64.16, "elapsed_time": "0:10:19", "remaining_time": "0:05:45", "throughput": 1841.02, "total_tokens": 1140544} +{"current_steps": 1330, "total_steps": 2065, "loss": 0.0907, "lr": 1.6985698479049702e-05, "epoch": 3.2203389830508473, "percentage": 64.41, "elapsed_time": "0:10:19", "remaining_time": "0:05:42", "throughput": 1847.51, "total_tokens": 1145280} +{"current_steps": 1335, "total_steps": 2065, "loss": 0.1037, "lr": 1.6785785952829717e-05, "epoch": 3.232445520581114, "percentage": 64.65, "elapsed_time": "0:10:20", "remaining_time": "0:05:39", "throughput": 1853.81, "total_tokens": 1149888} +{"current_steps": 1340, "total_steps": 2065, "loss": 0.1776, "lr": 1.6586460526181473e-05, "epoch": 3.2445520581113803, "percentage": 64.89, "elapsed_time": "0:10:20", "remaining_time": "0:05:35", "throughput": 1859.21, "total_tokens": 1153920} +{"current_steps": 1345, "total_steps": 2065, "loss": 0.2125, "lr": 1.6387736445613772e-05, "epoch": 3.2566585956416465, "percentage": 65.13, "elapsed_time": "0:10:21", "remaining_time": "0:05:32", "throughput": 1865.59, "total_tokens": 1158592} +{"current_steps": 1350, "total_steps": 2065, "loss": 0.2252, "lr": 1.6189627914655008e-05, "epoch": 3.2687651331719128, "percentage": 65.38, "elapsed_time": "0:10:21", "remaining_time": "0:05:29", "throughput": 1871.26, "total_tokens": 1162816} +{"current_steps": 1352, "total_steps": 2065, "eval_loss": 0.2091810256242752, "epoch": 3.2736077481840193, "percentage": 65.47, "elapsed_time": "0:10:22", "remaining_time": "0:05:28", "throughput": 1871.63, "total_tokens": 1164544} +{"current_steps": 1355, "total_steps": 2065, "loss": 0.1163, "lr": 1.599214909283805e-05, "epoch": 3.280871670702179, "percentage": 65.62, "elapsed_time": "0:11:34", "remaining_time": "0:06:03", "throughput": 1680.92, "total_tokens": 1167232} +{"current_steps": 1360, "total_steps": 2065, "loss": 0.1094, "lr": 1.579531409468815e-05, "epoch": 3.2929782082324457, "percentage": 65.86, "elapsed_time": "0:11:34", "remaining_time": "0:06:00", "throughput": 1686.37, "total_tokens": 1171648} +{"current_steps": 1365, "total_steps": 2065, "loss": 0.141, "lr": 1.5599136988714186e-05, "epoch": 3.305084745762712, "percentage": 66.1, "elapsed_time": "0:11:35", "remaining_time": "0:05:56", "throughput": 1691.46, "total_tokens": 1175808} +{"current_steps": 1370, "total_steps": 2065, "loss": 0.1296, "lr": 1.5403631796403085e-05, "epoch": 3.317191283292978, "percentage": 66.34, "elapsed_time": "0:11:35", "remaining_time": "0:05:52", "throughput": 1696.9, "total_tokens": 1180224} +{"current_steps": 1375, "total_steps": 2065, "loss": 0.1375, "lr": 1.520881249121767e-05, "epoch": 3.3292978208232444, "percentage": 66.59, "elapsed_time": "0:11:35", "remaining_time": "0:05:49", "throughput": 1702.42, "total_tokens": 1184704} +{"current_steps": 1380, "total_steps": 2065, "loss": 0.1459, "lr": 1.5014692997597962e-05, "epoch": 3.341404358353511, "percentage": 66.83, "elapsed_time": "0:11:36", "remaining_time": "0:05:45", "throughput": 1707.67, "total_tokens": 1188992} +{"current_steps": 1385, "total_steps": 2065, "loss": 0.1535, "lr": 1.4821287189965866e-05, "epoch": 3.3535108958837774, "percentage": 67.07, "elapsed_time": "0:11:36", "remaining_time": "0:05:42", "throughput": 1713.09, "total_tokens": 1193408} +{"current_steps": 1390, "total_steps": 2065, "loss": 0.1246, "lr": 1.4628608891733625e-05, "epoch": 3.3656174334140436, "percentage": 67.31, "elapsed_time": "0:11:37", "remaining_time": "0:05:38", "throughput": 1718.42, "total_tokens": 1197760} +{"current_steps": 1395, "total_steps": 2065, "loss": 0.0863, "lr": 1.4436671874315722e-05, "epoch": 3.37772397094431, "percentage": 67.55, "elapsed_time": "0:11:37", "remaining_time": "0:05:34", "throughput": 1723.3, "total_tokens": 1201792} +{"current_steps": 1400, "total_steps": 2065, "loss": 0.0968, "lr": 1.4245489856144634e-05, "epoch": 3.389830508474576, "percentage": 67.8, "elapsed_time": "0:11:37", "remaining_time": "0:05:31", "throughput": 1728.18, "total_tokens": 1205824} +{"current_steps": 1405, "total_steps": 2065, "loss": 0.0749, "lr": 1.4055076501690311e-05, "epoch": 3.401937046004843, "percentage": 68.04, "elapsed_time": "0:11:38", "remaining_time": "0:05:27", "throughput": 1733.57, "total_tokens": 1210240} +{"current_steps": 1410, "total_steps": 2065, "loss": 0.09, "lr": 1.3865445420483526e-05, "epoch": 3.414043583535109, "percentage": 68.28, "elapsed_time": "0:11:38", "remaining_time": "0:05:24", "throughput": 1738.7, "total_tokens": 1214464} +{"current_steps": 1415, "total_steps": 2065, "loss": 0.1746, "lr": 1.367661016614315e-05, "epoch": 3.4261501210653753, "percentage": 68.52, "elapsed_time": "0:11:38", "remaining_time": "0:05:21", "throughput": 1743.91, "total_tokens": 1218752} +{"current_steps": 1420, "total_steps": 2065, "loss": 0.0826, "lr": 1.3488584235407439e-05, "epoch": 3.4382566585956416, "percentage": 68.77, "elapsed_time": "0:11:39", "remaining_time": "0:05:17", "throughput": 1749.29, "total_tokens": 1223168} +{"current_steps": 1425, "total_steps": 2065, "loss": 0.1469, "lr": 1.3301381067169366e-05, "epoch": 3.450363196125908, "percentage": 69.01, "elapsed_time": "0:11:39", "remaining_time": "0:05:14", "throughput": 1754.31, "total_tokens": 1227328} +{"current_steps": 1430, "total_steps": 2065, "loss": 0.1454, "lr": 1.3115014041516089e-05, "epoch": 3.4624697336561745, "percentage": 69.25, "elapsed_time": "0:11:39", "remaining_time": "0:05:10", "throughput": 1759.16, "total_tokens": 1231360} +{"current_steps": 1435, "total_steps": 2065, "loss": 0.0455, "lr": 1.2929496478772635e-05, "epoch": 3.4745762711864407, "percentage": 69.49, "elapsed_time": "0:11:40", "remaining_time": "0:05:07", "throughput": 1764.09, "total_tokens": 1235456} +{"current_steps": 1440, "total_steps": 2065, "loss": 0.106, "lr": 1.2744841638549842e-05, "epoch": 3.486682808716707, "percentage": 69.73, "elapsed_time": "0:11:40", "remaining_time": "0:05:04", "throughput": 1769.09, "total_tokens": 1239616} +{"current_steps": 1445, "total_steps": 2065, "loss": 0.0763, "lr": 1.2561062718796662e-05, "epoch": 3.4987893462469732, "percentage": 69.98, "elapsed_time": "0:11:41", "remaining_time": "0:05:00", "throughput": 1774.36, "total_tokens": 1243968} +{"current_steps": 1450, "total_steps": 2065, "loss": 0.0978, "lr": 1.2378172854856831e-05, "epoch": 3.5108958837772395, "percentage": 70.22, "elapsed_time": "0:11:41", "remaining_time": "0:04:57", "throughput": 1779.36, "total_tokens": 1248128} +{"current_steps": 1455, "total_steps": 2065, "loss": 0.1328, "lr": 1.2196185118530063e-05, "epoch": 3.523002421307506, "percentage": 70.46, "elapsed_time": "0:11:41", "remaining_time": "0:04:54", "throughput": 1784.35, "total_tokens": 1252288} +{"current_steps": 1456, "total_steps": 2065, "eval_loss": 0.3491859436035156, "epoch": 3.5254237288135593, "percentage": 70.51, "elapsed_time": "0:11:42", "remaining_time": "0:04:53", "throughput": 1783.88, "total_tokens": 1253248} +{"current_steps": 1460, "total_steps": 2065, "loss": 0.1139, "lr": 1.2015112517137744e-05, "epoch": 3.5351089588377724, "percentage": 70.7, "elapsed_time": "0:12:11", "remaining_time": "0:05:03", "throughput": 1718.3, "total_tokens": 1256640} +{"current_steps": 1465, "total_steps": 2065, "loss": 0.1247, "lr": 1.183496799259326e-05, "epoch": 3.5472154963680387, "percentage": 70.94, "elapsed_time": "0:12:11", "remaining_time": "0:04:59", "throughput": 1723.96, "total_tokens": 1261440} +{"current_steps": 1470, "total_steps": 2065, "loss": 0.0777, "lr": 1.1655764420476988e-05, "epoch": 3.559322033898305, "percentage": 71.19, "elapsed_time": "0:12:12", "remaining_time": "0:04:56", "throughput": 1728.86, "total_tokens": 1265664} +{"current_steps": 1475, "total_steps": 2065, "loss": 0.0848, "lr": 1.1477514609116039e-05, "epoch": 3.571428571428571, "percentage": 71.43, "elapsed_time": "0:12:12", "remaining_time": "0:04:52", "throughput": 1733.92, "total_tokens": 1270016} +{"current_steps": 1480, "total_steps": 2065, "loss": 0.1263, "lr": 1.1300231298668786e-05, "epoch": 3.583535108958838, "percentage": 71.67, "elapsed_time": "0:12:12", "remaining_time": "0:04:49", "throughput": 1739.23, "total_tokens": 1274560} +{"current_steps": 1485, "total_steps": 2065, "loss": 0.1362, "lr": 1.1123927160214289e-05, "epoch": 3.595641646489104, "percentage": 71.91, "elapsed_time": "0:12:13", "remaining_time": "0:04:46", "throughput": 1744.36, "total_tokens": 1278976} +{"current_steps": 1490, "total_steps": 2065, "loss": 0.1068, "lr": 1.0948614794846668e-05, "epoch": 3.6077481840193704, "percentage": 72.15, "elapsed_time": "0:12:13", "remaining_time": "0:04:43", "throughput": 1749.24, "total_tokens": 1283200} +{"current_steps": 1495, "total_steps": 2065, "loss": 0.2069, "lr": 1.0774306732774414e-05, "epoch": 3.619854721549637, "percentage": 72.4, "elapsed_time": "0:12:13", "remaining_time": "0:04:39", "throughput": 1753.94, "total_tokens": 1287296} +{"current_steps": 1500, "total_steps": 2065, "loss": 0.1368, "lr": 1.0601015432424819e-05, "epoch": 3.6319612590799033, "percentage": 72.64, "elapsed_time": "0:12:14", "remaining_time": "0:04:36", "throughput": 1759.06, "total_tokens": 1291712} +{"current_steps": 1505, "total_steps": 2065, "loss": 0.1959, "lr": 1.042875327955356e-05, "epoch": 3.6440677966101696, "percentage": 72.88, "elapsed_time": "0:12:14", "remaining_time": "0:04:33", "throughput": 1763.91, "total_tokens": 1295936} +{"current_steps": 1510, "total_steps": 2065, "loss": 0.0932, "lr": 1.0257532586359422e-05, "epoch": 3.656174334140436, "percentage": 73.12, "elapsed_time": "0:12:15", "remaining_time": "0:04:30", "throughput": 1769.35, "total_tokens": 1300608} +{"current_steps": 1515, "total_steps": 2065, "loss": 0.1347, "lr": 1.0087365590604289e-05, "epoch": 3.668280871670702, "percentage": 73.37, "elapsed_time": "0:12:15", "remaining_time": "0:04:26", "throughput": 1774.45, "total_tokens": 1305024} +{"current_steps": 1520, "total_steps": 2065, "loss": 0.1287, "lr": 9.918264454738504e-06, "epoch": 3.6803874092009687, "percentage": 73.61, "elapsed_time": "0:12:15", "remaining_time": "0:04:23", "throughput": 1779.47, "total_tokens": 1309376} +{"current_steps": 1525, "total_steps": 2065, "loss": 0.0818, "lr": 9.75024126503153e-06, "epoch": 3.692493946731235, "percentage": 73.85, "elapsed_time": "0:12:16", "remaining_time": "0:04:20", "throughput": 1784.39, "total_tokens": 1313664} +{"current_steps": 1530, "total_steps": 2065, "loss": 0.0869, "lr": 9.583308030708135e-06, "epoch": 3.7046004842615012, "percentage": 74.09, "elapsed_time": "0:12:16", "remaining_time": "0:04:17", "throughput": 1789.47, "total_tokens": 1318080} +{"current_steps": 1535, "total_steps": 2065, "loss": 0.0893, "lr": 9.417476683090007e-06, "epoch": 3.7167070217917675, "percentage": 74.33, "elapsed_time": "0:12:16", "remaining_time": "0:04:14", "throughput": 1794.46, "total_tokens": 1322432} +{"current_steps": 1540, "total_steps": 2065, "loss": 0.1556, "lr": 9.252759074743034e-06, "epoch": 3.7288135593220337, "percentage": 74.58, "elapsed_time": "0:12:17", "remaining_time": "0:04:11", "throughput": 1799.53, "total_tokens": 1326848} +{"current_steps": 1545, "total_steps": 2065, "loss": 0.0774, "lr": 9.08916697863014e-06, "epoch": 3.7409200968523004, "percentage": 74.82, "elapsed_time": "0:12:17", "remaining_time": "0:04:08", "throughput": 1804.68, "total_tokens": 1331328} +{"current_steps": 1550, "total_steps": 2065, "loss": 0.1253, "lr": 8.926712087269801e-06, "epoch": 3.7530266343825667, "percentage": 75.06, "elapsed_time": "0:12:18", "remaining_time": "0:04:05", "throughput": 1809.33, "total_tokens": 1335424} +{"current_steps": 1555, "total_steps": 2065, "loss": 0.1276, "lr": 8.765406011900368e-06, "epoch": 3.765133171912833, "percentage": 75.3, "elapsed_time": "0:12:18", "remaining_time": "0:04:02", "throughput": 1814.22, "total_tokens": 1339712} +{"current_steps": 1560, "total_steps": 2065, "loss": 0.1842, "lr": 8.605260281650152e-06, "epoch": 3.777239709443099, "percentage": 75.54, "elapsed_time": "0:12:18", "remaining_time": "0:03:59", "throughput": 1819.11, "total_tokens": 1344000} +{"current_steps": 1560, "total_steps": 2065, "eval_loss": 0.21899566054344177, "epoch": 3.777239709443099, "percentage": 75.54, "elapsed_time": "0:12:19", "remaining_time": "0:03:59", "throughput": 1817.44, "total_tokens": 1344000} +{"current_steps": 1565, "total_steps": 2065, "loss": 0.0881, "lr": 8.446286342713419e-06, "epoch": 3.7893462469733654, "percentage": 75.79, "elapsed_time": "0:12:58", "remaining_time": "0:04:08", "throughput": 1731.06, "total_tokens": 1348224} +{"current_steps": 1570, "total_steps": 2065, "loss": 0.1348, "lr": 8.288495557532241e-06, "epoch": 3.801452784503632, "percentage": 76.03, "elapsed_time": "0:12:59", "remaining_time": "0:04:05", "throughput": 1735.78, "total_tokens": 1352576} +{"current_steps": 1575, "total_steps": 2065, "loss": 0.134, "lr": 8.131899203984463e-06, "epoch": 3.8135593220338984, "percentage": 76.27, "elapsed_time": "0:12:59", "remaining_time": "0:04:02", "throughput": 1740.44, "total_tokens": 1356864} +{"current_steps": 1580, "total_steps": 2065, "loss": 0.1141, "lr": 7.976508474577548e-06, "epoch": 3.8256658595641646, "percentage": 76.51, "elapsed_time": "0:12:59", "remaining_time": "0:03:59", "throughput": 1745.1, "total_tokens": 1361152} +{"current_steps": 1585, "total_steps": 2065, "loss": 0.0705, "lr": 7.822334475648654e-06, "epoch": 3.837772397094431, "percentage": 76.76, "elapsed_time": "0:13:00", "remaining_time": "0:03:56", "throughput": 1749.67, "total_tokens": 1365376} +{"current_steps": 1590, "total_steps": 2065, "loss": 0.0907, "lr": 7.669388226570809e-06, "epoch": 3.849878934624697, "percentage": 77.0, "elapsed_time": "0:13:00", "remaining_time": "0:03:53", "throughput": 1754.39, "total_tokens": 1369728} +{"current_steps": 1595, "total_steps": 2065, "loss": 0.1261, "lr": 7.517680658965329e-06, "epoch": 3.861985472154964, "percentage": 77.24, "elapsed_time": "0:13:01", "remaining_time": "0:03:50", "throughput": 1759.19, "total_tokens": 1374144} +{"current_steps": 1600, "total_steps": 2065, "loss": 0.1084, "lr": 7.367222615920477e-06, "epoch": 3.87409200968523, "percentage": 77.48, "elapsed_time": "0:13:01", "remaining_time": "0:03:47", "throughput": 1763.75, "total_tokens": 1378368} +{"current_steps": 1605, "total_steps": 2065, "loss": 0.0813, "lr": 7.2180248512164896e-06, "epoch": 3.8861985472154963, "percentage": 77.72, "elapsed_time": "0:13:01", "remaining_time": "0:03:44", "throughput": 1768.14, "total_tokens": 1382464} +{"current_steps": 1610, "total_steps": 2065, "loss": 0.0805, "lr": 7.070098028556948e-06, "epoch": 3.898305084745763, "percentage": 77.97, "elapsed_time": "0:13:02", "remaining_time": "0:03:41", "throughput": 1772.92, "total_tokens": 1386880} +{"current_steps": 1615, "total_steps": 2065, "loss": 0.1924, "lr": 6.923452720806611e-06, "epoch": 3.910411622276029, "percentage": 78.21, "elapsed_time": "0:13:02", "remaining_time": "0:03:38", "throughput": 1777.7, "total_tokens": 1391296} +{"current_steps": 1620, "total_steps": 2065, "loss": 0.0609, "lr": 6.778099409235739e-06, "epoch": 3.9225181598062955, "percentage": 78.45, "elapsed_time": "0:13:03", "remaining_time": "0:03:35", "throughput": 1782.16, "total_tokens": 1395456} +{"current_steps": 1625, "total_steps": 2065, "loss": 0.0932, "lr": 6.634048482770946e-06, "epoch": 3.9346246973365617, "percentage": 78.69, "elapsed_time": "0:13:03", "remaining_time": "0:03:32", "throughput": 1786.62, "total_tokens": 1399616} +{"current_steps": 1630, "total_steps": 2065, "loss": 0.1241, "lr": 6.491310237252679e-06, "epoch": 3.946731234866828, "percentage": 78.93, "elapsed_time": "0:13:03", "remaining_time": "0:03:29", "throughput": 1790.99, "total_tokens": 1403712} +{"current_steps": 1635, "total_steps": 2065, "loss": 0.1232, "lr": 6.349894874699344e-06, "epoch": 3.9588377723970947, "percentage": 79.18, "elapsed_time": "0:13:04", "remaining_time": "0:03:26", "throughput": 1795.76, "total_tokens": 1408128} +{"current_steps": 1640, "total_steps": 2065, "loss": 0.0787, "lr": 6.209812502578114e-06, "epoch": 3.970944309927361, "percentage": 79.42, "elapsed_time": "0:13:04", "remaining_time": "0:03:23", "throughput": 1800.44, "total_tokens": 1412480} +{"current_steps": 1645, "total_steps": 2065, "loss": 0.0494, "lr": 6.071073133082492e-06, "epoch": 3.983050847457627, "percentage": 79.66, "elapsed_time": "0:13:04", "remaining_time": "0:03:20", "throughput": 1804.95, "total_tokens": 1416704} +{"current_steps": 1650, "total_steps": 2065, "loss": 0.0969, "lr": 5.933686682416758e-06, "epoch": 3.9951573849878934, "percentage": 79.9, "elapsed_time": "0:13:05", "remaining_time": "0:03:17", "throughput": 1809.72, "total_tokens": 1421120} +{"current_steps": 1655, "total_steps": 2065, "loss": 0.09, "lr": 5.797662970087184e-06, "epoch": 4.00726392251816, "percentage": 80.15, "elapsed_time": "0:13:05", "remaining_time": "0:03:14", "throughput": 1813.5, "total_tokens": 1424944} +{"current_steps": 1660, "total_steps": 2065, "loss": 0.0897, "lr": 5.663011718200201e-06, "epoch": 4.019370460048426, "percentage": 80.39, "elapsed_time": "0:13:06", "remaining_time": "0:03:11", "throughput": 1818.17, "total_tokens": 1429296} +{"current_steps": 1664, "total_steps": 2065, "eval_loss": 0.2532218098640442, "epoch": 4.0290556900726395, "percentage": 80.58, "elapsed_time": "0:13:07", "remaining_time": "0:03:09", "throughput": 1820.54, "total_tokens": 1432880} +{"current_steps": 1665, "total_steps": 2065, "loss": 0.0316, "lr": 5.529742550767544e-06, "epoch": 4.031476997578692, "percentage": 80.63, "elapsed_time": "0:13:55", "remaining_time": "0:03:20", "throughput": 1715.95, "total_tokens": 1433776} +{"current_steps": 1670, "total_steps": 2065, "loss": 0.0492, "lr": 5.397864993018367e-06, "epoch": 4.043583535108959, "percentage": 80.87, "elapsed_time": "0:13:55", "remaining_time": "0:03:17", "throughput": 1720.23, "total_tokens": 1438000} +{"current_steps": 1675, "total_steps": 2065, "loss": 0.029, "lr": 5.267388470718449e-06, "epoch": 4.0556900726392255, "percentage": 81.11, "elapsed_time": "0:13:56", "remaining_time": "0:03:14", "throughput": 1724.67, "total_tokens": 1442352} +{"current_steps": 1680, "total_steps": 2065, "loss": 0.052, "lr": 5.138322309496504e-06, "epoch": 4.067796610169491, "percentage": 81.36, "elapsed_time": "0:13:56", "remaining_time": "0:03:11", "throughput": 1729.11, "total_tokens": 1446704} +{"current_steps": 1685, "total_steps": 2065, "loss": 0.0469, "lr": 5.010675734177631e-06, "epoch": 4.079903147699758, "percentage": 81.6, "elapsed_time": "0:13:57", "remaining_time": "0:03:08", "throughput": 1733.32, "total_tokens": 1450864} +{"current_steps": 1690, "total_steps": 2065, "loss": 0.0316, "lr": 4.884457868124001e-06, "epoch": 4.092009685230024, "percentage": 81.84, "elapsed_time": "0:13:57", "remaining_time": "0:03:05", "throughput": 1737.6, "total_tokens": 1455088} +{"current_steps": 1695, "total_steps": 2065, "loss": 0.0228, "lr": 4.759677732582782e-06, "epoch": 4.1041162227602905, "percentage": 82.08, "elapsed_time": "0:13:57", "remaining_time": "0:03:02", "throughput": 1741.94, "total_tokens": 1459376} +{"current_steps": 1700, "total_steps": 2065, "loss": 0.0529, "lr": 4.636344246041321e-06, "epoch": 4.116222760290557, "percentage": 82.32, "elapsed_time": "0:13:58", "remaining_time": "0:02:59", "throughput": 1746.21, "total_tokens": 1463600} +{"current_steps": 1705, "total_steps": 2065, "loss": 0.0565, "lr": 4.514466223589753e-06, "epoch": 4.128329297820823, "percentage": 82.57, "elapsed_time": "0:13:58", "remaining_time": "0:02:57", "throughput": 1750.77, "total_tokens": 1468080} +{"current_steps": 1710, "total_steps": 2065, "loss": 0.0695, "lr": 4.3940523762909135e-06, "epoch": 4.14043583535109, "percentage": 82.81, "elapsed_time": "0:13:58", "remaining_time": "0:02:54", "throughput": 1755.4, "total_tokens": 1472624} +{"current_steps": 1715, "total_steps": 2065, "loss": 0.0511, "lr": 4.275111310557758e-06, "epoch": 4.1525423728813555, "percentage": 83.05, "elapsed_time": "0:13:59", "remaining_time": "0:02:51", "throughput": 1759.87, "total_tokens": 1477040} +{"current_steps": 1720, "total_steps": 2065, "loss": 0.0311, "lr": 4.1576515275382226e-06, "epoch": 4.164648910411622, "percentage": 83.29, "elapsed_time": "0:13:59", "remaining_time": "0:02:48", "throughput": 1764.19, "total_tokens": 1481328} +{"current_steps": 1725, "total_steps": 2065, "loss": 0.0394, "lr": 4.0416814225076035e-06, "epoch": 4.176755447941889, "percentage": 83.54, "elapsed_time": "0:14:00", "remaining_time": "0:02:45", "throughput": 1768.73, "total_tokens": 1485808} +{"current_steps": 1730, "total_steps": 2065, "loss": 0.0255, "lr": 3.9272092842685345e-06, "epoch": 4.188861985472155, "percentage": 83.78, "elapsed_time": "0:14:00", "remaining_time": "0:02:42", "throughput": 1773.13, "total_tokens": 1490160} +{"current_steps": 1735, "total_steps": 2065, "loss": 0.0073, "lr": 3.814243294558542e-06, "epoch": 4.200968523002421, "percentage": 84.02, "elapsed_time": "0:14:00", "remaining_time": "0:02:39", "throughput": 1777.51, "total_tokens": 1494512} +{"current_steps": 1740, "total_steps": 2065, "loss": 0.0562, "lr": 3.702791527465274e-06, "epoch": 4.213075060532688, "percentage": 84.26, "elapsed_time": "0:14:01", "remaining_time": "0:02:37", "throughput": 1781.45, "total_tokens": 1498480} +{"current_steps": 1745, "total_steps": 2065, "loss": 0.0463, "lr": 3.592861948849416e-06, "epoch": 4.225181598062954, "percentage": 84.5, "elapsed_time": "0:14:01", "remaining_time": "0:02:34", "throughput": 1785.76, "total_tokens": 1502768} +{"current_steps": 1750, "total_steps": 2065, "loss": 0.0429, "lr": 3.484462415775333e-06, "epoch": 4.237288135593221, "percentage": 84.75, "elapsed_time": "0:14:01", "remaining_time": "0:02:31", "throughput": 1789.99, "total_tokens": 1506992} +{"current_steps": 1755, "total_steps": 2065, "loss": 0.0035, "lr": 3.377600675949527e-06, "epoch": 4.249394673123486, "percentage": 84.99, "elapsed_time": "0:14:02", "remaining_time": "0:02:28", "throughput": 1794.51, "total_tokens": 1511472} +{"current_steps": 1760, "total_steps": 2065, "loss": 0.0395, "lr": 3.272284367166825e-06, "epoch": 4.261501210653753, "percentage": 85.23, "elapsed_time": "0:14:02", "remaining_time": "0:02:26", "throughput": 1798.87, "total_tokens": 1515824} +{"current_steps": 1765, "total_steps": 2065, "loss": 0.0337, "lr": 3.1685210167645335e-06, "epoch": 4.27360774818402, "percentage": 85.47, "elapsed_time": "0:14:03", "remaining_time": "0:02:23", "throughput": 1803.24, "total_tokens": 1520176} +{"current_steps": 1768, "total_steps": 2065, "eval_loss": 0.4314914643764496, "epoch": 4.280871670702179, "percentage": 85.62, "elapsed_time": "0:14:04", "remaining_time": "0:02:21", "throughput": 1803.91, "total_tokens": 1522544} +{"current_steps": 1770, "total_steps": 2065, "loss": 0.008, "lr": 3.0663180410843982e-06, "epoch": 4.285714285714286, "percentage": 85.71, "elapsed_time": "0:14:32", "remaining_time": "0:02:25", "throughput": 1747.79, "total_tokens": 1524336} +{"current_steps": 1775, "total_steps": 2065, "loss": 0.1379, "lr": 2.9656827449425494e-06, "epoch": 4.297820823244552, "percentage": 85.96, "elapsed_time": "0:14:32", "remaining_time": "0:02:22", "throughput": 1751.87, "total_tokens": 1528560} +{"current_steps": 1780, "total_steps": 2065, "loss": 0.0391, "lr": 2.86662232110739e-06, "epoch": 4.309927360774818, "percentage": 86.2, "elapsed_time": "0:14:32", "remaining_time": "0:02:19", "throughput": 1755.89, "total_tokens": 1532720} +{"current_steps": 1785, "total_steps": 2065, "loss": 0.0481, "lr": 2.7691438497855134e-06, "epoch": 4.322033898305085, "percentage": 86.44, "elapsed_time": "0:14:33", "remaining_time": "0:02:16", "throughput": 1759.98, "total_tokens": 1536944} +{"current_steps": 1790, "total_steps": 2065, "loss": 0.0365, "lr": 2.673254298115646e-06, "epoch": 4.3341404358353515, "percentage": 86.68, "elapsed_time": "0:14:33", "remaining_time": "0:02:14", "throughput": 1764.07, "total_tokens": 1541168} +{"current_steps": 1795, "total_steps": 2065, "loss": 0.0094, "lr": 2.5789605196706674e-06, "epoch": 4.346246973365617, "percentage": 86.92, "elapsed_time": "0:14:34", "remaining_time": "0:02:11", "throughput": 1768.23, "total_tokens": 1545456} +{"current_steps": 1800, "total_steps": 2065, "loss": 0.0798, "lr": 2.4862692539677906e-06, "epoch": 4.358353510895884, "percentage": 87.17, "elapsed_time": "0:14:34", "remaining_time": "0:02:08", "throughput": 1772.52, "total_tokens": 1549872} +{"current_steps": 1805, "total_steps": 2065, "loss": 0.113, "lr": 2.3951871259868503e-06, "epoch": 4.37046004842615, "percentage": 87.41, "elapsed_time": "0:14:34", "remaining_time": "0:02:06", "throughput": 1776.82, "total_tokens": 1554288} +{"current_steps": 1810, "total_steps": 2065, "loss": 0.1113, "lr": 2.3057206456967905e-06, "epoch": 4.3825665859564165, "percentage": 87.65, "elapsed_time": "0:14:35", "remaining_time": "0:02:03", "throughput": 1780.75, "total_tokens": 1558384} +{"current_steps": 1815, "total_steps": 2065, "loss": 0.0523, "lr": 2.217876207590375e-06, "epoch": 4.394673123486683, "percentage": 87.89, "elapsed_time": "0:14:35", "remaining_time": "0:02:00", "throughput": 1784.75, "total_tokens": 1562544} +{"current_steps": 1820, "total_steps": 2065, "loss": 0.0659, "lr": 2.131660090227139e-06, "epoch": 4.406779661016949, "percentage": 88.14, "elapsed_time": "0:14:35", "remaining_time": "0:01:57", "throughput": 1789.31, "total_tokens": 1567216} +{"current_steps": 1825, "total_steps": 2065, "loss": 0.0756, "lr": 2.0470784557846652e-06, "epoch": 4.418886198547216, "percentage": 88.38, "elapsed_time": "0:14:36", "remaining_time": "0:01:55", "throughput": 1793.51, "total_tokens": 1571568} +{"current_steps": 1830, "total_steps": 2065, "loss": 0.0018, "lr": 1.964137349618114e-06, "epoch": 4.4309927360774815, "percentage": 88.62, "elapsed_time": "0:14:36", "remaining_time": "0:01:52", "throughput": 1797.57, "total_tokens": 1575792} +{"current_steps": 1835, "total_steps": 2065, "loss": 0.0419, "lr": 1.8828426998281689e-06, "epoch": 4.443099273607748, "percentage": 88.86, "elapsed_time": "0:14:36", "remaining_time": "0:01:49", "throughput": 1801.7, "total_tokens": 1580080} +{"current_steps": 1840, "total_steps": 2065, "loss": 0.0692, "lr": 1.8032003168373306e-06, "epoch": 4.455205811138015, "percentage": 89.1, "elapsed_time": "0:14:37", "remaining_time": "0:01:47", "throughput": 1805.53, "total_tokens": 1584112} +{"current_steps": 1845, "total_steps": 2065, "loss": 0.0456, "lr": 1.7252158929746131e-06, "epoch": 4.467312348668281, "percentage": 89.35, "elapsed_time": "0:14:37", "remaining_time": "0:01:44", "throughput": 1809.65, "total_tokens": 1588400} +{"current_steps": 1850, "total_steps": 2065, "loss": 0.0504, "lr": 1.6488950020686955e-06, "epoch": 4.479418886198547, "percentage": 89.59, "elapsed_time": "0:14:38", "remaining_time": "0:01:42", "throughput": 1813.9, "total_tokens": 1592816} +{"current_steps": 1855, "total_steps": 2065, "loss": 0.0573, "lr": 1.5742430990495466e-06, "epoch": 4.491525423728813, "percentage": 89.83, "elapsed_time": "0:14:38", "remaining_time": "0:01:39", "throughput": 1818.23, "total_tokens": 1597296} +{"current_steps": 1860, "total_steps": 2065, "loss": 0.0293, "lr": 1.5012655195585368e-06, "epoch": 4.50363196125908, "percentage": 90.07, "elapsed_time": "0:14:38", "remaining_time": "0:01:36", "throughput": 1822.4, "total_tokens": 1601648} +{"current_steps": 1865, "total_steps": 2065, "loss": 0.1156, "lr": 1.4299674795670764e-06, "epoch": 4.5157384987893465, "percentage": 90.31, "elapsed_time": "0:14:39", "remaining_time": "0:01:34", "throughput": 1826.5, "total_tokens": 1605936} +{"current_steps": 1870, "total_steps": 2065, "loss": 0.126, "lr": 1.360354075003828e-06, "epoch": 4.527845036319612, "percentage": 90.56, "elapsed_time": "0:14:39", "remaining_time": "0:01:31", "throughput": 1830.46, "total_tokens": 1610096} +{"current_steps": 1872, "total_steps": 2065, "eval_loss": 0.42201921343803406, "epoch": 4.532687651331719, "percentage": 90.65, "elapsed_time": "0:14:40", "remaining_time": "0:01:30", "throughput": 1830.66, "total_tokens": 1611760} +{"current_steps": 1875, "total_steps": 2065, "loss": 0.0436, "lr": 1.2924302813904582e-06, "epoch": 4.539951573849879, "percentage": 90.8, "elapsed_time": "0:15:21", "remaining_time": "0:01:33", "throughput": 1752.76, "total_tokens": 1614384} +{"current_steps": 1880, "total_steps": 2065, "loss": 0.0591, "lr": 1.226200953486037e-06, "epoch": 4.552058111380145, "percentage": 91.04, "elapsed_time": "0:15:21", "remaining_time": "0:01:30", "throughput": 1756.85, "total_tokens": 1618800} +{"current_steps": 1885, "total_steps": 2065, "loss": 0.0027, "lr": 1.1616708249400449e-06, "epoch": 4.5641646489104115, "percentage": 91.28, "elapsed_time": "0:15:21", "remaining_time": "0:01:28", "throughput": 1760.65, "total_tokens": 1622960} +{"current_steps": 1890, "total_steps": 2065, "loss": 0.037, "lr": 1.0988445079540388e-06, "epoch": 4.576271186440678, "percentage": 91.53, "elapsed_time": "0:15:22", "remaining_time": "0:01:25", "throughput": 1764.39, "total_tokens": 1627056} +{"current_steps": 1895, "total_steps": 2065, "loss": 0.0205, "lr": 1.0377264929520125e-06, "epoch": 4.588377723970944, "percentage": 91.77, "elapsed_time": "0:15:22", "remaining_time": "0:01:22", "throughput": 1768.39, "total_tokens": 1631408} +{"current_steps": 1900, "total_steps": 2065, "loss": 0.0687, "lr": 9.783211482594285e-07, "epoch": 4.600484261501211, "percentage": 92.01, "elapsed_time": "0:15:22", "remaining_time": "0:01:20", "throughput": 1772.52, "total_tokens": 1635888} +{"current_steps": 1905, "total_steps": 2065, "loss": 0.0049, "lr": 9.206327197910203e-07, "epoch": 4.6125907990314765, "percentage": 92.25, "elapsed_time": "0:15:23", "remaining_time": "0:01:17", "throughput": 1776.45, "total_tokens": 1640176} +{"current_steps": 1910, "total_steps": 2065, "loss": 0.056, "lr": 8.646653307473079e-07, "epoch": 4.624697336561743, "percentage": 92.49, "elapsed_time": "0:15:23", "remaining_time": "0:01:14", "throughput": 1780.44, "total_tokens": 1644528} +{"current_steps": 1915, "total_steps": 2065, "loss": 0.002, "lr": 8.10422981319911e-07, "epoch": 4.63680387409201, "percentage": 92.74, "elapsed_time": "0:15:24", "remaining_time": "0:01:12", "throughput": 1784.83, "total_tokens": 1649264} +{"current_steps": 1920, "total_steps": 2065, "loss": 0.0111, "lr": 7.579095484056192e-07, "epoch": 4.648910411622276, "percentage": 92.98, "elapsed_time": "0:15:24", "remaining_time": "0:01:09", "throughput": 1789.01, "total_tokens": 1653808} +{"current_steps": 1925, "total_steps": 2065, "loss": 0.0023, "lr": 7.07128785329314e-07, "epoch": 4.661016949152542, "percentage": 93.22, "elapsed_time": "0:15:24", "remaining_time": "0:01:07", "throughput": 1793.13, "total_tokens": 1658288} +{"current_steps": 1930, "total_steps": 2065, "loss": 0.0228, "lr": 6.580843215757082e-07, "epoch": 4.673123486682809, "percentage": 93.46, "elapsed_time": "0:15:25", "remaining_time": "0:01:04", "throughput": 1797.04, "total_tokens": 1662576} +{"current_steps": 1935, "total_steps": 2065, "loss": 0.0221, "lr": 6.107796625299117e-07, "epoch": 4.685230024213075, "percentage": 93.7, "elapsed_time": "0:15:25", "remaining_time": "0:01:02", "throughput": 1801.15, "total_tokens": 1667056} +{"current_steps": 1940, "total_steps": 2065, "loss": 0.0733, "lr": 5.652181892269181e-07, "epoch": 4.697336561743342, "percentage": 93.95, "elapsed_time": "0:15:25", "remaining_time": "0:00:59", "throughput": 1805.25, "total_tokens": 1671536} +{"current_steps": 1945, "total_steps": 2065, "loss": 0.0023, "lr": 5.214031581099149e-07, "epoch": 4.709443099273607, "percentage": 94.19, "elapsed_time": "0:15:26", "remaining_time": "0:00:57", "throughput": 1809.22, "total_tokens": 1675888} +{"current_steps": 1950, "total_steps": 2065, "loss": 0.0341, "lr": 4.793377007975719e-07, "epoch": 4.721549636803874, "percentage": 94.43, "elapsed_time": "0:15:26", "remaining_time": "0:00:54", "throughput": 1813.12, "total_tokens": 1680176} +{"current_steps": 1955, "total_steps": 2065, "loss": 0.0568, "lr": 4.3902482386018186e-07, "epoch": 4.733656174334141, "percentage": 94.67, "elapsed_time": "0:15:27", "remaining_time": "0:00:52", "throughput": 1816.95, "total_tokens": 1684400} +{"current_steps": 1960, "total_steps": 2065, "loss": 0.1211, "lr": 4.004674086047905e-07, "epoch": 4.745762711864407, "percentage": 94.92, "elapsed_time": "0:15:27", "remaining_time": "0:00:49", "throughput": 1820.97, "total_tokens": 1688816} +{"current_steps": 1965, "total_steps": 2065, "loss": 0.0408, "lr": 3.636682108692502e-07, "epoch": 4.757869249394673, "percentage": 95.16, "elapsed_time": "0:15:27", "remaining_time": "0:00:47", "throughput": 1825.13, "total_tokens": 1693360} +{"current_steps": 1970, "total_steps": 2065, "loss": 0.0647, "lr": 3.2862986082524416e-07, "epoch": 4.76997578692494, "percentage": 95.4, "elapsed_time": "0:15:28", "remaining_time": "0:00:44", "throughput": 1828.94, "total_tokens": 1697584} +{"current_steps": 1975, "total_steps": 2065, "loss": 0.0336, "lr": 2.953548627903202e-07, "epoch": 4.782082324455206, "percentage": 95.64, "elapsed_time": "0:15:28", "remaining_time": "0:00:42", "throughput": 1832.96, "total_tokens": 1702000} +{"current_steps": 1976, "total_steps": 2065, "eval_loss": 0.4348176121711731, "epoch": 4.784503631961259, "percentage": 95.69, "elapsed_time": "0:15:29", "remaining_time": "0:00:41", "throughput": 1832.42, "total_tokens": 1702832} +{"current_steps": 1980, "total_steps": 2065, "loss": 0.1448, "lr": 2.6384559504886166e-07, "epoch": 4.7941888619854724, "percentage": 95.88, "elapsed_time": "0:16:21", "remaining_time": "0:00:42", "throughput": 1737.89, "total_tokens": 1706416} +{"current_steps": 1985, "total_steps": 2065, "loss": 0.0163, "lr": 2.3410430968214824e-07, "epoch": 4.806295399515738, "percentage": 96.13, "elapsed_time": "0:16:22", "remaining_time": "0:00:39", "throughput": 1741.85, "total_tokens": 1710960} +{"current_steps": 1990, "total_steps": 2065, "loss": 0.1048, "lr": 2.0613313240735454e-07, "epoch": 4.818401937046005, "percentage": 96.37, "elapsed_time": "0:16:22", "remaining_time": "0:00:37", "throughput": 1745.75, "total_tokens": 1715440} +{"current_steps": 1995, "total_steps": 2065, "loss": 0.0295, "lr": 1.7993406242563238e-07, "epoch": 4.830508474576272, "percentage": 96.61, "elapsed_time": "0:16:23", "remaining_time": "0:00:34", "throughput": 1749.45, "total_tokens": 1719728} +{"current_steps": 2000, "total_steps": 2065, "loss": 0.0007, "lr": 1.5550897227922523e-07, "epoch": 4.842615012106537, "percentage": 96.85, "elapsed_time": "0:16:23", "remaining_time": "0:00:31", "throughput": 1753.41, "total_tokens": 1724272} +{"current_steps": 2005, "total_steps": 2065, "loss": 0.064, "lr": 1.3285960771761697e-07, "epoch": 4.854721549636804, "percentage": 97.09, "elapsed_time": "0:16:23", "remaining_time": "0:00:29", "throughput": 1757.11, "total_tokens": 1728560} +{"current_steps": 2010, "total_steps": 2065, "loss": 0.0289, "lr": 1.119875875727705e-07, "epoch": 4.86682808716707, "percentage": 97.34, "elapsed_time": "0:16:24", "remaining_time": "0:00:26", "throughput": 1761.06, "total_tokens": 1733104} +{"current_steps": 2015, "total_steps": 2065, "loss": 0.0127, "lr": 9.289440364341485e-08, "epoch": 4.878934624697337, "percentage": 97.58, "elapsed_time": "0:16:24", "remaining_time": "0:00:24", "throughput": 1764.62, "total_tokens": 1737264} +{"current_steps": 2020, "total_steps": 2065, "loss": 0.0664, "lr": 7.558142058842754e-08, "epoch": 4.891041162227603, "percentage": 97.82, "elapsed_time": "0:16:24", "remaining_time": "0:00:21", "throughput": 1768.18, "total_tokens": 1741424} +{"current_steps": 2025, "total_steps": 2065, "loss": 0.0657, "lr": 6.004987582929055e-08, "epoch": 4.903147699757869, "percentage": 98.06, "elapsed_time": "0:16:25", "remaining_time": "0:00:19", "throughput": 1771.8, "total_tokens": 1745648} +{"current_steps": 2030, "total_steps": 2065, "loss": 0.045, "lr": 4.63008794616554e-08, "epoch": 4.915254237288136, "percentage": 98.31, "elapsed_time": "0:16:25", "remaining_time": "0:00:16", "throughput": 1775.41, "total_tokens": 1749872} +{"current_steps": 2035, "total_steps": 2065, "loss": 0.0431, "lr": 3.433541417599551e-08, "epoch": 4.927360774818402, "percentage": 98.55, "elapsed_time": "0:16:25", "remaining_time": "0:00:14", "throughput": 1779.22, "total_tokens": 1754288} +{"current_steps": 2040, "total_steps": 2065, "loss": 0.0332, "lr": 2.4154335187365207e-08, "epoch": 4.939467312348668, "percentage": 98.79, "elapsed_time": "0:16:26", "remaining_time": "0:00:12", "throughput": 1782.96, "total_tokens": 1758640} +{"current_steps": 2045, "total_steps": 2065, "loss": 0.0602, "lr": 1.5758370174284722e-08, "epoch": 4.951573849878935, "percentage": 99.03, "elapsed_time": "0:16:26", "remaining_time": "0:00:09", "throughput": 1786.63, "total_tokens": 1762928} +{"current_steps": 2050, "total_steps": 2065, "loss": 0.0118, "lr": 9.14811922672898e-09, "epoch": 4.963680387409201, "percentage": 99.27, "elapsed_time": "0:16:27", "remaining_time": "0:00:07", "throughput": 1790.43, "total_tokens": 1767344} +{"current_steps": 2055, "total_steps": 2065, "loss": 0.0392, "lr": 4.324054803223065e-09, "epoch": 4.9757869249394675, "percentage": 99.52, "elapsed_time": "0:16:27", "remaining_time": "0:00:04", "throughput": 1794.1, "total_tokens": 1771632} +{"current_steps": 2060, "total_steps": 2065, "loss": 0.0333, "lr": 1.286521697091425e-09, "epoch": 4.987893462469733, "percentage": 99.76, "elapsed_time": "0:16:27", "remaining_time": "0:00:02", "throughput": 1797.58, "total_tokens": 1775728} +{"current_steps": 2065, "total_steps": 2065, "loss": 0.0653, "lr": 3.5737011805370145e-11, "epoch": 5.0, "percentage": 100.0, "elapsed_time": "0:16:28", "remaining_time": "0:00:00", "throughput": 1801.14, "total_tokens": 1780000} +{"current_steps": 2065, "total_steps": 2065, "epoch": 5.0, "percentage": 100.0, "elapsed_time": "0:16:54", "remaining_time": "0:00:00", "throughput": 1753.7, "total_tokens": 1780000} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..08037a7 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,3519 @@ +{ + "best_global_step": 104, + "best_metric": 0.17402823269367218, + "best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_mrpc_42_1774791061/checkpoint-104", + "epoch": 5.0, + "eval_steps": 104, + "global_step": 2065, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012106537530266344, + "grad_norm": 262.4778747558594, + "learning_rate": 9.66183574879227e-07, + "loss": 0.7681, + "num_input_tokens_seen": 4352, + "step": 5 + }, + { + "epoch": 0.024213075060532687, + "grad_norm": 26.363384246826172, + "learning_rate": 2.173913043478261e-06, + "loss": 0.3056, + "num_input_tokens_seen": 8768, + "step": 10 + }, + { + "epoch": 0.03631961259079903, + "grad_norm": 10.327119827270508, + "learning_rate": 3.3816425120772947e-06, + "loss": 0.183, + "num_input_tokens_seen": 12992, + "step": 15 + }, + { + "epoch": 0.048426150121065374, + "grad_norm": 36.403324127197266, + "learning_rate": 4.589371980676329e-06, + "loss": 0.4041, + "num_input_tokens_seen": 17344, + "step": 20 + }, + { + "epoch": 0.06053268765133172, + "grad_norm": 8.729621887207031, + "learning_rate": 5.797101449275362e-06, + "loss": 0.4147, + "num_input_tokens_seen": 21696, + "step": 25 + }, + { + "epoch": 0.07263922518159806, + "grad_norm": 4.769359111785889, + "learning_rate": 7.004830917874397e-06, + "loss": 0.2132, + "num_input_tokens_seen": 26112, + "step": 30 + }, + { + "epoch": 0.0847457627118644, + "grad_norm": 4.588466644287109, + "learning_rate": 8.212560386473431e-06, + "loss": 0.2587, + "num_input_tokens_seen": 30208, + "step": 35 + }, + { + "epoch": 0.09685230024213075, + "grad_norm": 21.823162078857422, + "learning_rate": 9.420289855072464e-06, + "loss": 0.2076, + "num_input_tokens_seen": 34688, + "step": 40 + }, + { + "epoch": 0.1089588377723971, + "grad_norm": 18.038860321044922, + "learning_rate": 1.0628019323671499e-05, + "loss": 0.1842, + "num_input_tokens_seen": 38784, + "step": 45 + }, + { + "epoch": 0.12106537530266344, + "grad_norm": 12.918279647827148, + "learning_rate": 1.1835748792270531e-05, + "loss": 0.3012, + "num_input_tokens_seen": 43200, + "step": 50 + }, + { + "epoch": 0.13317191283292978, + "grad_norm": 24.635744094848633, + "learning_rate": 1.3043478260869566e-05, + "loss": 0.1951, + "num_input_tokens_seen": 47296, + "step": 55 + }, + { + "epoch": 0.14527845036319612, + "grad_norm": 14.053600311279297, + "learning_rate": 1.4251207729468599e-05, + "loss": 0.2332, + "num_input_tokens_seen": 51712, + "step": 60 + }, + { + "epoch": 0.15738498789346247, + "grad_norm": 8.166345596313477, + "learning_rate": 1.5458937198067633e-05, + "loss": 0.2049, + "num_input_tokens_seen": 55872, + "step": 65 + }, + { + "epoch": 0.1694915254237288, + "grad_norm": 27.84511947631836, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.2103, + "num_input_tokens_seen": 59840, + "step": 70 + }, + { + "epoch": 0.18159806295399517, + "grad_norm": 55.02257537841797, + "learning_rate": 1.78743961352657e-05, + "loss": 0.3072, + "num_input_tokens_seen": 64000, + "step": 75 + }, + { + "epoch": 0.1937046004842615, + "grad_norm": 11.449199676513672, + "learning_rate": 1.9082125603864733e-05, + "loss": 0.3841, + "num_input_tokens_seen": 68352, + "step": 80 + }, + { + "epoch": 0.20581113801452786, + "grad_norm": 11.381855964660645, + "learning_rate": 2.028985507246377e-05, + "loss": 0.232, + "num_input_tokens_seen": 72768, + "step": 85 + }, + { + "epoch": 0.2179176755447942, + "grad_norm": 42.495670318603516, + "learning_rate": 2.1497584541062805e-05, + "loss": 0.2474, + "num_input_tokens_seen": 77120, + "step": 90 + }, + { + "epoch": 0.23002421307506055, + "grad_norm": 21.28970718383789, + "learning_rate": 2.2705314009661836e-05, + "loss": 0.1841, + "num_input_tokens_seen": 81664, + "step": 95 + }, + { + "epoch": 0.24213075060532688, + "grad_norm": 17.023759841918945, + "learning_rate": 2.391304347826087e-05, + "loss": 0.1681, + "num_input_tokens_seen": 86080, + "step": 100 + }, + { + "epoch": 0.25181598062953997, + "eval_loss": 0.17402823269367218, + "eval_runtime": 0.639, + "eval_samples_per_second": 574.368, + "eval_steps_per_second": 71.992, + "num_input_tokens_seen": 89600, + "step": 104 + }, + { + "epoch": 0.2542372881355932, + "grad_norm": 16.713178634643555, + "learning_rate": 2.5120772946859905e-05, + "loss": 0.1488, + "num_input_tokens_seen": 90432, + "step": 105 + }, + { + "epoch": 0.26634382566585957, + "grad_norm": 6.363961219787598, + "learning_rate": 2.632850241545894e-05, + "loss": 0.2051, + "num_input_tokens_seen": 94528, + "step": 110 + }, + { + "epoch": 0.2784503631961259, + "grad_norm": 7.700758934020996, + "learning_rate": 2.753623188405797e-05, + "loss": 0.16, + "num_input_tokens_seen": 98816, + "step": 115 + }, + { + "epoch": 0.29055690072639223, + "grad_norm": 8.657270431518555, + "learning_rate": 2.8743961352657005e-05, + "loss": 0.205, + "num_input_tokens_seen": 103104, + "step": 120 + }, + { + "epoch": 0.3026634382566586, + "grad_norm": 7.297232151031494, + "learning_rate": 2.995169082125604e-05, + "loss": 0.1846, + "num_input_tokens_seen": 107328, + "step": 125 + }, + { + "epoch": 0.31476997578692495, + "grad_norm": 13.21757984161377, + "learning_rate": 3.1159420289855074e-05, + "loss": 0.2243, + "num_input_tokens_seen": 111488, + "step": 130 + }, + { + "epoch": 0.3268765133171913, + "grad_norm": 6.457214832305908, + "learning_rate": 3.236714975845411e-05, + "loss": 0.2013, + "num_input_tokens_seen": 115968, + "step": 135 + }, + { + "epoch": 0.3389830508474576, + "grad_norm": 29.321474075317383, + "learning_rate": 3.357487922705314e-05, + "loss": 0.2278, + "num_input_tokens_seen": 120192, + "step": 140 + }, + { + "epoch": 0.35108958837772397, + "grad_norm": 10.676529884338379, + "learning_rate": 3.478260869565218e-05, + "loss": 0.1886, + "num_input_tokens_seen": 124416, + "step": 145 + }, + { + "epoch": 0.36319612590799033, + "grad_norm": 11.802507400512695, + "learning_rate": 3.5990338164251205e-05, + "loss": 0.1635, + "num_input_tokens_seen": 128832, + "step": 150 + }, + { + "epoch": 0.37530266343825663, + "grad_norm": 9.175806999206543, + "learning_rate": 3.719806763285024e-05, + "loss": 0.2118, + "num_input_tokens_seen": 132992, + "step": 155 + }, + { + "epoch": 0.387409200968523, + "grad_norm": 17.557262420654297, + "learning_rate": 3.8405797101449274e-05, + "loss": 0.3186, + "num_input_tokens_seen": 137280, + "step": 160 + }, + { + "epoch": 0.39951573849878935, + "grad_norm": 31.175756454467773, + "learning_rate": 3.961352657004831e-05, + "loss": 0.2002, + "num_input_tokens_seen": 141568, + "step": 165 + }, + { + "epoch": 0.4116222760290557, + "grad_norm": 12.988505363464355, + "learning_rate": 4.082125603864734e-05, + "loss": 0.1792, + "num_input_tokens_seen": 145984, + "step": 170 + }, + { + "epoch": 0.423728813559322, + "grad_norm": 43.43312454223633, + "learning_rate": 4.202898550724638e-05, + "loss": 0.3197, + "num_input_tokens_seen": 150144, + "step": 175 + }, + { + "epoch": 0.4358353510895884, + "grad_norm": 10.99770736694336, + "learning_rate": 4.323671497584541e-05, + "loss": 0.3561, + "num_input_tokens_seen": 154624, + "step": 180 + }, + { + "epoch": 0.44794188861985473, + "grad_norm": 8.507532119750977, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.373, + "num_input_tokens_seen": 158784, + "step": 185 + }, + { + "epoch": 0.4600484261501211, + "grad_norm": 129.54592895507812, + "learning_rate": 4.565217391304348e-05, + "loss": 0.3924, + "num_input_tokens_seen": 163072, + "step": 190 + }, + { + "epoch": 0.4721549636803874, + "grad_norm": 15.108623504638672, + "learning_rate": 4.6859903381642516e-05, + "loss": 0.2368, + "num_input_tokens_seen": 167104, + "step": 195 + }, + { + "epoch": 0.48426150121065376, + "grad_norm": 9.902148246765137, + "learning_rate": 4.806763285024155e-05, + "loss": 0.4497, + "num_input_tokens_seen": 171456, + "step": 200 + }, + { + "epoch": 0.4963680387409201, + "grad_norm": 16.188369750976562, + "learning_rate": 4.9275362318840584e-05, + "loss": 0.2715, + "num_input_tokens_seen": 175808, + "step": 205 + }, + { + "epoch": 0.5036319612590799, + "eval_loss": 0.23122040927410126, + "eval_runtime": 0.6326, + "eval_samples_per_second": 580.165, + "eval_steps_per_second": 72.718, + "num_input_tokens_seen": 178688, + "step": 208 + }, + { + "epoch": 0.5084745762711864, + "grad_norm": 0.7912328839302063, + "learning_rate": 4.9999857052054956e-05, + "loss": 0.1981, + "num_input_tokens_seen": 180224, + "step": 210 + }, + { + "epoch": 0.5205811138014528, + "grad_norm": 4.983211040496826, + "learning_rate": 4.999824890644693e-05, + "loss": 0.1989, + "num_input_tokens_seen": 184704, + "step": 215 + }, + { + "epoch": 0.5326876513317191, + "grad_norm": 16.626827239990234, + "learning_rate": 4.9994854045622684e-05, + "loss": 0.2336, + "num_input_tokens_seen": 189184, + "step": 220 + }, + { + "epoch": 0.5447941888619855, + "grad_norm": 5.18185567855835, + "learning_rate": 4.9989672712225204e-05, + "loss": 0.1595, + "num_input_tokens_seen": 193536, + "step": 225 + }, + { + "epoch": 0.5569007263922519, + "grad_norm": 8.547920227050781, + "learning_rate": 4.998270527658311e-05, + "loss": 0.2147, + "num_input_tokens_seen": 197888, + "step": 230 + }, + { + "epoch": 0.5690072639225182, + "grad_norm": 1.19011652469635, + "learning_rate": 4.9973952236684216e-05, + "loss": 0.1959, + "num_input_tokens_seen": 202112, + "step": 235 + }, + { + "epoch": 0.5811138014527845, + "grad_norm": 12.658636093139648, + "learning_rate": 4.996341421813993e-05, + "loss": 0.2085, + "num_input_tokens_seen": 206528, + "step": 240 + }, + { + "epoch": 0.5932203389830508, + "grad_norm": 20.122756958007812, + "learning_rate": 4.9951091974140506e-05, + "loss": 0.2304, + "num_input_tokens_seen": 210944, + "step": 245 + }, + { + "epoch": 0.6053268765133172, + "grad_norm": 10.99802303314209, + "learning_rate": 4.99369863854013e-05, + "loss": 0.2171, + "num_input_tokens_seen": 215104, + "step": 250 + }, + { + "epoch": 0.6174334140435835, + "grad_norm": 7.956684112548828, + "learning_rate": 4.992109846009972e-05, + "loss": 0.2458, + "num_input_tokens_seen": 219328, + "step": 255 + }, + { + "epoch": 0.6295399515738499, + "grad_norm": 19.862939834594727, + "learning_rate": 4.990342933380321e-05, + "loss": 0.219, + "num_input_tokens_seen": 223680, + "step": 260 + }, + { + "epoch": 0.6416464891041163, + "grad_norm": 7.302405834197998, + "learning_rate": 4.9883980269388106e-05, + "loss": 0.3803, + "num_input_tokens_seen": 227904, + "step": 265 + }, + { + "epoch": 0.6537530266343826, + "grad_norm": 9.361984252929688, + "learning_rate": 4.986275265694935e-05, + "loss": 0.3005, + "num_input_tokens_seen": 231936, + "step": 270 + }, + { + "epoch": 0.6658595641646489, + "grad_norm": 16.678607940673828, + "learning_rate": 4.9839748013701145e-05, + "loss": 0.2954, + "num_input_tokens_seen": 236160, + "step": 275 + }, + { + "epoch": 0.6779661016949152, + "grad_norm": 9.596780776977539, + "learning_rate": 4.981496798386849e-05, + "loss": 0.2924, + "num_input_tokens_seen": 240320, + "step": 280 + }, + { + "epoch": 0.6900726392251816, + "grad_norm": 6.522184371948242, + "learning_rate": 4.978841433856971e-05, + "loss": 0.1771, + "num_input_tokens_seen": 244800, + "step": 285 + }, + { + "epoch": 0.7021791767554479, + "grad_norm": 8.720867156982422, + "learning_rate": 4.976008897568981e-05, + "loss": 0.194, + "num_input_tokens_seen": 249152, + "step": 290 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 11.178607940673828, + "learning_rate": 4.972999391974488e-05, + "loss": 0.2064, + "num_input_tokens_seen": 253376, + "step": 295 + }, + { + "epoch": 0.7263922518159807, + "grad_norm": 12.191368103027344, + "learning_rate": 4.969813132173735e-05, + "loss": 0.2096, + "num_input_tokens_seen": 257664, + "step": 300 + }, + { + "epoch": 0.738498789346247, + "grad_norm": 5.037217617034912, + "learning_rate": 4.966450345900229e-05, + "loss": 0.1712, + "num_input_tokens_seen": 262016, + "step": 305 + }, + { + "epoch": 0.7506053268765133, + "grad_norm": 10.153473854064941, + "learning_rate": 4.962911273504461e-05, + "loss": 0.2276, + "num_input_tokens_seen": 266432, + "step": 310 + }, + { + "epoch": 0.7554479418886199, + "eval_loss": 0.22853781282901764, + "eval_runtime": 2.3445, + "eval_samples_per_second": 156.536, + "eval_steps_per_second": 19.62, + "num_input_tokens_seen": 267968, + "step": 312 + }, + { + "epoch": 0.7627118644067796, + "grad_norm": 12.040881156921387, + "learning_rate": 4.9591961679367284e-05, + "loss": 0.2349, + "num_input_tokens_seen": 270464, + "step": 315 + }, + { + "epoch": 0.774818401937046, + "grad_norm": 12.473306655883789, + "learning_rate": 4.955305294729056e-05, + "loss": 0.2824, + "num_input_tokens_seen": 274688, + "step": 320 + }, + { + "epoch": 0.7869249394673123, + "grad_norm": 21.77474594116211, + "learning_rate": 4.951238931976216e-05, + "loss": 0.3105, + "num_input_tokens_seen": 278848, + "step": 325 + }, + { + "epoch": 0.7990314769975787, + "grad_norm": 17.280487060546875, + "learning_rate": 4.9469973703158565e-05, + "loss": 0.2667, + "num_input_tokens_seen": 283136, + "step": 330 + }, + { + "epoch": 0.8111380145278451, + "grad_norm": 6.448112487792969, + "learning_rate": 4.9425809129077204e-05, + "loss": 0.2213, + "num_input_tokens_seen": 287680, + "step": 335 + }, + { + "epoch": 0.8232445520581114, + "grad_norm": 1.0759979486465454, + "learning_rate": 4.937989875411985e-05, + "loss": 0.1887, + "num_input_tokens_seen": 292224, + "step": 340 + }, + { + "epoch": 0.8353510895883777, + "grad_norm": 8.703038215637207, + "learning_rate": 4.933224585966696e-05, + "loss": 0.2499, + "num_input_tokens_seen": 296448, + "step": 345 + }, + { + "epoch": 0.847457627118644, + "grad_norm": 16.416717529296875, + "learning_rate": 4.928285385164315e-05, + "loss": 0.2431, + "num_input_tokens_seen": 300736, + "step": 350 + }, + { + "epoch": 0.8595641646489104, + "grad_norm": 6.670568943023682, + "learning_rate": 4.923172626027379e-05, + "loss": 0.2588, + "num_input_tokens_seen": 304960, + "step": 355 + }, + { + "epoch": 0.8716707021791767, + "grad_norm": 3.8800857067108154, + "learning_rate": 4.917886673983267e-05, + "loss": 0.2322, + "num_input_tokens_seen": 309184, + "step": 360 + }, + { + "epoch": 0.8837772397094431, + "grad_norm": 8.991925239562988, + "learning_rate": 4.912427906838078e-05, + "loss": 0.2314, + "num_input_tokens_seen": 313408, + "step": 365 + }, + { + "epoch": 0.8958837772397095, + "grad_norm": 9.208677291870117, + "learning_rate": 4.906796714749635e-05, + "loss": 0.1782, + "num_input_tokens_seen": 317888, + "step": 370 + }, + { + "epoch": 0.9079903147699758, + "grad_norm": 6.636046886444092, + "learning_rate": 4.900993500199591e-05, + "loss": 0.1873, + "num_input_tokens_seen": 322048, + "step": 375 + }, + { + "epoch": 0.9200968523002422, + "grad_norm": 10.718189239501953, + "learning_rate": 4.895018677964669e-05, + "loss": 0.1985, + "num_input_tokens_seen": 326592, + "step": 380 + }, + { + "epoch": 0.9322033898305084, + "grad_norm": 22.99626922607422, + "learning_rate": 4.8888726750870126e-05, + "loss": 0.3036, + "num_input_tokens_seen": 330880, + "step": 385 + }, + { + "epoch": 0.9443099273607748, + "grad_norm": 3.320899486541748, + "learning_rate": 4.882555930843664e-05, + "loss": 0.2224, + "num_input_tokens_seen": 335104, + "step": 390 + }, + { + "epoch": 0.9564164648910412, + "grad_norm": 5.677978038787842, + "learning_rate": 4.87606889671517e-05, + "loss": 0.1898, + "num_input_tokens_seen": 339392, + "step": 395 + }, + { + "epoch": 0.9685230024213075, + "grad_norm": 11.17044448852539, + "learning_rate": 4.8694120363533104e-05, + "loss": 0.1663, + "num_input_tokens_seen": 343744, + "step": 400 + }, + { + "epoch": 0.9806295399515739, + "grad_norm": 9.493459701538086, + "learning_rate": 4.8625858255479574e-05, + "loss": 0.1954, + "num_input_tokens_seen": 348160, + "step": 405 + }, + { + "epoch": 0.9927360774818402, + "grad_norm": 13.322687149047852, + "learning_rate": 4.855590752193076e-05, + "loss": 0.2606, + "num_input_tokens_seen": 352448, + "step": 410 + }, + { + "epoch": 1.0048426150121066, + "grad_norm": 13.647954940795898, + "learning_rate": 4.848427316251842e-05, + "loss": 0.5572, + "num_input_tokens_seen": 356656, + "step": 415 + }, + { + "epoch": 1.0072639225181599, + "eval_loss": 0.2624819278717041, + "eval_runtime": 0.8628, + "eval_samples_per_second": 425.363, + "eval_steps_per_second": 53.315, + "num_input_tokens_seen": 357488, + "step": 416 + }, + { + "epoch": 1.0169491525423728, + "grad_norm": 43.02584457397461, + "learning_rate": 4.841096029720921e-05, + "loss": 0.2346, + "num_input_tokens_seen": 360880, + "step": 420 + }, + { + "epoch": 1.0290556900726393, + "grad_norm": 8.104162216186523, + "learning_rate": 4.8335974165938615e-05, + "loss": 0.1819, + "num_input_tokens_seen": 365104, + "step": 425 + }, + { + "epoch": 1.0411622276029056, + "grad_norm": 5.002182483673096, + "learning_rate": 4.825932012823652e-05, + "loss": 0.1495, + "num_input_tokens_seen": 369776, + "step": 430 + }, + { + "epoch": 1.053268765133172, + "grad_norm": 27.77912139892578, + "learning_rate": 4.8181003662844074e-05, + "loss": 0.2583, + "num_input_tokens_seen": 374000, + "step": 435 + }, + { + "epoch": 1.0653753026634383, + "grad_norm": 9.262914657592773, + "learning_rate": 4.8101030367322195e-05, + "loss": 0.2093, + "num_input_tokens_seen": 378096, + "step": 440 + }, + { + "epoch": 1.0774818401937045, + "grad_norm": 5.5975823402404785, + "learning_rate": 4.8019405957651395e-05, + "loss": 0.1806, + "num_input_tokens_seen": 382256, + "step": 445 + }, + { + "epoch": 1.089588377723971, + "grad_norm": 10.306631088256836, + "learning_rate": 4.793613626782331e-05, + "loss": 0.3307, + "num_input_tokens_seen": 386672, + "step": 450 + }, + { + "epoch": 1.1016949152542372, + "grad_norm": 4.157079696655273, + "learning_rate": 4.785122724942367e-05, + "loss": 0.2208, + "num_input_tokens_seen": 390960, + "step": 455 + }, + { + "epoch": 1.1138014527845037, + "grad_norm": 0.7576245069503784, + "learning_rate": 4.776468497120698e-05, + "loss": 0.2978, + "num_input_tokens_seen": 395440, + "step": 460 + }, + { + "epoch": 1.12590799031477, + "grad_norm": 6.9619035720825195, + "learning_rate": 4.7676515618662684e-05, + "loss": 0.2315, + "num_input_tokens_seen": 399600, + "step": 465 + }, + { + "epoch": 1.1380145278450362, + "grad_norm": 1.4395357370376587, + "learning_rate": 4.758672549357316e-05, + "loss": 0.2236, + "num_input_tokens_seen": 403888, + "step": 470 + }, + { + "epoch": 1.1501210653753027, + "grad_norm": 18.561601638793945, + "learning_rate": 4.749532101356322e-05, + "loss": 0.1689, + "num_input_tokens_seen": 408176, + "step": 475 + }, + { + "epoch": 1.162227602905569, + "grad_norm": 16.139604568481445, + "learning_rate": 4.740230871164147e-05, + "loss": 0.2012, + "num_input_tokens_seen": 412208, + "step": 480 + }, + { + "epoch": 1.1743341404358354, + "grad_norm": 1.962085247039795, + "learning_rate": 4.730769523573337e-05, + "loss": 0.1816, + "num_input_tokens_seen": 416624, + "step": 485 + }, + { + "epoch": 1.1864406779661016, + "grad_norm": 3.118806838989258, + "learning_rate": 4.7211487348206054e-05, + "loss": 0.2491, + "num_input_tokens_seen": 421040, + "step": 490 + }, + { + "epoch": 1.1985472154963681, + "grad_norm": 3.9620296955108643, + "learning_rate": 4.711369192538503e-05, + "loss": 0.203, + "num_input_tokens_seen": 425136, + "step": 495 + }, + { + "epoch": 1.2106537530266344, + "grad_norm": 4.469512462615967, + "learning_rate": 4.7014315957062685e-05, + "loss": 0.4102, + "num_input_tokens_seen": 429680, + "step": 500 + }, + { + "epoch": 1.2227602905569008, + "grad_norm": 8.607080459594727, + "learning_rate": 4.691336654599873e-05, + "loss": 0.2409, + "num_input_tokens_seen": 434224, + "step": 505 + }, + { + "epoch": 1.234866828087167, + "grad_norm": 9.237229347229004, + "learning_rate": 4.6810850907412484e-05, + "loss": 0.2191, + "num_input_tokens_seen": 438320, + "step": 510 + }, + { + "epoch": 1.2469733656174333, + "grad_norm": 5.81946325302124, + "learning_rate": 4.670677636846723e-05, + "loss": 0.1975, + "num_input_tokens_seen": 442672, + "step": 515 + }, + { + "epoch": 1.2590799031476998, + "grad_norm": 2.934025764465332, + "learning_rate": 4.660115036774648e-05, + "loss": 0.1881, + "num_input_tokens_seen": 446896, + "step": 520 + }, + { + "epoch": 1.2590799031476998, + "eval_loss": 0.1976936012506485, + "eval_runtime": 0.6676, + "eval_samples_per_second": 549.73, + "eval_steps_per_second": 68.904, + "num_input_tokens_seen": 446896, + "step": 520 + }, + { + "epoch": 1.271186440677966, + "grad_norm": 2.785706043243408, + "learning_rate": 4.6493980454722344e-05, + "loss": 0.2485, + "num_input_tokens_seen": 451312, + "step": 525 + }, + { + "epoch": 1.2832929782082325, + "grad_norm": 9.8702392578125, + "learning_rate": 4.638527428921592e-05, + "loss": 0.2053, + "num_input_tokens_seen": 455408, + "step": 530 + }, + { + "epoch": 1.2953995157384988, + "grad_norm": 7.424989223480225, + "learning_rate": 4.627503964084981e-05, + "loss": 0.1867, + "num_input_tokens_seen": 460080, + "step": 535 + }, + { + "epoch": 1.307506053268765, + "grad_norm": 4.052550792694092, + "learning_rate": 4.6163284388492835e-05, + "loss": 0.1674, + "num_input_tokens_seen": 464496, + "step": 540 + }, + { + "epoch": 1.3196125907990315, + "grad_norm": 2.9404428005218506, + "learning_rate": 4.605001651969686e-05, + "loss": 0.2045, + "num_input_tokens_seen": 468720, + "step": 545 + }, + { + "epoch": 1.331719128329298, + "grad_norm": 6.4158148765563965, + "learning_rate": 4.593524413012592e-05, + "loss": 0.191, + "num_input_tokens_seen": 473264, + "step": 550 + }, + { + "epoch": 1.3438256658595642, + "grad_norm": 2.213015556335449, + "learning_rate": 4.5818975422977606e-05, + "loss": 0.1828, + "num_input_tokens_seen": 477552, + "step": 555 + }, + { + "epoch": 1.3559322033898304, + "grad_norm": 5.9616804122924805, + "learning_rate": 4.570121870839671e-05, + "loss": 0.1546, + "num_input_tokens_seen": 482032, + "step": 560 + }, + { + "epoch": 1.368038740920097, + "grad_norm": 0.6267197132110596, + "learning_rate": 4.558198240288131e-05, + "loss": 0.2025, + "num_input_tokens_seen": 486384, + "step": 565 + }, + { + "epoch": 1.3801452784503632, + "grad_norm": 9.450618743896484, + "learning_rate": 4.546127502868118e-05, + "loss": 0.2413, + "num_input_tokens_seen": 490672, + "step": 570 + }, + { + "epoch": 1.3922518159806296, + "grad_norm": 5.918724536895752, + "learning_rate": 4.5339105213188714e-05, + "loss": 0.2163, + "num_input_tokens_seen": 494960, + "step": 575 + }, + { + "epoch": 1.4043583535108959, + "grad_norm": 2.0229716300964355, + "learning_rate": 4.521548168832227e-05, + "loss": 0.3013, + "num_input_tokens_seen": 499120, + "step": 580 + }, + { + "epoch": 1.4164648910411621, + "grad_norm": 4.871718406677246, + "learning_rate": 4.509041328990204e-05, + "loss": 0.2324, + "num_input_tokens_seen": 503408, + "step": 585 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 4.264101028442383, + "learning_rate": 4.4963908957018576e-05, + "loss": 0.1956, + "num_input_tokens_seen": 507312, + "step": 590 + }, + { + "epoch": 1.4406779661016949, + "grad_norm": 0.7742087841033936, + "learning_rate": 4.483597773139386e-05, + "loss": 0.2206, + "num_input_tokens_seen": 511600, + "step": 595 + }, + { + "epoch": 1.4527845036319613, + "grad_norm": 1.387762427330017, + "learning_rate": 4.470662875673506e-05, + "loss": 0.1973, + "num_input_tokens_seen": 515888, + "step": 600 + }, + { + "epoch": 1.4648910411622276, + "grad_norm": 8.138726234436035, + "learning_rate": 4.457587127808096e-05, + "loss": 0.1848, + "num_input_tokens_seen": 519920, + "step": 605 + }, + { + "epoch": 1.4769975786924938, + "grad_norm": 3.1052446365356445, + "learning_rate": 4.4443714641141255e-05, + "loss": 0.1922, + "num_input_tokens_seen": 524336, + "step": 610 + }, + { + "epoch": 1.4891041162227603, + "grad_norm": 1.7755212783813477, + "learning_rate": 4.4310168291628504e-05, + "loss": 0.1922, + "num_input_tokens_seen": 528496, + "step": 615 + }, + { + "epoch": 1.5012106537530268, + "grad_norm": 8.44454288482666, + "learning_rate": 4.4175241774583084e-05, + "loss": 0.1809, + "num_input_tokens_seen": 532784, + "step": 620 + }, + { + "epoch": 1.5108958837772397, + "eval_loss": 0.19258780777454376, + "eval_runtime": 0.6591, + "eval_samples_per_second": 556.848, + "eval_steps_per_second": 69.796, + "num_input_tokens_seen": 536176, + "step": 624 + }, + { + "epoch": 1.513317191283293, + "grad_norm": 6.506056785583496, + "learning_rate": 4.403894473369092e-05, + "loss": 0.2205, + "num_input_tokens_seen": 537136, + "step": 625 + }, + { + "epoch": 1.5254237288135593, + "grad_norm": 15.012322425842285, + "learning_rate": 4.390128691059423e-05, + "loss": 0.26, + "num_input_tokens_seen": 541552, + "step": 630 + }, + { + "epoch": 1.5375302663438255, + "grad_norm": 2.567143440246582, + "learning_rate": 4.3762278144195236e-05, + "loss": 0.2678, + "num_input_tokens_seen": 545648, + "step": 635 + }, + { + "epoch": 1.549636803874092, + "grad_norm": 9.604016304016113, + "learning_rate": 4.362192836995299e-05, + "loss": 0.2246, + "num_input_tokens_seen": 550256, + "step": 640 + }, + { + "epoch": 1.5617433414043584, + "grad_norm": 6.7328104972839355, + "learning_rate": 4.348024761917321e-05, + "loss": 0.2397, + "num_input_tokens_seen": 554928, + "step": 645 + }, + { + "epoch": 1.5738498789346247, + "grad_norm": 13.930996894836426, + "learning_rate": 4.333724601829132e-05, + "loss": 0.2303, + "num_input_tokens_seen": 559344, + "step": 650 + }, + { + "epoch": 1.585956416464891, + "grad_norm": 7.173315048217773, + "learning_rate": 4.319293378814868e-05, + "loss": 0.2178, + "num_input_tokens_seen": 563760, + "step": 655 + }, + { + "epoch": 1.5980629539951574, + "grad_norm": 1.3246958255767822, + "learning_rate": 4.304732124326206e-05, + "loss": 0.1945, + "num_input_tokens_seen": 568112, + "step": 660 + }, + { + "epoch": 1.6101694915254239, + "grad_norm": 10.188156127929688, + "learning_rate": 4.2900418791086403e-05, + "loss": 0.1908, + "num_input_tokens_seen": 572464, + "step": 665 + }, + { + "epoch": 1.6222760290556901, + "grad_norm": 7.808104515075684, + "learning_rate": 4.275223693127103e-05, + "loss": 0.2026, + "num_input_tokens_seen": 576752, + "step": 670 + }, + { + "epoch": 1.6343825665859564, + "grad_norm": 0.8921657204627991, + "learning_rate": 4.260278625490911e-05, + "loss": 0.1959, + "num_input_tokens_seen": 580976, + "step": 675 + }, + { + "epoch": 1.6464891041162226, + "grad_norm": 6.147708892822266, + "learning_rate": 4.2452077443780744e-05, + "loss": 0.2025, + "num_input_tokens_seen": 585264, + "step": 680 + }, + { + "epoch": 1.658595641646489, + "grad_norm": 5.73768424987793, + "learning_rate": 4.2300121269589475e-05, + "loss": 0.1777, + "num_input_tokens_seen": 589744, + "step": 685 + }, + { + "epoch": 1.6707021791767556, + "grad_norm": 5.188973426818848, + "learning_rate": 4.214692859319237e-05, + "loss": 0.2142, + "num_input_tokens_seen": 593968, + "step": 690 + }, + { + "epoch": 1.6828087167070218, + "grad_norm": 20.29938316345215, + "learning_rate": 4.19925103638238e-05, + "loss": 0.2096, + "num_input_tokens_seen": 598256, + "step": 695 + }, + { + "epoch": 1.694915254237288, + "grad_norm": 3.481995105743408, + "learning_rate": 4.183687761831281e-05, + "loss": 0.1881, + "num_input_tokens_seen": 602608, + "step": 700 + }, + { + "epoch": 1.7070217917675545, + "grad_norm": 2.9380016326904297, + "learning_rate": 4.168004148029435e-05, + "loss": 0.1678, + "num_input_tokens_seen": 607088, + "step": 705 + }, + { + "epoch": 1.7191283292978208, + "grad_norm": 6.645642280578613, + "learning_rate": 4.1522013159414144e-05, + "loss": 0.243, + "num_input_tokens_seen": 611248, + "step": 710 + }, + { + "epoch": 1.7312348668280872, + "grad_norm": 5.701453685760498, + "learning_rate": 4.136280395052754e-05, + "loss": 0.2024, + "num_input_tokens_seen": 615536, + "step": 715 + }, + { + "epoch": 1.7433414043583535, + "grad_norm": 4.573903560638428, + "learning_rate": 4.120242523289223e-05, + "loss": 0.1803, + "num_input_tokens_seen": 619952, + "step": 720 + }, + { + "epoch": 1.7554479418886197, + "grad_norm": 3.025674819946289, + "learning_rate": 4.1040888469354925e-05, + "loss": 0.1949, + "num_input_tokens_seen": 624368, + "step": 725 + }, + { + "epoch": 1.7627118644067796, + "eval_loss": 0.19822187721729279, + "eval_runtime": 1.1195, + "eval_samples_per_second": 327.835, + "eval_steps_per_second": 41.091, + "num_input_tokens_seen": 626992, + "step": 728 + }, + { + "epoch": 1.7675544794188862, + "grad_norm": 5.934816360473633, + "learning_rate": 4.087820520553205e-05, + "loss": 0.1935, + "num_input_tokens_seen": 628720, + "step": 730 + }, + { + "epoch": 1.7796610169491527, + "grad_norm": 1.3624376058578491, + "learning_rate": 4.0714387068984574e-05, + "loss": 0.1884, + "num_input_tokens_seen": 633008, + "step": 735 + }, + { + "epoch": 1.791767554479419, + "grad_norm": 2.1475796699523926, + "learning_rate": 4.05494457683869e-05, + "loss": 0.2014, + "num_input_tokens_seen": 637360, + "step": 740 + }, + { + "epoch": 1.8038740920096852, + "grad_norm": 10.264263153076172, + "learning_rate": 4.038339309269002e-05, + "loss": 0.2152, + "num_input_tokens_seen": 641648, + "step": 745 + }, + { + "epoch": 1.8159806295399514, + "grad_norm": 4.37279748916626, + "learning_rate": 4.021624091027895e-05, + "loss": 0.192, + "num_input_tokens_seen": 645552, + "step": 750 + }, + { + "epoch": 1.828087167070218, + "grad_norm": 10.11119270324707, + "learning_rate": 4.004800116812441e-05, + "loss": 0.3049, + "num_input_tokens_seen": 649904, + "step": 755 + }, + { + "epoch": 1.8401937046004844, + "grad_norm": 0.4716910719871521, + "learning_rate": 3.987868589092893e-05, + "loss": 0.184, + "num_input_tokens_seen": 654128, + "step": 760 + }, + { + "epoch": 1.8523002421307506, + "grad_norm": 8.259904861450195, + "learning_rate": 3.9708307180267456e-05, + "loss": 0.1914, + "num_input_tokens_seen": 658672, + "step": 765 + }, + { + "epoch": 1.8644067796610169, + "grad_norm": 14.706856727600098, + "learning_rate": 3.953687721372233e-05, + "loss": 0.4553, + "num_input_tokens_seen": 663088, + "step": 770 + }, + { + "epoch": 1.8765133171912833, + "grad_norm": 9.08963394165039, + "learning_rate": 3.936440824401299e-05, + "loss": 0.1709, + "num_input_tokens_seen": 667440, + "step": 775 + }, + { + "epoch": 1.8886198547215496, + "grad_norm": 4.246565818786621, + "learning_rate": 3.919091259812013e-05, + "loss": 0.1831, + "num_input_tokens_seen": 671792, + "step": 780 + }, + { + "epoch": 1.900726392251816, + "grad_norm": 11.860783576965332, + "learning_rate": 3.9016402676404753e-05, + "loss": 0.2175, + "num_input_tokens_seen": 676336, + "step": 785 + }, + { + "epoch": 1.9128329297820823, + "grad_norm": 5.474867820739746, + "learning_rate": 3.884089095172181e-05, + "loss": 0.18, + "num_input_tokens_seen": 680624, + "step": 790 + }, + { + "epoch": 1.9249394673123486, + "grad_norm": 2.7666966915130615, + "learning_rate": 3.866438996852872e-05, + "loss": 0.1914, + "num_input_tokens_seen": 685040, + "step": 795 + }, + { + "epoch": 1.937046004842615, + "grad_norm": 10.039326667785645, + "learning_rate": 3.848691234198879e-05, + "loss": 0.1935, + "num_input_tokens_seen": 689392, + "step": 800 + }, + { + "epoch": 1.9491525423728815, + "grad_norm": 3.919206142425537, + "learning_rate": 3.830847075706956e-05, + "loss": 0.2046, + "num_input_tokens_seen": 693552, + "step": 805 + }, + { + "epoch": 1.9612590799031477, + "grad_norm": 16.429906845092773, + "learning_rate": 3.812907796763616e-05, + "loss": 0.2291, + "num_input_tokens_seen": 698032, + "step": 810 + }, + { + "epoch": 1.973365617433414, + "grad_norm": 6.558701992034912, + "learning_rate": 3.7948746795539745e-05, + "loss": 0.1751, + "num_input_tokens_seen": 702000, + "step": 815 + }, + { + "epoch": 1.9854721549636802, + "grad_norm": 8.950061798095703, + "learning_rate": 3.776749012970105e-05, + "loss": 0.1795, + "num_input_tokens_seen": 706160, + "step": 820 + }, + { + "epoch": 1.9975786924939467, + "grad_norm": 3.701720714569092, + "learning_rate": 3.758532092518924e-05, + "loss": 0.1852, + "num_input_tokens_seen": 710768, + "step": 825 + }, + { + "epoch": 2.009685230024213, + "grad_norm": 6.777426719665527, + "learning_rate": 3.740225220229587e-05, + "loss": 0.256, + "num_input_tokens_seen": 714744, + "step": 830 + }, + { + "epoch": 2.0145278450363198, + "eval_loss": 0.1934857964515686, + "eval_runtime": 0.6627, + "eval_samples_per_second": 553.776, + "eval_steps_per_second": 69.411, + "num_input_tokens_seen": 716344, + "step": 832 + }, + { + "epoch": 2.0217917675544794, + "grad_norm": 7.20669412612915, + "learning_rate": 3.721829704560436e-05, + "loss": 0.1878, + "num_input_tokens_seen": 718776, + "step": 835 + }, + { + "epoch": 2.0338983050847457, + "grad_norm": 6.232179164886475, + "learning_rate": 3.7033468603054725e-05, + "loss": 0.2215, + "num_input_tokens_seen": 722744, + "step": 840 + }, + { + "epoch": 2.046004842615012, + "grad_norm": 8.393187522888184, + "learning_rate": 3.6847780085003905e-05, + "loss": 0.1657, + "num_input_tokens_seen": 727160, + "step": 845 + }, + { + "epoch": 2.0581113801452786, + "grad_norm": 9.579306602478027, + "learning_rate": 3.666124476328155e-05, + "loss": 0.1957, + "num_input_tokens_seen": 731576, + "step": 850 + }, + { + "epoch": 2.070217917675545, + "grad_norm": 8.12859058380127, + "learning_rate": 3.647387597024139e-05, + "loss": 0.1881, + "num_input_tokens_seen": 736184, + "step": 855 + }, + { + "epoch": 2.082324455205811, + "grad_norm": 11.758556365966797, + "learning_rate": 3.6285687097808394e-05, + "loss": 0.2041, + "num_input_tokens_seen": 740472, + "step": 860 + }, + { + "epoch": 2.0944309927360774, + "grad_norm": 1.7637454271316528, + "learning_rate": 3.609669159652158e-05, + "loss": 0.213, + "num_input_tokens_seen": 744760, + "step": 865 + }, + { + "epoch": 2.106537530266344, + "grad_norm": 5.633957386016846, + "learning_rate": 3.590690297457262e-05, + "loss": 0.1913, + "num_input_tokens_seen": 749176, + "step": 870 + }, + { + "epoch": 2.1186440677966103, + "grad_norm": 4.531621932983398, + "learning_rate": 3.57163347968404e-05, + "loss": 0.1961, + "num_input_tokens_seen": 753528, + "step": 875 + }, + { + "epoch": 2.1307506053268765, + "grad_norm": 6.524752140045166, + "learning_rate": 3.552500068392147e-05, + "loss": 0.1981, + "num_input_tokens_seen": 757688, + "step": 880 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 5.924046516418457, + "learning_rate": 3.533291431115653e-05, + "loss": 0.2002, + "num_input_tokens_seen": 762040, + "step": 885 + }, + { + "epoch": 2.154963680387409, + "grad_norm": 4.7628068923950195, + "learning_rate": 3.514008940765304e-05, + "loss": 0.1856, + "num_input_tokens_seen": 766200, + "step": 890 + }, + { + "epoch": 2.1670702179176757, + "grad_norm": 9.14155101776123, + "learning_rate": 3.494653975530388e-05, + "loss": 0.2107, + "num_input_tokens_seen": 770680, + "step": 895 + }, + { + "epoch": 2.179176755447942, + "grad_norm": 7.742560386657715, + "learning_rate": 3.475227918780239e-05, + "loss": 0.1771, + "num_input_tokens_seen": 774840, + "step": 900 + }, + { + "epoch": 2.1912832929782082, + "grad_norm": 1.2218825817108154, + "learning_rate": 3.4557321589653556e-05, + "loss": 0.1924, + "num_input_tokens_seen": 779192, + "step": 905 + }, + { + "epoch": 2.2033898305084745, + "grad_norm": 10.382070541381836, + "learning_rate": 3.436168089518168e-05, + "loss": 0.1687, + "num_input_tokens_seen": 783608, + "step": 910 + }, + { + "epoch": 2.2154963680387407, + "grad_norm": 2.07893967628479, + "learning_rate": 3.416537108753443e-05, + "loss": 0.1922, + "num_input_tokens_seen": 788088, + "step": 915 + }, + { + "epoch": 2.2276029055690074, + "grad_norm": 14.99792194366455, + "learning_rate": 3.3968406197683376e-05, + "loss": 0.1721, + "num_input_tokens_seen": 792568, + "step": 920 + }, + { + "epoch": 2.2397094430992737, + "grad_norm": 4.237668037414551, + "learning_rate": 3.3770800303421254e-05, + "loss": 0.2058, + "num_input_tokens_seen": 797176, + "step": 925 + }, + { + "epoch": 2.25181598062954, + "grad_norm": 2.3142411708831787, + "learning_rate": 3.357256752835561e-05, + "loss": 0.1925, + "num_input_tokens_seen": 801400, + "step": 930 + }, + { + "epoch": 2.263922518159806, + "grad_norm": 3.0918896198272705, + "learning_rate": 3.3373722040899517e-05, + "loss": 0.1601, + "num_input_tokens_seen": 805944, + "step": 935 + }, + { + "epoch": 2.2663438256658597, + "eval_loss": 0.38670673966407776, + "eval_runtime": 2.26, + "eval_samples_per_second": 162.386, + "eval_steps_per_second": 20.354, + "num_input_tokens_seen": 806712, + "step": 936 + }, + { + "epoch": 2.2760290556900724, + "grad_norm": 3.9013168811798096, + "learning_rate": 3.317427805325875e-05, + "loss": 0.9421, + "num_input_tokens_seen": 810040, + "step": 940 + }, + { + "epoch": 2.288135593220339, + "grad_norm": 1.7496920824050903, + "learning_rate": 3.297424982041609e-05, + "loss": 0.191, + "num_input_tokens_seen": 814392, + "step": 945 + }, + { + "epoch": 2.3002421307506054, + "grad_norm": 6.5397491455078125, + "learning_rate": 3.277365163911243e-05, + "loss": 0.1962, + "num_input_tokens_seen": 818872, + "step": 950 + }, + { + "epoch": 2.3123486682808716, + "grad_norm": 2.407987594604492, + "learning_rate": 3.257249784682492e-05, + "loss": 0.2261, + "num_input_tokens_seen": 823096, + "step": 955 + }, + { + "epoch": 2.324455205811138, + "grad_norm": 3.1127803325653076, + "learning_rate": 3.2370802820742275e-05, + "loss": 0.1945, + "num_input_tokens_seen": 827128, + "step": 960 + }, + { + "epoch": 2.3365617433414045, + "grad_norm": 10.151595115661621, + "learning_rate": 3.2168580976737104e-05, + "loss": 0.2272, + "num_input_tokens_seen": 831288, + "step": 965 + }, + { + "epoch": 2.348668280871671, + "grad_norm": 1.2875597476959229, + "learning_rate": 3.196584676833562e-05, + "loss": 0.1824, + "num_input_tokens_seen": 835640, + "step": 970 + }, + { + "epoch": 2.360774818401937, + "grad_norm": 0.8216660022735596, + "learning_rate": 3.1762614685684567e-05, + "loss": 0.156, + "num_input_tokens_seen": 839736, + "step": 975 + }, + { + "epoch": 2.3728813559322033, + "grad_norm": 7.343863010406494, + "learning_rate": 3.155889925451557e-05, + "loss": 0.2199, + "num_input_tokens_seen": 844024, + "step": 980 + }, + { + "epoch": 2.38498789346247, + "grad_norm": 2.2787206172943115, + "learning_rate": 3.1354715035106894e-05, + "loss": 0.1885, + "num_input_tokens_seen": 848248, + "step": 985 + }, + { + "epoch": 2.3970944309927362, + "grad_norm": 6.654670238494873, + "learning_rate": 3.1150076621242816e-05, + "loss": 0.1645, + "num_input_tokens_seen": 852472, + "step": 990 + }, + { + "epoch": 2.4092009685230025, + "grad_norm": 3.4156064987182617, + "learning_rate": 3.0944998639170544e-05, + "loss": 0.1747, + "num_input_tokens_seen": 856824, + "step": 995 + }, + { + "epoch": 2.4213075060532687, + "grad_norm": 0.4972361624240875, + "learning_rate": 3.073949574655479e-05, + "loss": 0.1751, + "num_input_tokens_seen": 860984, + "step": 1000 + }, + { + "epoch": 2.433414043583535, + "grad_norm": 0.7988845705986023, + "learning_rate": 3.053358263143015e-05, + "loss": 0.1975, + "num_input_tokens_seen": 865272, + "step": 1005 + }, + { + "epoch": 2.4455205811138017, + "grad_norm": 5.293003082275391, + "learning_rate": 3.032727401115135e-05, + "loss": 0.1765, + "num_input_tokens_seen": 869560, + "step": 1010 + }, + { + "epoch": 2.457627118644068, + "grad_norm": 3.4668216705322266, + "learning_rate": 3.012058463134126e-05, + "loss": 0.1624, + "num_input_tokens_seen": 873976, + "step": 1015 + }, + { + "epoch": 2.469733656174334, + "grad_norm": 1.981259822845459, + "learning_rate": 2.991352926483702e-05, + "loss": 0.2237, + "num_input_tokens_seen": 878200, + "step": 1020 + }, + { + "epoch": 2.4818401937046004, + "grad_norm": 15.534086227416992, + "learning_rate": 2.9706122710634165e-05, + "loss": 0.2024, + "num_input_tokens_seen": 882872, + "step": 1025 + }, + { + "epoch": 2.4939467312348667, + "grad_norm": 2.0866310596466064, + "learning_rate": 2.949837979282889e-05, + "loss": 0.2673, + "num_input_tokens_seen": 887096, + "step": 1030 + }, + { + "epoch": 2.5060532687651333, + "grad_norm": 1.296164870262146, + "learning_rate": 2.92903153595585e-05, + "loss": 0.2168, + "num_input_tokens_seen": 891576, + "step": 1035 + }, + { + "epoch": 2.5181598062953996, + "grad_norm": 3.0610435009002686, + "learning_rate": 2.908194428194019e-05, + "loss": 0.1768, + "num_input_tokens_seen": 895736, + "step": 1040 + }, + { + "epoch": 2.5181598062953996, + "eval_loss": 0.1943914145231247, + "eval_runtime": 0.6714, + "eval_samples_per_second": 546.608, + "eval_steps_per_second": 68.512, + "num_input_tokens_seen": 895736, + "step": 1040 + }, + { + "epoch": 2.530266343825666, + "grad_norm": 13.436739921569824, + "learning_rate": 2.88732814530081e-05, + "loss": 0.1555, + "num_input_tokens_seen": 900024, + "step": 1045 + }, + { + "epoch": 2.542372881355932, + "grad_norm": 9.469161987304688, + "learning_rate": 2.866434178664893e-05, + "loss": 0.1744, + "num_input_tokens_seen": 904440, + "step": 1050 + }, + { + "epoch": 2.5544794188861983, + "grad_norm": 6.683951377868652, + "learning_rate": 2.8455140216535947e-05, + "loss": 0.1842, + "num_input_tokens_seen": 908728, + "step": 1055 + }, + { + "epoch": 2.566585956416465, + "grad_norm": 4.156672954559326, + "learning_rate": 2.8245691695061604e-05, + "loss": 0.2018, + "num_input_tokens_seen": 913016, + "step": 1060 + }, + { + "epoch": 2.5786924939467313, + "grad_norm": 2.5280745029449463, + "learning_rate": 2.8036011192268863e-05, + "loss": 0.2027, + "num_input_tokens_seen": 917304, + "step": 1065 + }, + { + "epoch": 2.5907990314769975, + "grad_norm": 3.3346853256225586, + "learning_rate": 2.7826113694781252e-05, + "loss": 0.1984, + "num_input_tokens_seen": 921528, + "step": 1070 + }, + { + "epoch": 2.6029055690072638, + "grad_norm": 6.732588768005371, + "learning_rate": 2.761601420473168e-05, + "loss": 0.1674, + "num_input_tokens_seen": 925944, + "step": 1075 + }, + { + "epoch": 2.61501210653753, + "grad_norm": 5.7978363037109375, + "learning_rate": 2.740572773869019e-05, + "loss": 0.1523, + "num_input_tokens_seen": 930744, + "step": 1080 + }, + { + "epoch": 2.6271186440677967, + "grad_norm": 4.692154884338379, + "learning_rate": 2.7195269326590682e-05, + "loss": 0.1263, + "num_input_tokens_seen": 935352, + "step": 1085 + }, + { + "epoch": 2.639225181598063, + "grad_norm": 8.889333724975586, + "learning_rate": 2.6984654010656667e-05, + "loss": 0.1656, + "num_input_tokens_seen": 939640, + "step": 1090 + }, + { + "epoch": 2.651331719128329, + "grad_norm": 4.259967803955078, + "learning_rate": 2.6773896844326125e-05, + "loss": 0.2926, + "num_input_tokens_seen": 943672, + "step": 1095 + }, + { + "epoch": 2.663438256658596, + "grad_norm": 3.0391273498535156, + "learning_rate": 2.656301289117561e-05, + "loss": 0.1547, + "num_input_tokens_seen": 947704, + "step": 1100 + }, + { + "epoch": 2.6755447941888617, + "grad_norm": 9.067920684814453, + "learning_rate": 2.6352017223843585e-05, + "loss": 0.2428, + "num_input_tokens_seen": 951928, + "step": 1105 + }, + { + "epoch": 2.6876513317191284, + "grad_norm": 7.765347957611084, + "learning_rate": 2.6140924922953125e-05, + "loss": 0.1649, + "num_input_tokens_seen": 956216, + "step": 1110 + }, + { + "epoch": 2.6997578692493946, + "grad_norm": 1.6490931510925293, + "learning_rate": 2.5929751076034058e-05, + "loss": 0.1597, + "num_input_tokens_seen": 960504, + "step": 1115 + }, + { + "epoch": 2.711864406779661, + "grad_norm": 1.5548573732376099, + "learning_rate": 2.571851077644461e-05, + "loss": 0.1407, + "num_input_tokens_seen": 965048, + "step": 1120 + }, + { + "epoch": 2.7239709443099276, + "grad_norm": 5.526769161224365, + "learning_rate": 2.5507219122292598e-05, + "loss": 0.1667, + "num_input_tokens_seen": 969208, + "step": 1125 + }, + { + "epoch": 2.736077481840194, + "grad_norm": 5.792220115661621, + "learning_rate": 2.529589121535636e-05, + "loss": 0.1438, + "num_input_tokens_seen": 973624, + "step": 1130 + }, + { + "epoch": 2.74818401937046, + "grad_norm": 6.361023902893066, + "learning_rate": 2.5084542160005335e-05, + "loss": 0.2294, + "num_input_tokens_seen": 977976, + "step": 1135 + }, + { + "epoch": 2.7602905569007263, + "grad_norm": 1.0617471933364868, + "learning_rate": 2.487318706212051e-05, + "loss": 0.1964, + "num_input_tokens_seen": 982200, + "step": 1140 + }, + { + "epoch": 2.7699757869249395, + "eval_loss": 0.19318054616451263, + "eval_runtime": 0.6508, + "eval_samples_per_second": 563.894, + "eval_steps_per_second": 70.679, + "num_input_tokens_seen": 985592, + "step": 1144 + }, + { + "epoch": 2.7723970944309926, + "grad_norm": 7.693630695343018, + "learning_rate": 2.4661841028014785e-05, + "loss": 0.203, + "num_input_tokens_seen": 986488, + "step": 1145 + }, + { + "epoch": 2.7845036319612593, + "grad_norm": 4.296042442321777, + "learning_rate": 2.445051916335321e-05, + "loss": 0.1983, + "num_input_tokens_seen": 990456, + "step": 1150 + }, + { + "epoch": 2.7966101694915255, + "grad_norm": 2.928414821624756, + "learning_rate": 2.4239236572073352e-05, + "loss": 0.1825, + "num_input_tokens_seen": 994744, + "step": 1155 + }, + { + "epoch": 2.8087167070217918, + "grad_norm": 2.411320686340332, + "learning_rate": 2.4028008355305815e-05, + "loss": 0.178, + "num_input_tokens_seen": 999160, + "step": 1160 + }, + { + "epoch": 2.820823244552058, + "grad_norm": 6.881911754608154, + "learning_rate": 2.3816849610294783e-05, + "loss": 0.1709, + "num_input_tokens_seen": 1003256, + "step": 1165 + }, + { + "epoch": 2.8329297820823243, + "grad_norm": 4.286351680755615, + "learning_rate": 2.3605775429319115e-05, + "loss": 0.1853, + "num_input_tokens_seen": 1007480, + "step": 1170 + }, + { + "epoch": 2.845036319612591, + "grad_norm": 3.7688863277435303, + "learning_rate": 2.3394800898613535e-05, + "loss": 0.1431, + "num_input_tokens_seen": 1011896, + "step": 1175 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 3.717094898223877, + "learning_rate": 2.318394109729041e-05, + "loss": 0.2253, + "num_input_tokens_seen": 1015992, + "step": 1180 + }, + { + "epoch": 2.8692493946731235, + "grad_norm": 7.443727493286133, + "learning_rate": 2.297321109626198e-05, + "loss": 0.1686, + "num_input_tokens_seen": 1020408, + "step": 1185 + }, + { + "epoch": 2.8813559322033897, + "grad_norm": 12.574480056762695, + "learning_rate": 2.27626259571632e-05, + "loss": 0.1988, + "num_input_tokens_seen": 1025016, + "step": 1190 + }, + { + "epoch": 2.893462469733656, + "grad_norm": 9.311829566955566, + "learning_rate": 2.2552200731275213e-05, + "loss": 0.1682, + "num_input_tokens_seen": 1029368, + "step": 1195 + }, + { + "epoch": 2.9055690072639226, + "grad_norm": 4.659236431121826, + "learning_rate": 2.2341950458449576e-05, + "loss": 0.1918, + "num_input_tokens_seen": 1033592, + "step": 1200 + }, + { + "epoch": 2.917675544794189, + "grad_norm": 1.1926063299179077, + "learning_rate": 2.213189016603333e-05, + "loss": 0.2047, + "num_input_tokens_seen": 1037688, + "step": 1205 + }, + { + "epoch": 2.929782082324455, + "grad_norm": 1.54401433467865, + "learning_rate": 2.1922034867794925e-05, + "loss": 0.1686, + "num_input_tokens_seen": 1041912, + "step": 1210 + }, + { + "epoch": 2.9418886198547214, + "grad_norm": 6.956883430480957, + "learning_rate": 2.1712399562851147e-05, + "loss": 0.1663, + "num_input_tokens_seen": 1046392, + "step": 1215 + }, + { + "epoch": 2.9539951573849876, + "grad_norm": 6.875396728515625, + "learning_rate": 2.150299923459505e-05, + "loss": 0.1158, + "num_input_tokens_seen": 1050616, + "step": 1220 + }, + { + "epoch": 2.9661016949152543, + "grad_norm": 4.653652191162109, + "learning_rate": 2.1293848849625065e-05, + "loss": 0.1857, + "num_input_tokens_seen": 1054840, + "step": 1225 + }, + { + "epoch": 2.9782082324455206, + "grad_norm": 4.641164302825928, + "learning_rate": 2.108496335667527e-05, + "loss": 0.2051, + "num_input_tokens_seen": 1058936, + "step": 1230 + }, + { + "epoch": 2.990314769975787, + "grad_norm": 4.4205002784729, + "learning_rate": 2.0876357685546944e-05, + "loss": 0.137, + "num_input_tokens_seen": 1063288, + "step": 1235 + }, + { + "epoch": 3.002421307506053, + "grad_norm": 9.87366771697998, + "learning_rate": 2.06680467460415e-05, + "loss": 0.294, + "num_input_tokens_seen": 1067392, + "step": 1240 + }, + { + "epoch": 3.0145278450363198, + "grad_norm": 1.3809499740600586, + "learning_rate": 2.0460045426894817e-05, + "loss": 0.1436, + "num_input_tokens_seen": 1071872, + "step": 1245 + }, + { + "epoch": 3.0217917675544794, + "eval_loss": 0.20527909696102142, + "eval_runtime": 0.667, + "eval_samples_per_second": 550.187, + "eval_steps_per_second": 68.961, + "num_input_tokens_seen": 1074624, + "step": 1248 + }, + { + "epoch": 3.026634382566586, + "grad_norm": 1.1184051036834717, + "learning_rate": 2.0252368594713083e-05, + "loss": 0.1503, + "num_input_tokens_seen": 1076416, + "step": 1250 + }, + { + "epoch": 3.0387409200968523, + "grad_norm": 3.941237211227417, + "learning_rate": 2.004503109291023e-05, + "loss": 0.156, + "num_input_tokens_seen": 1080512, + "step": 1255 + }, + { + "epoch": 3.0508474576271185, + "grad_norm": 2.0000264644622803, + "learning_rate": 1.9838047740647026e-05, + "loss": 0.1971, + "num_input_tokens_seen": 1084608, + "step": 1260 + }, + { + "epoch": 3.062953995157385, + "grad_norm": 11.35123062133789, + "learning_rate": 1.9631433331771886e-05, + "loss": 0.1813, + "num_input_tokens_seen": 1089024, + "step": 1265 + }, + { + "epoch": 3.0750605326876514, + "grad_norm": 2.1008217334747314, + "learning_rate": 1.9425202633763513e-05, + "loss": 0.133, + "num_input_tokens_seen": 1093376, + "step": 1270 + }, + { + "epoch": 3.0871670702179177, + "grad_norm": 5.499813556671143, + "learning_rate": 1.9219370386675388e-05, + "loss": 0.089, + "num_input_tokens_seen": 1097728, + "step": 1275 + }, + { + "epoch": 3.099273607748184, + "grad_norm": 8.502225875854492, + "learning_rate": 1.901395130208229e-05, + "loss": 0.2836, + "num_input_tokens_seen": 1101888, + "step": 1280 + }, + { + "epoch": 3.11138014527845, + "grad_norm": 14.45283031463623, + "learning_rate": 1.880896006202876e-05, + "loss": 0.1116, + "num_input_tokens_seen": 1106176, + "step": 1285 + }, + { + "epoch": 3.123486682808717, + "grad_norm": 3.364891767501831, + "learning_rate": 1.860441131797977e-05, + "loss": 0.1027, + "num_input_tokens_seen": 1110272, + "step": 1290 + }, + { + "epoch": 3.135593220338983, + "grad_norm": 8.516124725341797, + "learning_rate": 1.8400319689773474e-05, + "loss": 0.1582, + "num_input_tokens_seen": 1114496, + "step": 1295 + }, + { + "epoch": 3.1476997578692494, + "grad_norm": 11.724932670593262, + "learning_rate": 1.8196699764576318e-05, + "loss": 0.0408, + "num_input_tokens_seen": 1118784, + "step": 1300 + }, + { + "epoch": 3.1598062953995156, + "grad_norm": 8.753253936767578, + "learning_rate": 1.7993566095840443e-05, + "loss": 0.1234, + "num_input_tokens_seen": 1123008, + "step": 1305 + }, + { + "epoch": 3.171912832929782, + "grad_norm": 8.221136093139648, + "learning_rate": 1.7790933202263434e-05, + "loss": 0.2236, + "num_input_tokens_seen": 1127424, + "step": 1310 + }, + { + "epoch": 3.1840193704600486, + "grad_norm": 17.435853958129883, + "learning_rate": 1.758881556675073e-05, + "loss": 0.1958, + "num_input_tokens_seen": 1131840, + "step": 1315 + }, + { + "epoch": 3.196125907990315, + "grad_norm": 5.691689491271973, + "learning_rate": 1.738722763538036e-05, + "loss": 0.1238, + "num_input_tokens_seen": 1136192, + "step": 1320 + }, + { + "epoch": 3.208232445520581, + "grad_norm": 2.6163206100463867, + "learning_rate": 1.7186183816370522e-05, + "loss": 0.1027, + "num_input_tokens_seen": 1140544, + "step": 1325 + }, + { + "epoch": 3.2203389830508473, + "grad_norm": 5.7949724197387695, + "learning_rate": 1.6985698479049702e-05, + "loss": 0.0907, + "num_input_tokens_seen": 1145280, + "step": 1330 + }, + { + "epoch": 3.232445520581114, + "grad_norm": 5.007083892822266, + "learning_rate": 1.6785785952829717e-05, + "loss": 0.1037, + "num_input_tokens_seen": 1149888, + "step": 1335 + }, + { + "epoch": 3.2445520581113803, + "grad_norm": 12.367361068725586, + "learning_rate": 1.6586460526181473e-05, + "loss": 0.1776, + "num_input_tokens_seen": 1153920, + "step": 1340 + }, + { + "epoch": 3.2566585956416465, + "grad_norm": 16.06878089904785, + "learning_rate": 1.6387736445613772e-05, + "loss": 0.2125, + "num_input_tokens_seen": 1158592, + "step": 1345 + }, + { + "epoch": 3.2687651331719128, + "grad_norm": 7.7484588623046875, + "learning_rate": 1.6189627914655008e-05, + "loss": 0.2252, + "num_input_tokens_seen": 1162816, + "step": 1350 + }, + { + "epoch": 3.2736077481840193, + "eval_loss": 0.2091810256242752, + "eval_runtime": 0.6785, + "eval_samples_per_second": 540.886, + "eval_steps_per_second": 67.795, + "num_input_tokens_seen": 1164544, + "step": 1352 + }, + { + "epoch": 3.280871670702179, + "grad_norm": 9.04961109161377, + "learning_rate": 1.599214909283805e-05, + "loss": 0.1163, + "num_input_tokens_seen": 1167232, + "step": 1355 + }, + { + "epoch": 3.2929782082324457, + "grad_norm": 3.317920446395874, + "learning_rate": 1.579531409468815e-05, + "loss": 0.1094, + "num_input_tokens_seen": 1171648, + "step": 1360 + }, + { + "epoch": 3.305084745762712, + "grad_norm": 8.250765800476074, + "learning_rate": 1.5599136988714186e-05, + "loss": 0.141, + "num_input_tokens_seen": 1175808, + "step": 1365 + }, + { + "epoch": 3.317191283292978, + "grad_norm": 5.985897541046143, + "learning_rate": 1.5403631796403085e-05, + "loss": 0.1296, + "num_input_tokens_seen": 1180224, + "step": 1370 + }, + { + "epoch": 3.3292978208232444, + "grad_norm": 4.8227314949035645, + "learning_rate": 1.520881249121767e-05, + "loss": 0.1375, + "num_input_tokens_seen": 1184704, + "step": 1375 + }, + { + "epoch": 3.341404358353511, + "grad_norm": 2.318727970123291, + "learning_rate": 1.5014692997597962e-05, + "loss": 0.1459, + "num_input_tokens_seen": 1188992, + "step": 1380 + }, + { + "epoch": 3.3535108958837774, + "grad_norm": 13.753244400024414, + "learning_rate": 1.4821287189965866e-05, + "loss": 0.1535, + "num_input_tokens_seen": 1193408, + "step": 1385 + }, + { + "epoch": 3.3656174334140436, + "grad_norm": 1.9978270530700684, + "learning_rate": 1.4628608891733625e-05, + "loss": 0.1246, + "num_input_tokens_seen": 1197760, + "step": 1390 + }, + { + "epoch": 3.37772397094431, + "grad_norm": 6.705835819244385, + "learning_rate": 1.4436671874315722e-05, + "loss": 0.0863, + "num_input_tokens_seen": 1201792, + "step": 1395 + }, + { + "epoch": 3.389830508474576, + "grad_norm": 7.748871326446533, + "learning_rate": 1.4245489856144634e-05, + "loss": 0.0968, + "num_input_tokens_seen": 1205824, + "step": 1400 + }, + { + "epoch": 3.401937046004843, + "grad_norm": 4.018503189086914, + "learning_rate": 1.4055076501690311e-05, + "loss": 0.0749, + "num_input_tokens_seen": 1210240, + "step": 1405 + }, + { + "epoch": 3.414043583535109, + "grad_norm": 4.750000953674316, + "learning_rate": 1.3865445420483526e-05, + "loss": 0.09, + "num_input_tokens_seen": 1214464, + "step": 1410 + }, + { + "epoch": 3.4261501210653753, + "grad_norm": 9.335100173950195, + "learning_rate": 1.367661016614315e-05, + "loss": 0.1746, + "num_input_tokens_seen": 1218752, + "step": 1415 + }, + { + "epoch": 3.4382566585956416, + "grad_norm": 4.242533206939697, + "learning_rate": 1.3488584235407439e-05, + "loss": 0.0826, + "num_input_tokens_seen": 1223168, + "step": 1420 + }, + { + "epoch": 3.450363196125908, + "grad_norm": 1.9875125885009766, + "learning_rate": 1.3301381067169366e-05, + "loss": 0.1469, + "num_input_tokens_seen": 1227328, + "step": 1425 + }, + { + "epoch": 3.4624697336561745, + "grad_norm": 10.304492950439453, + "learning_rate": 1.3115014041516089e-05, + "loss": 0.1454, + "num_input_tokens_seen": 1231360, + "step": 1430 + }, + { + "epoch": 3.4745762711864407, + "grad_norm": 2.467794418334961, + "learning_rate": 1.2929496478772635e-05, + "loss": 0.0455, + "num_input_tokens_seen": 1235456, + "step": 1435 + }, + { + "epoch": 3.486682808716707, + "grad_norm": 5.000001907348633, + "learning_rate": 1.2744841638549842e-05, + "loss": 0.106, + "num_input_tokens_seen": 1239616, + "step": 1440 + }, + { + "epoch": 3.4987893462469732, + "grad_norm": 0.32030388712882996, + "learning_rate": 1.2561062718796662e-05, + "loss": 0.0763, + "num_input_tokens_seen": 1243968, + "step": 1445 + }, + { + "epoch": 3.5108958837772395, + "grad_norm": 1.8182225227355957, + "learning_rate": 1.2378172854856831e-05, + "loss": 0.0978, + "num_input_tokens_seen": 1248128, + "step": 1450 + }, + { + "epoch": 3.523002421307506, + "grad_norm": 5.48933219909668, + "learning_rate": 1.2196185118530063e-05, + "loss": 0.1328, + "num_input_tokens_seen": 1252288, + "step": 1455 + }, + { + "epoch": 3.5254237288135593, + "eval_loss": 0.3491859436035156, + "eval_runtime": 0.6747, + "eval_samples_per_second": 543.942, + "eval_steps_per_second": 68.178, + "num_input_tokens_seen": 1253248, + "step": 1456 + }, + { + "epoch": 3.5351089588377724, + "grad_norm": 1.86709725856781, + "learning_rate": 1.2015112517137744e-05, + "loss": 0.1139, + "num_input_tokens_seen": 1256640, + "step": 1460 + }, + { + "epoch": 3.5472154963680387, + "grad_norm": 10.584001541137695, + "learning_rate": 1.183496799259326e-05, + "loss": 0.1247, + "num_input_tokens_seen": 1261440, + "step": 1465 + }, + { + "epoch": 3.559322033898305, + "grad_norm": 0.81782066822052, + "learning_rate": 1.1655764420476988e-05, + "loss": 0.0777, + "num_input_tokens_seen": 1265664, + "step": 1470 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 4.23323917388916, + "learning_rate": 1.1477514609116039e-05, + "loss": 0.0848, + "num_input_tokens_seen": 1270016, + "step": 1475 + }, + { + "epoch": 3.583535108958838, + "grad_norm": 4.22898006439209, + "learning_rate": 1.1300231298668786e-05, + "loss": 0.1263, + "num_input_tokens_seen": 1274560, + "step": 1480 + }, + { + "epoch": 3.595641646489104, + "grad_norm": 7.585851669311523, + "learning_rate": 1.1123927160214289e-05, + "loss": 0.1362, + "num_input_tokens_seen": 1278976, + "step": 1485 + }, + { + "epoch": 3.6077481840193704, + "grad_norm": 2.0685174465179443, + "learning_rate": 1.0948614794846668e-05, + "loss": 0.1068, + "num_input_tokens_seen": 1283200, + "step": 1490 + }, + { + "epoch": 3.619854721549637, + "grad_norm": 4.345080852508545, + "learning_rate": 1.0774306732774414e-05, + "loss": 0.2069, + "num_input_tokens_seen": 1287296, + "step": 1495 + }, + { + "epoch": 3.6319612590799033, + "grad_norm": 15.997807502746582, + "learning_rate": 1.0601015432424819e-05, + "loss": 0.1368, + "num_input_tokens_seen": 1291712, + "step": 1500 + }, + { + "epoch": 3.6440677966101696, + "grad_norm": 6.712691783905029, + "learning_rate": 1.042875327955356e-05, + "loss": 0.1959, + "num_input_tokens_seen": 1295936, + "step": 1505 + }, + { + "epoch": 3.656174334140436, + "grad_norm": 5.0442962646484375, + "learning_rate": 1.0257532586359422e-05, + "loss": 0.0932, + "num_input_tokens_seen": 1300608, + "step": 1510 + }, + { + "epoch": 3.668280871670702, + "grad_norm": 5.707069396972656, + "learning_rate": 1.0087365590604289e-05, + "loss": 0.1347, + "num_input_tokens_seen": 1305024, + "step": 1515 + }, + { + "epoch": 3.6803874092009687, + "grad_norm": 2.964393138885498, + "learning_rate": 9.918264454738504e-06, + "loss": 0.1287, + "num_input_tokens_seen": 1309376, + "step": 1520 + }, + { + "epoch": 3.692493946731235, + "grad_norm": 10.144442558288574, + "learning_rate": 9.75024126503153e-06, + "loss": 0.0818, + "num_input_tokens_seen": 1313664, + "step": 1525 + }, + { + "epoch": 3.7046004842615012, + "grad_norm": 8.710615158081055, + "learning_rate": 9.583308030708135e-06, + "loss": 0.0869, + "num_input_tokens_seen": 1318080, + "step": 1530 + }, + { + "epoch": 3.7167070217917675, + "grad_norm": 2.1846084594726562, + "learning_rate": 9.417476683090007e-06, + "loss": 0.0893, + "num_input_tokens_seen": 1322432, + "step": 1535 + }, + { + "epoch": 3.7288135593220337, + "grad_norm": 3.826754570007324, + "learning_rate": 9.252759074743034e-06, + "loss": 0.1556, + "num_input_tokens_seen": 1326848, + "step": 1540 + }, + { + "epoch": 3.7409200968523004, + "grad_norm": 10.382698059082031, + "learning_rate": 9.08916697863014e-06, + "loss": 0.0774, + "num_input_tokens_seen": 1331328, + "step": 1545 + }, + { + "epoch": 3.7530266343825667, + "grad_norm": 7.099722862243652, + "learning_rate": 8.926712087269801e-06, + "loss": 0.1253, + "num_input_tokens_seen": 1335424, + "step": 1550 + }, + { + "epoch": 3.765133171912833, + "grad_norm": 5.015311241149902, + "learning_rate": 8.765406011900368e-06, + "loss": 0.1276, + "num_input_tokens_seen": 1339712, + "step": 1555 + }, + { + "epoch": 3.777239709443099, + "grad_norm": 4.82669734954834, + "learning_rate": 8.605260281650152e-06, + "loss": 0.1842, + "num_input_tokens_seen": 1344000, + "step": 1560 + }, + { + "epoch": 3.777239709443099, + "eval_loss": 0.21899566054344177, + "eval_runtime": 0.6796, + "eval_samples_per_second": 539.994, + "eval_steps_per_second": 67.683, + "num_input_tokens_seen": 1344000, + "step": 1560 + }, + { + "epoch": 3.7893462469733654, + "grad_norm": 3.010295867919922, + "learning_rate": 8.446286342713419e-06, + "loss": 0.0881, + "num_input_tokens_seen": 1348224, + "step": 1565 + }, + { + "epoch": 3.801452784503632, + "grad_norm": 2.3779475688934326, + "learning_rate": 8.288495557532241e-06, + "loss": 0.1348, + "num_input_tokens_seen": 1352576, + "step": 1570 + }, + { + "epoch": 3.8135593220338984, + "grad_norm": 6.911816120147705, + "learning_rate": 8.131899203984463e-06, + "loss": 0.134, + "num_input_tokens_seen": 1356864, + "step": 1575 + }, + { + "epoch": 3.8256658595641646, + "grad_norm": 9.250137329101562, + "learning_rate": 7.976508474577548e-06, + "loss": 0.1141, + "num_input_tokens_seen": 1361152, + "step": 1580 + }, + { + "epoch": 3.837772397094431, + "grad_norm": 4.86985445022583, + "learning_rate": 7.822334475648654e-06, + "loss": 0.0705, + "num_input_tokens_seen": 1365376, + "step": 1585 + }, + { + "epoch": 3.849878934624697, + "grad_norm": 0.7732688188552856, + "learning_rate": 7.669388226570809e-06, + "loss": 0.0907, + "num_input_tokens_seen": 1369728, + "step": 1590 + }, + { + "epoch": 3.861985472154964, + "grad_norm": 5.062341213226318, + "learning_rate": 7.517680658965329e-06, + "loss": 0.1261, + "num_input_tokens_seen": 1374144, + "step": 1595 + }, + { + "epoch": 3.87409200968523, + "grad_norm": 8.762838363647461, + "learning_rate": 7.367222615920477e-06, + "loss": 0.1084, + "num_input_tokens_seen": 1378368, + "step": 1600 + }, + { + "epoch": 3.8861985472154963, + "grad_norm": 8.905739784240723, + "learning_rate": 7.2180248512164896e-06, + "loss": 0.0813, + "num_input_tokens_seen": 1382464, + "step": 1605 + }, + { + "epoch": 3.898305084745763, + "grad_norm": 0.5714547038078308, + "learning_rate": 7.070098028556948e-06, + "loss": 0.0805, + "num_input_tokens_seen": 1386880, + "step": 1610 + }, + { + "epoch": 3.910411622276029, + "grad_norm": 8.167064666748047, + "learning_rate": 6.923452720806611e-06, + "loss": 0.1924, + "num_input_tokens_seen": 1391296, + "step": 1615 + }, + { + "epoch": 3.9225181598062955, + "grad_norm": 3.438431739807129, + "learning_rate": 6.778099409235739e-06, + "loss": 0.0609, + "num_input_tokens_seen": 1395456, + "step": 1620 + }, + { + "epoch": 3.9346246973365617, + "grad_norm": 7.784511089324951, + "learning_rate": 6.634048482770946e-06, + "loss": 0.0932, + "num_input_tokens_seen": 1399616, + "step": 1625 + }, + { + "epoch": 3.946731234866828, + "grad_norm": 13.272894859313965, + "learning_rate": 6.491310237252679e-06, + "loss": 0.1241, + "num_input_tokens_seen": 1403712, + "step": 1630 + }, + { + "epoch": 3.9588377723970947, + "grad_norm": 12.38925838470459, + "learning_rate": 6.349894874699344e-06, + "loss": 0.1232, + "num_input_tokens_seen": 1408128, + "step": 1635 + }, + { + "epoch": 3.970944309927361, + "grad_norm": 5.343148231506348, + "learning_rate": 6.209812502578114e-06, + "loss": 0.0787, + "num_input_tokens_seen": 1412480, + "step": 1640 + }, + { + "epoch": 3.983050847457627, + "grad_norm": 1.2886254787445068, + "learning_rate": 6.071073133082492e-06, + "loss": 0.0494, + "num_input_tokens_seen": 1416704, + "step": 1645 + }, + { + "epoch": 3.9951573849878934, + "grad_norm": 10.778816223144531, + "learning_rate": 5.933686682416758e-06, + "loss": 0.0969, + "num_input_tokens_seen": 1421120, + "step": 1650 + }, + { + "epoch": 4.00726392251816, + "grad_norm": 0.2529144883155823, + "learning_rate": 5.797662970087184e-06, + "loss": 0.09, + "num_input_tokens_seen": 1424944, + "step": 1655 + }, + { + "epoch": 4.019370460048426, + "grad_norm": 6.2160162925720215, + "learning_rate": 5.663011718200201e-06, + "loss": 0.0897, + "num_input_tokens_seen": 1429296, + "step": 1660 + }, + { + "epoch": 4.0290556900726395, + "eval_loss": 0.2532218098640442, + "eval_runtime": 0.672, + "eval_samples_per_second": 546.104, + "eval_steps_per_second": 68.449, + "num_input_tokens_seen": 1432880, + "step": 1664 + }, + { + "epoch": 4.031476997578692, + "grad_norm": 0.9374585747718811, + "learning_rate": 5.529742550767544e-06, + "loss": 0.0316, + "num_input_tokens_seen": 1433776, + "step": 1665 + }, + { + "epoch": 4.043583535108959, + "grad_norm": 1.9009536504745483, + "learning_rate": 5.397864993018367e-06, + "loss": 0.0492, + "num_input_tokens_seen": 1438000, + "step": 1670 + }, + { + "epoch": 4.0556900726392255, + "grad_norm": 7.239864349365234, + "learning_rate": 5.267388470718449e-06, + "loss": 0.029, + "num_input_tokens_seen": 1442352, + "step": 1675 + }, + { + "epoch": 4.067796610169491, + "grad_norm": 2.098872661590576, + "learning_rate": 5.138322309496504e-06, + "loss": 0.052, + "num_input_tokens_seen": 1446704, + "step": 1680 + }, + { + "epoch": 4.079903147699758, + "grad_norm": 1.4036399126052856, + "learning_rate": 5.010675734177631e-06, + "loss": 0.0469, + "num_input_tokens_seen": 1450864, + "step": 1685 + }, + { + "epoch": 4.092009685230024, + "grad_norm": 11.33265495300293, + "learning_rate": 4.884457868124001e-06, + "loss": 0.0316, + "num_input_tokens_seen": 1455088, + "step": 1690 + }, + { + "epoch": 4.1041162227602905, + "grad_norm": 1.9709900617599487, + "learning_rate": 4.759677732582782e-06, + "loss": 0.0228, + "num_input_tokens_seen": 1459376, + "step": 1695 + }, + { + "epoch": 4.116222760290557, + "grad_norm": 0.01155536063015461, + "learning_rate": 4.636344246041321e-06, + "loss": 0.0529, + "num_input_tokens_seen": 1463600, + "step": 1700 + }, + { + "epoch": 4.128329297820823, + "grad_norm": 19.08058738708496, + "learning_rate": 4.514466223589753e-06, + "loss": 0.0565, + "num_input_tokens_seen": 1468080, + "step": 1705 + }, + { + "epoch": 4.14043583535109, + "grad_norm": 1.3092641830444336, + "learning_rate": 4.3940523762909135e-06, + "loss": 0.0695, + "num_input_tokens_seen": 1472624, + "step": 1710 + }, + { + "epoch": 4.1525423728813555, + "grad_norm": 0.055544547736644745, + "learning_rate": 4.275111310557758e-06, + "loss": 0.0511, + "num_input_tokens_seen": 1477040, + "step": 1715 + }, + { + "epoch": 4.164648910411622, + "grad_norm": 0.16590368747711182, + "learning_rate": 4.1576515275382226e-06, + "loss": 0.0311, + "num_input_tokens_seen": 1481328, + "step": 1720 + }, + { + "epoch": 4.176755447941889, + "grad_norm": 0.1331050992012024, + "learning_rate": 4.0416814225076035e-06, + "loss": 0.0394, + "num_input_tokens_seen": 1485808, + "step": 1725 + }, + { + "epoch": 4.188861985472155, + "grad_norm": 1.6521071195602417, + "learning_rate": 3.9272092842685345e-06, + "loss": 0.0255, + "num_input_tokens_seen": 1490160, + "step": 1730 + }, + { + "epoch": 4.200968523002421, + "grad_norm": 0.42354145646095276, + "learning_rate": 3.814243294558542e-06, + "loss": 0.0073, + "num_input_tokens_seen": 1494512, + "step": 1735 + }, + { + "epoch": 4.213075060532688, + "grad_norm": 2.2178032398223877, + "learning_rate": 3.702791527465274e-06, + "loss": 0.0562, + "num_input_tokens_seen": 1498480, + "step": 1740 + }, + { + "epoch": 4.225181598062954, + "grad_norm": 13.911809921264648, + "learning_rate": 3.592861948849416e-06, + "loss": 0.0463, + "num_input_tokens_seen": 1502768, + "step": 1745 + }, + { + "epoch": 4.237288135593221, + "grad_norm": 0.01323059480637312, + "learning_rate": 3.484462415775333e-06, + "loss": 0.0429, + "num_input_tokens_seen": 1506992, + "step": 1750 + }, + { + "epoch": 4.249394673123486, + "grad_norm": 0.1997198611497879, + "learning_rate": 3.377600675949527e-06, + "loss": 0.0035, + "num_input_tokens_seen": 1511472, + "step": 1755 + }, + { + "epoch": 4.261501210653753, + "grad_norm": 9.309453010559082, + "learning_rate": 3.272284367166825e-06, + "loss": 0.0395, + "num_input_tokens_seen": 1515824, + "step": 1760 + }, + { + "epoch": 4.27360774818402, + "grad_norm": 1.514168620109558, + "learning_rate": 3.1685210167645335e-06, + "loss": 0.0337, + "num_input_tokens_seen": 1520176, + "step": 1765 + }, + { + "epoch": 4.280871670702179, + "eval_loss": 0.4314914643764496, + "eval_runtime": 0.8115, + "eval_samples_per_second": 452.254, + "eval_steps_per_second": 56.686, + "num_input_tokens_seen": 1522544, + "step": 1768 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.23039455711841583, + "learning_rate": 3.0663180410843982e-06, + "loss": 0.008, + "num_input_tokens_seen": 1524336, + "step": 1770 + }, + { + "epoch": 4.297820823244552, + "grad_norm": 0.17007200419902802, + "learning_rate": 2.9656827449425494e-06, + "loss": 0.1379, + "num_input_tokens_seen": 1528560, + "step": 1775 + }, + { + "epoch": 4.309927360774818, + "grad_norm": 5.092523097991943, + "learning_rate": 2.86662232110739e-06, + "loss": 0.0391, + "num_input_tokens_seen": 1532720, + "step": 1780 + }, + { + "epoch": 4.322033898305085, + "grad_norm": 8.858246803283691, + "learning_rate": 2.7691438497855134e-06, + "loss": 0.0481, + "num_input_tokens_seen": 1536944, + "step": 1785 + }, + { + "epoch": 4.3341404358353515, + "grad_norm": 0.16653333604335785, + "learning_rate": 2.673254298115646e-06, + "loss": 0.0365, + "num_input_tokens_seen": 1541168, + "step": 1790 + }, + { + "epoch": 4.346246973365617, + "grad_norm": 0.057360630482435226, + "learning_rate": 2.5789605196706674e-06, + "loss": 0.0094, + "num_input_tokens_seen": 1545456, + "step": 1795 + }, + { + "epoch": 4.358353510895884, + "grad_norm": 18.321725845336914, + "learning_rate": 2.4862692539677906e-06, + "loss": 0.0798, + "num_input_tokens_seen": 1549872, + "step": 1800 + }, + { + "epoch": 4.37046004842615, + "grad_norm": 0.05611402168869972, + "learning_rate": 2.3951871259868503e-06, + "loss": 0.113, + "num_input_tokens_seen": 1554288, + "step": 1805 + }, + { + "epoch": 4.3825665859564165, + "grad_norm": 7.665430068969727, + "learning_rate": 2.3057206456967905e-06, + "loss": 0.1113, + "num_input_tokens_seen": 1558384, + "step": 1810 + }, + { + "epoch": 4.394673123486683, + "grad_norm": 9.430697441101074, + "learning_rate": 2.217876207590375e-06, + "loss": 0.0523, + "num_input_tokens_seen": 1562544, + "step": 1815 + }, + { + "epoch": 4.406779661016949, + "grad_norm": 0.0549406073987484, + "learning_rate": 2.131660090227139e-06, + "loss": 0.0659, + "num_input_tokens_seen": 1567216, + "step": 1820 + }, + { + "epoch": 4.418886198547216, + "grad_norm": 0.08962647616863251, + "learning_rate": 2.0470784557846652e-06, + "loss": 0.0756, + "num_input_tokens_seen": 1571568, + "step": 1825 + }, + { + "epoch": 4.4309927360774815, + "grad_norm": 0.09955435991287231, + "learning_rate": 1.964137349618114e-06, + "loss": 0.0018, + "num_input_tokens_seen": 1575792, + "step": 1830 + }, + { + "epoch": 4.443099273607748, + "grad_norm": 0.7829030156135559, + "learning_rate": 1.8828426998281689e-06, + "loss": 0.0419, + "num_input_tokens_seen": 1580080, + "step": 1835 + }, + { + "epoch": 4.455205811138015, + "grad_norm": 3.134791851043701, + "learning_rate": 1.8032003168373306e-06, + "loss": 0.0692, + "num_input_tokens_seen": 1584112, + "step": 1840 + }, + { + "epoch": 4.467312348668281, + "grad_norm": 1.7574918270111084, + "learning_rate": 1.7252158929746131e-06, + "loss": 0.0456, + "num_input_tokens_seen": 1588400, + "step": 1845 + }, + { + "epoch": 4.479418886198547, + "grad_norm": 27.999475479125977, + "learning_rate": 1.6488950020686955e-06, + "loss": 0.0504, + "num_input_tokens_seen": 1592816, + "step": 1850 + }, + { + "epoch": 4.491525423728813, + "grad_norm": 0.16101866960525513, + "learning_rate": 1.5742430990495466e-06, + "loss": 0.0573, + "num_input_tokens_seen": 1597296, + "step": 1855 + }, + { + "epoch": 4.50363196125908, + "grad_norm": 0.11599753797054291, + "learning_rate": 1.5012655195585368e-06, + "loss": 0.0293, + "num_input_tokens_seen": 1601648, + "step": 1860 + }, + { + "epoch": 4.5157384987893465, + "grad_norm": 6.17954683303833, + "learning_rate": 1.4299674795670764e-06, + "loss": 0.1156, + "num_input_tokens_seen": 1605936, + "step": 1865 + }, + { + "epoch": 4.527845036319612, + "grad_norm": 1.049548625946045, + "learning_rate": 1.360354075003828e-06, + "loss": 0.126, + "num_input_tokens_seen": 1610096, + "step": 1870 + }, + { + "epoch": 4.532687651331719, + "eval_loss": 0.42201921343803406, + "eval_runtime": 0.693, + "eval_samples_per_second": 529.558, + "eval_steps_per_second": 66.375, + "num_input_tokens_seen": 1611760, + "step": 1872 + }, + { + "epoch": 4.539951573849879, + "grad_norm": 13.409820556640625, + "learning_rate": 1.2924302813904582e-06, + "loss": 0.0436, + "num_input_tokens_seen": 1614384, + "step": 1875 + }, + { + "epoch": 4.552058111380145, + "grad_norm": 3.9212989807128906, + "learning_rate": 1.226200953486037e-06, + "loss": 0.0591, + "num_input_tokens_seen": 1618800, + "step": 1880 + }, + { + "epoch": 4.5641646489104115, + "grad_norm": 0.7789947986602783, + "learning_rate": 1.1616708249400449e-06, + "loss": 0.0027, + "num_input_tokens_seen": 1622960, + "step": 1885 + }, + { + "epoch": 4.576271186440678, + "grad_norm": 16.51002311706543, + "learning_rate": 1.0988445079540388e-06, + "loss": 0.037, + "num_input_tokens_seen": 1627056, + "step": 1890 + }, + { + "epoch": 4.588377723970944, + "grad_norm": 0.03825072944164276, + "learning_rate": 1.0377264929520125e-06, + "loss": 0.0205, + "num_input_tokens_seen": 1631408, + "step": 1895 + }, + { + "epoch": 4.600484261501211, + "grad_norm": 13.03893756866455, + "learning_rate": 9.783211482594285e-07, + "loss": 0.0687, + "num_input_tokens_seen": 1635888, + "step": 1900 + }, + { + "epoch": 4.6125907990314765, + "grad_norm": 0.19233529269695282, + "learning_rate": 9.206327197910203e-07, + "loss": 0.0049, + "num_input_tokens_seen": 1640176, + "step": 1905 + }, + { + "epoch": 4.624697336561743, + "grad_norm": 9.149880409240723, + "learning_rate": 8.646653307473079e-07, + "loss": 0.056, + "num_input_tokens_seen": 1644528, + "step": 1910 + }, + { + "epoch": 4.63680387409201, + "grad_norm": 0.09057964384555817, + "learning_rate": 8.10422981319911e-07, + "loss": 0.002, + "num_input_tokens_seen": 1649264, + "step": 1915 + }, + { + "epoch": 4.648910411622276, + "grad_norm": 0.645796537399292, + "learning_rate": 7.579095484056192e-07, + "loss": 0.0111, + "num_input_tokens_seen": 1653808, + "step": 1920 + }, + { + "epoch": 4.661016949152542, + "grad_norm": 0.02393440343439579, + "learning_rate": 7.07128785329314e-07, + "loss": 0.0023, + "num_input_tokens_seen": 1658288, + "step": 1925 + }, + { + "epoch": 4.673123486682809, + "grad_norm": 0.03354793041944504, + "learning_rate": 6.580843215757082e-07, + "loss": 0.0228, + "num_input_tokens_seen": 1662576, + "step": 1930 + }, + { + "epoch": 4.685230024213075, + "grad_norm": 1.0874401330947876, + "learning_rate": 6.107796625299117e-07, + "loss": 0.0221, + "num_input_tokens_seen": 1667056, + "step": 1935 + }, + { + "epoch": 4.697336561743342, + "grad_norm": 0.94743412733078, + "learning_rate": 5.652181892269181e-07, + "loss": 0.0733, + "num_input_tokens_seen": 1671536, + "step": 1940 + }, + { + "epoch": 4.709443099273607, + "grad_norm": 0.04832937568426132, + "learning_rate": 5.214031581099149e-07, + "loss": 0.0023, + "num_input_tokens_seen": 1675888, + "step": 1945 + }, + { + "epoch": 4.721549636803874, + "grad_norm": 12.764242172241211, + "learning_rate": 4.793377007975719e-07, + "loss": 0.0341, + "num_input_tokens_seen": 1680176, + "step": 1950 + }, + { + "epoch": 4.733656174334141, + "grad_norm": 6.990570068359375, + "learning_rate": 4.3902482386018186e-07, + "loss": 0.0568, + "num_input_tokens_seen": 1684400, + "step": 1955 + }, + { + "epoch": 4.745762711864407, + "grad_norm": 26.958158493041992, + "learning_rate": 4.004674086047905e-07, + "loss": 0.1211, + "num_input_tokens_seen": 1688816, + "step": 1960 + }, + { + "epoch": 4.757869249394673, + "grad_norm": 1.086872935295105, + "learning_rate": 3.636682108692502e-07, + "loss": 0.0408, + "num_input_tokens_seen": 1693360, + "step": 1965 + }, + { + "epoch": 4.76997578692494, + "grad_norm": 15.128409385681152, + "learning_rate": 3.2862986082524416e-07, + "loss": 0.0647, + "num_input_tokens_seen": 1697584, + "step": 1970 + }, + { + "epoch": 4.782082324455206, + "grad_norm": 7.18263053894043, + "learning_rate": 2.953548627903202e-07, + "loss": 0.0336, + "num_input_tokens_seen": 1702000, + "step": 1975 + }, + { + "epoch": 4.784503631961259, + "eval_loss": 0.4348176121711731, + "eval_runtime": 0.6821, + "eval_samples_per_second": 538.039, + "eval_steps_per_second": 67.438, + "num_input_tokens_seen": 1702832, + "step": 1976 + }, + { + "epoch": 4.7941888619854724, + "grad_norm": 0.357972115278244, + "learning_rate": 2.6384559504886166e-07, + "loss": 0.1448, + "num_input_tokens_seen": 1706416, + "step": 1980 + }, + { + "epoch": 4.806295399515738, + "grad_norm": 5.933152198791504, + "learning_rate": 2.3410430968214824e-07, + "loss": 0.0163, + "num_input_tokens_seen": 1710960, + "step": 1985 + }, + { + "epoch": 4.818401937046005, + "grad_norm": 21.378908157348633, + "learning_rate": 2.0613313240735454e-07, + "loss": 0.1048, + "num_input_tokens_seen": 1715440, + "step": 1990 + }, + { + "epoch": 4.830508474576272, + "grad_norm": 0.03769972547888756, + "learning_rate": 1.7993406242563238e-07, + "loss": 0.0295, + "num_input_tokens_seen": 1719728, + "step": 1995 + }, + { + "epoch": 4.842615012106537, + "grad_norm": 0.04536456614732742, + "learning_rate": 1.5550897227922523e-07, + "loss": 0.0007, + "num_input_tokens_seen": 1724272, + "step": 2000 + }, + { + "epoch": 4.854721549636804, + "grad_norm": 12.351763725280762, + "learning_rate": 1.3285960771761697e-07, + "loss": 0.064, + "num_input_tokens_seen": 1728560, + "step": 2005 + }, + { + "epoch": 4.86682808716707, + "grad_norm": 11.032571792602539, + "learning_rate": 1.119875875727705e-07, + "loss": 0.0289, + "num_input_tokens_seen": 1733104, + "step": 2010 + }, + { + "epoch": 4.878934624697337, + "grad_norm": 21.032617568969727, + "learning_rate": 9.289440364341485e-08, + "loss": 0.0127, + "num_input_tokens_seen": 1737264, + "step": 2015 + }, + { + "epoch": 4.891041162227603, + "grad_norm": 3.019296169281006, + "learning_rate": 7.558142058842754e-08, + "loss": 0.0664, + "num_input_tokens_seen": 1741424, + "step": 2020 + }, + { + "epoch": 4.903147699757869, + "grad_norm": 0.06446848809719086, + "learning_rate": 6.004987582929055e-08, + "loss": 0.0657, + "num_input_tokens_seen": 1745648, + "step": 2025 + }, + { + "epoch": 4.915254237288136, + "grad_norm": 15.37187385559082, + "learning_rate": 4.63008794616554e-08, + "loss": 0.045, + "num_input_tokens_seen": 1749872, + "step": 2030 + }, + { + "epoch": 4.927360774818402, + "grad_norm": 0.0873086079955101, + "learning_rate": 3.433541417599551e-08, + "loss": 0.0431, + "num_input_tokens_seen": 1754288, + "step": 2035 + }, + { + "epoch": 4.939467312348668, + "grad_norm": 0.19230781495571136, + "learning_rate": 2.4154335187365207e-08, + "loss": 0.0332, + "num_input_tokens_seen": 1758640, + "step": 2040 + }, + { + "epoch": 4.951573849878935, + "grad_norm": 0.0936799943447113, + "learning_rate": 1.5758370174284722e-08, + "loss": 0.0602, + "num_input_tokens_seen": 1762928, + "step": 2045 + }, + { + "epoch": 4.963680387409201, + "grad_norm": 0.07743958383798599, + "learning_rate": 9.14811922672898e-09, + "loss": 0.0118, + "num_input_tokens_seen": 1767344, + "step": 2050 + }, + { + "epoch": 4.9757869249394675, + "grad_norm": 0.36676138639450073, + "learning_rate": 4.324054803223065e-09, + "loss": 0.0392, + "num_input_tokens_seen": 1771632, + "step": 2055 + }, + { + "epoch": 4.987893462469733, + "grad_norm": 11.666873931884766, + "learning_rate": 1.286521697091425e-09, + "loss": 0.0333, + "num_input_tokens_seen": 1775728, + "step": 2060 + }, + { + "epoch": 5.0, + "grad_norm": 0.10359911620616913, + "learning_rate": 3.5737011805370145e-11, + "loss": 0.0653, + "num_input_tokens_seen": 1780000, + "step": 2065 + }, + { + "epoch": 5.0, + "num_input_tokens_seen": 1780000, + "step": 2065, + "total_flos": 1.039320047616e+16, + "train_loss": 0.16683834154997698, + "train_runtime": 1017.6301, + "train_samples_per_second": 16.219, + "train_steps_per_second": 2.029 + } + ], + "logging_steps": 5, + "max_steps": 2065, + "num_input_tokens_seen": 1780000, + "num_train_epochs": 5, + "save_steps": 104, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.039320047616e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..229467d --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc3066df7979caafc538601d30e2b2585d0e6731f7b4b98fe324aea6b808a11f +size 6289 diff --git a/training_eval_loss.png b/training_eval_loss.png new file mode 100644 index 0000000..6e74591 Binary files /dev/null and b/training_eval_loss.png differ diff --git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000..bca358d Binary files /dev/null and b/training_loss.png differ