commit c242a5c6b9a8bd63b17400d3d3a86a6c20b43ed4 Author: ModelHub XC Date: Wed Apr 22 00:20:58 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: rbelanec/train_rte_42_1774791065 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..b740891 --- /dev/null +++ b/README.md @@ -0,0 +1,81 @@ +--- +library_name: transformers +license: llama3.2 +base_model: meta-llama/Llama-3.2-1B-Instruct +tags: +- peft-factory +- full +- llama-factory +- generated_from_trainer +model-index: +- name: train_rte_42_1774791065 + results: [] +--- + + + +# train_rte_42_1774791065 + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on the rte dataset. +It achieves the following results on the evaluation set: +- Loss: 0.1407 +- Num Input Tokens Seen: 2035272 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 5e-05 +- train_batch_size: 8 +- eval_batch_size: 8 +- seed: 42 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 5 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | Input Tokens Seen | +|:-------------:|:------:|:----:|:---------------:|:-----------------:| +| 0.2508 | 0.2527 | 71 | 0.1407 | 105024 | +| 0.1769 | 0.5053 | 142 | 0.1558 | 209536 | +| 0.1924 | 0.7580 | 213 | 0.1600 | 312576 | +| 0.1956 | 1.0107 | 284 | 0.1684 | 414040 | +| 0.1589 | 1.2633 | 355 | 0.1601 | 517656 | +| 0.1947 | 1.5160 | 426 | 0.1815 | 624344 | +| 0.1825 | 1.7687 | 497 | 0.1647 | 725656 | +| 0.1568 | 2.0214 | 568 | 0.1555 | 821416 | +| 0.1597 | 2.2740 | 639 | 0.1567 | 926760 | +| 0.1431 | 2.5267 | 710 | 0.1639 | 1025320 | +| 0.1986 | 2.7794 | 781 | 0.1541 | 1128104 | +| 0.137 | 3.0320 | 852 | 0.1852 | 1229440 | +| 0.1422 | 3.2847 | 923 | 0.1646 | 1332544 | +| 0.0911 | 3.5374 | 994 | 0.1804 | 1438336 | +| 0.1203 | 3.7900 | 1065 | 0.1771 | 1539072 | +| 0.0551 | 4.0427 | 1136 | 0.1983 | 1642696 | +| 0.0577 | 4.2954 | 1207 | 0.3402 | 1743624 | +| 0.0319 | 4.5480 | 1278 | 0.3532 | 1849416 | +| 0.0846 | 4.8007 | 1349 | 0.3423 | 1954568 | + + +### Framework versions + +- Transformers 4.51.3 +- Pytorch 2.10.0+cu128 +- Datasets 4.0.0 +- Tokenizers 0.21.4 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..ac55187 --- /dev/null +++ b/all_results.json @@ -0,0 +1,13 @@ +{ + "epoch": 5.0, + "eval_loss": 0.14072927832603455, + "eval_runtime": 0.5968, + "eval_samples_per_second": 417.253, + "eval_steps_per_second": 53.623, + "num_input_tokens_seen": 2035272, + "total_flos": 1.1883702201974784e+16, + "train_loss": 0.17133487164477065, + "train_runtime": 699.7603, + "train_samples_per_second": 16.013, + "train_steps_per_second": 2.008 +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..5a2b93f --- /dev/null +++ b/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/eval_results.json b/eval_results.json new file mode 100644 index 0000000..fb3d52a --- /dev/null +++ b/eval_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 5.0, + "eval_loss": 0.14072927832603455, + "eval_runtime": 0.5968, + "eval_samples_per_second": 417.253, + "eval_steps_per_second": 53.623, + "num_input_tokens_seen": 2035272 +} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..2b8ae57 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.3" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..f4d01e7 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f03cff080234d858206e7fd3ce0da58830d2a503759adb77c9db1fd9daf44d96 +size 4943274328 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..14daf45 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..ddc3ce0 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2069 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/train.yaml b/train.yaml new file mode 100644 index 0000000..42c2f35 --- /dev/null +++ b/train.yaml @@ -0,0 +1,55 @@ +seed: 42 + +### model +model_name_or_path: meta-llama/Llama-3.2-1B-Instruct +trust_remote_code: true +flash_attn: auto +use_cache: false + +### method +stage: sft +do_train: true +finetuning_type: full + +### dataset +dataset: rte +template: llama3 +cutoff_len: 2048 +overwrite_cache: true +preprocessing_num_workers: 4 +dataloader_num_workers: 4 +packing: false + +### output +output_dir: saves_bts_preliminary/base/llama-3.2-1b-instruct/train_rte_42_1774791065 +logging_steps: 5 +save_steps: 0.05 +overwrite_output_dir: true +save_only_model: false +plot_loss: true +include_num_input_tokens_seen: true +push_to_hub: true +push_to_hub_organization: rbelanec +load_best_model_at_end: true +save_total_limit: 1 + +### train +per_device_train_batch_size: 8 +learning_rate: 5.0e-5 +num_train_epochs: 5 +weight_decay: 1.0e-5 +lr_scheduler_type: cosine +bf16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null +warmup_ratio: 0.1 +optim: adamw_torch +report_to: +- wandb +run_name: base_llama-3.2-1b-instruct_train_rte_42_1774791065 + +### eval +per_device_eval_batch_size: 8 +eval_strategy: steps +eval_steps: 0.05 +val_size: 0.1 \ No newline at end of file diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..ff70e3e --- /dev/null +++ b/train_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 5.0, + "num_input_tokens_seen": 2035272, + "total_flos": 1.1883702201974784e+16, + "train_loss": 0.17133487164477065, + "train_runtime": 699.7603, + "train_samples_per_second": 16.013, + "train_steps_per_second": 2.008 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000..72f0be1 --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,301 @@ +{"current_steps": 5, "total_steps": 1405, "loss": 0.6704, "lr": 1.4184397163120568e-06, "epoch": 0.017793594306049824, "percentage": 0.36, "elapsed_time": "0:00:00", "remaining_time": "0:03:57", "throughput": 9261.64, "total_tokens": 7872} +{"current_steps": 10, "total_steps": 1405, "loss": 0.2548, "lr": 3.1914893617021277e-06, "epoch": 0.03558718861209965, "percentage": 0.71, "elapsed_time": "0:00:01", "remaining_time": "0:03:00", "throughput": 11397.44, "total_tokens": 14784} +{"current_steps": 15, "total_steps": 1405, "loss": 0.9227, "lr": 4.964539007092199e-06, "epoch": 0.05338078291814947, "percentage": 1.07, "elapsed_time": "0:00:01", "remaining_time": "0:02:45", "throughput": 13094.26, "total_tokens": 23424} +{"current_steps": 20, "total_steps": 1405, "loss": 0.1819, "lr": 6.73758865248227e-06, "epoch": 0.0711743772241993, "percentage": 1.42, "elapsed_time": "0:00:02", "remaining_time": "0:02:34", "throughput": 13389.34, "total_tokens": 29824} +{"current_steps": 25, "total_steps": 1405, "loss": 0.228, "lr": 8.510638297872341e-06, "epoch": 0.08896797153024912, "percentage": 1.78, "elapsed_time": "0:00:02", "remaining_time": "0:02:28", "throughput": 14061.51, "total_tokens": 37824} +{"current_steps": 30, "total_steps": 1405, "loss": 0.1572, "lr": 1.0283687943262411e-05, "epoch": 0.10676156583629894, "percentage": 2.14, "elapsed_time": "0:00:03", "remaining_time": "0:02:23", "throughput": 14260.87, "total_tokens": 44608} +{"current_steps": 35, "total_steps": 1405, "loss": 0.1609, "lr": 1.2056737588652483e-05, "epoch": 0.12455516014234876, "percentage": 2.49, "elapsed_time": "0:00:03", "remaining_time": "0:02:19", "throughput": 14545.74, "total_tokens": 51968} +{"current_steps": 40, "total_steps": 1405, "loss": 0.2143, "lr": 1.3829787234042554e-05, "epoch": 0.1423487544483986, "percentage": 2.85, "elapsed_time": "0:00:04", "remaining_time": "0:02:17", "throughput": 14773.94, "total_tokens": 59456} +{"current_steps": 45, "total_steps": 1405, "loss": 0.2034, "lr": 1.5602836879432626e-05, "epoch": 0.1601423487544484, "percentage": 3.2, "elapsed_time": "0:00:04", "remaining_time": "0:02:14", "throughput": 14905.26, "total_tokens": 66496} +{"current_steps": 50, "total_steps": 1405, "loss": 0.2702, "lr": 1.7375886524822697e-05, "epoch": 0.17793594306049823, "percentage": 3.56, "elapsed_time": "0:00:04", "remaining_time": "0:02:12", "throughput": 14984.9, "total_tokens": 73408} +{"current_steps": 55, "total_steps": 1405, "loss": 0.1793, "lr": 1.9148936170212766e-05, "epoch": 0.19572953736654805, "percentage": 3.91, "elapsed_time": "0:00:05", "remaining_time": "0:02:11", "throughput": 15094.9, "total_tokens": 80576} +{"current_steps": 60, "total_steps": 1405, "loss": 0.161, "lr": 2.0921985815602837e-05, "epoch": 0.21352313167259787, "percentage": 4.27, "elapsed_time": "0:00:05", "remaining_time": "0:02:09", "throughput": 15238.44, "total_tokens": 88256} +{"current_steps": 65, "total_steps": 1405, "loss": 0.1808, "lr": 2.269503546099291e-05, "epoch": 0.2313167259786477, "percentage": 4.63, "elapsed_time": "0:00:06", "remaining_time": "0:02:08", "throughput": 15388.52, "total_tokens": 96256} +{"current_steps": 70, "total_steps": 1405, "loss": 0.2508, "lr": 2.446808510638298e-05, "epoch": 0.2491103202846975, "percentage": 4.98, "elapsed_time": "0:00:06", "remaining_time": "0:02:07", "throughput": 15441.44, "total_tokens": 103424} +{"current_steps": 71, "total_steps": 1405, "eval_loss": 0.14072927832603455, "epoch": 0.2526690391459075, "percentage": 5.05, "elapsed_time": "0:00:07", "remaining_time": "0:02:18", "throughput": 14240.56, "total_tokens": 105024} +{"current_steps": 75, "total_steps": 1405, "loss": 0.143, "lr": 2.624113475177305e-05, "epoch": 0.2669039145907473, "percentage": 5.34, "elapsed_time": "0:00:55", "remaining_time": "0:16:19", "throughput": 2001.71, "total_tokens": 110528} +{"current_steps": 80, "total_steps": 1405, "loss": 0.2326, "lr": 2.8014184397163124e-05, "epoch": 0.2846975088967972, "percentage": 5.69, "elapsed_time": "0:00:55", "remaining_time": "0:15:21", "throughput": 2110.33, "total_tokens": 117440} +{"current_steps": 85, "total_steps": 1405, "loss": 0.2053, "lr": 2.9787234042553192e-05, "epoch": 0.302491103202847, "percentage": 6.05, "elapsed_time": "0:00:56", "remaining_time": "0:14:31", "throughput": 2236.67, "total_tokens": 125504} +{"current_steps": 90, "total_steps": 1405, "loss": 0.2409, "lr": 3.156028368794326e-05, "epoch": 0.3202846975088968, "percentage": 6.41, "elapsed_time": "0:00:56", "remaining_time": "0:13:46", "throughput": 2340.64, "total_tokens": 132352} +{"current_steps": 95, "total_steps": 1405, "loss": 0.2063, "lr": 3.3333333333333335e-05, "epoch": 0.33807829181494664, "percentage": 6.76, "elapsed_time": "0:00:56", "remaining_time": "0:13:05", "throughput": 2443.01, "total_tokens": 139200} +{"current_steps": 100, "total_steps": 1405, "loss": 0.244, "lr": 3.5106382978723407e-05, "epoch": 0.35587188612099646, "percentage": 7.12, "elapsed_time": "0:00:57", "remaining_time": "0:12:29", "throughput": 2574.04, "total_tokens": 147904} +{"current_steps": 105, "total_steps": 1405, "loss": 0.183, "lr": 3.687943262411347e-05, "epoch": 0.3736654804270463, "percentage": 7.47, "elapsed_time": "0:00:57", "remaining_time": "0:11:56", "throughput": 2664.85, "total_tokens": 154240} +{"current_steps": 110, "total_steps": 1405, "loss": 0.1615, "lr": 3.865248226950355e-05, "epoch": 0.3914590747330961, "percentage": 7.83, "elapsed_time": "0:00:58", "remaining_time": "0:11:26", "throughput": 2768.47, "total_tokens": 161472} +{"current_steps": 115, "total_steps": 1405, "loss": 0.1703, "lr": 4.0425531914893614e-05, "epoch": 0.4092526690391459, "percentage": 8.19, "elapsed_time": "0:00:58", "remaining_time": "0:10:59", "throughput": 2862.5, "total_tokens": 168192} +{"current_steps": 120, "total_steps": 1405, "loss": 0.246, "lr": 4.219858156028369e-05, "epoch": 0.42704626334519574, "percentage": 8.54, "elapsed_time": "0:00:59", "remaining_time": "0:10:33", "throughput": 2951.15, "total_tokens": 174656} +{"current_steps": 125, "total_steps": 1405, "loss": 0.1665, "lr": 4.3971631205673764e-05, "epoch": 0.44483985765124556, "percentage": 8.9, "elapsed_time": "0:00:59", "remaining_time": "0:10:10", "throughput": 3046.8, "total_tokens": 181632} +{"current_steps": 130, "total_steps": 1405, "loss": 0.1695, "lr": 4.574468085106383e-05, "epoch": 0.4626334519572954, "percentage": 9.25, "elapsed_time": "0:01:00", "remaining_time": "0:09:49", "throughput": 3184.81, "total_tokens": 191488} +{"current_steps": 135, "total_steps": 1405, "loss": 0.1764, "lr": 4.751773049645391e-05, "epoch": 0.4804270462633452, "percentage": 9.61, "elapsed_time": "0:01:00", "remaining_time": "0:09:29", "throughput": 3282.77, "total_tokens": 198848} +{"current_steps": 140, "total_steps": 1405, "loss": 0.1769, "lr": 4.929078014184397e-05, "epoch": 0.498220640569395, "percentage": 9.96, "elapsed_time": "0:01:01", "remaining_time": "0:09:11", "throughput": 3394.7, "total_tokens": 207232} +{"current_steps": 142, "total_steps": 1405, "eval_loss": 0.15581394731998444, "epoch": 0.505338078291815, "percentage": 10.11, "elapsed_time": "0:01:01", "remaining_time": "0:09:09", "throughput": 3393.08, "total_tokens": 209536} +{"current_steps": 145, "total_steps": 1405, "loss": 0.2155, "lr": 4.9999305045921804e-05, "epoch": 0.5160142348754448, "percentage": 10.32, "elapsed_time": "0:01:21", "remaining_time": "0:11:46", "throughput": 2630.39, "total_tokens": 213952} +{"current_steps": 150, "total_steps": 1405, "loss": 0.185, "lr": 4.9995058244251644e-05, "epoch": 0.5338078291814946, "percentage": 10.68, "elapsed_time": "0:01:21", "remaining_time": "0:11:24", "throughput": 2706.78, "total_tokens": 221376} +{"current_steps": 155, "total_steps": 1405, "loss": 0.2471, "lr": 4.998695138156149e-05, "epoch": 0.5516014234875445, "percentage": 11.03, "elapsed_time": "0:01:22", "remaining_time": "0:11:03", "throughput": 2783.85, "total_tokens": 228928} +{"current_steps": 160, "total_steps": 1405, "loss": 0.2061, "lr": 4.997498570981822e-05, "epoch": 0.5693950177935944, "percentage": 11.39, "elapsed_time": "0:01:22", "remaining_time": "0:10:43", "throughput": 2858.73, "total_tokens": 236352} +{"current_steps": 165, "total_steps": 1405, "loss": 0.1488, "lr": 4.995916307691601e-05, "epoch": 0.5871886120996441, "percentage": 11.74, "elapsed_time": "0:01:23", "remaining_time": "0:10:24", "throughput": 2939.86, "total_tokens": 244416} +{"current_steps": 170, "total_steps": 1405, "loss": 0.1625, "lr": 4.993948592639104e-05, "epoch": 0.604982206405694, "percentage": 12.1, "elapsed_time": "0:01:23", "remaining_time": "0:10:07", "throughput": 3008.87, "total_tokens": 251456} +{"current_steps": 175, "total_steps": 1405, "loss": 0.1635, "lr": 4.991595729704405e-05, "epoch": 0.6227758007117438, "percentage": 12.46, "elapsed_time": "0:01:24", "remaining_time": "0:09:50", "throughput": 3081.28, "total_tokens": 258880} +{"current_steps": 180, "total_steps": 1405, "loss": 0.163, "lr": 4.9888580822471086e-05, "epoch": 0.6405693950177936, "percentage": 12.81, "elapsed_time": "0:01:24", "remaining_time": "0:09:34", "throughput": 3140.45, "total_tokens": 265152} +{"current_steps": 185, "total_steps": 1405, "loss": 0.1599, "lr": 4.985736073050237e-05, "epoch": 0.6583629893238434, "percentage": 13.17, "elapsed_time": "0:01:24", "remaining_time": "0:09:19", "throughput": 3211.35, "total_tokens": 272576} +{"current_steps": 190, "total_steps": 1405, "loss": 0.1669, "lr": 4.982230184254933e-05, "epoch": 0.6761565836298933, "percentage": 13.52, "elapsed_time": "0:01:25", "remaining_time": "0:09:05", "throughput": 3278.9, "total_tokens": 279744} +{"current_steps": 195, "total_steps": 1405, "loss": 0.1659, "lr": 4.9783409572860105e-05, "epoch": 0.693950177935943, "percentage": 13.88, "elapsed_time": "0:01:25", "remaining_time": "0:08:52", "throughput": 3353.87, "total_tokens": 287680} +{"current_steps": 200, "total_steps": 1405, "loss": 0.1729, "lr": 4.974068992768331e-05, "epoch": 0.7117437722419929, "percentage": 14.23, "elapsed_time": "0:01:26", "remaining_time": "0:08:39", "throughput": 3417.1, "total_tokens": 294592} +{"current_steps": 205, "total_steps": 1405, "loss": 0.2655, "lr": 4.9694149504340517e-05, "epoch": 0.7295373665480427, "percentage": 14.59, "elapsed_time": "0:01:26", "remaining_time": "0:08:27", "throughput": 3479.09, "total_tokens": 301440} +{"current_steps": 210, "total_steps": 1405, "loss": 0.1924, "lr": 4.964379549020741e-05, "epoch": 0.7473309608540926, "percentage": 14.95, "elapsed_time": "0:01:27", "remaining_time": "0:08:15", "throughput": 3541.9, "total_tokens": 308416} +{"current_steps": 213, "total_steps": 1405, "eval_loss": 0.1600140929222107, "epoch": 0.7580071174377224, "percentage": 15.16, "elapsed_time": "0:01:27", "remaining_time": "0:08:11", "throughput": 3557.0, "total_tokens": 312576} +{"current_steps": 215, "total_steps": 1405, "loss": 0.1666, "lr": 4.958963566160384e-05, "epoch": 0.7651245551601423, "percentage": 15.3, "elapsed_time": "0:01:49", "remaining_time": "0:10:07", "throughput": 2875.22, "total_tokens": 315328} +{"current_steps": 220, "total_steps": 1405, "loss": 0.1668, "lr": 4.953167838259285e-05, "epoch": 0.7829181494661922, "percentage": 15.66, "elapsed_time": "0:01:50", "remaining_time": "0:09:53", "throughput": 2930.4, "total_tokens": 322688} +{"current_steps": 225, "total_steps": 1405, "loss": 0.1826, "lr": 4.946993260368904e-05, "epoch": 0.800711743772242, "percentage": 16.01, "elapsed_time": "0:01:50", "remaining_time": "0:09:39", "throughput": 2978.7, "total_tokens": 329280} +{"current_steps": 230, "total_steps": 1405, "loss": 0.1488, "lr": 4.940440786047628e-05, "epoch": 0.8185053380782918, "percentage": 16.37, "elapsed_time": "0:01:50", "remaining_time": "0:09:27", "throughput": 3035.34, "total_tokens": 336896} +{"current_steps": 235, "total_steps": 1405, "loss": 0.2852, "lr": 4.933511427213511e-05, "epoch": 0.8362989323843416, "percentage": 16.73, "elapsed_time": "0:01:51", "remaining_time": "0:09:14", "throughput": 3088.26, "total_tokens": 344128} +{"current_steps": 240, "total_steps": 1405, "loss": 0.1901, "lr": 4.926206253988001e-05, "epoch": 0.8540925266903915, "percentage": 17.08, "elapsed_time": "0:01:51", "remaining_time": "0:09:02", "throughput": 3137.08, "total_tokens": 350912} +{"current_steps": 245, "total_steps": 1405, "loss": 0.1972, "lr": 4.91852639453068e-05, "epoch": 0.8718861209964412, "percentage": 17.44, "elapsed_time": "0:01:52", "remaining_time": "0:08:51", "throughput": 3188.08, "total_tokens": 358016} +{"current_steps": 250, "total_steps": 1405, "loss": 0.3136, "lr": 4.910473034865033e-05, "epoch": 0.8896797153024911, "percentage": 17.79, "elapsed_time": "0:01:52", "remaining_time": "0:08:40", "throughput": 3235.7, "total_tokens": 364736} +{"current_steps": 255, "total_steps": 1405, "loss": 0.1648, "lr": 4.902047418695292e-05, "epoch": 0.9074733096085409, "percentage": 18.15, "elapsed_time": "0:01:53", "remaining_time": "0:08:30", "throughput": 3284.51, "total_tokens": 371648} +{"current_steps": 260, "total_steps": 1405, "loss": 0.1706, "lr": 4.893250847214369e-05, "epoch": 0.9252669039145908, "percentage": 18.51, "elapsed_time": "0:01:53", "remaining_time": "0:08:20", "throughput": 3337.95, "total_tokens": 379200} +{"current_steps": 265, "total_steps": 1405, "loss": 0.2379, "lr": 4.884084678902898e-05, "epoch": 0.9430604982206405, "percentage": 18.86, "elapsed_time": "0:01:54", "remaining_time": "0:08:10", "throughput": 3394.56, "total_tokens": 387200} +{"current_steps": 270, "total_steps": 1405, "loss": 0.1618, "lr": 4.874550329319457e-05, "epoch": 0.9608540925266904, "percentage": 19.22, "elapsed_time": "0:01:54", "remaining_time": "0:08:01", "throughput": 3450.89, "total_tokens": 395264} +{"current_steps": 275, "total_steps": 1405, "loss": 0.1637, "lr": 4.864649270881944e-05, "epoch": 0.9786476868327402, "percentage": 19.57, "elapsed_time": "0:01:54", "remaining_time": "0:07:52", "throughput": 3498.0, "total_tokens": 402176} +{"current_steps": 280, "total_steps": 1405, "loss": 0.1956, "lr": 4.8543830326401954e-05, "epoch": 0.99644128113879, "percentage": 19.93, "elapsed_time": "0:01:55", "remaining_time": "0:07:43", "throughput": 3551.83, "total_tokens": 409984} +{"current_steps": 284, "total_steps": 1405, "eval_loss": 0.16843144595623016, "epoch": 1.01067615658363, "percentage": 20.21, "elapsed_time": "0:01:56", "remaining_time": "0:07:39", "throughput": 3556.6, "total_tokens": 414040} +{"current_steps": 285, "total_steps": 1405, "loss": 0.1483, "lr": 4.843753200039851e-05, "epoch": 1.0142348754448398, "percentage": 20.28, "elapsed_time": "0:02:18", "remaining_time": "0:09:04", "throughput": 2994.8, "total_tokens": 415256} +{"current_steps": 290, "total_steps": 1405, "loss": 0.1508, "lr": 4.832761414677503e-05, "epoch": 1.0320284697508897, "percentage": 20.64, "elapsed_time": "0:02:19", "remaining_time": "0:08:54", "throughput": 3039.39, "total_tokens": 422808} +{"current_steps": 295, "total_steps": 1405, "loss": 0.1599, "lr": 4.8214093740471836e-05, "epoch": 1.0498220640569396, "percentage": 21.0, "elapsed_time": "0:02:19", "remaining_time": "0:08:45", "throughput": 3082.0, "total_tokens": 430104} +{"current_steps": 300, "total_steps": 1405, "loss": 0.1629, "lr": 4.8096988312782174e-05, "epoch": 1.0676156583629894, "percentage": 21.35, "elapsed_time": "0:02:19", "remaining_time": "0:08:35", "throughput": 3120.14, "total_tokens": 436760} +{"current_steps": 305, "total_steps": 1405, "loss": 0.1729, "lr": 4.7976315948644745e-05, "epoch": 1.085409252669039, "percentage": 21.71, "elapsed_time": "0:02:20", "remaining_time": "0:08:26", "throughput": 3168.2, "total_tokens": 444952} +{"current_steps": 310, "total_steps": 1405, "loss": 3.0413, "lr": 4.7852095283850866e-05, "epoch": 1.103202846975089, "percentage": 22.06, "elapsed_time": "0:02:20", "remaining_time": "0:08:17", "throughput": 3213.29, "total_tokens": 452760} +{"current_steps": 315, "total_steps": 1405, "loss": 0.1785, "lr": 4.772434550216643e-05, "epoch": 1.1209964412811388, "percentage": 22.42, "elapsed_time": "0:02:21", "remaining_time": "0:08:08", "throughput": 3243.84, "total_tokens": 458392} +{"current_steps": 320, "total_steps": 1405, "loss": 0.1666, "lr": 4.7593086332369344e-05, "epoch": 1.1387900355871885, "percentage": 22.78, "elapsed_time": "0:02:21", "remaining_time": "0:08:00", "throughput": 3281.47, "total_tokens": 465112} +{"current_steps": 325, "total_steps": 1405, "loss": 0.2395, "lr": 4.74583380452027e-05, "epoch": 1.1565836298932384, "percentage": 23.13, "elapsed_time": "0:02:22", "remaining_time": "0:07:52", "throughput": 3321.41, "total_tokens": 472216} +{"current_steps": 330, "total_steps": 1405, "loss": 0.2229, "lr": 4.7320121450244394e-05, "epoch": 1.1743772241992882, "percentage": 23.49, "elapsed_time": "0:02:22", "remaining_time": "0:07:44", "throughput": 3362.68, "total_tokens": 479576} +{"current_steps": 335, "total_steps": 1405, "loss": 0.2531, "lr": 4.717845789269333e-05, "epoch": 1.1921708185053381, "percentage": 23.84, "elapsed_time": "0:02:23", "remaining_time": "0:07:36", "throughput": 3401.21, "total_tokens": 486552} +{"current_steps": 340, "total_steps": 1405, "loss": 0.2223, "lr": 4.703336925007311e-05, "epoch": 1.209964412811388, "percentage": 24.2, "elapsed_time": "0:02:23", "remaining_time": "0:07:29", "throughput": 3446.39, "total_tokens": 494616} +{"current_steps": 345, "total_steps": 1405, "loss": 0.1898, "lr": 4.68848779288534e-05, "epoch": 1.2277580071174377, "percentage": 24.56, "elapsed_time": "0:02:23", "remaining_time": "0:07:22", "throughput": 3483.18, "total_tokens": 501400} +{"current_steps": 350, "total_steps": 1405, "loss": 0.1662, "lr": 4.673300686098957e-05, "epoch": 1.2455516014234875, "percentage": 24.91, "elapsed_time": "0:02:24", "remaining_time": "0:07:15", "throughput": 3524.17, "total_tokens": 508888} +{"current_steps": 355, "total_steps": 1405, "loss": 0.1589, "lr": 4.657777950038133e-05, "epoch": 1.2633451957295374, "percentage": 25.27, "elapsed_time": "0:02:24", "remaining_time": "0:07:08", "throughput": 3573.03, "total_tokens": 517656} +{"current_steps": 355, "total_steps": 1405, "eval_loss": 0.1600693166255951, "epoch": 1.2633451957295374, "percentage": 25.27, "elapsed_time": "0:02:25", "remaining_time": "0:07:10", "throughput": 3558.09, "total_tokens": 517656} +{"current_steps": 360, "total_steps": 1405, "loss": 0.1538, "lr": 4.6419219819250636e-05, "epoch": 1.281138790035587, "percentage": 25.62, "elapsed_time": "0:03:20", "remaining_time": "0:09:41", "throughput": 2624.92, "total_tokens": 526232} +{"current_steps": 365, "total_steps": 1405, "loss": 0.1811, "lr": 4.62573523044396e-05, "epoch": 1.298932384341637, "percentage": 25.98, "elapsed_time": "0:03:20", "remaining_time": "0:09:32", "throughput": 2654.85, "total_tokens": 533400} +{"current_steps": 370, "total_steps": 1405, "loss": 0.174, "lr": 4.609220195362886e-05, "epoch": 1.3167259786476868, "percentage": 26.33, "elapsed_time": "0:03:21", "remaining_time": "0:09:23", "throughput": 2691.99, "total_tokens": 542168} +{"current_steps": 375, "total_steps": 1405, "loss": 0.1571, "lr": 4.5923794271477217e-05, "epoch": 1.3345195729537367, "percentage": 26.69, "elapsed_time": "0:03:21", "remaining_time": "0:09:14", "throughput": 2724.59, "total_tokens": 549976} +{"current_steps": 380, "total_steps": 1405, "loss": 0.1641, "lr": 4.575215526568278e-05, "epoch": 1.3523131672597866, "percentage": 27.05, "elapsed_time": "0:03:22", "remaining_time": "0:09:05", "throughput": 2753.47, "total_tokens": 557016} +{"current_steps": 385, "total_steps": 1405, "loss": 1.4814, "lr": 4.5577311442966584e-05, "epoch": 1.3701067615658362, "percentage": 27.4, "elapsed_time": "0:03:22", "remaining_time": "0:08:57", "throughput": 2784.28, "total_tokens": 564504} +{"current_steps": 390, "total_steps": 1405, "loss": 0.1601, "lr": 4.539928980497903e-05, "epoch": 1.387900355871886, "percentage": 27.76, "elapsed_time": "0:03:23", "remaining_time": "0:08:48", "throughput": 2814.37, "total_tokens": 571864} +{"current_steps": 395, "total_steps": 1405, "loss": 0.2213, "lr": 4.521811784412996e-05, "epoch": 1.405693950177936, "percentage": 28.11, "elapsed_time": "0:03:23", "remaining_time": "0:08:40", "throughput": 2840.85, "total_tokens": 578456} +{"current_steps": 400, "total_steps": 1405, "loss": 1.4493, "lr": 4.503382353934294e-05, "epoch": 1.4234875444839858, "percentage": 28.47, "elapsed_time": "0:03:24", "remaining_time": "0:08:32", "throughput": 2865.2, "total_tokens": 584600} +{"current_steps": 405, "total_steps": 1405, "loss": 0.1729, "lr": 4.4846435351734376e-05, "epoch": 1.4412811387900355, "percentage": 28.83, "elapsed_time": "0:03:24", "remaining_time": "0:08:24", "throughput": 2891.23, "total_tokens": 591128} +{"current_steps": 410, "total_steps": 1405, "loss": 0.1539, "lr": 4.4655982220218176e-05, "epoch": 1.4590747330960854, "percentage": 29.18, "elapsed_time": "0:03:24", "remaining_time": "0:08:17", "throughput": 2921.17, "total_tokens": 598552} +{"current_steps": 415, "total_steps": 1405, "loss": 0.1612, "lr": 4.446249355703661e-05, "epoch": 1.4768683274021353, "percentage": 29.54, "elapsed_time": "0:03:25", "remaining_time": "0:08:09", "throughput": 2956.98, "total_tokens": 607320} +{"current_steps": 420, "total_steps": 1405, "loss": 0.1594, "lr": 4.426599924321815e-05, "epoch": 1.4946619217081851, "percentage": 29.89, "elapsed_time": "0:03:25", "remaining_time": "0:08:02", "throughput": 2986.7, "total_tokens": 614744} +{"current_steps": 425, "total_steps": 1405, "loss": 0.1947, "lr": 4.4066529623962784e-05, "epoch": 1.512455516014235, "percentage": 30.25, "elapsed_time": "0:03:26", "remaining_time": "0:07:55", "throughput": 3019.07, "total_tokens": 622808} +{"current_steps": 426, "total_steps": 1405, "eval_loss": 0.18150445818901062, "epoch": 1.5160142348754448, "percentage": 30.32, "elapsed_time": "0:03:26", "remaining_time": "0:07:55", "throughput": 3016.74, "total_tokens": 624344} +{"current_steps": 430, "total_steps": 1405, "loss": 0.1523, "lr": 4.386411550395576e-05, "epoch": 1.5302491103202847, "percentage": 30.6, "elapsed_time": "0:03:48", "remaining_time": "0:08:38", "throughput": 2759.03, "total_tokens": 630488} +{"current_steps": 435, "total_steps": 1405, "loss": 0.1721, "lr": 4.365878814261032e-05, "epoch": 1.5480427046263345, "percentage": 30.96, "elapsed_time": "0:03:48", "remaining_time": "0:08:30", "throughput": 2788.16, "total_tokens": 638424} +{"current_steps": 440, "total_steps": 1405, "loss": 0.1551, "lr": 4.34505792492402e-05, "epoch": 1.5658362989323842, "percentage": 31.32, "elapsed_time": "0:03:49", "remaining_time": "0:08:23", "throughput": 2812.54, "total_tokens": 645208} +{"current_steps": 445, "total_steps": 1405, "loss": 0.1499, "lr": 4.323952097816269e-05, "epoch": 1.583629893238434, "percentage": 31.67, "elapsed_time": "0:03:49", "remaining_time": "0:08:15", "throughput": 2840.95, "total_tokens": 653016} +{"current_steps": 450, "total_steps": 1405, "loss": 0.1843, "lr": 4.3025645923732926e-05, "epoch": 1.601423487544484, "percentage": 32.03, "elapsed_time": "0:03:50", "remaining_time": "0:08:08", "throughput": 2865.85, "total_tokens": 659992} +{"current_steps": 455, "total_steps": 1405, "loss": 0.1579, "lr": 4.2808987115310255e-05, "epoch": 1.6192170818505338, "percentage": 32.38, "elapsed_time": "0:03:50", "remaining_time": "0:08:01", "throughput": 2891.63, "total_tokens": 667224} +{"current_steps": 460, "total_steps": 1405, "loss": 0.1563, "lr": 4.2589578012157426e-05, "epoch": 1.6370106761565837, "percentage": 32.74, "elapsed_time": "0:03:51", "remaining_time": "0:07:54", "throughput": 2920.2, "total_tokens": 675160} +{"current_steps": 465, "total_steps": 1405, "loss": 0.1556, "lr": 4.236745249827336e-05, "epoch": 1.6548042704626336, "percentage": 33.1, "elapsed_time": "0:03:51", "remaining_time": "0:07:48", "throughput": 2950.41, "total_tokens": 683544} +{"current_steps": 470, "total_steps": 1405, "loss": 0.1593, "lr": 4.214264487716033e-05, "epoch": 1.6725978647686834, "percentage": 33.45, "elapsed_time": "0:03:52", "remaining_time": "0:07:41", "throughput": 2970.35, "total_tokens": 689368} +{"current_steps": 475, "total_steps": 1405, "loss": 0.1699, "lr": 4.191518986652642e-05, "epoch": 1.690391459074733, "percentage": 33.81, "elapsed_time": "0:03:52", "remaining_time": "0:07:35", "throughput": 2992.74, "total_tokens": 695832} +{"current_steps": 480, "total_steps": 1405, "loss": 0.1563, "lr": 4.168512259292391e-05, "epoch": 1.708185053380783, "percentage": 34.16, "elapsed_time": "0:03:52", "remaining_time": "0:07:28", "throughput": 3018.34, "total_tokens": 703128} +{"current_steps": 485, "total_steps": 1405, "loss": 0.1507, "lr": 4.1452478586324605e-05, "epoch": 1.7259786476868326, "percentage": 34.52, "elapsed_time": "0:03:53", "remaining_time": "0:07:22", "throughput": 3040.19, "total_tokens": 709528} +{"current_steps": 490, "total_steps": 1405, "loss": 0.1558, "lr": 4.121729377463285e-05, "epoch": 1.7437722419928825, "percentage": 34.88, "elapsed_time": "0:03:53", "remaining_time": "0:07:16", "throughput": 3063.63, "total_tokens": 716312} +{"current_steps": 495, "total_steps": 1405, "loss": 0.1825, "lr": 4.097960447813705e-05, "epoch": 1.7615658362989324, "percentage": 35.23, "elapsed_time": "0:03:54", "remaining_time": "0:07:10", "throughput": 3085.76, "total_tokens": 722776} +{"current_steps": 497, "total_steps": 1405, "eval_loss": 0.16469639539718628, "epoch": 1.7686832740213523, "percentage": 35.37, "elapsed_time": "0:03:54", "remaining_time": "0:07:09", "throughput": 3088.2, "total_tokens": 725656} +{"current_steps": 500, "total_steps": 1405, "loss": 0.1798, "lr": 4.073944740390061e-05, "epoch": 1.7793594306049823, "percentage": 35.59, "elapsed_time": "0:04:17", "remaining_time": "0:07:46", "throughput": 2831.32, "total_tokens": 729944} +{"current_steps": 505, "total_steps": 1405, "loss": 0.1694, "lr": 4.049685964009321e-05, "epoch": 1.7971530249110321, "percentage": 35.94, "elapsed_time": "0:04:18", "remaining_time": "0:07:40", "throughput": 2854.17, "total_tokens": 737112} +{"current_steps": 510, "total_steps": 1405, "loss": 0.1605, "lr": 4.025187865026311e-05, "epoch": 1.814946619217082, "percentage": 36.3, "elapsed_time": "0:04:18", "remaining_time": "0:07:34", "throughput": 2877.43, "total_tokens": 744408} +{"current_steps": 515, "total_steps": 1405, "loss": 0.1574, "lr": 4.000454226755159e-05, "epoch": 1.8327402135231317, "percentage": 36.65, "elapsed_time": "0:04:19", "remaining_time": "0:07:27", "throughput": 2896.23, "total_tokens": 750488} +{"current_steps": 520, "total_steps": 1405, "loss": 0.1703, "lr": 3.975488868885021e-05, "epoch": 1.8505338078291815, "percentage": 37.01, "elapsed_time": "0:04:19", "remaining_time": "0:07:21", "throughput": 2918.39, "total_tokens": 757528} +{"current_steps": 525, "total_steps": 1405, "loss": 0.1545, "lr": 3.9502956468902014e-05, "epoch": 1.8683274021352312, "percentage": 37.37, "elapsed_time": "0:04:19", "remaining_time": "0:07:15", "throughput": 2937.52, "total_tokens": 763736} +{"current_steps": 530, "total_steps": 1405, "loss": 0.1534, "lr": 3.924878451434735e-05, "epoch": 1.886120996441281, "percentage": 37.72, "elapsed_time": "0:04:20", "remaining_time": "0:07:10", "throughput": 2963.41, "total_tokens": 771864} +{"current_steps": 535, "total_steps": 1405, "loss": 0.1537, "lr": 3.899241207771546e-05, "epoch": 1.903914590747331, "percentage": 38.08, "elapsed_time": "0:04:20", "remaining_time": "0:07:04", "throughput": 2984.65, "total_tokens": 778712} +{"current_steps": 540, "total_steps": 1405, "loss": 0.1917, "lr": 3.873387875136252e-05, "epoch": 1.9217081850533808, "percentage": 38.43, "elapsed_time": "0:04:21", "remaining_time": "0:06:58", "throughput": 3001.32, "total_tokens": 784280} +{"current_steps": 545, "total_steps": 1405, "loss": 0.1743, "lr": 3.847322446135736e-05, "epoch": 1.9395017793594307, "percentage": 38.79, "elapsed_time": "0:04:21", "remaining_time": "0:06:53", "throughput": 3026.54, "total_tokens": 792280} +{"current_steps": 550, "total_steps": 1405, "loss": 0.1752, "lr": 3.821048946131549e-05, "epoch": 1.9572953736654806, "percentage": 39.15, "elapsed_time": "0:04:22", "remaining_time": "0:06:47", "throughput": 3045.34, "total_tokens": 798488} +{"current_steps": 555, "total_steps": 1405, "loss": 0.1578, "lr": 3.794571432618267e-05, "epoch": 1.9750889679715302, "percentage": 39.5, "elapsed_time": "0:04:22", "remaining_time": "0:06:42", "throughput": 3069.06, "total_tokens": 806104} +{"current_steps": 560, "total_steps": 1405, "loss": 0.1774, "lr": 3.767893994596876e-05, "epoch": 1.99288256227758, "percentage": 39.86, "elapsed_time": "0:04:23", "remaining_time": "0:06:37", "throughput": 3091.34, "total_tokens": 813336} +{"current_steps": 565, "total_steps": 1405, "loss": 0.1568, "lr": 3.741020751943297e-05, "epoch": 2.0106761565836297, "percentage": 40.21, "elapsed_time": "0:04:23", "remaining_time": "0:06:31", "throughput": 3101.64, "total_tokens": 817576} +{"current_steps": 568, "total_steps": 1405, "eval_loss": 0.15550938248634338, "epoch": 2.02135231316726, "percentage": 40.43, "elapsed_time": "0:04:24", "remaining_time": "0:06:29", "throughput": 3106.17, "total_tokens": 821416} +{"current_steps": 570, "total_steps": 1405, "loss": 0.1565, "lr": 3.713955854772144e-05, "epoch": 2.0284697508896796, "percentage": 40.57, "elapsed_time": "0:05:04", "remaining_time": "0:07:25", "throughput": 2708.15, "total_tokens": 823848} +{"current_steps": 575, "total_steps": 1405, "loss": 0.1536, "lr": 3.686703482795802e-05, "epoch": 2.0462633451957295, "percentage": 40.93, "elapsed_time": "0:05:04", "remaining_time": "0:07:19", "throughput": 2731.49, "total_tokens": 832232} +{"current_steps": 580, "total_steps": 1405, "loss": 0.1624, "lr": 3.6592678446789516e-05, "epoch": 2.0640569395017794, "percentage": 41.28, "elapsed_time": "0:05:05", "remaining_time": "0:07:14", "throughput": 2754.17, "total_tokens": 840424} +{"current_steps": 585, "total_steps": 1405, "loss": 0.1395, "lr": 3.631653177388605e-05, "epoch": 2.0818505338078293, "percentage": 41.64, "elapsed_time": "0:05:05", "remaining_time": "0:07:08", "throughput": 2771.33, "total_tokens": 846824} +{"current_steps": 590, "total_steps": 1405, "loss": 0.196, "lr": 3.60386374553978e-05, "epoch": 2.099644128113879, "percentage": 41.99, "elapsed_time": "0:05:05", "remaining_time": "0:07:02", "throughput": 2789.65, "total_tokens": 853608} +{"current_steps": 595, "total_steps": 1405, "loss": 0.1637, "lr": 3.5759038407369056e-05, "epoch": 2.117437722419929, "percentage": 42.35, "elapsed_time": "0:05:06", "remaining_time": "0:06:57", "throughput": 2809.64, "total_tokens": 860968} +{"current_steps": 600, "total_steps": 1405, "loss": 0.194, "lr": 3.547777780911055e-05, "epoch": 2.135231316725979, "percentage": 42.7, "elapsed_time": "0:05:06", "remaining_time": "0:06:51", "throughput": 2831.31, "total_tokens": 868904} +{"current_steps": 605, "total_steps": 1405, "loss": 0.1592, "lr": 3.519489909653113e-05, "epoch": 2.1530249110320283, "percentage": 43.06, "elapsed_time": "0:05:07", "remaining_time": "0:06:46", "throughput": 2850.58, "total_tokens": 876072} +{"current_steps": 610, "total_steps": 1405, "loss": 0.1549, "lr": 3.4910445955429854e-05, "epoch": 2.170818505338078, "percentage": 43.42, "elapsed_time": "0:05:07", "remaining_time": "0:06:41", "throughput": 2871.33, "total_tokens": 883752} +{"current_steps": 615, "total_steps": 1405, "loss": 0.1533, "lr": 3.4624462314749443e-05, "epoch": 2.188612099644128, "percentage": 43.77, "elapsed_time": "0:05:08", "remaining_time": "0:06:35", "throughput": 2891.64, "total_tokens": 891304} +{"current_steps": 620, "total_steps": 1405, "loss": 0.1483, "lr": 3.433699233979222e-05, "epoch": 2.206405693950178, "percentage": 44.13, "elapsed_time": "0:05:08", "remaining_time": "0:06:30", "throughput": 2912.87, "total_tokens": 899176} +{"current_steps": 625, "total_steps": 1405, "loss": 0.1436, "lr": 3.4048080425399505e-05, "epoch": 2.224199288256228, "percentage": 44.48, "elapsed_time": "0:05:09", "remaining_time": "0:06:25", "throughput": 2935.53, "total_tokens": 907560} +{"current_steps": 630, "total_steps": 1405, "loss": 0.1413, "lr": 3.375777118909561e-05, "epoch": 2.2419928825622777, "percentage": 44.84, "elapsed_time": "0:05:09", "remaining_time": "0:06:20", "throughput": 2956.05, "total_tokens": 915240} +{"current_steps": 635, "total_steps": 1405, "loss": 0.1597, "lr": 3.3466109464197426e-05, "epoch": 2.2597864768683276, "percentage": 45.2, "elapsed_time": "0:05:10", "remaining_time": "0:06:15", "throughput": 2971.91, "total_tokens": 921384} +{"current_steps": 639, "total_steps": 1405, "eval_loss": 0.1567462682723999, "epoch": 2.2740213523131674, "percentage": 45.48, "elapsed_time": "0:05:10", "remaining_time": "0:06:12", "throughput": 2980.22, "total_tokens": 926760} +{"current_steps": 640, "total_steps": 1405, "loss": 0.1653, "lr": 3.317314029289067e-05, "epoch": 2.277580071174377, "percentage": 45.55, "elapsed_time": "0:05:30", "remaining_time": "0:06:35", "throughput": 2804.57, "total_tokens": 927528} +{"current_steps": 645, "total_steps": 1405, "loss": 0.1594, "lr": 3.287890891927386e-05, "epoch": 2.295373665480427, "percentage": 45.91, "elapsed_time": "0:05:31", "remaining_time": "0:06:30", "throughput": 2822.09, "total_tokens": 934568} +{"current_steps": 650, "total_steps": 1405, "loss": 0.1402, "lr": 3.258346078237122e-05, "epoch": 2.3131672597864767, "percentage": 46.26, "elapsed_time": "0:05:31", "remaining_time": "0:06:25", "throughput": 2841.4, "total_tokens": 942248} +{"current_steps": 655, "total_steps": 1405, "loss": 0.2418, "lr": 3.228684150911527e-05, "epoch": 2.3309608540925266, "percentage": 46.62, "elapsed_time": "0:05:32", "remaining_time": "0:06:20", "throughput": 2858.33, "total_tokens": 949096} +{"current_steps": 660, "total_steps": 1405, "loss": 0.1845, "lr": 3.198909690730063e-05, "epoch": 2.3487544483985765, "percentage": 46.98, "elapsed_time": "0:05:32", "remaining_time": "0:06:15", "throughput": 2874.67, "total_tokens": 955752} +{"current_steps": 665, "total_steps": 1405, "loss": 0.1664, "lr": 3.169027295850977e-05, "epoch": 2.3665480427046264, "percentage": 47.33, "elapsed_time": "0:05:32", "remaining_time": "0:06:10", "throughput": 2893.1, "total_tokens": 963176} +{"current_steps": 670, "total_steps": 1405, "loss": 0.1627, "lr": 3.139041581101187e-05, "epoch": 2.3843416370106763, "percentage": 47.69, "elapsed_time": "0:05:33", "remaining_time": "0:06:05", "throughput": 2904.91, "total_tokens": 968232} +{"current_steps": 675, "total_steps": 1405, "loss": 0.1498, "lr": 3.108957177263608e-05, "epoch": 2.402135231316726, "percentage": 48.04, "elapsed_time": "0:05:33", "remaining_time": "0:06:00", "throughput": 2925.75, "total_tokens": 976552} +{"current_steps": 680, "total_steps": 1405, "loss": 0.1656, "lr": 3.078778730362003e-05, "epoch": 2.419928825622776, "percentage": 48.4, "elapsed_time": "0:05:34", "remaining_time": "0:05:56", "throughput": 2943.32, "total_tokens": 983720} +{"current_steps": 685, "total_steps": 1405, "loss": 0.1567, "lr": 3.048510900943484e-05, "epoch": 2.4377224199288254, "percentage": 48.75, "elapsed_time": "0:05:34", "remaining_time": "0:05:51", "throughput": 2963.87, "total_tokens": 991976} +{"current_steps": 690, "total_steps": 1405, "loss": 0.1807, "lr": 3.018158363358773e-05, "epoch": 2.4555160142348753, "percentage": 49.11, "elapsed_time": "0:05:35", "remaining_time": "0:05:47", "throughput": 2978.75, "total_tokens": 998184} +{"current_steps": 695, "total_steps": 1405, "loss": 0.1678, "lr": 2.9877258050403212e-05, "epoch": 2.473309608540925, "percentage": 49.47, "elapsed_time": "0:05:35", "remaining_time": "0:05:42", "throughput": 2997.12, "total_tokens": 1005672} +{"current_steps": 700, "total_steps": 1405, "loss": 0.1531, "lr": 2.9572179257784215e-05, "epoch": 2.491103202846975, "percentage": 49.82, "elapsed_time": "0:05:35", "remaining_time": "0:05:38", "throughput": 3015.24, "total_tokens": 1013096} +{"current_steps": 705, "total_steps": 1405, "loss": 0.1337, "lr": 2.9266394369954052e-05, "epoch": 2.508896797153025, "percentage": 50.18, "elapsed_time": "0:05:36", "remaining_time": "0:05:34", "throughput": 3029.96, "total_tokens": 1019304} +{"current_steps": 710, "total_steps": 1405, "loss": 0.1431, "lr": 2.8959950610180374e-05, "epoch": 2.526690391459075, "percentage": 50.53, "elapsed_time": "0:05:36", "remaining_time": "0:05:29", "throughput": 3044.1, "total_tokens": 1025320} +{"current_steps": 710, "total_steps": 1405, "eval_loss": 0.16391661763191223, "epoch": 2.526690391459075, "percentage": 50.53, "elapsed_time": "0:05:37", "remaining_time": "0:05:30", "throughput": 3038.61, "total_tokens": 1025320} +{"current_steps": 715, "total_steps": 1405, "loss": 0.1675, "lr": 2.865289530348243e-05, "epoch": 2.5444839857651247, "percentage": 50.89, "elapsed_time": "0:05:58", "remaining_time": "0:05:45", "throughput": 2882.35, "total_tokens": 1032552} +{"current_steps": 720, "total_steps": 1405, "loss": 2.4615, "lr": 2.834527586932243e-05, "epoch": 2.562277580071174, "percentage": 51.25, "elapsed_time": "0:05:58", "remaining_time": "0:05:41", "throughput": 2899.29, "total_tokens": 1039912} +{"current_steps": 725, "total_steps": 1405, "loss": 0.1636, "lr": 2.8037139814282493e-05, "epoch": 2.580071174377224, "percentage": 51.6, "elapsed_time": "0:05:59", "remaining_time": "0:05:36", "throughput": 2916.01, "total_tokens": 1047208} +{"current_steps": 730, "total_steps": 1405, "loss": 0.1652, "lr": 2.7728534724728027e-05, "epoch": 2.597864768683274, "percentage": 51.96, "elapsed_time": "0:05:59", "remaining_time": "0:05:32", "throughput": 2931.23, "total_tokens": 1053928} +{"current_steps": 735, "total_steps": 1405, "loss": 0.1482, "lr": 2.741950825945881e-05, "epoch": 2.6156583629893237, "percentage": 52.31, "elapsed_time": "0:06:00", "remaining_time": "0:05:28", "throughput": 2948.87, "total_tokens": 1061608} +{"current_steps": 740, "total_steps": 1405, "loss": 0.1501, "lr": 2.711010814234896e-05, "epoch": 2.6334519572953736, "percentage": 52.67, "elapsed_time": "0:06:00", "remaining_time": "0:05:23", "throughput": 2962.04, "total_tokens": 1067560} +{"current_steps": 745, "total_steps": 1405, "loss": 0.1743, "lr": 2.6800382154976732e-05, "epoch": 2.6512455516014235, "percentage": 53.02, "elapsed_time": "0:06:00", "remaining_time": "0:05:19", "throughput": 2976.8, "total_tokens": 1074152} +{"current_steps": 750, "total_steps": 1405, "loss": 0.1441, "lr": 2.6490378129245498e-05, "epoch": 2.6690391459074734, "percentage": 53.38, "elapsed_time": "0:06:01", "remaining_time": "0:05:15", "throughput": 2996.98, "total_tokens": 1082856} +{"current_steps": 755, "total_steps": 1405, "loss": 0.1495, "lr": 2.6180143939996925e-05, "epoch": 2.6868327402135233, "percentage": 53.74, "elapsed_time": "0:06:01", "remaining_time": "0:05:11", "throughput": 3011.87, "total_tokens": 1089512} +{"current_steps": 760, "total_steps": 1405, "loss": 0.1464, "lr": 2.5869727497617495e-05, "epoch": 2.704626334519573, "percentage": 54.09, "elapsed_time": "0:06:02", "remaining_time": "0:05:07", "throughput": 3026.86, "total_tokens": 1096232} +{"current_steps": 765, "total_steps": 1405, "loss": 0.1572, "lr": 2.55591767406396e-05, "epoch": 2.722419928825623, "percentage": 54.45, "elapsed_time": "0:06:02", "remaining_time": "0:05:03", "throughput": 3044.92, "total_tokens": 1104168} +{"current_steps": 770, "total_steps": 1405, "loss": 0.1326, "lr": 2.5248539628338246e-05, "epoch": 2.7402135231316724, "percentage": 54.8, "elapsed_time": "0:06:03", "remaining_time": "0:04:59", "throughput": 3063.29, "total_tokens": 1112232} +{"current_steps": 775, "total_steps": 1405, "loss": 0.1734, "lr": 2.4937864133324516e-05, "epoch": 2.7580071174377223, "percentage": 55.16, "elapsed_time": "0:06:03", "remaining_time": "0:04:55", "throughput": 3078.33, "total_tokens": 1119016} +{"current_steps": 780, "total_steps": 1405, "loss": 0.1986, "lr": 2.462719823413707e-05, "epoch": 2.775800711743772, "percentage": 55.52, "elapsed_time": "0:06:03", "remaining_time": "0:04:51", "throughput": 3095.58, "total_tokens": 1126696} +{"current_steps": 781, "total_steps": 1405, "eval_loss": 0.15414386987686157, "epoch": 2.7793594306049823, "percentage": 55.59, "elapsed_time": "0:06:04", "remaining_time": "0:04:51", "throughput": 3093.52, "total_tokens": 1128104} +{"current_steps": 785, "total_steps": 1405, "loss": 0.1576, "lr": 2.4316589907832654e-05, "epoch": 2.793594306049822, "percentage": 55.87, "elapsed_time": "0:06:26", "remaining_time": "0:05:05", "throughput": 2934.79, "total_tokens": 1134184} +{"current_steps": 790, "total_steps": 1405, "loss": 0.1392, "lr": 2.4006087122576863e-05, "epoch": 2.811387900355872, "percentage": 56.23, "elapsed_time": "0:06:26", "remaining_time": "0:05:01", "throughput": 2947.66, "total_tokens": 1140392} +{"current_steps": 795, "total_steps": 1405, "loss": 0.2025, "lr": 2.3695737830236266e-05, "epoch": 2.829181494661922, "percentage": 56.58, "elapsed_time": "0:06:27", "remaining_time": "0:04:57", "throughput": 2964.64, "total_tokens": 1148328} +{"current_steps": 800, "total_steps": 1405, "loss": 0.1781, "lr": 2.338558995897307e-05, "epoch": 2.8469750889679717, "percentage": 56.94, "elapsed_time": "0:06:27", "remaining_time": "0:04:53", "throughput": 2976.22, "total_tokens": 1154024} +{"current_steps": 805, "total_steps": 1405, "loss": 0.195, "lr": 2.3075691405843435e-05, "epoch": 2.864768683274021, "percentage": 57.3, "elapsed_time": "0:06:28", "remaining_time": "0:04:49", "throughput": 2990.39, "total_tokens": 1160808} +{"current_steps": 810, "total_steps": 1405, "loss": 0.1597, "lr": 2.2766090029400573e-05, "epoch": 2.882562277580071, "percentage": 57.65, "elapsed_time": "0:06:28", "remaining_time": "0:04:45", "throughput": 3005.29, "total_tokens": 1167912} +{"current_steps": 815, "total_steps": 1405, "loss": 0.1433, "lr": 2.2456833642303822e-05, "epoch": 2.900355871886121, "percentage": 58.01, "elapsed_time": "0:06:29", "remaining_time": "0:04:41", "throughput": 3019.06, "total_tokens": 1174568} +{"current_steps": 820, "total_steps": 1405, "loss": 0.1553, "lr": 2.214797000393479e-05, "epoch": 2.9181494661921707, "percentage": 58.36, "elapsed_time": "0:06:29", "remaining_time": "0:04:37", "throughput": 3033.42, "total_tokens": 1181480} +{"current_steps": 825, "total_steps": 1405, "loss": 0.1614, "lr": 2.183954681302173e-05, "epoch": 2.9359430604982206, "percentage": 58.72, "elapsed_time": "0:06:29", "remaining_time": "0:04:34", "throughput": 3051.42, "total_tokens": 1189928} +{"current_steps": 830, "total_steps": 1405, "loss": 0.1351, "lr": 2.1531611700273297e-05, "epoch": 2.9537366548042705, "percentage": 59.07, "elapsed_time": "0:06:30", "remaining_time": "0:04:30", "throughput": 3067.25, "total_tokens": 1197480} +{"current_steps": 835, "total_steps": 1405, "loss": 0.1845, "lr": 2.1224212221022777e-05, "epoch": 2.9715302491103204, "percentage": 59.43, "elapsed_time": "0:06:30", "remaining_time": "0:04:26", "throughput": 3081.96, "total_tokens": 1204584} +{"current_steps": 840, "total_steps": 1405, "loss": 0.1616, "lr": 2.0917395847883995e-05, "epoch": 2.9893238434163703, "percentage": 59.79, "elapsed_time": "0:06:31", "remaining_time": "0:04:23", "throughput": 3098.78, "total_tokens": 1212584} +{"current_steps": 845, "total_steps": 1405, "loss": 0.1625, "lr": 2.0611209963419958e-05, "epoch": 3.00711743772242, "percentage": 60.14, "elapsed_time": "0:06:31", "remaining_time": "0:04:19", "throughput": 3108.17, "total_tokens": 1217856} +{"current_steps": 850, "total_steps": 1405, "loss": 0.137, "lr": 2.030570185282544e-05, "epoch": 3.0249110320284696, "percentage": 60.5, "elapsed_time": "0:06:32", "remaining_time": "0:04:16", "throughput": 3126.69, "total_tokens": 1226624} +{"current_steps": 852, "total_steps": 1405, "eval_loss": 0.1851627230644226, "epoch": 3.0320284697508897, "percentage": 60.64, "elapsed_time": "0:06:33", "remaining_time": "0:04:15", "throughput": 3127.64, "total_tokens": 1229440} +{"current_steps": 855, "total_steps": 1405, "loss": 0.1453, "lr": 2.0000918696624588e-05, "epoch": 3.0427046263345194, "percentage": 60.85, "elapsed_time": "0:07:14", "remaining_time": "0:04:39", "throughput": 2836.99, "total_tokens": 1233152} +{"current_steps": 860, "total_steps": 1405, "loss": 0.138, "lr": 1.9696907563384687e-05, "epoch": 3.0604982206405693, "percentage": 61.21, "elapsed_time": "0:07:15", "remaining_time": "0:04:35", "throughput": 2850.2, "total_tokens": 1240128} +{"current_steps": 865, "total_steps": 1405, "loss": 0.1148, "lr": 1.939371540244723e-05, "epoch": 3.078291814946619, "percentage": 61.57, "elapsed_time": "0:07:15", "remaining_time": "0:04:31", "throughput": 2865.38, "total_tokens": 1248064} +{"current_steps": 870, "total_steps": 1405, "loss": 0.1106, "lr": 1.9091389036677382e-05, "epoch": 3.096085409252669, "percentage": 61.92, "elapsed_time": "0:07:16", "remaining_time": "0:04:28", "throughput": 2878.95, "total_tokens": 1255232} +{"current_steps": 875, "total_steps": 1405, "loss": 0.1169, "lr": 1.878997515523299e-05, "epoch": 3.113879003558719, "percentage": 62.28, "elapsed_time": "0:07:16", "remaining_time": "0:04:24", "throughput": 2892.21, "total_tokens": 1262272} +{"current_steps": 880, "total_steps": 1405, "loss": 0.1161, "lr": 1.848952030635424e-05, "epoch": 3.131672597864769, "percentage": 62.63, "elapsed_time": "0:07:16", "remaining_time": "0:04:20", "throughput": 2906.11, "total_tokens": 1269632} +{"current_steps": 885, "total_steps": 1405, "loss": 0.123, "lr": 1.819007089017508e-05, "epoch": 3.1494661921708187, "percentage": 62.99, "elapsed_time": "0:07:17", "remaining_time": "0:04:16", "throughput": 2920.66, "total_tokens": 1277312} +{"current_steps": 890, "total_steps": 1405, "loss": 0.1599, "lr": 1.789167315155749e-05, "epoch": 3.167259786476868, "percentage": 63.35, "elapsed_time": "0:07:17", "remaining_time": "0:04:13", "throughput": 2933.3, "total_tokens": 1284096} +{"current_steps": 895, "total_steps": 1405, "loss": 0.1109, "lr": 1.7594373172949784e-05, "epoch": 3.185053380782918, "percentage": 63.7, "elapsed_time": "0:07:18", "remaining_time": "0:04:09", "throughput": 2947.55, "total_tokens": 1291648} +{"current_steps": 900, "total_steps": 1405, "loss": 0.1569, "lr": 1.7298216867269906e-05, "epoch": 3.202846975088968, "percentage": 64.06, "elapsed_time": "0:07:18", "remaining_time": "0:04:06", "throughput": 2962.83, "total_tokens": 1299712} +{"current_steps": 905, "total_steps": 1405, "loss": 0.1082, "lr": 1.7003249970815026e-05, "epoch": 3.2206405693950177, "percentage": 64.41, "elapsed_time": "0:07:19", "remaining_time": "0:04:02", "throughput": 2974.7, "total_tokens": 1306176} +{"current_steps": 910, "total_steps": 1405, "loss": 0.1387, "lr": 1.6709518036198308e-05, "epoch": 3.2384341637010676, "percentage": 64.77, "elapsed_time": "0:07:19", "remaining_time": "0:03:59", "throughput": 2989.65, "total_tokens": 1314112} +{"current_steps": 915, "total_steps": 1405, "loss": 0.1199, "lr": 1.6417066425314087e-05, "epoch": 3.2562277580071175, "percentage": 65.12, "elapsed_time": "0:07:19", "remaining_time": "0:03:55", "throughput": 3002.55, "total_tokens": 1321088} +{"current_steps": 920, "total_steps": 1405, "loss": 0.1422, "lr": 1.612594030233252e-05, "epoch": 3.2740213523131674, "percentage": 65.48, "elapsed_time": "0:07:20", "remaining_time": "0:03:52", "throughput": 3016.33, "total_tokens": 1328512} +{"current_steps": 923, "total_steps": 1405, "eval_loss": 0.16463510692119598, "epoch": 3.284697508896797, "percentage": 65.69, "elapsed_time": "0:07:21", "remaining_time": "0:03:50", "throughput": 3019.68, "total_tokens": 1332544} +{"current_steps": 925, "total_steps": 1405, "loss": 0.0863, "lr": 1.583618462672472e-05, "epoch": 3.2918149466192173, "percentage": 65.84, "elapsed_time": "0:07:41", "remaining_time": "0:03:59", "throughput": 2894.57, "total_tokens": 1336128} +{"current_steps": 930, "total_steps": 1405, "loss": 0.1155, "lr": 1.5547844146319545e-05, "epoch": 3.309608540925267, "percentage": 66.19, "elapsed_time": "0:07:42", "remaining_time": "0:03:55", "throughput": 2907.84, "total_tokens": 1343552} +{"current_steps": 935, "total_steps": 1405, "loss": 0.1691, "lr": 1.5260963390393075e-05, "epoch": 3.3274021352313166, "percentage": 66.55, "elapsed_time": "0:07:42", "remaining_time": "0:03:52", "throughput": 2922.22, "total_tokens": 1351552} +{"current_steps": 940, "total_steps": 1405, "loss": 0.0983, "lr": 1.4975586662791783e-05, "epoch": 3.3451957295373664, "percentage": 66.9, "elapsed_time": "0:07:42", "remaining_time": "0:03:49", "throughput": 2934.01, "total_tokens": 1358272} +{"current_steps": 945, "total_steps": 1405, "loss": 0.137, "lr": 1.4691758035090602e-05, "epoch": 3.3629893238434163, "percentage": 67.26, "elapsed_time": "0:07:43", "remaining_time": "0:03:45", "throughput": 2949.36, "total_tokens": 1366784} +{"current_steps": 950, "total_steps": 1405, "loss": 0.1389, "lr": 1.4409521339786808e-05, "epoch": 3.380782918149466, "percentage": 67.62, "elapsed_time": "0:07:43", "remaining_time": "0:03:42", "throughput": 2960.74, "total_tokens": 1373312} +{"current_steps": 955, "total_steps": 1405, "loss": 0.0916, "lr": 1.41289201635308e-05, "epoch": 3.398576512455516, "percentage": 67.97, "elapsed_time": "0:07:44", "remaining_time": "0:03:38", "throughput": 2973.88, "total_tokens": 1380736} +{"current_steps": 960, "total_steps": 1405, "loss": 0.096, "lr": 1.3849997840394943e-05, "epoch": 3.416370106761566, "percentage": 68.33, "elapsed_time": "0:07:44", "remaining_time": "0:03:35", "throughput": 2987.73, "total_tokens": 1388544} +{"current_steps": 965, "total_steps": 1405, "loss": 0.1252, "lr": 1.3572797445181345e-05, "epoch": 3.434163701067616, "percentage": 68.68, "elapsed_time": "0:07:45", "remaining_time": "0:03:32", "throughput": 3001.21, "total_tokens": 1396160} +{"current_steps": 970, "total_steps": 1405, "loss": 0.0988, "lr": 1.3297361786769652e-05, "epoch": 3.4519572953736652, "percentage": 69.04, "elapsed_time": "0:07:45", "remaining_time": "0:03:28", "throughput": 3015.3, "total_tokens": 1404096} +{"current_steps": 975, "total_steps": 1405, "loss": 0.1135, "lr": 1.3023733401505981e-05, "epoch": 3.469750889679715, "percentage": 69.4, "elapsed_time": "0:07:46", "remaining_time": "0:03:25", "throughput": 3027.31, "total_tokens": 1411008} +{"current_steps": 980, "total_steps": 1405, "loss": 0.155, "lr": 1.2751954546633871e-05, "epoch": 3.487544483985765, "percentage": 69.75, "elapsed_time": "0:07:46", "remaining_time": "0:03:22", "throughput": 3041.19, "total_tokens": 1418880} +{"current_steps": 985, "total_steps": 1405, "loss": 0.1302, "lr": 1.2482067193768417e-05, "epoch": 3.505338078291815, "percentage": 70.11, "elapsed_time": "0:07:46", "remaining_time": "0:03:19", "throughput": 3053.67, "total_tokens": 1426048} +{"current_steps": 990, "total_steps": 1405, "loss": 0.0911, "lr": 1.2214113022414448e-05, "epoch": 3.5231316725978647, "percentage": 70.46, "elapsed_time": "0:07:47", "remaining_time": "0:03:15", "throughput": 3063.86, "total_tokens": 1432064} +{"current_steps": 994, "total_steps": 1405, "eval_loss": 0.1803617924451828, "epoch": 3.5373665480427046, "percentage": 70.75, "elapsed_time": "0:07:48", "remaining_time": "0:03:13", "throughput": 3070.92, "total_tokens": 1438336} +{"current_steps": 995, "total_steps": 1405, "loss": 0.1165, "lr": 1.1948133413529817e-05, "epoch": 3.5409252669039146, "percentage": 70.82, "elapsed_time": "0:08:11", "remaining_time": "0:03:22", "throughput": 2931.84, "total_tokens": 1439808} +{"current_steps": 1000, "total_steps": 1405, "loss": 0.156, "lr": 1.168416944313486e-05, "epoch": 3.5587188612099645, "percentage": 71.17, "elapsed_time": "0:08:11", "remaining_time": "0:03:19", "throughput": 2944.97, "total_tokens": 1447616} +{"current_steps": 1005, "total_steps": 1405, "loss": 0.0978, "lr": 1.1422261875968845e-05, "epoch": 3.5765124555160144, "percentage": 71.53, "elapsed_time": "0:08:11", "remaining_time": "0:03:15", "throughput": 2955.75, "total_tokens": 1454208} +{"current_steps": 1010, "total_steps": 1405, "loss": 0.0784, "lr": 1.1162451159194614e-05, "epoch": 3.5943060498220643, "percentage": 71.89, "elapsed_time": "0:08:12", "remaining_time": "0:03:12", "throughput": 2971.24, "total_tokens": 1463296} +{"current_steps": 1015, "total_steps": 1405, "loss": 0.1698, "lr": 1.0904777416152166e-05, "epoch": 3.612099644128114, "percentage": 72.24, "elapsed_time": "0:08:12", "remaining_time": "0:03:09", "throughput": 2982.14, "total_tokens": 1469952} +{"current_steps": 1020, "total_steps": 1405, "loss": 0.1033, "lr": 1.0649280440162326e-05, "epoch": 3.6298932384341636, "percentage": 72.6, "elapsed_time": "0:08:13", "remaining_time": "0:03:06", "throughput": 2994.1, "total_tokens": 1477184} +{"current_steps": 1025, "total_steps": 1405, "loss": 0.1025, "lr": 1.0395999688381314e-05, "epoch": 3.6476868327402134, "percentage": 72.95, "elapsed_time": "0:08:13", "remaining_time": "0:03:03", "throughput": 3005.56, "total_tokens": 1484160} +{"current_steps": 1030, "total_steps": 1405, "loss": 0.0885, "lr": 1.0144974275707241e-05, "epoch": 3.6654804270462633, "percentage": 73.31, "elapsed_time": "0:08:14", "remaining_time": "0:02:59", "throughput": 3017.14, "total_tokens": 1491200} +{"current_steps": 1035, "total_steps": 1405, "loss": 0.1678, "lr": 9.896242968739539e-06, "epoch": 3.683274021352313, "percentage": 73.67, "elapsed_time": "0:08:14", "remaining_time": "0:02:56", "throughput": 3028.92, "total_tokens": 1498368} +{"current_steps": 1040, "total_steps": 1405, "loss": 0.1068, "lr": 9.649844179792081e-06, "epoch": 3.701067615658363, "percentage": 74.02, "elapsed_time": "0:08:15", "remaining_time": "0:02:53", "throughput": 3041.56, "total_tokens": 1505984} +{"current_steps": 1045, "total_steps": 1405, "loss": 0.0978, "lr": 9.405815960961054e-06, "epoch": 3.718861209964413, "percentage": 74.38, "elapsed_time": "0:08:15", "remaining_time": "0:02:50", "throughput": 3050.57, "total_tokens": 1511680} +{"current_steps": 1050, "total_steps": 1405, "loss": 0.0966, "lr": 9.16419599824847e-06, "epoch": 3.7366548042704624, "percentage": 74.73, "elapsed_time": "0:08:15", "remaining_time": "0:02:47", "throughput": 3060.55, "total_tokens": 1517888} +{"current_steps": 1055, "total_steps": 1405, "loss": 0.1815, "lr": 8.925021605742211e-06, "epoch": 3.7544483985765122, "percentage": 75.09, "elapsed_time": "0:08:16", "remaining_time": "0:02:44", "throughput": 3073.22, "total_tokens": 1525568} +{"current_steps": 1060, "total_steps": 1405, "loss": 0.1028, "lr": 8.68832971985347e-06, "epoch": 3.772241992882562, "percentage": 75.44, "elapsed_time": "0:08:16", "remaining_time": "0:02:41", "throughput": 3084.46, "total_tokens": 1532480} +{"current_steps": 1065, "total_steps": 1405, "loss": 0.1203, "lr": 8.454156893612591e-06, "epoch": 3.790035587188612, "percentage": 75.8, "elapsed_time": "0:08:17", "remaining_time": "0:02:38", "throughput": 3095.02, "total_tokens": 1539072} +{"current_steps": 1065, "total_steps": 1405, "eval_loss": 0.17713916301727295, "epoch": 3.790035587188612, "percentage": 75.8, "elapsed_time": "0:08:17", "remaining_time": "0:02:38", "throughput": 3091.12, "total_tokens": 1539072} +{"current_steps": 1070, "total_steps": 1405, "loss": 0.1178, "lr": 8.222539291024078e-06, "epoch": 3.807829181494662, "percentage": 76.16, "elapsed_time": "0:09:00", "remaining_time": "0:02:49", "throughput": 2862.34, "total_tokens": 1547584} +{"current_steps": 1075, "total_steps": 1405, "loss": 0.0999, "lr": 7.993512681481639e-06, "epoch": 3.8256227758007118, "percentage": 76.51, "elapsed_time": "0:09:01", "remaining_time": "0:02:46", "throughput": 2872.5, "total_tokens": 1554304} +{"current_steps": 1080, "total_steps": 1405, "loss": 0.145, "lr": 7.767112434244253e-06, "epoch": 3.8434163701067616, "percentage": 76.87, "elapsed_time": "0:09:01", "remaining_time": "0:02:42", "throughput": 2882.43, "total_tokens": 1560896} +{"current_steps": 1085, "total_steps": 1405, "loss": 0.0627, "lr": 7.543373512973947e-06, "epoch": 3.8612099644128115, "percentage": 77.22, "elapsed_time": "0:09:01", "remaining_time": "0:02:39", "throughput": 2892.76, "total_tokens": 1567744} +{"current_steps": 1090, "total_steps": 1405, "loss": 0.1558, "lr": 7.3223304703363135e-06, "epoch": 3.8790035587188614, "percentage": 77.58, "elapsed_time": "0:09:02", "remaining_time": "0:02:36", "throughput": 2902.76, "total_tokens": 1574400} +{"current_steps": 1095, "total_steps": 1405, "loss": 0.0965, "lr": 7.104017442664393e-06, "epoch": 3.8967971530249113, "percentage": 77.94, "elapsed_time": "0:09:02", "remaining_time": "0:02:33", "throughput": 2913.52, "total_tokens": 1581504} +{"current_steps": 1100, "total_steps": 1405, "loss": 0.0914, "lr": 6.8884681446869105e-06, "epoch": 3.914590747330961, "percentage": 78.29, "elapsed_time": "0:09:03", "remaining_time": "0:02:30", "throughput": 2925.79, "total_tokens": 1589504} +{"current_steps": 1105, "total_steps": 1405, "loss": 0.124, "lr": 6.67571586432163e-06, "epoch": 3.9323843416370106, "percentage": 78.65, "elapsed_time": "0:09:03", "remaining_time": "0:02:27", "throughput": 2938.36, "total_tokens": 1597696} +{"current_steps": 1110, "total_steps": 1405, "loss": 0.1388, "lr": 6.465793457534553e-06, "epoch": 3.9501779359430604, "percentage": 79.0, "elapsed_time": "0:09:04", "remaining_time": "0:02:24", "throughput": 2949.82, "total_tokens": 1605248} +{"current_steps": 1115, "total_steps": 1405, "loss": 0.1646, "lr": 6.258733343265932e-06, "epoch": 3.9679715302491103, "percentage": 79.36, "elapsed_time": "0:09:04", "remaining_time": "0:02:21", "throughput": 2963.21, "total_tokens": 1613952} +{"current_steps": 1120, "total_steps": 1405, "loss": 0.1024, "lr": 6.0545674984236826e-06, "epoch": 3.98576512455516, "percentage": 79.72, "elapsed_time": "0:09:05", "remaining_time": "0:02:18", "throughput": 2972.46, "total_tokens": 1620224} +{"current_steps": 1125, "total_steps": 1405, "loss": 0.0889, "lr": 5.853327452945115e-06, "epoch": 4.00355871886121, "percentage": 80.07, "elapsed_time": "0:09:05", "remaining_time": "0:02:15", "throughput": 2979.71, "total_tokens": 1625800} +{"current_steps": 1130, "total_steps": 1405, "loss": 0.0747, "lr": 5.655044284927657e-06, "epoch": 4.0213523131672595, "percentage": 80.43, "elapsed_time": "0:09:06", "remaining_time": "0:02:12", "throughput": 2991.1, "total_tokens": 1633352} +{"current_steps": 1135, "total_steps": 1405, "loss": 0.0551, "lr": 5.459748615829355e-06, "epoch": 4.039145907473309, "percentage": 80.78, "elapsed_time": "0:09:06", "remaining_time": "0:02:10", "throughput": 3002.35, "total_tokens": 1640840} +{"current_steps": 1136, "total_steps": 1405, "eval_loss": 0.19830213487148285, "epoch": 4.04270462633452, "percentage": 80.85, "elapsed_time": "0:09:07", "remaining_time": "0:02:09", "throughput": 3001.97, "total_tokens": 1642696} +{"current_steps": 1140, "total_steps": 1405, "loss": 0.0395, "lr": 5.267470605739952e-06, "epoch": 4.056939501779359, "percentage": 81.14, "elapsed_time": "0:09:29", "remaining_time": "0:02:12", "throughput": 2896.93, "total_tokens": 1648520} +{"current_steps": 1145, "total_steps": 1405, "loss": 0.0215, "lr": 5.078239948723154e-06, "epoch": 4.074733096085409, "percentage": 81.49, "elapsed_time": "0:09:29", "remaining_time": "0:02:09", "throughput": 2907.39, "total_tokens": 1655752} +{"current_steps": 1150, "total_steps": 1405, "loss": 0.0073, "lr": 4.892085868230881e-06, "epoch": 4.092526690391459, "percentage": 81.85, "elapsed_time": "0:09:29", "remaining_time": "0:02:06", "throughput": 2917.72, "total_tokens": 1662920} +{"current_steps": 1155, "total_steps": 1405, "loss": 0.0348, "lr": 4.709037112590217e-06, "epoch": 4.110320284697509, "percentage": 82.21, "elapsed_time": "0:09:30", "remaining_time": "0:02:03", "throughput": 2927.74, "total_tokens": 1669896} +{"current_steps": 1160, "total_steps": 1405, "loss": 0.076, "lr": 4.529121950563716e-06, "epoch": 4.128113879003559, "percentage": 82.56, "elapsed_time": "0:09:30", "remaining_time": "0:02:00", "throughput": 2935.35, "total_tokens": 1675400} +{"current_steps": 1165, "total_steps": 1405, "loss": 0.0705, "lr": 4.352368166983753e-06, "epoch": 4.145907473309609, "percentage": 82.92, "elapsed_time": "0:09:31", "remaining_time": "0:01:57", "throughput": 2946.25, "total_tokens": 1682952} +{"current_steps": 1170, "total_steps": 1405, "loss": 0.088, "lr": 4.178803058461664e-06, "epoch": 4.1637010676156585, "percentage": 83.27, "elapsed_time": "0:09:31", "remaining_time": "0:01:54", "throughput": 2956.74, "total_tokens": 1690248} +{"current_steps": 1175, "total_steps": 1405, "loss": 0.05, "lr": 4.0084534291722376e-06, "epoch": 4.181494661921708, "percentage": 83.63, "elapsed_time": "0:09:32", "remaining_time": "0:01:51", "throughput": 2966.09, "total_tokens": 1696840} +{"current_steps": 1180, "total_steps": 1405, "loss": 0.0689, "lr": 3.841345586714251e-06, "epoch": 4.199288256227758, "percentage": 83.99, "elapsed_time": "0:09:32", "remaining_time": "0:01:49", "throughput": 2975.66, "total_tokens": 1703624} +{"current_steps": 1185, "total_steps": 1405, "loss": 0.0218, "lr": 3.677505338047729e-06, "epoch": 4.217081850533808, "percentage": 84.34, "elapsed_time": "0:09:32", "remaining_time": "0:01:46", "throughput": 2984.64, "total_tokens": 1710024} +{"current_steps": 1190, "total_steps": 1405, "loss": 0.068, "lr": 3.516957985508476e-06, "epoch": 4.234875444839858, "percentage": 84.7, "elapsed_time": "0:09:33", "remaining_time": "0:01:43", "throughput": 2995.78, "total_tokens": 1717768} +{"current_steps": 1195, "total_steps": 1405, "loss": 0.021, "lr": 3.3597283229005877e-06, "epoch": 4.252669039145908, "percentage": 85.05, "elapsed_time": "0:09:33", "remaining_time": "0:01:40", "throughput": 3009.68, "total_tokens": 1727240} +{"current_steps": 1200, "total_steps": 1405, "loss": 0.0422, "lr": 3.205840631667456e-06, "epoch": 4.270462633451958, "percentage": 85.41, "elapsed_time": "0:09:34", "remaining_time": "0:01:38", "throughput": 3019.85, "total_tokens": 1734408} +{"current_steps": 1205, "total_steps": 1405, "loss": 0.0577, "lr": 3.0553186771419162e-06, "epoch": 4.288256227758007, "percentage": 85.77, "elapsed_time": "0:09:34", "remaining_time": "0:01:35", "throughput": 3029.0, "total_tokens": 1740936} +{"current_steps": 1207, "total_steps": 1405, "eval_loss": 0.3402128219604492, "epoch": 4.295373665480427, "percentage": 85.91, "elapsed_time": "0:09:35", "remaining_time": "0:01:34", "throughput": 3029.68, "total_tokens": 1743624} +{"current_steps": 1210, "total_steps": 1405, "loss": 0.0397, "lr": 2.908185704876101e-06, "epoch": 4.306049822064057, "percentage": 86.12, "elapsed_time": "0:09:56", "remaining_time": "0:01:36", "throughput": 2931.73, "total_tokens": 1747784} +{"current_steps": 1215, "total_steps": 1405, "loss": 0.0636, "lr": 2.7644644370515365e-06, "epoch": 4.3238434163701065, "percentage": 86.48, "elapsed_time": "0:09:56", "remaining_time": "0:01:33", "throughput": 2941.47, "total_tokens": 1754888} +{"current_steps": 1220, "total_steps": 1405, "loss": 0.0083, "lr": 2.624177068970124e-06, "epoch": 4.341637010676156, "percentage": 86.83, "elapsed_time": "0:09:57", "remaining_time": "0:01:30", "throughput": 2952.18, "total_tokens": 1762632} +{"current_steps": 1225, "total_steps": 1405, "loss": 0.0331, "lr": 2.4873452656264313e-06, "epoch": 4.359430604982206, "percentage": 87.19, "elapsed_time": "0:09:57", "remaining_time": "0:01:27", "throughput": 2962.18, "total_tokens": 1769928} +{"current_steps": 1230, "total_steps": 1405, "loss": 0.0824, "lr": 2.3539901583619185e-06, "epoch": 4.377224199288256, "percentage": 87.54, "elapsed_time": "0:09:57", "remaining_time": "0:01:25", "throughput": 2972.55, "total_tokens": 1777480} +{"current_steps": 1235, "total_steps": 1405, "loss": 0.0384, "lr": 2.2241323416015453e-06, "epoch": 4.395017793594306, "percentage": 87.9, "elapsed_time": "0:09:58", "remaining_time": "0:01:22", "throughput": 2982.63, "total_tokens": 1784840} +{"current_steps": 1240, "total_steps": 1405, "loss": 0.0435, "lr": 2.09779186967331e-06, "epoch": 4.412811387900356, "percentage": 88.26, "elapsed_time": "0:09:58", "remaining_time": "0:01:19", "throughput": 2993.3, "total_tokens": 1792584} +{"current_steps": 1245, "total_steps": 1405, "loss": 0.0525, "lr": 1.9749882537112296e-06, "epoch": 4.430604982206406, "percentage": 88.61, "elapsed_time": "0:09:59", "remaining_time": "0:01:17", "throughput": 3004.91, "total_tokens": 1800968} +{"current_steps": 1250, "total_steps": 1405, "loss": 0.0777, "lr": 1.8557404586421413e-06, "epoch": 4.448398576512456, "percentage": 88.97, "elapsed_time": "0:09:59", "remaining_time": "0:01:14", "throughput": 3015.14, "total_tokens": 1808456} +{"current_steps": 1255, "total_steps": 1405, "loss": 0.1469, "lr": 1.7400669002569232e-06, "epoch": 4.4661921708185055, "percentage": 89.32, "elapsed_time": "0:10:00", "remaining_time": "0:01:11", "throughput": 3025.63, "total_tokens": 1816136} +{"current_steps": 1260, "total_steps": 1405, "loss": 0.0696, "lr": 1.6279854423664697e-06, "epoch": 4.483985765124555, "percentage": 89.68, "elapsed_time": "0:10:00", "remaining_time": "0:01:09", "throughput": 3036.57, "total_tokens": 1824136} +{"current_steps": 1265, "total_steps": 1405, "loss": 0.0084, "lr": 1.5195133940429345e-06, "epoch": 4.501779359430605, "percentage": 90.04, "elapsed_time": "0:10:01", "remaining_time": "0:01:06", "throughput": 3046.26, "total_tokens": 1831304} +{"current_steps": 1270, "total_steps": 1405, "loss": 0.0259, "lr": 1.4146675069466403e-06, "epoch": 4.519572953736655, "percentage": 90.39, "elapsed_time": "0:10:01", "remaining_time": "0:01:03", "throughput": 3054.47, "total_tokens": 1837512} +{"current_steps": 1275, "total_steps": 1405, "loss": 0.0319, "lr": 1.313463972739068e-06, "epoch": 4.537366548042705, "percentage": 90.75, "elapsed_time": "0:10:02", "remaining_time": "0:01:01", "throughput": 3063.57, "total_tokens": 1844296} +{"current_steps": 1278, "total_steps": 1405, "eval_loss": 0.3532261848449707, "epoch": 4.548042704626335, "percentage": 90.96, "elapsed_time": "0:10:02", "remaining_time": "0:00:59", "throughput": 3067.44, "total_tokens": 1849416} +{"current_steps": 1280, "total_steps": 1405, "loss": 0.0338, "lr": 1.2159184205823432e-06, "epoch": 4.555160142348754, "percentage": 91.1, "elapsed_time": "0:10:42", "remaining_time": "0:01:02", "throughput": 2880.92, "total_tokens": 1851720} +{"current_steps": 1285, "total_steps": 1405, "loss": 0.0457, "lr": 1.122045914725564e-06, "epoch": 4.572953736654805, "percentage": 91.46, "elapsed_time": "0:10:43", "remaining_time": "0:01:00", "throughput": 2888.98, "total_tokens": 1858120} +{"current_steps": 1290, "total_steps": 1405, "loss": 0.0645, "lr": 1.0318609521783818e-06, "epoch": 4.590747330960854, "percentage": 91.81, "elapsed_time": "0:10:43", "remaining_time": "0:00:57", "throughput": 2899.08, "total_tokens": 1865928} +{"current_steps": 1295, "total_steps": 1405, "loss": 0.0261, "lr": 9.453774604721938e-07, "epoch": 4.608540925266904, "percentage": 92.17, "elapsed_time": "0:10:44", "remaining_time": "0:00:54", "throughput": 2909.25, "total_tokens": 1873800} +{"current_steps": 1300, "total_steps": 1405, "loss": 0.054, "lr": 8.62608795509276e-07, "epoch": 4.6263345195729535, "percentage": 92.53, "elapsed_time": "0:10:44", "remaining_time": "0:00:52", "throughput": 2919.58, "total_tokens": 1881800} +{"current_steps": 1305, "total_steps": 1405, "loss": 0.0036, "lr": 7.835677395001795e-07, "epoch": 4.644128113879003, "percentage": 92.88, "elapsed_time": "0:10:44", "remaining_time": "0:00:49", "throughput": 2928.25, "total_tokens": 1888648} +{"current_steps": 1310, "total_steps": 1405, "loss": 0.1115, "lr": 7.082664989897487e-07, "epoch": 4.661921708185053, "percentage": 93.24, "elapsed_time": "0:10:45", "remaining_time": "0:00:46", "throughput": 2936.79, "total_tokens": 1895432} +{"current_steps": 1315, "total_steps": 1405, "loss": 0.0608, "lr": 6.367167029720234e-07, "epoch": 4.679715302491103, "percentage": 93.59, "elapsed_time": "0:10:45", "remaining_time": "0:00:44", "throughput": 2945.61, "total_tokens": 1902408} +{"current_steps": 1320, "total_steps": 1405, "loss": 0.0289, "lr": 5.68929401094323e-07, "epoch": 4.697508896797153, "percentage": 93.95, "elapsed_time": "0:10:46", "remaining_time": "0:00:41", "throughput": 2955.8, "total_tokens": 1910344} +{"current_steps": 1325, "total_steps": 1405, "loss": 0.0309, "lr": 5.049150619508502e-07, "epoch": 4.715302491103203, "percentage": 94.31, "elapsed_time": "0:10:46", "remaining_time": "0:00:39", "throughput": 2966.24, "total_tokens": 1918472} +{"current_steps": 1330, "total_steps": 1405, "loss": 0.0078, "lr": 4.4468357146596475e-07, "epoch": 4.733096085409253, "percentage": 94.66, "elapsed_time": "0:10:47", "remaining_time": "0:00:36", "throughput": 2974.0, "total_tokens": 1924744} +{"current_steps": 1335, "total_steps": 1405, "loss": 0.0676, "lr": 3.8824423136748777e-07, "epoch": 4.750889679715303, "percentage": 95.02, "elapsed_time": "0:10:47", "remaining_time": "0:00:33", "throughput": 2984.42, "total_tokens": 1932872} +{"current_steps": 1340, "total_steps": 1405, "loss": 0.0673, "lr": 3.3560575775019864e-07, "epoch": 4.7686832740213525, "percentage": 95.37, "elapsed_time": "0:10:48", "remaining_time": "0:00:31", "throughput": 2993.47, "total_tokens": 1940040} +{"current_steps": 1345, "total_steps": 1405, "loss": 0.0846, "lr": 2.8677627972978906e-07, "epoch": 4.786476868327402, "percentage": 95.73, "elapsed_time": "0:10:48", "remaining_time": "0:00:28", "throughput": 3004.92, "total_tokens": 1948936} +{"current_steps": 1349, "total_steps": 1405, "eval_loss": 0.34229812026023865, "epoch": 4.800711743772242, "percentage": 96.01, "elapsed_time": "0:10:49", "remaining_time": "0:00:26", "throughput": 3009.22, "total_tokens": 1954568} +{"current_steps": 1350, "total_steps": 1405, "loss": 0.001, "lr": 2.417633381874534e-07, "epoch": 4.804270462633452, "percentage": 96.09, "elapsed_time": "0:11:10", "remaining_time": "0:00:27", "throughput": 2916.75, "total_tokens": 1955912} +{"current_steps": 1355, "total_steps": 1405, "loss": 0.0243, "lr": 2.0057388460533732e-07, "epoch": 4.822064056939502, "percentage": 96.44, "elapsed_time": "0:11:11", "remaining_time": "0:00:24", "throughput": 2925.09, "total_tokens": 1962760} +{"current_steps": 1360, "total_steps": 1405, "loss": 0.0594, "lr": 1.6321427999298755e-07, "epoch": 4.839857651245552, "percentage": 96.8, "elapsed_time": "0:11:11", "remaining_time": "0:00:22", "throughput": 2932.8, "total_tokens": 1969160} +{"current_steps": 1365, "total_steps": 1405, "loss": 0.0329, "lr": 1.2969029390501597e-07, "epoch": 4.857651245551601, "percentage": 97.15, "elapsed_time": "0:11:11", "remaining_time": "0:00:19", "throughput": 2940.76, "total_tokens": 1975752} +{"current_steps": 1370, "total_steps": 1405, "loss": 0.0349, "lr": 1.0000710355008159e-07, "epoch": 4.875444839857651, "percentage": 97.51, "elapsed_time": "0:11:12", "remaining_time": "0:00:17", "throughput": 2949.94, "total_tokens": 1983240} +{"current_steps": 1375, "total_steps": 1405, "loss": 0.004, "lr": 7.416929299135511e-08, "epoch": 4.893238434163701, "percentage": 97.86, "elapsed_time": "0:11:12", "remaining_time": "0:00:14", "throughput": 2959.19, "total_tokens": 1990792} +{"current_steps": 1380, "total_steps": 1405, "loss": 0.028, "lr": 5.218085243859638e-08, "epoch": 4.911032028469751, "percentage": 98.22, "elapsed_time": "0:11:13", "remaining_time": "0:00:12", "throughput": 2968.97, "total_tokens": 1998728} +{"current_steps": 1385, "total_steps": 1405, "loss": 0.046, "lr": 3.4045177631936155e-08, "epoch": 4.9288256227758005, "percentage": 98.58, "elapsed_time": "0:11:13", "remaining_time": "0:00:09", "throughput": 2979.09, "total_tokens": 2006920} +{"current_steps": 1390, "total_steps": 1405, "loss": 0.0136, "lr": 1.976506931745392e-08, "epoch": 4.94661921708185, "percentage": 98.93, "elapsed_time": "0:11:14", "remaining_time": "0:00:07", "throughput": 2986.46, "total_tokens": 2013128} +{"current_steps": 1395, "total_steps": 1405, "loss": 0.0718, "lr": 9.3427328146517e-09, "epoch": 4.9644128113879, "percentage": 99.29, "elapsed_time": "0:11:14", "remaining_time": "0:00:04", "throughput": 2997.08, "total_tokens": 2021704} +{"current_steps": 1400, "total_steps": 1405, "loss": 0.1224, "lr": 2.779777675890327e-09, "epoch": 4.98220640569395, "percentage": 99.64, "elapsed_time": "0:11:14", "remaining_time": "0:00:02", "throughput": 3005.77, "total_tokens": 2028872} +{"current_steps": 1405, "total_steps": 1405, "loss": 0.0499, "lr": 7.72174378022017e-11, "epoch": 5.0, "percentage": 100.0, "elapsed_time": "0:11:15", "remaining_time": "0:00:00", "throughput": 3013.01, "total_tokens": 2035272} +{"current_steps": 1405, "total_steps": 1405, "epoch": 5.0, "percentage": 100.0, "elapsed_time": "0:11:38", "remaining_time": "0:00:00", "throughput": 2915.24, "total_tokens": 2035272} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..b396092 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,2463 @@ +{ + "best_global_step": 71, + "best_metric": 0.14072927832603455, + "best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_rte_42_1774791065/checkpoint-71", + "epoch": 5.0, + "eval_steps": 71, + "global_step": 1405, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.017793594306049824, + "grad_norm": 100.99552917480469, + "learning_rate": 1.4184397163120568e-06, + "loss": 0.6704, + "num_input_tokens_seen": 7872, + "step": 5 + }, + { + "epoch": 0.03558718861209965, + "grad_norm": 94.50279235839844, + "learning_rate": 3.1914893617021277e-06, + "loss": 0.2548, + "num_input_tokens_seen": 14784, + "step": 10 + }, + { + "epoch": 0.05338078291814947, + "grad_norm": 112.95467376708984, + "learning_rate": 4.964539007092199e-06, + "loss": 0.9227, + "num_input_tokens_seen": 23424, + "step": 15 + }, + { + "epoch": 0.0711743772241993, + "grad_norm": 4.330768585205078, + "learning_rate": 6.73758865248227e-06, + "loss": 0.1819, + "num_input_tokens_seen": 29824, + "step": 20 + }, + { + "epoch": 0.08896797153024912, + "grad_norm": 5.050032138824463, + "learning_rate": 8.510638297872341e-06, + "loss": 0.228, + "num_input_tokens_seen": 37824, + "step": 25 + }, + { + "epoch": 0.10676156583629894, + "grad_norm": 12.993303298950195, + "learning_rate": 1.0283687943262411e-05, + "loss": 0.1572, + "num_input_tokens_seen": 44608, + "step": 30 + }, + { + "epoch": 0.12455516014234876, + "grad_norm": 17.3212947845459, + "learning_rate": 1.2056737588652483e-05, + "loss": 0.1609, + "num_input_tokens_seen": 51968, + "step": 35 + }, + { + "epoch": 0.1423487544483986, + "grad_norm": 15.35787296295166, + "learning_rate": 1.3829787234042554e-05, + "loss": 0.2143, + "num_input_tokens_seen": 59456, + "step": 40 + }, + { + "epoch": 0.1601423487544484, + "grad_norm": 31.886463165283203, + "learning_rate": 1.5602836879432626e-05, + "loss": 0.2034, + "num_input_tokens_seen": 66496, + "step": 45 + }, + { + "epoch": 0.17793594306049823, + "grad_norm": 18.710391998291016, + "learning_rate": 1.7375886524822697e-05, + "loss": 0.2702, + "num_input_tokens_seen": 73408, + "step": 50 + }, + { + "epoch": 0.19572953736654805, + "grad_norm": 23.454370498657227, + "learning_rate": 1.9148936170212766e-05, + "loss": 0.1793, + "num_input_tokens_seen": 80576, + "step": 55 + }, + { + "epoch": 0.21352313167259787, + "grad_norm": 2.1824188232421875, + "learning_rate": 2.0921985815602837e-05, + "loss": 0.161, + "num_input_tokens_seen": 88256, + "step": 60 + }, + { + "epoch": 0.2313167259786477, + "grad_norm": 4.793557167053223, + "learning_rate": 2.269503546099291e-05, + "loss": 0.1808, + "num_input_tokens_seen": 96256, + "step": 65 + }, + { + "epoch": 0.2491103202846975, + "grad_norm": 27.524141311645508, + "learning_rate": 2.446808510638298e-05, + "loss": 0.2508, + "num_input_tokens_seen": 103424, + "step": 70 + }, + { + "epoch": 0.2526690391459075, + "eval_loss": 0.14072927832603455, + "eval_runtime": 0.6083, + "eval_samples_per_second": 409.332, + "eval_steps_per_second": 52.605, + "num_input_tokens_seen": 105024, + "step": 71 + }, + { + "epoch": 0.2669039145907473, + "grad_norm": 7.07442045211792, + "learning_rate": 2.624113475177305e-05, + "loss": 0.143, + "num_input_tokens_seen": 110528, + "step": 75 + }, + { + "epoch": 0.2846975088967972, + "grad_norm": 16.91026496887207, + "learning_rate": 2.8014184397163124e-05, + "loss": 0.2326, + "num_input_tokens_seen": 117440, + "step": 80 + }, + { + "epoch": 0.302491103202847, + "grad_norm": 15.627188682556152, + "learning_rate": 2.9787234042553192e-05, + "loss": 0.2053, + "num_input_tokens_seen": 125504, + "step": 85 + }, + { + "epoch": 0.3202846975088968, + "grad_norm": 24.881179809570312, + "learning_rate": 3.156028368794326e-05, + "loss": 0.2409, + "num_input_tokens_seen": 132352, + "step": 90 + }, + { + "epoch": 0.33807829181494664, + "grad_norm": 19.5406494140625, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.2063, + "num_input_tokens_seen": 139200, + "step": 95 + }, + { + "epoch": 0.35587188612099646, + "grad_norm": 23.64601707458496, + "learning_rate": 3.5106382978723407e-05, + "loss": 0.244, + "num_input_tokens_seen": 147904, + "step": 100 + }, + { + "epoch": 0.3736654804270463, + "grad_norm": 13.374123573303223, + "learning_rate": 3.687943262411347e-05, + "loss": 0.183, + "num_input_tokens_seen": 154240, + "step": 105 + }, + { + "epoch": 0.3914590747330961, + "grad_norm": 7.41645622253418, + "learning_rate": 3.865248226950355e-05, + "loss": 0.1615, + "num_input_tokens_seen": 161472, + "step": 110 + }, + { + "epoch": 0.4092526690391459, + "grad_norm": 11.39084243774414, + "learning_rate": 4.0425531914893614e-05, + "loss": 0.1703, + "num_input_tokens_seen": 168192, + "step": 115 + }, + { + "epoch": 0.42704626334519574, + "grad_norm": 11.221735000610352, + "learning_rate": 4.219858156028369e-05, + "loss": 0.246, + "num_input_tokens_seen": 174656, + "step": 120 + }, + { + "epoch": 0.44483985765124556, + "grad_norm": 10.728532791137695, + "learning_rate": 4.3971631205673764e-05, + "loss": 0.1665, + "num_input_tokens_seen": 181632, + "step": 125 + }, + { + "epoch": 0.4626334519572954, + "grad_norm": 14.208120346069336, + "learning_rate": 4.574468085106383e-05, + "loss": 0.1695, + "num_input_tokens_seen": 191488, + "step": 130 + }, + { + "epoch": 0.4804270462633452, + "grad_norm": 1.208547830581665, + "learning_rate": 4.751773049645391e-05, + "loss": 0.1764, + "num_input_tokens_seen": 198848, + "step": 135 + }, + { + "epoch": 0.498220640569395, + "grad_norm": 19.034914016723633, + "learning_rate": 4.929078014184397e-05, + "loss": 0.1769, + "num_input_tokens_seen": 207232, + "step": 140 + }, + { + "epoch": 0.505338078291815, + "eval_loss": 0.15581394731998444, + "eval_runtime": 0.5742, + "eval_samples_per_second": 433.643, + "eval_steps_per_second": 55.729, + "num_input_tokens_seen": 209536, + "step": 142 + }, + { + "epoch": 0.5160142348754448, + "grad_norm": 15.478862762451172, + "learning_rate": 4.9999305045921804e-05, + "loss": 0.2155, + "num_input_tokens_seen": 213952, + "step": 145 + }, + { + "epoch": 0.5338078291814946, + "grad_norm": 15.320956230163574, + "learning_rate": 4.9995058244251644e-05, + "loss": 0.185, + "num_input_tokens_seen": 221376, + "step": 150 + }, + { + "epoch": 0.5516014234875445, + "grad_norm": 14.336426734924316, + "learning_rate": 4.998695138156149e-05, + "loss": 0.2471, + "num_input_tokens_seen": 228928, + "step": 155 + }, + { + "epoch": 0.5693950177935944, + "grad_norm": 9.861719131469727, + "learning_rate": 4.997498570981822e-05, + "loss": 0.2061, + "num_input_tokens_seen": 236352, + "step": 160 + }, + { + "epoch": 0.5871886120996441, + "grad_norm": 10.547555923461914, + "learning_rate": 4.995916307691601e-05, + "loss": 0.1488, + "num_input_tokens_seen": 244416, + "step": 165 + }, + { + "epoch": 0.604982206405694, + "grad_norm": 6.471895217895508, + "learning_rate": 4.993948592639104e-05, + "loss": 0.1625, + "num_input_tokens_seen": 251456, + "step": 170 + }, + { + "epoch": 0.6227758007117438, + "grad_norm": 10.839587211608887, + "learning_rate": 4.991595729704405e-05, + "loss": 0.1635, + "num_input_tokens_seen": 258880, + "step": 175 + }, + { + "epoch": 0.6405693950177936, + "grad_norm": 14.87012767791748, + "learning_rate": 4.9888580822471086e-05, + "loss": 0.163, + "num_input_tokens_seen": 265152, + "step": 180 + }, + { + "epoch": 0.6583629893238434, + "grad_norm": 12.832857131958008, + "learning_rate": 4.985736073050237e-05, + "loss": 0.1599, + "num_input_tokens_seen": 272576, + "step": 185 + }, + { + "epoch": 0.6761565836298933, + "grad_norm": 9.781329154968262, + "learning_rate": 4.982230184254933e-05, + "loss": 0.1669, + "num_input_tokens_seen": 279744, + "step": 190 + }, + { + "epoch": 0.693950177935943, + "grad_norm": 9.258131980895996, + "learning_rate": 4.9783409572860105e-05, + "loss": 0.1659, + "num_input_tokens_seen": 287680, + "step": 195 + }, + { + "epoch": 0.7117437722419929, + "grad_norm": 6.527733325958252, + "learning_rate": 4.974068992768331e-05, + "loss": 0.1729, + "num_input_tokens_seen": 294592, + "step": 200 + }, + { + "epoch": 0.7295373665480427, + "grad_norm": 17.004568099975586, + "learning_rate": 4.9694149504340517e-05, + "loss": 0.2655, + "num_input_tokens_seen": 301440, + "step": 205 + }, + { + "epoch": 0.7473309608540926, + "grad_norm": 12.954022407531738, + "learning_rate": 4.964379549020741e-05, + "loss": 0.1924, + "num_input_tokens_seen": 308416, + "step": 210 + }, + { + "epoch": 0.7580071174377224, + "eval_loss": 0.1600140929222107, + "eval_runtime": 0.5686, + "eval_samples_per_second": 437.954, + "eval_steps_per_second": 56.283, + "num_input_tokens_seen": 312576, + "step": 213 + }, + { + "epoch": 0.7651245551601423, + "grad_norm": 2.3744094371795654, + "learning_rate": 4.958963566160384e-05, + "loss": 0.1666, + "num_input_tokens_seen": 315328, + "step": 215 + }, + { + "epoch": 0.7829181494661922, + "grad_norm": 4.415204048156738, + "learning_rate": 4.953167838259285e-05, + "loss": 0.1668, + "num_input_tokens_seen": 322688, + "step": 220 + }, + { + "epoch": 0.800711743772242, + "grad_norm": 4.4129319190979, + "learning_rate": 4.946993260368904e-05, + "loss": 0.1826, + "num_input_tokens_seen": 329280, + "step": 225 + }, + { + "epoch": 0.8185053380782918, + "grad_norm": 1.2767548561096191, + "learning_rate": 4.940440786047628e-05, + "loss": 0.1488, + "num_input_tokens_seen": 336896, + "step": 230 + }, + { + "epoch": 0.8362989323843416, + "grad_norm": 10.839607238769531, + "learning_rate": 4.933511427213511e-05, + "loss": 0.2852, + "num_input_tokens_seen": 344128, + "step": 235 + }, + { + "epoch": 0.8540925266903915, + "grad_norm": 16.616424560546875, + "learning_rate": 4.926206253988001e-05, + "loss": 0.1901, + "num_input_tokens_seen": 350912, + "step": 240 + }, + { + "epoch": 0.8718861209964412, + "grad_norm": 3.9430079460144043, + "learning_rate": 4.91852639453068e-05, + "loss": 0.1972, + "num_input_tokens_seen": 358016, + "step": 245 + }, + { + "epoch": 0.8896797153024911, + "grad_norm": 6.706320762634277, + "learning_rate": 4.910473034865033e-05, + "loss": 0.3136, + "num_input_tokens_seen": 364736, + "step": 250 + }, + { + "epoch": 0.9074733096085409, + "grad_norm": 3.7334418296813965, + "learning_rate": 4.902047418695292e-05, + "loss": 0.1648, + "num_input_tokens_seen": 371648, + "step": 255 + }, + { + "epoch": 0.9252669039145908, + "grad_norm": 11.57023811340332, + "learning_rate": 4.893250847214369e-05, + "loss": 0.1706, + "num_input_tokens_seen": 379200, + "step": 260 + }, + { + "epoch": 0.9430604982206405, + "grad_norm": 1.522990345954895, + "learning_rate": 4.884084678902898e-05, + "loss": 0.2379, + "num_input_tokens_seen": 387200, + "step": 265 + }, + { + "epoch": 0.9608540925266904, + "grad_norm": 6.809507846832275, + "learning_rate": 4.874550329319457e-05, + "loss": 0.1618, + "num_input_tokens_seen": 395264, + "step": 270 + }, + { + "epoch": 0.9786476868327402, + "grad_norm": 9.76811695098877, + "learning_rate": 4.864649270881944e-05, + "loss": 0.1637, + "num_input_tokens_seen": 402176, + "step": 275 + }, + { + "epoch": 0.99644128113879, + "grad_norm": 15.906750679016113, + "learning_rate": 4.8543830326401954e-05, + "loss": 0.1956, + "num_input_tokens_seen": 409984, + "step": 280 + }, + { + "epoch": 1.01067615658363, + "eval_loss": 0.16843144595623016, + "eval_runtime": 0.6085, + "eval_samples_per_second": 409.228, + "eval_steps_per_second": 52.592, + "num_input_tokens_seen": 414040, + "step": 284 + }, + { + "epoch": 1.0142348754448398, + "grad_norm": 7.785819053649902, + "learning_rate": 4.843753200039851e-05, + "loss": 0.1483, + "num_input_tokens_seen": 415256, + "step": 285 + }, + { + "epoch": 1.0320284697508897, + "grad_norm": 2.8784444332122803, + "learning_rate": 4.832761414677503e-05, + "loss": 0.1508, + "num_input_tokens_seen": 422808, + "step": 290 + }, + { + "epoch": 1.0498220640569396, + "grad_norm": 9.171720504760742, + "learning_rate": 4.8214093740471836e-05, + "loss": 0.1599, + "num_input_tokens_seen": 430104, + "step": 295 + }, + { + "epoch": 1.0676156583629894, + "grad_norm": 0.9587394595146179, + "learning_rate": 4.8096988312782174e-05, + "loss": 0.1629, + "num_input_tokens_seen": 436760, + "step": 300 + }, + { + "epoch": 1.085409252669039, + "grad_norm": 5.9907379150390625, + "learning_rate": 4.7976315948644745e-05, + "loss": 0.1729, + "num_input_tokens_seen": 444952, + "step": 305 + }, + { + "epoch": 1.103202846975089, + "grad_norm": 0.4214398264884949, + "learning_rate": 4.7852095283850866e-05, + "loss": 3.0413, + "num_input_tokens_seen": 452760, + "step": 310 + }, + { + "epoch": 1.1209964412811388, + "grad_norm": 0.5086872577667236, + "learning_rate": 4.772434550216643e-05, + "loss": 0.1785, + "num_input_tokens_seen": 458392, + "step": 315 + }, + { + "epoch": 1.1387900355871885, + "grad_norm": 0.5129872560501099, + "learning_rate": 4.7593086332369344e-05, + "loss": 0.1666, + "num_input_tokens_seen": 465112, + "step": 320 + }, + { + "epoch": 1.1565836298932384, + "grad_norm": 7.883773326873779, + "learning_rate": 4.74583380452027e-05, + "loss": 0.2395, + "num_input_tokens_seen": 472216, + "step": 325 + }, + { + "epoch": 1.1743772241992882, + "grad_norm": 3.8998472690582275, + "learning_rate": 4.7320121450244394e-05, + "loss": 0.2229, + "num_input_tokens_seen": 479576, + "step": 330 + }, + { + "epoch": 1.1921708185053381, + "grad_norm": 11.560748100280762, + "learning_rate": 4.717845789269333e-05, + "loss": 0.2531, + "num_input_tokens_seen": 486552, + "step": 335 + }, + { + "epoch": 1.209964412811388, + "grad_norm": 20.51876449584961, + "learning_rate": 4.703336925007311e-05, + "loss": 0.2223, + "num_input_tokens_seen": 494616, + "step": 340 + }, + { + "epoch": 1.2277580071174377, + "grad_norm": 10.914800643920898, + "learning_rate": 4.68848779288534e-05, + "loss": 0.1898, + "num_input_tokens_seen": 501400, + "step": 345 + }, + { + "epoch": 1.2455516014234875, + "grad_norm": 6.894437789916992, + "learning_rate": 4.673300686098957e-05, + "loss": 0.1662, + "num_input_tokens_seen": 508888, + "step": 350 + }, + { + "epoch": 1.2633451957295374, + "grad_norm": 4.296377658843994, + "learning_rate": 4.657777950038133e-05, + "loss": 0.1589, + "num_input_tokens_seen": 517656, + "step": 355 + }, + { + "epoch": 1.2633451957295374, + "eval_loss": 0.1600693166255951, + "eval_runtime": 0.607, + "eval_samples_per_second": 410.202, + "eval_steps_per_second": 52.717, + "num_input_tokens_seen": 517656, + "step": 355 + }, + { + "epoch": 1.281138790035587, + "grad_norm": 6.819537162780762, + "learning_rate": 4.6419219819250636e-05, + "loss": 0.1538, + "num_input_tokens_seen": 526232, + "step": 360 + }, + { + "epoch": 1.298932384341637, + "grad_norm": 14.25802230834961, + "learning_rate": 4.62573523044396e-05, + "loss": 0.1811, + "num_input_tokens_seen": 533400, + "step": 365 + }, + { + "epoch": 1.3167259786476868, + "grad_norm": 3.1280250549316406, + "learning_rate": 4.609220195362886e-05, + "loss": 0.174, + "num_input_tokens_seen": 542168, + "step": 370 + }, + { + "epoch": 1.3345195729537367, + "grad_norm": 7.372785568237305, + "learning_rate": 4.5923794271477217e-05, + "loss": 0.1571, + "num_input_tokens_seen": 549976, + "step": 375 + }, + { + "epoch": 1.3523131672597866, + "grad_norm": 7.614220142364502, + "learning_rate": 4.575215526568278e-05, + "loss": 0.1641, + "num_input_tokens_seen": 557016, + "step": 380 + }, + { + "epoch": 1.3701067615658362, + "grad_norm": 38.93210983276367, + "learning_rate": 4.5577311442966584e-05, + "loss": 1.4814, + "num_input_tokens_seen": 564504, + "step": 385 + }, + { + "epoch": 1.387900355871886, + "grad_norm": 5.316745281219482, + "learning_rate": 4.539928980497903e-05, + "loss": 0.1601, + "num_input_tokens_seen": 571864, + "step": 390 + }, + { + "epoch": 1.405693950177936, + "grad_norm": 9.071686744689941, + "learning_rate": 4.521811784412996e-05, + "loss": 0.2213, + "num_input_tokens_seen": 578456, + "step": 395 + }, + { + "epoch": 1.4234875444839858, + "grad_norm": 36.631160736083984, + "learning_rate": 4.503382353934294e-05, + "loss": 1.4493, + "num_input_tokens_seen": 584600, + "step": 400 + }, + { + "epoch": 1.4412811387900355, + "grad_norm": 17.439191818237305, + "learning_rate": 4.4846435351734376e-05, + "loss": 0.1729, + "num_input_tokens_seen": 591128, + "step": 405 + }, + { + "epoch": 1.4590747330960854, + "grad_norm": 4.0148138999938965, + "learning_rate": 4.4655982220218176e-05, + "loss": 0.1539, + "num_input_tokens_seen": 598552, + "step": 410 + }, + { + "epoch": 1.4768683274021353, + "grad_norm": 0.6515812873840332, + "learning_rate": 4.446249355703661e-05, + "loss": 0.1612, + "num_input_tokens_seen": 607320, + "step": 415 + }, + { + "epoch": 1.4946619217081851, + "grad_norm": 4.950193881988525, + "learning_rate": 4.426599924321815e-05, + "loss": 0.1594, + "num_input_tokens_seen": 614744, + "step": 420 + }, + { + "epoch": 1.512455516014235, + "grad_norm": 4.902361869812012, + "learning_rate": 4.4066529623962784e-05, + "loss": 0.1947, + "num_input_tokens_seen": 622808, + "step": 425 + }, + { + "epoch": 1.5160142348754448, + "eval_loss": 0.18150445818901062, + "eval_runtime": 0.6062, + "eval_samples_per_second": 410.733, + "eval_steps_per_second": 52.785, + "num_input_tokens_seen": 624344, + "step": 426 + }, + { + "epoch": 1.5302491103202847, + "grad_norm": 0.29520076513290405, + "learning_rate": 4.386411550395576e-05, + "loss": 0.1523, + "num_input_tokens_seen": 630488, + "step": 430 + }, + { + "epoch": 1.5480427046263345, + "grad_norm": 1.9226378202438354, + "learning_rate": 4.365878814261032e-05, + "loss": 0.1721, + "num_input_tokens_seen": 638424, + "step": 435 + }, + { + "epoch": 1.5658362989323842, + "grad_norm": 6.8878493309021, + "learning_rate": 4.34505792492402e-05, + "loss": 0.1551, + "num_input_tokens_seen": 645208, + "step": 440 + }, + { + "epoch": 1.583629893238434, + "grad_norm": 9.136181831359863, + "learning_rate": 4.323952097816269e-05, + "loss": 0.1499, + "num_input_tokens_seen": 653016, + "step": 445 + }, + { + "epoch": 1.601423487544484, + "grad_norm": 7.4756178855896, + "learning_rate": 4.3025645923732926e-05, + "loss": 0.1843, + "num_input_tokens_seen": 659992, + "step": 450 + }, + { + "epoch": 1.6192170818505338, + "grad_norm": 7.807384490966797, + "learning_rate": 4.2808987115310255e-05, + "loss": 0.1579, + "num_input_tokens_seen": 667224, + "step": 455 + }, + { + "epoch": 1.6370106761565837, + "grad_norm": 0.17006787657737732, + "learning_rate": 4.2589578012157426e-05, + "loss": 0.1563, + "num_input_tokens_seen": 675160, + "step": 460 + }, + { + "epoch": 1.6548042704626336, + "grad_norm": 0.41114601492881775, + "learning_rate": 4.236745249827336e-05, + "loss": 0.1556, + "num_input_tokens_seen": 683544, + "step": 465 + }, + { + "epoch": 1.6725978647686834, + "grad_norm": 2.4918622970581055, + "learning_rate": 4.214264487716033e-05, + "loss": 0.1593, + "num_input_tokens_seen": 689368, + "step": 470 + }, + { + "epoch": 1.690391459074733, + "grad_norm": 10.712060928344727, + "learning_rate": 4.191518986652642e-05, + "loss": 0.1699, + "num_input_tokens_seen": 695832, + "step": 475 + }, + { + "epoch": 1.708185053380783, + "grad_norm": 0.39044228196144104, + "learning_rate": 4.168512259292391e-05, + "loss": 0.1563, + "num_input_tokens_seen": 703128, + "step": 480 + }, + { + "epoch": 1.7259786476868326, + "grad_norm": 4.815671443939209, + "learning_rate": 4.1452478586324605e-05, + "loss": 0.1507, + "num_input_tokens_seen": 709528, + "step": 485 + }, + { + "epoch": 1.7437722419928825, + "grad_norm": 0.5018470287322998, + "learning_rate": 4.121729377463285e-05, + "loss": 0.1558, + "num_input_tokens_seen": 716312, + "step": 490 + }, + { + "epoch": 1.7615658362989324, + "grad_norm": 10.01478099822998, + "learning_rate": 4.097960447813705e-05, + "loss": 0.1825, + "num_input_tokens_seen": 722776, + "step": 495 + }, + { + "epoch": 1.7686832740213523, + "eval_loss": 0.16469639539718628, + "eval_runtime": 0.5964, + "eval_samples_per_second": 417.484, + "eval_steps_per_second": 53.653, + "num_input_tokens_seen": 725656, + "step": 497 + }, + { + "epoch": 1.7793594306049823, + "grad_norm": 3.8590610027313232, + "learning_rate": 4.073944740390061e-05, + "loss": 0.1798, + "num_input_tokens_seen": 729944, + "step": 500 + }, + { + "epoch": 1.7971530249110321, + "grad_norm": 4.1739020347595215, + "learning_rate": 4.049685964009321e-05, + "loss": 0.1694, + "num_input_tokens_seen": 737112, + "step": 505 + }, + { + "epoch": 1.814946619217082, + "grad_norm": 10.671394348144531, + "learning_rate": 4.025187865026311e-05, + "loss": 0.1605, + "num_input_tokens_seen": 744408, + "step": 510 + }, + { + "epoch": 1.8327402135231317, + "grad_norm": 0.9396809935569763, + "learning_rate": 4.000454226755159e-05, + "loss": 0.1574, + "num_input_tokens_seen": 750488, + "step": 515 + }, + { + "epoch": 1.8505338078291815, + "grad_norm": 6.7215447425842285, + "learning_rate": 3.975488868885021e-05, + "loss": 0.1703, + "num_input_tokens_seen": 757528, + "step": 520 + }, + { + "epoch": 1.8683274021352312, + "grad_norm": 0.5858572721481323, + "learning_rate": 3.9502956468902014e-05, + "loss": 0.1545, + "num_input_tokens_seen": 763736, + "step": 525 + }, + { + "epoch": 1.886120996441281, + "grad_norm": 2.219594955444336, + "learning_rate": 3.924878451434735e-05, + "loss": 0.1534, + "num_input_tokens_seen": 771864, + "step": 530 + }, + { + "epoch": 1.903914590747331, + "grad_norm": 1.9175541400909424, + "learning_rate": 3.899241207771546e-05, + "loss": 0.1537, + "num_input_tokens_seen": 778712, + "step": 535 + }, + { + "epoch": 1.9217081850533808, + "grad_norm": 12.399153709411621, + "learning_rate": 3.873387875136252e-05, + "loss": 0.1917, + "num_input_tokens_seen": 784280, + "step": 540 + }, + { + "epoch": 1.9395017793594307, + "grad_norm": 7.259119987487793, + "learning_rate": 3.847322446135736e-05, + "loss": 0.1743, + "num_input_tokens_seen": 792280, + "step": 545 + }, + { + "epoch": 1.9572953736654806, + "grad_norm": 7.568546772003174, + "learning_rate": 3.821048946131549e-05, + "loss": 0.1752, + "num_input_tokens_seen": 798488, + "step": 550 + }, + { + "epoch": 1.9750889679715302, + "grad_norm": 6.783497333526611, + "learning_rate": 3.794571432618267e-05, + "loss": 0.1578, + "num_input_tokens_seen": 806104, + "step": 555 + }, + { + "epoch": 1.99288256227758, + "grad_norm": 9.681258201599121, + "learning_rate": 3.767893994596876e-05, + "loss": 0.1774, + "num_input_tokens_seen": 813336, + "step": 560 + }, + { + "epoch": 2.0106761565836297, + "grad_norm": 3.2600245475769043, + "learning_rate": 3.741020751943297e-05, + "loss": 0.1568, + "num_input_tokens_seen": 817576, + "step": 565 + }, + { + "epoch": 2.02135231316726, + "eval_loss": 0.15550938248634338, + "eval_runtime": 0.6255, + "eval_samples_per_second": 398.079, + "eval_steps_per_second": 51.159, + "num_input_tokens_seen": 821416, + "step": 568 + }, + { + "epoch": 2.0284697508896796, + "grad_norm": 3.0256900787353516, + "learning_rate": 3.713955854772144e-05, + "loss": 0.1565, + "num_input_tokens_seen": 823848, + "step": 570 + }, + { + "epoch": 2.0462633451957295, + "grad_norm": 1.889113187789917, + "learning_rate": 3.686703482795802e-05, + "loss": 0.1536, + "num_input_tokens_seen": 832232, + "step": 575 + }, + { + "epoch": 2.0640569395017794, + "grad_norm": 3.334212303161621, + "learning_rate": 3.6592678446789516e-05, + "loss": 0.1624, + "num_input_tokens_seen": 840424, + "step": 580 + }, + { + "epoch": 2.0818505338078293, + "grad_norm": 3.6044702529907227, + "learning_rate": 3.631653177388605e-05, + "loss": 0.1395, + "num_input_tokens_seen": 846824, + "step": 585 + }, + { + "epoch": 2.099644128113879, + "grad_norm": 8.975861549377441, + "learning_rate": 3.60386374553978e-05, + "loss": 0.196, + "num_input_tokens_seen": 853608, + "step": 590 + }, + { + "epoch": 2.117437722419929, + "grad_norm": 10.559611320495605, + "learning_rate": 3.5759038407369056e-05, + "loss": 0.1637, + "num_input_tokens_seen": 860968, + "step": 595 + }, + { + "epoch": 2.135231316725979, + "grad_norm": 6.914389610290527, + "learning_rate": 3.547777780911055e-05, + "loss": 0.194, + "num_input_tokens_seen": 868904, + "step": 600 + }, + { + "epoch": 2.1530249110320283, + "grad_norm": 8.329413414001465, + "learning_rate": 3.519489909653113e-05, + "loss": 0.1592, + "num_input_tokens_seen": 876072, + "step": 605 + }, + { + "epoch": 2.170818505338078, + "grad_norm": 4.701565742492676, + "learning_rate": 3.4910445955429854e-05, + "loss": 0.1549, + "num_input_tokens_seen": 883752, + "step": 610 + }, + { + "epoch": 2.188612099644128, + "grad_norm": 7.797508716583252, + "learning_rate": 3.4624462314749443e-05, + "loss": 0.1533, + "num_input_tokens_seen": 891304, + "step": 615 + }, + { + "epoch": 2.206405693950178, + "grad_norm": 1.7337656021118164, + "learning_rate": 3.433699233979222e-05, + "loss": 0.1483, + "num_input_tokens_seen": 899176, + "step": 620 + }, + { + "epoch": 2.224199288256228, + "grad_norm": 5.721285343170166, + "learning_rate": 3.4048080425399505e-05, + "loss": 0.1436, + "num_input_tokens_seen": 907560, + "step": 625 + }, + { + "epoch": 2.2419928825622777, + "grad_norm": 3.0777595043182373, + "learning_rate": 3.375777118909561e-05, + "loss": 0.1413, + "num_input_tokens_seen": 915240, + "step": 630 + }, + { + "epoch": 2.2597864768683276, + "grad_norm": 15.890474319458008, + "learning_rate": 3.3466109464197426e-05, + "loss": 0.1597, + "num_input_tokens_seen": 921384, + "step": 635 + }, + { + "epoch": 2.2740213523131674, + "eval_loss": 0.1567462682723999, + "eval_runtime": 0.6255, + "eval_samples_per_second": 398.087, + "eval_steps_per_second": 51.16, + "num_input_tokens_seen": 926760, + "step": 639 + }, + { + "epoch": 2.277580071174377, + "grad_norm": 1.5718131065368652, + "learning_rate": 3.317314029289067e-05, + "loss": 0.1653, + "num_input_tokens_seen": 927528, + "step": 640 + }, + { + "epoch": 2.295373665480427, + "grad_norm": 3.7291853427886963, + "learning_rate": 3.287890891927386e-05, + "loss": 0.1594, + "num_input_tokens_seen": 934568, + "step": 645 + }, + { + "epoch": 2.3131672597864767, + "grad_norm": 4.549835205078125, + "learning_rate": 3.258346078237122e-05, + "loss": 0.1402, + "num_input_tokens_seen": 942248, + "step": 650 + }, + { + "epoch": 2.3309608540925266, + "grad_norm": 14.683507919311523, + "learning_rate": 3.228684150911527e-05, + "loss": 0.2418, + "num_input_tokens_seen": 949096, + "step": 655 + }, + { + "epoch": 2.3487544483985765, + "grad_norm": 1.7894399166107178, + "learning_rate": 3.198909690730063e-05, + "loss": 0.1845, + "num_input_tokens_seen": 955752, + "step": 660 + }, + { + "epoch": 2.3665480427046264, + "grad_norm": 15.066572189331055, + "learning_rate": 3.169027295850977e-05, + "loss": 0.1664, + "num_input_tokens_seen": 963176, + "step": 665 + }, + { + "epoch": 2.3843416370106763, + "grad_norm": 4.301926136016846, + "learning_rate": 3.139041581101187e-05, + "loss": 0.1627, + "num_input_tokens_seen": 968232, + "step": 670 + }, + { + "epoch": 2.402135231316726, + "grad_norm": 5.145651340484619, + "learning_rate": 3.108957177263608e-05, + "loss": 0.1498, + "num_input_tokens_seen": 976552, + "step": 675 + }, + { + "epoch": 2.419928825622776, + "grad_norm": 2.5066633224487305, + "learning_rate": 3.078778730362003e-05, + "loss": 0.1656, + "num_input_tokens_seen": 983720, + "step": 680 + }, + { + "epoch": 2.4377224199288254, + "grad_norm": 3.9444332122802734, + "learning_rate": 3.048510900943484e-05, + "loss": 0.1567, + "num_input_tokens_seen": 991976, + "step": 685 + }, + { + "epoch": 2.4555160142348753, + "grad_norm": 4.341545581817627, + "learning_rate": 3.018158363358773e-05, + "loss": 0.1807, + "num_input_tokens_seen": 998184, + "step": 690 + }, + { + "epoch": 2.473309608540925, + "grad_norm": 4.363418102264404, + "learning_rate": 2.9877258050403212e-05, + "loss": 0.1678, + "num_input_tokens_seen": 1005672, + "step": 695 + }, + { + "epoch": 2.491103202846975, + "grad_norm": 3.3406949043273926, + "learning_rate": 2.9572179257784215e-05, + "loss": 0.1531, + "num_input_tokens_seen": 1013096, + "step": 700 + }, + { + "epoch": 2.508896797153025, + "grad_norm": 2.7513387203216553, + "learning_rate": 2.9266394369954052e-05, + "loss": 0.1337, + "num_input_tokens_seen": 1019304, + "step": 705 + }, + { + "epoch": 2.526690391459075, + "grad_norm": 7.649652481079102, + "learning_rate": 2.8959950610180374e-05, + "loss": 0.1431, + "num_input_tokens_seen": 1025320, + "step": 710 + }, + { + "epoch": 2.526690391459075, + "eval_loss": 0.16391661763191223, + "eval_runtime": 0.6072, + "eval_samples_per_second": 410.078, + "eval_steps_per_second": 52.701, + "num_input_tokens_seen": 1025320, + "step": 710 + }, + { + "epoch": 2.5444839857651247, + "grad_norm": 15.210580825805664, + "learning_rate": 2.865289530348243e-05, + "loss": 0.1675, + "num_input_tokens_seen": 1032552, + "step": 715 + }, + { + "epoch": 2.562277580071174, + "grad_norm": 4.497170925140381, + "learning_rate": 2.834527586932243e-05, + "loss": 2.4615, + "num_input_tokens_seen": 1039912, + "step": 720 + }, + { + "epoch": 2.580071174377224, + "grad_norm": 10.657808303833008, + "learning_rate": 2.8037139814282493e-05, + "loss": 0.1636, + "num_input_tokens_seen": 1047208, + "step": 725 + }, + { + "epoch": 2.597864768683274, + "grad_norm": 1.3169434070587158, + "learning_rate": 2.7728534724728027e-05, + "loss": 0.1652, + "num_input_tokens_seen": 1053928, + "step": 730 + }, + { + "epoch": 2.6156583629893237, + "grad_norm": 2.855050802230835, + "learning_rate": 2.741950825945881e-05, + "loss": 0.1482, + "num_input_tokens_seen": 1061608, + "step": 735 + }, + { + "epoch": 2.6334519572953736, + "grad_norm": 2.2470901012420654, + "learning_rate": 2.711010814234896e-05, + "loss": 0.1501, + "num_input_tokens_seen": 1067560, + "step": 740 + }, + { + "epoch": 2.6512455516014235, + "grad_norm": 4.065670967102051, + "learning_rate": 2.6800382154976732e-05, + "loss": 0.1743, + "num_input_tokens_seen": 1074152, + "step": 745 + }, + { + "epoch": 2.6690391459074734, + "grad_norm": 5.455725193023682, + "learning_rate": 2.6490378129245498e-05, + "loss": 0.1441, + "num_input_tokens_seen": 1082856, + "step": 750 + }, + { + "epoch": 2.6868327402135233, + "grad_norm": 3.1051108837127686, + "learning_rate": 2.6180143939996925e-05, + "loss": 0.1495, + "num_input_tokens_seen": 1089512, + "step": 755 + }, + { + "epoch": 2.704626334519573, + "grad_norm": 2.337266206741333, + "learning_rate": 2.5869727497617495e-05, + "loss": 0.1464, + "num_input_tokens_seen": 1096232, + "step": 760 + }, + { + "epoch": 2.722419928825623, + "grad_norm": 4.207283973693848, + "learning_rate": 2.55591767406396e-05, + "loss": 0.1572, + "num_input_tokens_seen": 1104168, + "step": 765 + }, + { + "epoch": 2.7402135231316724, + "grad_norm": 2.140827178955078, + "learning_rate": 2.5248539628338246e-05, + "loss": 0.1326, + "num_input_tokens_seen": 1112232, + "step": 770 + }, + { + "epoch": 2.7580071174377223, + "grad_norm": 8.35146713256836, + "learning_rate": 2.4937864133324516e-05, + "loss": 0.1734, + "num_input_tokens_seen": 1119016, + "step": 775 + }, + { + "epoch": 2.775800711743772, + "grad_norm": 18.731395721435547, + "learning_rate": 2.462719823413707e-05, + "loss": 0.1986, + "num_input_tokens_seen": 1126696, + "step": 780 + }, + { + "epoch": 2.7793594306049823, + "eval_loss": 0.15414386987686157, + "eval_runtime": 0.6372, + "eval_samples_per_second": 390.788, + "eval_steps_per_second": 50.222, + "num_input_tokens_seen": 1128104, + "step": 781 + }, + { + "epoch": 2.793594306049822, + "grad_norm": 6.263734817504883, + "learning_rate": 2.4316589907832654e-05, + "loss": 0.1576, + "num_input_tokens_seen": 1134184, + "step": 785 + }, + { + "epoch": 2.811387900355872, + "grad_norm": 1.7886258363723755, + "learning_rate": 2.4006087122576863e-05, + "loss": 0.1392, + "num_input_tokens_seen": 1140392, + "step": 790 + }, + { + "epoch": 2.829181494661922, + "grad_norm": 9.585826873779297, + "learning_rate": 2.3695737830236266e-05, + "loss": 0.2025, + "num_input_tokens_seen": 1148328, + "step": 795 + }, + { + "epoch": 2.8469750889679717, + "grad_norm": 3.7239151000976562, + "learning_rate": 2.338558995897307e-05, + "loss": 0.1781, + "num_input_tokens_seen": 1154024, + "step": 800 + }, + { + "epoch": 2.864768683274021, + "grad_norm": 7.329390525817871, + "learning_rate": 2.3075691405843435e-05, + "loss": 0.195, + "num_input_tokens_seen": 1160808, + "step": 805 + }, + { + "epoch": 2.882562277580071, + "grad_norm": 5.577742099761963, + "learning_rate": 2.2766090029400573e-05, + "loss": 0.1597, + "num_input_tokens_seen": 1167912, + "step": 810 + }, + { + "epoch": 2.900355871886121, + "grad_norm": 8.529340744018555, + "learning_rate": 2.2456833642303822e-05, + "loss": 0.1433, + "num_input_tokens_seen": 1174568, + "step": 815 + }, + { + "epoch": 2.9181494661921707, + "grad_norm": 5.017305374145508, + "learning_rate": 2.214797000393479e-05, + "loss": 0.1553, + "num_input_tokens_seen": 1181480, + "step": 820 + }, + { + "epoch": 2.9359430604982206, + "grad_norm": 3.5880136489868164, + "learning_rate": 2.183954681302173e-05, + "loss": 0.1614, + "num_input_tokens_seen": 1189928, + "step": 825 + }, + { + "epoch": 2.9537366548042705, + "grad_norm": 1.7257145643234253, + "learning_rate": 2.1531611700273297e-05, + "loss": 0.1351, + "num_input_tokens_seen": 1197480, + "step": 830 + }, + { + "epoch": 2.9715302491103204, + "grad_norm": 4.875583171844482, + "learning_rate": 2.1224212221022777e-05, + "loss": 0.1845, + "num_input_tokens_seen": 1204584, + "step": 835 + }, + { + "epoch": 2.9893238434163703, + "grad_norm": 5.411481857299805, + "learning_rate": 2.0917395847883995e-05, + "loss": 0.1616, + "num_input_tokens_seen": 1212584, + "step": 840 + }, + { + "epoch": 3.00711743772242, + "grad_norm": 4.330006122589111, + "learning_rate": 2.0611209963419958e-05, + "loss": 0.1625, + "num_input_tokens_seen": 1217856, + "step": 845 + }, + { + "epoch": 3.0249110320284696, + "grad_norm": 10.39330768585205, + "learning_rate": 2.030570185282544e-05, + "loss": 0.137, + "num_input_tokens_seen": 1226624, + "step": 850 + }, + { + "epoch": 3.0320284697508897, + "eval_loss": 0.1851627230644226, + "eval_runtime": 0.6345, + "eval_samples_per_second": 392.434, + "eval_steps_per_second": 50.433, + "num_input_tokens_seen": 1229440, + "step": 852 + }, + { + "epoch": 3.0427046263345194, + "grad_norm": 3.0105044841766357, + "learning_rate": 2.0000918696624588e-05, + "loss": 0.1453, + "num_input_tokens_seen": 1233152, + "step": 855 + }, + { + "epoch": 3.0604982206405693, + "grad_norm": 2.1030280590057373, + "learning_rate": 1.9696907563384687e-05, + "loss": 0.138, + "num_input_tokens_seen": 1240128, + "step": 860 + }, + { + "epoch": 3.078291814946619, + "grad_norm": 2.1849405765533447, + "learning_rate": 1.939371540244723e-05, + "loss": 0.1148, + "num_input_tokens_seen": 1248064, + "step": 865 + }, + { + "epoch": 3.096085409252669, + "grad_norm": 6.3520402908325195, + "learning_rate": 1.9091389036677382e-05, + "loss": 0.1106, + "num_input_tokens_seen": 1255232, + "step": 870 + }, + { + "epoch": 3.113879003558719, + "grad_norm": 3.93772554397583, + "learning_rate": 1.878997515523299e-05, + "loss": 0.1169, + "num_input_tokens_seen": 1262272, + "step": 875 + }, + { + "epoch": 3.131672597864769, + "grad_norm": 6.558725833892822, + "learning_rate": 1.848952030635424e-05, + "loss": 0.1161, + "num_input_tokens_seen": 1269632, + "step": 880 + }, + { + "epoch": 3.1494661921708187, + "grad_norm": 3.3383939266204834, + "learning_rate": 1.819007089017508e-05, + "loss": 0.123, + "num_input_tokens_seen": 1277312, + "step": 885 + }, + { + "epoch": 3.167259786476868, + "grad_norm": 15.820018768310547, + "learning_rate": 1.789167315155749e-05, + "loss": 0.1599, + "num_input_tokens_seen": 1284096, + "step": 890 + }, + { + "epoch": 3.185053380782918, + "grad_norm": 2.621346950531006, + "learning_rate": 1.7594373172949784e-05, + "loss": 0.1109, + "num_input_tokens_seen": 1291648, + "step": 895 + }, + { + "epoch": 3.202846975088968, + "grad_norm": 6.172404766082764, + "learning_rate": 1.7298216867269906e-05, + "loss": 0.1569, + "num_input_tokens_seen": 1299712, + "step": 900 + }, + { + "epoch": 3.2206405693950177, + "grad_norm": 10.012272834777832, + "learning_rate": 1.7003249970815026e-05, + "loss": 0.1082, + "num_input_tokens_seen": 1306176, + "step": 905 + }, + { + "epoch": 3.2384341637010676, + "grad_norm": 3.6646652221679688, + "learning_rate": 1.6709518036198308e-05, + "loss": 0.1387, + "num_input_tokens_seen": 1314112, + "step": 910 + }, + { + "epoch": 3.2562277580071175, + "grad_norm": 9.655856132507324, + "learning_rate": 1.6417066425314087e-05, + "loss": 0.1199, + "num_input_tokens_seen": 1321088, + "step": 915 + }, + { + "epoch": 3.2740213523131674, + "grad_norm": 7.546687602996826, + "learning_rate": 1.612594030233252e-05, + "loss": 0.1422, + "num_input_tokens_seen": 1328512, + "step": 920 + }, + { + "epoch": 3.284697508896797, + "eval_loss": 0.16463510692119598, + "eval_runtime": 0.6174, + "eval_samples_per_second": 403.311, + "eval_steps_per_second": 51.831, + "num_input_tokens_seen": 1332544, + "step": 923 + }, + { + "epoch": 3.2918149466192173, + "grad_norm": 3.2389485836029053, + "learning_rate": 1.583618462672472e-05, + "loss": 0.0863, + "num_input_tokens_seen": 1336128, + "step": 925 + }, + { + "epoch": 3.309608540925267, + "grad_norm": 3.8101906776428223, + "learning_rate": 1.5547844146319545e-05, + "loss": 0.1155, + "num_input_tokens_seen": 1343552, + "step": 930 + }, + { + "epoch": 3.3274021352313166, + "grad_norm": 5.337780475616455, + "learning_rate": 1.5260963390393075e-05, + "loss": 0.1691, + "num_input_tokens_seen": 1351552, + "step": 935 + }, + { + "epoch": 3.3451957295373664, + "grad_norm": 4.4513840675354, + "learning_rate": 1.4975586662791783e-05, + "loss": 0.0983, + "num_input_tokens_seen": 1358272, + "step": 940 + }, + { + "epoch": 3.3629893238434163, + "grad_norm": 7.950605392456055, + "learning_rate": 1.4691758035090602e-05, + "loss": 0.137, + "num_input_tokens_seen": 1366784, + "step": 945 + }, + { + "epoch": 3.380782918149466, + "grad_norm": 2.973015785217285, + "learning_rate": 1.4409521339786808e-05, + "loss": 0.1389, + "num_input_tokens_seen": 1373312, + "step": 950 + }, + { + "epoch": 3.398576512455516, + "grad_norm": 1.8699113130569458, + "learning_rate": 1.41289201635308e-05, + "loss": 0.0916, + "num_input_tokens_seen": 1380736, + "step": 955 + }, + { + "epoch": 3.416370106761566, + "grad_norm": 1.629996657371521, + "learning_rate": 1.3849997840394943e-05, + "loss": 0.096, + "num_input_tokens_seen": 1388544, + "step": 960 + }, + { + "epoch": 3.434163701067616, + "grad_norm": 3.142674446105957, + "learning_rate": 1.3572797445181345e-05, + "loss": 0.1252, + "num_input_tokens_seen": 1396160, + "step": 965 + }, + { + "epoch": 3.4519572953736652, + "grad_norm": 1.9603294134140015, + "learning_rate": 1.3297361786769652e-05, + "loss": 0.0988, + "num_input_tokens_seen": 1404096, + "step": 970 + }, + { + "epoch": 3.469750889679715, + "grad_norm": 18.924589157104492, + "learning_rate": 1.3023733401505981e-05, + "loss": 0.1135, + "num_input_tokens_seen": 1411008, + "step": 975 + }, + { + "epoch": 3.487544483985765, + "grad_norm": 4.6644487380981445, + "learning_rate": 1.2751954546633871e-05, + "loss": 0.155, + "num_input_tokens_seen": 1418880, + "step": 980 + }, + { + "epoch": 3.505338078291815, + "grad_norm": 8.87281608581543, + "learning_rate": 1.2482067193768417e-05, + "loss": 0.1302, + "num_input_tokens_seen": 1426048, + "step": 985 + }, + { + "epoch": 3.5231316725978647, + "grad_norm": 6.374912738800049, + "learning_rate": 1.2214113022414448e-05, + "loss": 0.0911, + "num_input_tokens_seen": 1432064, + "step": 990 + }, + { + "epoch": 3.5373665480427046, + "eval_loss": 0.1803617924451828, + "eval_runtime": 0.6287, + "eval_samples_per_second": 396.078, + "eval_steps_per_second": 50.902, + "num_input_tokens_seen": 1438336, + "step": 994 + }, + { + "epoch": 3.5409252669039146, + "grad_norm": 7.5531110763549805, + "learning_rate": 1.1948133413529817e-05, + "loss": 0.1165, + "num_input_tokens_seen": 1439808, + "step": 995 + }, + { + "epoch": 3.5587188612099645, + "grad_norm": 10.984672546386719, + "learning_rate": 1.168416944313486e-05, + "loss": 0.156, + "num_input_tokens_seen": 1447616, + "step": 1000 + }, + { + "epoch": 3.5765124555160144, + "grad_norm": 5.665327072143555, + "learning_rate": 1.1422261875968845e-05, + "loss": 0.0978, + "num_input_tokens_seen": 1454208, + "step": 1005 + }, + { + "epoch": 3.5943060498220643, + "grad_norm": 5.291867256164551, + "learning_rate": 1.1162451159194614e-05, + "loss": 0.0784, + "num_input_tokens_seen": 1463296, + "step": 1010 + }, + { + "epoch": 3.612099644128114, + "grad_norm": 4.302516937255859, + "learning_rate": 1.0904777416152166e-05, + "loss": 0.1698, + "num_input_tokens_seen": 1469952, + "step": 1015 + }, + { + "epoch": 3.6298932384341636, + "grad_norm": 2.612572193145752, + "learning_rate": 1.0649280440162326e-05, + "loss": 0.1033, + "num_input_tokens_seen": 1477184, + "step": 1020 + }, + { + "epoch": 3.6476868327402134, + "grad_norm": 7.643741607666016, + "learning_rate": 1.0395999688381314e-05, + "loss": 0.1025, + "num_input_tokens_seen": 1484160, + "step": 1025 + }, + { + "epoch": 3.6654804270462633, + "grad_norm": 1.1666496992111206, + "learning_rate": 1.0144974275707241e-05, + "loss": 0.0885, + "num_input_tokens_seen": 1491200, + "step": 1030 + }, + { + "epoch": 3.683274021352313, + "grad_norm": 8.459441184997559, + "learning_rate": 9.896242968739539e-06, + "loss": 0.1678, + "num_input_tokens_seen": 1498368, + "step": 1035 + }, + { + "epoch": 3.701067615658363, + "grad_norm": 7.720543384552002, + "learning_rate": 9.649844179792081e-06, + "loss": 0.1068, + "num_input_tokens_seen": 1505984, + "step": 1040 + }, + { + "epoch": 3.718861209964413, + "grad_norm": 1.8878631591796875, + "learning_rate": 9.405815960961054e-06, + "loss": 0.0978, + "num_input_tokens_seen": 1511680, + "step": 1045 + }, + { + "epoch": 3.7366548042704624, + "grad_norm": 2.47867488861084, + "learning_rate": 9.16419599824847e-06, + "loss": 0.0966, + "num_input_tokens_seen": 1517888, + "step": 1050 + }, + { + "epoch": 3.7544483985765122, + "grad_norm": 3.3050386905670166, + "learning_rate": 8.925021605742211e-06, + "loss": 0.1815, + "num_input_tokens_seen": 1525568, + "step": 1055 + }, + { + "epoch": 3.772241992882562, + "grad_norm": 6.0262837409973145, + "learning_rate": 8.68832971985347e-06, + "loss": 0.1028, + "num_input_tokens_seen": 1532480, + "step": 1060 + }, + { + "epoch": 3.790035587188612, + "grad_norm": 2.8200912475585938, + "learning_rate": 8.454156893612591e-06, + "loss": 0.1203, + "num_input_tokens_seen": 1539072, + "step": 1065 + }, + { + "epoch": 3.790035587188612, + "eval_loss": 0.17713916301727295, + "eval_runtime": 0.6261, + "eval_samples_per_second": 397.715, + "eval_steps_per_second": 51.112, + "num_input_tokens_seen": 1539072, + "step": 1065 + }, + { + "epoch": 3.807829181494662, + "grad_norm": 2.3930211067199707, + "learning_rate": 8.222539291024078e-06, + "loss": 0.1178, + "num_input_tokens_seen": 1547584, + "step": 1070 + }, + { + "epoch": 3.8256227758007118, + "grad_norm": 7.24454402923584, + "learning_rate": 7.993512681481639e-06, + "loss": 0.0999, + "num_input_tokens_seen": 1554304, + "step": 1075 + }, + { + "epoch": 3.8434163701067616, + "grad_norm": 7.17146110534668, + "learning_rate": 7.767112434244253e-06, + "loss": 0.145, + "num_input_tokens_seen": 1560896, + "step": 1080 + }, + { + "epoch": 3.8612099644128115, + "grad_norm": 4.711667060852051, + "learning_rate": 7.543373512973947e-06, + "loss": 0.0627, + "num_input_tokens_seen": 1567744, + "step": 1085 + }, + { + "epoch": 3.8790035587188614, + "grad_norm": 12.18324089050293, + "learning_rate": 7.3223304703363135e-06, + "loss": 0.1558, + "num_input_tokens_seen": 1574400, + "step": 1090 + }, + { + "epoch": 3.8967971530249113, + "grad_norm": 2.6999011039733887, + "learning_rate": 7.104017442664393e-06, + "loss": 0.0965, + "num_input_tokens_seen": 1581504, + "step": 1095 + }, + { + "epoch": 3.914590747330961, + "grad_norm": 5.639074802398682, + "learning_rate": 6.8884681446869105e-06, + "loss": 0.0914, + "num_input_tokens_seen": 1589504, + "step": 1100 + }, + { + "epoch": 3.9323843416370106, + "grad_norm": 6.777685165405273, + "learning_rate": 6.67571586432163e-06, + "loss": 0.124, + "num_input_tokens_seen": 1597696, + "step": 1105 + }, + { + "epoch": 3.9501779359430604, + "grad_norm": 5.154758453369141, + "learning_rate": 6.465793457534553e-06, + "loss": 0.1388, + "num_input_tokens_seen": 1605248, + "step": 1110 + }, + { + "epoch": 3.9679715302491103, + "grad_norm": 4.713754653930664, + "learning_rate": 6.258733343265932e-06, + "loss": 0.1646, + "num_input_tokens_seen": 1613952, + "step": 1115 + }, + { + "epoch": 3.98576512455516, + "grad_norm": 5.546712875366211, + "learning_rate": 6.0545674984236826e-06, + "loss": 0.1024, + "num_input_tokens_seen": 1620224, + "step": 1120 + }, + { + "epoch": 4.00355871886121, + "grad_norm": 1.0218762159347534, + "learning_rate": 5.853327452945115e-06, + "loss": 0.0889, + "num_input_tokens_seen": 1625800, + "step": 1125 + }, + { + "epoch": 4.0213523131672595, + "grad_norm": 7.033966541290283, + "learning_rate": 5.655044284927657e-06, + "loss": 0.0747, + "num_input_tokens_seen": 1633352, + "step": 1130 + }, + { + "epoch": 4.039145907473309, + "grad_norm": 1.1709257364273071, + "learning_rate": 5.459748615829355e-06, + "loss": 0.0551, + "num_input_tokens_seen": 1640840, + "step": 1135 + }, + { + "epoch": 4.04270462633452, + "eval_loss": 0.19830213487148285, + "eval_runtime": 0.616, + "eval_samples_per_second": 404.216, + "eval_steps_per_second": 51.947, + "num_input_tokens_seen": 1642696, + "step": 1136 + }, + { + "epoch": 4.056939501779359, + "grad_norm": 2.399528980255127, + "learning_rate": 5.267470605739952e-06, + "loss": 0.0395, + "num_input_tokens_seen": 1648520, + "step": 1140 + }, + { + "epoch": 4.074733096085409, + "grad_norm": 3.8567628860473633, + "learning_rate": 5.078239948723154e-06, + "loss": 0.0215, + "num_input_tokens_seen": 1655752, + "step": 1145 + }, + { + "epoch": 4.092526690391459, + "grad_norm": 2.231137990951538, + "learning_rate": 4.892085868230881e-06, + "loss": 0.0073, + "num_input_tokens_seen": 1662920, + "step": 1150 + }, + { + "epoch": 4.110320284697509, + "grad_norm": 8.699728012084961, + "learning_rate": 4.709037112590217e-06, + "loss": 0.0348, + "num_input_tokens_seen": 1669896, + "step": 1155 + }, + { + "epoch": 4.128113879003559, + "grad_norm": 8.660861015319824, + "learning_rate": 4.529121950563716e-06, + "loss": 0.076, + "num_input_tokens_seen": 1675400, + "step": 1160 + }, + { + "epoch": 4.145907473309609, + "grad_norm": 7.111387252807617, + "learning_rate": 4.352368166983753e-06, + "loss": 0.0705, + "num_input_tokens_seen": 1682952, + "step": 1165 + }, + { + "epoch": 4.1637010676156585, + "grad_norm": 6.721922874450684, + "learning_rate": 4.178803058461664e-06, + "loss": 0.088, + "num_input_tokens_seen": 1690248, + "step": 1170 + }, + { + "epoch": 4.181494661921708, + "grad_norm": 1.4173535108566284, + "learning_rate": 4.0084534291722376e-06, + "loss": 0.05, + "num_input_tokens_seen": 1696840, + "step": 1175 + }, + { + "epoch": 4.199288256227758, + "grad_norm": 0.436257928609848, + "learning_rate": 3.841345586714251e-06, + "loss": 0.0689, + "num_input_tokens_seen": 1703624, + "step": 1180 + }, + { + "epoch": 4.217081850533808, + "grad_norm": 0.09257882088422775, + "learning_rate": 3.677505338047729e-06, + "loss": 0.0218, + "num_input_tokens_seen": 1710024, + "step": 1185 + }, + { + "epoch": 4.234875444839858, + "grad_norm": 0.0730605497956276, + "learning_rate": 3.516957985508476e-06, + "loss": 0.068, + "num_input_tokens_seen": 1717768, + "step": 1190 + }, + { + "epoch": 4.252669039145908, + "grad_norm": 0.23621395230293274, + "learning_rate": 3.3597283229005877e-06, + "loss": 0.021, + "num_input_tokens_seen": 1727240, + "step": 1195 + }, + { + "epoch": 4.270462633451958, + "grad_norm": 0.33008334040641785, + "learning_rate": 3.205840631667456e-06, + "loss": 0.0422, + "num_input_tokens_seen": 1734408, + "step": 1200 + }, + { + "epoch": 4.288256227758007, + "grad_norm": 9.555450439453125, + "learning_rate": 3.0553186771419162e-06, + "loss": 0.0577, + "num_input_tokens_seen": 1740936, + "step": 1205 + }, + { + "epoch": 4.295373665480427, + "eval_loss": 0.3402128219604492, + "eval_runtime": 0.6132, + "eval_samples_per_second": 406.087, + "eval_steps_per_second": 52.188, + "num_input_tokens_seen": 1743624, + "step": 1207 + }, + { + "epoch": 4.306049822064057, + "grad_norm": 1.463619589805603, + "learning_rate": 2.908185704876101e-06, + "loss": 0.0397, + "num_input_tokens_seen": 1747784, + "step": 1210 + }, + { + "epoch": 4.3238434163701065, + "grad_norm": 8.920357704162598, + "learning_rate": 2.7644644370515365e-06, + "loss": 0.0636, + "num_input_tokens_seen": 1754888, + "step": 1215 + }, + { + "epoch": 4.341637010676156, + "grad_norm": 1.068237543106079, + "learning_rate": 2.624177068970124e-06, + "loss": 0.0083, + "num_input_tokens_seen": 1762632, + "step": 1220 + }, + { + "epoch": 4.359430604982206, + "grad_norm": 15.559476852416992, + "learning_rate": 2.4873452656264313e-06, + "loss": 0.0331, + "num_input_tokens_seen": 1769928, + "step": 1225 + }, + { + "epoch": 4.377224199288256, + "grad_norm": 0.16921275854110718, + "learning_rate": 2.3539901583619185e-06, + "loss": 0.0824, + "num_input_tokens_seen": 1777480, + "step": 1230 + }, + { + "epoch": 4.395017793594306, + "grad_norm": 0.30731886625289917, + "learning_rate": 2.2241323416015453e-06, + "loss": 0.0384, + "num_input_tokens_seen": 1784840, + "step": 1235 + }, + { + "epoch": 4.412811387900356, + "grad_norm": 0.8764639496803284, + "learning_rate": 2.09779186967331e-06, + "loss": 0.0435, + "num_input_tokens_seen": 1792584, + "step": 1240 + }, + { + "epoch": 4.430604982206406, + "grad_norm": 10.101332664489746, + "learning_rate": 1.9749882537112296e-06, + "loss": 0.0525, + "num_input_tokens_seen": 1800968, + "step": 1245 + }, + { + "epoch": 4.448398576512456, + "grad_norm": 0.037536390125751495, + "learning_rate": 1.8557404586421413e-06, + "loss": 0.0777, + "num_input_tokens_seen": 1808456, + "step": 1250 + }, + { + "epoch": 4.4661921708185055, + "grad_norm": 14.205132484436035, + "learning_rate": 1.7400669002569232e-06, + "loss": 0.1469, + "num_input_tokens_seen": 1816136, + "step": 1255 + }, + { + "epoch": 4.483985765124555, + "grad_norm": 16.095531463623047, + "learning_rate": 1.6279854423664697e-06, + "loss": 0.0696, + "num_input_tokens_seen": 1824136, + "step": 1260 + }, + { + "epoch": 4.501779359430605, + "grad_norm": 0.2532411217689514, + "learning_rate": 1.5195133940429345e-06, + "loss": 0.0084, + "num_input_tokens_seen": 1831304, + "step": 1265 + }, + { + "epoch": 4.519572953736655, + "grad_norm": 5.228630065917969, + "learning_rate": 1.4146675069466403e-06, + "loss": 0.0259, + "num_input_tokens_seen": 1837512, + "step": 1270 + }, + { + "epoch": 4.537366548042705, + "grad_norm": 1.4012762308120728, + "learning_rate": 1.313463972739068e-06, + "loss": 0.0319, + "num_input_tokens_seen": 1844296, + "step": 1275 + }, + { + "epoch": 4.548042704626335, + "eval_loss": 0.3532261848449707, + "eval_runtime": 0.6553, + "eval_samples_per_second": 379.991, + "eval_steps_per_second": 48.834, + "num_input_tokens_seen": 1849416, + "step": 1278 + }, + { + "epoch": 4.555160142348754, + "grad_norm": 0.7564399838447571, + "learning_rate": 1.2159184205823432e-06, + "loss": 0.0338, + "num_input_tokens_seen": 1851720, + "step": 1280 + }, + { + "epoch": 4.572953736654805, + "grad_norm": 0.5453316569328308, + "learning_rate": 1.122045914725564e-06, + "loss": 0.0457, + "num_input_tokens_seen": 1858120, + "step": 1285 + }, + { + "epoch": 4.590747330960854, + "grad_norm": 9.23385238647461, + "learning_rate": 1.0318609521783818e-06, + "loss": 0.0645, + "num_input_tokens_seen": 1865928, + "step": 1290 + }, + { + "epoch": 4.608540925266904, + "grad_norm": 6.625101566314697, + "learning_rate": 9.453774604721938e-07, + "loss": 0.0261, + "num_input_tokens_seen": 1873800, + "step": 1295 + }, + { + "epoch": 4.6263345195729535, + "grad_norm": 11.140019416809082, + "learning_rate": 8.62608795509276e-07, + "loss": 0.054, + "num_input_tokens_seen": 1881800, + "step": 1300 + }, + { + "epoch": 4.644128113879003, + "grad_norm": 1.8604605197906494, + "learning_rate": 7.835677395001795e-07, + "loss": 0.0036, + "num_input_tokens_seen": 1888648, + "step": 1305 + }, + { + "epoch": 4.661921708185053, + "grad_norm": 10.582422256469727, + "learning_rate": 7.082664989897487e-07, + "loss": 0.1115, + "num_input_tokens_seen": 1895432, + "step": 1310 + }, + { + "epoch": 4.679715302491103, + "grad_norm": 5.248901844024658, + "learning_rate": 6.367167029720234e-07, + "loss": 0.0608, + "num_input_tokens_seen": 1902408, + "step": 1315 + }, + { + "epoch": 4.697508896797153, + "grad_norm": 0.2927665412425995, + "learning_rate": 5.68929401094323e-07, + "loss": 0.0289, + "num_input_tokens_seen": 1910344, + "step": 1320 + }, + { + "epoch": 4.715302491103203, + "grad_norm": 0.10143531113862991, + "learning_rate": 5.049150619508502e-07, + "loss": 0.0309, + "num_input_tokens_seen": 1918472, + "step": 1325 + }, + { + "epoch": 4.733096085409253, + "grad_norm": 1.0533421039581299, + "learning_rate": 4.4468357146596475e-07, + "loss": 0.0078, + "num_input_tokens_seen": 1924744, + "step": 1330 + }, + { + "epoch": 4.750889679715303, + "grad_norm": 0.11136168986558914, + "learning_rate": 3.8824423136748777e-07, + "loss": 0.0676, + "num_input_tokens_seen": 1932872, + "step": 1335 + }, + { + "epoch": 4.7686832740213525, + "grad_norm": 7.673605442047119, + "learning_rate": 3.3560575775019864e-07, + "loss": 0.0673, + "num_input_tokens_seen": 1940040, + "step": 1340 + }, + { + "epoch": 4.786476868327402, + "grad_norm": 9.804219245910645, + "learning_rate": 2.8677627972978906e-07, + "loss": 0.0846, + "num_input_tokens_seen": 1948936, + "step": 1345 + }, + { + "epoch": 4.800711743772242, + "eval_loss": 0.34229812026023865, + "eval_runtime": 0.623, + "eval_samples_per_second": 399.663, + "eval_steps_per_second": 51.362, + "num_input_tokens_seen": 1954568, + "step": 1349 + }, + { + "epoch": 4.804270462633452, + "grad_norm": 0.05879069119691849, + "learning_rate": 2.417633381874534e-07, + "loss": 0.001, + "num_input_tokens_seen": 1955912, + "step": 1350 + }, + { + "epoch": 4.822064056939502, + "grad_norm": 5.924869537353516, + "learning_rate": 2.0057388460533732e-07, + "loss": 0.0243, + "num_input_tokens_seen": 1962760, + "step": 1355 + }, + { + "epoch": 4.839857651245552, + "grad_norm": 5.939824104309082, + "learning_rate": 1.6321427999298755e-07, + "loss": 0.0594, + "num_input_tokens_seen": 1969160, + "step": 1360 + }, + { + "epoch": 4.857651245551601, + "grad_norm": 1.1909672021865845, + "learning_rate": 1.2969029390501597e-07, + "loss": 0.0329, + "num_input_tokens_seen": 1975752, + "step": 1365 + }, + { + "epoch": 4.875444839857651, + "grad_norm": 2.516611099243164, + "learning_rate": 1.0000710355008159e-07, + "loss": 0.0349, + "num_input_tokens_seen": 1983240, + "step": 1370 + }, + { + "epoch": 4.893238434163701, + "grad_norm": 2.7162880897521973, + "learning_rate": 7.416929299135511e-08, + "loss": 0.004, + "num_input_tokens_seen": 1990792, + "step": 1375 + }, + { + "epoch": 4.911032028469751, + "grad_norm": 0.11341112107038498, + "learning_rate": 5.218085243859638e-08, + "loss": 0.028, + "num_input_tokens_seen": 1998728, + "step": 1380 + }, + { + "epoch": 4.9288256227758005, + "grad_norm": 12.289280891418457, + "learning_rate": 3.4045177631936155e-08, + "loss": 0.046, + "num_input_tokens_seen": 2006920, + "step": 1385 + }, + { + "epoch": 4.94661921708185, + "grad_norm": 5.766960144042969, + "learning_rate": 1.976506931745392e-08, + "loss": 0.0136, + "num_input_tokens_seen": 2013128, + "step": 1390 + }, + { + "epoch": 4.9644128113879, + "grad_norm": 0.9045501351356506, + "learning_rate": 9.3427328146517e-09, + "loss": 0.0718, + "num_input_tokens_seen": 2021704, + "step": 1395 + }, + { + "epoch": 4.98220640569395, + "grad_norm": 1.2223786115646362, + "learning_rate": 2.779777675890327e-09, + "loss": 0.1224, + "num_input_tokens_seen": 2028872, + "step": 1400 + }, + { + "epoch": 5.0, + "grad_norm": 0.01060063298791647, + "learning_rate": 7.72174378022017e-11, + "loss": 0.0499, + "num_input_tokens_seen": 2035272, + "step": 1405 + }, + { + "epoch": 5.0, + "num_input_tokens_seen": 2035272, + "step": 1405, + "total_flos": 1.1883702201974784e+16, + "train_loss": 0.17133487164477065, + "train_runtime": 699.7603, + "train_samples_per_second": 16.013, + "train_steps_per_second": 2.008 + } + ], + "logging_steps": 5, + "max_steps": 1405, + "num_input_tokens_seen": 2035272, + "num_train_epochs": 5, + "save_steps": 71, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1883702201974784e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..94191df --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eccf16f4bf6cae28454d431aeb6753fe6e61852ac86054d5e48a347a445e0d46 +size 6289 diff --git a/training_eval_loss.png b/training_eval_loss.png new file mode 100644 index 0000000..fe9e945 Binary files /dev/null and b/training_eval_loss.png differ diff --git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000..2426449 Binary files /dev/null and b/training_loss.png differ