commit cf2c3dc843147f21841c2a190ca915fc6e5c9e69 Author: ModelHub XC Date: Tue May 5 03:00:41 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: rbelanec/train_rte_42_1776331559 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..6cad1ec --- /dev/null +++ b/README.md @@ -0,0 +1,81 @@ +--- +library_name: transformers +license: llama3.2 +base_model: meta-llama/Llama-3.2-1B-Instruct +tags: +- peft-factory +- full +- llama-factory +- generated_from_trainer +model-index: +- name: train_rte_42_1776331559 + results: [] +--- + + + +# train_rte_42_1776331559 + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on the rte dataset. +It achieves the following results on the evaluation set: +- Loss: 0.1189 +- Num Input Tokens Seen: 2035272 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 5e-06 +- train_batch_size: 8 +- eval_batch_size: 8 +- seed: 42 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 5 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | Input Tokens Seen | +|:-------------:|:------:|:----:|:---------------:|:-----------------:| +| 0.2309 | 0.2527 | 71 | 0.1802 | 105024 | +| 0.1861 | 0.5053 | 142 | 0.2462 | 209536 | +| 0.0658 | 0.7580 | 213 | 0.1589 | 312576 | +| 0.0765 | 1.0107 | 284 | 0.1189 | 414040 | +| 0.1848 | 1.2633 | 355 | 0.2128 | 517656 | +| 0.0306 | 1.5160 | 426 | 0.1791 | 624344 | +| 0.1029 | 1.7687 | 497 | 0.1360 | 725656 | +| 0.1868 | 2.0214 | 568 | 0.1606 | 821416 | +| 0.0259 | 2.2740 | 639 | 0.2542 | 926760 | +| 0.029 | 2.5267 | 710 | 0.2361 | 1025320 | +| 0.0005 | 2.7794 | 781 | 0.2352 | 1128104 | +| 0.0001 | 3.0320 | 852 | 0.2580 | 1229440 | +| 0.0001 | 3.2847 | 923 | 0.2295 | 1332544 | +| 0.0001 | 3.5374 | 994 | 0.2405 | 1438336 | +| 0.0 | 3.7900 | 1065 | 0.2512 | 1539072 | +| 0.0 | 4.0427 | 1136 | 0.2552 | 1642696 | +| 0.0 | 4.2954 | 1207 | 0.2572 | 1743624 | +| 0.0 | 4.5480 | 1278 | 0.2590 | 1849416 | +| 0.0 | 4.8007 | 1349 | 0.2602 | 1954568 | + + +### Framework versions + +- Transformers 4.51.3 +- Pytorch 2.10.0+cu128 +- Datasets 4.0.0 +- Tokenizers 0.21.4 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..7f39b7d --- /dev/null +++ b/all_results.json @@ -0,0 +1,13 @@ +{ + "epoch": 5.0, + "eval_loss": 0.11889845132827759, + "eval_runtime": 0.6177, + "eval_samples_per_second": 403.126, + "eval_steps_per_second": 51.807, + "num_input_tokens_seen": 2035272, + "total_flos": 1.1883702201974784e+16, + "train_loss": 0.05568007128206763, + "train_runtime": 1085.6649, + "train_samples_per_second": 10.321, + "train_steps_per_second": 1.294 +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..5a2b93f --- /dev/null +++ b/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/eval_results.json b/eval_results.json new file mode 100644 index 0000000..082223f --- /dev/null +++ b/eval_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 5.0, + "eval_loss": 0.11889845132827759, + "eval_runtime": 0.6177, + "eval_samples_per_second": 403.126, + "eval_steps_per_second": 51.807, + "num_input_tokens_seen": 2035272 +} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..2b8ae57 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.3" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..fe88087 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f42b11bdaec0cab6d8549f9878c2966047f97164729de4a3943c13261f00c19 +size 4943274328 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..14daf45 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,26 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..ddc3ce0 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2069 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/train.yaml b/train.yaml new file mode 100644 index 0000000..d5758a1 --- /dev/null +++ b/train.yaml @@ -0,0 +1,55 @@ +seed: 42 + +### model +model_name_or_path: meta-llama/Llama-3.2-1B-Instruct +trust_remote_code: true +flash_attn: auto +use_cache: false + +### method +stage: sft +do_train: true +finetuning_type: full + +### dataset +dataset: rte +template: llama3 +cutoff_len: 2048 +overwrite_cache: true +preprocessing_num_workers: 4 +dataloader_num_workers: 4 +packing: false + +### output +output_dir: saves_bts_preliminary/base/llama-3.2-1b-instruct/train_rte_42_1776331559 +logging_steps: 5 +save_steps: 0.05 +overwrite_output_dir: true +save_only_model: false +plot_loss: true +include_num_input_tokens_seen: true +push_to_hub: true +push_to_hub_organization: rbelanec +load_best_model_at_end: true +save_total_limit: 1 + +### train +per_device_train_batch_size: 8 +learning_rate: 5.0e-6 +num_train_epochs: 5 +weight_decay: 1.0e-5 +lr_scheduler_type: cosine +bf16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null +warmup_ratio: 0.1 +optim: adamw_torch +report_to: +- wandb +run_name: base_llama-3.2-1b-instruct_train_rte_42_1776331559 + +### eval +per_device_eval_batch_size: 8 +eval_strategy: steps +eval_steps: 0.05 +val_size: 0.1 \ No newline at end of file diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..b0dd51e --- /dev/null +++ b/train_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 5.0, + "num_input_tokens_seen": 2035272, + "total_flos": 1.1883702201974784e+16, + "train_loss": 0.05568007128206763, + "train_runtime": 1085.6649, + "train_samples_per_second": 10.321, + "train_steps_per_second": 1.294 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000..a50cb60 --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,301 @@ +{"current_steps": 5, "total_steps": 1405, "loss": 0.7401, "lr": 1.4184397163120568e-07, "epoch": 0.017793594306049824, "percentage": 0.36, "elapsed_time": "0:00:00", "remaining_time": "0:03:34", "throughput": 10273.35, "total_tokens": 7872} +{"current_steps": 10, "total_steps": 1405, "loss": 0.6171, "lr": 3.1914893617021275e-07, "epoch": 0.03558718861209965, "percentage": 0.71, "elapsed_time": "0:00:01", "remaining_time": "0:02:49", "throughput": 12160.95, "total_tokens": 14784} +{"current_steps": 15, "total_steps": 1405, "loss": 0.4092, "lr": 4.964539007092199e-07, "epoch": 0.05338078291814947, "percentage": 1.07, "elapsed_time": "0:00:01", "remaining_time": "0:02:38", "throughput": 13729.69, "total_tokens": 23424} +{"current_steps": 20, "total_steps": 1405, "loss": 0.2659, "lr": 6.73758865248227e-07, "epoch": 0.0711743772241993, "percentage": 1.42, "elapsed_time": "0:00:02", "remaining_time": "0:02:28", "throughput": 13884.51, "total_tokens": 29824} +{"current_steps": 25, "total_steps": 1405, "loss": 0.2666, "lr": 8.510638297872341e-07, "epoch": 0.08896797153024912, "percentage": 1.78, "elapsed_time": "0:00:02", "remaining_time": "0:02:24", "throughput": 14476.05, "total_tokens": 37824} +{"current_steps": 30, "total_steps": 1405, "loss": 0.2573, "lr": 1.0283687943262412e-06, "epoch": 0.10676156583629894, "percentage": 2.14, "elapsed_time": "0:00:03", "remaining_time": "0:02:19", "throughput": 14604.05, "total_tokens": 44608} +{"current_steps": 35, "total_steps": 1405, "loss": 0.3322, "lr": 1.2056737588652482e-06, "epoch": 0.12455516014234876, "percentage": 2.49, "elapsed_time": "0:00:03", "remaining_time": "0:02:17", "throughput": 14841.7, "total_tokens": 51968} +{"current_steps": 40, "total_steps": 1405, "loss": 0.1441, "lr": 1.3829787234042555e-06, "epoch": 0.1423487544483986, "percentage": 2.85, "elapsed_time": "0:00:03", "remaining_time": "0:02:14", "throughput": 15033.22, "total_tokens": 59456} +{"current_steps": 45, "total_steps": 1405, "loss": 0.163, "lr": 1.5602836879432626e-06, "epoch": 0.1601423487544484, "percentage": 3.2, "elapsed_time": "0:00:04", "remaining_time": "0:02:12", "throughput": 15135.47, "total_tokens": 66496} +{"current_steps": 50, "total_steps": 1405, "loss": 0.1815, "lr": 1.7375886524822697e-06, "epoch": 0.17793594306049823, "percentage": 3.56, "elapsed_time": "0:00:04", "remaining_time": "0:02:11", "throughput": 15185.55, "total_tokens": 73408} +{"current_steps": 55, "total_steps": 1405, "loss": 0.1484, "lr": 1.9148936170212767e-06, "epoch": 0.19572953736654805, "percentage": 3.91, "elapsed_time": "0:00:05", "remaining_time": "0:02:09", "throughput": 15274.84, "total_tokens": 80576} +{"current_steps": 60, "total_steps": 1405, "loss": 0.1828, "lr": 2.092198581560284e-06, "epoch": 0.21352313167259787, "percentage": 4.27, "elapsed_time": "0:00:05", "remaining_time": "0:02:08", "throughput": 15397.99, "total_tokens": 88256} +{"current_steps": 65, "total_steps": 1405, "loss": 0.1709, "lr": 2.269503546099291e-06, "epoch": 0.2313167259786477, "percentage": 4.63, "elapsed_time": "0:00:06", "remaining_time": "0:02:07", "throughput": 15537.96, "total_tokens": 96256} +{"current_steps": 70, "total_steps": 1405, "loss": 0.2309, "lr": 2.446808510638298e-06, "epoch": 0.2491103202846975, "percentage": 4.98, "elapsed_time": "0:00:06", "remaining_time": "0:02:06", "throughput": 15587.07, "total_tokens": 103424} +{"current_steps": 71, "total_steps": 1405, "eval_loss": 0.18017518520355225, "epoch": 0.2526690391459075, "percentage": 5.05, "elapsed_time": "0:00:07", "remaining_time": "0:02:17", "throughput": 14316.64, "total_tokens": 105024} +{"current_steps": 75, "total_steps": 1405, "loss": 0.1733, "lr": 2.624113475177305e-06, "epoch": 0.2669039145907473, "percentage": 5.34, "elapsed_time": "0:01:18", "remaining_time": "0:23:06", "throughput": 1413.24, "total_tokens": 110528} +{"current_steps": 80, "total_steps": 1405, "loss": 0.1469, "lr": 2.8014184397163125e-06, "epoch": 0.2846975088967972, "percentage": 5.69, "elapsed_time": "0:01:18", "remaining_time": "0:21:42", "throughput": 1493.33, "total_tokens": 117440} +{"current_steps": 85, "total_steps": 1405, "loss": 0.1417, "lr": 2.978723404255319e-06, "epoch": 0.302491103202847, "percentage": 6.05, "elapsed_time": "0:01:19", "remaining_time": "0:20:28", "throughput": 1586.53, "total_tokens": 125504} +{"current_steps": 90, "total_steps": 1405, "loss": 0.1887, "lr": 3.1560283687943267e-06, "epoch": 0.3202846975088968, "percentage": 6.41, "elapsed_time": "0:01:19", "remaining_time": "0:19:22", "throughput": 1663.97, "total_tokens": 132352} +{"current_steps": 95, "total_steps": 1405, "loss": 0.1507, "lr": 3.3333333333333333e-06, "epoch": 0.33807829181494664, "percentage": 6.76, "elapsed_time": "0:01:19", "remaining_time": "0:18:22", "throughput": 1740.55, "total_tokens": 139200} +{"current_steps": 100, "total_steps": 1405, "loss": 0.0966, "lr": 3.510638297872341e-06, "epoch": 0.35587188612099646, "percentage": 7.12, "elapsed_time": "0:01:20", "remaining_time": "0:17:29", "throughput": 1838.31, "total_tokens": 147904} +{"current_steps": 105, "total_steps": 1405, "loss": 0.2218, "lr": 3.6879432624113475e-06, "epoch": 0.3736654804270463, "percentage": 7.47, "elapsed_time": "0:01:20", "remaining_time": "0:16:41", "throughput": 1907.09, "total_tokens": 154240} +{"current_steps": 110, "total_steps": 1405, "loss": 0.1246, "lr": 3.865248226950355e-06, "epoch": 0.3914590747330961, "percentage": 7.83, "elapsed_time": "0:01:21", "remaining_time": "0:15:57", "throughput": 1985.57, "total_tokens": 161472} +{"current_steps": 115, "total_steps": 1405, "loss": 0.1689, "lr": 4.042553191489362e-06, "epoch": 0.4092526690391459, "percentage": 8.19, "elapsed_time": "0:01:21", "remaining_time": "0:15:17", "throughput": 2057.26, "total_tokens": 168192} +{"current_steps": 120, "total_steps": 1405, "loss": 0.1818, "lr": 4.219858156028369e-06, "epoch": 0.42704626334519574, "percentage": 8.54, "elapsed_time": "0:01:22", "remaining_time": "0:14:40", "throughput": 2125.23, "total_tokens": 174656} +{"current_steps": 125, "total_steps": 1405, "loss": 0.1189, "lr": 4.397163120567377e-06, "epoch": 0.44483985765124556, "percentage": 8.9, "elapsed_time": "0:01:22", "remaining_time": "0:14:05", "throughput": 2198.52, "total_tokens": 181632} +{"current_steps": 130, "total_steps": 1405, "loss": 0.0973, "lr": 4.574468085106383e-06, "epoch": 0.4626334519572954, "percentage": 9.25, "elapsed_time": "0:01:23", "remaining_time": "0:13:35", "throughput": 2303.5, "total_tokens": 191488} +{"current_steps": 135, "total_steps": 1405, "loss": 0.1978, "lr": 4.751773049645391e-06, "epoch": 0.4804270462633452, "percentage": 9.61, "elapsed_time": "0:01:23", "remaining_time": "0:13:06", "throughput": 2379.18, "total_tokens": 198848} +{"current_steps": 140, "total_steps": 1405, "loss": 0.1861, "lr": 4.929078014184397e-06, "epoch": 0.498220640569395, "percentage": 9.96, "elapsed_time": "0:01:24", "remaining_time": "0:12:39", "throughput": 2465.53, "total_tokens": 207232} +{"current_steps": 142, "total_steps": 1405, "eval_loss": 0.2461702525615692, "epoch": 0.505338078291815, "percentage": 10.11, "elapsed_time": "0:01:24", "remaining_time": "0:12:34", "throughput": 2471.44, "total_tokens": 209536} +{"current_steps": 145, "total_steps": 1405, "loss": 0.2676, "lr": 4.999930504592181e-06, "epoch": 0.5160142348754448, "percentage": 10.32, "elapsed_time": "0:02:00", "remaining_time": "0:17:27", "throughput": 1775.06, "total_tokens": 213952} +{"current_steps": 150, "total_steps": 1405, "loss": 0.1686, "lr": 4.999505824425164e-06, "epoch": 0.5338078291814946, "percentage": 10.68, "elapsed_time": "0:02:00", "remaining_time": "0:16:52", "throughput": 1829.83, "total_tokens": 221376} +{"current_steps": 155, "total_steps": 1405, "loss": 0.1074, "lr": 4.998695138156149e-06, "epoch": 0.5516014234875445, "percentage": 11.03, "elapsed_time": "0:02:01", "remaining_time": "0:16:19", "throughput": 1885.24, "total_tokens": 228928} +{"current_steps": 160, "total_steps": 1405, "loss": 0.1216, "lr": 4.997498570981822e-06, "epoch": 0.5693950177935944, "percentage": 11.39, "elapsed_time": "0:02:01", "remaining_time": "0:15:48", "throughput": 1939.28, "total_tokens": 236352} +{"current_steps": 165, "total_steps": 1405, "loss": 0.1426, "lr": 4.995916307691601e-06, "epoch": 0.5871886120996441, "percentage": 11.74, "elapsed_time": "0:02:02", "remaining_time": "0:15:19", "throughput": 1997.84, "total_tokens": 244416} +{"current_steps": 170, "total_steps": 1405, "loss": 0.175, "lr": 4.993948592639105e-06, "epoch": 0.604982206405694, "percentage": 12.1, "elapsed_time": "0:02:02", "remaining_time": "0:14:51", "throughput": 2048.09, "total_tokens": 251456} +{"current_steps": 175, "total_steps": 1405, "loss": 0.1179, "lr": 4.991595729704405e-06, "epoch": 0.6227758007117438, "percentage": 12.46, "elapsed_time": "0:02:03", "remaining_time": "0:14:26", "throughput": 2100.88, "total_tokens": 258880} +{"current_steps": 180, "total_steps": 1405, "loss": 0.1235, "lr": 4.988858082247109e-06, "epoch": 0.6405693950177936, "percentage": 12.81, "elapsed_time": "0:02:03", "remaining_time": "0:14:01", "throughput": 2144.54, "total_tokens": 265152} +{"current_steps": 185, "total_steps": 1405, "loss": 0.1596, "lr": 4.985736073050237e-06, "epoch": 0.6583629893238434, "percentage": 13.17, "elapsed_time": "0:02:04", "remaining_time": "0:13:38", "throughput": 2196.61, "total_tokens": 272576} +{"current_steps": 190, "total_steps": 1405, "loss": 0.1188, "lr": 4.982230184254934e-06, "epoch": 0.6761565836298933, "percentage": 13.52, "elapsed_time": "0:02:04", "remaining_time": "0:13:16", "throughput": 2246.44, "total_tokens": 279744} +{"current_steps": 195, "total_steps": 1405, "loss": 0.1255, "lr": 4.9783409572860105e-06, "epoch": 0.693950177935943, "percentage": 13.88, "elapsed_time": "0:02:04", "remaining_time": "0:12:55", "throughput": 2301.63, "total_tokens": 287680} +{"current_steps": 200, "total_steps": 1405, "loss": 0.0801, "lr": 4.9740689927683314e-06, "epoch": 0.7117437722419929, "percentage": 14.23, "elapsed_time": "0:02:05", "remaining_time": "0:12:35", "throughput": 2348.72, "total_tokens": 294592} +{"current_steps": 205, "total_steps": 1405, "loss": 0.0902, "lr": 4.9694149504340515e-06, "epoch": 0.7295373665480427, "percentage": 14.59, "elapsed_time": "0:02:05", "remaining_time": "0:12:16", "throughput": 2395.03, "total_tokens": 301440} +{"current_steps": 210, "total_steps": 1405, "loss": 0.0658, "lr": 4.964379549020741e-06, "epoch": 0.7473309608540926, "percentage": 14.95, "elapsed_time": "0:02:06", "remaining_time": "0:11:58", "throughput": 2441.99, "total_tokens": 308416} +{"current_steps": 213, "total_steps": 1405, "eval_loss": 0.1589389145374298, "epoch": 0.7580071174377224, "percentage": 15.16, "elapsed_time": "0:02:08", "remaining_time": "0:11:57", "throughput": 2437.63, "total_tokens": 312576} +{"current_steps": 215, "total_steps": 1405, "loss": 0.1047, "lr": 4.9589635661603845e-06, "epoch": 0.7651245551601423, "percentage": 15.3, "elapsed_time": "0:02:58", "remaining_time": "0:16:26", "throughput": 1769.32, "total_tokens": 315328} +{"current_steps": 220, "total_steps": 1405, "loss": 0.0899, "lr": 4.953167838259285e-06, "epoch": 0.7829181494661922, "percentage": 15.66, "elapsed_time": "0:02:58", "remaining_time": "0:16:02", "throughput": 1806.05, "total_tokens": 322688} +{"current_steps": 225, "total_steps": 1405, "loss": 0.1884, "lr": 4.946993260368904e-06, "epoch": 0.800711743772242, "percentage": 16.01, "elapsed_time": "0:02:59", "remaining_time": "0:15:39", "throughput": 1838.5, "total_tokens": 329280} +{"current_steps": 230, "total_steps": 1405, "loss": 0.0862, "lr": 4.9404407860476275e-06, "epoch": 0.8185053380782918, "percentage": 16.37, "elapsed_time": "0:02:59", "remaining_time": "0:15:17", "throughput": 1876.32, "total_tokens": 336896} +{"current_steps": 235, "total_steps": 1405, "loss": 0.1129, "lr": 4.933511427213511e-06, "epoch": 0.8362989323843416, "percentage": 16.73, "elapsed_time": "0:02:59", "remaining_time": "0:14:56", "throughput": 1911.89, "total_tokens": 344128} +{"current_steps": 240, "total_steps": 1405, "loss": 0.0612, "lr": 4.926206253988001e-06, "epoch": 0.8540925266903915, "percentage": 17.08, "elapsed_time": "0:03:00", "remaining_time": "0:14:35", "throughput": 1944.94, "total_tokens": 350912} +{"current_steps": 245, "total_steps": 1405, "loss": 0.1364, "lr": 4.91852639453068e-06, "epoch": 0.8718861209964412, "percentage": 17.44, "elapsed_time": "0:03:00", "remaining_time": "0:14:16", "throughput": 1979.49, "total_tokens": 358016} +{"current_steps": 250, "total_steps": 1405, "loss": 0.0751, "lr": 4.910473034865033e-06, "epoch": 0.8896797153024911, "percentage": 17.79, "elapsed_time": "0:03:01", "remaining_time": "0:13:57", "throughput": 2011.88, "total_tokens": 364736} +{"current_steps": 255, "total_steps": 1405, "loss": 0.1051, "lr": 4.902047418695293e-06, "epoch": 0.9074733096085409, "percentage": 18.15, "elapsed_time": "0:03:01", "remaining_time": "0:13:39", "throughput": 2045.11, "total_tokens": 371648} +{"current_steps": 260, "total_steps": 1405, "loss": 0.0675, "lr": 4.893250847214369e-06, "epoch": 0.9252669039145908, "percentage": 18.51, "elapsed_time": "0:03:02", "remaining_time": "0:13:22", "throughput": 2081.5, "total_tokens": 379200} +{"current_steps": 265, "total_steps": 1405, "loss": 0.1611, "lr": 4.884084678902898e-06, "epoch": 0.9430604982206405, "percentage": 18.86, "elapsed_time": "0:03:02", "remaining_time": "0:13:05", "throughput": 2120.01, "total_tokens": 387200} +{"current_steps": 270, "total_steps": 1405, "loss": 0.1232, "lr": 4.874550329319457e-06, "epoch": 0.9608540925266904, "percentage": 19.22, "elapsed_time": "0:03:03", "remaining_time": "0:12:49", "throughput": 2158.55, "total_tokens": 395264} +{"current_steps": 275, "total_steps": 1405, "loss": 0.135, "lr": 4.864649270881944e-06, "epoch": 0.9786476868327402, "percentage": 19.57, "elapsed_time": "0:03:03", "remaining_time": "0:12:34", "throughput": 2191.09, "total_tokens": 402176} +{"current_steps": 280, "total_steps": 1405, "loss": 0.0765, "lr": 4.854383032640196e-06, "epoch": 0.99644128113879, "percentage": 19.93, "elapsed_time": "0:03:04", "remaining_time": "0:12:19", "throughput": 2228.07, "total_tokens": 409984} +{"current_steps": 284, "total_steps": 1405, "eval_loss": 0.11889845132827759, "epoch": 1.01067615658363, "percentage": 20.21, "elapsed_time": "0:03:05", "remaining_time": "0:12:10", "throughput": 2237.6, "total_tokens": 414040} +{"current_steps": 285, "total_steps": 1405, "loss": 0.0754, "lr": 4.843753200039851e-06, "epoch": 1.0142348754448398, "percentage": 20.28, "elapsed_time": "0:04:34", "remaining_time": "0:17:58", "throughput": 1513.6, "total_tokens": 415256} +{"current_steps": 290, "total_steps": 1405, "loss": 0.067, "lr": 4.832761414677502e-06, "epoch": 1.0320284697508897, "percentage": 20.64, "elapsed_time": "0:04:34", "remaining_time": "0:17:36", "throughput": 1538.59, "total_tokens": 422808} +{"current_steps": 295, "total_steps": 1405, "loss": 0.0342, "lr": 4.821409374047184e-06, "epoch": 1.0498220640569396, "percentage": 21.0, "elapsed_time": "0:04:35", "remaining_time": "0:17:15", "throughput": 1562.61, "total_tokens": 430104} +{"current_steps": 300, "total_steps": 1405, "loss": 0.0683, "lr": 4.809698831278217e-06, "epoch": 1.0676156583629894, "percentage": 21.35, "elapsed_time": "0:04:35", "remaining_time": "0:16:55", "throughput": 1584.31, "total_tokens": 436760} +{"current_steps": 305, "total_steps": 1405, "loss": 0.1073, "lr": 4.797631594864475e-06, "epoch": 1.085409252669039, "percentage": 21.71, "elapsed_time": "0:04:36", "remaining_time": "0:16:35", "throughput": 1611.32, "total_tokens": 444952} +{"current_steps": 310, "total_steps": 1405, "loss": 0.1453, "lr": 4.785209528385087e-06, "epoch": 1.103202846975089, "percentage": 22.06, "elapsed_time": "0:04:36", "remaining_time": "0:16:17", "throughput": 1636.86, "total_tokens": 452760} +{"current_steps": 315, "total_steps": 1405, "loss": 0.2851, "lr": 4.7724345502166435e-06, "epoch": 1.1209964412811388, "percentage": 22.42, "elapsed_time": "0:04:37", "remaining_time": "0:15:58", "throughput": 1654.76, "total_tokens": 458392} +{"current_steps": 320, "total_steps": 1405, "loss": 0.0427, "lr": 4.759308633236934e-06, "epoch": 1.1387900355871885, "percentage": 22.78, "elapsed_time": "0:04:37", "remaining_time": "0:15:40", "throughput": 1676.42, "total_tokens": 465112} +{"current_steps": 325, "total_steps": 1405, "loss": 0.0369, "lr": 4.74583380452027e-06, "epoch": 1.1565836298932384, "percentage": 23.13, "elapsed_time": "0:04:37", "remaining_time": "0:15:23", "throughput": 1699.35, "total_tokens": 472216} +{"current_steps": 330, "total_steps": 1405, "loss": 0.1237, "lr": 4.7320121450244395e-06, "epoch": 1.1743772241992882, "percentage": 23.49, "elapsed_time": "0:04:38", "remaining_time": "0:15:06", "throughput": 1723.08, "total_tokens": 479576} +{"current_steps": 335, "total_steps": 1405, "loss": 0.0822, "lr": 4.717845789269333e-06, "epoch": 1.1921708185053381, "percentage": 23.84, "elapsed_time": "0:04:38", "remaining_time": "0:14:50", "throughput": 1745.4, "total_tokens": 486552} +{"current_steps": 340, "total_steps": 1405, "loss": 0.0835, "lr": 4.703336925007311e-06, "epoch": 1.209964412811388, "percentage": 24.2, "elapsed_time": "0:04:39", "remaining_time": "0:14:34", "throughput": 1771.37, "total_tokens": 494616} +{"current_steps": 345, "total_steps": 1405, "loss": 0.0413, "lr": 4.68848779288534e-06, "epoch": 1.2277580071174377, "percentage": 24.56, "elapsed_time": "0:04:39", "remaining_time": "0:14:19", "throughput": 1792.87, "total_tokens": 501400} +{"current_steps": 350, "total_steps": 1405, "loss": 0.0284, "lr": 4.673300686098957e-06, "epoch": 1.2455516014234875, "percentage": 24.91, "elapsed_time": "0:04:40", "remaining_time": "0:14:04", "throughput": 1816.7, "total_tokens": 508888} +{"current_steps": 355, "total_steps": 1405, "loss": 0.1848, "lr": 4.657777950038133e-06, "epoch": 1.2633451957295374, "percentage": 25.27, "elapsed_time": "0:04:40", "remaining_time": "0:13:49", "throughput": 1844.83, "total_tokens": 517656} +{"current_steps": 355, "total_steps": 1405, "eval_loss": 0.21280953288078308, "epoch": 1.2633451957295374, "percentage": 25.27, "elapsed_time": "0:04:41", "remaining_time": "0:13:51", "throughput": 1840.79, "total_tokens": 517656} +{"current_steps": 360, "total_steps": 1405, "loss": 0.0684, "lr": 4.641921981925064e-06, "epoch": 1.281138790035587, "percentage": 25.62, "elapsed_time": "0:05:17", "remaining_time": "0:15:21", "throughput": 1658.11, "total_tokens": 526232} +{"current_steps": 365, "total_steps": 1405, "loss": 0.0556, "lr": 4.625735230443959e-06, "epoch": 1.298932384341637, "percentage": 25.98, "elapsed_time": "0:05:17", "remaining_time": "0:15:05", "throughput": 1678.36, "total_tokens": 533400} +{"current_steps": 370, "total_steps": 1405, "loss": 0.1059, "lr": 4.609220195362886e-06, "epoch": 1.3167259786476868, "percentage": 26.33, "elapsed_time": "0:05:18", "remaining_time": "0:14:50", "throughput": 1703.34, "total_tokens": 542168} +{"current_steps": 375, "total_steps": 1405, "loss": 0.0525, "lr": 4.592379427147722e-06, "epoch": 1.3345195729537367, "percentage": 26.69, "elapsed_time": "0:05:18", "remaining_time": "0:14:35", "throughput": 1725.39, "total_tokens": 549976} +{"current_steps": 380, "total_steps": 1405, "loss": 0.118, "lr": 4.575215526568278e-06, "epoch": 1.3523131672597866, "percentage": 27.05, "elapsed_time": "0:05:19", "remaining_time": "0:14:20", "throughput": 1745.07, "total_tokens": 557016} +{"current_steps": 385, "total_steps": 1405, "loss": 0.0831, "lr": 4.557731144296659e-06, "epoch": 1.3701067615658362, "percentage": 27.4, "elapsed_time": "0:05:19", "remaining_time": "0:14:06", "throughput": 1766.02, "total_tokens": 564504} +{"current_steps": 390, "total_steps": 1405, "loss": 0.0283, "lr": 4.539928980497903e-06, "epoch": 1.387900355871886, "percentage": 27.76, "elapsed_time": "0:05:20", "remaining_time": "0:13:53", "throughput": 1786.54, "total_tokens": 571864} +{"current_steps": 395, "total_steps": 1405, "loss": 0.0483, "lr": 4.521811784412996e-06, "epoch": 1.405693950177936, "percentage": 28.11, "elapsed_time": "0:05:20", "remaining_time": "0:13:39", "throughput": 1804.73, "total_tokens": 578456} +{"current_steps": 400, "total_steps": 1405, "loss": 0.1155, "lr": 4.503382353934295e-06, "epoch": 1.4234875444839858, "percentage": 28.47, "elapsed_time": "0:05:20", "remaining_time": "0:13:26", "throughput": 1821.54, "total_tokens": 584600} +{"current_steps": 405, "total_steps": 1405, "loss": 0.0086, "lr": 4.484643535173438e-06, "epoch": 1.4412811387900355, "percentage": 28.83, "elapsed_time": "0:05:21", "remaining_time": "0:13:13", "throughput": 1839.46, "total_tokens": 591128} +{"current_steps": 410, "total_steps": 1405, "loss": 0.0969, "lr": 4.465598222021818e-06, "epoch": 1.4590747330960854, "percentage": 29.18, "elapsed_time": "0:05:21", "remaining_time": "0:13:00", "throughput": 1859.96, "total_tokens": 598552} +{"current_steps": 415, "total_steps": 1405, "loss": 0.0715, "lr": 4.446249355703661e-06, "epoch": 1.4768683274021353, "percentage": 29.54, "elapsed_time": "0:05:22", "remaining_time": "0:12:48", "throughput": 1884.37, "total_tokens": 607320} +{"current_steps": 420, "total_steps": 1405, "loss": 0.094, "lr": 4.426599924321815e-06, "epoch": 1.4946619217081851, "percentage": 29.89, "elapsed_time": "0:05:22", "remaining_time": "0:12:36", "throughput": 1904.77, "total_tokens": 614744} +{"current_steps": 425, "total_steps": 1405, "loss": 0.0306, "lr": 4.406652962396278e-06, "epoch": 1.512455516014235, "percentage": 30.25, "elapsed_time": "0:05:23", "remaining_time": "0:12:25", "throughput": 1926.98, "total_tokens": 622808} +{"current_steps": 426, "total_steps": 1405, "eval_loss": 0.17913997173309326, "epoch": 1.5160142348754448, "percentage": 30.32, "elapsed_time": "0:05:23", "remaining_time": "0:12:24", "throughput": 1927.68, "total_tokens": 624344} +{"current_steps": 430, "total_steps": 1405, "loss": 0.103, "lr": 4.386411550395576e-06, "epoch": 1.5302491103202847, "percentage": 30.6, "elapsed_time": "0:05:57", "remaining_time": "0:13:30", "throughput": 1764.39, "total_tokens": 630488} +{"current_steps": 435, "total_steps": 1405, "loss": 0.0775, "lr": 4.365878814261032e-06, "epoch": 1.5480427046263345, "percentage": 30.96, "elapsed_time": "0:05:57", "remaining_time": "0:13:17", "throughput": 1784.31, "total_tokens": 638424} +{"current_steps": 440, "total_steps": 1405, "loss": 0.0767, "lr": 4.34505792492402e-06, "epoch": 1.5658362989323842, "percentage": 31.32, "elapsed_time": "0:05:58", "remaining_time": "0:13:05", "throughput": 1801.12, "total_tokens": 645208} +{"current_steps": 445, "total_steps": 1405, "loss": 0.013, "lr": 4.3239520978162685e-06, "epoch": 1.583629893238434, "percentage": 31.67, "elapsed_time": "0:05:58", "remaining_time": "0:12:53", "throughput": 1820.61, "total_tokens": 653016} +{"current_steps": 450, "total_steps": 1405, "loss": 0.0129, "lr": 4.302564592373293e-06, "epoch": 1.601423487544484, "percentage": 32.03, "elapsed_time": "0:05:59", "remaining_time": "0:12:42", "throughput": 1837.82, "total_tokens": 659992} +{"current_steps": 455, "total_steps": 1405, "loss": 0.1234, "lr": 4.280898711531026e-06, "epoch": 1.6192170818505338, "percentage": 32.38, "elapsed_time": "0:05:59", "remaining_time": "0:12:30", "throughput": 1855.64, "total_tokens": 667224} +{"current_steps": 460, "total_steps": 1405, "loss": 0.1334, "lr": 4.258957801215743e-06, "epoch": 1.6370106761565837, "percentage": 32.74, "elapsed_time": "0:06:00", "remaining_time": "0:12:19", "throughput": 1875.29, "total_tokens": 675160} +{"current_steps": 465, "total_steps": 1405, "loss": 0.1297, "lr": 4.236745249827336e-06, "epoch": 1.6548042704626336, "percentage": 33.1, "elapsed_time": "0:06:00", "remaining_time": "0:12:08", "throughput": 1896.07, "total_tokens": 683544} +{"current_steps": 470, "total_steps": 1405, "loss": 0.0334, "lr": 4.2142644877160334e-06, "epoch": 1.6725978647686834, "percentage": 33.45, "elapsed_time": "0:06:00", "remaining_time": "0:11:57", "throughput": 1910.07, "total_tokens": 689368} +{"current_steps": 475, "total_steps": 1405, "loss": 0.0779, "lr": 4.191518986652642e-06, "epoch": 1.690391459074733, "percentage": 33.81, "elapsed_time": "0:06:01", "remaining_time": "0:11:47", "throughput": 1925.73, "total_tokens": 695832} +{"current_steps": 480, "total_steps": 1405, "loss": 0.0085, "lr": 4.168512259292391e-06, "epoch": 1.708185053380783, "percentage": 34.16, "elapsed_time": "0:06:01", "remaining_time": "0:11:37", "throughput": 1943.52, "total_tokens": 703128} +{"current_steps": 485, "total_steps": 1405, "loss": 0.0617, "lr": 4.14524785863246e-06, "epoch": 1.7259786476868326, "percentage": 34.52, "elapsed_time": "0:06:02", "remaining_time": "0:11:27", "throughput": 1958.92, "total_tokens": 709528} +{"current_steps": 490, "total_steps": 1405, "loss": 0.0537, "lr": 4.121729377463285e-06, "epoch": 1.7437722419928825, "percentage": 34.88, "elapsed_time": "0:06:02", "remaining_time": "0:11:17", "throughput": 1975.31, "total_tokens": 716312} +{"current_steps": 495, "total_steps": 1405, "loss": 0.1029, "lr": 4.0979604478137045e-06, "epoch": 1.7615658362989324, "percentage": 35.23, "elapsed_time": "0:06:03", "remaining_time": "0:11:07", "throughput": 1990.84, "total_tokens": 722776} +{"current_steps": 497, "total_steps": 1405, "eval_loss": 0.13597266376018524, "epoch": 1.7686832740213523, "percentage": 35.37, "elapsed_time": "0:06:03", "remaining_time": "0:11:04", "throughput": 1994.61, "total_tokens": 725656} +{"current_steps": 500, "total_steps": 1405, "loss": 0.1142, "lr": 4.0739447403900605e-06, "epoch": 1.7793594306049823, "percentage": 35.59, "elapsed_time": "0:06:38", "remaining_time": "0:12:01", "throughput": 1832.08, "total_tokens": 729944} +{"current_steps": 505, "total_steps": 1405, "loss": 0.0837, "lr": 4.0496859640093215e-06, "epoch": 1.7971530249110321, "percentage": 35.94, "elapsed_time": "0:06:38", "remaining_time": "0:11:50", "throughput": 1848.01, "total_tokens": 737112} +{"current_steps": 510, "total_steps": 1405, "loss": 0.0124, "lr": 4.025187865026311e-06, "epoch": 1.814946619217082, "percentage": 36.3, "elapsed_time": "0:06:39", "remaining_time": "0:11:40", "throughput": 1864.23, "total_tokens": 744408} +{"current_steps": 515, "total_steps": 1405, "loss": 0.059, "lr": 4.0004542267551585e-06, "epoch": 1.8327402135231317, "percentage": 36.65, "elapsed_time": "0:06:39", "remaining_time": "0:11:30", "throughput": 1877.49, "total_tokens": 750488} +{"current_steps": 520, "total_steps": 1405, "loss": 0.0662, "lr": 3.975488868885022e-06, "epoch": 1.8505338078291815, "percentage": 37.01, "elapsed_time": "0:06:40", "remaining_time": "0:11:21", "throughput": 1893.01, "total_tokens": 757528} +{"current_steps": 525, "total_steps": 1405, "loss": 0.0299, "lr": 3.950295646890202e-06, "epoch": 1.8683274021352312, "percentage": 37.37, "elapsed_time": "0:06:40", "remaining_time": "0:11:11", "throughput": 1906.54, "total_tokens": 763736} +{"current_steps": 530, "total_steps": 1405, "loss": 0.0666, "lr": 3.924878451434736e-06, "epoch": 1.886120996441281, "percentage": 37.72, "elapsed_time": "0:06:41", "remaining_time": "0:11:02", "throughput": 1924.6, "total_tokens": 771864} +{"current_steps": 535, "total_steps": 1405, "loss": 0.072, "lr": 3.899241207771546e-06, "epoch": 1.903914590747331, "percentage": 38.08, "elapsed_time": "0:06:41", "remaining_time": "0:10:52", "throughput": 1939.57, "total_tokens": 778712} +{"current_steps": 540, "total_steps": 1405, "loss": 0.0475, "lr": 3.873387875136252e-06, "epoch": 1.9217081850533808, "percentage": 38.43, "elapsed_time": "0:06:41", "remaining_time": "0:10:43", "throughput": 1951.5, "total_tokens": 784280} +{"current_steps": 545, "total_steps": 1405, "loss": 0.1443, "lr": 3.847322446135736e-06, "epoch": 1.9395017793594307, "percentage": 38.79, "elapsed_time": "0:06:42", "remaining_time": "0:10:34", "throughput": 1969.14, "total_tokens": 792280} +{"current_steps": 550, "total_steps": 1405, "loss": 0.1501, "lr": 3.821048946131549e-06, "epoch": 1.9572953736654806, "percentage": 39.15, "elapsed_time": "0:06:42", "remaining_time": "0:10:26", "throughput": 1982.52, "total_tokens": 798488} +{"current_steps": 555, "total_steps": 1405, "loss": 0.0502, "lr": 3.794571432618267e-06, "epoch": 1.9750889679715302, "percentage": 39.5, "elapsed_time": "0:06:43", "remaining_time": "0:10:17", "throughput": 1999.18, "total_tokens": 806104} +{"current_steps": 560, "total_steps": 1405, "loss": 0.0142, "lr": 3.767893994596876e-06, "epoch": 1.99288256227758, "percentage": 39.86, "elapsed_time": "0:06:43", "remaining_time": "0:10:09", "throughput": 2014.91, "total_tokens": 813336} +{"current_steps": 565, "total_steps": 1405, "loss": 0.1868, "lr": 3.7410207519432972e-06, "epoch": 2.0106761565836297, "percentage": 40.21, "elapsed_time": "0:06:44", "remaining_time": "0:10:00", "throughput": 2022.87, "total_tokens": 817576} +{"current_steps": 568, "total_steps": 1405, "eval_loss": 0.16060441732406616, "epoch": 2.02135231316726, "percentage": 40.43, "elapsed_time": "0:06:45", "remaining_time": "0:09:56", "throughput": 2028.08, "total_tokens": 821416} +{"current_steps": 570, "total_steps": 1405, "loss": 0.0138, "lr": 3.713955854772144e-06, "epoch": 2.0284697508896796, "percentage": 40.57, "elapsed_time": "0:07:32", "remaining_time": "0:11:02", "throughput": 1821.36, "total_tokens": 823848} +{"current_steps": 575, "total_steps": 1405, "loss": 0.1069, "lr": 3.686703482795802e-06, "epoch": 2.0462633451957295, "percentage": 40.93, "elapsed_time": "0:07:32", "remaining_time": "0:10:53", "throughput": 1837.99, "total_tokens": 832232} +{"current_steps": 580, "total_steps": 1405, "loss": 0.042, "lr": 3.6592678446789516e-06, "epoch": 2.0640569395017794, "percentage": 41.28, "elapsed_time": "0:07:33", "remaining_time": "0:10:44", "throughput": 1854.17, "total_tokens": 840424} +{"current_steps": 585, "total_steps": 1405, "loss": 0.0325, "lr": 3.631653177388605e-06, "epoch": 2.0818505338078293, "percentage": 41.64, "elapsed_time": "0:07:33", "remaining_time": "0:10:35", "throughput": 1866.56, "total_tokens": 846824} +{"current_steps": 590, "total_steps": 1405, "loss": 0.003, "lr": 3.6038637455397802e-06, "epoch": 2.099644128113879, "percentage": 41.99, "elapsed_time": "0:07:34", "remaining_time": "0:10:27", "throughput": 1879.75, "total_tokens": 853608} +{"current_steps": 595, "total_steps": 1405, "loss": 0.1002, "lr": 3.575903840736906e-06, "epoch": 2.117437722419929, "percentage": 42.35, "elapsed_time": "0:07:34", "remaining_time": "0:10:18", "throughput": 1894.1, "total_tokens": 860968} +{"current_steps": 600, "total_steps": 1405, "loss": 0.0251, "lr": 3.547777780911055e-06, "epoch": 2.135231316725979, "percentage": 42.7, "elapsed_time": "0:07:35", "remaining_time": "0:10:10", "throughput": 1909.63, "total_tokens": 868904} +{"current_steps": 605, "total_steps": 1405, "loss": 0.0005, "lr": 3.519489909653113e-06, "epoch": 2.1530249110320283, "percentage": 43.06, "elapsed_time": "0:07:35", "remaining_time": "0:10:02", "throughput": 1923.52, "total_tokens": 876072} +{"current_steps": 610, "total_steps": 1405, "loss": 0.0155, "lr": 3.4910445955429856e-06, "epoch": 2.170818505338078, "percentage": 43.42, "elapsed_time": "0:07:35", "remaining_time": "0:09:54", "throughput": 1938.44, "total_tokens": 883752} +{"current_steps": 615, "total_steps": 1405, "loss": 0.0004, "lr": 3.4624462314749447e-06, "epoch": 2.188612099644128, "percentage": 43.77, "elapsed_time": "0:07:36", "remaining_time": "0:09:46", "throughput": 1953.08, "total_tokens": 891304} +{"current_steps": 620, "total_steps": 1405, "loss": 0.0163, "lr": 3.433699233979222e-06, "epoch": 2.206405693950178, "percentage": 44.13, "elapsed_time": "0:07:36", "remaining_time": "0:09:38", "throughput": 1968.35, "total_tokens": 899176} +{"current_steps": 625, "total_steps": 1405, "loss": 0.0001, "lr": 3.4048080425399506e-06, "epoch": 2.224199288256228, "percentage": 44.48, "elapsed_time": "0:07:37", "remaining_time": "0:09:30", "throughput": 1984.65, "total_tokens": 907560} +{"current_steps": 630, "total_steps": 1405, "loss": 0.0027, "lr": 3.375777118909561e-06, "epoch": 2.2419928825622777, "percentage": 44.84, "elapsed_time": "0:07:37", "remaining_time": "0:09:23", "throughput": 1999.46, "total_tokens": 915240} +{"current_steps": 635, "total_steps": 1405, "loss": 0.0259, "lr": 3.346610946419743e-06, "epoch": 2.2597864768683276, "percentage": 45.2, "elapsed_time": "0:07:38", "remaining_time": "0:09:15", "throughput": 2011.06, "total_tokens": 921384} +{"current_steps": 639, "total_steps": 1405, "eval_loss": 0.25424328446388245, "epoch": 2.2740213523131674, "percentage": 45.48, "elapsed_time": "0:07:39", "remaining_time": "0:09:10", "throughput": 2018.66, "total_tokens": 926760} +{"current_steps": 640, "total_steps": 1405, "loss": 0.0233, "lr": 3.3173140292890673e-06, "epoch": 2.277580071174377, "percentage": 45.55, "elapsed_time": "0:08:47", "remaining_time": "0:10:30", "throughput": 1758.9, "total_tokens": 927528} +{"current_steps": 645, "total_steps": 1405, "loss": 0.0266, "lr": 3.2878908919273867e-06, "epoch": 2.295373665480427, "percentage": 45.91, "elapsed_time": "0:08:47", "remaining_time": "0:10:21", "throughput": 1770.76, "total_tokens": 934568} +{"current_steps": 650, "total_steps": 1405, "loss": 0.0006, "lr": 3.2583460782371217e-06, "epoch": 2.3131672597864767, "percentage": 46.26, "elapsed_time": "0:08:48", "remaining_time": "0:10:13", "throughput": 1783.78, "total_tokens": 942248} +{"current_steps": 655, "total_steps": 1405, "loss": 0.0003, "lr": 3.228684150911527e-06, "epoch": 2.3309608540925266, "percentage": 46.62, "elapsed_time": "0:08:48", "remaining_time": "0:10:05", "throughput": 1795.27, "total_tokens": 949096} +{"current_steps": 660, "total_steps": 1405, "loss": 0.0082, "lr": 3.1989096907300634e-06, "epoch": 2.3487544483985765, "percentage": 46.98, "elapsed_time": "0:08:49", "remaining_time": "0:09:57", "throughput": 1806.39, "total_tokens": 955752} +{"current_steps": 665, "total_steps": 1405, "loss": 0.0486, "lr": 3.1690272958509772e-06, "epoch": 2.3665480427046264, "percentage": 47.33, "elapsed_time": "0:08:49", "remaining_time": "0:09:49", "throughput": 1818.88, "total_tokens": 963176} +{"current_steps": 670, "total_steps": 1405, "loss": 0.0001, "lr": 3.139041581101187e-06, "epoch": 2.3843416370106763, "percentage": 47.69, "elapsed_time": "0:08:49", "remaining_time": "0:09:41", "throughput": 1827.08, "total_tokens": 968232} +{"current_steps": 675, "total_steps": 1405, "loss": 0.0222, "lr": 3.108957177263608e-06, "epoch": 2.402135231316726, "percentage": 48.04, "elapsed_time": "0:08:50", "remaining_time": "0:09:33", "throughput": 1841.14, "total_tokens": 976552} +{"current_steps": 680, "total_steps": 1405, "loss": 0.0075, "lr": 3.078778730362003e-06, "epoch": 2.419928825622776, "percentage": 48.4, "elapsed_time": "0:08:50", "remaining_time": "0:09:25", "throughput": 1853.1, "total_tokens": 983720} +{"current_steps": 685, "total_steps": 1405, "loss": 0.0003, "lr": 3.0485109009434844e-06, "epoch": 2.4377224199288254, "percentage": 48.75, "elapsed_time": "0:08:51", "remaining_time": "0:09:18", "throughput": 1867.0, "total_tokens": 991976} +{"current_steps": 690, "total_steps": 1405, "loss": 0.061, "lr": 3.018158363358773e-06, "epoch": 2.4555160142348753, "percentage": 49.11, "elapsed_time": "0:08:51", "remaining_time": "0:09:10", "throughput": 1877.23, "total_tokens": 998184} +{"current_steps": 695, "total_steps": 1405, "loss": 0.0, "lr": 2.9877258050403214e-06, "epoch": 2.473309608540925, "percentage": 49.47, "elapsed_time": "0:08:52", "remaining_time": "0:09:03", "throughput": 1889.72, "total_tokens": 1005672} +{"current_steps": 700, "total_steps": 1405, "loss": 0.0152, "lr": 2.9572179257784215e-06, "epoch": 2.491103202846975, "percentage": 49.82, "elapsed_time": "0:08:52", "remaining_time": "0:08:56", "throughput": 1902.07, "total_tokens": 1013096} +{"current_steps": 705, "total_steps": 1405, "loss": 0.0006, "lr": 2.9266394369954056e-06, "epoch": 2.508896797153025, "percentage": 50.18, "elapsed_time": "0:08:53", "remaining_time": "0:08:49", "throughput": 1912.23, "total_tokens": 1019304} +{"current_steps": 710, "total_steps": 1405, "loss": 0.029, "lr": 2.8959950610180376e-06, "epoch": 2.526690391459075, "percentage": 50.53, "elapsed_time": "0:08:53", "remaining_time": "0:08:42", "throughput": 1922.02, "total_tokens": 1025320} +{"current_steps": 710, "total_steps": 1405, "eval_loss": 0.23608553409576416, "epoch": 2.526690391459075, "percentage": 50.53, "elapsed_time": "0:08:54", "remaining_time": "0:08:42", "throughput": 1919.77, "total_tokens": 1025320} +{"current_steps": 715, "total_steps": 1405, "loss": 0.0, "lr": 2.865289530348243e-06, "epoch": 2.5444839857651247, "percentage": 50.89, "elapsed_time": "0:09:28", "remaining_time": "0:09:08", "throughput": 1816.38, "total_tokens": 1032552} +{"current_steps": 720, "total_steps": 1405, "loss": 0.0, "lr": 2.8345275869322432e-06, "epoch": 2.562277580071174, "percentage": 51.25, "elapsed_time": "0:09:28", "remaining_time": "0:09:01", "throughput": 1827.88, "total_tokens": 1039912} +{"current_steps": 725, "total_steps": 1405, "loss": 0.0092, "lr": 2.8037139814282494e-06, "epoch": 2.580071174377224, "percentage": 51.6, "elapsed_time": "0:09:29", "remaining_time": "0:08:54", "throughput": 1839.26, "total_tokens": 1047208} +{"current_steps": 730, "total_steps": 1405, "loss": 0.0001, "lr": 2.7728534724728027e-06, "epoch": 2.597864768683274, "percentage": 51.96, "elapsed_time": "0:09:29", "remaining_time": "0:08:46", "throughput": 1849.66, "total_tokens": 1053928} +{"current_steps": 735, "total_steps": 1405, "loss": 0.0806, "lr": 2.741950825945881e-06, "epoch": 2.6156583629893237, "percentage": 52.31, "elapsed_time": "0:09:30", "remaining_time": "0:08:39", "throughput": 1861.64, "total_tokens": 1061608} +{"current_steps": 740, "total_steps": 1405, "loss": 0.0747, "lr": 2.7110108142348962e-06, "epoch": 2.6334519572953736, "percentage": 52.67, "elapsed_time": "0:09:30", "remaining_time": "0:08:32", "throughput": 1870.73, "total_tokens": 1067560} +{"current_steps": 745, "total_steps": 1405, "loss": 0.0001, "lr": 2.6800382154976734e-06, "epoch": 2.6512455516014235, "percentage": 53.02, "elapsed_time": "0:09:31", "remaining_time": "0:08:25", "throughput": 1880.86, "total_tokens": 1074152} +{"current_steps": 750, "total_steps": 1405, "loss": 0.0005, "lr": 2.64903781292455e-06, "epoch": 2.6690391459074734, "percentage": 53.38, "elapsed_time": "0:09:31", "remaining_time": "0:08:19", "throughput": 1894.52, "total_tokens": 1082856} +{"current_steps": 755, "total_steps": 1405, "loss": 0.0003, "lr": 2.6180143939996926e-06, "epoch": 2.6868327402135233, "percentage": 53.74, "elapsed_time": "0:09:32", "remaining_time": "0:08:12", "throughput": 1904.74, "total_tokens": 1089512} +{"current_steps": 760, "total_steps": 1405, "loss": 0.0433, "lr": 2.5869727497617495e-06, "epoch": 2.704626334519573, "percentage": 54.09, "elapsed_time": "0:09:32", "remaining_time": "0:08:05", "throughput": 1915.04, "total_tokens": 1096232} +{"current_steps": 765, "total_steps": 1405, "loss": 0.0004, "lr": 2.55591767406396e-06, "epoch": 2.722419928825623, "percentage": 54.45, "elapsed_time": "0:09:32", "remaining_time": "0:07:59", "throughput": 1927.34, "total_tokens": 1104168} +{"current_steps": 770, "total_steps": 1405, "loss": 0.1194, "lr": 2.524853962833825e-06, "epoch": 2.7402135231316724, "percentage": 54.8, "elapsed_time": "0:09:33", "remaining_time": "0:07:52", "throughput": 1939.85, "total_tokens": 1112232} +{"current_steps": 775, "total_steps": 1405, "loss": 0.0031, "lr": 2.4937864133324514e-06, "epoch": 2.7580071174377223, "percentage": 55.16, "elapsed_time": "0:09:33", "remaining_time": "0:07:46", "throughput": 1950.21, "total_tokens": 1119016} +{"current_steps": 780, "total_steps": 1405, "loss": 0.0005, "lr": 2.462719823413707e-06, "epoch": 2.775800711743772, "percentage": 55.52, "elapsed_time": "0:09:34", "remaining_time": "0:07:40", "throughput": 1962.04, "total_tokens": 1126696} +{"current_steps": 781, "total_steps": 1405, "eval_loss": 0.23524385690689087, "epoch": 2.7793594306049823, "percentage": 55.59, "elapsed_time": "0:09:35", "remaining_time": "0:07:39", "throughput": 1960.39, "total_tokens": 1128104} +{"current_steps": 785, "total_steps": 1405, "loss": 0.0423, "lr": 2.4316589907832654e-06, "epoch": 2.793594306049822, "percentage": 55.87, "elapsed_time": "0:10:24", "remaining_time": "0:08:13", "throughput": 1816.76, "total_tokens": 1134184} +{"current_steps": 790, "total_steps": 1405, "loss": 0.001, "lr": 2.4006087122576867e-06, "epoch": 2.811387900355872, "percentage": 56.23, "elapsed_time": "0:10:24", "remaining_time": "0:08:06", "throughput": 1825.49, "total_tokens": 1140392} +{"current_steps": 795, "total_steps": 1405, "loss": 0.031, "lr": 2.3695737830236263e-06, "epoch": 2.829181494661922, "percentage": 56.58, "elapsed_time": "0:10:25", "remaining_time": "0:07:59", "throughput": 1836.84, "total_tokens": 1148328} +{"current_steps": 800, "total_steps": 1405, "loss": 0.0007, "lr": 2.3385589958973073e-06, "epoch": 2.8469750889679717, "percentage": 56.94, "elapsed_time": "0:10:25", "remaining_time": "0:07:53", "throughput": 1844.75, "total_tokens": 1154024} +{"current_steps": 805, "total_steps": 1405, "loss": 0.0003, "lr": 2.3075691405843435e-06, "epoch": 2.864768683274021, "percentage": 57.3, "elapsed_time": "0:10:26", "remaining_time": "0:07:46", "throughput": 1854.32, "total_tokens": 1160808} +{"current_steps": 810, "total_steps": 1405, "loss": 0.0299, "lr": 2.2766090029400573e-06, "epoch": 2.882562277580071, "percentage": 57.65, "elapsed_time": "0:10:26", "remaining_time": "0:07:40", "throughput": 1864.36, "total_tokens": 1167912} +{"current_steps": 815, "total_steps": 1405, "loss": 0.0001, "lr": 2.2456833642303825e-06, "epoch": 2.900355871886121, "percentage": 58.01, "elapsed_time": "0:10:26", "remaining_time": "0:07:33", "throughput": 1873.7, "total_tokens": 1174568} +{"current_steps": 820, "total_steps": 1405, "loss": 0.0001, "lr": 2.214797000393479e-06, "epoch": 2.9181494661921707, "percentage": 58.36, "elapsed_time": "0:10:27", "remaining_time": "0:07:27", "throughput": 1883.4, "total_tokens": 1181480} +{"current_steps": 825, "total_steps": 1405, "loss": 0.0251, "lr": 2.183954681302173e-06, "epoch": 2.9359430604982206, "percentage": 58.72, "elapsed_time": "0:10:27", "remaining_time": "0:07:21", "throughput": 1895.44, "total_tokens": 1189928} +{"current_steps": 830, "total_steps": 1405, "loss": 0.0001, "lr": 2.15316117002733e-06, "epoch": 2.9537366548042705, "percentage": 59.07, "elapsed_time": "0:10:28", "remaining_time": "0:07:15", "throughput": 1906.1, "total_tokens": 1197480} +{"current_steps": 835, "total_steps": 1405, "loss": 0.039, "lr": 2.122421222102278e-06, "epoch": 2.9715302491103204, "percentage": 59.43, "elapsed_time": "0:10:28", "remaining_time": "0:07:09", "throughput": 1916.05, "total_tokens": 1204584} +{"current_steps": 840, "total_steps": 1405, "loss": 0.0311, "lr": 2.0917395847884e-06, "epoch": 2.9893238434163703, "percentage": 59.79, "elapsed_time": "0:10:29", "remaining_time": "0:07:03", "throughput": 1927.35, "total_tokens": 1212584} +{"current_steps": 845, "total_steps": 1405, "loss": 0.0104, "lr": 2.061120996341996e-06, "epoch": 3.00711743772242, "percentage": 60.14, "elapsed_time": "0:10:29", "remaining_time": "0:06:57", "throughput": 1934.0, "total_tokens": 1217856} +{"current_steps": 850, "total_steps": 1405, "loss": 0.0001, "lr": 2.030570185282544e-06, "epoch": 3.0249110320284696, "percentage": 60.5, "elapsed_time": "0:10:30", "remaining_time": "0:06:51", "throughput": 1946.42, "total_tokens": 1226624} +{"current_steps": 852, "total_steps": 1405, "eval_loss": 0.25802624225616455, "epoch": 3.0320284697508897, "percentage": 60.64, "elapsed_time": "0:10:30", "remaining_time": "0:06:49", "throughput": 1948.51, "total_tokens": 1229440} +{"current_steps": 855, "total_steps": 1405, "loss": 0.0, "lr": 2.0000918696624587e-06, "epoch": 3.0427046263345194, "percentage": 60.85, "elapsed_time": "0:11:23", "remaining_time": "0:07:19", "throughput": 1805.25, "total_tokens": 1233152} +{"current_steps": 860, "total_steps": 1405, "loss": 0.0, "lr": 1.9696907563384687e-06, "epoch": 3.0604982206405693, "percentage": 61.21, "elapsed_time": "0:11:23", "remaining_time": "0:07:13", "throughput": 1814.3, "total_tokens": 1240128} +{"current_steps": 865, "total_steps": 1405, "loss": 0.0, "lr": 1.9393715402447228e-06, "epoch": 3.078291814946619, "percentage": 61.57, "elapsed_time": "0:11:23", "remaining_time": "0:07:07", "throughput": 1824.66, "total_tokens": 1248064} +{"current_steps": 870, "total_steps": 1405, "loss": 0.0, "lr": 1.9091389036677384e-06, "epoch": 3.096085409252669, "percentage": 61.92, "elapsed_time": "0:11:24", "remaining_time": "0:07:00", "throughput": 1833.96, "total_tokens": 1255232} +{"current_steps": 875, "total_steps": 1405, "loss": 0.0486, "lr": 1.878997515523299e-06, "epoch": 3.113879003558719, "percentage": 62.28, "elapsed_time": "0:11:24", "remaining_time": "0:06:54", "throughput": 1843.07, "total_tokens": 1262272} +{"current_steps": 880, "total_steps": 1405, "loss": 0.0001, "lr": 1.8489520306354243e-06, "epoch": 3.131672597864769, "percentage": 62.63, "elapsed_time": "0:11:25", "remaining_time": "0:06:48", "throughput": 1852.6, "total_tokens": 1269632} +{"current_steps": 885, "total_steps": 1405, "loss": 0.0001, "lr": 1.8190070890175082e-06, "epoch": 3.1494661921708187, "percentage": 62.99, "elapsed_time": "0:11:25", "remaining_time": "0:06:42", "throughput": 1862.56, "total_tokens": 1277312} +{"current_steps": 890, "total_steps": 1405, "loss": 0.0502, "lr": 1.7891673151557493e-06, "epoch": 3.167259786476868, "percentage": 63.35, "elapsed_time": "0:11:26", "remaining_time": "0:06:37", "throughput": 1871.27, "total_tokens": 1284096} +{"current_steps": 895, "total_steps": 1405, "loss": 0.0001, "lr": 1.7594373172949786e-06, "epoch": 3.185053380782918, "percentage": 63.7, "elapsed_time": "0:11:26", "remaining_time": "0:06:31", "throughput": 1881.04, "total_tokens": 1291648} +{"current_steps": 900, "total_steps": 1405, "loss": 0.0001, "lr": 1.7298216867269906e-06, "epoch": 3.202846975088968, "percentage": 64.06, "elapsed_time": "0:11:27", "remaining_time": "0:06:25", "throughput": 1891.51, "total_tokens": 1299712} +{"current_steps": 905, "total_steps": 1405, "loss": 0.0001, "lr": 1.7003249970815028e-06, "epoch": 3.2206405693950177, "percentage": 64.41, "elapsed_time": "0:11:27", "remaining_time": "0:06:19", "throughput": 1899.73, "total_tokens": 1306176} +{"current_steps": 910, "total_steps": 1405, "loss": 0.0001, "lr": 1.6709518036198307e-06, "epoch": 3.2384341637010676, "percentage": 64.77, "elapsed_time": "0:11:28", "remaining_time": "0:06:14", "throughput": 1909.99, "total_tokens": 1314112} +{"current_steps": 915, "total_steps": 1405, "loss": 0.0251, "lr": 1.6417066425314088e-06, "epoch": 3.2562277580071175, "percentage": 65.12, "elapsed_time": "0:11:28", "remaining_time": "0:06:08", "throughput": 1918.91, "total_tokens": 1321088} +{"current_steps": 920, "total_steps": 1405, "loss": 0.0001, "lr": 1.612594030233252e-06, "epoch": 3.2740213523131674, "percentage": 65.48, "elapsed_time": "0:11:28", "remaining_time": "0:06:03", "throughput": 1928.42, "total_tokens": 1328512} +{"current_steps": 923, "total_steps": 1405, "eval_loss": 0.22950421273708344, "epoch": 3.284697508896797, "percentage": 65.69, "elapsed_time": "0:11:29", "remaining_time": "0:06:00", "throughput": 1931.9, "total_tokens": 1332544} +{"current_steps": 925, "total_steps": 1405, "loss": 0.0005, "lr": 1.5836184626724722e-06, "epoch": 3.2918149466192173, "percentage": 65.84, "elapsed_time": "0:12:15", "remaining_time": "0:06:21", "throughput": 1815.47, "total_tokens": 1336128} +{"current_steps": 930, "total_steps": 1405, "loss": 0.0, "lr": 1.5547844146319547e-06, "epoch": 3.309608540925267, "percentage": 66.19, "elapsed_time": "0:12:16", "remaining_time": "0:06:16", "throughput": 1824.44, "total_tokens": 1343552} +{"current_steps": 935, "total_steps": 1405, "loss": 0.0383, "lr": 1.5260963390393075e-06, "epoch": 3.3274021352313166, "percentage": 66.55, "elapsed_time": "0:12:16", "remaining_time": "0:06:10", "throughput": 1834.14, "total_tokens": 1351552} +{"current_steps": 940, "total_steps": 1405, "loss": 0.0002, "lr": 1.4975586662791783e-06, "epoch": 3.3451957295373664, "percentage": 66.9, "elapsed_time": "0:12:17", "remaining_time": "0:06:04", "throughput": 1842.17, "total_tokens": 1358272} +{"current_steps": 945, "total_steps": 1405, "loss": 0.0001, "lr": 1.4691758035090603e-06, "epoch": 3.3629893238434163, "percentage": 67.26, "elapsed_time": "0:12:17", "remaining_time": "0:05:59", "throughput": 1852.51, "total_tokens": 1366784} +{"current_steps": 950, "total_steps": 1405, "loss": 0.0001, "lr": 1.4409521339786809e-06, "epoch": 3.380782918149466, "percentage": 67.62, "elapsed_time": "0:12:18", "remaining_time": "0:05:53", "throughput": 1860.29, "total_tokens": 1373312} +{"current_steps": 955, "total_steps": 1405, "loss": 0.0001, "lr": 1.41289201635308e-06, "epoch": 3.398576512455516, "percentage": 67.97, "elapsed_time": "0:12:18", "remaining_time": "0:05:48", "throughput": 1869.21, "total_tokens": 1380736} +{"current_steps": 960, "total_steps": 1405, "loss": 0.0001, "lr": 1.3849997840394943e-06, "epoch": 3.416370106761566, "percentage": 68.33, "elapsed_time": "0:12:19", "remaining_time": "0:05:42", "throughput": 1878.61, "total_tokens": 1388544} +{"current_steps": 965, "total_steps": 1405, "loss": 0.0001, "lr": 1.3572797445181346e-06, "epoch": 3.434163701067616, "percentage": 68.68, "elapsed_time": "0:12:19", "remaining_time": "0:05:37", "throughput": 1887.76, "total_tokens": 1396160} +{"current_steps": 970, "total_steps": 1405, "loss": 0.0, "lr": 1.3297361786769654e-06, "epoch": 3.4519572953736652, "percentage": 69.04, "elapsed_time": "0:12:20", "remaining_time": "0:05:31", "throughput": 1897.31, "total_tokens": 1404096} +{"current_steps": 975, "total_steps": 1405, "loss": 0.0004, "lr": 1.302373340150598e-06, "epoch": 3.469750889679715, "percentage": 69.4, "elapsed_time": "0:12:20", "remaining_time": "0:05:26", "throughput": 1905.54, "total_tokens": 1411008} +{"current_steps": 980, "total_steps": 1405, "loss": 0.0001, "lr": 1.2751954546633872e-06, "epoch": 3.487544483985765, "percentage": 69.75, "elapsed_time": "0:12:20", "remaining_time": "0:05:21", "throughput": 1914.97, "total_tokens": 1418880} +{"current_steps": 985, "total_steps": 1405, "loss": 0.0, "lr": 1.2482067193768419e-06, "epoch": 3.505338078291815, "percentage": 70.11, "elapsed_time": "0:12:21", "remaining_time": "0:05:16", "throughput": 1923.5, "total_tokens": 1426048} +{"current_steps": 990, "total_steps": 1405, "loss": 0.0001, "lr": 1.2214113022414448e-06, "epoch": 3.5231316725978647, "percentage": 70.46, "elapsed_time": "0:12:21", "remaining_time": "0:05:10", "throughput": 1930.54, "total_tokens": 1432064} +{"current_steps": 994, "total_steps": 1405, "eval_loss": 0.24046999216079712, "epoch": 3.5373665480427046, "percentage": 70.75, "elapsed_time": "0:12:22", "remaining_time": "0:05:07", "throughput": 1936.33, "total_tokens": 1438336} +{"current_steps": 995, "total_steps": 1405, "loss": 0.0, "lr": 1.1948133413529817e-06, "epoch": 3.5409252669039146, "percentage": 70.82, "elapsed_time": "0:13:12", "remaining_time": "0:05:26", "throughput": 1815.74, "total_tokens": 1439808} +{"current_steps": 1000, "total_steps": 1405, "loss": 0.0001, "lr": 1.168416944313486e-06, "epoch": 3.5587188612099645, "percentage": 71.17, "elapsed_time": "0:13:13", "remaining_time": "0:05:21", "throughput": 1824.55, "total_tokens": 1447616} +{"current_steps": 1005, "total_steps": 1405, "loss": 0.0, "lr": 1.1422261875968845e-06, "epoch": 3.5765124555160144, "percentage": 71.53, "elapsed_time": "0:13:13", "remaining_time": "0:05:15", "throughput": 1831.87, "total_tokens": 1454208} +{"current_steps": 1010, "total_steps": 1405, "loss": 0.0, "lr": 1.1162451159194615e-06, "epoch": 3.5943060498220643, "percentage": 71.89, "elapsed_time": "0:13:14", "remaining_time": "0:05:10", "throughput": 1842.19, "total_tokens": 1463296} +{"current_steps": 1015, "total_steps": 1405, "loss": 0.0009, "lr": 1.0904777416152166e-06, "epoch": 3.612099644128114, "percentage": 72.24, "elapsed_time": "0:13:14", "remaining_time": "0:05:05", "throughput": 1849.56, "total_tokens": 1469952} +{"current_steps": 1020, "total_steps": 1405, "loss": 0.0, "lr": 1.0649280440162326e-06, "epoch": 3.6298932384341636, "percentage": 72.6, "elapsed_time": "0:13:15", "remaining_time": "0:05:00", "throughput": 1857.63, "total_tokens": 1477184} +{"current_steps": 1025, "total_steps": 1405, "loss": 0.0, "lr": 1.0395999688381313e-06, "epoch": 3.6476868327402134, "percentage": 72.95, "elapsed_time": "0:13:15", "remaining_time": "0:04:54", "throughput": 1865.38, "total_tokens": 1484160} +{"current_steps": 1030, "total_steps": 1405, "loss": 0.0001, "lr": 1.0144974275707243e-06, "epoch": 3.6654804270462633, "percentage": 73.31, "elapsed_time": "0:13:16", "remaining_time": "0:04:49", "throughput": 1873.19, "total_tokens": 1491200} +{"current_steps": 1035, "total_steps": 1405, "loss": 0.0, "lr": 9.896242968739538e-07, "epoch": 3.683274021352313, "percentage": 73.67, "elapsed_time": "0:13:16", "remaining_time": "0:04:44", "throughput": 1881.15, "total_tokens": 1498368} +{"current_steps": 1040, "total_steps": 1405, "loss": 0.0, "lr": 9.649844179792082e-07, "epoch": 3.701067615658363, "percentage": 74.02, "elapsed_time": "0:13:16", "remaining_time": "0:04:39", "throughput": 1889.65, "total_tokens": 1505984} +{"current_steps": 1045, "total_steps": 1405, "loss": 0.0, "lr": 9.405815960961054e-07, "epoch": 3.718861209964413, "percentage": 74.38, "elapsed_time": "0:13:17", "remaining_time": "0:04:34", "throughput": 1895.83, "total_tokens": 1511680} +{"current_steps": 1050, "total_steps": 1405, "loss": 0.0, "lr": 9.164195998248471e-07, "epoch": 3.7366548042704624, "percentage": 74.73, "elapsed_time": "0:13:17", "remaining_time": "0:04:29", "throughput": 1902.62, "total_tokens": 1517888} +{"current_steps": 1055, "total_steps": 1405, "loss": 0.0109, "lr": 8.925021605742212e-07, "epoch": 3.7544483985765122, "percentage": 75.09, "elapsed_time": "0:13:18", "remaining_time": "0:04:24", "throughput": 1911.15, "total_tokens": 1525568} +{"current_steps": 1060, "total_steps": 1405, "loss": 0.0, "lr": 8.68832971985347e-07, "epoch": 3.772241992882562, "percentage": 75.44, "elapsed_time": "0:13:18", "remaining_time": "0:04:19", "throughput": 1918.77, "total_tokens": 1532480} +{"current_steps": 1065, "total_steps": 1405, "loss": 0.0, "lr": 8.454156893612592e-07, "epoch": 3.790035587188612, "percentage": 75.8, "elapsed_time": "0:13:19", "remaining_time": "0:04:15", "throughput": 1925.99, "total_tokens": 1539072} +{"current_steps": 1065, "total_steps": 1405, "eval_loss": 0.2512344419956207, "epoch": 3.790035587188612, "percentage": 75.8, "elapsed_time": "0:13:19", "remaining_time": "0:04:15", "throughput": 1924.38, "total_tokens": 1539072} +{"current_steps": 1070, "total_steps": 1405, "loss": 0.0, "lr": 8.222539291024079e-07, "epoch": 3.807829181494662, "percentage": 76.16, "elapsed_time": "0:13:56", "remaining_time": "0:04:21", "throughput": 1850.89, "total_tokens": 1547584} +{"current_steps": 1075, "total_steps": 1405, "loss": 0.0, "lr": 7.993512681481638e-07, "epoch": 3.8256227758007118, "percentage": 76.51, "elapsed_time": "0:13:56", "remaining_time": "0:04:16", "throughput": 1857.97, "total_tokens": 1554304} +{"current_steps": 1080, "total_steps": 1405, "loss": 0.0, "lr": 7.767112434244254e-07, "epoch": 3.8434163701067616, "percentage": 76.87, "elapsed_time": "0:13:56", "remaining_time": "0:04:11", "throughput": 1864.9, "total_tokens": 1560896} +{"current_steps": 1085, "total_steps": 1405, "loss": 0.0, "lr": 7.543373512973947e-07, "epoch": 3.8612099644128115, "percentage": 77.22, "elapsed_time": "0:13:57", "remaining_time": "0:04:06", "throughput": 1872.11, "total_tokens": 1567744} +{"current_steps": 1090, "total_steps": 1405, "loss": 0.032, "lr": 7.322330470336314e-07, "epoch": 3.8790035587188614, "percentage": 77.58, "elapsed_time": "0:13:57", "remaining_time": "0:04:02", "throughput": 1879.1, "total_tokens": 1574400} +{"current_steps": 1095, "total_steps": 1405, "loss": 0.0187, "lr": 7.104017442664393e-07, "epoch": 3.8967971530249113, "percentage": 77.94, "elapsed_time": "0:13:58", "remaining_time": "0:03:57", "throughput": 1886.59, "total_tokens": 1581504} +{"current_steps": 1100, "total_steps": 1405, "loss": 0.0, "lr": 6.88846814468691e-07, "epoch": 3.914590747330961, "percentage": 78.29, "elapsed_time": "0:13:58", "remaining_time": "0:03:52", "throughput": 1895.09, "total_tokens": 1589504} +{"current_steps": 1105, "total_steps": 1405, "loss": 0.0369, "lr": 6.67571586432163e-07, "epoch": 3.9323843416370106, "percentage": 78.65, "elapsed_time": "0:13:59", "remaining_time": "0:03:47", "throughput": 1903.8, "total_tokens": 1597696} +{"current_steps": 1110, "total_steps": 1405, "loss": 0.0, "lr": 6.465793457534553e-07, "epoch": 3.9501779359430604, "percentage": 79.0, "elapsed_time": "0:13:59", "remaining_time": "0:03:43", "throughput": 1911.77, "total_tokens": 1605248} +{"current_steps": 1115, "total_steps": 1405, "loss": 0.0002, "lr": 6.258733343265933e-07, "epoch": 3.9679715302491103, "percentage": 79.36, "elapsed_time": "0:14:00", "remaining_time": "0:03:38", "throughput": 1921.04, "total_tokens": 1613952} +{"current_steps": 1120, "total_steps": 1405, "loss": 0.0, "lr": 6.054567498423683e-07, "epoch": 3.98576512455516, "percentage": 79.72, "elapsed_time": "0:14:00", "remaining_time": "0:03:33", "throughput": 1927.54, "total_tokens": 1620224} +{"current_steps": 1125, "total_steps": 1405, "loss": 0.0, "lr": 5.853327452945115e-07, "epoch": 4.00355871886121, "percentage": 80.07, "elapsed_time": "0:14:01", "remaining_time": "0:03:29", "throughput": 1932.97, "total_tokens": 1625800} +{"current_steps": 1130, "total_steps": 1405, "loss": 0.0, "lr": 5.655044284927658e-07, "epoch": 4.0213523131672595, "percentage": 80.43, "elapsed_time": "0:14:01", "remaining_time": "0:03:24", "throughput": 1940.9, "total_tokens": 1633352} +{"current_steps": 1135, "total_steps": 1405, "loss": 0.0, "lr": 5.459748615829355e-07, "epoch": 4.039145907473309, "percentage": 80.78, "elapsed_time": "0:14:01", "remaining_time": "0:03:20", "throughput": 1948.76, "total_tokens": 1640840} +{"current_steps": 1136, "total_steps": 1405, "eval_loss": 0.2551669180393219, "epoch": 4.04270462633452, "percentage": 80.85, "elapsed_time": "0:14:03", "remaining_time": "0:03:19", "throughput": 1946.88, "total_tokens": 1642696} +{"current_steps": 1140, "total_steps": 1405, "loss": 0.0, "lr": 5.267470605739953e-07, "epoch": 4.056939501779359, "percentage": 81.14, "elapsed_time": "0:14:44", "remaining_time": "0:03:25", "throughput": 1862.79, "total_tokens": 1648520} +{"current_steps": 1145, "total_steps": 1405, "loss": 0.0, "lr": 5.078239948723154e-07, "epoch": 4.074733096085409, "percentage": 81.49, "elapsed_time": "0:14:45", "remaining_time": "0:03:21", "throughput": 1870.02, "total_tokens": 1655752} +{"current_steps": 1150, "total_steps": 1405, "loss": 0.0, "lr": 4.892085868230881e-07, "epoch": 4.092526690391459, "percentage": 81.85, "elapsed_time": "0:14:45", "remaining_time": "0:03:16", "throughput": 1877.18, "total_tokens": 1662920} +{"current_steps": 1155, "total_steps": 1405, "loss": 0.0, "lr": 4.7090371125902175e-07, "epoch": 4.110320284697509, "percentage": 82.21, "elapsed_time": "0:14:46", "remaining_time": "0:03:11", "throughput": 1884.13, "total_tokens": 1669896} +{"current_steps": 1160, "total_steps": 1405, "loss": 0.0, "lr": 4.529121950563717e-07, "epoch": 4.128113879003559, "percentage": 82.56, "elapsed_time": "0:14:46", "remaining_time": "0:03:07", "throughput": 1889.49, "total_tokens": 1675400} +{"current_steps": 1165, "total_steps": 1405, "loss": 0.0, "lr": 4.352368166983753e-07, "epoch": 4.145907473309609, "percentage": 82.92, "elapsed_time": "0:14:47", "remaining_time": "0:03:02", "throughput": 1897.03, "total_tokens": 1682952} +{"current_steps": 1170, "total_steps": 1405, "loss": 0.0, "lr": 4.178803058461664e-07, "epoch": 4.1637010676156585, "percentage": 83.27, "elapsed_time": "0:14:47", "remaining_time": "0:02:58", "throughput": 1904.31, "total_tokens": 1690248} +{"current_steps": 1175, "total_steps": 1405, "loss": 0.0, "lr": 4.0084534291722375e-07, "epoch": 4.181494661921708, "percentage": 83.63, "elapsed_time": "0:14:48", "remaining_time": "0:02:53", "throughput": 1910.82, "total_tokens": 1696840} +{"current_steps": 1180, "total_steps": 1405, "loss": 0.0, "lr": 3.8413455867142513e-07, "epoch": 4.199288256227758, "percentage": 83.99, "elapsed_time": "0:14:48", "remaining_time": "0:02:49", "throughput": 1917.52, "total_tokens": 1703624} +{"current_steps": 1185, "total_steps": 1405, "loss": 0.0, "lr": 3.6775053380477296e-07, "epoch": 4.217081850533808, "percentage": 84.34, "elapsed_time": "0:14:48", "remaining_time": "0:02:45", "throughput": 1923.81, "total_tokens": 1710024} +{"current_steps": 1190, "total_steps": 1405, "loss": 0.0, "lr": 3.516957985508476e-07, "epoch": 4.234875444839858, "percentage": 84.7, "elapsed_time": "0:14:49", "remaining_time": "0:02:40", "throughput": 1931.54, "total_tokens": 1717768} +{"current_steps": 1195, "total_steps": 1405, "loss": 0.0, "lr": 3.3597283229005877e-07, "epoch": 4.252669039145908, "percentage": 85.05, "elapsed_time": "0:14:49", "remaining_time": "0:02:36", "throughput": 1941.1, "total_tokens": 1727240} +{"current_steps": 1200, "total_steps": 1405, "loss": 0.0, "lr": 3.2058406316674563e-07, "epoch": 4.270462633451958, "percentage": 85.41, "elapsed_time": "0:14:50", "remaining_time": "0:02:32", "throughput": 1948.19, "total_tokens": 1734408} +{"current_steps": 1205, "total_steps": 1405, "loss": 0.0, "lr": 3.055318677141916e-07, "epoch": 4.288256227758007, "percentage": 85.77, "elapsed_time": "0:14:50", "remaining_time": "0:02:27", "throughput": 1954.59, "total_tokens": 1740936} +{"current_steps": 1207, "total_steps": 1405, "eval_loss": 0.257210373878479, "epoch": 4.295373665480427, "percentage": 85.91, "elapsed_time": "0:14:51", "remaining_time": "0:02:26", "throughput": 1955.93, "total_tokens": 1743624} +{"current_steps": 1210, "total_steps": 1405, "loss": 0.0, "lr": 2.9081857048761014e-07, "epoch": 4.306049822064057, "percentage": 86.12, "elapsed_time": "0:15:22", "remaining_time": "0:02:28", "throughput": 1895.27, "total_tokens": 1747784} +{"current_steps": 1215, "total_steps": 1405, "loss": 0.0, "lr": 2.764464437051537e-07, "epoch": 4.3238434163701065, "percentage": 86.48, "elapsed_time": "0:15:22", "remaining_time": "0:02:24", "throughput": 1902.07, "total_tokens": 1754888} +{"current_steps": 1220, "total_steps": 1405, "loss": 0.0, "lr": 2.624177068970124e-07, "epoch": 4.341637010676156, "percentage": 86.83, "elapsed_time": "0:15:23", "remaining_time": "0:02:19", "throughput": 1909.52, "total_tokens": 1762632} +{"current_steps": 1225, "total_steps": 1405, "loss": 0.0, "lr": 2.4873452656264316e-07, "epoch": 4.359430604982206, "percentage": 87.19, "elapsed_time": "0:15:23", "remaining_time": "0:02:15", "throughput": 1916.5, "total_tokens": 1769928} +{"current_steps": 1230, "total_steps": 1405, "loss": 0.0, "lr": 2.3539901583619186e-07, "epoch": 4.377224199288256, "percentage": 87.54, "elapsed_time": "0:15:23", "remaining_time": "0:02:11", "throughput": 1923.74, "total_tokens": 1777480} +{"current_steps": 1235, "total_steps": 1405, "loss": 0.0, "lr": 2.2241323416015452e-07, "epoch": 4.395017793594306, "percentage": 87.9, "elapsed_time": "0:15:24", "remaining_time": "0:02:07", "throughput": 1930.78, "total_tokens": 1784840} +{"current_steps": 1240, "total_steps": 1405, "loss": 0.0, "lr": 2.0977918696733103e-07, "epoch": 4.412811387900356, "percentage": 88.26, "elapsed_time": "0:15:24", "remaining_time": "0:02:03", "throughput": 1938.2, "total_tokens": 1792584} +{"current_steps": 1245, "total_steps": 1405, "loss": 0.0, "lr": 1.9749882537112297e-07, "epoch": 4.430604982206406, "percentage": 88.61, "elapsed_time": "0:15:25", "remaining_time": "0:01:58", "throughput": 1946.27, "total_tokens": 1800968} +{"current_steps": 1250, "total_steps": 1405, "loss": 0.0, "lr": 1.8557404586421413e-07, "epoch": 4.448398576512456, "percentage": 88.97, "elapsed_time": "0:15:25", "remaining_time": "0:01:54", "throughput": 1953.42, "total_tokens": 1808456} +{"current_steps": 1255, "total_steps": 1405, "loss": 0.0, "lr": 1.7400669002569233e-07, "epoch": 4.4661921708185055, "percentage": 89.32, "elapsed_time": "0:15:26", "remaining_time": "0:01:50", "throughput": 1960.75, "total_tokens": 1816136} +{"current_steps": 1260, "total_steps": 1405, "loss": 0.0, "lr": 1.62798544236647e-07, "epoch": 4.483985765124555, "percentage": 89.68, "elapsed_time": "0:15:26", "remaining_time": "0:01:46", "throughput": 1968.4, "total_tokens": 1824136} +{"current_steps": 1265, "total_steps": 1405, "loss": 0.0, "lr": 1.5195133940429345e-07, "epoch": 4.501779359430605, "percentage": 90.04, "elapsed_time": "0:15:27", "remaining_time": "0:01:42", "throughput": 1975.18, "total_tokens": 1831304} +{"current_steps": 1270, "total_steps": 1405, "loss": 0.0, "lr": 1.4146675069466403e-07, "epoch": 4.519572953736655, "percentage": 90.39, "elapsed_time": "0:15:27", "remaining_time": "0:01:38", "throughput": 1980.99, "total_tokens": 1837512} +{"current_steps": 1275, "total_steps": 1405, "loss": 0.0, "lr": 1.313463972739068e-07, "epoch": 4.537366548042705, "percentage": 90.75, "elapsed_time": "0:15:28", "remaining_time": "0:01:34", "throughput": 1987.37, "total_tokens": 1844296} +{"current_steps": 1278, "total_steps": 1405, "eval_loss": 0.259037584066391, "epoch": 4.548042704626335, "percentage": 90.96, "elapsed_time": "0:15:28", "remaining_time": "0:01:32", "throughput": 1991.0, "total_tokens": 1849416} +{"current_steps": 1280, "total_steps": 1405, "loss": 0.0, "lr": 1.215918420582343e-07, "epoch": 4.555160142348754, "percentage": 91.1, "elapsed_time": "0:16:13", "remaining_time": "0:01:35", "throughput": 1902.89, "total_tokens": 1851720} +{"current_steps": 1285, "total_steps": 1405, "loss": 0.0, "lr": 1.1220459147255642e-07, "epoch": 4.572953736654805, "percentage": 91.46, "elapsed_time": "0:16:13", "remaining_time": "0:01:30", "throughput": 1908.65, "total_tokens": 1858120} +{"current_steps": 1290, "total_steps": 1405, "loss": 0.0, "lr": 1.0318609521783818e-07, "epoch": 4.590747330960854, "percentage": 91.81, "elapsed_time": "0:16:13", "remaining_time": "0:01:26", "throughput": 1915.77, "total_tokens": 1865928} +{"current_steps": 1295, "total_steps": 1405, "loss": 0.0, "lr": 9.453774604721937e-08, "epoch": 4.608540925266904, "percentage": 92.17, "elapsed_time": "0:16:14", "remaining_time": "0:01:22", "throughput": 1922.95, "total_tokens": 1873800} +{"current_steps": 1300, "total_steps": 1405, "loss": 0.0, "lr": 8.62608795509276e-08, "epoch": 4.6263345195729535, "percentage": 92.53, "elapsed_time": "0:16:14", "remaining_time": "0:01:18", "throughput": 1930.24, "total_tokens": 1881800} +{"current_steps": 1305, "total_steps": 1405, "loss": 0.0, "lr": 7.835677395001795e-08, "epoch": 4.644128113879003, "percentage": 92.88, "elapsed_time": "0:16:15", "remaining_time": "0:01:14", "throughput": 1936.41, "total_tokens": 1888648} +{"current_steps": 1310, "total_steps": 1405, "loss": 0.0, "lr": 7.082664989897486e-08, "epoch": 4.661921708185053, "percentage": 93.24, "elapsed_time": "0:16:15", "remaining_time": "0:01:10", "throughput": 1942.5, "total_tokens": 1895432} +{"current_steps": 1315, "total_steps": 1405, "loss": 0.0, "lr": 6.367167029720234e-08, "epoch": 4.679715302491103, "percentage": 93.59, "elapsed_time": "0:16:16", "remaining_time": "0:01:06", "throughput": 1948.78, "total_tokens": 1902408} +{"current_steps": 1320, "total_steps": 1405, "loss": 0.0277, "lr": 5.68929401094323e-08, "epoch": 4.697508896797153, "percentage": 93.95, "elapsed_time": "0:16:16", "remaining_time": "0:01:02", "throughput": 1955.98, "total_tokens": 1910344} +{"current_steps": 1325, "total_steps": 1405, "loss": 0.0, "lr": 5.049150619508503e-08, "epoch": 4.715302491103203, "percentage": 94.31, "elapsed_time": "0:16:17", "remaining_time": "0:00:58", "throughput": 1963.37, "total_tokens": 1918472} +{"current_steps": 1330, "total_steps": 1405, "loss": 0.0, "lr": 4.446835714659647e-08, "epoch": 4.733096085409253, "percentage": 94.66, "elapsed_time": "0:16:17", "remaining_time": "0:00:55", "throughput": 1968.94, "total_tokens": 1924744} +{"current_steps": 1335, "total_steps": 1405, "loss": 0.0, "lr": 3.882442313674878e-08, "epoch": 4.750889679715303, "percentage": 95.02, "elapsed_time": "0:16:18", "remaining_time": "0:00:51", "throughput": 1976.31, "total_tokens": 1932872} +{"current_steps": 1340, "total_steps": 1405, "loss": 0.0, "lr": 3.3560575775019866e-08, "epoch": 4.7686832740213525, "percentage": 95.37, "elapsed_time": "0:16:18", "remaining_time": "0:00:47", "throughput": 1982.75, "total_tokens": 1940040} +{"current_steps": 1345, "total_steps": 1405, "loss": 0.0, "lr": 2.8677627972978905e-08, "epoch": 4.786476868327402, "percentage": 95.73, "elapsed_time": "0:16:18", "remaining_time": "0:00:43", "throughput": 1990.83, "total_tokens": 1948936} +{"current_steps": 1349, "total_steps": 1405, "eval_loss": 0.2602100372314453, "epoch": 4.800711743772242, "percentage": 96.01, "elapsed_time": "0:16:19", "remaining_time": "0:00:40", "throughput": 1994.66, "total_tokens": 1954568} +{"current_steps": 1350, "total_steps": 1405, "loss": 0.0, "lr": 2.4176333818745347e-08, "epoch": 4.804270462633452, "percentage": 96.09, "elapsed_time": "0:17:22", "remaining_time": "0:00:42", "throughput": 1875.69, "total_tokens": 1955912} +{"current_steps": 1355, "total_steps": 1405, "loss": 0.0, "lr": 2.0057388460533733e-08, "epoch": 4.822064056939502, "percentage": 96.44, "elapsed_time": "0:17:23", "remaining_time": "0:00:38", "throughput": 1881.48, "total_tokens": 1962760} +{"current_steps": 1360, "total_steps": 1405, "loss": 0.0, "lr": 1.6321427999298754e-08, "epoch": 4.839857651245552, "percentage": 96.8, "elapsed_time": "0:17:23", "remaining_time": "0:00:34", "throughput": 1886.85, "total_tokens": 1969160} +{"current_steps": 1365, "total_steps": 1405, "loss": 0.0, "lr": 1.2969029390501597e-08, "epoch": 4.857651245551601, "percentage": 97.15, "elapsed_time": "0:17:24", "remaining_time": "0:00:30", "throughput": 1892.4, "total_tokens": 1975752} +{"current_steps": 1370, "total_steps": 1405, "loss": 0.0, "lr": 1.000071035500816e-08, "epoch": 4.875444839857651, "percentage": 97.51, "elapsed_time": "0:17:24", "remaining_time": "0:00:26", "throughput": 1898.75, "total_tokens": 1983240} +{"current_steps": 1375, "total_steps": 1405, "loss": 0.0, "lr": 7.416929299135511e-09, "epoch": 4.893238434163701, "percentage": 97.86, "elapsed_time": "0:17:24", "remaining_time": "0:00:22", "throughput": 1905.16, "total_tokens": 1990792} +{"current_steps": 1380, "total_steps": 1405, "loss": 0.0, "lr": 5.218085243859639e-09, "epoch": 4.911032028469751, "percentage": 98.22, "elapsed_time": "0:17:25", "remaining_time": "0:00:18", "throughput": 1911.91, "total_tokens": 1998728} +{"current_steps": 1385, "total_steps": 1405, "loss": 0.0, "lr": 3.4045177631936154e-09, "epoch": 4.9288256227758005, "percentage": 98.58, "elapsed_time": "0:17:25", "remaining_time": "0:00:15", "throughput": 1918.89, "total_tokens": 2006920} +{"current_steps": 1390, "total_steps": 1405, "loss": 0.0, "lr": 1.976506931745392e-09, "epoch": 4.94661921708185, "percentage": 98.93, "elapsed_time": "0:17:26", "remaining_time": "0:00:11", "throughput": 1924.06, "total_tokens": 2013128} +{"current_steps": 1395, "total_steps": 1405, "loss": 0.0, "lr": 9.3427328146517e-10, "epoch": 4.9644128113879, "percentage": 99.29, "elapsed_time": "0:17:26", "remaining_time": "0:00:07", "throughput": 1931.37, "total_tokens": 2021704} +{"current_steps": 1400, "total_steps": 1405, "loss": 0.0, "lr": 2.7797776758903274e-10, "epoch": 4.98220640569395, "percentage": 99.64, "elapsed_time": "0:17:27", "remaining_time": "0:00:03", "throughput": 1937.41, "total_tokens": 2028872} +{"current_steps": 1405, "total_steps": 1405, "loss": 0.0, "lr": 7.72174378022017e-12, "epoch": 5.0, "percentage": 100.0, "elapsed_time": "0:17:27", "remaining_time": "0:00:00", "throughput": 1942.54, "total_tokens": 2035272} +{"current_steps": 1405, "total_steps": 1405, "epoch": 5.0, "percentage": 100.0, "elapsed_time": "0:18:04", "remaining_time": "0:00:00", "throughput": 1877.37, "total_tokens": 2035272} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..21332f6 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,2463 @@ +{ + "best_global_step": 284, + "best_metric": 0.11889845132827759, + "best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_rte_42_1776331559/checkpoint-284", + "epoch": 5.0, + "eval_steps": 71, + "global_step": 1405, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.017793594306049824, + "grad_norm": 214.55776977539062, + "learning_rate": 1.4184397163120568e-07, + "loss": 0.7401, + "num_input_tokens_seen": 7872, + "step": 5 + }, + { + "epoch": 0.03558718861209965, + "grad_norm": 161.54965209960938, + "learning_rate": 3.1914893617021275e-07, + "loss": 0.6171, + "num_input_tokens_seen": 14784, + "step": 10 + }, + { + "epoch": 0.05338078291814947, + "grad_norm": 70.1294937133789, + "learning_rate": 4.964539007092199e-07, + "loss": 0.4092, + "num_input_tokens_seen": 23424, + "step": 15 + }, + { + "epoch": 0.0711743772241993, + "grad_norm": 83.8577880859375, + "learning_rate": 6.73758865248227e-07, + "loss": 0.2659, + "num_input_tokens_seen": 29824, + "step": 20 + }, + { + "epoch": 0.08896797153024912, + "grad_norm": 70.67394256591797, + "learning_rate": 8.510638297872341e-07, + "loss": 0.2666, + "num_input_tokens_seen": 37824, + "step": 25 + }, + { + "epoch": 0.10676156583629894, + "grad_norm": 35.297691345214844, + "learning_rate": 1.0283687943262412e-06, + "loss": 0.2573, + "num_input_tokens_seen": 44608, + "step": 30 + }, + { + "epoch": 0.12455516014234876, + "grad_norm": 8.436112403869629, + "learning_rate": 1.2056737588652482e-06, + "loss": 0.3322, + "num_input_tokens_seen": 51968, + "step": 35 + }, + { + "epoch": 0.1423487544483986, + "grad_norm": 13.455877304077148, + "learning_rate": 1.3829787234042555e-06, + "loss": 0.1441, + "num_input_tokens_seen": 59456, + "step": 40 + }, + { + "epoch": 0.1601423487544484, + "grad_norm": 32.384830474853516, + "learning_rate": 1.5602836879432626e-06, + "loss": 0.163, + "num_input_tokens_seen": 66496, + "step": 45 + }, + { + "epoch": 0.17793594306049823, + "grad_norm": 26.830564498901367, + "learning_rate": 1.7375886524822697e-06, + "loss": 0.1815, + "num_input_tokens_seen": 73408, + "step": 50 + }, + { + "epoch": 0.19572953736654805, + "grad_norm": 7.963785171508789, + "learning_rate": 1.9148936170212767e-06, + "loss": 0.1484, + "num_input_tokens_seen": 80576, + "step": 55 + }, + { + "epoch": 0.21352313167259787, + "grad_norm": 8.084745407104492, + "learning_rate": 2.092198581560284e-06, + "loss": 0.1828, + "num_input_tokens_seen": 88256, + "step": 60 + }, + { + "epoch": 0.2313167259786477, + "grad_norm": 11.574740409851074, + "learning_rate": 2.269503546099291e-06, + "loss": 0.1709, + "num_input_tokens_seen": 96256, + "step": 65 + }, + { + "epoch": 0.2491103202846975, + "grad_norm": 20.736160278320312, + "learning_rate": 2.446808510638298e-06, + "loss": 0.2309, + "num_input_tokens_seen": 103424, + "step": 70 + }, + { + "epoch": 0.2526690391459075, + "eval_loss": 0.18017518520355225, + "eval_runtime": 0.6321, + "eval_samples_per_second": 393.895, + "eval_steps_per_second": 50.621, + "num_input_tokens_seen": 105024, + "step": 71 + }, + { + "epoch": 0.2669039145907473, + "grad_norm": 20.432262420654297, + "learning_rate": 2.624113475177305e-06, + "loss": 0.1733, + "num_input_tokens_seen": 110528, + "step": 75 + }, + { + "epoch": 0.2846975088967972, + "grad_norm": 4.497078895568848, + "learning_rate": 2.8014184397163125e-06, + "loss": 0.1469, + "num_input_tokens_seen": 117440, + "step": 80 + }, + { + "epoch": 0.302491103202847, + "grad_norm": 13.809417724609375, + "learning_rate": 2.978723404255319e-06, + "loss": 0.1417, + "num_input_tokens_seen": 125504, + "step": 85 + }, + { + "epoch": 0.3202846975088968, + "grad_norm": 47.335994720458984, + "learning_rate": 3.1560283687943267e-06, + "loss": 0.1887, + "num_input_tokens_seen": 132352, + "step": 90 + }, + { + "epoch": 0.33807829181494664, + "grad_norm": 33.13681411743164, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.1507, + "num_input_tokens_seen": 139200, + "step": 95 + }, + { + "epoch": 0.35587188612099646, + "grad_norm": 21.207801818847656, + "learning_rate": 3.510638297872341e-06, + "loss": 0.0966, + "num_input_tokens_seen": 147904, + "step": 100 + }, + { + "epoch": 0.3736654804270463, + "grad_norm": 27.387420654296875, + "learning_rate": 3.6879432624113475e-06, + "loss": 0.2218, + "num_input_tokens_seen": 154240, + "step": 105 + }, + { + "epoch": 0.3914590747330961, + "grad_norm": 14.321903228759766, + "learning_rate": 3.865248226950355e-06, + "loss": 0.1246, + "num_input_tokens_seen": 161472, + "step": 110 + }, + { + "epoch": 0.4092526690391459, + "grad_norm": 24.01630401611328, + "learning_rate": 4.042553191489362e-06, + "loss": 0.1689, + "num_input_tokens_seen": 168192, + "step": 115 + }, + { + "epoch": 0.42704626334519574, + "grad_norm": 5.953591346740723, + "learning_rate": 4.219858156028369e-06, + "loss": 0.1818, + "num_input_tokens_seen": 174656, + "step": 120 + }, + { + "epoch": 0.44483985765124556, + "grad_norm": 7.9348039627075195, + "learning_rate": 4.397163120567377e-06, + "loss": 0.1189, + "num_input_tokens_seen": 181632, + "step": 125 + }, + { + "epoch": 0.4626334519572954, + "grad_norm": 31.200048446655273, + "learning_rate": 4.574468085106383e-06, + "loss": 0.0973, + "num_input_tokens_seen": 191488, + "step": 130 + }, + { + "epoch": 0.4804270462633452, + "grad_norm": 20.26383399963379, + "learning_rate": 4.751773049645391e-06, + "loss": 0.1978, + "num_input_tokens_seen": 198848, + "step": 135 + }, + { + "epoch": 0.498220640569395, + "grad_norm": 34.11137771606445, + "learning_rate": 4.929078014184397e-06, + "loss": 0.1861, + "num_input_tokens_seen": 207232, + "step": 140 + }, + { + "epoch": 0.505338078291815, + "eval_loss": 0.2461702525615692, + "eval_runtime": 0.5958, + "eval_samples_per_second": 417.954, + "eval_steps_per_second": 53.713, + "num_input_tokens_seen": 209536, + "step": 142 + }, + { + "epoch": 0.5160142348754448, + "grad_norm": 5.17560338973999, + "learning_rate": 4.999930504592181e-06, + "loss": 0.2676, + "num_input_tokens_seen": 213952, + "step": 145 + }, + { + "epoch": 0.5338078291814946, + "grad_norm": 4.064759731292725, + "learning_rate": 4.999505824425164e-06, + "loss": 0.1686, + "num_input_tokens_seen": 221376, + "step": 150 + }, + { + "epoch": 0.5516014234875445, + "grad_norm": 6.850118160247803, + "learning_rate": 4.998695138156149e-06, + "loss": 0.1074, + "num_input_tokens_seen": 228928, + "step": 155 + }, + { + "epoch": 0.5693950177935944, + "grad_norm": 12.681975364685059, + "learning_rate": 4.997498570981822e-06, + "loss": 0.1216, + "num_input_tokens_seen": 236352, + "step": 160 + }, + { + "epoch": 0.5871886120996441, + "grad_norm": 20.49947166442871, + "learning_rate": 4.995916307691601e-06, + "loss": 0.1426, + "num_input_tokens_seen": 244416, + "step": 165 + }, + { + "epoch": 0.604982206405694, + "grad_norm": 17.282085418701172, + "learning_rate": 4.993948592639105e-06, + "loss": 0.175, + "num_input_tokens_seen": 251456, + "step": 170 + }, + { + "epoch": 0.6227758007117438, + "grad_norm": 12.41250991821289, + "learning_rate": 4.991595729704405e-06, + "loss": 0.1179, + "num_input_tokens_seen": 258880, + "step": 175 + }, + { + "epoch": 0.6405693950177936, + "grad_norm": 28.129770278930664, + "learning_rate": 4.988858082247109e-06, + "loss": 0.1235, + "num_input_tokens_seen": 265152, + "step": 180 + }, + { + "epoch": 0.6583629893238434, + "grad_norm": 27.979137420654297, + "learning_rate": 4.985736073050237e-06, + "loss": 0.1596, + "num_input_tokens_seen": 272576, + "step": 185 + }, + { + "epoch": 0.6761565836298933, + "grad_norm": 5.348273754119873, + "learning_rate": 4.982230184254934e-06, + "loss": 0.1188, + "num_input_tokens_seen": 279744, + "step": 190 + }, + { + "epoch": 0.693950177935943, + "grad_norm": 21.296180725097656, + "learning_rate": 4.9783409572860105e-06, + "loss": 0.1255, + "num_input_tokens_seen": 287680, + "step": 195 + }, + { + "epoch": 0.7117437722419929, + "grad_norm": 8.518939018249512, + "learning_rate": 4.9740689927683314e-06, + "loss": 0.0801, + "num_input_tokens_seen": 294592, + "step": 200 + }, + { + "epoch": 0.7295373665480427, + "grad_norm": 22.954431533813477, + "learning_rate": 4.9694149504340515e-06, + "loss": 0.0902, + "num_input_tokens_seen": 301440, + "step": 205 + }, + { + "epoch": 0.7473309608540926, + "grad_norm": 6.020153999328613, + "learning_rate": 4.964379549020741e-06, + "loss": 0.0658, + "num_input_tokens_seen": 308416, + "step": 210 + }, + { + "epoch": 0.7580071174377224, + "eval_loss": 0.1589389145374298, + "eval_runtime": 1.701, + "eval_samples_per_second": 146.387, + "eval_steps_per_second": 18.813, + "num_input_tokens_seen": 312576, + "step": 213 + }, + { + "epoch": 0.7651245551601423, + "grad_norm": 22.559890747070312, + "learning_rate": 4.9589635661603845e-06, + "loss": 0.1047, + "num_input_tokens_seen": 315328, + "step": 215 + }, + { + "epoch": 0.7829181494661922, + "grad_norm": 21.644763946533203, + "learning_rate": 4.953167838259285e-06, + "loss": 0.0899, + "num_input_tokens_seen": 322688, + "step": 220 + }, + { + "epoch": 0.800711743772242, + "grad_norm": 21.93659782409668, + "learning_rate": 4.946993260368904e-06, + "loss": 0.1884, + "num_input_tokens_seen": 329280, + "step": 225 + }, + { + "epoch": 0.8185053380782918, + "grad_norm": 4.014484405517578, + "learning_rate": 4.9404407860476275e-06, + "loss": 0.0862, + "num_input_tokens_seen": 336896, + "step": 230 + }, + { + "epoch": 0.8362989323843416, + "grad_norm": 9.570290565490723, + "learning_rate": 4.933511427213511e-06, + "loss": 0.1129, + "num_input_tokens_seen": 344128, + "step": 235 + }, + { + "epoch": 0.8540925266903915, + "grad_norm": 8.622743606567383, + "learning_rate": 4.926206253988001e-06, + "loss": 0.0612, + "num_input_tokens_seen": 350912, + "step": 240 + }, + { + "epoch": 0.8718861209964412, + "grad_norm": 20.623458862304688, + "learning_rate": 4.91852639453068e-06, + "loss": 0.1364, + "num_input_tokens_seen": 358016, + "step": 245 + }, + { + "epoch": 0.8896797153024911, + "grad_norm": 20.129985809326172, + "learning_rate": 4.910473034865033e-06, + "loss": 0.0751, + "num_input_tokens_seen": 364736, + "step": 250 + }, + { + "epoch": 0.9074733096085409, + "grad_norm": 38.003440856933594, + "learning_rate": 4.902047418695293e-06, + "loss": 0.1051, + "num_input_tokens_seen": 371648, + "step": 255 + }, + { + "epoch": 0.9252669039145908, + "grad_norm": 12.691219329833984, + "learning_rate": 4.893250847214369e-06, + "loss": 0.0675, + "num_input_tokens_seen": 379200, + "step": 260 + }, + { + "epoch": 0.9430604982206405, + "grad_norm": 13.248903274536133, + "learning_rate": 4.884084678902898e-06, + "loss": 0.1611, + "num_input_tokens_seen": 387200, + "step": 265 + }, + { + "epoch": 0.9608540925266904, + "grad_norm": 19.279905319213867, + "learning_rate": 4.874550329319457e-06, + "loss": 0.1232, + "num_input_tokens_seen": 395264, + "step": 270 + }, + { + "epoch": 0.9786476868327402, + "grad_norm": 10.716254234313965, + "learning_rate": 4.864649270881944e-06, + "loss": 0.135, + "num_input_tokens_seen": 402176, + "step": 275 + }, + { + "epoch": 0.99644128113879, + "grad_norm": 10.358949661254883, + "learning_rate": 4.854383032640196e-06, + "loss": 0.0765, + "num_input_tokens_seen": 409984, + "step": 280 + }, + { + "epoch": 1.01067615658363, + "eval_loss": 0.11889845132827759, + "eval_runtime": 0.6175, + "eval_samples_per_second": 403.229, + "eval_steps_per_second": 51.821, + "num_input_tokens_seen": 414040, + "step": 284 + }, + { + "epoch": 1.0142348754448398, + "grad_norm": 19.193450927734375, + "learning_rate": 4.843753200039851e-06, + "loss": 0.0754, + "num_input_tokens_seen": 415256, + "step": 285 + }, + { + "epoch": 1.0320284697508897, + "grad_norm": 15.135278701782227, + "learning_rate": 4.832761414677502e-06, + "loss": 0.067, + "num_input_tokens_seen": 422808, + "step": 290 + }, + { + "epoch": 1.0498220640569396, + "grad_norm": 39.43471145629883, + "learning_rate": 4.821409374047184e-06, + "loss": 0.0342, + "num_input_tokens_seen": 430104, + "step": 295 + }, + { + "epoch": 1.0676156583629894, + "grad_norm": 88.81342315673828, + "learning_rate": 4.809698831278217e-06, + "loss": 0.0683, + "num_input_tokens_seen": 436760, + "step": 300 + }, + { + "epoch": 1.085409252669039, + "grad_norm": 0.4617443382740021, + "learning_rate": 4.797631594864475e-06, + "loss": 0.1073, + "num_input_tokens_seen": 444952, + "step": 305 + }, + { + "epoch": 1.103202846975089, + "grad_norm": 40.835601806640625, + "learning_rate": 4.785209528385087e-06, + "loss": 0.1453, + "num_input_tokens_seen": 452760, + "step": 310 + }, + { + "epoch": 1.1209964412811388, + "grad_norm": 47.22534942626953, + "learning_rate": 4.7724345502166435e-06, + "loss": 0.2851, + "num_input_tokens_seen": 458392, + "step": 315 + }, + { + "epoch": 1.1387900355871885, + "grad_norm": 22.309940338134766, + "learning_rate": 4.759308633236934e-06, + "loss": 0.0427, + "num_input_tokens_seen": 465112, + "step": 320 + }, + { + "epoch": 1.1565836298932384, + "grad_norm": 0.2836686968803406, + "learning_rate": 4.74583380452027e-06, + "loss": 0.0369, + "num_input_tokens_seen": 472216, + "step": 325 + }, + { + "epoch": 1.1743772241992882, + "grad_norm": 20.73933219909668, + "learning_rate": 4.7320121450244395e-06, + "loss": 0.1237, + "num_input_tokens_seen": 479576, + "step": 330 + }, + { + "epoch": 1.1921708185053381, + "grad_norm": 0.6483802199363708, + "learning_rate": 4.717845789269333e-06, + "loss": 0.0822, + "num_input_tokens_seen": 486552, + "step": 335 + }, + { + "epoch": 1.209964412811388, + "grad_norm": 10.158080101013184, + "learning_rate": 4.703336925007311e-06, + "loss": 0.0835, + "num_input_tokens_seen": 494616, + "step": 340 + }, + { + "epoch": 1.2277580071174377, + "grad_norm": 22.884260177612305, + "learning_rate": 4.68848779288534e-06, + "loss": 0.0413, + "num_input_tokens_seen": 501400, + "step": 345 + }, + { + "epoch": 1.2455516014234875, + "grad_norm": 0.23994407057762146, + "learning_rate": 4.673300686098957e-06, + "loss": 0.0284, + "num_input_tokens_seen": 508888, + "step": 350 + }, + { + "epoch": 1.2633451957295374, + "grad_norm": 28.652849197387695, + "learning_rate": 4.657777950038133e-06, + "loss": 0.1848, + "num_input_tokens_seen": 517656, + "step": 355 + }, + { + "epoch": 1.2633451957295374, + "eval_loss": 0.21280953288078308, + "eval_runtime": 0.6154, + "eval_samples_per_second": 404.643, + "eval_steps_per_second": 52.002, + "num_input_tokens_seen": 517656, + "step": 355 + }, + { + "epoch": 1.281138790035587, + "grad_norm": 29.27073860168457, + "learning_rate": 4.641921981925064e-06, + "loss": 0.0684, + "num_input_tokens_seen": 526232, + "step": 360 + }, + { + "epoch": 1.298932384341637, + "grad_norm": 6.552826404571533, + "learning_rate": 4.625735230443959e-06, + "loss": 0.0556, + "num_input_tokens_seen": 533400, + "step": 365 + }, + { + "epoch": 1.3167259786476868, + "grad_norm": 19.556528091430664, + "learning_rate": 4.609220195362886e-06, + "loss": 0.1059, + "num_input_tokens_seen": 542168, + "step": 370 + }, + { + "epoch": 1.3345195729537367, + "grad_norm": 0.45224103331565857, + "learning_rate": 4.592379427147722e-06, + "loss": 0.0525, + "num_input_tokens_seen": 549976, + "step": 375 + }, + { + "epoch": 1.3523131672597866, + "grad_norm": 7.306344032287598, + "learning_rate": 4.575215526568278e-06, + "loss": 0.118, + "num_input_tokens_seen": 557016, + "step": 380 + }, + { + "epoch": 1.3701067615658362, + "grad_norm": 17.069351196289062, + "learning_rate": 4.557731144296659e-06, + "loss": 0.0831, + "num_input_tokens_seen": 564504, + "step": 385 + }, + { + "epoch": 1.387900355871886, + "grad_norm": 2.3357198238372803, + "learning_rate": 4.539928980497903e-06, + "loss": 0.0283, + "num_input_tokens_seen": 571864, + "step": 390 + }, + { + "epoch": 1.405693950177936, + "grad_norm": 18.61362648010254, + "learning_rate": 4.521811784412996e-06, + "loss": 0.0483, + "num_input_tokens_seen": 578456, + "step": 395 + }, + { + "epoch": 1.4234875444839858, + "grad_norm": 51.09469223022461, + "learning_rate": 4.503382353934295e-06, + "loss": 0.1155, + "num_input_tokens_seen": 584600, + "step": 400 + }, + { + "epoch": 1.4412811387900355, + "grad_norm": 0.02246745675802231, + "learning_rate": 4.484643535173438e-06, + "loss": 0.0086, + "num_input_tokens_seen": 591128, + "step": 405 + }, + { + "epoch": 1.4590747330960854, + "grad_norm": 0.0882764533162117, + "learning_rate": 4.465598222021818e-06, + "loss": 0.0969, + "num_input_tokens_seen": 598552, + "step": 410 + }, + { + "epoch": 1.4768683274021353, + "grad_norm": 33.67585372924805, + "learning_rate": 4.446249355703661e-06, + "loss": 0.0715, + "num_input_tokens_seen": 607320, + "step": 415 + }, + { + "epoch": 1.4946619217081851, + "grad_norm": 24.378616333007812, + "learning_rate": 4.426599924321815e-06, + "loss": 0.094, + "num_input_tokens_seen": 614744, + "step": 420 + }, + { + "epoch": 1.512455516014235, + "grad_norm": 9.61978530883789, + "learning_rate": 4.406652962396278e-06, + "loss": 0.0306, + "num_input_tokens_seen": 622808, + "step": 425 + }, + { + "epoch": 1.5160142348754448, + "eval_loss": 0.17913997173309326, + "eval_runtime": 0.6176, + "eval_samples_per_second": 403.205, + "eval_steps_per_second": 51.817, + "num_input_tokens_seen": 624344, + "step": 426 + }, + { + "epoch": 1.5302491103202847, + "grad_norm": 18.067750930786133, + "learning_rate": 4.386411550395576e-06, + "loss": 0.103, + "num_input_tokens_seen": 630488, + "step": 430 + }, + { + "epoch": 1.5480427046263345, + "grad_norm": 2.518416404724121, + "learning_rate": 4.365878814261032e-06, + "loss": 0.0775, + "num_input_tokens_seen": 638424, + "step": 435 + }, + { + "epoch": 1.5658362989323842, + "grad_norm": 17.197315216064453, + "learning_rate": 4.34505792492402e-06, + "loss": 0.0767, + "num_input_tokens_seen": 645208, + "step": 440 + }, + { + "epoch": 1.583629893238434, + "grad_norm": 1.7272682189941406, + "learning_rate": 4.3239520978162685e-06, + "loss": 0.013, + "num_input_tokens_seen": 653016, + "step": 445 + }, + { + "epoch": 1.601423487544484, + "grad_norm": 2.9054439067840576, + "learning_rate": 4.302564592373293e-06, + "loss": 0.0129, + "num_input_tokens_seen": 659992, + "step": 450 + }, + { + "epoch": 1.6192170818505338, + "grad_norm": 0.1965407431125641, + "learning_rate": 4.280898711531026e-06, + "loss": 0.1234, + "num_input_tokens_seen": 667224, + "step": 455 + }, + { + "epoch": 1.6370106761565837, + "grad_norm": 1.4352664947509766, + "learning_rate": 4.258957801215743e-06, + "loss": 0.1334, + "num_input_tokens_seen": 675160, + "step": 460 + }, + { + "epoch": 1.6548042704626336, + "grad_norm": 7.041714191436768, + "learning_rate": 4.236745249827336e-06, + "loss": 0.1297, + "num_input_tokens_seen": 683544, + "step": 465 + }, + { + "epoch": 1.6725978647686834, + "grad_norm": 22.762420654296875, + "learning_rate": 4.2142644877160334e-06, + "loss": 0.0334, + "num_input_tokens_seen": 689368, + "step": 470 + }, + { + "epoch": 1.690391459074733, + "grad_norm": 15.38829231262207, + "learning_rate": 4.191518986652642e-06, + "loss": 0.0779, + "num_input_tokens_seen": 695832, + "step": 475 + }, + { + "epoch": 1.708185053380783, + "grad_norm": 5.282833576202393, + "learning_rate": 4.168512259292391e-06, + "loss": 0.0085, + "num_input_tokens_seen": 703128, + "step": 480 + }, + { + "epoch": 1.7259786476868326, + "grad_norm": 0.2967665493488312, + "learning_rate": 4.14524785863246e-06, + "loss": 0.0617, + "num_input_tokens_seen": 709528, + "step": 485 + }, + { + "epoch": 1.7437722419928825, + "grad_norm": 18.050338745117188, + "learning_rate": 4.121729377463285e-06, + "loss": 0.0537, + "num_input_tokens_seen": 716312, + "step": 490 + }, + { + "epoch": 1.7615658362989324, + "grad_norm": 34.60630416870117, + "learning_rate": 4.0979604478137045e-06, + "loss": 0.1029, + "num_input_tokens_seen": 722776, + "step": 495 + }, + { + "epoch": 1.7686832740213523, + "eval_loss": 0.13597266376018524, + "eval_runtime": 0.6055, + "eval_samples_per_second": 411.239, + "eval_steps_per_second": 52.85, + "num_input_tokens_seen": 725656, + "step": 497 + }, + { + "epoch": 1.7793594306049823, + "grad_norm": 19.958894729614258, + "learning_rate": 4.0739447403900605e-06, + "loss": 0.1142, + "num_input_tokens_seen": 729944, + "step": 500 + }, + { + "epoch": 1.7971530249110321, + "grad_norm": 13.6007719039917, + "learning_rate": 4.0496859640093215e-06, + "loss": 0.0837, + "num_input_tokens_seen": 737112, + "step": 505 + }, + { + "epoch": 1.814946619217082, + "grad_norm": 2.8377349376678467, + "learning_rate": 4.025187865026311e-06, + "loss": 0.0124, + "num_input_tokens_seen": 744408, + "step": 510 + }, + { + "epoch": 1.8327402135231317, + "grad_norm": 4.554283618927002, + "learning_rate": 4.0004542267551585e-06, + "loss": 0.059, + "num_input_tokens_seen": 750488, + "step": 515 + }, + { + "epoch": 1.8505338078291815, + "grad_norm": 20.501800537109375, + "learning_rate": 3.975488868885022e-06, + "loss": 0.0662, + "num_input_tokens_seen": 757528, + "step": 520 + }, + { + "epoch": 1.8683274021352312, + "grad_norm": 37.35130310058594, + "learning_rate": 3.950295646890202e-06, + "loss": 0.0299, + "num_input_tokens_seen": 763736, + "step": 525 + }, + { + "epoch": 1.886120996441281, + "grad_norm": 39.31193161010742, + "learning_rate": 3.924878451434736e-06, + "loss": 0.0666, + "num_input_tokens_seen": 771864, + "step": 530 + }, + { + "epoch": 1.903914590747331, + "grad_norm": 1.3388175964355469, + "learning_rate": 3.899241207771546e-06, + "loss": 0.072, + "num_input_tokens_seen": 778712, + "step": 535 + }, + { + "epoch": 1.9217081850533808, + "grad_norm": 12.562256813049316, + "learning_rate": 3.873387875136252e-06, + "loss": 0.0475, + "num_input_tokens_seen": 784280, + "step": 540 + }, + { + "epoch": 1.9395017793594307, + "grad_norm": 20.693452835083008, + "learning_rate": 3.847322446135736e-06, + "loss": 0.1443, + "num_input_tokens_seen": 792280, + "step": 545 + }, + { + "epoch": 1.9572953736654806, + "grad_norm": 10.482217788696289, + "learning_rate": 3.821048946131549e-06, + "loss": 0.1501, + "num_input_tokens_seen": 798488, + "step": 550 + }, + { + "epoch": 1.9750889679715302, + "grad_norm": 0.7575608491897583, + "learning_rate": 3.794571432618267e-06, + "loss": 0.0502, + "num_input_tokens_seen": 806104, + "step": 555 + }, + { + "epoch": 1.99288256227758, + "grad_norm": 13.330556869506836, + "learning_rate": 3.767893994596876e-06, + "loss": 0.0142, + "num_input_tokens_seen": 813336, + "step": 560 + }, + { + "epoch": 2.0106761565836297, + "grad_norm": 0.28736042976379395, + "learning_rate": 3.7410207519432972e-06, + "loss": 0.1868, + "num_input_tokens_seen": 817576, + "step": 565 + }, + { + "epoch": 2.02135231316726, + "eval_loss": 0.16060441732406616, + "eval_runtime": 0.6313, + "eval_samples_per_second": 394.424, + "eval_steps_per_second": 50.689, + "num_input_tokens_seen": 821416, + "step": 568 + }, + { + "epoch": 2.0284697508896796, + "grad_norm": 35.438270568847656, + "learning_rate": 3.713955854772144e-06, + "loss": 0.0138, + "num_input_tokens_seen": 823848, + "step": 570 + }, + { + "epoch": 2.0462633451957295, + "grad_norm": 23.84980010986328, + "learning_rate": 3.686703482795802e-06, + "loss": 0.1069, + "num_input_tokens_seen": 832232, + "step": 575 + }, + { + "epoch": 2.0640569395017794, + "grad_norm": 0.2273331582546234, + "learning_rate": 3.6592678446789516e-06, + "loss": 0.042, + "num_input_tokens_seen": 840424, + "step": 580 + }, + { + "epoch": 2.0818505338078293, + "grad_norm": 0.2911248505115509, + "learning_rate": 3.631653177388605e-06, + "loss": 0.0325, + "num_input_tokens_seen": 846824, + "step": 585 + }, + { + "epoch": 2.099644128113879, + "grad_norm": 0.03189073130488396, + "learning_rate": 3.6038637455397802e-06, + "loss": 0.003, + "num_input_tokens_seen": 853608, + "step": 590 + }, + { + "epoch": 2.117437722419929, + "grad_norm": 0.06126879155635834, + "learning_rate": 3.575903840736906e-06, + "loss": 0.1002, + "num_input_tokens_seen": 860968, + "step": 595 + }, + { + "epoch": 2.135231316725979, + "grad_norm": 0.12086429446935654, + "learning_rate": 3.547777780911055e-06, + "loss": 0.0251, + "num_input_tokens_seen": 868904, + "step": 600 + }, + { + "epoch": 2.1530249110320283, + "grad_norm": 0.013016794808208942, + "learning_rate": 3.519489909653113e-06, + "loss": 0.0005, + "num_input_tokens_seen": 876072, + "step": 605 + }, + { + "epoch": 2.170818505338078, + "grad_norm": 41.4961051940918, + "learning_rate": 3.4910445955429856e-06, + "loss": 0.0155, + "num_input_tokens_seen": 883752, + "step": 610 + }, + { + "epoch": 2.188612099644128, + "grad_norm": 0.10037070512771606, + "learning_rate": 3.4624462314749447e-06, + "loss": 0.0004, + "num_input_tokens_seen": 891304, + "step": 615 + }, + { + "epoch": 2.206405693950178, + "grad_norm": 0.04163924232125282, + "learning_rate": 3.433699233979222e-06, + "loss": 0.0163, + "num_input_tokens_seen": 899176, + "step": 620 + }, + { + "epoch": 2.224199288256228, + "grad_norm": 0.09111510962247849, + "learning_rate": 3.4048080425399506e-06, + "loss": 0.0001, + "num_input_tokens_seen": 907560, + "step": 625 + }, + { + "epoch": 2.2419928825622777, + "grad_norm": 0.023458287119865417, + "learning_rate": 3.375777118909561e-06, + "loss": 0.0027, + "num_input_tokens_seen": 915240, + "step": 630 + }, + { + "epoch": 2.2597864768683276, + "grad_norm": 0.007639422547072172, + "learning_rate": 3.346610946419743e-06, + "loss": 0.0259, + "num_input_tokens_seen": 921384, + "step": 635 + }, + { + "epoch": 2.2740213523131674, + "eval_loss": 0.25424328446388245, + "eval_runtime": 0.622, + "eval_samples_per_second": 400.338, + "eval_steps_per_second": 51.449, + "num_input_tokens_seen": 926760, + "step": 639 + }, + { + "epoch": 2.277580071174377, + "grad_norm": 0.02619522623717785, + "learning_rate": 3.3173140292890673e-06, + "loss": 0.0233, + "num_input_tokens_seen": 927528, + "step": 640 + }, + { + "epoch": 2.295373665480427, + "grad_norm": 0.031525298953056335, + "learning_rate": 3.2878908919273867e-06, + "loss": 0.0266, + "num_input_tokens_seen": 934568, + "step": 645 + }, + { + "epoch": 2.3131672597864767, + "grad_norm": 0.08830317109823227, + "learning_rate": 3.2583460782371217e-06, + "loss": 0.0006, + "num_input_tokens_seen": 942248, + "step": 650 + }, + { + "epoch": 2.3309608540925266, + "grad_norm": 0.005108945071697235, + "learning_rate": 3.228684150911527e-06, + "loss": 0.0003, + "num_input_tokens_seen": 949096, + "step": 655 + }, + { + "epoch": 2.3487544483985765, + "grad_norm": 0.04930766299366951, + "learning_rate": 3.1989096907300634e-06, + "loss": 0.0082, + "num_input_tokens_seen": 955752, + "step": 660 + }, + { + "epoch": 2.3665480427046264, + "grad_norm": 0.06299162656068802, + "learning_rate": 3.1690272958509772e-06, + "loss": 0.0486, + "num_input_tokens_seen": 963176, + "step": 665 + }, + { + "epoch": 2.3843416370106763, + "grad_norm": 0.6633197665214539, + "learning_rate": 3.139041581101187e-06, + "loss": 0.0001, + "num_input_tokens_seen": 968232, + "step": 670 + }, + { + "epoch": 2.402135231316726, + "grad_norm": 59.2174186706543, + "learning_rate": 3.108957177263608e-06, + "loss": 0.0222, + "num_input_tokens_seen": 976552, + "step": 675 + }, + { + "epoch": 2.419928825622776, + "grad_norm": 0.0013539546635001898, + "learning_rate": 3.078778730362003e-06, + "loss": 0.0075, + "num_input_tokens_seen": 983720, + "step": 680 + }, + { + "epoch": 2.4377224199288254, + "grad_norm": 0.01872558705508709, + "learning_rate": 3.0485109009434844e-06, + "loss": 0.0003, + "num_input_tokens_seen": 991976, + "step": 685 + }, + { + "epoch": 2.4555160142348753, + "grad_norm": 0.06228525564074516, + "learning_rate": 3.018158363358773e-06, + "loss": 0.061, + "num_input_tokens_seen": 998184, + "step": 690 + }, + { + "epoch": 2.473309608540925, + "grad_norm": 0.015886032953858376, + "learning_rate": 2.9877258050403214e-06, + "loss": 0.0, + "num_input_tokens_seen": 1005672, + "step": 695 + }, + { + "epoch": 2.491103202846975, + "grad_norm": 0.49660900235176086, + "learning_rate": 2.9572179257784215e-06, + "loss": 0.0152, + "num_input_tokens_seen": 1013096, + "step": 700 + }, + { + "epoch": 2.508896797153025, + "grad_norm": 0.01401636004447937, + "learning_rate": 2.9266394369954056e-06, + "loss": 0.0006, + "num_input_tokens_seen": 1019304, + "step": 705 + }, + { + "epoch": 2.526690391459075, + "grad_norm": 0.011517788283526897, + "learning_rate": 2.8959950610180376e-06, + "loss": 0.029, + "num_input_tokens_seen": 1025320, + "step": 710 + }, + { + "epoch": 2.526690391459075, + "eval_loss": 0.23608553409576416, + "eval_runtime": 0.6224, + "eval_samples_per_second": 400.086, + "eval_steps_per_second": 51.417, + "num_input_tokens_seen": 1025320, + "step": 710 + }, + { + "epoch": 2.5444839857651247, + "grad_norm": 0.09000097960233688, + "learning_rate": 2.865289530348243e-06, + "loss": 0.0, + "num_input_tokens_seen": 1032552, + "step": 715 + }, + { + "epoch": 2.562277580071174, + "grad_norm": 0.0029357271268963814, + "learning_rate": 2.8345275869322432e-06, + "loss": 0.0, + "num_input_tokens_seen": 1039912, + "step": 720 + }, + { + "epoch": 2.580071174377224, + "grad_norm": 0.00977697316557169, + "learning_rate": 2.8037139814282494e-06, + "loss": 0.0092, + "num_input_tokens_seen": 1047208, + "step": 725 + }, + { + "epoch": 2.597864768683274, + "grad_norm": 0.03362439572811127, + "learning_rate": 2.7728534724728027e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1053928, + "step": 730 + }, + { + "epoch": 2.6156583629893237, + "grad_norm": 109.98116302490234, + "learning_rate": 2.741950825945881e-06, + "loss": 0.0806, + "num_input_tokens_seen": 1061608, + "step": 735 + }, + { + "epoch": 2.6334519572953736, + "grad_norm": 0.026239968836307526, + "learning_rate": 2.7110108142348962e-06, + "loss": 0.0747, + "num_input_tokens_seen": 1067560, + "step": 740 + }, + { + "epoch": 2.6512455516014235, + "grad_norm": 0.13907551765441895, + "learning_rate": 2.6800382154976734e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1074152, + "step": 745 + }, + { + "epoch": 2.6690391459074734, + "grad_norm": 0.02416655234992504, + "learning_rate": 2.64903781292455e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1082856, + "step": 750 + }, + { + "epoch": 2.6868327402135233, + "grad_norm": 1.5046820640563965, + "learning_rate": 2.6180143939996926e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1089512, + "step": 755 + }, + { + "epoch": 2.704626334519573, + "grad_norm": 0.027851078659296036, + "learning_rate": 2.5869727497617495e-06, + "loss": 0.0433, + "num_input_tokens_seen": 1096232, + "step": 760 + }, + { + "epoch": 2.722419928825623, + "grad_norm": 0.0055125863291323185, + "learning_rate": 2.55591767406396e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1104168, + "step": 765 + }, + { + "epoch": 2.7402135231316724, + "grad_norm": 26.093463897705078, + "learning_rate": 2.524853962833825e-06, + "loss": 0.1194, + "num_input_tokens_seen": 1112232, + "step": 770 + }, + { + "epoch": 2.7580071174377223, + "grad_norm": 0.31872352957725525, + "learning_rate": 2.4937864133324514e-06, + "loss": 0.0031, + "num_input_tokens_seen": 1119016, + "step": 775 + }, + { + "epoch": 2.775800711743772, + "grad_norm": 0.7696136832237244, + "learning_rate": 2.462719823413707e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1126696, + "step": 780 + }, + { + "epoch": 2.7793594306049823, + "eval_loss": 0.23524385690689087, + "eval_runtime": 1.141, + "eval_samples_per_second": 218.224, + "eval_steps_per_second": 28.045, + "num_input_tokens_seen": 1128104, + "step": 781 + }, + { + "epoch": 2.793594306049822, + "grad_norm": 0.014183886349201202, + "learning_rate": 2.4316589907832654e-06, + "loss": 0.0423, + "num_input_tokens_seen": 1134184, + "step": 785 + }, + { + "epoch": 2.811387900355872, + "grad_norm": 0.04211263358592987, + "learning_rate": 2.4006087122576867e-06, + "loss": 0.001, + "num_input_tokens_seen": 1140392, + "step": 790 + }, + { + "epoch": 2.829181494661922, + "grad_norm": 0.09497106820344925, + "learning_rate": 2.3695737830236263e-06, + "loss": 0.031, + "num_input_tokens_seen": 1148328, + "step": 795 + }, + { + "epoch": 2.8469750889679717, + "grad_norm": 0.040660299360752106, + "learning_rate": 2.3385589958973073e-06, + "loss": 0.0007, + "num_input_tokens_seen": 1154024, + "step": 800 + }, + { + "epoch": 2.864768683274021, + "grad_norm": 0.06477546691894531, + "learning_rate": 2.3075691405843435e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1160808, + "step": 805 + }, + { + "epoch": 2.882562277580071, + "grad_norm": 0.022568199783563614, + "learning_rate": 2.2766090029400573e-06, + "loss": 0.0299, + "num_input_tokens_seen": 1167912, + "step": 810 + }, + { + "epoch": 2.900355871886121, + "grad_norm": 0.012872009538114071, + "learning_rate": 2.2456833642303825e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1174568, + "step": 815 + }, + { + "epoch": 2.9181494661921707, + "grad_norm": 0.0069219921715557575, + "learning_rate": 2.214797000393479e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1181480, + "step": 820 + }, + { + "epoch": 2.9359430604982206, + "grad_norm": 74.77655792236328, + "learning_rate": 2.183954681302173e-06, + "loss": 0.0251, + "num_input_tokens_seen": 1189928, + "step": 825 + }, + { + "epoch": 2.9537366548042705, + "grad_norm": 0.00504131056368351, + "learning_rate": 2.15316117002733e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1197480, + "step": 830 + }, + { + "epoch": 2.9715302491103204, + "grad_norm": 0.014680106192827225, + "learning_rate": 2.122421222102278e-06, + "loss": 0.039, + "num_input_tokens_seen": 1204584, + "step": 835 + }, + { + "epoch": 2.9893238434163703, + "grad_norm": 32.642799377441406, + "learning_rate": 2.0917395847884e-06, + "loss": 0.0311, + "num_input_tokens_seen": 1212584, + "step": 840 + }, + { + "epoch": 3.00711743772242, + "grad_norm": 55.226280212402344, + "learning_rate": 2.061120996341996e-06, + "loss": 0.0104, + "num_input_tokens_seen": 1217856, + "step": 845 + }, + { + "epoch": 3.0249110320284696, + "grad_norm": 0.005825079046189785, + "learning_rate": 2.030570185282544e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1226624, + "step": 850 + }, + { + "epoch": 3.0320284697508897, + "eval_loss": 0.25802624225616455, + "eval_runtime": 0.6232, + "eval_samples_per_second": 399.564, + "eval_steps_per_second": 51.35, + "num_input_tokens_seen": 1229440, + "step": 852 + }, + { + "epoch": 3.0427046263345194, + "grad_norm": 0.0032935605850070715, + "learning_rate": 2.0000918696624587e-06, + "loss": 0.0, + "num_input_tokens_seen": 1233152, + "step": 855 + }, + { + "epoch": 3.0604982206405693, + "grad_norm": 0.024462368339300156, + "learning_rate": 1.9696907563384687e-06, + "loss": 0.0, + "num_input_tokens_seen": 1240128, + "step": 860 + }, + { + "epoch": 3.078291814946619, + "grad_norm": 0.004567572381347418, + "learning_rate": 1.9393715402447228e-06, + "loss": 0.0, + "num_input_tokens_seen": 1248064, + "step": 865 + }, + { + "epoch": 3.096085409252669, + "grad_norm": 0.008781103417277336, + "learning_rate": 1.9091389036677384e-06, + "loss": 0.0, + "num_input_tokens_seen": 1255232, + "step": 870 + }, + { + "epoch": 3.113879003558719, + "grad_norm": 0.003714508144184947, + "learning_rate": 1.878997515523299e-06, + "loss": 0.0486, + "num_input_tokens_seen": 1262272, + "step": 875 + }, + { + "epoch": 3.131672597864769, + "grad_norm": 0.0036656211595982313, + "learning_rate": 1.8489520306354243e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1269632, + "step": 880 + }, + { + "epoch": 3.1494661921708187, + "grad_norm": 0.06997384876012802, + "learning_rate": 1.8190070890175082e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1277312, + "step": 885 + }, + { + "epoch": 3.167259786476868, + "grad_norm": 0.0681416317820549, + "learning_rate": 1.7891673151557493e-06, + "loss": 0.0502, + "num_input_tokens_seen": 1284096, + "step": 890 + }, + { + "epoch": 3.185053380782918, + "grad_norm": 0.008159984834492207, + "learning_rate": 1.7594373172949786e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1291648, + "step": 895 + }, + { + "epoch": 3.202846975088968, + "grad_norm": 0.008854905143380165, + "learning_rate": 1.7298216867269906e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1299712, + "step": 900 + }, + { + "epoch": 3.2206405693950177, + "grad_norm": 0.016451064497232437, + "learning_rate": 1.7003249970815028e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1306176, + "step": 905 + }, + { + "epoch": 3.2384341637010676, + "grad_norm": 0.010798790492117405, + "learning_rate": 1.6709518036198307e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1314112, + "step": 910 + }, + { + "epoch": 3.2562277580071175, + "grad_norm": 13.17015266418457, + "learning_rate": 1.6417066425314088e-06, + "loss": 0.0251, + "num_input_tokens_seen": 1321088, + "step": 915 + }, + { + "epoch": 3.2740213523131674, + "grad_norm": 0.006120134145021439, + "learning_rate": 1.612594030233252e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1328512, + "step": 920 + }, + { + "epoch": 3.284697508896797, + "eval_loss": 0.22950421273708344, + "eval_runtime": 0.6162, + "eval_samples_per_second": 404.1, + "eval_steps_per_second": 51.932, + "num_input_tokens_seen": 1332544, + "step": 923 + }, + { + "epoch": 3.2918149466192173, + "grad_norm": 0.008929496631026268, + "learning_rate": 1.5836184626724722e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1336128, + "step": 925 + }, + { + "epoch": 3.309608540925267, + "grad_norm": 0.004387512803077698, + "learning_rate": 1.5547844146319547e-06, + "loss": 0.0, + "num_input_tokens_seen": 1343552, + "step": 930 + }, + { + "epoch": 3.3274021352313166, + "grad_norm": 0.011066491715610027, + "learning_rate": 1.5260963390393075e-06, + "loss": 0.0383, + "num_input_tokens_seen": 1351552, + "step": 935 + }, + { + "epoch": 3.3451957295373664, + "grad_norm": 0.006294109858572483, + "learning_rate": 1.4975586662791783e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1358272, + "step": 940 + }, + { + "epoch": 3.3629893238434163, + "grad_norm": 0.01710674725472927, + "learning_rate": 1.4691758035090603e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1366784, + "step": 945 + }, + { + "epoch": 3.380782918149466, + "grad_norm": 0.006381936836987734, + "learning_rate": 1.4409521339786809e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1373312, + "step": 950 + }, + { + "epoch": 3.398576512455516, + "grad_norm": 0.012110439129173756, + "learning_rate": 1.41289201635308e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1380736, + "step": 955 + }, + { + "epoch": 3.416370106761566, + "grad_norm": 0.03375524654984474, + "learning_rate": 1.3849997840394943e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1388544, + "step": 960 + }, + { + "epoch": 3.434163701067616, + "grad_norm": 0.010791816748678684, + "learning_rate": 1.3572797445181346e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1396160, + "step": 965 + }, + { + "epoch": 3.4519572953736652, + "grad_norm": 0.006806841120123863, + "learning_rate": 1.3297361786769654e-06, + "loss": 0.0, + "num_input_tokens_seen": 1404096, + "step": 970 + }, + { + "epoch": 3.469750889679715, + "grad_norm": 0.006986913271248341, + "learning_rate": 1.302373340150598e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1411008, + "step": 975 + }, + { + "epoch": 3.487544483985765, + "grad_norm": 0.006954096723347902, + "learning_rate": 1.2751954546633872e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1418880, + "step": 980 + }, + { + "epoch": 3.505338078291815, + "grad_norm": 0.0037973152939230204, + "learning_rate": 1.2482067193768419e-06, + "loss": 0.0, + "num_input_tokens_seen": 1426048, + "step": 985 + }, + { + "epoch": 3.5231316725978647, + "grad_norm": 0.1800023317337036, + "learning_rate": 1.2214113022414448e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1432064, + "step": 990 + }, + { + "epoch": 3.5373665480427046, + "eval_loss": 0.24046999216079712, + "eval_runtime": 0.6864, + "eval_samples_per_second": 362.784, + "eval_steps_per_second": 46.623, + "num_input_tokens_seen": 1438336, + "step": 994 + }, + { + "epoch": 3.5409252669039146, + "grad_norm": 0.008008265867829323, + "learning_rate": 1.1948133413529817e-06, + "loss": 0.0, + "num_input_tokens_seen": 1439808, + "step": 995 + }, + { + "epoch": 3.5587188612099645, + "grad_norm": 0.004162625875324011, + "learning_rate": 1.168416944313486e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1447616, + "step": 1000 + }, + { + "epoch": 3.5765124555160144, + "grad_norm": 0.0032155588269233704, + "learning_rate": 1.1422261875968845e-06, + "loss": 0.0, + "num_input_tokens_seen": 1454208, + "step": 1005 + }, + { + "epoch": 3.5943060498220643, + "grad_norm": 0.007771800272166729, + "learning_rate": 1.1162451159194615e-06, + "loss": 0.0, + "num_input_tokens_seen": 1463296, + "step": 1010 + }, + { + "epoch": 3.612099644128114, + "grad_norm": 0.12433820962905884, + "learning_rate": 1.0904777416152166e-06, + "loss": 0.0009, + "num_input_tokens_seen": 1469952, + "step": 1015 + }, + { + "epoch": 3.6298932384341636, + "grad_norm": 0.003224864834919572, + "learning_rate": 1.0649280440162326e-06, + "loss": 0.0, + "num_input_tokens_seen": 1477184, + "step": 1020 + }, + { + "epoch": 3.6476868327402134, + "grad_norm": 0.00848240777850151, + "learning_rate": 1.0395999688381313e-06, + "loss": 0.0, + "num_input_tokens_seen": 1484160, + "step": 1025 + }, + { + "epoch": 3.6654804270462633, + "grad_norm": 0.005719688255339861, + "learning_rate": 1.0144974275707243e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1491200, + "step": 1030 + }, + { + "epoch": 3.683274021352313, + "grad_norm": 0.0028636339120566845, + "learning_rate": 9.896242968739538e-07, + "loss": 0.0, + "num_input_tokens_seen": 1498368, + "step": 1035 + }, + { + "epoch": 3.701067615658363, + "grad_norm": 0.0037352785002440214, + "learning_rate": 9.649844179792082e-07, + "loss": 0.0, + "num_input_tokens_seen": 1505984, + "step": 1040 + }, + { + "epoch": 3.718861209964413, + "grad_norm": 0.0014299631584435701, + "learning_rate": 9.405815960961054e-07, + "loss": 0.0, + "num_input_tokens_seen": 1511680, + "step": 1045 + }, + { + "epoch": 3.7366548042704624, + "grad_norm": 0.002295982325449586, + "learning_rate": 9.164195998248471e-07, + "loss": 0.0, + "num_input_tokens_seen": 1517888, + "step": 1050 + }, + { + "epoch": 3.7544483985765122, + "grad_norm": 0.0017496125074103475, + "learning_rate": 8.925021605742212e-07, + "loss": 0.0109, + "num_input_tokens_seen": 1525568, + "step": 1055 + }, + { + "epoch": 3.772241992882562, + "grad_norm": 0.0030260924249887466, + "learning_rate": 8.68832971985347e-07, + "loss": 0.0, + "num_input_tokens_seen": 1532480, + "step": 1060 + }, + { + "epoch": 3.790035587188612, + "grad_norm": 0.0027729899156838655, + "learning_rate": 8.454156893612592e-07, + "loss": 0.0, + "num_input_tokens_seen": 1539072, + "step": 1065 + }, + { + "epoch": 3.790035587188612, + "eval_loss": 0.2512344419956207, + "eval_runtime": 0.6676, + "eval_samples_per_second": 372.986, + "eval_steps_per_second": 47.934, + "num_input_tokens_seen": 1539072, + "step": 1065 + }, + { + "epoch": 3.807829181494662, + "grad_norm": 0.010402582585811615, + "learning_rate": 8.222539291024079e-07, + "loss": 0.0, + "num_input_tokens_seen": 1547584, + "step": 1070 + }, + { + "epoch": 3.8256227758007118, + "grad_norm": 0.0076858168467879295, + "learning_rate": 7.993512681481638e-07, + "loss": 0.0, + "num_input_tokens_seen": 1554304, + "step": 1075 + }, + { + "epoch": 3.8434163701067616, + "grad_norm": 0.005163070745766163, + "learning_rate": 7.767112434244254e-07, + "loss": 0.0, + "num_input_tokens_seen": 1560896, + "step": 1080 + }, + { + "epoch": 3.8612099644128115, + "grad_norm": 0.007071156986057758, + "learning_rate": 7.543373512973947e-07, + "loss": 0.0, + "num_input_tokens_seen": 1567744, + "step": 1085 + }, + { + "epoch": 3.8790035587188614, + "grad_norm": 0.004036191385239363, + "learning_rate": 7.322330470336314e-07, + "loss": 0.032, + "num_input_tokens_seen": 1574400, + "step": 1090 + }, + { + "epoch": 3.8967971530249113, + "grad_norm": 0.001716184546239674, + "learning_rate": 7.104017442664393e-07, + "loss": 0.0187, + "num_input_tokens_seen": 1581504, + "step": 1095 + }, + { + "epoch": 3.914590747330961, + "grad_norm": 0.03370984271168709, + "learning_rate": 6.88846814468691e-07, + "loss": 0.0, + "num_input_tokens_seen": 1589504, + "step": 1100 + }, + { + "epoch": 3.9323843416370106, + "grad_norm": 19.512357711791992, + "learning_rate": 6.67571586432163e-07, + "loss": 0.0369, + "num_input_tokens_seen": 1597696, + "step": 1105 + }, + { + "epoch": 3.9501779359430604, + "grad_norm": 0.0021878909319639206, + "learning_rate": 6.465793457534553e-07, + "loss": 0.0, + "num_input_tokens_seen": 1605248, + "step": 1110 + }, + { + "epoch": 3.9679715302491103, + "grad_norm": 0.0022776706609874964, + "learning_rate": 6.258733343265933e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1613952, + "step": 1115 + }, + { + "epoch": 3.98576512455516, + "grad_norm": 0.010679907165467739, + "learning_rate": 6.054567498423683e-07, + "loss": 0.0, + "num_input_tokens_seen": 1620224, + "step": 1120 + }, + { + "epoch": 4.00355871886121, + "grad_norm": 0.003633465152233839, + "learning_rate": 5.853327452945115e-07, + "loss": 0.0, + "num_input_tokens_seen": 1625800, + "step": 1125 + }, + { + "epoch": 4.0213523131672595, + "grad_norm": 0.0048196627758443356, + "learning_rate": 5.655044284927658e-07, + "loss": 0.0, + "num_input_tokens_seen": 1633352, + "step": 1130 + }, + { + "epoch": 4.039145907473309, + "grad_norm": 0.0030679679475724697, + "learning_rate": 5.459748615829355e-07, + "loss": 0.0, + "num_input_tokens_seen": 1640840, + "step": 1135 + }, + { + "epoch": 4.04270462633452, + "eval_loss": 0.2551669180393219, + "eval_runtime": 1.6934, + "eval_samples_per_second": 147.043, + "eval_steps_per_second": 18.897, + "num_input_tokens_seen": 1642696, + "step": 1136 + }, + { + "epoch": 4.056939501779359, + "grad_norm": 0.00447492441162467, + "learning_rate": 5.267470605739953e-07, + "loss": 0.0, + "num_input_tokens_seen": 1648520, + "step": 1140 + }, + { + "epoch": 4.074733096085409, + "grad_norm": 0.0019011611584573984, + "learning_rate": 5.078239948723154e-07, + "loss": 0.0, + "num_input_tokens_seen": 1655752, + "step": 1145 + }, + { + "epoch": 4.092526690391459, + "grad_norm": 0.005783023778349161, + "learning_rate": 4.892085868230881e-07, + "loss": 0.0, + "num_input_tokens_seen": 1662920, + "step": 1150 + }, + { + "epoch": 4.110320284697509, + "grad_norm": 0.0052789985202252865, + "learning_rate": 4.7090371125902175e-07, + "loss": 0.0, + "num_input_tokens_seen": 1669896, + "step": 1155 + }, + { + "epoch": 4.128113879003559, + "grad_norm": 0.005814805161207914, + "learning_rate": 4.529121950563717e-07, + "loss": 0.0, + "num_input_tokens_seen": 1675400, + "step": 1160 + }, + { + "epoch": 4.145907473309609, + "grad_norm": 0.010795537382364273, + "learning_rate": 4.352368166983753e-07, + "loss": 0.0, + "num_input_tokens_seen": 1682952, + "step": 1165 + }, + { + "epoch": 4.1637010676156585, + "grad_norm": 0.013921362347900867, + "learning_rate": 4.178803058461664e-07, + "loss": 0.0, + "num_input_tokens_seen": 1690248, + "step": 1170 + }, + { + "epoch": 4.181494661921708, + "grad_norm": 0.00226654764264822, + "learning_rate": 4.0084534291722375e-07, + "loss": 0.0, + "num_input_tokens_seen": 1696840, + "step": 1175 + }, + { + "epoch": 4.199288256227758, + "grad_norm": 0.005677198059856892, + "learning_rate": 3.8413455867142513e-07, + "loss": 0.0, + "num_input_tokens_seen": 1703624, + "step": 1180 + }, + { + "epoch": 4.217081850533808, + "grad_norm": 0.004136247094720602, + "learning_rate": 3.6775053380477296e-07, + "loss": 0.0, + "num_input_tokens_seen": 1710024, + "step": 1185 + }, + { + "epoch": 4.234875444839858, + "grad_norm": 0.0022235463839024305, + "learning_rate": 3.516957985508476e-07, + "loss": 0.0, + "num_input_tokens_seen": 1717768, + "step": 1190 + }, + { + "epoch": 4.252669039145908, + "grad_norm": 0.0016525887185707688, + "learning_rate": 3.3597283229005877e-07, + "loss": 0.0, + "num_input_tokens_seen": 1727240, + "step": 1195 + }, + { + "epoch": 4.270462633451958, + "grad_norm": 0.0030577515717595816, + "learning_rate": 3.2058406316674563e-07, + "loss": 0.0, + "num_input_tokens_seen": 1734408, + "step": 1200 + }, + { + "epoch": 4.288256227758007, + "grad_norm": 0.004889196250587702, + "learning_rate": 3.055318677141916e-07, + "loss": 0.0, + "num_input_tokens_seen": 1740936, + "step": 1205 + }, + { + "epoch": 4.295373665480427, + "eval_loss": 0.257210373878479, + "eval_runtime": 0.6184, + "eval_samples_per_second": 402.654, + "eval_steps_per_second": 51.747, + "num_input_tokens_seen": 1743624, + "step": 1207 + }, + { + "epoch": 4.306049822064057, + "grad_norm": 0.010472727008163929, + "learning_rate": 2.9081857048761014e-07, + "loss": 0.0, + "num_input_tokens_seen": 1747784, + "step": 1210 + }, + { + "epoch": 4.3238434163701065, + "grad_norm": 0.002632361836731434, + "learning_rate": 2.764464437051537e-07, + "loss": 0.0, + "num_input_tokens_seen": 1754888, + "step": 1215 + }, + { + "epoch": 4.341637010676156, + "grad_norm": 0.003168675350025296, + "learning_rate": 2.624177068970124e-07, + "loss": 0.0, + "num_input_tokens_seen": 1762632, + "step": 1220 + }, + { + "epoch": 4.359430604982206, + "grad_norm": 0.001775842742063105, + "learning_rate": 2.4873452656264316e-07, + "loss": 0.0, + "num_input_tokens_seen": 1769928, + "step": 1225 + }, + { + "epoch": 4.377224199288256, + "grad_norm": 0.0023636040277779102, + "learning_rate": 2.3539901583619186e-07, + "loss": 0.0, + "num_input_tokens_seen": 1777480, + "step": 1230 + }, + { + "epoch": 4.395017793594306, + "grad_norm": 0.003756535705178976, + "learning_rate": 2.2241323416015452e-07, + "loss": 0.0, + "num_input_tokens_seen": 1784840, + "step": 1235 + }, + { + "epoch": 4.412811387900356, + "grad_norm": 0.0014317089226096869, + "learning_rate": 2.0977918696733103e-07, + "loss": 0.0, + "num_input_tokens_seen": 1792584, + "step": 1240 + }, + { + "epoch": 4.430604982206406, + "grad_norm": 0.0015214212471619248, + "learning_rate": 1.9749882537112297e-07, + "loss": 0.0, + "num_input_tokens_seen": 1800968, + "step": 1245 + }, + { + "epoch": 4.448398576512456, + "grad_norm": 0.003403919516131282, + "learning_rate": 1.8557404586421413e-07, + "loss": 0.0, + "num_input_tokens_seen": 1808456, + "step": 1250 + }, + { + "epoch": 4.4661921708185055, + "grad_norm": 0.013262175023555756, + "learning_rate": 1.7400669002569233e-07, + "loss": 0.0, + "num_input_tokens_seen": 1816136, + "step": 1255 + }, + { + "epoch": 4.483985765124555, + "grad_norm": 0.001888920902274549, + "learning_rate": 1.62798544236647e-07, + "loss": 0.0, + "num_input_tokens_seen": 1824136, + "step": 1260 + }, + { + "epoch": 4.501779359430605, + "grad_norm": 0.00637847138568759, + "learning_rate": 1.5195133940429345e-07, + "loss": 0.0, + "num_input_tokens_seen": 1831304, + "step": 1265 + }, + { + "epoch": 4.519572953736655, + "grad_norm": 0.0010983651736751199, + "learning_rate": 1.4146675069466403e-07, + "loss": 0.0, + "num_input_tokens_seen": 1837512, + "step": 1270 + }, + { + "epoch": 4.537366548042705, + "grad_norm": 0.0015767315635457635, + "learning_rate": 1.313463972739068e-07, + "loss": 0.0, + "num_input_tokens_seen": 1844296, + "step": 1275 + }, + { + "epoch": 4.548042704626335, + "eval_loss": 0.259037584066391, + "eval_runtime": 0.6197, + "eval_samples_per_second": 401.818, + "eval_steps_per_second": 51.639, + "num_input_tokens_seen": 1849416, + "step": 1278 + }, + { + "epoch": 4.555160142348754, + "grad_norm": 0.003252014284953475, + "learning_rate": 1.215918420582343e-07, + "loss": 0.0, + "num_input_tokens_seen": 1851720, + "step": 1280 + }, + { + "epoch": 4.572953736654805, + "grad_norm": 0.0038456034380942583, + "learning_rate": 1.1220459147255642e-07, + "loss": 0.0, + "num_input_tokens_seen": 1858120, + "step": 1285 + }, + { + "epoch": 4.590747330960854, + "grad_norm": 0.002327981637790799, + "learning_rate": 1.0318609521783818e-07, + "loss": 0.0, + "num_input_tokens_seen": 1865928, + "step": 1290 + }, + { + "epoch": 4.608540925266904, + "grad_norm": 0.002249909332022071, + "learning_rate": 9.453774604721937e-08, + "loss": 0.0, + "num_input_tokens_seen": 1873800, + "step": 1295 + }, + { + "epoch": 4.6263345195729535, + "grad_norm": 0.00320955878123641, + "learning_rate": 8.62608795509276e-08, + "loss": 0.0, + "num_input_tokens_seen": 1881800, + "step": 1300 + }, + { + "epoch": 4.644128113879003, + "grad_norm": 0.010727422311902046, + "learning_rate": 7.835677395001795e-08, + "loss": 0.0, + "num_input_tokens_seen": 1888648, + "step": 1305 + }, + { + "epoch": 4.661921708185053, + "grad_norm": 0.0015563094057142735, + "learning_rate": 7.082664989897486e-08, + "loss": 0.0, + "num_input_tokens_seen": 1895432, + "step": 1310 + }, + { + "epoch": 4.679715302491103, + "grad_norm": 0.0029733225237578154, + "learning_rate": 6.367167029720234e-08, + "loss": 0.0, + "num_input_tokens_seen": 1902408, + "step": 1315 + }, + { + "epoch": 4.697508896797153, + "grad_norm": 0.0011107242899015546, + "learning_rate": 5.68929401094323e-08, + "loss": 0.0277, + "num_input_tokens_seen": 1910344, + "step": 1320 + }, + { + "epoch": 4.715302491103203, + "grad_norm": 0.0024087605997920036, + "learning_rate": 5.049150619508503e-08, + "loss": 0.0, + "num_input_tokens_seen": 1918472, + "step": 1325 + }, + { + "epoch": 4.733096085409253, + "grad_norm": 0.002195857698097825, + "learning_rate": 4.446835714659647e-08, + "loss": 0.0, + "num_input_tokens_seen": 1924744, + "step": 1330 + }, + { + "epoch": 4.750889679715303, + "grad_norm": 0.0017717446899041533, + "learning_rate": 3.882442313674878e-08, + "loss": 0.0, + "num_input_tokens_seen": 1932872, + "step": 1335 + }, + { + "epoch": 4.7686832740213525, + "grad_norm": 0.0018043630989268422, + "learning_rate": 3.3560575775019866e-08, + "loss": 0.0, + "num_input_tokens_seen": 1940040, + "step": 1340 + }, + { + "epoch": 4.786476868327402, + "grad_norm": 0.0031781333964318037, + "learning_rate": 2.8677627972978905e-08, + "loss": 0.0, + "num_input_tokens_seen": 1948936, + "step": 1345 + }, + { + "epoch": 4.800711743772242, + "eval_loss": 0.2602100372314453, + "eval_runtime": 0.6213, + "eval_samples_per_second": 400.766, + "eval_steps_per_second": 51.504, + "num_input_tokens_seen": 1954568, + "step": 1349 + }, + { + "epoch": 4.804270462633452, + "grad_norm": 0.001851449953392148, + "learning_rate": 2.4176333818745347e-08, + "loss": 0.0, + "num_input_tokens_seen": 1955912, + "step": 1350 + }, + { + "epoch": 4.822064056939502, + "grad_norm": 0.003628035541623831, + "learning_rate": 2.0057388460533733e-08, + "loss": 0.0, + "num_input_tokens_seen": 1962760, + "step": 1355 + }, + { + "epoch": 4.839857651245552, + "grad_norm": 0.0011652238899841905, + "learning_rate": 1.6321427999298754e-08, + "loss": 0.0, + "num_input_tokens_seen": 1969160, + "step": 1360 + }, + { + "epoch": 4.857651245551601, + "grad_norm": 0.003991110250353813, + "learning_rate": 1.2969029390501597e-08, + "loss": 0.0, + "num_input_tokens_seen": 1975752, + "step": 1365 + }, + { + "epoch": 4.875444839857651, + "grad_norm": 0.004909892100840807, + "learning_rate": 1.000071035500816e-08, + "loss": 0.0, + "num_input_tokens_seen": 1983240, + "step": 1370 + }, + { + "epoch": 4.893238434163701, + "grad_norm": 0.00255265599116683, + "learning_rate": 7.416929299135511e-09, + "loss": 0.0, + "num_input_tokens_seen": 1990792, + "step": 1375 + }, + { + "epoch": 4.911032028469751, + "grad_norm": 0.0013633174821734428, + "learning_rate": 5.218085243859639e-09, + "loss": 0.0, + "num_input_tokens_seen": 1998728, + "step": 1380 + }, + { + "epoch": 4.9288256227758005, + "grad_norm": 0.005876463372260332, + "learning_rate": 3.4045177631936154e-09, + "loss": 0.0, + "num_input_tokens_seen": 2006920, + "step": 1385 + }, + { + "epoch": 4.94661921708185, + "grad_norm": 0.0021194759756326675, + "learning_rate": 1.976506931745392e-09, + "loss": 0.0, + "num_input_tokens_seen": 2013128, + "step": 1390 + }, + { + "epoch": 4.9644128113879, + "grad_norm": 0.0024136609863489866, + "learning_rate": 9.3427328146517e-10, + "loss": 0.0, + "num_input_tokens_seen": 2021704, + "step": 1395 + }, + { + "epoch": 4.98220640569395, + "grad_norm": 0.16585078835487366, + "learning_rate": 2.7797776758903274e-10, + "loss": 0.0, + "num_input_tokens_seen": 2028872, + "step": 1400 + }, + { + "epoch": 5.0, + "grad_norm": 0.002416080329567194, + "learning_rate": 7.72174378022017e-12, + "loss": 0.0, + "num_input_tokens_seen": 2035272, + "step": 1405 + }, + { + "epoch": 5.0, + "num_input_tokens_seen": 2035272, + "step": 1405, + "total_flos": 1.1883702201974784e+16, + "train_loss": 0.05568007128206763, + "train_runtime": 1085.6649, + "train_samples_per_second": 10.321, + "train_steps_per_second": 1.294 + } + ], + "logging_steps": 5, + "max_steps": 1405, + "num_input_tokens_seen": 2035272, + "num_train_epochs": 5, + "save_steps": 71, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1883702201974784e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..07bbf66 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70ad1bfbb630f5f8a43a169d4a5d88405c2274dc7d3d7800201dd83bc958921c +size 6289 diff --git a/training_eval_loss.png b/training_eval_loss.png new file mode 100644 index 0000000..3a71505 Binary files /dev/null and b/training_eval_loss.png differ diff --git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000..c78f3fd Binary files /dev/null and b/training_loss.png differ