commit 8d4dae588688508683003d649dc312da9d3cc13d Author: ModelHub XC Date: Sat Jun 20 17:43:19 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: guangyangnlp/Qwen3-1.7B-SFT-medical-2e-5 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..2baf9d8 --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +--- +library_name: transformers +license: other +base_model: Qwen/Qwen3-1.7B +tags: +- llama-factory +- full +- generated_from_trainer +model-index: +- name: medical-o1-sft-full + results: [] +--- + + + +# medical-o1-sft-full + +This model is a fine-tuned version of [Qwen/Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B) on the medical_o1_train dataset. +It achieves the following results on the evaluation set: +- Loss: 1.4089 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 2e-05 +- train_batch_size: 16 +- eval_batch_size: 8 +- seed: 42 +- gradient_accumulation_steps: 8 +- total_train_batch_size: 128 +- optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_steps: 0.05 +- num_epochs: 3.0 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | +|:-------------:|:------:|:----:|:---------------:| +| 1.4962 | 0.3419 | 50 | 1.4686 | +| 1.4215 | 0.6838 | 100 | 1.4337 | +| 1.3304 | 1.0205 | 150 | 1.4194 | +| 1.3097 | 1.3624 | 200 | 1.4159 | +| 1.3175 | 1.7043 | 250 | 1.4089 | +| 1.2195 | 2.0410 | 300 | 1.4176 | +| 1.2726 | 2.3829 | 350 | 1.4229 | +| 1.1895 | 2.7248 | 400 | 1.4216 | + + +### Framework versions + +- Transformers 5.0.0 +- Pytorch 2.10.0+cu128 +- Datasets 4.0.0 +- Tokenizers 0.22.2 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..ce41b8c --- /dev/null +++ b/all_results.json @@ -0,0 +1,12 @@ +{ + "epoch": 3.0, + "eval_loss": 1.4088929891586304, + "eval_runtime": 13.9279, + "eval_samples_per_second": 70.793, + "eval_steps_per_second": 8.903, + "total_flos": 5.3379040973665075e+17, + "train_loss": 1.3470534840408637, + "train_runtime": 3006.8003, + "train_samples_per_second": 18.676, + "train_steps_per_second": 0.147 +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..64481b1 --- /dev/null +++ b/config.json @@ -0,0 +1,63 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "float32", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 6144, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.0.0", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/eval_results.json b/eval_results.json new file mode 100644 index 0000000..b6faabf --- /dev/null +++ b/eval_results.json @@ -0,0 +1,7 @@ +{ + "epoch": 3.0, + "eval_loss": 1.4088929891586304, + "eval_runtime": 13.9279, + "eval_samples_per_second": 70.793, + "eval_steps_per_second": 8.903 +} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..c33fb76 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "5.0.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..2cff034 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b98f6d75daac7cb177c54105480062a353474e167a59ddbeae8d9df10294546 +size 8126995136 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..c7afbed --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..145e2c7 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,30 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..a4ff596 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.0, + "total_flos": 5.3379040973665075e+17, + "train_loss": 1.3470534840408637, + "train_runtime": 3006.8003, + "train_samples_per_second": 18.676, + "train_steps_per_second": 0.147 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..129af96 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,3194 @@ +{ + "best_global_step": 250, + "best_metric": 1.4088929891586304, + "best_model_checkpoint": "saves/qwen3-1.7B/medical-o1-sft-full/checkpoint-250", + "epoch": 3.0, + "eval_steps": 50, + "global_step": 441, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006837606837606838, + "grad_norm": 83.15293884277344, + "learning_rate": 0.0, + "loss": 2.8199405670166016, + "step": 1 + }, + { + "epoch": 0.013675213675213675, + "grad_norm": 81.79350280761719, + "learning_rate": 8.695652173913044e-07, + "loss": 2.7888758182525635, + "step": 2 + }, + { + "epoch": 0.020512820512820513, + "grad_norm": 83.25151824951172, + "learning_rate": 1.7391304347826088e-06, + "loss": 2.820769786834717, + "step": 3 + }, + { + "epoch": 0.02735042735042735, + "grad_norm": 75.52108001708984, + "learning_rate": 2.6086956521739132e-06, + "loss": 2.734041690826416, + "step": 4 + }, + { + "epoch": 0.03418803418803419, + "grad_norm": 72.11664581298828, + "learning_rate": 3.4782608695652175e-06, + "loss": 2.7135212421417236, + "step": 5 + }, + { + "epoch": 0.041025641025641026, + "grad_norm": 55.534324645996094, + "learning_rate": 4.347826086956522e-06, + "loss": 2.4443650245666504, + "step": 6 + }, + { + "epoch": 0.04786324786324787, + "grad_norm": 48.14010238647461, + "learning_rate": 5.2173913043478265e-06, + "loss": 2.3162710666656494, + "step": 7 + }, + { + "epoch": 0.0547008547008547, + "grad_norm": 20.861207962036133, + "learning_rate": 6.086956521739132e-06, + "loss": 2.0038950443267822, + "step": 8 + }, + { + "epoch": 0.06153846153846154, + "grad_norm": 15.49008846282959, + "learning_rate": 6.956521739130435e-06, + "loss": 1.8993940353393555, + "step": 9 + }, + { + "epoch": 0.06837606837606838, + "grad_norm": 5.190984725952148, + "learning_rate": 7.82608695652174e-06, + "loss": 1.7324286699295044, + "step": 10 + }, + { + "epoch": 0.07521367521367521, + "grad_norm": 4.630637168884277, + "learning_rate": 8.695652173913044e-06, + "loss": 1.654750943183899, + "step": 11 + }, + { + "epoch": 0.08205128205128205, + "grad_norm": 3.784055233001709, + "learning_rate": 9.565217391304349e-06, + "loss": 1.7394911050796509, + "step": 12 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 3.4299561977386475, + "learning_rate": 1.0434782608695653e-05, + "loss": 1.6633565425872803, + "step": 13 + }, + { + "epoch": 0.09572649572649573, + "grad_norm": 4.693484306335449, + "learning_rate": 1.1304347826086957e-05, + "loss": 1.670560359954834, + "step": 14 + }, + { + "epoch": 0.10256410256410256, + "grad_norm": 5.14279317855835, + "learning_rate": 1.2173913043478263e-05, + "loss": 1.647332787513733, + "step": 15 + }, + { + "epoch": 0.1094017094017094, + "grad_norm": 3.8385608196258545, + "learning_rate": 1.3043478260869566e-05, + "loss": 1.6399732828140259, + "step": 16 + }, + { + "epoch": 0.11623931623931624, + "grad_norm": 2.6695456504821777, + "learning_rate": 1.391304347826087e-05, + "loss": 1.5681482553482056, + "step": 17 + }, + { + "epoch": 0.12307692307692308, + "grad_norm": 2.117490291595459, + "learning_rate": 1.4782608695652174e-05, + "loss": 1.6053783893585205, + "step": 18 + }, + { + "epoch": 0.12991452991452992, + "grad_norm": 1.9541882276535034, + "learning_rate": 1.565217391304348e-05, + "loss": 1.5954205989837646, + "step": 19 + }, + { + "epoch": 0.13675213675213677, + "grad_norm": 2.011003255844116, + "learning_rate": 1.6521739130434785e-05, + "loss": 1.5820363759994507, + "step": 20 + }, + { + "epoch": 0.14358974358974358, + "grad_norm": 1.9789162874221802, + "learning_rate": 1.739130434782609e-05, + "loss": 1.532997727394104, + "step": 21 + }, + { + "epoch": 0.15042735042735042, + "grad_norm": 1.8961035013198853, + "learning_rate": 1.8260869565217393e-05, + "loss": 1.5475587844848633, + "step": 22 + }, + { + "epoch": 0.15726495726495726, + "grad_norm": 1.5811997652053833, + "learning_rate": 1.9130434782608697e-05, + "loss": 1.580260992050171, + "step": 23 + }, + { + "epoch": 0.1641025641025641, + "grad_norm": 1.4591213464736938, + "learning_rate": 2e-05, + "loss": 1.5463660955429077, + "step": 24 + }, + { + "epoch": 0.17094017094017094, + "grad_norm": 1.4459729194641113, + "learning_rate": 1.999971756719333e-05, + "loss": 1.5187675952911377, + "step": 25 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 1.4411983489990234, + "learning_rate": 1.9998870284726968e-05, + "loss": 1.529025673866272, + "step": 26 + }, + { + "epoch": 0.18461538461538463, + "grad_norm": 1.3215960264205933, + "learning_rate": 1.9997458200460994e-05, + "loss": 1.513730525970459, + "step": 27 + }, + { + "epoch": 0.19145299145299147, + "grad_norm": 1.324648141860962, + "learning_rate": 1.999548139415919e-05, + "loss": 1.5576432943344116, + "step": 28 + }, + { + "epoch": 0.19829059829059828, + "grad_norm": 1.1139763593673706, + "learning_rate": 1.999293997748454e-05, + "loss": 1.5223976373672485, + "step": 29 + }, + { + "epoch": 0.20512820512820512, + "grad_norm": 1.175620675086975, + "learning_rate": 1.9989834093992945e-05, + "loss": 1.529496431350708, + "step": 30 + }, + { + "epoch": 0.21196581196581196, + "grad_norm": 1.2628631591796875, + "learning_rate": 1.9986163919125077e-05, + "loss": 1.5556331872940063, + "step": 31 + }, + { + "epoch": 0.2188034188034188, + "grad_norm": 1.121780276298523, + "learning_rate": 1.9981929660196492e-05, + "loss": 1.522382140159607, + "step": 32 + }, + { + "epoch": 0.22564102564102564, + "grad_norm": 1.057112693786621, + "learning_rate": 1.997713155638592e-05, + "loss": 1.5269778966903687, + "step": 33 + }, + { + "epoch": 0.23247863247863249, + "grad_norm": 1.1212079524993896, + "learning_rate": 1.9971769878721747e-05, + "loss": 1.5179802179336548, + "step": 34 + }, + { + "epoch": 0.23931623931623933, + "grad_norm": 1.1053107976913452, + "learning_rate": 1.99658449300667e-05, + "loss": 1.4600404500961304, + "step": 35 + }, + { + "epoch": 0.24615384615384617, + "grad_norm": 1.0344611406326294, + "learning_rate": 1.9959357045100764e-05, + "loss": 1.4895355701446533, + "step": 36 + }, + { + "epoch": 0.252991452991453, + "grad_norm": 1.0998711585998535, + "learning_rate": 1.9952306590302247e-05, + "loss": 1.498748779296875, + "step": 37 + }, + { + "epoch": 0.25982905982905985, + "grad_norm": 1.0810974836349487, + "learning_rate": 1.9944693963927092e-05, + "loss": 1.4847540855407715, + "step": 38 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.0349794626235962, + "learning_rate": 1.9936519595986395e-05, + "loss": 1.4850821495056152, + "step": 39 + }, + { + "epoch": 0.27350427350427353, + "grad_norm": 0.9509456157684326, + "learning_rate": 1.9927783948222084e-05, + "loss": 1.4879685640335083, + "step": 40 + }, + { + "epoch": 0.28034188034188035, + "grad_norm": 0.9873176217079163, + "learning_rate": 1.9918487514080867e-05, + "loss": 1.5055975914001465, + "step": 41 + }, + { + "epoch": 0.28717948717948716, + "grad_norm": 0.9554620385169983, + "learning_rate": 1.990863081868634e-05, + "loss": 1.4576541185379028, + "step": 42 + }, + { + "epoch": 0.294017094017094, + "grad_norm": 0.915795087814331, + "learning_rate": 1.989821441880933e-05, + "loss": 1.469474196434021, + "step": 43 + }, + { + "epoch": 0.30085470085470084, + "grad_norm": 1.006457805633545, + "learning_rate": 1.988723890283645e-05, + "loss": 1.5073033571243286, + "step": 44 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.9496122598648071, + "learning_rate": 1.9875704890736853e-05, + "loss": 1.496271014213562, + "step": 45 + }, + { + "epoch": 0.3145299145299145, + "grad_norm": 0.9319558143615723, + "learning_rate": 1.9863613034027224e-05, + "loss": 1.4825000762939453, + "step": 46 + }, + { + "epoch": 0.3213675213675214, + "grad_norm": 0.9389411807060242, + "learning_rate": 1.985096401573497e-05, + "loss": 1.4443243741989136, + "step": 47 + }, + { + "epoch": 0.3282051282051282, + "grad_norm": 0.9735950827598572, + "learning_rate": 1.9837758550359637e-05, + "loss": 1.4762128591537476, + "step": 48 + }, + { + "epoch": 0.335042735042735, + "grad_norm": 0.9494331479072571, + "learning_rate": 1.982399738383255e-05, + "loss": 1.5045385360717773, + "step": 49 + }, + { + "epoch": 0.3418803418803419, + "grad_norm": 0.9520753026008606, + "learning_rate": 1.9809681293474693e-05, + "loss": 1.496164321899414, + "step": 50 + }, + { + "epoch": 0.3418803418803419, + "eval_loss": 1.4685521125793457, + "eval_runtime": 14.1604, + "eval_samples_per_second": 69.631, + "eval_steps_per_second": 8.757, + "step": 50 + }, + { + "epoch": 0.3487179487179487, + "grad_norm": 0.9688102602958679, + "learning_rate": 1.979481108795278e-05, + "loss": 1.4734501838684082, + "step": 51 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.9477071166038513, + "learning_rate": 1.9779387607233587e-05, + "loss": 1.4600017070770264, + "step": 52 + }, + { + "epoch": 0.3623931623931624, + "grad_norm": 0.9507799744606018, + "learning_rate": 1.9763411722536503e-05, + "loss": 1.455001711845398, + "step": 53 + }, + { + "epoch": 0.36923076923076925, + "grad_norm": 0.9292111992835999, + "learning_rate": 1.9746884336284316e-05, + "loss": 1.4742114543914795, + "step": 54 + }, + { + "epoch": 0.37606837606837606, + "grad_norm": 0.9916467666625977, + "learning_rate": 1.972980638205225e-05, + "loss": 1.5147836208343506, + "step": 55 + }, + { + "epoch": 0.38290598290598293, + "grad_norm": 0.9744175672531128, + "learning_rate": 1.971217882451521e-05, + "loss": 1.4713977575302124, + "step": 56 + }, + { + "epoch": 0.38974358974358975, + "grad_norm": 1.0033540725708008, + "learning_rate": 1.9694002659393306e-05, + "loss": 1.4538943767547607, + "step": 57 + }, + { + "epoch": 0.39658119658119656, + "grad_norm": 0.946854293346405, + "learning_rate": 1.9675278913395605e-05, + "loss": 1.4287432432174683, + "step": 58 + }, + { + "epoch": 0.40341880341880343, + "grad_norm": 1.0013198852539062, + "learning_rate": 1.9656008644162134e-05, + "loss": 1.4492701292037964, + "step": 59 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 1.0438623428344727, + "learning_rate": 1.9636192940204134e-05, + "loss": 1.4924561977386475, + "step": 60 + }, + { + "epoch": 0.4170940170940171, + "grad_norm": 0.9705636501312256, + "learning_rate": 1.961583292084259e-05, + "loss": 1.4596234560012817, + "step": 61 + }, + { + "epoch": 0.4239316239316239, + "grad_norm": 0.9079157114028931, + "learning_rate": 1.9594929736144978e-05, + "loss": 1.44952392578125, + "step": 62 + }, + { + "epoch": 0.4307692307692308, + "grad_norm": 0.9640805125236511, + "learning_rate": 1.957348456686032e-05, + "loss": 1.4430960416793823, + "step": 63 + }, + { + "epoch": 0.4376068376068376, + "grad_norm": 0.9475866556167603, + "learning_rate": 1.9551498624352497e-05, + "loss": 1.446009635925293, + "step": 64 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.948258638381958, + "learning_rate": 1.9528973150531787e-05, + "loss": 1.4411481618881226, + "step": 65 + }, + { + "epoch": 0.4512820512820513, + "grad_norm": 0.9805014133453369, + "learning_rate": 1.9505909417784758e-05, + "loss": 1.4417314529418945, + "step": 66 + }, + { + "epoch": 0.4581196581196581, + "grad_norm": 0.9225365519523621, + "learning_rate": 1.9482308728902358e-05, + "loss": 1.480376958847046, + "step": 67 + }, + { + "epoch": 0.46495726495726497, + "grad_norm": 0.9221044182777405, + "learning_rate": 1.9458172417006347e-05, + "loss": 1.4625794887542725, + "step": 68 + }, + { + "epoch": 0.4717948717948718, + "grad_norm": 0.9901456832885742, + "learning_rate": 1.9433501845473996e-05, + "loss": 1.4856598377227783, + "step": 69 + }, + { + "epoch": 0.47863247863247865, + "grad_norm": 0.9551020860671997, + "learning_rate": 1.9408298407861045e-05, + "loss": 1.4896745681762695, + "step": 70 + }, + { + "epoch": 0.48547008547008547, + "grad_norm": 0.9381822943687439, + "learning_rate": 1.9382563527823026e-05, + "loss": 1.4343875646591187, + "step": 71 + }, + { + "epoch": 0.49230769230769234, + "grad_norm": 0.8770731091499329, + "learning_rate": 1.935629865903482e-05, + "loss": 1.4482182264328003, + "step": 72 + }, + { + "epoch": 0.49914529914529915, + "grad_norm": 0.934929609298706, + "learning_rate": 1.9329505285108544e-05, + "loss": 1.4524080753326416, + "step": 73 + }, + { + "epoch": 0.505982905982906, + "grad_norm": 0.9203254580497742, + "learning_rate": 1.9302184919509758e-05, + "loss": 1.4096636772155762, + "step": 74 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.9084986448287964, + "learning_rate": 1.927433910547197e-05, + "loss": 1.423622488975525, + "step": 75 + }, + { + "epoch": 0.5196581196581197, + "grad_norm": 0.8734993934631348, + "learning_rate": 1.9245969415909464e-05, + "loss": 1.4265828132629395, + "step": 76 + }, + { + "epoch": 0.5264957264957265, + "grad_norm": 0.8964496850967407, + "learning_rate": 1.921707745332845e-05, + "loss": 1.4725595712661743, + "step": 77 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.9096109867095947, + "learning_rate": 1.9187664849736542e-05, + "loss": 1.457470417022705, + "step": 78 + }, + { + "epoch": 0.5401709401709401, + "grad_norm": 0.8932516574859619, + "learning_rate": 1.9157733266550577e-05, + "loss": 1.454951286315918, + "step": 79 + }, + { + "epoch": 0.5470085470085471, + "grad_norm": 0.8940214514732361, + "learning_rate": 1.9127284394502765e-05, + "loss": 1.4776511192321777, + "step": 80 + }, + { + "epoch": 0.5538461538461539, + "grad_norm": 0.8789263963699341, + "learning_rate": 1.9096319953545186e-05, + "loss": 1.4376585483551025, + "step": 81 + }, + { + "epoch": 0.5606837606837607, + "grad_norm": 0.9395255446434021, + "learning_rate": 1.906484169275263e-05, + "loss": 1.4360781908035278, + "step": 82 + }, + { + "epoch": 0.5675213675213675, + "grad_norm": 0.8618428707122803, + "learning_rate": 1.903285139022381e-05, + "loss": 1.4329712390899658, + "step": 83 + }, + { + "epoch": 0.5743589743589743, + "grad_norm": 0.9313262104988098, + "learning_rate": 1.900035085298091e-05, + "loss": 1.446253776550293, + "step": 84 + }, + { + "epoch": 0.5811965811965812, + "grad_norm": 0.8763355016708374, + "learning_rate": 1.896734191686752e-05, + "loss": 1.4160209894180298, + "step": 85 + }, + { + "epoch": 0.588034188034188, + "grad_norm": 0.8777135610580444, + "learning_rate": 1.8933826446444933e-05, + "loss": 1.449493408203125, + "step": 86 + }, + { + "epoch": 0.5948717948717949, + "grad_norm": 0.8737928867340088, + "learning_rate": 1.889980633488683e-05, + "loss": 1.377128005027771, + "step": 87 + }, + { + "epoch": 0.6017094017094017, + "grad_norm": 0.923620343208313, + "learning_rate": 1.8865283503872325e-05, + "loss": 1.422142505645752, + "step": 88 + }, + { + "epoch": 0.6085470085470085, + "grad_norm": 0.9419258832931519, + "learning_rate": 1.8830259903477427e-05, + "loss": 1.4897931814193726, + "step": 89 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.9292656779289246, + "learning_rate": 1.879473751206489e-05, + "loss": 1.4244943857192993, + "step": 90 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.9174057841300964, + "learning_rate": 1.8758718336172462e-05, + "loss": 1.432208776473999, + "step": 91 + }, + { + "epoch": 0.629059829059829, + "grad_norm": 0.9447773694992065, + "learning_rate": 1.8722204410399524e-05, + "loss": 1.4501725435256958, + "step": 92 + }, + { + "epoch": 0.6358974358974359, + "grad_norm": 0.8907484412193298, + "learning_rate": 1.868519779729218e-05, + "loss": 1.4563168287277222, + "step": 93 + }, + { + "epoch": 0.6427350427350428, + "grad_norm": 0.8975157141685486, + "learning_rate": 1.864770058722676e-05, + "loss": 1.4320740699768066, + "step": 94 + }, + { + "epoch": 0.6495726495726496, + "grad_norm": 0.9034259915351868, + "learning_rate": 1.8609714898291716e-05, + "loss": 1.4002689123153687, + "step": 95 + }, + { + "epoch": 0.6564102564102564, + "grad_norm": 0.9356617331504822, + "learning_rate": 1.8571242876167995e-05, + "loss": 1.4669139385223389, + "step": 96 + }, + { + "epoch": 0.6632478632478632, + "grad_norm": 0.9355176091194153, + "learning_rate": 1.853228669400784e-05, + "loss": 1.4444191455841064, + "step": 97 + }, + { + "epoch": 0.67008547008547, + "grad_norm": 0.8931655883789062, + "learning_rate": 1.8492848552312016e-05, + "loss": 1.4415756464004517, + "step": 98 + }, + { + "epoch": 0.676923076923077, + "grad_norm": 0.8951373100280762, + "learning_rate": 1.8452930678805536e-05, + "loss": 1.4061449766159058, + "step": 99 + }, + { + "epoch": 0.6837606837606838, + "grad_norm": 0.9179074168205261, + "learning_rate": 1.8412535328311813e-05, + "loss": 1.4215387105941772, + "step": 100 + }, + { + "epoch": 0.6837606837606838, + "eval_loss": 1.4336893558502197, + "eval_runtime": 13.7947, + "eval_samples_per_second": 71.477, + "eval_steps_per_second": 8.989, + "step": 100 + }, + { + "epoch": 0.6905982905982906, + "grad_norm": 0.977781355381012, + "learning_rate": 1.8371664782625287e-05, + "loss": 1.4540152549743652, + "step": 101 + }, + { + "epoch": 0.6974358974358974, + "grad_norm": 0.9076094627380371, + "learning_rate": 1.8330321350382545e-05, + "loss": 1.415886640548706, + "step": 102 + }, + { + "epoch": 0.7042735042735043, + "grad_norm": 0.8912188410758972, + "learning_rate": 1.8288507366931907e-05, + "loss": 1.4277691841125488, + "step": 103 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.8660780787467957, + "learning_rate": 1.8246225194201517e-05, + "loss": 1.39166259765625, + "step": 104 + }, + { + "epoch": 0.717948717948718, + "grad_norm": 0.9204691648483276, + "learning_rate": 1.8203477220565912e-05, + "loss": 1.4161370992660522, + "step": 105 + }, + { + "epoch": 0.7247863247863248, + "grad_norm": 0.9661011695861816, + "learning_rate": 1.8160265860711134e-05, + "loss": 1.4492610692977905, + "step": 106 + }, + { + "epoch": 0.7316239316239316, + "grad_norm": 0.9005808234214783, + "learning_rate": 1.8116593555498308e-05, + "loss": 1.4389468431472778, + "step": 107 + }, + { + "epoch": 0.7384615384615385, + "grad_norm": 0.9088156223297119, + "learning_rate": 1.807246277182578e-05, + "loss": 1.4940838813781738, + "step": 108 + }, + { + "epoch": 0.7452991452991453, + "grad_norm": 0.9402887225151062, + "learning_rate": 1.802787600248977e-05, + "loss": 1.4154539108276367, + "step": 109 + }, + { + "epoch": 0.7521367521367521, + "grad_norm": 0.9380722045898438, + "learning_rate": 1.798283576604356e-05, + "loss": 1.4318289756774902, + "step": 110 + }, + { + "epoch": 0.7589743589743589, + "grad_norm": 0.9319474101066589, + "learning_rate": 1.7937344606655228e-05, + "loss": 1.4192531108856201, + "step": 111 + }, + { + "epoch": 0.7658119658119659, + "grad_norm": 0.9068304896354675, + "learning_rate": 1.789140509396394e-05, + "loss": 1.4170390367507935, + "step": 112 + }, + { + "epoch": 0.7726495726495727, + "grad_norm": 0.8808281421661377, + "learning_rate": 1.784501982293479e-05, + "loss": 1.432860016822815, + "step": 113 + }, + { + "epoch": 0.7794871794871795, + "grad_norm": 0.8805544376373291, + "learning_rate": 1.7798191413712244e-05, + "loss": 1.4037058353424072, + "step": 114 + }, + { + "epoch": 0.7863247863247863, + "grad_norm": 0.8959332704544067, + "learning_rate": 1.775092251147211e-05, + "loss": 1.4175316095352173, + "step": 115 + }, + { + "epoch": 0.7931623931623931, + "grad_norm": 0.8379173278808594, + "learning_rate": 1.770321578627213e-05, + "loss": 1.404625654220581, + "step": 116 + }, + { + "epoch": 0.8, + "grad_norm": 0.8591132164001465, + "learning_rate": 1.765507393290117e-05, + "loss": 1.4534145593643188, + "step": 117 + }, + { + "epoch": 0.8068376068376069, + "grad_norm": 0.8517522215843201, + "learning_rate": 1.7606499670726972e-05, + "loss": 1.4170221090316772, + "step": 118 + }, + { + "epoch": 0.8136752136752137, + "grad_norm": 0.8700085282325745, + "learning_rate": 1.7557495743542586e-05, + "loss": 1.4001213312149048, + "step": 119 + }, + { + "epoch": 0.8205128205128205, + "grad_norm": 0.8774170875549316, + "learning_rate": 1.7508064919411344e-05, + "loss": 1.418135643005371, + "step": 120 + }, + { + "epoch": 0.8273504273504273, + "grad_norm": 0.8984478116035461, + "learning_rate": 1.745820999051053e-05, + "loss": 1.4195680618286133, + "step": 121 + }, + { + "epoch": 0.8341880341880342, + "grad_norm": 0.8648718595504761, + "learning_rate": 1.7407933772973638e-05, + "loss": 1.383607029914856, + "step": 122 + }, + { + "epoch": 0.841025641025641, + "grad_norm": 0.9336929321289062, + "learning_rate": 1.735723910673132e-05, + "loss": 1.4406161308288574, + "step": 123 + }, + { + "epoch": 0.8478632478632478, + "grad_norm": 0.8780763149261475, + "learning_rate": 1.730612885535094e-05, + "loss": 1.4191570281982422, + "step": 124 + }, + { + "epoch": 0.8547008547008547, + "grad_norm": 0.8674494624137878, + "learning_rate": 1.7254605905874862e-05, + "loss": 1.437395691871643, + "step": 125 + }, + { + "epoch": 0.8615384615384616, + "grad_norm": 0.9440014958381653, + "learning_rate": 1.7202673168657318e-05, + "loss": 1.4250893592834473, + "step": 126 + }, + { + "epoch": 0.8683760683760684, + "grad_norm": 0.9403019547462463, + "learning_rate": 1.7150333577200062e-05, + "loss": 1.435499906539917, + "step": 127 + }, + { + "epoch": 0.8752136752136752, + "grad_norm": 0.863822877407074, + "learning_rate": 1.709759008798663e-05, + "loss": 1.409804105758667, + "step": 128 + }, + { + "epoch": 0.882051282051282, + "grad_norm": 0.9274973273277283, + "learning_rate": 1.7044445680315374e-05, + "loss": 1.433601975440979, + "step": 129 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.9369088411331177, + "learning_rate": 1.6990903356131125e-05, + "loss": 1.4320355653762817, + "step": 130 + }, + { + "epoch": 0.8957264957264958, + "grad_norm": 0.8703179955482483, + "learning_rate": 1.6936966139855664e-05, + "loss": 1.4167561531066895, + "step": 131 + }, + { + "epoch": 0.9025641025641026, + "grad_norm": 0.9144904017448425, + "learning_rate": 1.6882637078216867e-05, + "loss": 1.4223415851593018, + "step": 132 + }, + { + "epoch": 0.9094017094017094, + "grad_norm": 0.9126601219177246, + "learning_rate": 1.6827919240076612e-05, + "loss": 1.4480727910995483, + "step": 133 + }, + { + "epoch": 0.9162393162393162, + "grad_norm": 0.8591611981391907, + "learning_rate": 1.6772815716257414e-05, + "loss": 1.40584135055542, + "step": 134 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.8316404223442078, + "learning_rate": 1.671732961936785e-05, + "loss": 1.449837565422058, + "step": 135 + }, + { + "epoch": 0.9299145299145299, + "grad_norm": 0.8785284757614136, + "learning_rate": 1.6661464083626734e-05, + "loss": 1.440337061882019, + "step": 136 + }, + { + "epoch": 0.9367521367521368, + "grad_norm": 0.8786150813102722, + "learning_rate": 1.6605222264686085e-05, + "loss": 1.440657138824463, + "step": 137 + }, + { + "epoch": 0.9435897435897436, + "grad_norm": 0.8501399159431458, + "learning_rate": 1.6548607339452853e-05, + "loss": 1.397615671157837, + "step": 138 + }, + { + "epoch": 0.9504273504273504, + "grad_norm": 0.8737369775772095, + "learning_rate": 1.6491622505909483e-05, + "loss": 1.4285824298858643, + "step": 139 + }, + { + "epoch": 0.9572649572649573, + "grad_norm": 0.8369284868240356, + "learning_rate": 1.6434270982933272e-05, + "loss": 1.3992527723312378, + "step": 140 + }, + { + "epoch": 0.9641025641025641, + "grad_norm": 0.8740672469139099, + "learning_rate": 1.637655601011454e-05, + "loss": 1.4451634883880615, + "step": 141 + }, + { + "epoch": 0.9709401709401709, + "grad_norm": 0.873289942741394, + "learning_rate": 1.631848084757364e-05, + "loss": 1.3965365886688232, + "step": 142 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 0.9107730984687805, + "learning_rate": 1.6260048775776804e-05, + "loss": 1.4110256433486938, + "step": 143 + }, + { + "epoch": 0.9846153846153847, + "grad_norm": 0.8785021305084229, + "learning_rate": 1.6201263095350833e-05, + "loss": 1.4294975996017456, + "step": 144 + }, + { + "epoch": 0.9914529914529915, + "grad_norm": 0.8321818113327026, + "learning_rate": 1.6142127126896682e-05, + "loss": 1.4016475677490234, + "step": 145 + }, + { + "epoch": 0.9982905982905983, + "grad_norm": 0.8866358399391174, + "learning_rate": 1.6082644210801846e-05, + "loss": 1.3802778720855713, + "step": 146 + }, + { + "epoch": 1.0, + "grad_norm": 1.623956561088562, + "learning_rate": 1.602281770705172e-05, + "loss": 1.4806468486785889, + "step": 147 + }, + { + "epoch": 1.0068376068376068, + "grad_norm": 1.1759995222091675, + "learning_rate": 1.5962650995039783e-05, + "loss": 1.3020893335342407, + "step": 148 + }, + { + "epoch": 1.0136752136752136, + "grad_norm": 1.0619325637817383, + "learning_rate": 1.5902147473376695e-05, + "loss": 1.2844979763031006, + "step": 149 + }, + { + "epoch": 1.0205128205128204, + "grad_norm": 0.9689248204231262, + "learning_rate": 1.5841310559698346e-05, + "loss": 1.3303570747375488, + "step": 150 + }, + { + "epoch": 1.0205128205128204, + "eval_loss": 1.4194111824035645, + "eval_runtime": 13.7873, + "eval_samples_per_second": 71.515, + "eval_steps_per_second": 8.994, + "step": 150 + }, + { + "epoch": 1.0273504273504273, + "grad_norm": 0.9153519868850708, + "learning_rate": 1.578014369047279e-05, + "loss": 1.3417026996612549, + "step": 151 + }, + { + "epoch": 1.0341880341880343, + "grad_norm": 0.9799442887306213, + "learning_rate": 1.5718650320806145e-05, + "loss": 1.293771743774414, + "step": 152 + }, + { + "epoch": 1.041025641025641, + "grad_norm": 1.0599641799926758, + "learning_rate": 1.56568339242474e-05, + "loss": 1.3117493391036987, + "step": 153 + }, + { + "epoch": 1.047863247863248, + "grad_norm": 0.9470742344856262, + "learning_rate": 1.5594697992592232e-05, + "loss": 1.2798222303390503, + "step": 154 + }, + { + "epoch": 1.0547008547008547, + "grad_norm": 0.9936373829841614, + "learning_rate": 1.5532246035685755e-05, + "loss": 1.3070576190948486, + "step": 155 + }, + { + "epoch": 1.0615384615384615, + "grad_norm": 0.9454049468040466, + "learning_rate": 1.5469481581224274e-05, + "loss": 1.3386294841766357, + "step": 156 + }, + { + "epoch": 1.0683760683760684, + "grad_norm": 0.9544969797134399, + "learning_rate": 1.5406408174555978e-05, + "loss": 1.302185297012329, + "step": 157 + }, + { + "epoch": 1.0752136752136752, + "grad_norm": 0.9065172076225281, + "learning_rate": 1.5343029378480733e-05, + "loss": 1.3039960861206055, + "step": 158 + }, + { + "epoch": 1.082051282051282, + "grad_norm": 0.867220938205719, + "learning_rate": 1.527934877304879e-05, + "loss": 1.3006991147994995, + "step": 159 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 0.9097728133201599, + "learning_rate": 1.5215369955358568e-05, + "loss": 1.2785807847976685, + "step": 160 + }, + { + "epoch": 1.0957264957264958, + "grad_norm": 0.9294711351394653, + "learning_rate": 1.5151096539353481e-05, + "loss": 1.3051520586013794, + "step": 161 + }, + { + "epoch": 1.1025641025641026, + "grad_norm": 0.9427935481071472, + "learning_rate": 1.5086532155617785e-05, + "loss": 1.3146125078201294, + "step": 162 + }, + { + "epoch": 1.1094017094017095, + "grad_norm": 0.9104812741279602, + "learning_rate": 1.5021680451171499e-05, + "loss": 1.2878390550613403, + "step": 163 + }, + { + "epoch": 1.1162393162393163, + "grad_norm": 0.8972042202949524, + "learning_rate": 1.4956545089264408e-05, + "loss": 1.3068175315856934, + "step": 164 + }, + { + "epoch": 1.123076923076923, + "grad_norm": 0.9040313959121704, + "learning_rate": 1.489112974916912e-05, + "loss": 1.2897545099258423, + "step": 165 + }, + { + "epoch": 1.12991452991453, + "grad_norm": 0.9337772727012634, + "learning_rate": 1.4825438125973263e-05, + "loss": 1.301710844039917, + "step": 166 + }, + { + "epoch": 1.1367521367521367, + "grad_norm": 0.8870652914047241, + "learning_rate": 1.4759473930370738e-05, + "loss": 1.3163543939590454, + "step": 167 + }, + { + "epoch": 1.1435897435897435, + "grad_norm": 0.8637550473213196, + "learning_rate": 1.4693240888452121e-05, + "loss": 1.3200492858886719, + "step": 168 + }, + { + "epoch": 1.1504273504273503, + "grad_norm": 0.8388293981552124, + "learning_rate": 1.4626742741494207e-05, + "loss": 1.307487964630127, + "step": 169 + }, + { + "epoch": 1.1572649572649572, + "grad_norm": 0.9050071835517883, + "learning_rate": 1.4559983245748639e-05, + "loss": 1.2808455228805542, + "step": 170 + }, + { + "epoch": 1.1641025641025642, + "grad_norm": 0.965691089630127, + "learning_rate": 1.449296617222978e-05, + "loss": 1.332348346710205, + "step": 171 + }, + { + "epoch": 1.170940170940171, + "grad_norm": 0.8704518675804138, + "learning_rate": 1.4425695306501656e-05, + "loss": 1.306895136833191, + "step": 172 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 0.8741139769554138, + "learning_rate": 1.4358174448464155e-05, + "loss": 1.2980892658233643, + "step": 173 + }, + { + "epoch": 1.1846153846153846, + "grad_norm": 0.9941467642784119, + "learning_rate": 1.4290407412138365e-05, + "loss": 1.2821602821350098, + "step": 174 + }, + { + "epoch": 1.1914529914529914, + "grad_norm": 0.9268296957015991, + "learning_rate": 1.4222398025451137e-05, + "loss": 1.302233338356018, + "step": 175 + }, + { + "epoch": 1.1982905982905983, + "grad_norm": 0.8978403806686401, + "learning_rate": 1.4154150130018867e-05, + "loss": 1.265356421470642, + "step": 176 + }, + { + "epoch": 1.205128205128205, + "grad_norm": 0.9328585267066956, + "learning_rate": 1.4085667580930482e-05, + "loss": 1.320369005203247, + "step": 177 + }, + { + "epoch": 1.2119658119658119, + "grad_norm": 0.9113616943359375, + "learning_rate": 1.4016954246529697e-05, + "loss": 1.2897846698760986, + "step": 178 + }, + { + "epoch": 1.218803418803419, + "grad_norm": 0.9257543087005615, + "learning_rate": 1.3948014008196486e-05, + "loss": 1.3368397951126099, + "step": 179 + }, + { + "epoch": 1.2256410256410257, + "grad_norm": 0.8960409164428711, + "learning_rate": 1.3878850760127848e-05, + "loss": 1.3266628980636597, + "step": 180 + }, + { + "epoch": 1.2324786324786325, + "grad_norm": 0.9111725687980652, + "learning_rate": 1.3809468409117845e-05, + "loss": 1.2674126625061035, + "step": 181 + }, + { + "epoch": 1.2393162393162394, + "grad_norm": 0.9564438462257385, + "learning_rate": 1.3739870874336898e-05, + "loss": 1.2953293323516846, + "step": 182 + }, + { + "epoch": 1.2461538461538462, + "grad_norm": 1.0268452167510986, + "learning_rate": 1.3670062087110423e-05, + "loss": 1.3054559230804443, + "step": 183 + }, + { + "epoch": 1.252991452991453, + "grad_norm": 0.8995468020439148, + "learning_rate": 1.3600045990696762e-05, + "loss": 1.3053619861602783, + "step": 184 + }, + { + "epoch": 1.2598290598290598, + "grad_norm": 0.8805936574935913, + "learning_rate": 1.352982654006444e-05, + "loss": 1.3140225410461426, + "step": 185 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.9060247540473938, + "learning_rate": 1.3459407701668762e-05, + "loss": 1.3046287298202515, + "step": 186 + }, + { + "epoch": 1.2735042735042734, + "grad_norm": 0.8805747628211975, + "learning_rate": 1.3388793453227766e-05, + "loss": 1.3128578662872314, + "step": 187 + }, + { + "epoch": 1.2803418803418802, + "grad_norm": 0.8997815847396851, + "learning_rate": 1.331798778349752e-05, + "loss": 1.3107125759124756, + "step": 188 + }, + { + "epoch": 1.287179487179487, + "grad_norm": 0.9592490792274475, + "learning_rate": 1.3246994692046837e-05, + "loss": 1.3269885778427124, + "step": 189 + }, + { + "epoch": 1.294017094017094, + "grad_norm": 0.9726372957229614, + "learning_rate": 1.3175818189031326e-05, + "loss": 1.337971806526184, + "step": 190 + }, + { + "epoch": 1.300854700854701, + "grad_norm": 0.9480524659156799, + "learning_rate": 1.3104462294966895e-05, + "loss": 1.287239670753479, + "step": 191 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 0.9071521162986755, + "learning_rate": 1.3032931040502627e-05, + "loss": 1.2962584495544434, + "step": 192 + }, + { + "epoch": 1.3145299145299145, + "grad_norm": 0.9058794379234314, + "learning_rate": 1.2961228466193116e-05, + "loss": 1.280348300933838, + "step": 193 + }, + { + "epoch": 1.3213675213675213, + "grad_norm": 0.9048560261726379, + "learning_rate": 1.2889358622270225e-05, + "loss": 1.3330844640731812, + "step": 194 + }, + { + "epoch": 1.3282051282051281, + "grad_norm": 0.945749819278717, + "learning_rate": 1.2817325568414299e-05, + "loss": 1.3170994520187378, + "step": 195 + }, + { + "epoch": 1.335042735042735, + "grad_norm": 0.9457980394363403, + "learning_rate": 1.2745133373524855e-05, + "loss": 1.3166072368621826, + "step": 196 + }, + { + "epoch": 1.341880341880342, + "grad_norm": 0.9297810196876526, + "learning_rate": 1.267278611549073e-05, + "loss": 1.3273459672927856, + "step": 197 + }, + { + "epoch": 1.3487179487179488, + "grad_norm": 0.9370136260986328, + "learning_rate": 1.2600287880959762e-05, + "loss": 1.3432742357254028, + "step": 198 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 0.904547393321991, + "learning_rate": 1.2527642765107919e-05, + "loss": 1.3275690078735352, + "step": 199 + }, + { + "epoch": 1.3623931623931624, + "grad_norm": 0.9034311175346375, + "learning_rate": 1.2454854871407993e-05, + "loss": 1.3097259998321533, + "step": 200 + }, + { + "epoch": 1.3623931623931624, + "eval_loss": 1.4159187078475952, + "eval_runtime": 13.7977, + "eval_samples_per_second": 71.461, + "eval_steps_per_second": 8.987, + "step": 200 + }, + { + "epoch": 1.3692307692307693, + "grad_norm": 0.8713945150375366, + "learning_rate": 1.2381928311397806e-05, + "loss": 1.2865114212036133, + "step": 201 + }, + { + "epoch": 1.376068376068376, + "grad_norm": 0.8947977423667908, + "learning_rate": 1.2308867204447958e-05, + "loss": 1.277376651763916, + "step": 202 + }, + { + "epoch": 1.3829059829059829, + "grad_norm": 0.9047794342041016, + "learning_rate": 1.2235675677529158e-05, + "loss": 1.288478970527649, + "step": 203 + }, + { + "epoch": 1.3897435897435897, + "grad_norm": 0.8953425884246826, + "learning_rate": 1.2162357864979073e-05, + "loss": 1.2861666679382324, + "step": 204 + }, + { + "epoch": 1.3965811965811965, + "grad_norm": 0.9369704723358154, + "learning_rate": 1.2088917908268822e-05, + "loss": 1.2857511043548584, + "step": 205 + }, + { + "epoch": 1.4034188034188033, + "grad_norm": 0.887296736240387, + "learning_rate": 1.2015359955769021e-05, + "loss": 1.2925364971160889, + "step": 206 + }, + { + "epoch": 1.4102564102564101, + "grad_norm": 0.875452995300293, + "learning_rate": 1.1941688162515468e-05, + "loss": 1.3017300367355347, + "step": 207 + }, + { + "epoch": 1.4170940170940172, + "grad_norm": 0.8836603760719299, + "learning_rate": 1.186790668997443e-05, + "loss": 1.2731754779815674, + "step": 208 + }, + { + "epoch": 1.423931623931624, + "grad_norm": 0.8866926431655884, + "learning_rate": 1.1794019705807584e-05, + "loss": 1.3009804487228394, + "step": 209 + }, + { + "epoch": 1.4307692307692308, + "grad_norm": 0.8414238095283508, + "learning_rate": 1.1720031383636585e-05, + "loss": 1.3082433938980103, + "step": 210 + }, + { + "epoch": 1.4376068376068376, + "grad_norm": 0.8662127256393433, + "learning_rate": 1.164594590280734e-05, + "loss": 1.2641851902008057, + "step": 211 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.9151703119277954, + "learning_rate": 1.15717674481539e-05, + "loss": 1.3064939975738525, + "step": 212 + }, + { + "epoch": 1.4512820512820512, + "grad_norm": 0.9086518883705139, + "learning_rate": 1.1497500209762102e-05, + "loss": 1.3118016719818115, + "step": 213 + }, + { + "epoch": 1.458119658119658, + "grad_norm": 0.9340091347694397, + "learning_rate": 1.1423148382732854e-05, + "loss": 1.3228766918182373, + "step": 214 + }, + { + "epoch": 1.464957264957265, + "grad_norm": 0.865403950214386, + "learning_rate": 1.1348716166945195e-05, + "loss": 1.2863235473632812, + "step": 215 + }, + { + "epoch": 1.471794871794872, + "grad_norm": 0.8879923224449158, + "learning_rate": 1.127420776681905e-05, + "loss": 1.3169306516647339, + "step": 216 + }, + { + "epoch": 1.4786324786324787, + "grad_norm": 0.8761537075042725, + "learning_rate": 1.1199627391077732e-05, + "loss": 1.2758698463439941, + "step": 217 + }, + { + "epoch": 1.4854700854700855, + "grad_norm": 0.905274510383606, + "learning_rate": 1.1124979252510209e-05, + "loss": 1.3158073425292969, + "step": 218 + }, + { + "epoch": 1.4923076923076923, + "grad_norm": 0.9052457213401794, + "learning_rate": 1.105026756773314e-05, + "loss": 1.3242114782333374, + "step": 219 + }, + { + "epoch": 1.4991452991452991, + "grad_norm": 0.8539809584617615, + "learning_rate": 1.0975496556952683e-05, + "loss": 1.295405387878418, + "step": 220 + }, + { + "epoch": 1.505982905982906, + "grad_norm": 0.9171442985534668, + "learning_rate": 1.0900670443726136e-05, + "loss": 1.3160406351089478, + "step": 221 + }, + { + "epoch": 1.5128205128205128, + "grad_norm": 0.877983570098877, + "learning_rate": 1.0825793454723325e-05, + "loss": 1.315245509147644, + "step": 222 + }, + { + "epoch": 1.5196581196581196, + "grad_norm": 0.8745649456977844, + "learning_rate": 1.0750869819487884e-05, + "loss": 1.3248393535614014, + "step": 223 + }, + { + "epoch": 1.5264957264957264, + "grad_norm": 0.8661232590675354, + "learning_rate": 1.0675903770198333e-05, + "loss": 1.2788147926330566, + "step": 224 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.8793037533760071, + "learning_rate": 1.0600899541429004e-05, + "loss": 1.288352608680725, + "step": 225 + }, + { + "epoch": 1.54017094017094, + "grad_norm": 0.9148133397102356, + "learning_rate": 1.0525861369910877e-05, + "loss": 1.3211514949798584, + "step": 226 + }, + { + "epoch": 1.547008547008547, + "grad_norm": 0.9006965160369873, + "learning_rate": 1.0450793494292223e-05, + "loss": 1.3327584266662598, + "step": 227 + }, + { + "epoch": 1.5538461538461539, + "grad_norm": 0.8701738119125366, + "learning_rate": 1.0375700154899208e-05, + "loss": 1.3010832071304321, + "step": 228 + }, + { + "epoch": 1.5606837606837607, + "grad_norm": 0.880436360836029, + "learning_rate": 1.0300585593496348e-05, + "loss": 1.3152333498001099, + "step": 229 + }, + { + "epoch": 1.5675213675213675, + "grad_norm": 0.8781545758247375, + "learning_rate": 1.0225454053046922e-05, + "loss": 1.2808175086975098, + "step": 230 + }, + { + "epoch": 1.5743589743589743, + "grad_norm": 0.8630225658416748, + "learning_rate": 1.0150309777473305e-05, + "loss": 1.2873480319976807, + "step": 231 + }, + { + "epoch": 1.5811965811965814, + "grad_norm": 0.8928260803222656, + "learning_rate": 1.007515701141722e-05, + "loss": 1.28458571434021, + "step": 232 + }, + { + "epoch": 1.5880341880341882, + "grad_norm": 0.8699108958244324, + "learning_rate": 1e-05, + "loss": 1.2885918617248535, + "step": 233 + }, + { + "epoch": 1.594871794871795, + "grad_norm": 0.8759332895278931, + "learning_rate": 9.924842988582783e-06, + "loss": 1.2787448167800903, + "step": 234 + }, + { + "epoch": 1.6017094017094018, + "grad_norm": 0.8956566452980042, + "learning_rate": 9.849690222526698e-06, + "loss": 1.304962158203125, + "step": 235 + }, + { + "epoch": 1.6085470085470086, + "grad_norm": 0.8675941824913025, + "learning_rate": 9.77454594695308e-06, + "loss": 1.2871266603469849, + "step": 236 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 0.9092246294021606, + "learning_rate": 9.699414406503655e-06, + "loss": 1.327986240386963, + "step": 237 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 0.8909919857978821, + "learning_rate": 9.624299845100795e-06, + "loss": 1.2647631168365479, + "step": 238 + }, + { + "epoch": 1.629059829059829, + "grad_norm": 0.8657082915306091, + "learning_rate": 9.549206505707778e-06, + "loss": 1.294311761856079, + "step": 239 + }, + { + "epoch": 1.6358974358974359, + "grad_norm": 0.8618515133857727, + "learning_rate": 9.474138630089124e-06, + "loss": 1.3014901876449585, + "step": 240 + }, + { + "epoch": 1.6427350427350427, + "grad_norm": 0.8630589246749878, + "learning_rate": 9.399100458570998e-06, + "loss": 1.293131709098816, + "step": 241 + }, + { + "epoch": 1.6495726495726495, + "grad_norm": 0.8735710978507996, + "learning_rate": 9.324096229801673e-06, + "loss": 1.290333867073059, + "step": 242 + }, + { + "epoch": 1.6564102564102563, + "grad_norm": 0.8574416041374207, + "learning_rate": 9.249130180512118e-06, + "loss": 1.3111311197280884, + "step": 243 + }, + { + "epoch": 1.6632478632478631, + "grad_norm": 0.9102303981781006, + "learning_rate": 9.174206545276678e-06, + "loss": 1.271691083908081, + "step": 244 + }, + { + "epoch": 1.67008547008547, + "grad_norm": 0.867579996585846, + "learning_rate": 9.099329556273866e-06, + "loss": 1.3228224515914917, + "step": 245 + }, + { + "epoch": 1.676923076923077, + "grad_norm": 0.8179166316986084, + "learning_rate": 9.024503443047318e-06, + "loss": 1.3084717988967896, + "step": 246 + }, + { + "epoch": 1.6837606837606838, + "grad_norm": 0.8923108577728271, + "learning_rate": 8.949732432266867e-06, + "loss": 1.2903640270233154, + "step": 247 + }, + { + "epoch": 1.6905982905982906, + "grad_norm": 0.9241410493850708, + "learning_rate": 8.875020747489795e-06, + "loss": 1.302449345588684, + "step": 248 + }, + { + "epoch": 1.6974358974358974, + "grad_norm": 0.8430485129356384, + "learning_rate": 8.800372608922272e-06, + "loss": 1.2765015363693237, + "step": 249 + }, + { + "epoch": 1.7042735042735044, + "grad_norm": 0.8592954874038696, + "learning_rate": 8.72579223318095e-06, + "loss": 1.317484736442566, + "step": 250 + }, + { + "epoch": 1.7042735042735044, + "eval_loss": 1.4088929891586304, + "eval_runtime": 13.7993, + "eval_samples_per_second": 71.453, + "eval_steps_per_second": 8.986, + "step": 250 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 0.916032612323761, + "learning_rate": 8.65128383305481e-06, + "loss": 1.300941824913025, + "step": 251 + }, + { + "epoch": 1.717948717948718, + "grad_norm": 0.8675019145011902, + "learning_rate": 8.576851617267151e-06, + "loss": 1.3122076988220215, + "step": 252 + }, + { + "epoch": 1.7247863247863249, + "grad_norm": 0.8310043811798096, + "learning_rate": 8.5024997902379e-06, + "loss": 1.3160263299942017, + "step": 253 + }, + { + "epoch": 1.7316239316239317, + "grad_norm": 0.8706823587417603, + "learning_rate": 8.428232551846101e-06, + "loss": 1.2773703336715698, + "step": 254 + }, + { + "epoch": 1.7384615384615385, + "grad_norm": 0.8875864744186401, + "learning_rate": 8.35405409719266e-06, + "loss": 1.288883090019226, + "step": 255 + }, + { + "epoch": 1.7452991452991453, + "grad_norm": 0.9055056571960449, + "learning_rate": 8.279968616363417e-06, + "loss": 1.3028110265731812, + "step": 256 + }, + { + "epoch": 1.7521367521367521, + "grad_norm": 0.905623197555542, + "learning_rate": 8.205980294192421e-06, + "loss": 1.3112901449203491, + "step": 257 + }, + { + "epoch": 1.758974358974359, + "grad_norm": 0.847100555896759, + "learning_rate": 8.132093310025572e-06, + "loss": 1.311500906944275, + "step": 258 + }, + { + "epoch": 1.7658119658119658, + "grad_norm": 0.8671444058418274, + "learning_rate": 8.058311837484537e-06, + "loss": 1.308862566947937, + "step": 259 + }, + { + "epoch": 1.7726495726495726, + "grad_norm": 0.844569742679596, + "learning_rate": 7.984640044230984e-06, + "loss": 1.3032524585723877, + "step": 260 + }, + { + "epoch": 1.7794871794871794, + "grad_norm": 0.9013960957527161, + "learning_rate": 7.911082091731182e-06, + "loss": 1.2791337966918945, + "step": 261 + }, + { + "epoch": 1.7863247863247862, + "grad_norm": 0.8714650869369507, + "learning_rate": 7.837642135020929e-06, + "loss": 1.2602317333221436, + "step": 262 + }, + { + "epoch": 1.793162393162393, + "grad_norm": 0.9024747014045715, + "learning_rate": 7.764324322470842e-06, + "loss": 1.279998540878296, + "step": 263 + }, + { + "epoch": 1.8, + "grad_norm": 0.8714993596076965, + "learning_rate": 7.691132795552044e-06, + "loss": 1.284783959388733, + "step": 264 + }, + { + "epoch": 1.8068376068376069, + "grad_norm": 0.8371661305427551, + "learning_rate": 7.618071688602199e-06, + "loss": 1.3234297037124634, + "step": 265 + }, + { + "epoch": 1.8136752136752137, + "grad_norm": 0.8943991661071777, + "learning_rate": 7.545145128592009e-06, + "loss": 1.2969616651535034, + "step": 266 + }, + { + "epoch": 1.8205128205128205, + "grad_norm": 0.8753275275230408, + "learning_rate": 7.472357234892083e-06, + "loss": 1.2795380353927612, + "step": 267 + }, + { + "epoch": 1.8273504273504273, + "grad_norm": 0.8614721894264221, + "learning_rate": 7.3997121190402375e-06, + "loss": 1.3064361810684204, + "step": 268 + }, + { + "epoch": 1.8341880341880343, + "grad_norm": 0.853656530380249, + "learning_rate": 7.3272138845092725e-06, + "loss": 1.3017405271530151, + "step": 269 + }, + { + "epoch": 1.8410256410256411, + "grad_norm": 0.8655431866645813, + "learning_rate": 7.254866626475152e-06, + "loss": 1.304486632347107, + "step": 270 + }, + { + "epoch": 1.847863247863248, + "grad_norm": 0.87064528465271, + "learning_rate": 7.182674431585703e-06, + "loss": 1.2795239686965942, + "step": 271 + }, + { + "epoch": 1.8547008547008548, + "grad_norm": 0.8889244198799133, + "learning_rate": 7.110641377729778e-06, + "loss": 1.294914960861206, + "step": 272 + }, + { + "epoch": 1.8615384615384616, + "grad_norm": 0.9096329212188721, + "learning_rate": 7.038771533806884e-06, + "loss": 1.2885854244232178, + "step": 273 + }, + { + "epoch": 1.8683760683760684, + "grad_norm": 0.8873443007469177, + "learning_rate": 6.967068959497376e-06, + "loss": 1.297377347946167, + "step": 274 + }, + { + "epoch": 1.8752136752136752, + "grad_norm": 0.8182293772697449, + "learning_rate": 6.895537705033108e-06, + "loss": 1.3091909885406494, + "step": 275 + }, + { + "epoch": 1.882051282051282, + "grad_norm": 0.849620521068573, + "learning_rate": 6.824181810968675e-06, + "loss": 1.2712843418121338, + "step": 276 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.8953171372413635, + "learning_rate": 6.7530053079531664e-06, + "loss": 1.305629849433899, + "step": 277 + }, + { + "epoch": 1.8957264957264957, + "grad_norm": 0.8743292689323425, + "learning_rate": 6.6820122165024845e-06, + "loss": 1.3009774684906006, + "step": 278 + }, + { + "epoch": 1.9025641025641025, + "grad_norm": 0.8852370977401733, + "learning_rate": 6.6112065467722375e-06, + "loss": 1.2898852825164795, + "step": 279 + }, + { + "epoch": 1.9094017094017093, + "grad_norm": 0.8812291026115417, + "learning_rate": 6.540592298331239e-06, + "loss": 1.3161499500274658, + "step": 280 + }, + { + "epoch": 1.916239316239316, + "grad_norm": 0.8949340581893921, + "learning_rate": 6.4701734599355605e-06, + "loss": 1.2947360277175903, + "step": 281 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 0.8372949957847595, + "learning_rate": 6.3999540093032396e-06, + "loss": 1.263576626777649, + "step": 282 + }, + { + "epoch": 1.92991452991453, + "grad_norm": 0.8882158398628235, + "learning_rate": 6.329937912889582e-06, + "loss": 1.2893450260162354, + "step": 283 + }, + { + "epoch": 1.9367521367521368, + "grad_norm": 0.838527500629425, + "learning_rate": 6.260129125663106e-06, + "loss": 1.2985213994979858, + "step": 284 + }, + { + "epoch": 1.9435897435897436, + "grad_norm": 0.8823593258857727, + "learning_rate": 6.1905315908821584e-06, + "loss": 1.306897521018982, + "step": 285 + }, + { + "epoch": 1.9504273504273504, + "grad_norm": 0.8618027567863464, + "learning_rate": 6.121149239872151e-06, + "loss": 1.2990589141845703, + "step": 286 + }, + { + "epoch": 1.9572649572649574, + "grad_norm": 0.8389527797698975, + "learning_rate": 6.051985991803517e-06, + "loss": 1.2886924743652344, + "step": 287 + }, + { + "epoch": 1.9641025641025642, + "grad_norm": 0.8738916516304016, + "learning_rate": 5.983045753470308e-06, + "loss": 1.3003113269805908, + "step": 288 + }, + { + "epoch": 1.970940170940171, + "grad_norm": 0.8567415475845337, + "learning_rate": 5.91433241906952e-06, + "loss": 1.285038948059082, + "step": 289 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 0.8555871248245239, + "learning_rate": 5.845849869981137e-06, + "loss": 1.2825312614440918, + "step": 290 + }, + { + "epoch": 1.9846153846153847, + "grad_norm": 0.8524548411369324, + "learning_rate": 5.7776019745488665e-06, + "loss": 1.3078036308288574, + "step": 291 + }, + { + "epoch": 1.9914529914529915, + "grad_norm": 0.8610931634902954, + "learning_rate": 5.709592587861637e-06, + "loss": 1.2933144569396973, + "step": 292 + }, + { + "epoch": 1.9982905982905983, + "grad_norm": 0.8547428250312805, + "learning_rate": 5.641825551535849e-06, + "loss": 1.2723497152328491, + "step": 293 + }, + { + "epoch": 2.0, + "grad_norm": 1.6815301179885864, + "learning_rate": 5.574304693498346e-06, + "loss": 1.260840892791748, + "step": 294 + }, + { + "epoch": 2.006837606837607, + "grad_norm": 1.1894463300704956, + "learning_rate": 5.507033827770225e-06, + "loss": 1.2158567905426025, + "step": 295 + }, + { + "epoch": 2.0136752136752136, + "grad_norm": 1.1574074029922485, + "learning_rate": 5.440016754251364e-06, + "loss": 1.188340663909912, + "step": 296 + }, + { + "epoch": 2.0205128205128204, + "grad_norm": 0.9981362819671631, + "learning_rate": 5.373257258505798e-06, + "loss": 1.1729332208633423, + "step": 297 + }, + { + "epoch": 2.0273504273504273, + "grad_norm": 1.0496586561203003, + "learning_rate": 5.306759111547881e-06, + "loss": 1.1735312938690186, + "step": 298 + }, + { + "epoch": 2.034188034188034, + "grad_norm": 0.9409749507904053, + "learning_rate": 5.240526069629265e-06, + "loss": 1.198318600654602, + "step": 299 + }, + { + "epoch": 2.041025641025641, + "grad_norm": 0.9382721781730652, + "learning_rate": 5.174561874026741e-06, + "loss": 1.2194828987121582, + "step": 300 + }, + { + "epoch": 2.041025641025641, + "eval_loss": 1.4175776243209839, + "eval_runtime": 13.7699, + "eval_samples_per_second": 71.606, + "eval_steps_per_second": 9.005, + "step": 300 + }, + { + "epoch": 2.0478632478632477, + "grad_norm": 0.936610996723175, + "learning_rate": 5.1088702508308815e-06, + "loss": 1.2439236640930176, + "step": 301 + }, + { + "epoch": 2.0547008547008545, + "grad_norm": 0.9476950764656067, + "learning_rate": 5.043454910735595e-06, + "loss": 1.2119914293289185, + "step": 302 + }, + { + "epoch": 2.0615384615384613, + "grad_norm": 0.975143313407898, + "learning_rate": 4.978319548828504e-06, + "loss": 1.1766479015350342, + "step": 303 + }, + { + "epoch": 2.0683760683760686, + "grad_norm": 0.9535344243049622, + "learning_rate": 4.913467844382217e-06, + "loss": 1.2154781818389893, + "step": 304 + }, + { + "epoch": 2.0752136752136754, + "grad_norm": 0.9839100241661072, + "learning_rate": 4.848903460646522e-06, + "loss": 1.1973791122436523, + "step": 305 + }, + { + "epoch": 2.082051282051282, + "grad_norm": 0.9296822547912598, + "learning_rate": 4.784630044641435e-06, + "loss": 1.2077343463897705, + "step": 306 + }, + { + "epoch": 2.088888888888889, + "grad_norm": 0.9518297910690308, + "learning_rate": 4.720651226951213e-06, + "loss": 1.2044742107391357, + "step": 307 + }, + { + "epoch": 2.095726495726496, + "grad_norm": 0.9024590253829956, + "learning_rate": 4.65697062151927e-06, + "loss": 1.2214324474334717, + "step": 308 + }, + { + "epoch": 2.1025641025641026, + "grad_norm": 0.8939958214759827, + "learning_rate": 4.593591825444028e-06, + "loss": 1.230959177017212, + "step": 309 + }, + { + "epoch": 2.1094017094017095, + "grad_norm": 0.9565759301185608, + "learning_rate": 4.530518418775734e-06, + "loss": 1.2308049201965332, + "step": 310 + }, + { + "epoch": 2.1162393162393163, + "grad_norm": 0.8952397704124451, + "learning_rate": 4.467753964314245e-06, + "loss": 1.2218645811080933, + "step": 311 + }, + { + "epoch": 2.123076923076923, + "grad_norm": 0.9192137122154236, + "learning_rate": 4.40530200740777e-06, + "loss": 1.1945393085479736, + "step": 312 + }, + { + "epoch": 2.12991452991453, + "grad_norm": 0.9151750206947327, + "learning_rate": 4.343166075752605e-06, + "loss": 1.1909265518188477, + "step": 313 + }, + { + "epoch": 2.1367521367521367, + "grad_norm": 0.912064790725708, + "learning_rate": 4.281349679193862e-06, + "loss": 1.176002860069275, + "step": 314 + }, + { + "epoch": 2.1435897435897435, + "grad_norm": 0.9001777172088623, + "learning_rate": 4.219856309527212e-06, + "loss": 1.2102347612380981, + "step": 315 + }, + { + "epoch": 2.1504273504273503, + "grad_norm": 0.9100410342216492, + "learning_rate": 4.1586894403016576e-06, + "loss": 1.2215776443481445, + "step": 316 + }, + { + "epoch": 2.157264957264957, + "grad_norm": 0.8823668360710144, + "learning_rate": 4.097852526623307e-06, + "loss": 1.1972424983978271, + "step": 317 + }, + { + "epoch": 2.164102564102564, + "grad_norm": 0.8945139050483704, + "learning_rate": 4.03734900496022e-06, + "loss": 1.2440537214279175, + "step": 318 + }, + { + "epoch": 2.1709401709401708, + "grad_norm": 0.858863890171051, + "learning_rate": 3.9771822929482825e-06, + "loss": 1.2240134477615356, + "step": 319 + }, + { + "epoch": 2.1777777777777776, + "grad_norm": 0.9579023122787476, + "learning_rate": 3.917355789198157e-06, + "loss": 1.1975905895233154, + "step": 320 + }, + { + "epoch": 2.184615384615385, + "grad_norm": 0.8992065191268921, + "learning_rate": 3.857872873103322e-06, + "loss": 1.2251243591308594, + "step": 321 + }, + { + "epoch": 2.1914529914529917, + "grad_norm": 0.8930969834327698, + "learning_rate": 3.7987369046491684e-06, + "loss": 1.1994602680206299, + "step": 322 + }, + { + "epoch": 2.1982905982905985, + "grad_norm": 0.8879907727241516, + "learning_rate": 3.7399512242231994e-06, + "loss": 1.2023355960845947, + "step": 323 + }, + { + "epoch": 2.2051282051282053, + "grad_norm": 0.8827998638153076, + "learning_rate": 3.6815191524263628e-06, + "loss": 1.1980074644088745, + "step": 324 + }, + { + "epoch": 2.211965811965812, + "grad_norm": 0.9081103801727295, + "learning_rate": 3.623443989885462e-06, + "loss": 1.2123109102249146, + "step": 325 + }, + { + "epoch": 2.218803418803419, + "grad_norm": 0.8658437132835388, + "learning_rate": 3.565729017066729e-06, + "loss": 1.1860473155975342, + "step": 326 + }, + { + "epoch": 2.2256410256410257, + "grad_norm": 0.8716210722923279, + "learning_rate": 3.508377494090521e-06, + "loss": 1.246274471282959, + "step": 327 + }, + { + "epoch": 2.2324786324786325, + "grad_norm": 0.8930105566978455, + "learning_rate": 3.4513926605471504e-06, + "loss": 1.2249618768692017, + "step": 328 + }, + { + "epoch": 2.2393162393162394, + "grad_norm": 0.8859133720397949, + "learning_rate": 3.3947777353139188e-06, + "loss": 1.2300435304641724, + "step": 329 + }, + { + "epoch": 2.246153846153846, + "grad_norm": 0.876879096031189, + "learning_rate": 3.338535916373267e-06, + "loss": 1.226067066192627, + "step": 330 + }, + { + "epoch": 2.252991452991453, + "grad_norm": 0.8582764863967896, + "learning_rate": 3.2826703806321526e-06, + "loss": 1.2141978740692139, + "step": 331 + }, + { + "epoch": 2.25982905982906, + "grad_norm": 0.9050947427749634, + "learning_rate": 3.2271842837425917e-06, + "loss": 1.199479103088379, + "step": 332 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 0.8743166923522949, + "learning_rate": 3.1720807599233903e-06, + "loss": 1.2526406049728394, + "step": 333 + }, + { + "epoch": 2.2735042735042734, + "grad_norm": 0.9142019152641296, + "learning_rate": 3.1173629217831345e-06, + "loss": 1.1963285207748413, + "step": 334 + }, + { + "epoch": 2.2803418803418802, + "grad_norm": 0.8888209462165833, + "learning_rate": 3.063033860144339e-06, + "loss": 1.209120512008667, + "step": 335 + }, + { + "epoch": 2.287179487179487, + "grad_norm": 0.8925624489784241, + "learning_rate": 3.0090966438688774e-06, + "loss": 1.1804795265197754, + "step": 336 + }, + { + "epoch": 2.294017094017094, + "grad_norm": 0.9087634682655334, + "learning_rate": 2.9555543196846293e-06, + "loss": 1.2147403955459595, + "step": 337 + }, + { + "epoch": 2.3008547008547007, + "grad_norm": 0.9099950194358826, + "learning_rate": 2.9024099120133674e-06, + "loss": 1.2237548828125, + "step": 338 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.8658971786499023, + "learning_rate": 2.8496664227999417e-06, + "loss": 1.2072890996932983, + "step": 339 + }, + { + "epoch": 2.3145299145299143, + "grad_norm": 0.8897408843040466, + "learning_rate": 2.7973268313426836e-06, + "loss": 1.2147533893585205, + "step": 340 + }, + { + "epoch": 2.3213675213675216, + "grad_norm": 0.8564779758453369, + "learning_rate": 2.745394094125141e-06, + "loss": 1.2456395626068115, + "step": 341 + }, + { + "epoch": 2.3282051282051284, + "grad_norm": 0.8652287125587463, + "learning_rate": 2.6938711446490607e-06, + "loss": 1.2109252214431763, + "step": 342 + }, + { + "epoch": 2.335042735042735, + "grad_norm": 0.8643552660942078, + "learning_rate": 2.642760893268684e-06, + "loss": 1.1878920793533325, + "step": 343 + }, + { + "epoch": 2.341880341880342, + "grad_norm": 0.8824043869972229, + "learning_rate": 2.5920662270263653e-06, + "loss": 1.1911319494247437, + "step": 344 + }, + { + "epoch": 2.348717948717949, + "grad_norm": 0.8898422122001648, + "learning_rate": 2.541790009489474e-06, + "loss": 1.193242073059082, + "step": 345 + }, + { + "epoch": 2.3555555555555556, + "grad_norm": 0.8772786259651184, + "learning_rate": 2.491935080588658e-06, + "loss": 1.1836318969726562, + "step": 346 + }, + { + "epoch": 2.3623931623931624, + "grad_norm": 0.8587839603424072, + "learning_rate": 2.4425042564574186e-06, + "loss": 1.2118480205535889, + "step": 347 + }, + { + "epoch": 2.3692307692307693, + "grad_norm": 0.8739367127418518, + "learning_rate": 2.3935003292730295e-06, + "loss": 1.2201834917068481, + "step": 348 + }, + { + "epoch": 2.376068376068376, + "grad_norm": 0.8904187679290771, + "learning_rate": 2.344926067098836e-06, + "loss": 1.1912821531295776, + "step": 349 + }, + { + "epoch": 2.382905982905983, + "grad_norm": 0.8717731237411499, + "learning_rate": 2.2967842137278706e-06, + "loss": 1.2726080417633057, + "step": 350 + }, + { + "epoch": 2.382905982905983, + "eval_loss": 1.422935962677002, + "eval_runtime": 13.7932, + "eval_samples_per_second": 71.484, + "eval_steps_per_second": 8.99, + "step": 350 + }, + { + "epoch": 2.3897435897435897, + "grad_norm": 0.8623640537261963, + "learning_rate": 2.249077488527891e-06, + "loss": 1.1917917728424072, + "step": 351 + }, + { + "epoch": 2.3965811965811965, + "grad_norm": 0.9295298457145691, + "learning_rate": 2.201808586287757e-06, + "loss": 1.195438027381897, + "step": 352 + }, + { + "epoch": 2.4034188034188033, + "grad_norm": 0.8726212382316589, + "learning_rate": 2.15498017706521e-06, + "loss": 1.1993173360824585, + "step": 353 + }, + { + "epoch": 2.41025641025641, + "grad_norm": 0.8750997185707092, + "learning_rate": 2.1085949060360654e-06, + "loss": 1.2198253870010376, + "step": 354 + }, + { + "epoch": 2.417094017094017, + "grad_norm": 0.8799977898597717, + "learning_rate": 2.0626553933447734e-06, + "loss": 1.1714023351669312, + "step": 355 + }, + { + "epoch": 2.4239316239316238, + "grad_norm": 0.9106065034866333, + "learning_rate": 2.01716423395644e-06, + "loss": 1.2285724878311157, + "step": 356 + }, + { + "epoch": 2.430769230769231, + "grad_norm": 0.8555257320404053, + "learning_rate": 1.9721239975102313e-06, + "loss": 1.1813218593597412, + "step": 357 + }, + { + "epoch": 2.437606837606838, + "grad_norm": 0.8696889877319336, + "learning_rate": 1.9275372281742242e-06, + "loss": 1.2316478490829468, + "step": 358 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.9041836857795715, + "learning_rate": 1.8834064445016952e-06, + "loss": 1.2227892875671387, + "step": 359 + }, + { + "epoch": 2.4512820512820515, + "grad_norm": 0.8697716593742371, + "learning_rate": 1.8397341392888679e-06, + "loss": 1.224617600440979, + "step": 360 + }, + { + "epoch": 2.4581196581196583, + "grad_norm": 0.8882873058319092, + "learning_rate": 1.7965227794340879e-06, + "loss": 1.1995422840118408, + "step": 361 + }, + { + "epoch": 2.464957264957265, + "grad_norm": 0.8834539651870728, + "learning_rate": 1.7537748057984861e-06, + "loss": 1.2222732305526733, + "step": 362 + }, + { + "epoch": 2.471794871794872, + "grad_norm": 0.899989128112793, + "learning_rate": 1.7114926330680958e-06, + "loss": 1.2143341302871704, + "step": 363 + }, + { + "epoch": 2.4786324786324787, + "grad_norm": 0.8635477423667908, + "learning_rate": 1.6696786496174578e-06, + "loss": 1.2323466539382935, + "step": 364 + }, + { + "epoch": 2.4854700854700855, + "grad_norm": 0.8827865719795227, + "learning_rate": 1.6283352173747148e-06, + "loss": 1.1907907724380493, + "step": 365 + }, + { + "epoch": 2.4923076923076923, + "grad_norm": 0.8702190518379211, + "learning_rate": 1.587464671688187e-06, + "loss": 1.201211929321289, + "step": 366 + }, + { + "epoch": 2.499145299145299, + "grad_norm": 0.8626653552055359, + "learning_rate": 1.5470693211944643e-06, + "loss": 1.1894201040267944, + "step": 367 + }, + { + "epoch": 2.505982905982906, + "grad_norm": 0.879705011844635, + "learning_rate": 1.5071514476879878e-06, + "loss": 1.2102407217025757, + "step": 368 + }, + { + "epoch": 2.5128205128205128, + "grad_norm": 0.8780226707458496, + "learning_rate": 1.4677133059921634e-06, + "loss": 1.235593557357788, + "step": 369 + }, + { + "epoch": 2.5196581196581196, + "grad_norm": 0.8804551362991333, + "learning_rate": 1.4287571238320053e-06, + "loss": 1.2265985012054443, + "step": 370 + }, + { + "epoch": 2.5264957264957264, + "grad_norm": 0.8670660257339478, + "learning_rate": 1.3902851017082863e-06, + "loss": 1.1925873756408691, + "step": 371 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 0.8729323744773865, + "learning_rate": 1.3522994127732415e-06, + "loss": 1.20308518409729, + "step": 372 + }, + { + "epoch": 2.54017094017094, + "grad_norm": 0.8794763088226318, + "learning_rate": 1.3148022027078223e-06, + "loss": 1.2204805612564087, + "step": 373 + }, + { + "epoch": 2.547008547008547, + "grad_norm": 0.870823323726654, + "learning_rate": 1.2777955896004812e-06, + "loss": 1.2257260084152222, + "step": 374 + }, + { + "epoch": 2.5538461538461537, + "grad_norm": 0.8570955991744995, + "learning_rate": 1.2412816638275406e-06, + "loss": 1.2166708707809448, + "step": 375 + }, + { + "epoch": 2.5606837606837605, + "grad_norm": 0.8496021628379822, + "learning_rate": 1.2052624879351105e-06, + "loss": 1.1956825256347656, + "step": 376 + }, + { + "epoch": 2.5675213675213673, + "grad_norm": 0.8563467860221863, + "learning_rate": 1.1697400965225746e-06, + "loss": 1.2383781671524048, + "step": 377 + }, + { + "epoch": 2.574358974358974, + "grad_norm": 0.8653855919837952, + "learning_rate": 1.134716496127679e-06, + "loss": 1.218265414237976, + "step": 378 + }, + { + "epoch": 2.5811965811965814, + "grad_norm": 0.8653165698051453, + "learning_rate": 1.1001936651131717e-06, + "loss": 1.226462483406067, + "step": 379 + }, + { + "epoch": 2.588034188034188, + "grad_norm": 0.8810314536094666, + "learning_rate": 1.0661735535550666e-06, + "loss": 1.176276445388794, + "step": 380 + }, + { + "epoch": 2.594871794871795, + "grad_norm": 0.8538199663162231, + "learning_rate": 1.0326580831324816e-06, + "loss": 1.2393090724945068, + "step": 381 + }, + { + "epoch": 2.601709401709402, + "grad_norm": 0.849739134311676, + "learning_rate": 9.996491470190917e-07, + "loss": 1.2231508493423462, + "step": 382 + }, + { + "epoch": 2.6085470085470086, + "grad_norm": 0.891149640083313, + "learning_rate": 9.671486097761918e-07, + "loss": 1.2225626707077026, + "step": 383 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 0.8668763637542725, + "learning_rate": 9.351583072473713e-07, + "loss": 1.2182505130767822, + "step": 384 + }, + { + "epoch": 2.6222222222222222, + "grad_norm": 0.8931220173835754, + "learning_rate": 9.036800464548157e-07, + "loss": 1.1996538639068604, + "step": 385 + }, + { + "epoch": 2.629059829059829, + "grad_norm": 0.923690140247345, + "learning_rate": 8.727156054972374e-07, + "loss": 1.238417148590088, + "step": 386 + }, + { + "epoch": 2.635897435897436, + "grad_norm": 0.9119179844856262, + "learning_rate": 8.42266733449425e-07, + "loss": 1.226833462715149, + "step": 387 + }, + { + "epoch": 2.6427350427350427, + "grad_norm": 0.8686037659645081, + "learning_rate": 8.123351502634625e-07, + "loss": 1.1834110021591187, + "step": 388 + }, + { + "epoch": 2.6495726495726495, + "grad_norm": 0.8596007823944092, + "learning_rate": 7.829225466715551e-07, + "loss": 1.1922662258148193, + "step": 389 + }, + { + "epoch": 2.6564102564102563, + "grad_norm": 0.8411397337913513, + "learning_rate": 7.540305840905371e-07, + "loss": 1.2220802307128906, + "step": 390 + }, + { + "epoch": 2.663247863247863, + "grad_norm": 0.8473320007324219, + "learning_rate": 7.256608945280319e-07, + "loss": 1.176034688949585, + "step": 391 + }, + { + "epoch": 2.67008547008547, + "grad_norm": 0.8465791940689087, + "learning_rate": 6.978150804902451e-07, + "loss": 1.2118513584136963, + "step": 392 + }, + { + "epoch": 2.676923076923077, + "grad_norm": 0.8556994199752808, + "learning_rate": 6.704947148914608e-07, + "loss": 1.2035595178604126, + "step": 393 + }, + { + "epoch": 2.683760683760684, + "grad_norm": 0.8603663444519043, + "learning_rate": 6.437013409651849e-07, + "loss": 1.2043513059616089, + "step": 394 + }, + { + "epoch": 2.690598290598291, + "grad_norm": 0.8347552418708801, + "learning_rate": 6.174364721769744e-07, + "loss": 1.260666847229004, + "step": 395 + }, + { + "epoch": 2.6974358974358976, + "grad_norm": 0.867624044418335, + "learning_rate": 5.917015921389569e-07, + "loss": 1.2071622610092163, + "step": 396 + }, + { + "epoch": 2.7042735042735044, + "grad_norm": 0.8668217062950134, + "learning_rate": 5.664981545260073e-07, + "loss": 1.197313904762268, + "step": 397 + }, + { + "epoch": 2.7111111111111112, + "grad_norm": 0.8758941292762756, + "learning_rate": 5.418275829936537e-07, + "loss": 1.1844048500061035, + "step": 398 + }, + { + "epoch": 2.717948717948718, + "grad_norm": 0.866844892501831, + "learning_rate": 5.176912710976467e-07, + "loss": 1.1971948146820068, + "step": 399 + }, + { + "epoch": 2.724786324786325, + "grad_norm": 0.8587160110473633, + "learning_rate": 4.940905822152454e-07, + "loss": 1.1895333528518677, + "step": 400 + }, + { + "epoch": 2.724786324786325, + "eval_loss": 1.4216117858886719, + "eval_runtime": 13.782, + "eval_samples_per_second": 71.543, + "eval_steps_per_second": 8.997, + "step": 400 + }, + { + "epoch": 2.7316239316239317, + "grad_norm": 0.8763930201530457, + "learning_rate": 4.710268494682146e-07, + "loss": 1.1914920806884766, + "step": 401 + }, + { + "epoch": 2.7384615384615385, + "grad_norm": 0.8831557035446167, + "learning_rate": 4.485013756475076e-07, + "loss": 1.1900079250335693, + "step": 402 + }, + { + "epoch": 2.7452991452991453, + "grad_norm": 0.866532027721405, + "learning_rate": 4.265154331396815e-07, + "loss": 1.1844745874404907, + "step": 403 + }, + { + "epoch": 2.752136752136752, + "grad_norm": 0.8787288069725037, + "learning_rate": 4.0507026385502747e-07, + "loss": 1.2126126289367676, + "step": 404 + }, + { + "epoch": 2.758974358974359, + "grad_norm": 0.8669936060905457, + "learning_rate": 3.841670791574137e-07, + "loss": 1.229267954826355, + "step": 405 + }, + { + "epoch": 2.7658119658119658, + "grad_norm": 0.8436914086341858, + "learning_rate": 3.638070597958665e-07, + "loss": 1.1994611024856567, + "step": 406 + }, + { + "epoch": 2.7726495726495726, + "grad_norm": 0.8477561473846436, + "learning_rate": 3.439913558378705e-07, + "loss": 1.2160733938217163, + "step": 407 + }, + { + "epoch": 2.7794871794871794, + "grad_norm": 0.9217561483383179, + "learning_rate": 3.2472108660439706e-07, + "loss": 1.1882672309875488, + "step": 408 + }, + { + "epoch": 2.786324786324786, + "grad_norm": 0.8692064881324768, + "learning_rate": 3.059973406066963e-07, + "loss": 1.186108112335205, + "step": 409 + }, + { + "epoch": 2.793162393162393, + "grad_norm": 0.8593800067901611, + "learning_rate": 2.878211754847926e-07, + "loss": 1.2128371000289917, + "step": 410 + }, + { + "epoch": 2.8, + "grad_norm": 0.8875913023948669, + "learning_rate": 2.701936179477516e-07, + "loss": 1.1906311511993408, + "step": 411 + }, + { + "epoch": 2.8068376068376066, + "grad_norm": 0.8833599090576172, + "learning_rate": 2.5311566371568505e-07, + "loss": 1.1937415599822998, + "step": 412 + }, + { + "epoch": 2.8136752136752134, + "grad_norm": 0.8523573279380798, + "learning_rate": 2.3658827746349976e-07, + "loss": 1.1862268447875977, + "step": 413 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 0.8653656244277954, + "learning_rate": 2.206123927664161e-07, + "loss": 1.2255483865737915, + "step": 414 + }, + { + "epoch": 2.827350427350427, + "grad_norm": 0.874724805355072, + "learning_rate": 2.0518891204722169e-07, + "loss": 1.2177876234054565, + "step": 415 + }, + { + "epoch": 2.8341880341880343, + "grad_norm": 0.8411559462547302, + "learning_rate": 1.903187065253076e-07, + "loss": 1.2034833431243896, + "step": 416 + }, + { + "epoch": 2.841025641025641, + "grad_norm": 0.8371963500976562, + "learning_rate": 1.7600261616745106e-07, + "loss": 1.1710231304168701, + "step": 417 + }, + { + "epoch": 2.847863247863248, + "grad_norm": 0.8555141687393188, + "learning_rate": 1.622414496403668e-07, + "loss": 1.2024474143981934, + "step": 418 + }, + { + "epoch": 2.8547008547008548, + "grad_norm": 0.8661652207374573, + "learning_rate": 1.490359842650324e-07, + "loss": 1.2498114109039307, + "step": 419 + }, + { + "epoch": 2.8615384615384616, + "grad_norm": 0.8592333197593689, + "learning_rate": 1.3638696597277678e-07, + "loss": 1.2100580930709839, + "step": 420 + }, + { + "epoch": 2.8683760683760684, + "grad_norm": 0.8594926595687866, + "learning_rate": 1.2429510926314835e-07, + "loss": 1.1787865161895752, + "step": 421 + }, + { + "epoch": 2.875213675213675, + "grad_norm": 0.8879026174545288, + "learning_rate": 1.1276109716355288e-07, + "loss": 1.2315534353256226, + "step": 422 + }, + { + "epoch": 2.882051282051282, + "grad_norm": 0.8497971892356873, + "learning_rate": 1.0178558119067316e-07, + "loss": 1.2027359008789062, + "step": 423 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.8838421106338501, + "learning_rate": 9.136918131366412e-08, + "loss": 1.2284358739852905, + "step": 424 + }, + { + "epoch": 2.8957264957264957, + "grad_norm": 0.8940805196762085, + "learning_rate": 8.151248591913519e-08, + "loss": 1.2018911838531494, + "step": 425 + }, + { + "epoch": 2.9025641025641025, + "grad_norm": 0.8463784456253052, + "learning_rate": 7.22160517779169e-08, + "loss": 1.2137906551361084, + "step": 426 + }, + { + "epoch": 2.9094017094017093, + "grad_norm": 0.8508373498916626, + "learning_rate": 6.348040401360833e-08, + "loss": 1.2048455476760864, + "step": 427 + }, + { + "epoch": 2.916239316239316, + "grad_norm": 0.8702911138534546, + "learning_rate": 5.530603607290852e-08, + "loss": 1.216880202293396, + "step": 428 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 0.8441773653030396, + "learning_rate": 4.7693409697756596e-08, + "loss": 1.2449169158935547, + "step": 429 + }, + { + "epoch": 2.92991452991453, + "grad_norm": 0.8643396496772766, + "learning_rate": 4.0642954899238196e-08, + "loss": 1.2154898643493652, + "step": 430 + }, + { + "epoch": 2.936752136752137, + "grad_norm": 0.8390621542930603, + "learning_rate": 3.4155069933301535e-08, + "loss": 1.1894207000732422, + "step": 431 + }, + { + "epoch": 2.943589743589744, + "grad_norm": 0.8889386057853699, + "learning_rate": 2.823012127825764e-08, + "loss": 1.2449326515197754, + "step": 432 + }, + { + "epoch": 2.9504273504273506, + "grad_norm": 0.8431465029716492, + "learning_rate": 2.2868443614082468e-08, + "loss": 1.1918964385986328, + "step": 433 + }, + { + "epoch": 2.9572649572649574, + "grad_norm": 0.859993577003479, + "learning_rate": 1.8070339803509805e-08, + "loss": 1.1882524490356445, + "step": 434 + }, + { + "epoch": 2.9641025641025642, + "grad_norm": 0.8584935069084167, + "learning_rate": 1.383608087492605e-08, + "loss": 1.2315739393234253, + "step": 435 + }, + { + "epoch": 2.970940170940171, + "grad_norm": 0.8648282289505005, + "learning_rate": 1.0165906007056914e-08, + "loss": 1.235274314880371, + "step": 436 + }, + { + "epoch": 2.977777777777778, + "grad_norm": 0.8602524399757385, + "learning_rate": 7.060022515460452e-09, + "loss": 1.1928036212921143, + "step": 437 + }, + { + "epoch": 2.9846153846153847, + "grad_norm": 0.8722023367881775, + "learning_rate": 4.5186058408153156e-09, + "loss": 1.2146607637405396, + "step": 438 + }, + { + "epoch": 2.9914529914529915, + "grad_norm": 0.8878926038742065, + "learning_rate": 2.5417995390086824e-09, + "loss": 1.1910994052886963, + "step": 439 + }, + { + "epoch": 2.9982905982905983, + "grad_norm": 0.8773415088653564, + "learning_rate": 1.129715273033849e-09, + "loss": 1.1811952590942383, + "step": 440 + }, + { + "epoch": 3.0, + "grad_norm": 1.841178059577942, + "learning_rate": 2.8243280667306084e-10, + "loss": 1.14687180519104, + "step": 441 + }, + { + "epoch": 3.0, + "step": 441, + "total_flos": 5.3379040973665075e+17, + "train_loss": 1.3470534840408637, + "train_runtime": 3006.8003, + "train_samples_per_second": 18.676, + "train_steps_per_second": 0.147 + } + ], + "logging_steps": 1.0, + "max_steps": 441, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.3379040973665075e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}