commit 7c02353428da24d73a1390b5bf4fa5687b1812bb Author: ModelHub XC Date: Sat Jun 20 17:56:19 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: guangyangnlp/Qwen3-4B-SFT-medical-1e-5 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..ee0e798 --- /dev/null +++ b/README.md @@ -0,0 +1,72 @@ +--- +library_name: transformers +license: other +base_model: Qwen/Qwen3-4B +tags: +- llama-factory +- full +- generated_from_trainer +model-index: +- name: medical-o1-sft-full-1e-5 + results: [] +--- + + + +# medical-o1-sft-full-1e-5 + +This model is a fine-tuned version of [Qwen/Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) on the medical_o1_train dataset. +It achieves the following results on the evaluation set: +- Loss: 1.3088 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 1e-05 +- train_batch_size: 4 +- eval_batch_size: 8 +- seed: 42 +- gradient_accumulation_steps: 32 +- total_train_batch_size: 128 +- optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_steps: 0.05 +- num_epochs: 3.0 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | +|:-------------:|:------:|:----:|:---------------:| +| 1.3952 | 0.3009 | 44 | 1.3628 | +| 1.3153 | 0.6017 | 88 | 1.3319 | +| 1.3174 | 0.9026 | 132 | 1.3175 | +| 1.1758 | 1.1983 | 176 | 1.3184 | +| 1.2047 | 1.4991 | 220 | 1.3131 | +| 1.1961 | 1.8 | 264 | 1.3088 | +| 1.1401 | 2.0957 | 308 | 1.3254 | +| 1.1144 | 2.3966 | 352 | 1.3240 | +| 1.1196 | 2.6974 | 396 | 1.3236 | +| 1.0940 | 2.9983 | 440 | 1.3234 | + + +### Framework versions + +- Transformers 5.0.0 +- Pytorch 2.10.0+cu128 +- Datasets 4.0.0 +- Tokenizers 0.22.2 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..5e18feb --- /dev/null +++ b/all_results.json @@ -0,0 +1,12 @@ +{ + "epoch": 3.0, + "eval_loss": 1.308773159980774, + "eval_runtime": 24.6494, + "eval_samples_per_second": 40.001, + "eval_steps_per_second": 5.031, + "total_flos": 9.743300044908134e+17, + "train_loss": 1.2459275746832088, + "train_runtime": 6646.3979, + "train_samples_per_second": 8.449, + "train_steps_per_second": 0.066 +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..1e6ad26 --- /dev/null +++ b/config.json @@ -0,0 +1,71 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "float32", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.0.0", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/eval_results.json b/eval_results.json new file mode 100644 index 0000000..1d0688c --- /dev/null +++ b/eval_results.json @@ -0,0 +1,7 @@ +{ + "epoch": 3.0, + "eval_loss": 1.308773159980774, + "eval_runtime": 24.6494, + "eval_samples_per_second": 40.001, + "eval_steps_per_second": 5.031 +} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..c33fb76 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "5.0.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..39c1fef --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fce6b23b7b5fd06726350735e0ccbc86ce9237583d2e16e868ee2d4abb7df01b +size 16089918232 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..c7afbed --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..145e2c7 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,30 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..28dc4b3 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.0, + "total_flos": 9.743300044908134e+17, + "train_loss": 1.2459275746832088, + "train_runtime": 6646.3979, + "train_samples_per_second": 8.449, + "train_steps_per_second": 0.066 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..6f11b78 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,3210 @@ +{ + "best_global_step": 264, + "best_metric": 1.308773159980774, + "best_model_checkpoint": "saves/qwen3-4B/medical-o1-sft-full-1e-5/checkpoint-264", + "epoch": 3.0, + "eval_steps": 44, + "global_step": 441, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006837606837606838, + "grad_norm": 24.729957580566406, + "learning_rate": 0.0, + "loss": 2.180166482925415, + "step": 1 + }, + { + "epoch": 0.013675213675213675, + "grad_norm": 25.152711868286133, + "learning_rate": 4.347826086956522e-07, + "loss": 2.1789543628692627, + "step": 2 + }, + { + "epoch": 0.020512820512820513, + "grad_norm": 24.6761417388916, + "learning_rate": 8.695652173913044e-07, + "loss": 2.204561233520508, + "step": 3 + }, + { + "epoch": 0.02735042735042735, + "grad_norm": 24.276906967163086, + "learning_rate": 1.3043478260869566e-06, + "loss": 2.1825883388519287, + "step": 4 + }, + { + "epoch": 0.03418803418803419, + "grad_norm": 23.327831268310547, + "learning_rate": 1.7391304347826088e-06, + "loss": 2.2022361755371094, + "step": 5 + }, + { + "epoch": 0.041025641025641026, + "grad_norm": 20.180011749267578, + "learning_rate": 2.173913043478261e-06, + "loss": 2.0757670402526855, + "step": 6 + }, + { + "epoch": 0.04786324786324787, + "grad_norm": 18.820642471313477, + "learning_rate": 2.6086956521739132e-06, + "loss": 2.024721145629883, + "step": 7 + }, + { + "epoch": 0.0547008547008547, + "grad_norm": 13.223835945129395, + "learning_rate": 3.043478260869566e-06, + "loss": 1.9034565687179565, + "step": 8 + }, + { + "epoch": 0.06153846153846154, + "grad_norm": 11.584263801574707, + "learning_rate": 3.4782608695652175e-06, + "loss": 1.8130236864089966, + "step": 9 + }, + { + "epoch": 0.06837606837606838, + "grad_norm": 5.6841607093811035, + "learning_rate": 3.91304347826087e-06, + "loss": 1.6309248208999634, + "step": 10 + }, + { + "epoch": 0.07521367521367521, + "grad_norm": 4.208008766174316, + "learning_rate": 4.347826086956522e-06, + "loss": 1.5361576080322266, + "step": 11 + }, + { + "epoch": 0.08205128205128205, + "grad_norm": 3.528555154800415, + "learning_rate": 4.782608695652174e-06, + "loss": 1.6088225841522217, + "step": 12 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 3.099165916442871, + "learning_rate": 5.2173913043478265e-06, + "loss": 1.5432047843933105, + "step": 13 + }, + { + "epoch": 0.09572649572649573, + "grad_norm": 6.412608623504639, + "learning_rate": 5.652173913043479e-06, + "loss": 1.5963867902755737, + "step": 14 + }, + { + "epoch": 0.10256410256410256, + "grad_norm": 5.609615802764893, + "learning_rate": 6.086956521739132e-06, + "loss": 1.5698325634002686, + "step": 15 + }, + { + "epoch": 0.1094017094017094, + "grad_norm": 4.161319255828857, + "learning_rate": 6.521739130434783e-06, + "loss": 1.555444598197937, + "step": 16 + }, + { + "epoch": 0.11623931623931624, + "grad_norm": 3.2057743072509766, + "learning_rate": 6.956521739130435e-06, + "loss": 1.475843906402588, + "step": 17 + }, + { + "epoch": 0.12307692307692308, + "grad_norm": 2.5646772384643555, + "learning_rate": 7.391304347826087e-06, + "loss": 1.509574294090271, + "step": 18 + }, + { + "epoch": 0.12991452991452992, + "grad_norm": 1.9250593185424805, + "learning_rate": 7.82608695652174e-06, + "loss": 1.4932482242584229, + "step": 19 + }, + { + "epoch": 0.13675213675213677, + "grad_norm": 1.6663166284561157, + "learning_rate": 8.260869565217392e-06, + "loss": 1.4706228971481323, + "step": 20 + }, + { + "epoch": 0.14358974358974358, + "grad_norm": 1.488690733909607, + "learning_rate": 8.695652173913044e-06, + "loss": 1.4192920923233032, + "step": 21 + }, + { + "epoch": 0.15042735042735042, + "grad_norm": 1.3503153324127197, + "learning_rate": 9.130434782608697e-06, + "loss": 1.427452802658081, + "step": 22 + }, + { + "epoch": 0.15726495726495726, + "grad_norm": 1.2214534282684326, + "learning_rate": 9.565217391304349e-06, + "loss": 1.4610393047332764, + "step": 23 + }, + { + "epoch": 0.1641025641025641, + "grad_norm": 1.1983873844146729, + "learning_rate": 1e-05, + "loss": 1.4273948669433594, + "step": 24 + }, + { + "epoch": 0.17094017094017094, + "grad_norm": 1.1930960416793823, + "learning_rate": 9.999858783596665e-06, + "loss": 1.4003199338912964, + "step": 25 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 1.0275226831436157, + "learning_rate": 9.999435142363484e-06, + "loss": 1.4090672731399536, + "step": 26 + }, + { + "epoch": 0.18461538461538463, + "grad_norm": 1.001726508140564, + "learning_rate": 9.998729100230497e-06, + "loss": 1.3982799053192139, + "step": 27 + }, + { + "epoch": 0.19145299145299147, + "grad_norm": 0.9476358890533447, + "learning_rate": 9.997740697079595e-06, + "loss": 1.4250205755233765, + "step": 28 + }, + { + "epoch": 0.19829059829059828, + "grad_norm": 0.9169353246688843, + "learning_rate": 9.99646998874227e-06, + "loss": 1.407841682434082, + "step": 29 + }, + { + "epoch": 0.20512820512820512, + "grad_norm": 0.9049670696258545, + "learning_rate": 9.994917046996472e-06, + "loss": 1.4163107872009277, + "step": 30 + }, + { + "epoch": 0.21196581196581196, + "grad_norm": 0.902590811252594, + "learning_rate": 9.993081959562539e-06, + "loss": 1.4395619630813599, + "step": 31 + }, + { + "epoch": 0.2188034188034188, + "grad_norm": 0.9725260138511658, + "learning_rate": 9.990964830098246e-06, + "loss": 1.4067661762237549, + "step": 32 + }, + { + "epoch": 0.22564102564102564, + "grad_norm": 0.8750798106193542, + "learning_rate": 9.98856577819296e-06, + "loss": 1.4079771041870117, + "step": 33 + }, + { + "epoch": 0.23247863247863249, + "grad_norm": 0.8549812436103821, + "learning_rate": 9.985884939360873e-06, + "loss": 1.398482322692871, + "step": 34 + }, + { + "epoch": 0.23931623931623933, + "grad_norm": 0.869503378868103, + "learning_rate": 9.98292246503335e-06, + "loss": 1.344150424003601, + "step": 35 + }, + { + "epoch": 0.24615384615384617, + "grad_norm": 0.9242067337036133, + "learning_rate": 9.979678522550382e-06, + "loss": 1.37479567527771, + "step": 36 + }, + { + "epoch": 0.252991452991453, + "grad_norm": 0.8416987657546997, + "learning_rate": 9.976153295151123e-06, + "loss": 1.3731480836868286, + "step": 37 + }, + { + "epoch": 0.25982905982905985, + "grad_norm": 0.9907390475273132, + "learning_rate": 9.972346981963546e-06, + "loss": 1.3624351024627686, + "step": 38 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.8205696940422058, + "learning_rate": 9.968259797993197e-06, + "loss": 1.3645293712615967, + "step": 39 + }, + { + "epoch": 0.27350427350427353, + "grad_norm": 0.8257843852043152, + "learning_rate": 9.963891974111042e-06, + "loss": 1.3727067708969116, + "step": 40 + }, + { + "epoch": 0.28034188034188035, + "grad_norm": 0.7986466288566589, + "learning_rate": 9.959243757040434e-06, + "loss": 1.3945657014846802, + "step": 41 + }, + { + "epoch": 0.28717948717948716, + "grad_norm": 0.9684669971466064, + "learning_rate": 9.95431540934317e-06, + "loss": 1.3376381397247314, + "step": 42 + }, + { + "epoch": 0.294017094017094, + "grad_norm": 0.7717859148979187, + "learning_rate": 9.949107209404664e-06, + "loss": 1.354946494102478, + "step": 43 + }, + { + "epoch": 0.30085470085470084, + "grad_norm": 0.8021324276924133, + "learning_rate": 9.943619451418225e-06, + "loss": 1.3951725959777832, + "step": 44 + }, + { + "epoch": 0.30085470085470084, + "eval_loss": 1.362805724143982, + "eval_runtime": 24.9887, + "eval_samples_per_second": 39.458, + "eval_steps_per_second": 4.962, + "step": 44 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.829911470413208, + "learning_rate": 9.937852445368427e-06, + "loss": 1.3832783699035645, + "step": 45 + }, + { + "epoch": 0.3145299145299145, + "grad_norm": 0.8109715580940247, + "learning_rate": 9.931806517013612e-06, + "loss": 1.3637301921844482, + "step": 46 + }, + { + "epoch": 0.3213675213675214, + "grad_norm": 0.7627991437911987, + "learning_rate": 9.925482007867485e-06, + "loss": 1.3353031873703003, + "step": 47 + }, + { + "epoch": 0.3282051282051282, + "grad_norm": 0.7720788717269897, + "learning_rate": 9.918879275179819e-06, + "loss": 1.367252230644226, + "step": 48 + }, + { + "epoch": 0.335042735042735, + "grad_norm": 0.7520493865013123, + "learning_rate": 9.911998691916275e-06, + "loss": 1.386542797088623, + "step": 49 + }, + { + "epoch": 0.3418803418803419, + "grad_norm": 0.7559177875518799, + "learning_rate": 9.904840646737346e-06, + "loss": 1.3789976835250854, + "step": 50 + }, + { + "epoch": 0.3487179487179487, + "grad_norm": 0.770207405090332, + "learning_rate": 9.89740554397639e-06, + "loss": 1.356705904006958, + "step": 51 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.7609772086143494, + "learning_rate": 9.889693803616793e-06, + "loss": 1.3461980819702148, + "step": 52 + }, + { + "epoch": 0.3623931623931624, + "grad_norm": 0.7604424953460693, + "learning_rate": 9.881705861268252e-06, + "loss": 1.344923496246338, + "step": 53 + }, + { + "epoch": 0.36923076923076925, + "grad_norm": 0.7701961398124695, + "learning_rate": 9.873442168142158e-06, + "loss": 1.364449143409729, + "step": 54 + }, + { + "epoch": 0.37606837606837606, + "grad_norm": 0.7939377427101135, + "learning_rate": 9.864903191026125e-06, + "loss": 1.4013525247573853, + "step": 55 + }, + { + "epoch": 0.38290598290598293, + "grad_norm": 0.7690542340278625, + "learning_rate": 9.856089412257605e-06, + "loss": 1.3586581945419312, + "step": 56 + }, + { + "epoch": 0.38974358974358975, + "grad_norm": 0.798068106174469, + "learning_rate": 9.847001329696653e-06, + "loss": 1.3378022909164429, + "step": 57 + }, + { + "epoch": 0.39658119658119656, + "grad_norm": 0.7824757695198059, + "learning_rate": 9.837639456697802e-06, + "loss": 1.3118129968643188, + "step": 58 + }, + { + "epoch": 0.40341880341880343, + "grad_norm": 0.7629351019859314, + "learning_rate": 9.828004322081067e-06, + "loss": 1.3393217325210571, + "step": 59 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 0.7708514332771301, + "learning_rate": 9.818096470102067e-06, + "loss": 1.3732938766479492, + "step": 60 + }, + { + "epoch": 0.4170940170940171, + "grad_norm": 0.8133201003074646, + "learning_rate": 9.807916460421294e-06, + "loss": 1.3423891067504883, + "step": 61 + }, + { + "epoch": 0.4239316239316239, + "grad_norm": 0.7727287411689758, + "learning_rate": 9.797464868072489e-06, + "loss": 1.3378151655197144, + "step": 62 + }, + { + "epoch": 0.4307692307692308, + "grad_norm": 0.7684638500213623, + "learning_rate": 9.78674228343016e-06, + "loss": 1.3335256576538086, + "step": 63 + }, + { + "epoch": 0.4376068376068376, + "grad_norm": 0.7602411508560181, + "learning_rate": 9.775749312176249e-06, + "loss": 1.3320605754852295, + "step": 64 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.8044481873512268, + "learning_rate": 9.764486575265893e-06, + "loss": 1.3325685262680054, + "step": 65 + }, + { + "epoch": 0.4512820512820513, + "grad_norm": 0.7876479029655457, + "learning_rate": 9.752954708892379e-06, + "loss": 1.3242830038070679, + "step": 66 + }, + { + "epoch": 0.4581196581196581, + "grad_norm": 0.7659040689468384, + "learning_rate": 9.741154364451179e-06, + "loss": 1.3692903518676758, + "step": 67 + }, + { + "epoch": 0.46495726495726497, + "grad_norm": 0.8316842317581177, + "learning_rate": 9.729086208503174e-06, + "loss": 1.344923734664917, + "step": 68 + }, + { + "epoch": 0.4717948717948718, + "grad_norm": 0.8216245174407959, + "learning_rate": 9.716750922736998e-06, + "loss": 1.3780957460403442, + "step": 69 + }, + { + "epoch": 0.47863247863247865, + "grad_norm": 0.7839699387550354, + "learning_rate": 9.704149203930522e-06, + "loss": 1.3786989450454712, + "step": 70 + }, + { + "epoch": 0.48547008547008547, + "grad_norm": 0.7707169055938721, + "learning_rate": 9.691281763911513e-06, + "loss": 1.3283625841140747, + "step": 71 + }, + { + "epoch": 0.49230769230769234, + "grad_norm": 0.7598075270652771, + "learning_rate": 9.67814932951741e-06, + "loss": 1.3375245332717896, + "step": 72 + }, + { + "epoch": 0.49914529914529915, + "grad_norm": 0.8022596836090088, + "learning_rate": 9.664752642554272e-06, + "loss": 1.3409022092819214, + "step": 73 + }, + { + "epoch": 0.505982905982906, + "grad_norm": 0.7512302398681641, + "learning_rate": 9.651092459754879e-06, + "loss": 1.2996271848678589, + "step": 74 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.7390022277832031, + "learning_rate": 9.637169552735985e-06, + "loss": 1.3141694068908691, + "step": 75 + }, + { + "epoch": 0.5196581196581197, + "grad_norm": 0.7599424123764038, + "learning_rate": 9.622984707954732e-06, + "loss": 1.3220386505126953, + "step": 76 + }, + { + "epoch": 0.5264957264957265, + "grad_norm": 0.7562436461448669, + "learning_rate": 9.608538726664224e-06, + "loss": 1.3605300188064575, + "step": 77 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.7731190919876099, + "learning_rate": 9.593832424868271e-06, + "loss": 1.3461638689041138, + "step": 78 + }, + { + "epoch": 0.5401709401709401, + "grad_norm": 0.7543560266494751, + "learning_rate": 9.578866633275289e-06, + "loss": 1.340885877609253, + "step": 79 + }, + { + "epoch": 0.5470085470085471, + "grad_norm": 0.772647500038147, + "learning_rate": 9.563642197251382e-06, + "loss": 1.3663382530212402, + "step": 80 + }, + { + "epoch": 0.5538461538461539, + "grad_norm": 0.7314751148223877, + "learning_rate": 9.548159976772593e-06, + "loss": 1.3287297487258911, + "step": 81 + }, + { + "epoch": 0.5606837606837607, + "grad_norm": 0.7391103506088257, + "learning_rate": 9.532420846376316e-06, + "loss": 1.3285285234451294, + "step": 82 + }, + { + "epoch": 0.5675213675213675, + "grad_norm": 0.7641813158988953, + "learning_rate": 9.516425695111906e-06, + "loss": 1.3269128799438477, + "step": 83 + }, + { + "epoch": 0.5743589743589743, + "grad_norm": 0.7769819498062134, + "learning_rate": 9.500175426490455e-06, + "loss": 1.3374706506729126, + "step": 84 + }, + { + "epoch": 0.5811965811965812, + "grad_norm": 0.7199158668518066, + "learning_rate": 9.48367095843376e-06, + "loss": 1.3117002248764038, + "step": 85 + }, + { + "epoch": 0.588034188034188, + "grad_norm": 0.7510148882865906, + "learning_rate": 9.466913223222467e-06, + "loss": 1.3387565612792969, + "step": 86 + }, + { + "epoch": 0.5948717948717949, + "grad_norm": 0.7325724363327026, + "learning_rate": 9.449903167443415e-06, + "loss": 1.269672155380249, + "step": 87 + }, + { + "epoch": 0.6017094017094017, + "grad_norm": 0.7675944566726685, + "learning_rate": 9.432641751936162e-06, + "loss": 1.3153454065322876, + "step": 88 + }, + { + "epoch": 0.6017094017094017, + "eval_loss": 1.3318638801574707, + "eval_runtime": 24.6717, + "eval_samples_per_second": 39.965, + "eval_steps_per_second": 5.026, + "step": 88 + }, + { + "epoch": 0.6085470085470085, + "grad_norm": 0.7539426684379578, + "learning_rate": 9.415129951738713e-06, + "loss": 1.378519058227539, + "step": 89 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.7739952802658081, + "learning_rate": 9.397368756032445e-06, + "loss": 1.3163981437683105, + "step": 90 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.7639786005020142, + "learning_rate": 9.379359168086231e-06, + "loss": 1.3244612216949463, + "step": 91 + }, + { + "epoch": 0.629059829059829, + "grad_norm": 0.7307687997817993, + "learning_rate": 9.361102205199762e-06, + "loss": 1.3425580263137817, + "step": 92 + }, + { + "epoch": 0.6358974358974359, + "grad_norm": 0.7326052188873291, + "learning_rate": 9.34259889864609e-06, + "loss": 1.349947452545166, + "step": 93 + }, + { + "epoch": 0.6427350427350428, + "grad_norm": 0.7336087822914124, + "learning_rate": 9.32385029361338e-06, + "loss": 1.3235843181610107, + "step": 94 + }, + { + "epoch": 0.6495726495726496, + "grad_norm": 0.7857178449630737, + "learning_rate": 9.304857449145858e-06, + "loss": 1.29775071144104, + "step": 95 + }, + { + "epoch": 0.6564102564102564, + "grad_norm": 0.7694044709205627, + "learning_rate": 9.285621438083997e-06, + "loss": 1.3575528860092163, + "step": 96 + }, + { + "epoch": 0.6632478632478632, + "grad_norm": 0.7426573634147644, + "learning_rate": 9.26614334700392e-06, + "loss": 1.334963083267212, + "step": 97 + }, + { + "epoch": 0.67008547008547, + "grad_norm": 0.7567334175109863, + "learning_rate": 9.246424276156008e-06, + "loss": 1.335172176361084, + "step": 98 + }, + { + "epoch": 0.676923076923077, + "grad_norm": 0.733529269695282, + "learning_rate": 9.226465339402768e-06, + "loss": 1.3033547401428223, + "step": 99 + }, + { + "epoch": 0.6837606837606838, + "grad_norm": 0.7475197315216064, + "learning_rate": 9.206267664155906e-06, + "loss": 1.316215991973877, + "step": 100 + }, + { + "epoch": 0.6905982905982906, + "grad_norm": 0.7870779633522034, + "learning_rate": 9.185832391312644e-06, + "loss": 1.347679853439331, + "step": 101 + }, + { + "epoch": 0.6974358974358974, + "grad_norm": 0.764722466468811, + "learning_rate": 9.165160675191272e-06, + "loss": 1.305860996246338, + "step": 102 + }, + { + "epoch": 0.7042735042735043, + "grad_norm": 0.7680871486663818, + "learning_rate": 9.144253683465953e-06, + "loss": 1.3211126327514648, + "step": 103 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.734742283821106, + "learning_rate": 9.123112597100759e-06, + "loss": 1.2861220836639404, + "step": 104 + }, + { + "epoch": 0.717948717948718, + "grad_norm": 0.7347426414489746, + "learning_rate": 9.101738610282956e-06, + "loss": 1.315138578414917, + "step": 105 + }, + { + "epoch": 0.7247863247863248, + "grad_norm": 0.7639749646186829, + "learning_rate": 9.080132930355567e-06, + "loss": 1.3426464796066284, + "step": 106 + }, + { + "epoch": 0.7316239316239316, + "grad_norm": 0.7904943227767944, + "learning_rate": 9.058296777749154e-06, + "loss": 1.334005355834961, + "step": 107 + }, + { + "epoch": 0.7384615384615385, + "grad_norm": 0.780296266078949, + "learning_rate": 9.03623138591289e-06, + "loss": 1.3893626928329468, + "step": 108 + }, + { + "epoch": 0.7452991452991453, + "grad_norm": 0.7619044184684753, + "learning_rate": 9.013938001244885e-06, + "loss": 1.3112680912017822, + "step": 109 + }, + { + "epoch": 0.7521367521367521, + "grad_norm": 0.7852951884269714, + "learning_rate": 8.99141788302178e-06, + "loss": 1.3263344764709473, + "step": 110 + }, + { + "epoch": 0.7589743589743589, + "grad_norm": 0.746293306350708, + "learning_rate": 8.968672303327614e-06, + "loss": 1.3137162923812866, + "step": 111 + }, + { + "epoch": 0.7658119658119659, + "grad_norm": 0.7697060704231262, + "learning_rate": 8.94570254698197e-06, + "loss": 1.305846095085144, + "step": 112 + }, + { + "epoch": 0.7726495726495727, + "grad_norm": 0.7505799531936646, + "learning_rate": 8.922509911467395e-06, + "loss": 1.3263046741485596, + "step": 113 + }, + { + "epoch": 0.7794871794871795, + "grad_norm": 0.7378644347190857, + "learning_rate": 8.899095706856122e-06, + "loss": 1.2952595949172974, + "step": 114 + }, + { + "epoch": 0.7863247863247863, + "grad_norm": 0.7393775582313538, + "learning_rate": 8.875461255736055e-06, + "loss": 1.314041018486023, + "step": 115 + }, + { + "epoch": 0.7931623931623931, + "grad_norm": 0.7198286056518555, + "learning_rate": 8.851607893136065e-06, + "loss": 1.301222801208496, + "step": 116 + }, + { + "epoch": 0.8, + "grad_norm": 0.7539902925491333, + "learning_rate": 8.827536966450584e-06, + "loss": 1.3459645509719849, + "step": 117 + }, + { + "epoch": 0.8068376068376069, + "grad_norm": 0.728272020816803, + "learning_rate": 8.803249835363486e-06, + "loss": 1.3075345754623413, + "step": 118 + }, + { + "epoch": 0.8136752136752137, + "grad_norm": 0.7353615164756775, + "learning_rate": 8.778747871771293e-06, + "loss": 1.2967561483383179, + "step": 119 + }, + { + "epoch": 0.8205128205128205, + "grad_norm": 0.7358576655387878, + "learning_rate": 8.754032459705672e-06, + "loss": 1.3145124912261963, + "step": 120 + }, + { + "epoch": 0.8273504273504273, + "grad_norm": 0.7736720442771912, + "learning_rate": 8.729104995255265e-06, + "loss": 1.3146538734436035, + "step": 121 + }, + { + "epoch": 0.8341880341880342, + "grad_norm": 0.7337418794631958, + "learning_rate": 8.703966886486819e-06, + "loss": 1.2823609113693237, + "step": 122 + }, + { + "epoch": 0.841025641025641, + "grad_norm": 0.7514926195144653, + "learning_rate": 8.67861955336566e-06, + "loss": 1.3389618396759033, + "step": 123 + }, + { + "epoch": 0.8478632478632478, + "grad_norm": 0.7190932035446167, + "learning_rate": 8.65306442767547e-06, + "loss": 1.3115108013153076, + "step": 124 + }, + { + "epoch": 0.8547008547008547, + "grad_norm": 0.7332461476325989, + "learning_rate": 8.627302952937431e-06, + "loss": 1.333253264427185, + "step": 125 + }, + { + "epoch": 0.8615384615384616, + "grad_norm": 0.7428878545761108, + "learning_rate": 8.601336584328659e-06, + "loss": 1.3187751770019531, + "step": 126 + }, + { + "epoch": 0.8683760683760684, + "grad_norm": 0.7715012431144714, + "learning_rate": 8.575166788600031e-06, + "loss": 1.3300316333770752, + "step": 127 + }, + { + "epoch": 0.8752136752136752, + "grad_norm": 0.7566640973091125, + "learning_rate": 8.548795043993316e-06, + "loss": 1.307992696762085, + "step": 128 + }, + { + "epoch": 0.882051282051282, + "grad_norm": 0.7760566473007202, + "learning_rate": 8.522222840157687e-06, + "loss": 1.32774817943573, + "step": 129 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.7682384848594666, + "learning_rate": 8.495451678065563e-06, + "loss": 1.3295447826385498, + "step": 130 + }, + { + "epoch": 0.8957264957264958, + "grad_norm": 0.7397897839546204, + "learning_rate": 8.468483069927832e-06, + "loss": 1.3145328760147095, + "step": 131 + }, + { + "epoch": 0.9025641025641026, + "grad_norm": 0.7603890299797058, + "learning_rate": 8.441318539108433e-06, + "loss": 1.3174394369125366, + "step": 132 + }, + { + "epoch": 0.9025641025641026, + "eval_loss": 1.317511796951294, + "eval_runtime": 24.6804, + "eval_samples_per_second": 39.951, + "eval_steps_per_second": 5.024, + "step": 132 + }, + { + "epoch": 0.9094017094017094, + "grad_norm": 0.7623502612113953, + "learning_rate": 8.413959620038306e-06, + "loss": 1.3393348455429077, + "step": 133 + }, + { + "epoch": 0.9162393162393162, + "grad_norm": 0.7669332027435303, + "learning_rate": 8.386407858128707e-06, + "loss": 1.302769660949707, + "step": 134 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.7234067320823669, + "learning_rate": 8.358664809683926e-06, + "loss": 1.3381096124649048, + "step": 135 + }, + { + "epoch": 0.9299145299145299, + "grad_norm": 0.7574735283851624, + "learning_rate": 8.330732041813367e-06, + "loss": 1.335377812385559, + "step": 136 + }, + { + "epoch": 0.9367521367521368, + "grad_norm": 0.7575842142105103, + "learning_rate": 8.302611132343042e-06, + "loss": 1.3330005407333374, + "step": 137 + }, + { + "epoch": 0.9435897435897436, + "grad_norm": 0.7127556800842285, + "learning_rate": 8.274303669726427e-06, + "loss": 1.2971893548965454, + "step": 138 + }, + { + "epoch": 0.9504273504273504, + "grad_norm": 0.8172794580459595, + "learning_rate": 8.245811252954741e-06, + "loss": 1.3225749731063843, + "step": 139 + }, + { + "epoch": 0.9572649572649573, + "grad_norm": 0.7154548764228821, + "learning_rate": 8.217135491466636e-06, + "loss": 1.2955387830734253, + "step": 140 + }, + { + "epoch": 0.9641025641025641, + "grad_norm": 0.7610012888908386, + "learning_rate": 8.18827800505727e-06, + "loss": 1.3369195461273193, + "step": 141 + }, + { + "epoch": 0.9709401709401709, + "grad_norm": 0.7487711906433105, + "learning_rate": 8.15924042378682e-06, + "loss": 1.2916451692581177, + "step": 142 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 0.7546627521514893, + "learning_rate": 8.130024387888402e-06, + "loss": 1.310347318649292, + "step": 143 + }, + { + "epoch": 0.9846153846153847, + "grad_norm": 0.7537707090377808, + "learning_rate": 8.100631547675417e-06, + "loss": 1.3267855644226074, + "step": 144 + }, + { + "epoch": 0.9914529914529915, + "grad_norm": 0.7335416078567505, + "learning_rate": 8.071063563448341e-06, + "loss": 1.2958036661148071, + "step": 145 + }, + { + "epoch": 0.9982905982905983, + "grad_norm": 0.773562490940094, + "learning_rate": 8.041322105400923e-06, + "loss": 1.2804107666015625, + "step": 146 + }, + { + "epoch": 1.0, + "grad_norm": 1.4411433935165405, + "learning_rate": 8.01140885352586e-06, + "loss": 1.3802165985107422, + "step": 147 + }, + { + "epoch": 1.0068376068376068, + "grad_norm": 0.9124190211296082, + "learning_rate": 7.981325497519892e-06, + "loss": 1.2135487794876099, + "step": 148 + }, + { + "epoch": 1.0136752136752136, + "grad_norm": 0.8284032344818115, + "learning_rate": 7.951073736688348e-06, + "loss": 1.1935949325561523, + "step": 149 + }, + { + "epoch": 1.0205128205128204, + "grad_norm": 0.8174305558204651, + "learning_rate": 7.920655279849173e-06, + "loss": 1.2410966157913208, + "step": 150 + }, + { + "epoch": 1.0273504273504273, + "grad_norm": 0.7865321040153503, + "learning_rate": 7.890071845236395e-06, + "loss": 1.2489113807678223, + "step": 151 + }, + { + "epoch": 1.0341880341880343, + "grad_norm": 0.812463104724884, + "learning_rate": 7.859325160403073e-06, + "loss": 1.1999475955963135, + "step": 152 + }, + { + "epoch": 1.041025641025641, + "grad_norm": 0.8780131936073303, + "learning_rate": 7.8284169621237e-06, + "loss": 1.2193069458007812, + "step": 153 + }, + { + "epoch": 1.047863247863248, + "grad_norm": 0.8348581790924072, + "learning_rate": 7.797348996296116e-06, + "loss": 1.1925896406173706, + "step": 154 + }, + { + "epoch": 1.0547008547008547, + "grad_norm": 0.8675538897514343, + "learning_rate": 7.766123017842877e-06, + "loss": 1.2143549919128418, + "step": 155 + }, + { + "epoch": 1.0615384615384615, + "grad_norm": 0.8252431750297546, + "learning_rate": 7.734740790612137e-06, + "loss": 1.2455641031265259, + "step": 156 + }, + { + "epoch": 1.0683760683760684, + "grad_norm": 0.8385781049728394, + "learning_rate": 7.703204087277989e-06, + "loss": 1.2102444171905518, + "step": 157 + }, + { + "epoch": 1.0752136752136752, + "grad_norm": 0.827889084815979, + "learning_rate": 7.671514689240366e-06, + "loss": 1.2144052982330322, + "step": 158 + }, + { + "epoch": 1.082051282051282, + "grad_norm": 0.7633846998214722, + "learning_rate": 7.639674386524395e-06, + "loss": 1.2118767499923706, + "step": 159 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 0.8267090320587158, + "learning_rate": 7.607684977679284e-06, + "loss": 1.188737392425537, + "step": 160 + }, + { + "epoch": 1.0957264957264958, + "grad_norm": 0.8270633816719055, + "learning_rate": 7.575548269676741e-06, + "loss": 1.214994192123413, + "step": 161 + }, + { + "epoch": 1.1025641025641026, + "grad_norm": 0.8160786628723145, + "learning_rate": 7.543266077808893e-06, + "loss": 1.221800446510315, + "step": 162 + }, + { + "epoch": 1.1094017094017095, + "grad_norm": 0.829490065574646, + "learning_rate": 7.510840225585749e-06, + "loss": 1.1974472999572754, + "step": 163 + }, + { + "epoch": 1.1162393162393163, + "grad_norm": 0.8170298933982849, + "learning_rate": 7.478272544632204e-06, + "loss": 1.2150561809539795, + "step": 164 + }, + { + "epoch": 1.123076923076923, + "grad_norm": 0.7731851935386658, + "learning_rate": 7.44556487458456e-06, + "loss": 1.1988686323165894, + "step": 165 + }, + { + "epoch": 1.12991452991453, + "grad_norm": 0.7923320531845093, + "learning_rate": 7.412719062986632e-06, + "loss": 1.2086683511734009, + "step": 166 + }, + { + "epoch": 1.1367521367521367, + "grad_norm": 0.7592716217041016, + "learning_rate": 7.379736965185369e-06, + "loss": 1.215879201889038, + "step": 167 + }, + { + "epoch": 1.1435897435897435, + "grad_norm": 0.7586809396743774, + "learning_rate": 7.3466204442260605e-06, + "loss": 1.2311599254608154, + "step": 168 + }, + { + "epoch": 1.1504273504273503, + "grad_norm": 0.7838971614837646, + "learning_rate": 7.313371370747104e-06, + "loss": 1.2183728218078613, + "step": 169 + }, + { + "epoch": 1.1572649572649572, + "grad_norm": 0.7780983448028564, + "learning_rate": 7.279991622874319e-06, + "loss": 1.1952356100082397, + "step": 170 + }, + { + "epoch": 1.1641025641025642, + "grad_norm": 0.7715050578117371, + "learning_rate": 7.24648308611489e-06, + "loss": 1.2417360544204712, + "step": 171 + }, + { + "epoch": 1.170940170940171, + "grad_norm": 0.7692239880561829, + "learning_rate": 7.212847653250828e-06, + "loss": 1.2170333862304688, + "step": 172 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 0.7896147966384888, + "learning_rate": 7.1790872242320775e-06, + "loss": 1.2121965885162354, + "step": 173 + }, + { + "epoch": 1.1846153846153846, + "grad_norm": 0.8173856139183044, + "learning_rate": 7.145203706069183e-06, + "loss": 1.1911547183990479, + "step": 174 + }, + { + "epoch": 1.1914529914529914, + "grad_norm": 0.7522553205490112, + "learning_rate": 7.1111990127255684e-06, + "loss": 1.210161566734314, + "step": 175 + }, + { + "epoch": 1.1982905982905983, + "grad_norm": 0.7353285551071167, + "learning_rate": 7.0770750650094335e-06, + "loss": 1.1757725477218628, + "step": 176 + }, + { + "epoch": 1.1982905982905983, + "eval_loss": 1.3184372186660767, + "eval_runtime": 24.8388, + "eval_samples_per_second": 39.696, + "eval_steps_per_second": 4.992, + "step": 176 + }, + { + "epoch": 1.205128205128205, + "grad_norm": 0.7701054811477661, + "learning_rate": 7.042833790465241e-06, + "loss": 1.2243812084197998, + "step": 177 + }, + { + "epoch": 1.2119658119658119, + "grad_norm": 0.7278676629066467, + "learning_rate": 7.008477123264849e-06, + "loss": 1.198972463607788, + "step": 178 + }, + { + "epoch": 1.218803418803419, + "grad_norm": 0.7595424056053162, + "learning_rate": 6.974007004098243e-06, + "loss": 1.2435779571533203, + "step": 179 + }, + { + "epoch": 1.2256410256410257, + "grad_norm": 0.7661744952201843, + "learning_rate": 6.939425380063924e-06, + "loss": 1.2413814067840576, + "step": 180 + }, + { + "epoch": 1.2324786324786325, + "grad_norm": 0.7790281176567078, + "learning_rate": 6.9047342045589224e-06, + "loss": 1.1771953105926514, + "step": 181 + }, + { + "epoch": 1.2393162393162394, + "grad_norm": 0.7655471563339233, + "learning_rate": 6.869935437168449e-06, + "loss": 1.203190565109253, + "step": 182 + }, + { + "epoch": 1.2461538461538462, + "grad_norm": 0.784903347492218, + "learning_rate": 6.835031043555211e-06, + "loss": 1.2171598672866821, + "step": 183 + }, + { + "epoch": 1.252991452991453, + "grad_norm": 0.7539082765579224, + "learning_rate": 6.800022995348381e-06, + "loss": 1.2139626741409302, + "step": 184 + }, + { + "epoch": 1.2598290598290598, + "grad_norm": 0.7623985409736633, + "learning_rate": 6.76491327003222e-06, + "loss": 1.2187587022781372, + "step": 185 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.7418251037597656, + "learning_rate": 6.729703850834381e-06, + "loss": 1.2088682651519775, + "step": 186 + }, + { + "epoch": 1.2735042735042734, + "grad_norm": 0.7652315497398376, + "learning_rate": 6.694396726613883e-06, + "loss": 1.2204537391662598, + "step": 187 + }, + { + "epoch": 1.2803418803418802, + "grad_norm": 0.7618216872215271, + "learning_rate": 6.65899389174876e-06, + "loss": 1.220557451248169, + "step": 188 + }, + { + "epoch": 1.287179487179487, + "grad_norm": 0.774918794631958, + "learning_rate": 6.6234973460234184e-06, + "loss": 1.238166093826294, + "step": 189 + }, + { + "epoch": 1.294017094017094, + "grad_norm": 0.7822843790054321, + "learning_rate": 6.587909094515663e-06, + "loss": 1.2424533367156982, + "step": 190 + }, + { + "epoch": 1.300854700854701, + "grad_norm": 0.7934525012969971, + "learning_rate": 6.552231147483448e-06, + "loss": 1.1982380151748657, + "step": 191 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 0.7817178964614868, + "learning_rate": 6.5164655202513135e-06, + "loss": 1.205663800239563, + "step": 192 + }, + { + "epoch": 1.3145299145299145, + "grad_norm": 0.8002380728721619, + "learning_rate": 6.480614233096558e-06, + "loss": 1.1866426467895508, + "step": 193 + }, + { + "epoch": 1.3213675213675213, + "grad_norm": 0.7488191723823547, + "learning_rate": 6.444679311135112e-06, + "loss": 1.2407163381576538, + "step": 194 + }, + { + "epoch": 1.3282051282051281, + "grad_norm": 0.8069729208946228, + "learning_rate": 6.408662784207149e-06, + "loss": 1.2296785116195679, + "step": 195 + }, + { + "epoch": 1.335042735042735, + "grad_norm": 0.8026877641677856, + "learning_rate": 6.372566686762427e-06, + "loss": 1.228287696838379, + "step": 196 + }, + { + "epoch": 1.341880341880342, + "grad_norm": 0.7794991731643677, + "learning_rate": 6.336393057745365e-06, + "loss": 1.2325451374053955, + "step": 197 + }, + { + "epoch": 1.3487179487179488, + "grad_norm": 0.7851534485816956, + "learning_rate": 6.300143940479881e-06, + "loss": 1.2433525323867798, + "step": 198 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 0.7642512321472168, + "learning_rate": 6.2638213825539595e-06, + "loss": 1.2330515384674072, + "step": 199 + }, + { + "epoch": 1.3623931623931624, + "grad_norm": 0.8071786165237427, + "learning_rate": 6.227427435703997e-06, + "loss": 1.2169106006622314, + "step": 200 + }, + { + "epoch": 1.3692307692307693, + "grad_norm": 0.7421261668205261, + "learning_rate": 6.190964155698903e-06, + "loss": 1.1981184482574463, + "step": 201 + }, + { + "epoch": 1.376068376068376, + "grad_norm": 0.7663130760192871, + "learning_rate": 6.154433602223979e-06, + "loss": 1.184199333190918, + "step": 202 + }, + { + "epoch": 1.3829059829059829, + "grad_norm": 0.778105616569519, + "learning_rate": 6.117837838764579e-06, + "loss": 1.1941637992858887, + "step": 203 + }, + { + "epoch": 1.3897435897435897, + "grad_norm": 0.7876622676849365, + "learning_rate": 6.0811789324895365e-06, + "loss": 1.1943039894104004, + "step": 204 + }, + { + "epoch": 1.3965811965811965, + "grad_norm": 0.7890434861183167, + "learning_rate": 6.044458954134411e-06, + "loss": 1.1947365999221802, + "step": 205 + }, + { + "epoch": 1.4034188034188033, + "grad_norm": 0.7558045387268066, + "learning_rate": 6.0076799778845105e-06, + "loss": 1.1994682550430298, + "step": 206 + }, + { + "epoch": 1.4102564102564101, + "grad_norm": 0.7472313046455383, + "learning_rate": 5.970844081257734e-06, + "loss": 1.210819959640503, + "step": 207 + }, + { + "epoch": 1.4170940170940172, + "grad_norm": 0.7487971782684326, + "learning_rate": 5.933953344987215e-06, + "loss": 1.1884093284606934, + "step": 208 + }, + { + "epoch": 1.423931623931624, + "grad_norm": 0.7524631023406982, + "learning_rate": 5.897009852903792e-06, + "loss": 1.2101268768310547, + "step": 209 + }, + { + "epoch": 1.4307692307692308, + "grad_norm": 0.7583618760108948, + "learning_rate": 5.860015691818292e-06, + "loss": 1.214969515800476, + "step": 210 + }, + { + "epoch": 1.4376068376068376, + "grad_norm": 0.7619627118110657, + "learning_rate": 5.82297295140367e-06, + "loss": 1.1723865270614624, + "step": 211 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.782787024974823, + "learning_rate": 5.78588372407695e-06, + "loss": 1.2125704288482666, + "step": 212 + }, + { + "epoch": 1.4512820512820512, + "grad_norm": 0.7758169174194336, + "learning_rate": 5.748750104881051e-06, + "loss": 1.219278335571289, + "step": 213 + }, + { + "epoch": 1.458119658119658, + "grad_norm": 0.7914722561836243, + "learning_rate": 5.711574191366427e-06, + "loss": 1.2299978733062744, + "step": 214 + }, + { + "epoch": 1.464957264957265, + "grad_norm": 0.7562519907951355, + "learning_rate": 5.674358083472598e-06, + "loss": 1.1945183277130127, + "step": 215 + }, + { + "epoch": 1.471794871794872, + "grad_norm": 0.7890987396240234, + "learning_rate": 5.637103883409525e-06, + "loss": 1.228225827217102, + "step": 216 + }, + { + "epoch": 1.4786324786324787, + "grad_norm": 0.7438657879829407, + "learning_rate": 5.599813695538866e-06, + "loss": 1.1812902688980103, + "step": 217 + }, + { + "epoch": 1.4854700854700855, + "grad_norm": 0.7696713805198669, + "learning_rate": 5.562489626255104e-06, + "loss": 1.2277076244354248, + "step": 218 + }, + { + "epoch": 1.4923076923076923, + "grad_norm": 0.8019750714302063, + "learning_rate": 5.52513378386657e-06, + "loss": 1.2309683561325073, + "step": 219 + }, + { + "epoch": 1.4991452991452991, + "grad_norm": 0.7668002247810364, + "learning_rate": 5.487748278476342e-06, + "loss": 1.2046821117401123, + "step": 220 + }, + { + "epoch": 1.4991452991452991, + "eval_loss": 1.3131194114685059, + "eval_runtime": 24.7008, + "eval_samples_per_second": 39.918, + "eval_steps_per_second": 5.02, + "step": 220 + }, + { + "epoch": 1.505982905982906, + "grad_norm": 0.7732208967208862, + "learning_rate": 5.450335221863068e-06, + "loss": 1.2219358682632446, + "step": 221 + }, + { + "epoch": 1.5128205128205128, + "grad_norm": 0.7456432580947876, + "learning_rate": 5.412896727361663e-06, + "loss": 1.2196807861328125, + "step": 222 + }, + { + "epoch": 1.5196581196581196, + "grad_norm": 0.7411943674087524, + "learning_rate": 5.375434909743942e-06, + "loss": 1.2303682565689087, + "step": 223 + }, + { + "epoch": 1.5264957264957264, + "grad_norm": 0.7763144373893738, + "learning_rate": 5.337951885099167e-06, + "loss": 1.188888669013977, + "step": 224 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.8138889074325562, + "learning_rate": 5.300449770714502e-06, + "loss": 1.1965391635894775, + "step": 225 + }, + { + "epoch": 1.54017094017094, + "grad_norm": 0.7770660519599915, + "learning_rate": 5.262930684955439e-06, + "loss": 1.233127474784851, + "step": 226 + }, + { + "epoch": 1.547008547008547, + "grad_norm": 0.7718791961669922, + "learning_rate": 5.225396747146112e-06, + "loss": 1.240120768547058, + "step": 227 + }, + { + "epoch": 1.5538461538461539, + "grad_norm": 0.7710370421409607, + "learning_rate": 5.187850077449604e-06, + "loss": 1.202008605003357, + "step": 228 + }, + { + "epoch": 1.5606837606837607, + "grad_norm": 0.7775757908821106, + "learning_rate": 5.150292796748174e-06, + "loss": 1.2269346714019775, + "step": 229 + }, + { + "epoch": 1.5675213675213675, + "grad_norm": 0.7479456067085266, + "learning_rate": 5.112727026523461e-06, + "loss": 1.1906824111938477, + "step": 230 + }, + { + "epoch": 1.5743589743589743, + "grad_norm": 0.7567362189292908, + "learning_rate": 5.075154888736653e-06, + "loss": 1.1966190338134766, + "step": 231 + }, + { + "epoch": 1.5811965811965814, + "grad_norm": 0.7536229491233826, + "learning_rate": 5.03757850570861e-06, + "loss": 1.1917792558670044, + "step": 232 + }, + { + "epoch": 1.5880341880341882, + "grad_norm": 0.7776764035224915, + "learning_rate": 5e-06, + "loss": 1.1941741704940796, + "step": 233 + }, + { + "epoch": 1.594871794871795, + "grad_norm": 0.7667071223258972, + "learning_rate": 4.9624214942913916e-06, + "loss": 1.1881437301635742, + "step": 234 + }, + { + "epoch": 1.6017094017094018, + "grad_norm": 0.773404061794281, + "learning_rate": 4.924845111263349e-06, + "loss": 1.2190567255020142, + "step": 235 + }, + { + "epoch": 1.6085470085470086, + "grad_norm": 0.7392263412475586, + "learning_rate": 4.88727297347654e-06, + "loss": 1.2026817798614502, + "step": 236 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 0.7713451981544495, + "learning_rate": 4.8497072032518274e-06, + "loss": 1.2358677387237549, + "step": 237 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 0.7625684142112732, + "learning_rate": 4.8121499225503974e-06, + "loss": 1.1716538667678833, + "step": 238 + }, + { + "epoch": 1.629059829059829, + "grad_norm": 0.7581425309181213, + "learning_rate": 4.774603252853889e-06, + "loss": 1.1988354921340942, + "step": 239 + }, + { + "epoch": 1.6358974358974359, + "grad_norm": 0.751584529876709, + "learning_rate": 4.737069315044562e-06, + "loss": 1.2101967334747314, + "step": 240 + }, + { + "epoch": 1.6427350427350427, + "grad_norm": 0.7554129362106323, + "learning_rate": 4.699550229285499e-06, + "loss": 1.202675223350525, + "step": 241 + }, + { + "epoch": 1.6495726495726495, + "grad_norm": 0.761131227016449, + "learning_rate": 4.662048114900837e-06, + "loss": 1.201820731163025, + "step": 242 + }, + { + "epoch": 1.6564102564102563, + "grad_norm": 0.7265458703041077, + "learning_rate": 4.624565090256059e-06, + "loss": 1.2179176807403564, + "step": 243 + }, + { + "epoch": 1.6632478632478631, + "grad_norm": 0.767880916595459, + "learning_rate": 4.587103272638339e-06, + "loss": 1.1769942045211792, + "step": 244 + }, + { + "epoch": 1.67008547008547, + "grad_norm": 0.7633269429206848, + "learning_rate": 4.549664778136933e-06, + "loss": 1.2298530340194702, + "step": 245 + }, + { + "epoch": 1.676923076923077, + "grad_norm": 0.7275070548057556, + "learning_rate": 4.512251721523659e-06, + "loss": 1.2158825397491455, + "step": 246 + }, + { + "epoch": 1.6837606837606838, + "grad_norm": 0.7592760920524597, + "learning_rate": 4.4748662161334335e-06, + "loss": 1.207166314125061, + "step": 247 + }, + { + "epoch": 1.6905982905982906, + "grad_norm": 0.7778440713882446, + "learning_rate": 4.437510373744897e-06, + "loss": 1.2096598148345947, + "step": 248 + }, + { + "epoch": 1.6974358974358974, + "grad_norm": 0.7637122869491577, + "learning_rate": 4.400186304461136e-06, + "loss": 1.1851915121078491, + "step": 249 + }, + { + "epoch": 1.7042735042735044, + "grad_norm": 0.7784591317176819, + "learning_rate": 4.362896116590475e-06, + "loss": 1.2293877601623535, + "step": 250 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 0.8099437355995178, + "learning_rate": 4.325641916527405e-06, + "loss": 1.2101249694824219, + "step": 251 + }, + { + "epoch": 1.717948717948718, + "grad_norm": 0.7552655339241028, + "learning_rate": 4.2884258086335755e-06, + "loss": 1.2240850925445557, + "step": 252 + }, + { + "epoch": 1.7247863247863249, + "grad_norm": 0.7730560898780823, + "learning_rate": 4.25124989511895e-06, + "loss": 1.2249057292938232, + "step": 253 + }, + { + "epoch": 1.7316239316239317, + "grad_norm": 0.7381757497787476, + "learning_rate": 4.214116275923051e-06, + "loss": 1.1832340955734253, + "step": 254 + }, + { + "epoch": 1.7384615384615385, + "grad_norm": 0.739567756652832, + "learning_rate": 4.17702704859633e-06, + "loss": 1.200039267539978, + "step": 255 + }, + { + "epoch": 1.7452991452991453, + "grad_norm": 0.774598240852356, + "learning_rate": 4.1399843081817085e-06, + "loss": 1.2123297452926636, + "step": 256 + }, + { + "epoch": 1.7521367521367521, + "grad_norm": 0.8052539229393005, + "learning_rate": 4.1029901470962105e-06, + "loss": 1.2242088317871094, + "step": 257 + }, + { + "epoch": 1.758974358974359, + "grad_norm": 0.7723326683044434, + "learning_rate": 4.066046655012786e-06, + "loss": 1.2281506061553955, + "step": 258 + }, + { + "epoch": 1.7658119658119658, + "grad_norm": 0.7577686309814453, + "learning_rate": 4.029155918742268e-06, + "loss": 1.2183786630630493, + "step": 259 + }, + { + "epoch": 1.7726495726495726, + "grad_norm": 0.7814478278160095, + "learning_rate": 3.992320022115492e-06, + "loss": 1.2138553857803345, + "step": 260 + }, + { + "epoch": 1.7794871794871794, + "grad_norm": 0.7868865132331848, + "learning_rate": 3.955541045865591e-06, + "loss": 1.1890326738357544, + "step": 261 + }, + { + "epoch": 1.7863247863247862, + "grad_norm": 0.7574802041053772, + "learning_rate": 3.918821067510464e-06, + "loss": 1.1699459552764893, + "step": 262 + }, + { + "epoch": 1.793162393162393, + "grad_norm": 0.7787984013557434, + "learning_rate": 3.882162161235421e-06, + "loss": 1.1902029514312744, + "step": 263 + }, + { + "epoch": 1.8, + "grad_norm": 0.780857264995575, + "learning_rate": 3.845566397776022e-06, + "loss": 1.1960508823394775, + "step": 264 + }, + { + "epoch": 1.8, + "eval_loss": 1.308773159980774, + "eval_runtime": 24.5858, + "eval_samples_per_second": 40.104, + "eval_steps_per_second": 5.044, + "step": 264 + }, + { + "epoch": 1.8068376068376069, + "grad_norm": 0.7353282570838928, + "learning_rate": 3.8090358443010993e-06, + "loss": 1.2238385677337646, + "step": 265 + }, + { + "epoch": 1.8136752136752137, + "grad_norm": 0.7844496369361877, + "learning_rate": 3.7725725642960047e-06, + "loss": 1.2065067291259766, + "step": 266 + }, + { + "epoch": 1.8205128205128205, + "grad_norm": 0.7792806029319763, + "learning_rate": 3.7361786174460414e-06, + "loss": 1.1908563375473022, + "step": 267 + }, + { + "epoch": 1.8273504273504273, + "grad_norm": 0.7404017448425293, + "learning_rate": 3.6998560595201188e-06, + "loss": 1.2162412405014038, + "step": 268 + }, + { + "epoch": 1.8341880341880343, + "grad_norm": 0.7953075170516968, + "learning_rate": 3.6636069422546363e-06, + "loss": 1.2134095430374146, + "step": 269 + }, + { + "epoch": 1.8410256410256411, + "grad_norm": 0.7584754824638367, + "learning_rate": 3.627433313237576e-06, + "loss": 1.2177472114562988, + "step": 270 + }, + { + "epoch": 1.847863247863248, + "grad_norm": 0.7290381789207458, + "learning_rate": 3.5913372157928515e-06, + "loss": 1.189732551574707, + "step": 271 + }, + { + "epoch": 1.8547008547008548, + "grad_norm": 0.7861201763153076, + "learning_rate": 3.555320688864889e-06, + "loss": 1.2073522806167603, + "step": 272 + }, + { + "epoch": 1.8615384615384616, + "grad_norm": 0.7544710636138916, + "learning_rate": 3.519385766903442e-06, + "loss": 1.2041759490966797, + "step": 273 + }, + { + "epoch": 1.8683760683760684, + "grad_norm": 0.7539916038513184, + "learning_rate": 3.483534479748688e-06, + "loss": 1.2057629823684692, + "step": 274 + }, + { + "epoch": 1.8752136752136752, + "grad_norm": 0.7374740242958069, + "learning_rate": 3.447768852516554e-06, + "loss": 1.2203168869018555, + "step": 275 + }, + { + "epoch": 1.882051282051282, + "grad_norm": 0.7594785690307617, + "learning_rate": 3.4120909054843375e-06, + "loss": 1.182802438735962, + "step": 276 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.7542571425437927, + "learning_rate": 3.3765026539765832e-06, + "loss": 1.2168110609054565, + "step": 277 + }, + { + "epoch": 1.8957264957264957, + "grad_norm": 0.7577287554740906, + "learning_rate": 3.3410061082512422e-06, + "loss": 1.2106308937072754, + "step": 278 + }, + { + "epoch": 1.9025641025641025, + "grad_norm": 0.7561420798301697, + "learning_rate": 3.3056032733861188e-06, + "loss": 1.20242440700531, + "step": 279 + }, + { + "epoch": 1.9094017094017093, + "grad_norm": 0.7456007599830627, + "learning_rate": 3.2702961491656197e-06, + "loss": 1.2251598834991455, + "step": 280 + }, + { + "epoch": 1.916239316239316, + "grad_norm": 0.790366530418396, + "learning_rate": 3.2350867299677802e-06, + "loss": 1.2062650918960571, + "step": 281 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 0.7317772507667542, + "learning_rate": 3.1999770046516198e-06, + "loss": 1.1729378700256348, + "step": 282 + }, + { + "epoch": 1.92991452991453, + "grad_norm": 0.7773919105529785, + "learning_rate": 3.164968956444791e-06, + "loss": 1.1983883380889893, + "step": 283 + }, + { + "epoch": 1.9367521367521368, + "grad_norm": 0.7585593461990356, + "learning_rate": 3.130064562831553e-06, + "loss": 1.2086600065231323, + "step": 284 + }, + { + "epoch": 1.9435897435897436, + "grad_norm": 0.7703876495361328, + "learning_rate": 3.0952657954410792e-06, + "loss": 1.2189124822616577, + "step": 285 + }, + { + "epoch": 1.9504273504273504, + "grad_norm": 0.7693601250648499, + "learning_rate": 3.0605746199360755e-06, + "loss": 1.210176706314087, + "step": 286 + }, + { + "epoch": 1.9572649572649574, + "grad_norm": 0.7466776967048645, + "learning_rate": 3.0259929959017585e-06, + "loss": 1.2027801275253296, + "step": 287 + }, + { + "epoch": 1.9641025641025642, + "grad_norm": 0.772388219833374, + "learning_rate": 2.991522876735154e-06, + "loss": 1.2112243175506592, + "step": 288 + }, + { + "epoch": 1.970940170940171, + "grad_norm": 0.7715580463409424, + "learning_rate": 2.95716620953476e-06, + "loss": 1.1904889345169067, + "step": 289 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 0.7397588491439819, + "learning_rate": 2.9229249349905686e-06, + "loss": 1.1913639307022095, + "step": 290 + }, + { + "epoch": 1.9846153846153847, + "grad_norm": 0.7530134916305542, + "learning_rate": 2.8888009872744332e-06, + "loss": 1.2205219268798828, + "step": 291 + }, + { + "epoch": 1.9914529914529915, + "grad_norm": 0.7689472436904907, + "learning_rate": 2.8547962939308187e-06, + "loss": 1.2000938653945923, + "step": 292 + }, + { + "epoch": 1.9982905982905983, + "grad_norm": 0.7348621487617493, + "learning_rate": 2.8209127757679246e-06, + "loss": 1.1786831617355347, + "step": 293 + }, + { + "epoch": 2.0, + "grad_norm": 1.537250280380249, + "learning_rate": 2.787152346749173e-06, + "loss": 1.1778086423873901, + "step": 294 + }, + { + "epoch": 2.006837606837607, + "grad_norm": 0.9093112945556641, + "learning_rate": 2.7535169138851124e-06, + "loss": 1.1308534145355225, + "step": 295 + }, + { + "epoch": 2.0136752136752136, + "grad_norm": 0.895119845867157, + "learning_rate": 2.720008377125682e-06, + "loss": 1.1030248403549194, + "step": 296 + }, + { + "epoch": 2.0205128205128204, + "grad_norm": 0.822189211845398, + "learning_rate": 2.686628629252899e-06, + "loss": 1.0862432718276978, + "step": 297 + }, + { + "epoch": 2.0273504273504273, + "grad_norm": 0.839640200138092, + "learning_rate": 2.6533795557739407e-06, + "loss": 1.0923850536346436, + "step": 298 + }, + { + "epoch": 2.034188034188034, + "grad_norm": 0.7948157787322998, + "learning_rate": 2.6202630348146323e-06, + "loss": 1.1080037355422974, + "step": 299 + }, + { + "epoch": 2.041025641025641, + "grad_norm": 0.7708576321601868, + "learning_rate": 2.5872809370133704e-06, + "loss": 1.133652687072754, + "step": 300 + }, + { + "epoch": 2.0478632478632477, + "grad_norm": 0.784568727016449, + "learning_rate": 2.5544351254154407e-06, + "loss": 1.1596778631210327, + "step": 301 + }, + { + "epoch": 2.0547008547008545, + "grad_norm": 0.8119481205940247, + "learning_rate": 2.5217274553677975e-06, + "loss": 1.129364252090454, + "step": 302 + }, + { + "epoch": 2.0615384615384613, + "grad_norm": 0.7969528436660767, + "learning_rate": 2.489159774414252e-06, + "loss": 1.0949797630310059, + "step": 303 + }, + { + "epoch": 2.0683760683760686, + "grad_norm": 0.823360800743103, + "learning_rate": 2.4567339221911086e-06, + "loss": 1.1301119327545166, + "step": 304 + }, + { + "epoch": 2.0752136752136754, + "grad_norm": 0.8292282223701477, + "learning_rate": 2.424451730323261e-06, + "loss": 1.1120922565460205, + "step": 305 + }, + { + "epoch": 2.082051282051282, + "grad_norm": 0.8004986047744751, + "learning_rate": 2.3923150223207176e-06, + "loss": 1.1214550733566284, + "step": 306 + }, + { + "epoch": 2.088888888888889, + "grad_norm": 0.8165397644042969, + "learning_rate": 2.3603256134756066e-06, + "loss": 1.1209532022476196, + "step": 307 + }, + { + "epoch": 2.095726495726496, + "grad_norm": 0.8034455180168152, + "learning_rate": 2.328485310759635e-06, + "loss": 1.1401094198226929, + "step": 308 + }, + { + "epoch": 2.095726495726496, + "eval_loss": 1.3253560066223145, + "eval_runtime": 24.6122, + "eval_samples_per_second": 40.061, + "eval_steps_per_second": 5.038, + "step": 308 + }, + { + "epoch": 2.1025641025641026, + "grad_norm": 0.7844864130020142, + "learning_rate": 2.296795912722014e-06, + "loss": 1.144791603088379, + "step": 309 + }, + { + "epoch": 2.1094017094017095, + "grad_norm": 0.7857894897460938, + "learning_rate": 2.265259209387867e-06, + "loss": 1.1488922834396362, + "step": 310 + }, + { + "epoch": 2.1162393162393163, + "grad_norm": 0.7851693630218506, + "learning_rate": 2.2338769821571225e-06, + "loss": 1.1399354934692383, + "step": 311 + }, + { + "epoch": 2.123076923076923, + "grad_norm": 0.8227202296257019, + "learning_rate": 2.202651003703885e-06, + "loss": 1.1063587665557861, + "step": 312 + }, + { + "epoch": 2.12991452991453, + "grad_norm": 0.822938084602356, + "learning_rate": 2.1715830378763025e-06, + "loss": 1.1050540208816528, + "step": 313 + }, + { + "epoch": 2.1367521367521367, + "grad_norm": 0.8058551549911499, + "learning_rate": 2.140674839596931e-06, + "loss": 1.0922585725784302, + "step": 314 + }, + { + "epoch": 2.1435897435897435, + "grad_norm": 0.7917458415031433, + "learning_rate": 2.109928154763606e-06, + "loss": 1.1247828006744385, + "step": 315 + }, + { + "epoch": 2.1504273504273503, + "grad_norm": 0.8290326595306396, + "learning_rate": 2.0793447201508288e-06, + "loss": 1.1369386911392212, + "step": 316 + }, + { + "epoch": 2.157264957264957, + "grad_norm": 0.7832273840904236, + "learning_rate": 2.0489262633116536e-06, + "loss": 1.110697627067566, + "step": 317 + }, + { + "epoch": 2.164102564102564, + "grad_norm": 0.7919285297393799, + "learning_rate": 2.01867450248011e-06, + "loss": 1.157274842262268, + "step": 318 + }, + { + "epoch": 2.1709401709401708, + "grad_norm": 0.7776212096214294, + "learning_rate": 1.9885911464741413e-06, + "loss": 1.139618992805481, + "step": 319 + }, + { + "epoch": 2.1777777777777776, + "grad_norm": 0.7800706624984741, + "learning_rate": 1.9586778945990785e-06, + "loss": 1.1110671758651733, + "step": 320 + }, + { + "epoch": 2.184615384615385, + "grad_norm": 0.8117327094078064, + "learning_rate": 1.928936436551661e-06, + "loss": 1.1395684480667114, + "step": 321 + }, + { + "epoch": 2.1914529914529917, + "grad_norm": 0.7962910532951355, + "learning_rate": 1.8993684523245842e-06, + "loss": 1.1162846088409424, + "step": 322 + }, + { + "epoch": 2.1982905982905985, + "grad_norm": 0.7874794602394104, + "learning_rate": 1.8699756121115997e-06, + "loss": 1.1188956499099731, + "step": 323 + }, + { + "epoch": 2.2051282051282053, + "grad_norm": 0.785068690776825, + "learning_rate": 1.8407595762131814e-06, + "loss": 1.1131058931350708, + "step": 324 + }, + { + "epoch": 2.211965811965812, + "grad_norm": 0.8046601414680481, + "learning_rate": 1.811721994942731e-06, + "loss": 1.1231977939605713, + "step": 325 + }, + { + "epoch": 2.218803418803419, + "grad_norm": 0.759477972984314, + "learning_rate": 1.7828645085333645e-06, + "loss": 1.1036738157272339, + "step": 326 + }, + { + "epoch": 2.2256410256410257, + "grad_norm": 0.7955328226089478, + "learning_rate": 1.7541887470452606e-06, + "loss": 1.166395664215088, + "step": 327 + }, + { + "epoch": 2.2324786324786325, + "grad_norm": 0.7807881236076355, + "learning_rate": 1.7256963302735752e-06, + "loss": 1.1385221481323242, + "step": 328 + }, + { + "epoch": 2.2393162393162394, + "grad_norm": 0.7881447076797485, + "learning_rate": 1.6973888676569594e-06, + "loss": 1.145586609840393, + "step": 329 + }, + { + "epoch": 2.246153846153846, + "grad_norm": 0.8092402815818787, + "learning_rate": 1.6692679581866334e-06, + "loss": 1.1422295570373535, + "step": 330 + }, + { + "epoch": 2.252991452991453, + "grad_norm": 0.7870088219642639, + "learning_rate": 1.6413351903160763e-06, + "loss": 1.1302958726882935, + "step": 331 + }, + { + "epoch": 2.25982905982906, + "grad_norm": 0.8018279075622559, + "learning_rate": 1.6135921418712959e-06, + "loss": 1.114201545715332, + "step": 332 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 0.7955658435821533, + "learning_rate": 1.5860403799616951e-06, + "loss": 1.1686758995056152, + "step": 333 + }, + { + "epoch": 2.2735042735042734, + "grad_norm": 0.8098942637443542, + "learning_rate": 1.5586814608915673e-06, + "loss": 1.1103954315185547, + "step": 334 + }, + { + "epoch": 2.2803418803418802, + "grad_norm": 0.7653470039367676, + "learning_rate": 1.5315169300721694e-06, + "loss": 1.1263670921325684, + "step": 335 + }, + { + "epoch": 2.287179487179487, + "grad_norm": 0.7954714894294739, + "learning_rate": 1.5045483219344387e-06, + "loss": 1.091448187828064, + "step": 336 + }, + { + "epoch": 2.294017094017094, + "grad_norm": 0.7870411276817322, + "learning_rate": 1.4777771598423147e-06, + "loss": 1.127175211906433, + "step": 337 + }, + { + "epoch": 2.3008547008547007, + "grad_norm": 0.8070060014724731, + "learning_rate": 1.4512049560066837e-06, + "loss": 1.1385235786437988, + "step": 338 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.7654244303703308, + "learning_rate": 1.4248332113999708e-06, + "loss": 1.1272555589675903, + "step": 339 + }, + { + "epoch": 2.3145299145299143, + "grad_norm": 0.7763322591781616, + "learning_rate": 1.3986634156713418e-06, + "loss": 1.1271766424179077, + "step": 340 + }, + { + "epoch": 2.3213675213675216, + "grad_norm": 0.7544705867767334, + "learning_rate": 1.3726970470625705e-06, + "loss": 1.157515525817871, + "step": 341 + }, + { + "epoch": 2.3282051282051284, + "grad_norm": 0.7676778435707092, + "learning_rate": 1.3469355723245303e-06, + "loss": 1.1277141571044922, + "step": 342 + }, + { + "epoch": 2.335042735042735, + "grad_norm": 0.7713337540626526, + "learning_rate": 1.321380446634342e-06, + "loss": 1.1003583669662476, + "step": 343 + }, + { + "epoch": 2.341880341880342, + "grad_norm": 0.7740820646286011, + "learning_rate": 1.2960331135131826e-06, + "loss": 1.1071029901504517, + "step": 344 + }, + { + "epoch": 2.348717948717949, + "grad_norm": 0.758073091506958, + "learning_rate": 1.270895004744737e-06, + "loss": 1.110722303390503, + "step": 345 + }, + { + "epoch": 2.3555555555555556, + "grad_norm": 0.7693141102790833, + "learning_rate": 1.245967540294329e-06, + "loss": 1.097144365310669, + "step": 346 + }, + { + "epoch": 2.3623931623931624, + "grad_norm": 0.7613301873207092, + "learning_rate": 1.2212521282287093e-06, + "loss": 1.130142092704773, + "step": 347 + }, + { + "epoch": 2.3692307692307693, + "grad_norm": 0.7610928416252136, + "learning_rate": 1.1967501646365147e-06, + "loss": 1.1337437629699707, + "step": 348 + }, + { + "epoch": 2.376068376068376, + "grad_norm": 0.7692887187004089, + "learning_rate": 1.172463033549418e-06, + "loss": 1.1064190864562988, + "step": 349 + }, + { + "epoch": 2.382905982905983, + "grad_norm": 0.7826989889144897, + "learning_rate": 1.1483921068639353e-06, + "loss": 1.1885005235671997, + "step": 350 + }, + { + "epoch": 2.3897435897435897, + "grad_norm": 0.7613060474395752, + "learning_rate": 1.1245387442639456e-06, + "loss": 1.110337734222412, + "step": 351 + }, + { + "epoch": 2.3965811965811965, + "grad_norm": 0.7910706400871277, + "learning_rate": 1.1009042931438784e-06, + "loss": 1.1144278049468994, + "step": 352 + }, + { + "epoch": 2.3965811965811965, + "eval_loss": 1.323965311050415, + "eval_runtime": 24.7109, + "eval_samples_per_second": 39.901, + "eval_steps_per_second": 5.018, + "step": 352 + }, + { + "epoch": 2.4034188034188033, + "grad_norm": 0.7570564150810242, + "learning_rate": 1.077490088532605e-06, + "loss": 1.114471435546875, + "step": 353 + }, + { + "epoch": 2.41025641025641, + "grad_norm": 0.7983273863792419, + "learning_rate": 1.0542974530180327e-06, + "loss": 1.132286787033081, + "step": 354 + }, + { + "epoch": 2.417094017094017, + "grad_norm": 0.7606459856033325, + "learning_rate": 1.0313276966723867e-06, + "loss": 1.0865505933761597, + "step": 355 + }, + { + "epoch": 2.4239316239316238, + "grad_norm": 0.7879711389541626, + "learning_rate": 1.00858211697822e-06, + "loss": 1.1440324783325195, + "step": 356 + }, + { + "epoch": 2.430769230769231, + "grad_norm": 0.762718915939331, + "learning_rate": 9.860619987551157e-07, + "loss": 1.1018445491790771, + "step": 357 + }, + { + "epoch": 2.437606837606838, + "grad_norm": 0.7899941802024841, + "learning_rate": 9.637686140871121e-07, + "loss": 1.1469783782958984, + "step": 358 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.7909042239189148, + "learning_rate": 9.417032222508476e-07, + "loss": 1.1333407163619995, + "step": 359 + }, + { + "epoch": 2.4512820512820515, + "grad_norm": 0.7936816811561584, + "learning_rate": 9.198670696444339e-07, + "loss": 1.1438573598861694, + "step": 360 + }, + { + "epoch": 2.4581196581196583, + "grad_norm": 0.7882561683654785, + "learning_rate": 8.982613897170439e-07, + "loss": 1.1176822185516357, + "step": 361 + }, + { + "epoch": 2.464957264957265, + "grad_norm": 0.7810674905776978, + "learning_rate": 8.768874028992431e-07, + "loss": 1.135961651802063, + "step": 362 + }, + { + "epoch": 2.471794871794872, + "grad_norm": 0.7794176340103149, + "learning_rate": 8.557463165340479e-07, + "loss": 1.1315698623657227, + "step": 363 + }, + { + "epoch": 2.4786324786324787, + "grad_norm": 0.7674309611320496, + "learning_rate": 8.348393248087289e-07, + "loss": 1.1471264362335205, + "step": 364 + }, + { + "epoch": 2.4854700854700855, + "grad_norm": 0.7684411406517029, + "learning_rate": 8.141676086873574e-07, + "loss": 1.1023811101913452, + "step": 365 + }, + { + "epoch": 2.4923076923076923, + "grad_norm": 0.7729819416999817, + "learning_rate": 7.937323358440935e-07, + "loss": 1.1146825551986694, + "step": 366 + }, + { + "epoch": 2.499145299145299, + "grad_norm": 0.7710589170455933, + "learning_rate": 7.735346605972322e-07, + "loss": 1.1076273918151855, + "step": 367 + }, + { + "epoch": 2.505982905982906, + "grad_norm": 0.7700541019439697, + "learning_rate": 7.535757238439939e-07, + "loss": 1.1303023099899292, + "step": 368 + }, + { + "epoch": 2.5128205128205128, + "grad_norm": 0.7796255946159363, + "learning_rate": 7.338566529960817e-07, + "loss": 1.1434168815612793, + "step": 369 + }, + { + "epoch": 2.5196581196581196, + "grad_norm": 0.7890748977661133, + "learning_rate": 7.143785619160026e-07, + "loss": 1.137059211730957, + "step": 370 + }, + { + "epoch": 2.5264957264957264, + "grad_norm": 0.7733116149902344, + "learning_rate": 6.951425508541432e-07, + "loss": 1.1050790548324585, + "step": 371 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 0.7718008160591125, + "learning_rate": 6.761497063866207e-07, + "loss": 1.1239290237426758, + "step": 372 + }, + { + "epoch": 2.54017094017094, + "grad_norm": 0.7675129771232605, + "learning_rate": 6.574011013539111e-07, + "loss": 1.1362709999084473, + "step": 373 + }, + { + "epoch": 2.547008547008547, + "grad_norm": 0.7831134796142578, + "learning_rate": 6.388977948002406e-07, + "loss": 1.1359511613845825, + "step": 374 + }, + { + "epoch": 2.5538461538461537, + "grad_norm": 0.7688263654708862, + "learning_rate": 6.206408319137703e-07, + "loss": 1.1311153173446655, + "step": 375 + }, + { + "epoch": 2.5606837606837605, + "grad_norm": 0.7608706951141357, + "learning_rate": 6.026312439675553e-07, + "loss": 1.1158239841461182, + "step": 376 + }, + { + "epoch": 2.5675213675213673, + "grad_norm": 0.7655665278434753, + "learning_rate": 5.848700482612873e-07, + "loss": 1.1498501300811768, + "step": 377 + }, + { + "epoch": 2.574358974358974, + "grad_norm": 0.7795934081077576, + "learning_rate": 5.673582480638395e-07, + "loss": 1.1341049671173096, + "step": 378 + }, + { + "epoch": 2.5811965811965814, + "grad_norm": 0.7773811221122742, + "learning_rate": 5.500968325565859e-07, + "loss": 1.1404979228973389, + "step": 379 + }, + { + "epoch": 2.588034188034188, + "grad_norm": 0.8611118793487549, + "learning_rate": 5.330867767775333e-07, + "loss": 1.0921636819839478, + "step": 380 + }, + { + "epoch": 2.594871794871795, + "grad_norm": 0.745428204536438, + "learning_rate": 5.163290415662408e-07, + "loss": 1.1557259559631348, + "step": 381 + }, + { + "epoch": 2.601709401709402, + "grad_norm": 0.7756429314613342, + "learning_rate": 4.998245735095459e-07, + "loss": 1.1447691917419434, + "step": 382 + }, + { + "epoch": 2.6085470085470086, + "grad_norm": 0.7908133864402771, + "learning_rate": 4.835743048880959e-07, + "loss": 1.143109917640686, + "step": 383 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 0.7732424736022949, + "learning_rate": 4.6757915362368567e-07, + "loss": 1.132035493850708, + "step": 384 + }, + { + "epoch": 2.6222222222222222, + "grad_norm": 0.7889422178268433, + "learning_rate": 4.5184002322740784e-07, + "loss": 1.1180846691131592, + "step": 385 + }, + { + "epoch": 2.629059829059829, + "grad_norm": 0.7938551902770996, + "learning_rate": 4.363578027486187e-07, + "loss": 1.1456289291381836, + "step": 386 + }, + { + "epoch": 2.635897435897436, + "grad_norm": 0.8030667901039124, + "learning_rate": 4.211333667247125e-07, + "loss": 1.1397569179534912, + "step": 387 + }, + { + "epoch": 2.6427350427350427, + "grad_norm": 0.7819530367851257, + "learning_rate": 4.0616757513173123e-07, + "loss": 1.1004501581192017, + "step": 388 + }, + { + "epoch": 2.6495726495726495, + "grad_norm": 0.758314311504364, + "learning_rate": 3.9146127333577757e-07, + "loss": 1.1101858615875244, + "step": 389 + }, + { + "epoch": 2.6564102564102563, + "grad_norm": 0.7801131010055542, + "learning_rate": 3.7701529204526856e-07, + "loss": 1.1453076601028442, + "step": 390 + }, + { + "epoch": 2.663247863247863, + "grad_norm": 0.7489244937896729, + "learning_rate": 3.6283044726401594e-07, + "loss": 1.0911612510681152, + "step": 391 + }, + { + "epoch": 2.67008547008547, + "grad_norm": 0.761225700378418, + "learning_rate": 3.4890754024512254e-07, + "loss": 1.130741000175476, + "step": 392 + }, + { + "epoch": 2.676923076923077, + "grad_norm": 0.761887788772583, + "learning_rate": 3.352473574457304e-07, + "loss": 1.120837926864624, + "step": 393 + }, + { + "epoch": 2.683760683760684, + "grad_norm": 0.7792303562164307, + "learning_rate": 3.2185067048259245e-07, + "loss": 1.1177864074707031, + "step": 394 + }, + { + "epoch": 2.690598290598291, + "grad_norm": 0.7689954042434692, + "learning_rate": 3.087182360884872e-07, + "loss": 1.177292823791504, + "step": 395 + }, + { + "epoch": 2.6974358974358976, + "grad_norm": 0.7710866332054138, + "learning_rate": 2.9585079606947843e-07, + "loss": 1.1195672750473022, + "step": 396 + }, + { + "epoch": 2.6974358974358976, + "eval_loss": 1.3236175775527954, + "eval_runtime": 24.7082, + "eval_samples_per_second": 39.906, + "eval_steps_per_second": 5.019, + "step": 396 + }, + { + "epoch": 2.7042735042735044, + "grad_norm": 0.7776737809181213, + "learning_rate": 2.8324907726300366e-07, + "loss": 1.113619327545166, + "step": 397 + }, + { + "epoch": 2.7111111111111112, + "grad_norm": 0.7743112444877625, + "learning_rate": 2.7091379149682683e-07, + "loss": 1.0938081741333008, + "step": 398 + }, + { + "epoch": 2.717948717948718, + "grad_norm": 0.7779694199562073, + "learning_rate": 2.5884563554882336e-07, + "loss": 1.1138122081756592, + "step": 399 + }, + { + "epoch": 2.724786324786325, + "grad_norm": 0.7622742652893066, + "learning_rate": 2.470452911076227e-07, + "loss": 1.1006677150726318, + "step": 400 + }, + { + "epoch": 2.7316239316239317, + "grad_norm": 0.7664272785186768, + "learning_rate": 2.355134247341073e-07, + "loss": 1.1065200567245483, + "step": 401 + }, + { + "epoch": 2.7384615384615385, + "grad_norm": 0.7712447643280029, + "learning_rate": 2.242506878237538e-07, + "loss": 1.1020417213439941, + "step": 402 + }, + { + "epoch": 2.7452991452991453, + "grad_norm": 0.7656382322311401, + "learning_rate": 2.1325771656984075e-07, + "loss": 1.1001569032669067, + "step": 403 + }, + { + "epoch": 2.752136752136752, + "grad_norm": 0.7811654806137085, + "learning_rate": 2.0253513192751374e-07, + "loss": 1.1310510635375977, + "step": 404 + }, + { + "epoch": 2.758974358974359, + "grad_norm": 0.7687283158302307, + "learning_rate": 1.9208353957870684e-07, + "loss": 1.146543264389038, + "step": 405 + }, + { + "epoch": 2.7658119658119658, + "grad_norm": 0.7670867443084717, + "learning_rate": 1.8190352989793325e-07, + "loss": 1.1161731481552124, + "step": 406 + }, + { + "epoch": 2.7726495726495726, + "grad_norm": 0.7807978391647339, + "learning_rate": 1.7199567791893524e-07, + "loss": 1.1282137632369995, + "step": 407 + }, + { + "epoch": 2.7794871794871794, + "grad_norm": 0.7957569360733032, + "learning_rate": 1.6236054330219853e-07, + "loss": 1.1041632890701294, + "step": 408 + }, + { + "epoch": 2.786324786324786, + "grad_norm": 0.7832216024398804, + "learning_rate": 1.5299867030334815e-07, + "loss": 1.108730435371399, + "step": 409 + }, + { + "epoch": 2.793162393162393, + "grad_norm": 0.753606915473938, + "learning_rate": 1.439105877423963e-07, + "loss": 1.131809115409851, + "step": 410 + }, + { + "epoch": 2.8, + "grad_norm": 0.7802961468696594, + "learning_rate": 1.350968089738758e-07, + "loss": 1.1083602905273438, + "step": 411 + }, + { + "epoch": 2.8068376068376066, + "grad_norm": 0.768670380115509, + "learning_rate": 1.2655783185784253e-07, + "loss": 1.1080389022827148, + "step": 412 + }, + { + "epoch": 2.8136752136752134, + "grad_norm": 0.7562652230262756, + "learning_rate": 1.1829413873174988e-07, + "loss": 1.1086317300796509, + "step": 413 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 0.763107180595398, + "learning_rate": 1.1030619638320805e-07, + "loss": 1.1433099508285522, + "step": 414 + }, + { + "epoch": 2.827350427350427, + "grad_norm": 0.7749531865119934, + "learning_rate": 1.0259445602361084e-07, + "loss": 1.129563331604004, + "step": 415 + }, + { + "epoch": 2.8341880341880343, + "grad_norm": 0.7604458928108215, + "learning_rate": 9.51593532626538e-08, + "loss": 1.120940089225769, + "step": 416 + }, + { + "epoch": 2.841025641025641, + "grad_norm": 0.750518262386322, + "learning_rate": 8.800130808372553e-08, + "loss": 1.0916835069656372, + "step": 417 + }, + { + "epoch": 2.847863247863248, + "grad_norm": 0.7595433592796326, + "learning_rate": 8.11207248201834e-08, + "loss": 1.1178152561187744, + "step": 418 + }, + { + "epoch": 2.8547008547008548, + "grad_norm": 0.7640005350112915, + "learning_rate": 7.45179921325162e-08, + "loss": 1.1630092859268188, + "step": 419 + }, + { + "epoch": 2.8615384615384616, + "grad_norm": 0.8447228074073792, + "learning_rate": 6.819348298638839e-08, + "loss": 1.1273298263549805, + "step": 420 + }, + { + "epoch": 2.8683760683760684, + "grad_norm": 0.7577494978904724, + "learning_rate": 6.214755463157417e-08, + "loss": 1.0993590354919434, + "step": 421 + }, + { + "epoch": 2.875213675213675, + "grad_norm": 0.7751004099845886, + "learning_rate": 5.638054858177644e-08, + "loss": 1.1498969793319702, + "step": 422 + }, + { + "epoch": 2.882051282051282, + "grad_norm": 0.7662968039512634, + "learning_rate": 5.089279059533658e-08, + "loss": 1.1176806688308716, + "step": 423 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.7827076315879822, + "learning_rate": 4.568459065683206e-08, + "loss": 1.1449580192565918, + "step": 424 + }, + { + "epoch": 2.8957264957264957, + "grad_norm": 0.7646909952163696, + "learning_rate": 4.0756242959567596e-08, + "loss": 1.1186950206756592, + "step": 425 + }, + { + "epoch": 2.9025641025641025, + "grad_norm": 0.7541195154190063, + "learning_rate": 3.610802588895845e-08, + "loss": 1.131952166557312, + "step": 426 + }, + { + "epoch": 2.9094017094017093, + "grad_norm": 0.7776208519935608, + "learning_rate": 3.1740202006804166e-08, + "loss": 1.1178792715072632, + "step": 427 + }, + { + "epoch": 2.916239316239316, + "grad_norm": 0.7766209244728088, + "learning_rate": 2.765301803645426e-08, + "loss": 1.1331486701965332, + "step": 428 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 0.7666369676589966, + "learning_rate": 2.3846704848878298e-08, + "loss": 1.1589261293411255, + "step": 429 + }, + { + "epoch": 2.92991452991453, + "grad_norm": 0.7775545716285706, + "learning_rate": 2.0321477449619098e-08, + "loss": 1.1344677209854126, + "step": 430 + }, + { + "epoch": 2.936752136752137, + "grad_norm": 0.7537861466407776, + "learning_rate": 1.7077534966650767e-08, + "loss": 1.1040513515472412, + "step": 431 + }, + { + "epoch": 2.943589743589744, + "grad_norm": 0.7825785875320435, + "learning_rate": 1.411506063912882e-08, + "loss": 1.1581734418869019, + "step": 432 + }, + { + "epoch": 2.9504273504273506, + "grad_norm": 0.7491230368614197, + "learning_rate": 1.1434221807041234e-08, + "loss": 1.1113041639328003, + "step": 433 + }, + { + "epoch": 2.9572649572649574, + "grad_norm": 0.7601305842399597, + "learning_rate": 9.035169901754902e-09, + "loss": 1.0998278856277466, + "step": 434 + }, + { + "epoch": 2.9641025641025642, + "grad_norm": 0.7869414687156677, + "learning_rate": 6.918040437463025e-09, + "loss": 1.1475398540496826, + "step": 435 + }, + { + "epoch": 2.970940170940171, + "grad_norm": 0.760128915309906, + "learning_rate": 5.082953003528457e-09, + "loss": 1.1517993211746216, + "step": 436 + }, + { + "epoch": 2.977777777777778, + "grad_norm": 0.7626367211341858, + "learning_rate": 3.530011257730226e-09, + "loss": 1.1134616136550903, + "step": 437 + }, + { + "epoch": 2.9846153846153847, + "grad_norm": 0.765670657157898, + "learning_rate": 2.2593029204076578e-09, + "loss": 1.1342540979385376, + "step": 438 + }, + { + "epoch": 2.9914529914529915, + "grad_norm": 0.7739811539649963, + "learning_rate": 1.2708997695043412e-09, + "loss": 1.1077520847320557, + "step": 439 + }, + { + "epoch": 2.9982905982905983, + "grad_norm": 0.7707903385162354, + "learning_rate": 5.648576365169245e-10, + "loss": 1.0939933061599731, + "step": 440 + }, + { + "epoch": 2.9982905982905983, + "eval_loss": 1.3233778476715088, + "eval_runtime": 24.6851, + "eval_samples_per_second": 39.943, + "eval_steps_per_second": 5.023, + "step": 440 + }, + { + "epoch": 3.0, + "grad_norm": 1.5993366241455078, + "learning_rate": 1.4121640333653042e-10, + "loss": 1.0642163753509521, + "step": 441 + }, + { + "epoch": 3.0, + "step": 441, + "total_flos": 9.743300044908134e+17, + "train_loss": 1.2459275746832088, + "train_runtime": 6646.3979, + "train_samples_per_second": 8.449, + "train_steps_per_second": 0.066 + } + ], + "logging_steps": 1.0, + "max_steps": 441, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 44, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.743300044908134e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}