{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 24.881889763779526, "eval_steps": 25, "global_step": 175, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.6299212598425197, "grad_norm": 5.632317132462053, "learning_rate": 2.222222222222222e-06, "loss": 0.5498, "step": 5 }, { "epoch": 1.3779527559055118, "grad_norm": 2.7309533902724237, "learning_rate": 5e-06, "loss": 0.5156, "step": 10 }, { "epoch": 2.1259842519685037, "grad_norm": 1.2157903681282713, "learning_rate": 7.77777777777778e-06, "loss": 0.1575, "step": 15 }, { "epoch": 2.7559055118110236, "grad_norm": 0.4902690514544274, "learning_rate": 9.998999018714264e-06, "loss": 0.0449, "step": 20 }, { "epoch": 3.5039370078740157, "grad_norm": 0.4746290087026016, "learning_rate": 9.964006738212574e-06, "loss": 0.0385, "step": 25 }, { "epoch": 3.5039370078740157, "eval_loss": 0.049805961549282074, "eval_runtime": 3.8791, "eval_samples_per_second": 7.476, "eval_steps_per_second": 3.867, "step": 25 }, { "epoch": 4.251968503937007, "grad_norm": 0.2525712314895859, "learning_rate": 9.879365458117678e-06, "loss": 0.0247, "step": 30 }, { "epoch": 4.881889763779528, "grad_norm": 0.31313562383003646, "learning_rate": 9.745921743533653e-06, "loss": 0.0157, "step": 35 }, { "epoch": 5.6299212598425195, "grad_norm": 0.21515126665369913, "learning_rate": 9.565010271724353e-06, "loss": 0.0125, "step": 40 }, { "epoch": 6.377952755905512, "grad_norm": 0.34914670734943754, "learning_rate": 9.338440482939146e-06, "loss": 0.0069, "step": 45 }, { "epoch": 7.125984251968504, "grad_norm": 0.14477627700807097, "learning_rate": 9.068478482754532e-06, "loss": 0.0047, "step": 50 }, { "epoch": 7.125984251968504, "eval_loss": 0.05980484187602997, "eval_runtime": 3.7853, "eval_samples_per_second": 7.661, "eval_steps_per_second": 3.963, "step": 50 }, { "epoch": 7.755905511811024, "grad_norm": 0.3479903618887373, "learning_rate": 8.757824376940748e-06, "loss": 0.0019, "step": 55 }, { "epoch": 8.503937007874015, "grad_norm": 0.5106959959750706, "learning_rate": 8.409585265545509e-06, "loss": 0.0023, "step": 60 }, { "epoch": 9.251968503937007, "grad_norm": 0.14853492351867037, "learning_rate": 8.027244166302641e-06, "loss": 0.002, "step": 65 }, { "epoch": 9.881889763779528, "grad_norm": 0.16542609484128554, "learning_rate": 7.614625178187402e-06, "loss": 0.0016, "step": 70 }, { "epoch": 10.62992125984252, "grad_norm": 0.1706483384230398, "learning_rate": 7.175855233545669e-06, "loss": 0.001, "step": 75 }, { "epoch": 10.62992125984252, "eval_loss": 0.06815612316131592, "eval_runtime": 3.9081, "eval_samples_per_second": 7.42, "eval_steps_per_second": 3.838, "step": 75 }, { "epoch": 11.377952755905511, "grad_norm": 0.12734977176199208, "learning_rate": 6.715322821344495e-06, "loss": 0.0007, "step": 80 }, { "epoch": 12.125984251968504, "grad_norm": 0.055045406105188224, "learning_rate": 6.237634094385814e-06, "loss": 0.0004, "step": 85 }, { "epoch": 12.755905511811024, "grad_norm": 0.24933655723456083, "learning_rate": 5.7475667994901316e-06, "loss": 0.0002, "step": 90 }, { "epoch": 13.503937007874015, "grad_norm": 0.08486520999135673, "learning_rate": 5.250022491431259e-06, "loss": 0.0002, "step": 95 }, { "epoch": 14.251968503937007, "grad_norm": 0.03626525027130318, "learning_rate": 4.749977508568742e-06, "loss": 0.0002, "step": 100 }, { "epoch": 14.251968503937007, "eval_loss": 0.08573012053966522, "eval_runtime": 3.9129, "eval_samples_per_second": 7.411, "eval_steps_per_second": 3.833, "step": 100 }, { "epoch": 14.881889763779528, "grad_norm": 0.005000292928153053, "learning_rate": 4.252433200509869e-06, "loss": 0.0001, "step": 105 }, { "epoch": 15.62992125984252, "grad_norm": 0.0070081305200095844, "learning_rate": 3.762365905614187e-06, "loss": 0.0001, "step": 110 }, { "epoch": 16.37795275590551, "grad_norm": 0.023387552639621518, "learning_rate": 3.2846771786555075e-06, "loss": 0.0001, "step": 115 }, { "epoch": 17.125984251968504, "grad_norm": 0.008314297434967715, "learning_rate": 2.824144766454333e-06, "loss": 0.0, "step": 120 }, { "epoch": 17.755905511811022, "grad_norm": 0.0028401414939199423, "learning_rate": 2.3853748218126e-06, "loss": 0.0, "step": 125 }, { "epoch": 17.755905511811022, "eval_loss": 0.08820342272520065, "eval_runtime": 3.889, "eval_samples_per_second": 7.457, "eval_steps_per_second": 3.857, "step": 125 }, { "epoch": 18.503937007874015, "grad_norm": 0.002171915506295924, "learning_rate": 1.9727558336973594e-06, "loss": 0.0, "step": 130 }, { "epoch": 19.251968503937007, "grad_norm": 0.002247395436703256, "learning_rate": 1.5904147344544928e-06, "loss": 0.0, "step": 135 }, { "epoch": 19.881889763779526, "grad_norm": 0.020207725975457932, "learning_rate": 1.2421756230592535e-06, "loss": 0.0, "step": 140 }, { "epoch": 20.62992125984252, "grad_norm": 0.002114011246814878, "learning_rate": 9.315215172454689e-07, "loss": 0.0, "step": 145 }, { "epoch": 21.37795275590551, "grad_norm": 0.0025524711331419707, "learning_rate": 6.615595170608541e-07, "loss": 0.0, "step": 150 }, { "epoch": 21.37795275590551, "eval_loss": 0.09042119234800339, "eval_runtime": 3.8398, "eval_samples_per_second": 7.552, "eval_steps_per_second": 3.906, "step": 150 }, { "epoch": 22.125984251968504, "grad_norm": 0.003056620821824751, "learning_rate": 4.349897282756488e-07, "loss": 0.0, "step": 155 }, { "epoch": 22.755905511811022, "grad_norm": 0.0022376804480244176, "learning_rate": 2.54078256466348e-07, "loss": 0.0, "step": 160 }, { "epoch": 23.503937007874015, "grad_norm": 0.0023153643544619597, "learning_rate": 1.206345418823235e-07, "loss": 0.0, "step": 165 }, { "epoch": 24.251968503937007, "grad_norm": 0.0024824470278455323, "learning_rate": 3.599326178742535e-08, "loss": 0.0, "step": 170 }, { "epoch": 24.881889763779526, "grad_norm": 0.002034088863587326, "learning_rate": 1.0009812857370016e-09, "loss": 0.0, "step": 175 }, { "epoch": 24.881889763779526, "eval_loss": 0.0903918594121933, "eval_runtime": 3.9075, "eval_samples_per_second": 7.422, "eval_steps_per_second": 3.839, "step": 175 }, { "epoch": 24.881889763779526, "step": 175, "total_flos": 8801054883840.0, "train_loss": 0.039481030342618136, "train_runtime": 3513.9873, "train_samples_per_second": 1.807, "train_steps_per_second": 0.05 } ], "logging_steps": 5, "max_steps": 175, "num_input_tokens_seen": 0, "num_train_epochs": 25, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8801054883840.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }