{ "best_global_step": 4220, "best_metric": 4.378113269805908, "best_model_checkpoint": null, "epoch": 17.77251184834123, "eval_steps": 500, "global_step": 7500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002369668246445498, "grad_norm": 20.34552001953125, "learning_rate": 0.0, "loss": 10.3309, "step": 1 }, { "epoch": 1.0, "grad_norm": 1.0193040370941162, "learning_rate": 8.42e-05, "loss": 5.762, "step": 422 }, { "epoch": 1.0, "eval_loss": 4.781050205230713, "eval_runtime": 21.3156, "eval_samples_per_second": 1292.106, "eval_steps_per_second": 5.067, "step": 422 }, { "epoch": 2.0, "grad_norm": 0.6101345419883728, "learning_rate": 9.566919191919192e-05, "loss": 4.6298, "step": 844 }, { "epoch": 2.0, "eval_loss": 4.602915287017822, "eval_runtime": 22.4738, "eval_samples_per_second": 1225.514, "eval_steps_per_second": 4.806, "step": 844 }, { "epoch": 3.0, "grad_norm": 0.5042136907577515, "learning_rate": 9.03409090909091e-05, "loss": 4.5095, "step": 1266 }, { "epoch": 3.0, "eval_loss": 4.522061347961426, "eval_runtime": 31.9704, "eval_samples_per_second": 861.483, "eval_steps_per_second": 3.378, "step": 1266 }, { "epoch": 4.0, "grad_norm": 0.5082475543022156, "learning_rate": 8.501262626262628e-05, "loss": 4.4388, "step": 1688 }, { "epoch": 4.0, "eval_loss": 4.47515869140625, "eval_runtime": 31.848, "eval_samples_per_second": 864.796, "eval_steps_per_second": 3.391, "step": 1688 }, { "epoch": 5.0, "grad_norm": 0.5105661749839783, "learning_rate": 7.968434343434343e-05, "loss": 4.3851, "step": 2110 }, { "epoch": 5.0, "eval_loss": 4.442058563232422, "eval_runtime": 32.0508, "eval_samples_per_second": 859.323, "eval_steps_per_second": 3.37, "step": 2110 }, { "epoch": 6.0, "grad_norm": 0.5460625886917114, "learning_rate": 7.435606060606061e-05, "loss": 4.3391, "step": 2532 }, { "epoch": 6.0, "eval_loss": 4.4180145263671875, "eval_runtime": 30.8915, "eval_samples_per_second": 891.572, "eval_steps_per_second": 3.496, "step": 2532 }, { "epoch": 7.0, "grad_norm": 0.5497872233390808, "learning_rate": 6.902777777777779e-05, "loss": 4.2975, "step": 2954 }, { "epoch": 7.0, "eval_loss": 4.39995813369751, "eval_runtime": 32.8624, "eval_samples_per_second": 838.1, "eval_steps_per_second": 3.286, "step": 2954 }, { "epoch": 8.0, "grad_norm": 0.5838669538497925, "learning_rate": 6.369949494949495e-05, "loss": 4.258, "step": 3376 }, { "epoch": 8.0, "eval_loss": 4.387345790863037, "eval_runtime": 30.4389, "eval_samples_per_second": 904.829, "eval_steps_per_second": 3.548, "step": 3376 }, { "epoch": 9.0, "grad_norm": 0.6011605262756348, "learning_rate": 5.837121212121213e-05, "loss": 4.2193, "step": 3798 }, { "epoch": 9.0, "eval_loss": 4.380561351776123, "eval_runtime": 31.2627, "eval_samples_per_second": 880.986, "eval_steps_per_second": 3.455, "step": 3798 }, { "epoch": 10.0, "grad_norm": 0.6363890171051025, "learning_rate": 5.30429292929293e-05, "loss": 4.1807, "step": 4220 }, { "epoch": 10.0, "eval_loss": 4.378113269805908, "eval_runtime": 31.7839, "eval_samples_per_second": 866.54, "eval_steps_per_second": 3.398, "step": 4220 }, { "epoch": 11.0, "grad_norm": 0.663550078868866, "learning_rate": 4.771464646464647e-05, "loss": 4.1418, "step": 4642 }, { "epoch": 11.0, "eval_loss": 4.379039764404297, "eval_runtime": 29.9854, "eval_samples_per_second": 918.515, "eval_steps_per_second": 3.602, "step": 4642 }, { "epoch": 12.0, "grad_norm": 0.744036078453064, "learning_rate": 4.238636363636364e-05, "loss": 4.1029, "step": 5064 }, { "epoch": 12.0, "eval_loss": 4.383896350860596, "eval_runtime": 25.563, "eval_samples_per_second": 1077.418, "eval_steps_per_second": 4.225, "step": 5064 }, { "epoch": 13.0, "grad_norm": 0.8011893630027771, "learning_rate": 3.705808080808081e-05, "loss": 4.0643, "step": 5486 }, { "epoch": 13.0, "eval_loss": 4.391408920288086, "eval_runtime": 30.7558, "eval_samples_per_second": 895.506, "eval_steps_per_second": 3.512, "step": 5486 }, { "epoch": 14.0, "grad_norm": 0.8544487357139587, "learning_rate": 3.172979797979798e-05, "loss": 4.0259, "step": 5908 }, { "epoch": 14.0, "eval_loss": 4.40255880355835, "eval_runtime": 31.2899, "eval_samples_per_second": 880.219, "eval_steps_per_second": 3.452, "step": 5908 }, { "epoch": 15.0, "grad_norm": 0.9503664970397949, "learning_rate": 2.6401515151515155e-05, "loss": 3.9896, "step": 6330 }, { "epoch": 15.0, "eval_loss": 4.4131035804748535, "eval_runtime": 21.2356, "eval_samples_per_second": 1296.974, "eval_steps_per_second": 5.086, "step": 6330 }, { "epoch": 16.0, "grad_norm": 1.0018138885498047, "learning_rate": 2.1073232323232324e-05, "loss": 3.9551, "step": 6752 }, { "epoch": 16.0, "eval_loss": 4.427628040313721, "eval_runtime": 21.2661, "eval_samples_per_second": 1295.115, "eval_steps_per_second": 5.079, "step": 6752 }, { "epoch": 17.0, "grad_norm": 1.0533849000930786, "learning_rate": 1.5744949494949496e-05, "loss": 3.9231, "step": 7174 }, { "epoch": 17.0, "eval_loss": 4.4376349449157715, "eval_runtime": 21.2661, "eval_samples_per_second": 1295.114, "eval_steps_per_second": 5.079, "step": 7174 } ], "logging_steps": 500, "max_steps": 8440, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.25410180939776e+17, "train_batch_size": 256, "trial_name": null, "trial_params": null }