{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 500, "global_step": 259, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.13513513513513514, "grad_norm": 5.259959182216627, "learning_rate": 6.153846153846155e-06, "loss": 0.3116, "loss_nan_ranks": 0, "loss_rank_avg": 0.09571799635887146, "step": 5, "valid_targets_mean": 6433.7, "valid_targets_min": 577 }, { "epoch": 0.2702702702702703, "grad_norm": 1.3423309952029923, "learning_rate": 1.3846153846153847e-05, "loss": 0.2587, "loss_nan_ranks": 0, "loss_rank_avg": 0.07112158834934235, "step": 10, "valid_targets_mean": 7327.8, "valid_targets_min": 413 }, { "epoch": 0.40540540540540543, "grad_norm": 0.503106602146357, "learning_rate": 2.153846153846154e-05, "loss": 0.2116, "loss_nan_ranks": 0, "loss_rank_avg": 0.06387270987033844, "step": 15, "valid_targets_mean": 7716.1, "valid_targets_min": 701 }, { "epoch": 0.5405405405405406, "grad_norm": 0.43730375431686763, "learning_rate": 2.923076923076923e-05, "loss": 0.188, "loss_nan_ranks": 0, "loss_rank_avg": 0.057055432349443436, "step": 20, "valid_targets_mean": 6165.1, "valid_targets_min": 2120 }, { "epoch": 0.6756756756756757, "grad_norm": 0.23457297766166457, "learning_rate": 3.692307692307693e-05, "loss": 0.1511, "loss_nan_ranks": 0, "loss_rank_avg": 0.04511473327875137, "step": 25, "valid_targets_mean": 6698.5, "valid_targets_min": 579 }, { "epoch": 0.8108108108108109, "grad_norm": 0.22580079812560477, "learning_rate": 3.998364045590232e-05, "loss": 0.1382, "loss_nan_ranks": 0, "loss_rank_avg": 0.04277999699115753, "step": 30, "valid_targets_mean": 5174.9, "valid_targets_min": 340 }, { "epoch": 0.9459459459459459, "grad_norm": 0.15302847392142027, "learning_rate": 3.988376236895231e-05, "loss": 0.1277, "loss_nan_ranks": 0, "loss_rank_avg": 0.03875000774860382, "step": 35, "valid_targets_mean": 6631.1, "valid_targets_min": 431 }, { "epoch": 1.0810810810810811, "grad_norm": 0.14846277384715484, "learning_rate": 3.969354804762473e-05, "loss": 0.1171, "loss_nan_ranks": 0, "loss_rank_avg": 0.035136736929416656, "step": 40, "valid_targets_mean": 5226.1, "valid_targets_min": 427 }, { "epoch": 1.2162162162162162, "grad_norm": 0.17362666626644624, "learning_rate": 3.9413861676735034e-05, "loss": 0.1152, "loss_nan_ranks": 0, "loss_rank_avg": 0.035750940442085266, "step": 45, "valid_targets_mean": 6524.1, "valid_targets_min": 497 }, { "epoch": 1.3513513513513513, "grad_norm": 0.13178939301683926, "learning_rate": 3.9045973931977495e-05, "loss": 0.1102, "loss_nan_ranks": 0, "loss_rank_avg": 0.036801472306251526, "step": 50, "valid_targets_mean": 6943.8, "valid_targets_min": 2015 }, { "epoch": 1.4864864864864864, "grad_norm": 0.18185896796674594, "learning_rate": 3.8591556206970594e-05, "loss": 0.106, "loss_nan_ranks": 0, "loss_rank_avg": 0.03439167141914368, "step": 55, "valid_targets_mean": 5706.4, "valid_targets_min": 453 }, { "epoch": 1.6216216216216215, "grad_norm": 0.1385453505062919, "learning_rate": 3.805267301975424e-05, "loss": 0.1059, "loss_nan_ranks": 0, "loss_rank_avg": 0.035251468420028687, "step": 60, "valid_targets_mean": 6418.2, "valid_targets_min": 1363 }, { "epoch": 1.7567567567567568, "grad_norm": 0.13076540307986736, "learning_rate": 3.743177263323758e-05, "loss": 0.1042, "loss_nan_ranks": 0, "loss_rank_avg": 0.03142703324556351, "step": 65, "valid_targets_mean": 5221.7, "valid_targets_min": 577 }, { "epoch": 1.8918918918918919, "grad_norm": 0.13340506373409342, "learning_rate": 3.673167593221097e-05, "loss": 0.0995, "loss_nan_ranks": 0, "loss_rank_avg": 0.034315336495637894, "step": 70, "valid_targets_mean": 6568.5, "valid_targets_min": 723 }, { "epoch": 2.027027027027027, "grad_norm": 0.1410479342684041, "learning_rate": 3.5955563607456025e-05, "loss": 0.0975, "loss_nan_ranks": 0, "loss_rank_avg": 0.030078843235969543, "step": 75, "valid_targets_mean": 5617.2, "valid_targets_min": 387 }, { "epoch": 2.1621621621621623, "grad_norm": 0.14495862504665655, "learning_rate": 3.510696170517927e-05, "loss": 0.0949, "loss_nan_ranks": 0, "loss_rank_avg": 0.028750576078891754, "step": 80, "valid_targets_mean": 5150.5, "valid_targets_min": 427 }, { "epoch": 2.2972972972972974, "grad_norm": 0.14212400520351012, "learning_rate": 3.418972560742133e-05, "loss": 0.0905, "loss_nan_ranks": 0, "loss_rank_avg": 0.0317218042910099, "step": 85, "valid_targets_mean": 6628.6, "valid_targets_min": 883 }, { "epoch": 2.4324324324324325, "grad_norm": 0.1337772062885725, "learning_rate": 3.3208022516222195e-05, "loss": 0.091, "loss_nan_ranks": 0, "loss_rank_avg": 0.0257179643958807, "step": 90, "valid_targets_mean": 6095.6, "valid_targets_min": 436 }, { "epoch": 2.5675675675675675, "grad_norm": 0.17276086488015027, "learning_rate": 3.2166312521120775e-05, "loss": 0.0876, "loss_nan_ranks": 0, "loss_rank_avg": 0.030876662582159042, "step": 95, "valid_targets_mean": 6510.9, "valid_targets_min": 1635 }, { "epoch": 2.7027027027027026, "grad_norm": 0.1480157679597937, "learning_rate": 3.106932833600314e-05, "loss": 0.0865, "loss_nan_ranks": 0, "loss_rank_avg": 0.02585354819893837, "step": 100, "valid_targets_mean": 6586.1, "valid_targets_min": 1657 }, { "epoch": 2.8378378378378377, "grad_norm": 0.15909100754186584, "learning_rate": 2.9922053797359406e-05, "loss": 0.0901, "loss_nan_ranks": 0, "loss_rank_avg": 0.027233216911554337, "step": 105, "valid_targets_mean": 5279.3, "valid_targets_min": 393 }, { "epoch": 2.972972972972973, "grad_norm": 0.18306212162162813, "learning_rate": 2.8729701221636294e-05, "loss": 0.0865, "loss_nan_ranks": 0, "loss_rank_avg": 0.02899256721138954, "step": 110, "valid_targets_mean": 5776.7, "valid_targets_min": 1181 }, { "epoch": 3.108108108108108, "grad_norm": 0.15654440385954815, "learning_rate": 2.74976877245558e-05, "loss": 0.0843, "loss_nan_ranks": 0, "loss_rank_avg": 0.026833169162273407, "step": 115, "valid_targets_mean": 5595.8, "valid_targets_min": 395 }, { "epoch": 3.2432432432432434, "grad_norm": 0.13190386927690592, "learning_rate": 2.6231610609986442e-05, "loss": 0.0776, "loss_nan_ranks": 0, "loss_rank_avg": 0.024204321205615997, "step": 120, "valid_targets_mean": 8543.6, "valid_targets_min": 1525 }, { "epoch": 3.3783783783783785, "grad_norm": 0.15764713783801354, "learning_rate": 2.493722194018082e-05, "loss": 0.0813, "loss_nan_ranks": 0, "loss_rank_avg": 0.024981103837490082, "step": 125, "valid_targets_mean": 5946.0, "valid_targets_min": 1137 }, { "epoch": 3.5135135135135136, "grad_norm": 0.1529541859974241, "learning_rate": 2.362040240291227e-05, "loss": 0.0815, "loss_nan_ranks": 0, "loss_rank_avg": 0.026058465242385864, "step": 130, "valid_targets_mean": 6072.2, "valid_targets_min": 802 }, { "epoch": 3.6486486486486487, "grad_norm": 0.33060538437630244, "learning_rate": 2.228713459423804e-05, "loss": 0.0778, "loss_nan_ranks": 0, "loss_rank_avg": 0.02479572780430317, "step": 135, "valid_targets_mean": 5208.4, "valid_targets_min": 497 }, { "epoch": 3.7837837837837838, "grad_norm": 0.15640399728656484, "learning_rate": 2.094347583827102e-05, "loss": 0.0817, "loss_nan_ranks": 0, "loss_rank_avg": 0.026677442714571953, "step": 140, "valid_targets_mean": 5517.4, "valid_targets_min": 393 }, { "epoch": 3.918918918918919, "grad_norm": 0.1517001071529229, "learning_rate": 1.9595530667445775e-05, "loss": 0.076, "loss_nan_ranks": 0, "loss_rank_avg": 0.026775188744068146, "step": 145, "valid_targets_mean": 6216.9, "valid_targets_min": 820 }, { "epoch": 4.054054054054054, "grad_norm": 0.1520783489774219, "learning_rate": 1.824942308830696e-05, "loss": 0.0787, "loss_nan_ranks": 0, "loss_rank_avg": 0.02473868615925312, "step": 150, "valid_targets_mean": 5801.8, "valid_targets_min": 692 }, { "epoch": 4.1891891891891895, "grad_norm": 0.14931092491442138, "learning_rate": 1.691126875882263e-05, "loss": 0.0752, "loss_nan_ranks": 0, "loss_rank_avg": 0.021454155445098877, "step": 155, "valid_targets_mean": 6150.3, "valid_targets_min": 577 }, { "epoch": 4.324324324324325, "grad_norm": 0.15680952631799258, "learning_rate": 1.5587147203626934e-05, "loss": 0.0712, "loss_nan_ranks": 0, "loss_rank_avg": 0.022702787071466446, "step": 160, "valid_targets_mean": 6739.9, "valid_targets_min": 1051 }, { "epoch": 4.45945945945946, "grad_norm": 0.15760466083796523, "learning_rate": 1.4283074193424379e-05, "loss": 0.0713, "loss_nan_ranks": 0, "loss_rank_avg": 0.022617951035499573, "step": 165, "valid_targets_mean": 6288.2, "valid_targets_min": 579 }, { "epoch": 4.594594594594595, "grad_norm": 0.16238993129940618, "learning_rate": 1.3004974414041987e-05, "loss": 0.0738, "loss_nan_ranks": 0, "loss_rank_avg": 0.02532375603914261, "step": 170, "valid_targets_mean": 5912.8, "valid_targets_min": 1757 }, { "epoch": 4.72972972972973, "grad_norm": 0.1636253567113291, "learning_rate": 1.1758654549299735e-05, "loss": 0.0711, "loss_nan_ranks": 0, "loss_rank_avg": 0.022857841104269028, "step": 175, "valid_targets_mean": 5464.2, "valid_targets_min": 577 }, { "epoch": 4.864864864864865, "grad_norm": 0.18989576780948067, "learning_rate": 1.0549776899989686e-05, "loss": 0.0727, "loss_nan_ranks": 0, "loss_rank_avg": 0.021705035120248795, "step": 180, "valid_targets_mean": 7371.7, "valid_targets_min": 355 }, { "epoch": 5.0, "grad_norm": 0.17912554773879225, "learning_rate": 9.3838336588184e-06, "loss": 0.074, "loss_nan_ranks": 0, "loss_rank_avg": 0.029517250135540962, "step": 185, "valid_targets_mean": 5847.1, "valid_targets_min": 342 }, { "epoch": 5.135135135135135, "grad_norm": 0.17463959150551875, "learning_rate": 8.266121958187246e-06, "loss": 0.0709, "loss_nan_ranks": 0, "loss_rank_avg": 0.024629643186926842, "step": 190, "valid_targets_mean": 7010.6, "valid_targets_min": 1067 }, { "epoch": 5.27027027027027, "grad_norm": 0.16012384004377175, "learning_rate": 7.201719804173797e-06, "loss": 0.0683, "loss_nan_ranks": 0, "loss_rank_avg": 0.02399550750851631, "step": 195, "valid_targets_mean": 6869.6, "valid_targets_min": 405 }, { "epoch": 5.405405405405405, "grad_norm": 0.27301642675588644, "learning_rate": 6.1954630060516005e-06, "loss": 0.0676, "loss_nan_ranks": 0, "loss_rank_avg": 0.023653771728277206, "step": 200, "valid_targets_mean": 4831.9, "valid_targets_min": 1161 }, { "epoch": 5.54054054054054, "grad_norm": 0.1781041058075561, "learning_rate": 5.2519232061624255e-06, "loss": 0.0698, "loss_nan_ranks": 0, "loss_rank_avg": 0.02264336310327053, "step": 205, "valid_targets_mean": 6068.0, "valid_targets_min": 467 }, { "epoch": 5.675675675675675, "grad_norm": 0.1623201456410524, "learning_rate": 4.375387109955953e-06, "loss": 0.0693, "loss_nan_ranks": 0, "loss_rank_avg": 0.022714542225003242, "step": 210, "valid_targets_mean": 6206.7, "valid_targets_min": 1846 }, { "epoch": 5.8108108108108105, "grad_norm": 0.18485993941649267, "learning_rate": 3.569837010559505e-06, "loss": 0.0661, "loss_nan_ranks": 0, "loss_rank_avg": 0.023270253092050552, "step": 215, "valid_targets_mean": 7176.0, "valid_targets_min": 372 }, { "epoch": 5.945945945945946, "grad_norm": 0.18961469137543394, "learning_rate": 2.838932696358798e-06, "loss": 0.0695, "loss_nan_ranks": 0, "loss_rank_avg": 0.024968810379505157, "step": 220, "valid_targets_mean": 6053.5, "valid_targets_min": 427 }, { "epoch": 6.081081081081081, "grad_norm": 0.15015876207979317, "learning_rate": 2.1859948237874517e-06, "loss": 0.0681, "loss_nan_ranks": 0, "loss_rank_avg": 0.01862494647502899, "step": 225, "valid_targets_mean": 5860.2, "valid_targets_min": 387 }, { "epoch": 6.216216216216216, "grad_norm": 0.15711014903589288, "learning_rate": 1.6139898308664093e-06, "loss": 0.0663, "loss_nan_ranks": 0, "loss_rank_avg": 0.02179352566599846, "step": 230, "valid_targets_mean": 6635.5, "valid_targets_min": 2015 }, { "epoch": 6.351351351351352, "grad_norm": 0.19367059367410314, "learning_rate": 1.1255164600341816e-06, "loss": 0.0674, "loss_nan_ranks": 0, "loss_rank_avg": 0.022510964423418045, "step": 235, "valid_targets_mean": 5707.8, "valid_targets_min": 445 }, { "epoch": 6.486486486486487, "grad_norm": 0.17844773460429106, "learning_rate": 7.227939514977422e-07, "loss": 0.0666, "loss_nan_ranks": 0, "loss_rank_avg": 0.023919889703392982, "step": 240, "valid_targets_mean": 5842.8, "valid_targets_min": 453 }, { "epoch": 6.621621621621622, "grad_norm": 0.16782709344204977, "learning_rate": 4.0765196074406433e-07, "loss": 0.0697, "loss_nan_ranks": 0, "loss_rank_avg": 0.01854727789759636, "step": 245, "valid_targets_mean": 5001.1, "valid_targets_min": 1381 }, { "epoch": 6.756756756756757, "grad_norm": 0.16683190263277645, "learning_rate": 1.8152224601943435e-07, "loss": 0.0666, "loss_nan_ranks": 0, "loss_rank_avg": 0.02349766716361046, "step": 250, "valid_targets_mean": 7224.0, "valid_targets_min": 405 }, { "epoch": 6.891891891891892, "grad_norm": 0.165105262824618, "learning_rate": 4.5432163541960785e-08, "loss": 0.0668, "loss_nan_ranks": 0, "loss_rank_avg": 0.026979651302099228, "step": 255, "valid_targets_mean": 7160.0, "valid_targets_min": 434 }, { "epoch": 7.0, "step": 259, "total_flos": 1.36492518344124e+18, "train_loss": 0.0, "train_runtime": 1.313, "train_samples_per_second": 18873.051, "train_steps_per_second": 197.261 } ], "logging_steps": 5, "max_steps": 259, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.36492518344124e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }