{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 500, "global_step": 630, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05555555555555555, "grad_norm": 19.037475031095614, "learning_rate": 2.53968253968254e-06, "loss": 0.9975, "loss_nan_ranks": 0, "loss_rank_avg": 0.3319109082221985, "step": 5, "valid_targets_mean": 3084.8, "valid_targets_min": 330 }, { "epoch": 0.1111111111111111, "grad_norm": 10.063492319722197, "learning_rate": 5.7142857142857145e-06, "loss": 0.9275, "loss_nan_ranks": 0, "loss_rank_avg": 0.3317878544330597, "step": 10, "valid_targets_mean": 4345.8, "valid_targets_min": 328 }, { "epoch": 0.16666666666666666, "grad_norm": 2.3475297969942397, "learning_rate": 8.888888888888888e-06, "loss": 0.7902, "loss_nan_ranks": 0, "loss_rank_avg": 0.24648834764957428, "step": 15, "valid_targets_mean": 4131.6, "valid_targets_min": 1216 }, { "epoch": 0.2222222222222222, "grad_norm": 1.476580677728524, "learning_rate": 1.2063492063492064e-05, "loss": 0.7268, "loss_nan_ranks": 0, "loss_rank_avg": 0.21647010743618011, "step": 20, "valid_targets_mean": 3768.8, "valid_targets_min": 404 }, { "epoch": 0.2777777777777778, "grad_norm": 0.9495836410332217, "learning_rate": 1.523809523809524e-05, "loss": 0.6872, "loss_nan_ranks": 0, "loss_rank_avg": 0.22024014592170715, "step": 25, "valid_targets_mean": 3784.8, "valid_targets_min": 280 }, { "epoch": 0.3333333333333333, "grad_norm": 0.6124846458887409, "learning_rate": 1.8412698412698415e-05, "loss": 0.6531, "loss_nan_ranks": 0, "loss_rank_avg": 0.20419517159461975, "step": 30, "valid_targets_mean": 3684.2, "valid_targets_min": 401 }, { "epoch": 0.3888888888888889, "grad_norm": 0.5901105201493042, "learning_rate": 2.158730158730159e-05, "loss": 0.6291, "loss_nan_ranks": 0, "loss_rank_avg": 0.20678561925888062, "step": 35, "valid_targets_mean": 3448.4, "valid_targets_min": 309 }, { "epoch": 0.4444444444444444, "grad_norm": 0.4804068677241246, "learning_rate": 2.4761904761904766e-05, "loss": 0.6125, "loss_nan_ranks": 0, "loss_rank_avg": 0.1943480521440506, "step": 40, "valid_targets_mean": 3677.9, "valid_targets_min": 375 }, { "epoch": 0.5, "grad_norm": 0.4044840533678469, "learning_rate": 2.7936507936507936e-05, "loss": 0.5954, "loss_nan_ranks": 0, "loss_rank_avg": 0.2216871678829193, "step": 45, "valid_targets_mean": 3833.2, "valid_targets_min": 420 }, { "epoch": 0.5555555555555556, "grad_norm": 0.37401717077276986, "learning_rate": 3.111111111111112e-05, "loss": 0.5695, "loss_nan_ranks": 0, "loss_rank_avg": 0.21114522218704224, "step": 50, "valid_targets_mean": 4082.9, "valid_targets_min": 394 }, { "epoch": 0.6111111111111112, "grad_norm": 0.38366050666143015, "learning_rate": 3.4285714285714284e-05, "loss": 0.546, "loss_nan_ranks": 0, "loss_rank_avg": 0.18125328421592712, "step": 55, "valid_targets_mean": 4694.8, "valid_targets_min": 1228 }, { "epoch": 0.6666666666666666, "grad_norm": 0.3705526274838359, "learning_rate": 3.7460317460317464e-05, "loss": 0.5496, "loss_nan_ranks": 0, "loss_rank_avg": 0.1779080033302307, "step": 60, "valid_targets_mean": 3558.2, "valid_targets_min": 405 }, { "epoch": 0.7222222222222222, "grad_norm": 0.3543083921384607, "learning_rate": 3.9999693004141615e-05, "loss": 0.5259, "loss_nan_ranks": 0, "loss_rank_avg": 0.17807340621948242, "step": 65, "valid_targets_mean": 4056.4, "valid_targets_min": 363 }, { "epoch": 0.7777777777777778, "grad_norm": 0.3961065249343112, "learning_rate": 3.998894913865352e-05, "loss": 0.5241, "loss_nan_ranks": 0, "loss_rank_avg": 0.18150421977043152, "step": 70, "valid_targets_mean": 3501.2, "valid_targets_min": 477 }, { "epoch": 0.8333333333333334, "grad_norm": 0.34959954993974884, "learning_rate": 3.9962864903591375e-05, "loss": 0.5204, "loss_nan_ranks": 0, "loss_rank_avg": 0.14983993768692017, "step": 75, "valid_targets_mean": 2969.7, "valid_targets_min": 494 }, { "epoch": 0.8888888888888888, "grad_norm": 0.3074663242298868, "learning_rate": 3.992146031710637e-05, "loss": 0.5089, "loss_nan_ranks": 0, "loss_rank_avg": 0.1618027687072754, "step": 80, "valid_targets_mean": 3615.1, "valid_targets_min": 554 }, { "epoch": 0.9444444444444444, "grad_norm": 0.37613593537441964, "learning_rate": 3.9864767154838864e-05, "loss": 0.4974, "loss_nan_ranks": 0, "loss_rank_avg": 0.15451735258102417, "step": 85, "valid_targets_mean": 3800.1, "valid_targets_min": 390 }, { "epoch": 1.0, "grad_norm": 0.34297142040995393, "learning_rate": 3.9792828925532376e-05, "loss": 0.4945, "loss_nan_ranks": 0, "loss_rank_avg": 0.1645517349243164, "step": 90, "valid_targets_mean": 3238.2, "valid_targets_min": 354 }, { "epoch": 1.0555555555555556, "grad_norm": 0.35120256454353804, "learning_rate": 3.970570083764316e-05, "loss": 0.48, "loss_nan_ranks": 0, "loss_rank_avg": 0.15994709730148315, "step": 95, "valid_targets_mean": 3860.9, "valid_targets_min": 472 }, { "epoch": 1.1111111111111112, "grad_norm": 0.3323790661514736, "learning_rate": 3.9603449756970877e-05, "loss": 0.4807, "loss_nan_ranks": 0, "loss_rank_avg": 0.1775439828634262, "step": 100, "valid_targets_mean": 4234.1, "valid_targets_min": 1927 }, { "epoch": 1.1666666666666667, "grad_norm": 0.31504289803726854, "learning_rate": 3.948615415534294e-05, "loss": 0.479, "loss_nan_ranks": 0, "loss_rank_avg": 0.1668548285961151, "step": 105, "valid_targets_mean": 3562.2, "valid_targets_min": 387 }, { "epoch": 1.2222222222222223, "grad_norm": 0.321625057142259, "learning_rate": 3.9353904050391874e-05, "loss": 0.4686, "loss_nan_ranks": 0, "loss_rank_avg": 0.15156182646751404, "step": 110, "valid_targets_mean": 4068.0, "valid_targets_min": 1105 }, { "epoch": 1.2777777777777777, "grad_norm": 0.3226596300442187, "learning_rate": 3.9206800936472e-05, "loss": 0.4699, "loss_nan_ranks": 0, "loss_rank_avg": 0.14588865637779236, "step": 115, "valid_targets_mean": 3658.9, "valid_targets_min": 268 }, { "epoch": 1.3333333333333333, "grad_norm": 0.36413926771355065, "learning_rate": 3.904495770676831e-05, "loss": 0.4818, "loss_nan_ranks": 0, "loss_rank_avg": 0.15858052670955658, "step": 120, "valid_targets_mean": 4093.0, "valid_targets_min": 1090 }, { "epoch": 1.3888888888888888, "grad_norm": 0.3641062040000603, "learning_rate": 3.886849856665746e-05, "loss": 0.468, "loss_nan_ranks": 0, "loss_rank_avg": 0.15991882979869843, "step": 125, "valid_targets_mean": 3447.6, "valid_targets_min": 280 }, { "epoch": 1.4444444444444444, "grad_norm": 0.34910971297050825, "learning_rate": 3.8677558938387276e-05, "loss": 0.4596, "loss_nan_ranks": 0, "loss_rank_avg": 0.13589301705360413, "step": 130, "valid_targets_mean": 3486.2, "valid_targets_min": 253 }, { "epoch": 1.5, "grad_norm": 0.3421823874745836, "learning_rate": 3.8472285357147966e-05, "loss": 0.465, "loss_nan_ranks": 0, "loss_rank_avg": 0.13018383085727692, "step": 135, "valid_targets_mean": 3037.4, "valid_targets_min": 401 }, { "epoch": 1.5555555555555556, "grad_norm": 0.33111356038555095, "learning_rate": 3.825283535861476e-05, "loss": 0.461, "loss_nan_ranks": 0, "loss_rank_avg": 0.15683633089065552, "step": 140, "valid_targets_mean": 3742.4, "valid_targets_min": 262 }, { "epoch": 1.6111111111111112, "grad_norm": 0.32069041745575666, "learning_rate": 3.801937735804838e-05, "loss": 0.4573, "loss_nan_ranks": 0, "loss_rank_avg": 0.15576741099357605, "step": 145, "valid_targets_mean": 4067.6, "valid_targets_min": 1037 }, { "epoch": 1.6666666666666665, "grad_norm": 0.33546640012641216, "learning_rate": 3.777209052104598e-05, "loss": 0.4602, "loss_nan_ranks": 0, "loss_rank_avg": 0.1551869809627533, "step": 150, "valid_targets_mean": 3980.7, "valid_targets_min": 453 }, { "epoch": 1.7222222222222223, "grad_norm": 0.3363492039248712, "learning_rate": 3.7511164626041823e-05, "loss": 0.455, "loss_nan_ranks": 0, "loss_rank_avg": 0.1573401391506195, "step": 155, "valid_targets_mean": 3769.0, "valid_targets_min": 524 }, { "epoch": 1.7777777777777777, "grad_norm": 0.35299852743300064, "learning_rate": 3.7236799918663284e-05, "loss": 0.4544, "loss_nan_ranks": 0, "loss_rank_avg": 0.17174802720546722, "step": 160, "valid_targets_mean": 4081.9, "valid_targets_min": 494 }, { "epoch": 1.8333333333333335, "grad_norm": 0.3253099455609108, "learning_rate": 3.6949206958053825e-05, "loss": 0.4473, "loss_nan_ranks": 0, "loss_rank_avg": 0.16645416617393494, "step": 165, "valid_targets_mean": 3841.2, "valid_targets_min": 382 }, { "epoch": 1.8888888888888888, "grad_norm": 0.33812215316215416, "learning_rate": 3.6648606455280944e-05, "loss": 0.4503, "loss_nan_ranks": 0, "loss_rank_avg": 0.15572044253349304, "step": 170, "valid_targets_mean": 3896.2, "valid_targets_min": 471 }, { "epoch": 1.9444444444444444, "grad_norm": 0.37673915528072716, "learning_rate": 3.633522910395314e-05, "loss": 0.4467, "loss_nan_ranks": 0, "loss_rank_avg": 0.1881042718887329, "step": 175, "valid_targets_mean": 4453.6, "valid_targets_min": 410 }, { "epoch": 2.0, "grad_norm": 0.3407164964866703, "learning_rate": 3.6009315403175786e-05, "loss": 0.4529, "loss_nan_ranks": 0, "loss_rank_avg": 0.13232126832008362, "step": 180, "valid_targets_mean": 3291.1, "valid_targets_min": 404 }, { "epoch": 2.0555555555555554, "grad_norm": 0.3413029488652493, "learning_rate": 3.567111547298194e-05, "loss": 0.4301, "loss_nan_ranks": 0, "loss_rank_avg": 0.1630811244249344, "step": 185, "valid_targets_mean": 4086.1, "valid_targets_min": 1557 }, { "epoch": 2.111111111111111, "grad_norm": 0.3811750081248158, "learning_rate": 3.532088886237956e-05, "loss": 0.4315, "loss_nan_ranks": 0, "loss_rank_avg": 0.12379217147827148, "step": 190, "valid_targets_mean": 2931.6, "valid_targets_min": 313 }, { "epoch": 2.1666666666666665, "grad_norm": 0.358571537291706, "learning_rate": 3.495890435016258e-05, "loss": 0.4422, "loss_nan_ranks": 0, "loss_rank_avg": 0.14856354892253876, "step": 195, "valid_targets_mean": 3480.2, "valid_targets_min": 145 }, { "epoch": 2.2222222222222223, "grad_norm": 0.3737950299125905, "learning_rate": 3.458543973863859e-05, "loss": 0.4329, "loss_nan_ranks": 0, "loss_rank_avg": 0.16106510162353516, "step": 200, "valid_targets_mean": 4051.9, "valid_targets_min": 390 }, { "epoch": 2.2777777777777777, "grad_norm": 0.3432433293662255, "learning_rate": 3.420078164043161e-05, "loss": 0.4284, "loss_nan_ranks": 0, "loss_rank_avg": 0.14783862233161926, "step": 205, "valid_targets_mean": 3560.0, "valid_targets_min": 328 }, { "epoch": 2.3333333333333335, "grad_norm": 0.3430920927042314, "learning_rate": 3.38052252585233e-05, "loss": 0.4226, "loss_nan_ranks": 0, "loss_rank_avg": 0.14196516573429108, "step": 210, "valid_targets_mean": 3396.0, "valid_targets_min": 345 }, { "epoch": 2.388888888888889, "grad_norm": 0.38863253659695013, "learning_rate": 3.339907415970168e-05, "loss": 0.4235, "loss_nan_ranks": 0, "loss_rank_avg": 0.13569079339504242, "step": 215, "valid_targets_mean": 3621.8, "valid_targets_min": 354 }, { "epoch": 2.4444444444444446, "grad_norm": 0.321929890810943, "learning_rate": 3.298264004159104e-05, "loss": 0.4206, "loss_nan_ranks": 0, "loss_rank_avg": 0.14095836877822876, "step": 220, "valid_targets_mean": 3712.3, "valid_targets_min": 461 }, { "epoch": 2.5, "grad_norm": 0.3529153108569068, "learning_rate": 3.255624249344198e-05, "loss": 0.4243, "loss_nan_ranks": 0, "loss_rank_avg": 0.12544581294059753, "step": 225, "valid_targets_mean": 3365.8, "valid_targets_min": 98 }, { "epoch": 2.5555555555555554, "grad_norm": 0.34457099540336644, "learning_rate": 3.212020875086495e-05, "loss": 0.4272, "loss_nan_ranks": 0, "loss_rank_avg": 0.14480704069137573, "step": 230, "valid_targets_mean": 3387.6, "valid_targets_min": 595 }, { "epoch": 2.611111111111111, "grad_norm": 0.3455832583683071, "learning_rate": 3.1674873444695804e-05, "loss": 0.4347, "loss_nan_ranks": 0, "loss_rank_avg": 0.14685365557670593, "step": 235, "valid_targets_mean": 3451.9, "valid_targets_min": 345 }, { "epoch": 2.6666666666666665, "grad_norm": 0.3215500471418138, "learning_rate": 3.122057834418582e-05, "loss": 0.422, "loss_nan_ranks": 0, "loss_rank_avg": 0.1365378350019455, "step": 240, "valid_targets_mean": 3914.4, "valid_targets_min": 505 }, { "epoch": 2.7222222222222223, "grad_norm": 0.34593025741903677, "learning_rate": 3.075767209471345e-05, "loss": 0.4147, "loss_nan_ranks": 0, "loss_rank_avg": 0.12988005578517914, "step": 245, "valid_targets_mean": 3272.9, "valid_targets_min": 475 }, { "epoch": 2.7777777777777777, "grad_norm": 0.34062328817660814, "learning_rate": 3.0286509950219077e-05, "loss": 0.4207, "loss_nan_ranks": 0, "loss_rank_avg": 0.1327398270368576, "step": 250, "valid_targets_mean": 3390.3, "valid_targets_min": 260 }, { "epoch": 2.8333333333333335, "grad_norm": 0.3357111436567365, "learning_rate": 2.9807453500567937e-05, "loss": 0.4165, "loss_nan_ranks": 0, "loss_rank_avg": 0.14951291680335999, "step": 255, "valid_targets_mean": 4245.5, "valid_targets_min": 1302 }, { "epoch": 2.888888888888889, "grad_norm": 0.34569763385162866, "learning_rate": 2.9320870394050783e-05, "loss": 0.426, "loss_nan_ranks": 0, "loss_rank_avg": 0.14805033802986145, "step": 260, "valid_targets_mean": 3933.6, "valid_targets_min": 325 }, { "epoch": 2.9444444444444446, "grad_norm": 0.34420513950314197, "learning_rate": 2.8827134055234883e-05, "loss": 0.4227, "loss_nan_ranks": 0, "loss_rank_avg": 0.14399433135986328, "step": 265, "valid_targets_mean": 3832.5, "valid_targets_min": 1034 }, { "epoch": 3.0, "grad_norm": 0.3983608305380748, "learning_rate": 2.8326623398382174e-05, "loss": 0.4186, "loss_nan_ranks": 0, "loss_rank_avg": 0.14143314957618713, "step": 270, "valid_targets_mean": 4202.6, "valid_targets_min": 1467 }, { "epoch": 3.0555555555555554, "grad_norm": 0.35299028016710193, "learning_rate": 2.781972253665431e-05, "loss": 0.4132, "loss_nan_ranks": 0, "loss_rank_avg": 0.12317074835300446, "step": 275, "valid_targets_mean": 3195.2, "valid_targets_min": 460 }, { "epoch": 3.111111111111111, "grad_norm": 0.3376366411188975, "learning_rate": 2.7306820487327906e-05, "loss": 0.4058, "loss_nan_ranks": 0, "loss_rank_avg": 0.14129731059074402, "step": 280, "valid_targets_mean": 3994.2, "valid_targets_min": 909 }, { "epoch": 3.1666666666666665, "grad_norm": 0.35420688321944427, "learning_rate": 2.6788310873246133e-05, "loss": 0.4037, "loss_nan_ranks": 0, "loss_rank_avg": 0.1481894999742508, "step": 285, "valid_targets_mean": 3624.8, "valid_targets_min": 186 }, { "epoch": 3.2222222222222223, "grad_norm": 0.33884247477237095, "learning_rate": 2.62645916207358e-05, "loss": 0.4022, "loss_nan_ranks": 0, "loss_rank_avg": 0.12818801403045654, "step": 290, "valid_targets_mean": 3634.7, "valid_targets_min": 406 }, { "epoch": 3.2777777777777777, "grad_norm": 0.35808314122646906, "learning_rate": 2.5736064654221808e-05, "loss": 0.4078, "loss_nan_ranks": 0, "loss_rank_avg": 0.14692871272563934, "step": 295, "valid_targets_mean": 3957.3, "valid_targets_min": 814 }, { "epoch": 3.3333333333333335, "grad_norm": 0.3360689997001178, "learning_rate": 2.5203135587773196e-05, "loss": 0.4064, "loss_nan_ranks": 0, "loss_rank_avg": 0.13141655921936035, "step": 300, "valid_targets_mean": 3711.9, "valid_targets_min": 408 }, { "epoch": 3.388888888888889, "grad_norm": 0.34466387998125586, "learning_rate": 2.4666213413817696e-05, "loss": 0.4022, "loss_nan_ranks": 0, "loss_rank_avg": 0.1474984586238861, "step": 305, "valid_targets_mean": 4172.6, "valid_targets_min": 334 }, { "epoch": 3.4444444444444446, "grad_norm": 0.3292600704344278, "learning_rate": 2.4125710189263555e-05, "loss": 0.4, "loss_nan_ranks": 0, "loss_rank_avg": 0.13848447799682617, "step": 310, "valid_targets_mean": 4131.7, "valid_targets_min": 367 }, { "epoch": 3.5, "grad_norm": 0.3522048413936891, "learning_rate": 2.3582040719269504e-05, "loss": 0.3979, "loss_nan_ranks": 0, "loss_rank_avg": 0.1349484920501709, "step": 315, "valid_targets_mean": 3562.1, "valid_targets_min": 337 }, { "epoch": 3.5555555555555554, "grad_norm": 0.353827674173661, "learning_rate": 2.3035622238905694e-05, "loss": 0.404, "loss_nan_ranks": 0, "loss_rank_avg": 0.13351227343082428, "step": 320, "valid_targets_mean": 3304.5, "valid_targets_min": 462 }, { "epoch": 3.611111111111111, "grad_norm": 0.36329667172881713, "learning_rate": 2.2486874092949708e-05, "loss": 0.4026, "loss_nan_ranks": 0, "loss_rank_avg": 0.12374818325042725, "step": 325, "valid_targets_mean": 3150.0, "valid_targets_min": 431 }, { "epoch": 3.6666666666666665, "grad_norm": 0.34975906106997023, "learning_rate": 2.1936217414063584e-05, "loss": 0.4045, "loss_nan_ranks": 0, "loss_rank_avg": 0.11291907727718353, "step": 330, "valid_targets_mean": 3368.1, "valid_targets_min": 273 }, { "epoch": 3.7222222222222223, "grad_norm": 0.3262337332472019, "learning_rate": 2.138407479959869e-05, "loss": 0.4008, "loss_nan_ranks": 0, "loss_rank_avg": 0.11993084847927094, "step": 335, "valid_targets_mean": 3319.6, "valid_targets_min": 345 }, { "epoch": 3.7777777777777777, "grad_norm": 0.3905840521353238, "learning_rate": 2.0830869987276537e-05, "loss": 0.3996, "loss_nan_ranks": 0, "loss_rank_avg": 0.12794974446296692, "step": 340, "valid_targets_mean": 3615.5, "valid_targets_min": 513 }, { "epoch": 3.8333333333333335, "grad_norm": 0.3767804784651145, "learning_rate": 2.027702752999444e-05, "loss": 0.4018, "loss_nan_ranks": 0, "loss_rank_avg": 0.13156664371490479, "step": 345, "valid_targets_mean": 3314.1, "valid_targets_min": 196 }, { "epoch": 3.888888888888889, "grad_norm": 0.4004500617710589, "learning_rate": 1.9722972470005573e-05, "loss": 0.4019, "loss_nan_ranks": 0, "loss_rank_avg": 0.16676589846611023, "step": 350, "valid_targets_mean": 4194.2, "valid_targets_min": 343 }, { "epoch": 3.9444444444444446, "grad_norm": 0.3354819345422965, "learning_rate": 1.916913001272347e-05, "loss": 0.4015, "loss_nan_ranks": 0, "loss_rank_avg": 0.12294766306877136, "step": 355, "valid_targets_mean": 4012.5, "valid_targets_min": 206 }, { "epoch": 4.0, "grad_norm": 0.3505665200641073, "learning_rate": 1.8615925200401318e-05, "loss": 0.4046, "loss_nan_ranks": 0, "loss_rank_avg": 0.12504121661186218, "step": 360, "valid_targets_mean": 3056.0, "valid_targets_min": 361 }, { "epoch": 4.055555555555555, "grad_norm": 0.3400952724809926, "learning_rate": 1.806378258593642e-05, "loss": 0.4018, "loss_nan_ranks": 0, "loss_rank_avg": 0.11445639282464981, "step": 365, "valid_targets_mean": 3330.9, "valid_targets_min": 150 }, { "epoch": 4.111111111111111, "grad_norm": 0.3430283248301176, "learning_rate": 1.7513125907050302e-05, "loss": 0.3874, "loss_nan_ranks": 0, "loss_rank_avg": 0.12557926774024963, "step": 370, "valid_targets_mean": 4006.9, "valid_targets_min": 378 }, { "epoch": 4.166666666666667, "grad_norm": 0.330963547175682, "learning_rate": 1.6964377761094313e-05, "loss": 0.3855, "loss_nan_ranks": 0, "loss_rank_avg": 0.1276659518480301, "step": 375, "valid_targets_mean": 4130.3, "valid_targets_min": 289 }, { "epoch": 4.222222222222222, "grad_norm": 0.33066245321257237, "learning_rate": 1.6417959280730506e-05, "loss": 0.3956, "loss_nan_ranks": 0, "loss_rank_avg": 0.1549251228570938, "step": 380, "valid_targets_mean": 4253.3, "valid_targets_min": 690 }, { "epoch": 4.277777777777778, "grad_norm": 0.33770896122425714, "learning_rate": 1.5874289810736452e-05, "loss": 0.3854, "loss_nan_ranks": 0, "loss_rank_avg": 0.12555822730064392, "step": 385, "valid_targets_mean": 3684.2, "valid_targets_min": 378 }, { "epoch": 4.333333333333333, "grad_norm": 0.3308154242663407, "learning_rate": 1.5333786586182308e-05, "loss": 0.3886, "loss_nan_ranks": 0, "loss_rank_avg": 0.1105135902762413, "step": 390, "valid_targets_mean": 3509.6, "valid_targets_min": 206 }, { "epoch": 4.388888888888889, "grad_norm": 0.34154761675847767, "learning_rate": 1.4796864412226812e-05, "loss": 0.3912, "loss_nan_ranks": 0, "loss_rank_avg": 0.12594228982925415, "step": 395, "valid_targets_mean": 3624.0, "valid_targets_min": 173 }, { "epoch": 4.444444444444445, "grad_norm": 0.34703014372739505, "learning_rate": 1.4263935345778202e-05, "loss": 0.3885, "loss_nan_ranks": 0, "loss_rank_avg": 0.11487920582294464, "step": 400, "valid_targets_mean": 3166.8, "valid_targets_min": 413 }, { "epoch": 4.5, "grad_norm": 0.33486183447383105, "learning_rate": 1.37354083792642e-05, "loss": 0.3939, "loss_nan_ranks": 0, "loss_rank_avg": 0.1340012103319168, "step": 405, "valid_targets_mean": 3674.7, "valid_targets_min": 1323 }, { "epoch": 4.555555555555555, "grad_norm": 0.3280361715448883, "learning_rate": 1.3211689126753879e-05, "loss": 0.3893, "loss_nan_ranks": 0, "loss_rank_avg": 0.13589763641357422, "step": 410, "valid_targets_mean": 4136.9, "valid_targets_min": 402 }, { "epoch": 4.611111111111111, "grad_norm": 0.3304314990276744, "learning_rate": 1.26931795126721e-05, "loss": 0.3932, "loss_nan_ranks": 0, "loss_rank_avg": 0.13184812664985657, "step": 415, "valid_targets_mean": 4020.1, "valid_targets_min": 596 }, { "epoch": 4.666666666666667, "grad_norm": 0.36705421176651976, "learning_rate": 1.2180277463345697e-05, "loss": 0.3825, "loss_nan_ranks": 0, "loss_rank_avg": 0.12759140133857727, "step": 420, "valid_targets_mean": 3406.6, "valid_targets_min": 505 }, { "epoch": 4.722222222222222, "grad_norm": 0.37079333963641453, "learning_rate": 1.167337660161783e-05, "loss": 0.3879, "loss_nan_ranks": 0, "loss_rank_avg": 0.1401033103466034, "step": 425, "valid_targets_mean": 3449.6, "valid_targets_min": 987 }, { "epoch": 4.777777777777778, "grad_norm": 0.37212452994075035, "learning_rate": 1.1172865944765122e-05, "loss": 0.3905, "loss_nan_ranks": 0, "loss_rank_avg": 0.12527167797088623, "step": 430, "valid_targets_mean": 3579.2, "valid_targets_min": 354 }, { "epoch": 4.833333333333333, "grad_norm": 0.31452234844700844, "learning_rate": 1.067912960594923e-05, "loss": 0.3922, "loss_nan_ranks": 0, "loss_rank_avg": 0.13775086402893066, "step": 435, "valid_targets_mean": 4263.2, "valid_targets_min": 600 }, { "epoch": 4.888888888888889, "grad_norm": 0.32313593185220624, "learning_rate": 1.0192546499432066e-05, "loss": 0.3845, "loss_nan_ranks": 0, "loss_rank_avg": 0.1394214630126953, "step": 440, "valid_targets_mean": 4157.8, "valid_targets_min": 374 }, { "epoch": 4.944444444444445, "grad_norm": 0.31455375463568513, "learning_rate": 9.713490049780931e-06, "loss": 0.3871, "loss_nan_ranks": 0, "loss_rank_avg": 0.10356146097183228, "step": 445, "valid_targets_mean": 3152.8, "valid_targets_min": 367 }, { "epoch": 5.0, "grad_norm": 0.3424632953230929, "learning_rate": 9.242327905286552e-06, "loss": 0.3789, "loss_nan_ranks": 0, "loss_rank_avg": 0.11645328998565674, "step": 450, "valid_targets_mean": 3493.2, "valid_targets_min": 378 }, { "epoch": 5.055555555555555, "grad_norm": 0.3147491459732294, "learning_rate": 8.779421655814189e-06, "loss": 0.3779, "loss_nan_ranks": 0, "loss_rank_avg": 0.1383446902036667, "step": 455, "valid_targets_mean": 4309.0, "valid_targets_min": 413 }, { "epoch": 5.111111111111111, "grad_norm": 0.3270625971249571, "learning_rate": 8.325126555304208e-06, "loss": 0.3854, "loss_nan_ranks": 0, "loss_rank_avg": 0.11969293653964996, "step": 460, "valid_targets_mean": 3615.5, "valid_targets_min": 324 }, { "epoch": 5.166666666666667, "grad_norm": 0.505261887782307, "learning_rate": 7.879791249135059e-06, "loss": 0.3782, "loss_nan_ranks": 0, "loss_rank_avg": 0.13538235425949097, "step": 465, "valid_targets_mean": 4006.3, "valid_targets_min": 732 }, { "epoch": 5.222222222222222, "grad_norm": 0.3274039672119106, "learning_rate": 7.443757506558033e-06, "loss": 0.3803, "loss_nan_ranks": 0, "loss_rank_avg": 0.12611891329288483, "step": 470, "valid_targets_mean": 3720.2, "valid_targets_min": 410 }, { "epoch": 5.277777777777778, "grad_norm": 0.325522620038438, "learning_rate": 7.0173599584089625e-06, "loss": 0.374, "loss_nan_ranks": 0, "loss_rank_avg": 0.12371672689914703, "step": 475, "valid_targets_mean": 3496.4, "valid_targets_min": 249 }, { "epoch": 5.333333333333333, "grad_norm": 0.3956796011342642, "learning_rate": 6.600925840298331e-06, "loss": 0.3786, "loss_nan_ranks": 0, "loss_rank_avg": 0.13171960413455963, "step": 480, "valid_targets_mean": 3330.3, "valid_targets_min": 505 }, { "epoch": 5.388888888888889, "grad_norm": 0.33440218053570064, "learning_rate": 6.1947747414767035e-06, "loss": 0.3803, "loss_nan_ranks": 0, "loss_rank_avg": 0.11577549576759338, "step": 485, "valid_targets_mean": 3481.1, "valid_targets_min": 343 }, { "epoch": 5.444444444444445, "grad_norm": 0.312449052176888, "learning_rate": 5.799218359568395e-06, "loss": 0.3874, "loss_nan_ranks": 0, "loss_rank_avg": 0.12085084617137909, "step": 490, "valid_targets_mean": 3568.8, "valid_targets_min": 289 }, { "epoch": 5.5, "grad_norm": 0.32763223291113786, "learning_rate": 5.414560261361415e-06, "loss": 0.3827, "loss_nan_ranks": 0, "loss_rank_avg": 0.12720659375190735, "step": 495, "valid_targets_mean": 3656.4, "valid_targets_min": 196 }, { "epoch": 5.555555555555555, "grad_norm": 0.3478293772830613, "learning_rate": 5.041095649837429e-06, "loss": 0.382, "loss_nan_ranks": 0, "loss_rank_avg": 0.14937317371368408, "step": 500, "valid_targets_mean": 4328.1, "valid_targets_min": 541 }, { "epoch": 5.611111111111111, "grad_norm": 0.3333322556427379, "learning_rate": 4.679111137620442e-06, "loss": 0.3855, "loss_nan_ranks": 0, "loss_rank_avg": 0.11903315037488937, "step": 505, "valid_targets_mean": 3451.2, "valid_targets_min": 387 }, { "epoch": 5.666666666666667, "grad_norm": 0.334978529000904, "learning_rate": 4.328884527018067e-06, "loss": 0.3768, "loss_nan_ranks": 0, "loss_rank_avg": 0.1238088607788086, "step": 510, "valid_targets_mean": 3320.3, "valid_targets_min": 381 }, { "epoch": 5.722222222222222, "grad_norm": 0.3391629049530058, "learning_rate": 3.990684596824219e-06, "loss": 0.384, "loss_nan_ranks": 0, "loss_rank_avg": 0.11955156177282333, "step": 515, "valid_targets_mean": 3707.0, "valid_targets_min": 357 }, { "epoch": 5.777777777777778, "grad_norm": 0.3469430832903268, "learning_rate": 3.6647708960468696e-06, "loss": 0.3836, "loss_nan_ranks": 0, "loss_rank_avg": 0.14425542950630188, "step": 520, "valid_targets_mean": 4292.4, "valid_targets_min": 1258 }, { "epoch": 5.833333333333333, "grad_norm": 0.31586389870343146, "learning_rate": 3.3513935447190595e-06, "loss": 0.3762, "loss_nan_ranks": 0, "loss_rank_avg": 0.1405523270368576, "step": 525, "valid_targets_mean": 4005.5, "valid_targets_min": 356 }, { "epoch": 5.888888888888889, "grad_norm": 0.3413641532518475, "learning_rate": 3.050793041946183e-06, "loss": 0.3797, "loss_nan_ranks": 0, "loss_rank_avg": 0.1431114375591278, "step": 530, "valid_targets_mean": 3861.8, "valid_targets_min": 298 }, { "epoch": 5.944444444444445, "grad_norm": 0.3640119009955889, "learning_rate": 2.763200081336721e-06, "loss": 0.3765, "loss_nan_ranks": 0, "loss_rank_avg": 0.14532673358917236, "step": 535, "valid_targets_mean": 3385.6, "valid_targets_min": 304 }, { "epoch": 6.0, "grad_norm": 0.3215369886099674, "learning_rate": 2.488835373958185e-06, "loss": 0.387, "loss_nan_ranks": 0, "loss_rank_avg": 0.09596771001815796, "step": 540, "valid_targets_mean": 2951.0, "valid_targets_min": 366 }, { "epoch": 6.055555555555555, "grad_norm": 0.32957858738873264, "learning_rate": 2.2279094789540244e-06, "loss": 0.3797, "loss_nan_ranks": 0, "loss_rank_avg": 0.12534984946250916, "step": 545, "valid_targets_mean": 3643.2, "valid_targets_min": 334 }, { "epoch": 6.111111111111111, "grad_norm": 0.30481921640658827, "learning_rate": 1.9806226419516195e-06, "loss": 0.3752, "loss_nan_ranks": 0, "loss_rank_avg": 0.1421050727367401, "step": 550, "valid_targets_mean": 4784.9, "valid_targets_min": 2109 }, { "epoch": 6.166666666666667, "grad_norm": 0.3261027404622833, "learning_rate": 1.7471646413852439e-06, "loss": 0.3751, "loss_nan_ranks": 0, "loss_rank_avg": 0.1255086064338684, "step": 555, "valid_targets_mean": 3384.8, "valid_targets_min": 280 }, { "epoch": 6.222222222222222, "grad_norm": 0.3332990066131783, "learning_rate": 1.527714642852045e-06, "loss": 0.3771, "loss_nan_ranks": 0, "loss_rank_avg": 0.10999426990747452, "step": 560, "valid_targets_mean": 3432.9, "valid_targets_min": 448 }, { "epoch": 6.277777777777778, "grad_norm": 0.31562874627264265, "learning_rate": 1.3224410616127292e-06, "loss": 0.3786, "loss_nan_ranks": 0, "loss_rank_avg": 0.12806940078735352, "step": 565, "valid_targets_mean": 4304.7, "valid_targets_min": 1720 }, { "epoch": 6.333333333333333, "grad_norm": 0.3466774183511758, "learning_rate": 1.1315014333425455e-06, "loss": 0.3729, "loss_nan_ranks": 0, "loss_rank_avg": 0.1276746392250061, "step": 570, "valid_targets_mean": 3264.2, "valid_targets_min": 1090 }, { "epoch": 6.388888888888889, "grad_norm": 0.33063763343455427, "learning_rate": 9.550422932316938e-07, "loss": 0.3707, "loss_nan_ranks": 0, "loss_rank_avg": 0.12351889163255692, "step": 575, "valid_targets_mean": 4072.2, "valid_targets_min": 345 }, { "epoch": 6.444444444444445, "grad_norm": 0.3371607315092282, "learning_rate": 7.931990635280052e-07, "loss": 0.381, "loss_nan_ranks": 0, "loss_rank_avg": 0.1360011100769043, "step": 580, "valid_targets_mean": 3644.0, "valid_targets_min": 387 }, { "epoch": 6.5, "grad_norm": 0.3142943262407192, "learning_rate": 6.460959496081276e-07, "loss": 0.3729, "loss_nan_ranks": 0, "loss_rank_avg": 0.12145544588565826, "step": 585, "valid_targets_mean": 3623.0, "valid_targets_min": 436 }, { "epoch": 6.555555555555555, "grad_norm": 0.3120929356571334, "learning_rate": 5.13845844657066e-07, "loss": 0.3766, "loss_nan_ranks": 0, "loss_rank_avg": 0.12722007930278778, "step": 590, "valid_targets_mean": 4118.3, "valid_targets_min": 214 }, { "epoch": 6.611111111111111, "grad_norm": 0.3248736657284745, "learning_rate": 3.965502430291235e-07, "loss": 0.38, "loss_nan_ranks": 0, "loss_rank_avg": 0.12026658654212952, "step": 595, "valid_targets_mean": 3682.3, "valid_targets_min": 153 }, { "epoch": 6.666666666666667, "grad_norm": 0.3371061664081027, "learning_rate": 2.942991623568436e-07, "loss": 0.3816, "loss_nan_ranks": 0, "loss_rank_avg": 0.1275937557220459, "step": 600, "valid_targets_mean": 3659.6, "valid_targets_min": 328 }, { "epoch": 6.722222222222222, "grad_norm": 0.3414371651160847, "learning_rate": 2.0717107446762696e-07, "loss": 0.3803, "loss_nan_ranks": 0, "loss_rank_avg": 0.11111524701118469, "step": 605, "valid_targets_mean": 2836.0, "valid_targets_min": 390 }, { "epoch": 6.777777777777778, "grad_norm": 0.33615108170492364, "learning_rate": 1.3523284516113955e-07, "loss": 0.3797, "loss_nan_ranks": 0, "loss_rank_avg": 0.14175578951835632, "step": 610, "valid_targets_mean": 3894.5, "valid_targets_min": 308 }, { "epoch": 6.833333333333333, "grad_norm": 0.3180306253416986, "learning_rate": 7.853968289363245e-08, "loss": 0.3783, "loss_nan_ranks": 0, "loss_rank_avg": 0.1266726553440094, "step": 615, "valid_targets_mean": 3853.8, "valid_targets_min": 272 }, { "epoch": 6.888888888888889, "grad_norm": 0.34831401135169726, "learning_rate": 3.7135096408631443e-08, "loss": 0.3762, "loss_nan_ranks": 0, "loss_rank_avg": 0.11110063642263412, "step": 620, "valid_targets_mean": 3097.7, "valid_targets_min": 1279 }, { "epoch": 6.944444444444445, "grad_norm": 0.39253455514179003, "learning_rate": 1.1050861346488806e-08, "loss": 0.3821, "loss_nan_ranks": 0, "loss_rank_avg": 0.12320859730243683, "step": 625, "valid_targets_mean": 3804.1, "valid_targets_min": 505 }, { "epoch": 7.0, "grad_norm": 0.345704758959377, "learning_rate": 3.069958583856725e-10, "loss": 0.3799, "loss_nan_ranks": 0, "loss_rank_avg": 0.13302914798259735, "step": 630, "valid_targets_mean": 3211.4, "valid_targets_min": 220 }, { "epoch": 7.0, "loss_nan_ranks": 0, "loss_rank_avg": 0.13302914798259735, "step": 630, "total_flos": 1.4527566279155384e+18, "train_loss": 0.4386636906199985, "train_runtime": 8968.3932, "train_samples_per_second": 6.727, "train_steps_per_second": 0.07, "valid_targets_mean": 3211.4, "valid_targets_min": 220 } ], "logging_steps": 5, "max_steps": 630, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4527566279155384e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }