Files
sft__stackexchange-tezos-sa…/trainer_state.json

1434 lines
39 KiB
JSON
Raw Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.0,
"eval_steps": 500,
"global_step": 630,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05555555555555555,
"grad_norm": 19.037475031095614,
"learning_rate": 2.53968253968254e-06,
"loss": 0.9975,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3319109082221985,
"step": 5,
"valid_targets_mean": 3084.8,
"valid_targets_min": 330
},
{
"epoch": 0.1111111111111111,
"grad_norm": 10.063492319722197,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.9275,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3317878544330597,
"step": 10,
"valid_targets_mean": 4345.8,
"valid_targets_min": 328
},
{
"epoch": 0.16666666666666666,
"grad_norm": 2.3475297969942397,
"learning_rate": 8.888888888888888e-06,
"loss": 0.7902,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24648834764957428,
"step": 15,
"valid_targets_mean": 4131.6,
"valid_targets_min": 1216
},
{
"epoch": 0.2222222222222222,
"grad_norm": 1.476580677728524,
"learning_rate": 1.2063492063492064e-05,
"loss": 0.7268,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21647010743618011,
"step": 20,
"valid_targets_mean": 3768.8,
"valid_targets_min": 404
},
{
"epoch": 0.2777777777777778,
"grad_norm": 0.9495836410332217,
"learning_rate": 1.523809523809524e-05,
"loss": 0.6872,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22024014592170715,
"step": 25,
"valid_targets_mean": 3784.8,
"valid_targets_min": 280
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.6124846458887409,
"learning_rate": 1.8412698412698415e-05,
"loss": 0.6531,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20419517159461975,
"step": 30,
"valid_targets_mean": 3684.2,
"valid_targets_min": 401
},
{
"epoch": 0.3888888888888889,
"grad_norm": 0.5901105201493042,
"learning_rate": 2.158730158730159e-05,
"loss": 0.6291,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20678561925888062,
"step": 35,
"valid_targets_mean": 3448.4,
"valid_targets_min": 309
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.4804068677241246,
"learning_rate": 2.4761904761904766e-05,
"loss": 0.6125,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1943480521440506,
"step": 40,
"valid_targets_mean": 3677.9,
"valid_targets_min": 375
},
{
"epoch": 0.5,
"grad_norm": 0.4044840533678469,
"learning_rate": 2.7936507936507936e-05,
"loss": 0.5954,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2216871678829193,
"step": 45,
"valid_targets_mean": 3833.2,
"valid_targets_min": 420
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.37401717077276986,
"learning_rate": 3.111111111111112e-05,
"loss": 0.5695,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21114522218704224,
"step": 50,
"valid_targets_mean": 4082.9,
"valid_targets_min": 394
},
{
"epoch": 0.6111111111111112,
"grad_norm": 0.38366050666143015,
"learning_rate": 3.4285714285714284e-05,
"loss": 0.546,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18125328421592712,
"step": 55,
"valid_targets_mean": 4694.8,
"valid_targets_min": 1228
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.3705526274838359,
"learning_rate": 3.7460317460317464e-05,
"loss": 0.5496,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1779080033302307,
"step": 60,
"valid_targets_mean": 3558.2,
"valid_targets_min": 405
},
{
"epoch": 0.7222222222222222,
"grad_norm": 0.3543083921384607,
"learning_rate": 3.9999693004141615e-05,
"loss": 0.5259,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17807340621948242,
"step": 65,
"valid_targets_mean": 4056.4,
"valid_targets_min": 363
},
{
"epoch": 0.7777777777777778,
"grad_norm": 0.3961065249343112,
"learning_rate": 3.998894913865352e-05,
"loss": 0.5241,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18150421977043152,
"step": 70,
"valid_targets_mean": 3501.2,
"valid_targets_min": 477
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.34959954993974884,
"learning_rate": 3.9962864903591375e-05,
"loss": 0.5204,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14983993768692017,
"step": 75,
"valid_targets_mean": 2969.7,
"valid_targets_min": 494
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.3074663242298868,
"learning_rate": 3.992146031710637e-05,
"loss": 0.5089,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1618027687072754,
"step": 80,
"valid_targets_mean": 3615.1,
"valid_targets_min": 554
},
{
"epoch": 0.9444444444444444,
"grad_norm": 0.37613593537441964,
"learning_rate": 3.9864767154838864e-05,
"loss": 0.4974,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15451735258102417,
"step": 85,
"valid_targets_mean": 3800.1,
"valid_targets_min": 390
},
{
"epoch": 1.0,
"grad_norm": 0.34297142040995393,
"learning_rate": 3.9792828925532376e-05,
"loss": 0.4945,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1645517349243164,
"step": 90,
"valid_targets_mean": 3238.2,
"valid_targets_min": 354
},
{
"epoch": 1.0555555555555556,
"grad_norm": 0.35120256454353804,
"learning_rate": 3.970570083764316e-05,
"loss": 0.48,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15994709730148315,
"step": 95,
"valid_targets_mean": 3860.9,
"valid_targets_min": 472
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.3323790661514736,
"learning_rate": 3.9603449756970877e-05,
"loss": 0.4807,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1775439828634262,
"step": 100,
"valid_targets_mean": 4234.1,
"valid_targets_min": 1927
},
{
"epoch": 1.1666666666666667,
"grad_norm": 0.31504289803726854,
"learning_rate": 3.948615415534294e-05,
"loss": 0.479,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1668548285961151,
"step": 105,
"valid_targets_mean": 3562.2,
"valid_targets_min": 387
},
{
"epoch": 1.2222222222222223,
"grad_norm": 0.321625057142259,
"learning_rate": 3.9353904050391874e-05,
"loss": 0.4686,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15156182646751404,
"step": 110,
"valid_targets_mean": 4068.0,
"valid_targets_min": 1105
},
{
"epoch": 1.2777777777777777,
"grad_norm": 0.3226596300442187,
"learning_rate": 3.9206800936472e-05,
"loss": 0.4699,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14588865637779236,
"step": 115,
"valid_targets_mean": 3658.9,
"valid_targets_min": 268
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.36413926771355065,
"learning_rate": 3.904495770676831e-05,
"loss": 0.4818,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15858052670955658,
"step": 120,
"valid_targets_mean": 4093.0,
"valid_targets_min": 1090
},
{
"epoch": 1.3888888888888888,
"grad_norm": 0.3641062040000603,
"learning_rate": 3.886849856665746e-05,
"loss": 0.468,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15991882979869843,
"step": 125,
"valid_targets_mean": 3447.6,
"valid_targets_min": 280
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.34910971297050825,
"learning_rate": 3.8677558938387276e-05,
"loss": 0.4596,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13589301705360413,
"step": 130,
"valid_targets_mean": 3486.2,
"valid_targets_min": 253
},
{
"epoch": 1.5,
"grad_norm": 0.3421823874745836,
"learning_rate": 3.8472285357147966e-05,
"loss": 0.465,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13018383085727692,
"step": 135,
"valid_targets_mean": 3037.4,
"valid_targets_min": 401
},
{
"epoch": 1.5555555555555556,
"grad_norm": 0.33111356038555095,
"learning_rate": 3.825283535861476e-05,
"loss": 0.461,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15683633089065552,
"step": 140,
"valid_targets_mean": 3742.4,
"valid_targets_min": 262
},
{
"epoch": 1.6111111111111112,
"grad_norm": 0.32069041745575666,
"learning_rate": 3.801937735804838e-05,
"loss": 0.4573,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15576741099357605,
"step": 145,
"valid_targets_mean": 4067.6,
"valid_targets_min": 1037
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.33546640012641216,
"learning_rate": 3.777209052104598e-05,
"loss": 0.4602,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1551869809627533,
"step": 150,
"valid_targets_mean": 3980.7,
"valid_targets_min": 453
},
{
"epoch": 1.7222222222222223,
"grad_norm": 0.3363492039248712,
"learning_rate": 3.7511164626041823e-05,
"loss": 0.455,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1573401391506195,
"step": 155,
"valid_targets_mean": 3769.0,
"valid_targets_min": 524
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.35299852743300064,
"learning_rate": 3.7236799918663284e-05,
"loss": 0.4544,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17174802720546722,
"step": 160,
"valid_targets_mean": 4081.9,
"valid_targets_min": 494
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.3253099455609108,
"learning_rate": 3.6949206958053825e-05,
"loss": 0.4473,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16645416617393494,
"step": 165,
"valid_targets_mean": 3841.2,
"valid_targets_min": 382
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.33812215316215416,
"learning_rate": 3.6648606455280944e-05,
"loss": 0.4503,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15572044253349304,
"step": 170,
"valid_targets_mean": 3896.2,
"valid_targets_min": 471
},
{
"epoch": 1.9444444444444444,
"grad_norm": 0.37673915528072716,
"learning_rate": 3.633522910395314e-05,
"loss": 0.4467,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1881042718887329,
"step": 175,
"valid_targets_mean": 4453.6,
"valid_targets_min": 410
},
{
"epoch": 2.0,
"grad_norm": 0.3407164964866703,
"learning_rate": 3.6009315403175786e-05,
"loss": 0.4529,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13232126832008362,
"step": 180,
"valid_targets_mean": 3291.1,
"valid_targets_min": 404
},
{
"epoch": 2.0555555555555554,
"grad_norm": 0.3413029488652493,
"learning_rate": 3.567111547298194e-05,
"loss": 0.4301,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1630811244249344,
"step": 185,
"valid_targets_mean": 4086.1,
"valid_targets_min": 1557
},
{
"epoch": 2.111111111111111,
"grad_norm": 0.3811750081248158,
"learning_rate": 3.532088886237956e-05,
"loss": 0.4315,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12379217147827148,
"step": 190,
"valid_targets_mean": 2931.6,
"valid_targets_min": 313
},
{
"epoch": 2.1666666666666665,
"grad_norm": 0.358571537291706,
"learning_rate": 3.495890435016258e-05,
"loss": 0.4422,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14856354892253876,
"step": 195,
"valid_targets_mean": 3480.2,
"valid_targets_min": 145
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.3737950299125905,
"learning_rate": 3.458543973863859e-05,
"loss": 0.4329,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16106510162353516,
"step": 200,
"valid_targets_mean": 4051.9,
"valid_targets_min": 390
},
{
"epoch": 2.2777777777777777,
"grad_norm": 0.3432433293662255,
"learning_rate": 3.420078164043161e-05,
"loss": 0.4284,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14783862233161926,
"step": 205,
"valid_targets_mean": 3560.0,
"valid_targets_min": 328
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.3430920927042314,
"learning_rate": 3.38052252585233e-05,
"loss": 0.4226,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14196516573429108,
"step": 210,
"valid_targets_mean": 3396.0,
"valid_targets_min": 345
},
{
"epoch": 2.388888888888889,
"grad_norm": 0.38863253659695013,
"learning_rate": 3.339907415970168e-05,
"loss": 0.4235,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13569079339504242,
"step": 215,
"valid_targets_mean": 3621.8,
"valid_targets_min": 354
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.321929890810943,
"learning_rate": 3.298264004159104e-05,
"loss": 0.4206,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14095836877822876,
"step": 220,
"valid_targets_mean": 3712.3,
"valid_targets_min": 461
},
{
"epoch": 2.5,
"grad_norm": 0.3529153108569068,
"learning_rate": 3.255624249344198e-05,
"loss": 0.4243,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12544581294059753,
"step": 225,
"valid_targets_mean": 3365.8,
"valid_targets_min": 98
},
{
"epoch": 2.5555555555555554,
"grad_norm": 0.34457099540336644,
"learning_rate": 3.212020875086495e-05,
"loss": 0.4272,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14480704069137573,
"step": 230,
"valid_targets_mean": 3387.6,
"valid_targets_min": 595
},
{
"epoch": 2.611111111111111,
"grad_norm": 0.3455832583683071,
"learning_rate": 3.1674873444695804e-05,
"loss": 0.4347,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14685365557670593,
"step": 235,
"valid_targets_mean": 3451.9,
"valid_targets_min": 345
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.3215500471418138,
"learning_rate": 3.122057834418582e-05,
"loss": 0.422,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1365378350019455,
"step": 240,
"valid_targets_mean": 3914.4,
"valid_targets_min": 505
},
{
"epoch": 2.7222222222222223,
"grad_norm": 0.34593025741903677,
"learning_rate": 3.075767209471345e-05,
"loss": 0.4147,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12988005578517914,
"step": 245,
"valid_targets_mean": 3272.9,
"valid_targets_min": 475
},
{
"epoch": 2.7777777777777777,
"grad_norm": 0.34062328817660814,
"learning_rate": 3.0286509950219077e-05,
"loss": 0.4207,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1327398270368576,
"step": 250,
"valid_targets_mean": 3390.3,
"valid_targets_min": 260
},
{
"epoch": 2.8333333333333335,
"grad_norm": 0.3357111436567365,
"learning_rate": 2.9807453500567937e-05,
"loss": 0.4165,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14951291680335999,
"step": 255,
"valid_targets_mean": 4245.5,
"valid_targets_min": 1302
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.34569763385162866,
"learning_rate": 2.9320870394050783e-05,
"loss": 0.426,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14805033802986145,
"step": 260,
"valid_targets_mean": 3933.6,
"valid_targets_min": 325
},
{
"epoch": 2.9444444444444446,
"grad_norm": 0.34420513950314197,
"learning_rate": 2.8827134055234883e-05,
"loss": 0.4227,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14399433135986328,
"step": 265,
"valid_targets_mean": 3832.5,
"valid_targets_min": 1034
},
{
"epoch": 3.0,
"grad_norm": 0.3983608305380748,
"learning_rate": 2.8326623398382174e-05,
"loss": 0.4186,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14143314957618713,
"step": 270,
"valid_targets_mean": 4202.6,
"valid_targets_min": 1467
},
{
"epoch": 3.0555555555555554,
"grad_norm": 0.35299028016710193,
"learning_rate": 2.781972253665431e-05,
"loss": 0.4132,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12317074835300446,
"step": 275,
"valid_targets_mean": 3195.2,
"valid_targets_min": 460
},
{
"epoch": 3.111111111111111,
"grad_norm": 0.3376366411188975,
"learning_rate": 2.7306820487327906e-05,
"loss": 0.4058,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14129731059074402,
"step": 280,
"valid_targets_mean": 3994.2,
"valid_targets_min": 909
},
{
"epoch": 3.1666666666666665,
"grad_norm": 0.35420688321944427,
"learning_rate": 2.6788310873246133e-05,
"loss": 0.4037,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1481894999742508,
"step": 285,
"valid_targets_mean": 3624.8,
"valid_targets_min": 186
},
{
"epoch": 3.2222222222222223,
"grad_norm": 0.33884247477237095,
"learning_rate": 2.62645916207358e-05,
"loss": 0.4022,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12818801403045654,
"step": 290,
"valid_targets_mean": 3634.7,
"valid_targets_min": 406
},
{
"epoch": 3.2777777777777777,
"grad_norm": 0.35808314122646906,
"learning_rate": 2.5736064654221808e-05,
"loss": 0.4078,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14692871272563934,
"step": 295,
"valid_targets_mean": 3957.3,
"valid_targets_min": 814
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.3360689997001178,
"learning_rate": 2.5203135587773196e-05,
"loss": 0.4064,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13141655921936035,
"step": 300,
"valid_targets_mean": 3711.9,
"valid_targets_min": 408
},
{
"epoch": 3.388888888888889,
"grad_norm": 0.34466387998125586,
"learning_rate": 2.4666213413817696e-05,
"loss": 0.4022,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1474984586238861,
"step": 305,
"valid_targets_mean": 4172.6,
"valid_targets_min": 334
},
{
"epoch": 3.4444444444444446,
"grad_norm": 0.3292600704344278,
"learning_rate": 2.4125710189263555e-05,
"loss": 0.4,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13848447799682617,
"step": 310,
"valid_targets_mean": 4131.7,
"valid_targets_min": 367
},
{
"epoch": 3.5,
"grad_norm": 0.3522048413936891,
"learning_rate": 2.3582040719269504e-05,
"loss": 0.3979,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1349484920501709,
"step": 315,
"valid_targets_mean": 3562.1,
"valid_targets_min": 337
},
{
"epoch": 3.5555555555555554,
"grad_norm": 0.353827674173661,
"learning_rate": 2.3035622238905694e-05,
"loss": 0.404,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13351227343082428,
"step": 320,
"valid_targets_mean": 3304.5,
"valid_targets_min": 462
},
{
"epoch": 3.611111111111111,
"grad_norm": 0.36329667172881713,
"learning_rate": 2.2486874092949708e-05,
"loss": 0.4026,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12374818325042725,
"step": 325,
"valid_targets_mean": 3150.0,
"valid_targets_min": 431
},
{
"epoch": 3.6666666666666665,
"grad_norm": 0.34975906106997023,
"learning_rate": 2.1936217414063584e-05,
"loss": 0.4045,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11291907727718353,
"step": 330,
"valid_targets_mean": 3368.1,
"valid_targets_min": 273
},
{
"epoch": 3.7222222222222223,
"grad_norm": 0.3262337332472019,
"learning_rate": 2.138407479959869e-05,
"loss": 0.4008,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11993084847927094,
"step": 335,
"valid_targets_mean": 3319.6,
"valid_targets_min": 345
},
{
"epoch": 3.7777777777777777,
"grad_norm": 0.3905840521353238,
"learning_rate": 2.0830869987276537e-05,
"loss": 0.3996,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12794974446296692,
"step": 340,
"valid_targets_mean": 3615.5,
"valid_targets_min": 513
},
{
"epoch": 3.8333333333333335,
"grad_norm": 0.3767804784651145,
"learning_rate": 2.027702752999444e-05,
"loss": 0.4018,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13156664371490479,
"step": 345,
"valid_targets_mean": 3314.1,
"valid_targets_min": 196
},
{
"epoch": 3.888888888888889,
"grad_norm": 0.4004500617710589,
"learning_rate": 1.9722972470005573e-05,
"loss": 0.4019,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16676589846611023,
"step": 350,
"valid_targets_mean": 4194.2,
"valid_targets_min": 343
},
{
"epoch": 3.9444444444444446,
"grad_norm": 0.3354819345422965,
"learning_rate": 1.916913001272347e-05,
"loss": 0.4015,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12294766306877136,
"step": 355,
"valid_targets_mean": 4012.5,
"valid_targets_min": 206
},
{
"epoch": 4.0,
"grad_norm": 0.3505665200641073,
"learning_rate": 1.8615925200401318e-05,
"loss": 0.4046,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12504121661186218,
"step": 360,
"valid_targets_mean": 3056.0,
"valid_targets_min": 361
},
{
"epoch": 4.055555555555555,
"grad_norm": 0.3400952724809926,
"learning_rate": 1.806378258593642e-05,
"loss": 0.4018,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11445639282464981,
"step": 365,
"valid_targets_mean": 3330.9,
"valid_targets_min": 150
},
{
"epoch": 4.111111111111111,
"grad_norm": 0.3430283248301176,
"learning_rate": 1.7513125907050302e-05,
"loss": 0.3874,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12557926774024963,
"step": 370,
"valid_targets_mean": 4006.9,
"valid_targets_min": 378
},
{
"epoch": 4.166666666666667,
"grad_norm": 0.330963547175682,
"learning_rate": 1.6964377761094313e-05,
"loss": 0.3855,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1276659518480301,
"step": 375,
"valid_targets_mean": 4130.3,
"valid_targets_min": 289
},
{
"epoch": 4.222222222222222,
"grad_norm": 0.33066245321257237,
"learning_rate": 1.6417959280730506e-05,
"loss": 0.3956,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1549251228570938,
"step": 380,
"valid_targets_mean": 4253.3,
"valid_targets_min": 690
},
{
"epoch": 4.277777777777778,
"grad_norm": 0.33770896122425714,
"learning_rate": 1.5874289810736452e-05,
"loss": 0.3854,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12555822730064392,
"step": 385,
"valid_targets_mean": 3684.2,
"valid_targets_min": 378
},
{
"epoch": 4.333333333333333,
"grad_norm": 0.3308154242663407,
"learning_rate": 1.5333786586182308e-05,
"loss": 0.3886,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1105135902762413,
"step": 390,
"valid_targets_mean": 3509.6,
"valid_targets_min": 206
},
{
"epoch": 4.388888888888889,
"grad_norm": 0.34154761675847767,
"learning_rate": 1.4796864412226812e-05,
"loss": 0.3912,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12594228982925415,
"step": 395,
"valid_targets_mean": 3624.0,
"valid_targets_min": 173
},
{
"epoch": 4.444444444444445,
"grad_norm": 0.34703014372739505,
"learning_rate": 1.4263935345778202e-05,
"loss": 0.3885,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11487920582294464,
"step": 400,
"valid_targets_mean": 3166.8,
"valid_targets_min": 413
},
{
"epoch": 4.5,
"grad_norm": 0.33486183447383105,
"learning_rate": 1.37354083792642e-05,
"loss": 0.3939,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1340012103319168,
"step": 405,
"valid_targets_mean": 3674.7,
"valid_targets_min": 1323
},
{
"epoch": 4.555555555555555,
"grad_norm": 0.3280361715448883,
"learning_rate": 1.3211689126753879e-05,
"loss": 0.3893,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13589763641357422,
"step": 410,
"valid_targets_mean": 4136.9,
"valid_targets_min": 402
},
{
"epoch": 4.611111111111111,
"grad_norm": 0.3304314990276744,
"learning_rate": 1.26931795126721e-05,
"loss": 0.3932,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13184812664985657,
"step": 415,
"valid_targets_mean": 4020.1,
"valid_targets_min": 596
},
{
"epoch": 4.666666666666667,
"grad_norm": 0.36705421176651976,
"learning_rate": 1.2180277463345697e-05,
"loss": 0.3825,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12759140133857727,
"step": 420,
"valid_targets_mean": 3406.6,
"valid_targets_min": 505
},
{
"epoch": 4.722222222222222,
"grad_norm": 0.37079333963641453,
"learning_rate": 1.167337660161783e-05,
"loss": 0.3879,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1401033103466034,
"step": 425,
"valid_targets_mean": 3449.6,
"valid_targets_min": 987
},
{
"epoch": 4.777777777777778,
"grad_norm": 0.37212452994075035,
"learning_rate": 1.1172865944765122e-05,
"loss": 0.3905,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12527167797088623,
"step": 430,
"valid_targets_mean": 3579.2,
"valid_targets_min": 354
},
{
"epoch": 4.833333333333333,
"grad_norm": 0.31452234844700844,
"learning_rate": 1.067912960594923e-05,
"loss": 0.3922,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13775086402893066,
"step": 435,
"valid_targets_mean": 4263.2,
"valid_targets_min": 600
},
{
"epoch": 4.888888888888889,
"grad_norm": 0.32313593185220624,
"learning_rate": 1.0192546499432066e-05,
"loss": 0.3845,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1394214630126953,
"step": 440,
"valid_targets_mean": 4157.8,
"valid_targets_min": 374
},
{
"epoch": 4.944444444444445,
"grad_norm": 0.31455375463568513,
"learning_rate": 9.713490049780931e-06,
"loss": 0.3871,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10356146097183228,
"step": 445,
"valid_targets_mean": 3152.8,
"valid_targets_min": 367
},
{
"epoch": 5.0,
"grad_norm": 0.3424632953230929,
"learning_rate": 9.242327905286552e-06,
"loss": 0.3789,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11645328998565674,
"step": 450,
"valid_targets_mean": 3493.2,
"valid_targets_min": 378
},
{
"epoch": 5.055555555555555,
"grad_norm": 0.3147491459732294,
"learning_rate": 8.779421655814189e-06,
"loss": 0.3779,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1383446902036667,
"step": 455,
"valid_targets_mean": 4309.0,
"valid_targets_min": 413
},
{
"epoch": 5.111111111111111,
"grad_norm": 0.3270625971249571,
"learning_rate": 8.325126555304208e-06,
"loss": 0.3854,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11969293653964996,
"step": 460,
"valid_targets_mean": 3615.5,
"valid_targets_min": 324
},
{
"epoch": 5.166666666666667,
"grad_norm": 0.505261887782307,
"learning_rate": 7.879791249135059e-06,
"loss": 0.3782,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13538235425949097,
"step": 465,
"valid_targets_mean": 4006.3,
"valid_targets_min": 732
},
{
"epoch": 5.222222222222222,
"grad_norm": 0.3274039672119106,
"learning_rate": 7.443757506558033e-06,
"loss": 0.3803,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12611891329288483,
"step": 470,
"valid_targets_mean": 3720.2,
"valid_targets_min": 410
},
{
"epoch": 5.277777777777778,
"grad_norm": 0.325522620038438,
"learning_rate": 7.0173599584089625e-06,
"loss": 0.374,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12371672689914703,
"step": 475,
"valid_targets_mean": 3496.4,
"valid_targets_min": 249
},
{
"epoch": 5.333333333333333,
"grad_norm": 0.3956796011342642,
"learning_rate": 6.600925840298331e-06,
"loss": 0.3786,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13171960413455963,
"step": 480,
"valid_targets_mean": 3330.3,
"valid_targets_min": 505
},
{
"epoch": 5.388888888888889,
"grad_norm": 0.33440218053570064,
"learning_rate": 6.1947747414767035e-06,
"loss": 0.3803,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11577549576759338,
"step": 485,
"valid_targets_mean": 3481.1,
"valid_targets_min": 343
},
{
"epoch": 5.444444444444445,
"grad_norm": 0.312449052176888,
"learning_rate": 5.799218359568395e-06,
"loss": 0.3874,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12085084617137909,
"step": 490,
"valid_targets_mean": 3568.8,
"valid_targets_min": 289
},
{
"epoch": 5.5,
"grad_norm": 0.32763223291113786,
"learning_rate": 5.414560261361415e-06,
"loss": 0.3827,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12720659375190735,
"step": 495,
"valid_targets_mean": 3656.4,
"valid_targets_min": 196
},
{
"epoch": 5.555555555555555,
"grad_norm": 0.3478293772830613,
"learning_rate": 5.041095649837429e-06,
"loss": 0.382,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14937317371368408,
"step": 500,
"valid_targets_mean": 4328.1,
"valid_targets_min": 541
},
{
"epoch": 5.611111111111111,
"grad_norm": 0.3333322556427379,
"learning_rate": 4.679111137620442e-06,
"loss": 0.3855,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11903315037488937,
"step": 505,
"valid_targets_mean": 3451.2,
"valid_targets_min": 387
},
{
"epoch": 5.666666666666667,
"grad_norm": 0.334978529000904,
"learning_rate": 4.328884527018067e-06,
"loss": 0.3768,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1238088607788086,
"step": 510,
"valid_targets_mean": 3320.3,
"valid_targets_min": 381
},
{
"epoch": 5.722222222222222,
"grad_norm": 0.3391629049530058,
"learning_rate": 3.990684596824219e-06,
"loss": 0.384,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11955156177282333,
"step": 515,
"valid_targets_mean": 3707.0,
"valid_targets_min": 357
},
{
"epoch": 5.777777777777778,
"grad_norm": 0.3469430832903268,
"learning_rate": 3.6647708960468696e-06,
"loss": 0.3836,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14425542950630188,
"step": 520,
"valid_targets_mean": 4292.4,
"valid_targets_min": 1258
},
{
"epoch": 5.833333333333333,
"grad_norm": 0.31586389870343146,
"learning_rate": 3.3513935447190595e-06,
"loss": 0.3762,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1405523270368576,
"step": 525,
"valid_targets_mean": 4005.5,
"valid_targets_min": 356
},
{
"epoch": 5.888888888888889,
"grad_norm": 0.3413641532518475,
"learning_rate": 3.050793041946183e-06,
"loss": 0.3797,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1431114375591278,
"step": 530,
"valid_targets_mean": 3861.8,
"valid_targets_min": 298
},
{
"epoch": 5.944444444444445,
"grad_norm": 0.3640119009955889,
"learning_rate": 2.763200081336721e-06,
"loss": 0.3765,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14532673358917236,
"step": 535,
"valid_targets_mean": 3385.6,
"valid_targets_min": 304
},
{
"epoch": 6.0,
"grad_norm": 0.3215369886099674,
"learning_rate": 2.488835373958185e-06,
"loss": 0.387,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09596771001815796,
"step": 540,
"valid_targets_mean": 2951.0,
"valid_targets_min": 366
},
{
"epoch": 6.055555555555555,
"grad_norm": 0.32957858738873264,
"learning_rate": 2.2279094789540244e-06,
"loss": 0.3797,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12534984946250916,
"step": 545,
"valid_targets_mean": 3643.2,
"valid_targets_min": 334
},
{
"epoch": 6.111111111111111,
"grad_norm": 0.30481921640658827,
"learning_rate": 1.9806226419516195e-06,
"loss": 0.3752,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1421050727367401,
"step": 550,
"valid_targets_mean": 4784.9,
"valid_targets_min": 2109
},
{
"epoch": 6.166666666666667,
"grad_norm": 0.3261027404622833,
"learning_rate": 1.7471646413852439e-06,
"loss": 0.3751,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1255086064338684,
"step": 555,
"valid_targets_mean": 3384.8,
"valid_targets_min": 280
},
{
"epoch": 6.222222222222222,
"grad_norm": 0.3332990066131783,
"learning_rate": 1.527714642852045e-06,
"loss": 0.3771,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10999426990747452,
"step": 560,
"valid_targets_mean": 3432.9,
"valid_targets_min": 448
},
{
"epoch": 6.277777777777778,
"grad_norm": 0.31562874627264265,
"learning_rate": 1.3224410616127292e-06,
"loss": 0.3786,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12806940078735352,
"step": 565,
"valid_targets_mean": 4304.7,
"valid_targets_min": 1720
},
{
"epoch": 6.333333333333333,
"grad_norm": 0.3466774183511758,
"learning_rate": 1.1315014333425455e-06,
"loss": 0.3729,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1276746392250061,
"step": 570,
"valid_targets_mean": 3264.2,
"valid_targets_min": 1090
},
{
"epoch": 6.388888888888889,
"grad_norm": 0.33063763343455427,
"learning_rate": 9.550422932316938e-07,
"loss": 0.3707,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12351889163255692,
"step": 575,
"valid_targets_mean": 4072.2,
"valid_targets_min": 345
},
{
"epoch": 6.444444444444445,
"grad_norm": 0.3371607315092282,
"learning_rate": 7.931990635280052e-07,
"loss": 0.381,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1360011100769043,
"step": 580,
"valid_targets_mean": 3644.0,
"valid_targets_min": 387
},
{
"epoch": 6.5,
"grad_norm": 0.3142943262407192,
"learning_rate": 6.460959496081276e-07,
"loss": 0.3729,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12145544588565826,
"step": 585,
"valid_targets_mean": 3623.0,
"valid_targets_min": 436
},
{
"epoch": 6.555555555555555,
"grad_norm": 0.3120929356571334,
"learning_rate": 5.13845844657066e-07,
"loss": 0.3766,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12722007930278778,
"step": 590,
"valid_targets_mean": 4118.3,
"valid_targets_min": 214
},
{
"epoch": 6.611111111111111,
"grad_norm": 0.3248736657284745,
"learning_rate": 3.965502430291235e-07,
"loss": 0.38,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12026658654212952,
"step": 595,
"valid_targets_mean": 3682.3,
"valid_targets_min": 153
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.3371061664081027,
"learning_rate": 2.942991623568436e-07,
"loss": 0.3816,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1275937557220459,
"step": 600,
"valid_targets_mean": 3659.6,
"valid_targets_min": 328
},
{
"epoch": 6.722222222222222,
"grad_norm": 0.3414371651160847,
"learning_rate": 2.0717107446762696e-07,
"loss": 0.3803,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11111524701118469,
"step": 605,
"valid_targets_mean": 2836.0,
"valid_targets_min": 390
},
{
"epoch": 6.777777777777778,
"grad_norm": 0.33615108170492364,
"learning_rate": 1.3523284516113955e-07,
"loss": 0.3797,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14175578951835632,
"step": 610,
"valid_targets_mean": 3894.5,
"valid_targets_min": 308
},
{
"epoch": 6.833333333333333,
"grad_norm": 0.3180306253416986,
"learning_rate": 7.853968289363245e-08,
"loss": 0.3783,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1266726553440094,
"step": 615,
"valid_targets_mean": 3853.8,
"valid_targets_min": 272
},
{
"epoch": 6.888888888888889,
"grad_norm": 0.34831401135169726,
"learning_rate": 3.7135096408631443e-08,
"loss": 0.3762,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11110063642263412,
"step": 620,
"valid_targets_mean": 3097.7,
"valid_targets_min": 1279
},
{
"epoch": 6.944444444444445,
"grad_norm": 0.39253455514179003,
"learning_rate": 1.1050861346488806e-08,
"loss": 0.3821,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12320859730243683,
"step": 625,
"valid_targets_mean": 3804.1,
"valid_targets_min": 505
},
{
"epoch": 7.0,
"grad_norm": 0.345704758959377,
"learning_rate": 3.069958583856725e-10,
"loss": 0.3799,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13302914798259735,
"step": 630,
"valid_targets_mean": 3211.4,
"valid_targets_min": 220
},
{
"epoch": 7.0,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13302914798259735,
"step": 630,
"total_flos": 1.4527566279155384e+18,
"train_loss": 0.4386636906199985,
"train_runtime": 8968.3932,
"train_samples_per_second": 6.727,
"train_steps_per_second": 0.07,
"valid_targets_mean": 3211.4,
"valid_targets_min": 220
}
],
"logging_steps": 5,
"max_steps": 630,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4527566279155384e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}