Files
sft__Kimi-2-5-swesmith-orac…/trainer_state.json

1555 lines
43 KiB
JSON
Raw Permalink Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.0,
"eval_steps": 500,
"global_step": 686,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.051194539249146756,
"grad_norm": 25.82480694310689,
"learning_rate": 2.3188405797101453e-06,
"loss": 0.8837,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28898581862449646,
"step": 5,
"valid_targets_mean": 4911.5,
"valid_targets_min": 2215
},
{
"epoch": 0.10238907849829351,
"grad_norm": 12.744490357644285,
"learning_rate": 5.2173913043478265e-06,
"loss": 0.7924,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2303556501865387,
"step": 10,
"valid_targets_mean": 5216.9,
"valid_targets_min": 319
},
{
"epoch": 0.15358361774744028,
"grad_norm": 2.6591207927386353,
"learning_rate": 8.115942028985508e-06,
"loss": 0.629,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22648176550865173,
"step": 15,
"valid_targets_mean": 5766.3,
"valid_targets_min": 2726
},
{
"epoch": 0.20477815699658702,
"grad_norm": 1.534238831991734,
"learning_rate": 1.101449275362319e-05,
"loss": 0.5623,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1773180514574051,
"step": 20,
"valid_targets_mean": 5145.9,
"valid_targets_min": 2241
},
{
"epoch": 0.25597269624573377,
"grad_norm": 1.0352664016291477,
"learning_rate": 1.391304347826087e-05,
"loss": 0.5276,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1391734778881073,
"step": 25,
"valid_targets_mean": 4861.1,
"valid_targets_min": 134
},
{
"epoch": 0.30716723549488056,
"grad_norm": 0.6935657264704905,
"learning_rate": 1.681159420289855e-05,
"loss": 0.4868,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15757042169570923,
"step": 30,
"valid_targets_mean": 5287.8,
"valid_targets_min": 1702
},
{
"epoch": 0.3583617747440273,
"grad_norm": 0.574511779794303,
"learning_rate": 1.9710144927536236e-05,
"loss": 0.4641,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14636749029159546,
"step": 35,
"valid_targets_mean": 4977.4,
"valid_targets_min": 1383
},
{
"epoch": 0.40955631399317405,
"grad_norm": 0.45954923986300594,
"learning_rate": 2.2608695652173914e-05,
"loss": 0.4417,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14387136697769165,
"step": 40,
"valid_targets_mean": 5467.8,
"valid_targets_min": 413
},
{
"epoch": 0.46075085324232085,
"grad_norm": 0.36407097207825967,
"learning_rate": 2.5507246376811593e-05,
"loss": 0.4245,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11947175860404968,
"step": 45,
"valid_targets_mean": 4556.4,
"valid_targets_min": 1526
},
{
"epoch": 0.5119453924914675,
"grad_norm": 0.3200847843431381,
"learning_rate": 2.840579710144928e-05,
"loss": 0.4008,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12619568407535553,
"step": 50,
"valid_targets_mean": 5079.8,
"valid_targets_min": 409
},
{
"epoch": 0.5631399317406144,
"grad_norm": 0.2749784180114348,
"learning_rate": 3.130434782608696e-05,
"loss": 0.3861,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15403807163238525,
"step": 55,
"valid_targets_mean": 5949.9,
"valid_targets_min": 350
},
{
"epoch": 0.6143344709897611,
"grad_norm": 0.260946688973715,
"learning_rate": 3.420289855072464e-05,
"loss": 0.3798,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10824601352214813,
"step": 60,
"valid_targets_mean": 4519.8,
"valid_targets_min": 1730
},
{
"epoch": 0.6655290102389079,
"grad_norm": 0.2695889942073195,
"learning_rate": 3.7101449275362325e-05,
"loss": 0.3636,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14311721920967102,
"step": 65,
"valid_targets_mean": 5901.2,
"valid_targets_min": 2696
},
{
"epoch": 0.7167235494880546,
"grad_norm": 0.2513368707958076,
"learning_rate": 4e-05,
"loss": 0.3524,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.113977812230587,
"step": 70,
"valid_targets_mean": 5193.1,
"valid_targets_min": 937
},
{
"epoch": 0.7679180887372014,
"grad_norm": 0.25669844186315777,
"learning_rate": 3.999351894109228e-05,
"loss": 0.3337,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10874387621879578,
"step": 75,
"valid_targets_mean": 5100.9,
"valid_targets_min": 1998
},
{
"epoch": 0.8191126279863481,
"grad_norm": 0.2654328857905388,
"learning_rate": 3.997407996478158e-05,
"loss": 0.3345,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11455205827951431,
"step": 80,
"valid_targets_mean": 5224.6,
"valid_targets_min": 433
},
{
"epoch": 0.8703071672354948,
"grad_norm": 0.2593782401899841,
"learning_rate": 3.9941695669582944e-05,
"loss": 0.3304,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10599816590547562,
"step": 85,
"valid_targets_mean": 4968.2,
"valid_targets_min": 2196
},
{
"epoch": 0.9215017064846417,
"grad_norm": 0.23686889348670484,
"learning_rate": 3.989638704394887e-05,
"loss": 0.3168,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11794209480285645,
"step": 90,
"valid_targets_mean": 5557.9,
"valid_targets_min": 1855
},
{
"epoch": 0.9726962457337884,
"grad_norm": 0.27218880367654336,
"learning_rate": 3.983818345266653e-05,
"loss": 0.3149,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09047803282737732,
"step": 95,
"valid_targets_mean": 4906.0,
"valid_targets_min": 2087
},
{
"epoch": 1.0204778156996588,
"grad_norm": 0.2548103228534002,
"learning_rate": 3.976712261782631e-05,
"loss": 0.3204,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10694491863250732,
"step": 100,
"valid_targets_mean": 4863.1,
"valid_targets_min": 1985
},
{
"epoch": 1.0716723549488054,
"grad_norm": 0.26271765818455856,
"learning_rate": 3.968325059437385e-05,
"loss": 0.3154,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11918433755636215,
"step": 105,
"valid_targets_mean": 5599.3,
"valid_targets_min": 421
},
{
"epoch": 1.1228668941979523,
"grad_norm": 0.27930934721029477,
"learning_rate": 3.958662174026164e-05,
"loss": 0.3121,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09404879063367844,
"step": 110,
"valid_targets_mean": 5062.9,
"valid_targets_min": 2242
},
{
"epoch": 1.174061433447099,
"grad_norm": 0.26321540953232964,
"learning_rate": 3.947729868121924e-05,
"loss": 0.3053,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10871408879756927,
"step": 115,
"valid_targets_mean": 5306.3,
"valid_targets_min": 314
},
{
"epoch": 1.2252559726962458,
"grad_norm": 0.26402045132441954,
"learning_rate": 3.935535227016521e-05,
"loss": 0.3076,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11834833025932312,
"step": 120,
"valid_targets_mean": 5735.2,
"valid_targets_min": 365
},
{
"epoch": 1.2764505119453924,
"grad_norm": 0.26407368854218555,
"learning_rate": 3.922086154128693e-05,
"loss": 0.2968,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10184627771377563,
"step": 125,
"valid_targets_mean": 5052.8,
"valid_targets_min": 2201
},
{
"epoch": 1.3276450511945392,
"grad_norm": 0.26565677723672504,
"learning_rate": 3.907391365881802e-05,
"loss": 0.3075,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08913706988096237,
"step": 130,
"valid_targets_mean": 4881.6,
"valid_targets_min": 2122
},
{
"epoch": 1.378839590443686,
"grad_norm": 0.2831307137649524,
"learning_rate": 3.891460386054675e-05,
"loss": 0.299,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09639839828014374,
"step": 135,
"valid_targets_mean": 4887.5,
"valid_targets_min": 2153
},
{
"epoch": 1.4300341296928327,
"grad_norm": 0.27646586353677727,
"learning_rate": 3.8743035396091845e-05,
"loss": 0.2998,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11154685169458389,
"step": 140,
"valid_targets_mean": 5716.4,
"valid_targets_min": 2925
},
{
"epoch": 1.4812286689419796,
"grad_norm": 0.27431775252756874,
"learning_rate": 3.8559319459985776e-05,
"loss": 0.2969,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08550999313592911,
"step": 145,
"valid_targets_mean": 4336.7,
"valid_targets_min": 416
},
{
"epoch": 1.5324232081911262,
"grad_norm": 0.25781067169583155,
"learning_rate": 3.836357511960898e-05,
"loss": 0.2946,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1011088490486145,
"step": 150,
"valid_targets_mean": 5483.2,
"valid_targets_min": 2251
},
{
"epoch": 1.583617747440273,
"grad_norm": 0.27911127666111935,
"learning_rate": 3.815592923802152e-05,
"loss": 0.2922,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10180200636386871,
"step": 155,
"valid_targets_mean": 5241.7,
"valid_targets_min": 1968
},
{
"epoch": 1.63481228668942,
"grad_norm": 0.2743316944893025,
"learning_rate": 3.793651639174246e-05,
"loss": 0.2881,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09541475772857666,
"step": 160,
"valid_targets_mean": 5239.0,
"valid_targets_min": 220
},
{
"epoch": 1.6860068259385665,
"grad_norm": 0.2943897714245753,
"learning_rate": 3.7705478783529986e-05,
"loss": 0.2971,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08847599476575851,
"step": 165,
"valid_targets_mean": 4835.5,
"valid_targets_min": 2225
},
{
"epoch": 1.7372013651877132,
"grad_norm": 0.261601602924854,
"learning_rate": 3.746296615021896e-05,
"loss": 0.296,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09461049735546112,
"step": 170,
"valid_targets_mean": 4939.5,
"valid_targets_min": 2062
},
{
"epoch": 1.78839590443686,
"grad_norm": 0.27191268600437934,
"learning_rate": 3.720913566567562e-05,
"loss": 0.2894,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09755190461874008,
"step": 175,
"valid_targets_mean": 5106.4,
"valid_targets_min": 2367
},
{
"epoch": 1.8395904436860069,
"grad_norm": 0.24464624444284053,
"learning_rate": 3.6944151838932274e-05,
"loss": 0.2935,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.110307976603508,
"step": 180,
"valid_targets_mean": 6091.2,
"valid_targets_min": 1757
},
{
"epoch": 1.8907849829351537,
"grad_norm": 0.2602335537625986,
"learning_rate": 3.666818640756797e-05,
"loss": 0.2902,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09931231290102005,
"step": 185,
"valid_targets_mean": 5034.7,
"valid_targets_min": 451
},
{
"epoch": 1.9419795221843004,
"grad_norm": 0.25671393831155714,
"learning_rate": 3.638141822640444e-05,
"loss": 0.2908,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0843583270907402,
"step": 190,
"valid_targets_mean": 4742.2,
"valid_targets_min": 2233
},
{
"epoch": 1.993174061433447,
"grad_norm": 0.23767750953863967,
"learning_rate": 3.608403315158917e-05,
"loss": 0.2915,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08634407818317413,
"step": 195,
"valid_targets_mean": 4896.8,
"valid_targets_min": 329
},
{
"epoch": 2.0409556313993176,
"grad_norm": 0.2549526896790837,
"learning_rate": 3.5776223920140985e-05,
"loss": 0.2833,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11179839074611664,
"step": 200,
"valid_targets_mean": 6050.2,
"valid_targets_min": 2254
},
{
"epoch": 2.092150170648464,
"grad_norm": 0.2738878896819206,
"learning_rate": 3.545819002503602e-05,
"loss": 0.284,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09601035714149475,
"step": 205,
"valid_targets_mean": 5224.0,
"valid_targets_min": 1694
},
{
"epoch": 2.143344709897611,
"grad_norm": 0.2780264276944305,
"learning_rate": 3.513013758591515e-05,
"loss": 0.2815,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0888153463602066,
"step": 210,
"valid_targets_mean": 4907.5,
"valid_targets_min": 365
},
{
"epoch": 2.1945392491467577,
"grad_norm": 0.26333842271632707,
"learning_rate": 3.479227921549666e-05,
"loss": 0.281,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09323578327894211,
"step": 215,
"valid_targets_mean": 5229.7,
"valid_targets_min": 241
},
{
"epoch": 2.2457337883959045,
"grad_norm": 0.2934385105416155,
"learning_rate": 3.444483388178066e-05,
"loss": 0.27,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09266763925552368,
"step": 220,
"valid_targets_mean": 5267.0,
"valid_targets_min": 410
},
{
"epoch": 2.296928327645051,
"grad_norm": 0.25567549374046494,
"learning_rate": 3.4088026766134654e-05,
"loss": 0.2782,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08348363637924194,
"step": 225,
"valid_targets_mean": 5137.3,
"valid_targets_min": 2069
},
{
"epoch": 2.348122866894198,
"grad_norm": 0.3022766271875021,
"learning_rate": 3.372208911735216e-05,
"loss": 0.2856,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08997620642185211,
"step": 230,
"valid_targets_mean": 5248.5,
"valid_targets_min": 424
},
{
"epoch": 2.3993174061433447,
"grad_norm": 0.2534445787104033,
"learning_rate": 3.3347258101779015e-05,
"loss": 0.282,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11496127396821976,
"step": 235,
"valid_targets_mean": 5611.1,
"valid_targets_min": 264
},
{
"epoch": 2.4505119453924915,
"grad_norm": 0.2520452606948065,
"learning_rate": 3.296377664960445e-05,
"loss": 0.2708,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09025128185749054,
"step": 240,
"valid_targets_mean": 5019.2,
"valid_targets_min": 433
},
{
"epoch": 2.5017064846416384,
"grad_norm": 0.2627583946055227,
"learning_rate": 3.257189329741662e-05,
"loss": 0.2723,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08457937836647034,
"step": 245,
"valid_targets_mean": 4861.2,
"valid_targets_min": 1761
},
{
"epoch": 2.5529010238907848,
"grad_norm": 0.2561709090419887,
"learning_rate": 3.217186202712458e-05,
"loss": 0.274,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0853310376405716,
"step": 250,
"valid_targets_mean": 4727.4,
"valid_targets_min": 1792
},
{
"epoch": 2.6040955631399316,
"grad_norm": 0.44538128860438136,
"learning_rate": 3.1763942101351095e-05,
"loss": 0.2773,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09366409480571747,
"step": 255,
"valid_targets_mean": 5007.0,
"valid_targets_min": 365
},
{
"epoch": 2.6552901023890785,
"grad_norm": 0.2852975254240757,
"learning_rate": 3.134839789540302e-05,
"loss": 0.2752,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0924173966050148,
"step": 260,
"valid_targets_mean": 5460.8,
"valid_targets_min": 433
},
{
"epoch": 2.7064846416382253,
"grad_norm": 0.26038175748374537,
"learning_rate": 3.0925498725928115e-05,
"loss": 0.2702,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0912502110004425,
"step": 265,
"valid_targets_mean": 5070.3,
"valid_targets_min": 2242
},
{
"epoch": 2.757679180887372,
"grad_norm": 0.2663014460071229,
"learning_rate": 3.0495518676369306e-05,
"loss": 0.2767,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09056515991687775,
"step": 270,
"valid_targets_mean": 5028.2,
"valid_targets_min": 1937
},
{
"epoch": 2.8088737201365186,
"grad_norm": 0.3060566751344927,
"learning_rate": 3.0058736419329643e-05,
"loss": 0.2693,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08921462297439575,
"step": 275,
"valid_targets_mean": 4812.8,
"valid_targets_min": 314
},
{
"epoch": 2.8600682593856654,
"grad_norm": 0.24797757600989206,
"learning_rate": 2.9615435035962878e-05,
"loss": 0.2774,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09497439861297607,
"step": 280,
"valid_targets_mean": 5516.0,
"valid_targets_min": 2022
},
{
"epoch": 2.9112627986348123,
"grad_norm": 0.3147509660798771,
"learning_rate": 2.9165901832506977e-05,
"loss": 0.2714,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09763729572296143,
"step": 285,
"valid_targets_mean": 4715.4,
"valid_targets_min": 284
},
{
"epoch": 2.962457337883959,
"grad_norm": 0.26780692138537443,
"learning_rate": 2.8710428154079185e-05,
"loss": 0.2684,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08496683835983276,
"step": 290,
"valid_targets_mean": 4745.5,
"valid_targets_min": 1235
},
{
"epoch": 3.0102389078498293,
"grad_norm": 0.26968597810630485,
"learning_rate": 2.824930919585359e-05,
"loss": 0.2695,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08243635296821594,
"step": 295,
"valid_targets_mean": 5450.9,
"valid_targets_min": 317
},
{
"epoch": 3.061433447098976,
"grad_norm": 0.31124671324275366,
"learning_rate": 2.778284381174336e-05,
"loss": 0.2619,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08364085853099823,
"step": 300,
"valid_targets_mean": 5530.3,
"valid_targets_min": 1478
},
{
"epoch": 3.112627986348123,
"grad_norm": 0.30315704623816053,
"learning_rate": 2.7311334320711784e-05,
"loss": 0.2627,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09039817005395889,
"step": 305,
"valid_targets_mean": 5173.2,
"valid_targets_min": 273
},
{
"epoch": 3.1638225255972694,
"grad_norm": 0.2822760635459168,
"learning_rate": 2.683508631083755e-05,
"loss": 0.2664,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07820077240467072,
"step": 310,
"valid_targets_mean": 4802.0,
"valid_targets_min": 2196
},
{
"epoch": 3.2150170648464163,
"grad_norm": 0.29743866886806536,
"learning_rate": 2.6354408441261324e-05,
"loss": 0.2679,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09245477616786957,
"step": 315,
"valid_targets_mean": 5530.5,
"valid_targets_min": 393
},
{
"epoch": 3.266211604095563,
"grad_norm": 0.27606973763989207,
"learning_rate": 2.5869612242141946e-05,
"loss": 0.2664,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08491060137748718,
"step": 320,
"valid_targets_mean": 5039.2,
"valid_targets_min": 2348
},
{
"epoch": 3.31740614334471,
"grad_norm": 0.2502805038424136,
"learning_rate": 2.538101191275189e-05,
"loss": 0.2647,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09891136735677719,
"step": 325,
"valid_targets_mean": 5541.6,
"valid_targets_min": 315
},
{
"epoch": 3.368600682593857,
"grad_norm": 0.2611678416517467,
"learning_rate": 2.488892411784286e-05,
"loss": 0.2601,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08309619128704071,
"step": 330,
"valid_targets_mean": 4493.0,
"valid_targets_min": 2063
},
{
"epoch": 3.419795221843003,
"grad_norm": 0.2611914869636264,
"learning_rate": 2.439366778241352e-05,
"loss": 0.2706,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09184084832668304,
"step": 335,
"valid_targets_mean": 4901.5,
"valid_targets_min": 430
},
{
"epoch": 3.47098976109215,
"grad_norm": 0.27246810160180385,
"learning_rate": 2.3895563885012303e-05,
"loss": 0.2604,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08379268646240234,
"step": 340,
"valid_targets_mean": 4821.3,
"valid_targets_min": 2467
},
{
"epoch": 3.522184300341297,
"grad_norm": 0.2536800115672537,
"learning_rate": 2.3394935249709332e-05,
"loss": 0.2641,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08427208662033081,
"step": 345,
"valid_targets_mean": 4990.8,
"valid_targets_min": 2231
},
{
"epoch": 3.573378839590444,
"grad_norm": 0.258064897446696,
"learning_rate": 2.2892106336872234e-05,
"loss": 0.2659,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08550337702035904,
"step": 350,
"valid_targets_mean": 5197.7,
"valid_targets_min": 2135
},
{
"epoch": 3.6245733788395906,
"grad_norm": 0.2495721887544657,
"learning_rate": 2.2387403032881467e-05,
"loss": 0.2622,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08749514818191528,
"step": 355,
"valid_targets_mean": 4881.7,
"valid_targets_min": 2212
},
{
"epoch": 3.675767918088737,
"grad_norm": 0.2468445540270157,
"learning_rate": 2.1881152438921447e-05,
"loss": 0.2624,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08829236775636673,
"step": 360,
"valid_targets_mean": 5321.8,
"valid_targets_min": 1695
},
{
"epoch": 3.726962457337884,
"grad_norm": 0.2550167362316407,
"learning_rate": 2.1373682658984317e-05,
"loss": 0.2639,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0848623663187027,
"step": 365,
"valid_targets_mean": 4613.7,
"valid_targets_min": 2456
},
{
"epoch": 3.7781569965870307,
"grad_norm": 0.2310804725665606,
"learning_rate": 2.0865322587223855e-05,
"loss": 0.2585,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08814995735883713,
"step": 370,
"valid_targets_mean": 5460.0,
"valid_targets_min": 1805
},
{
"epoch": 3.8293515358361776,
"grad_norm": 0.23090177072633178,
"learning_rate": 2.035640169479719e-05,
"loss": 0.2617,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.06490929424762726,
"step": 375,
"valid_targets_mean": 4068.9,
"valid_targets_min": 1709
},
{
"epoch": 3.8805460750853245,
"grad_norm": 0.23838704123836477,
"learning_rate": 1.9847249816332644e-05,
"loss": 0.2679,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08089349418878555,
"step": 380,
"valid_targets_mean": 4778.1,
"valid_targets_min": 349
},
{
"epoch": 3.931740614334471,
"grad_norm": 0.260605487192663,
"learning_rate": 1.933819693616195e-05,
"loss": 0.2529,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08289642632007599,
"step": 385,
"valid_targets_mean": 4828.9,
"valid_targets_min": 2450
},
{
"epoch": 3.9829351535836177,
"grad_norm": 0.26648319115456115,
"learning_rate": 1.8829572974455465e-05,
"loss": 0.2689,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10124649107456207,
"step": 390,
"valid_targets_mean": 5940.3,
"valid_targets_min": 2112
},
{
"epoch": 4.030716723549488,
"grad_norm": 0.25074836696819963,
"learning_rate": 1.832170757339895e-05,
"loss": 0.2607,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09088584035634995,
"step": 395,
"valid_targets_mean": 5790.5,
"valid_targets_min": 1996
},
{
"epoch": 4.081911262798635,
"grad_norm": 0.23932382754020326,
"learning_rate": 1.781492988355056e-05,
"loss": 0.2591,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08872021734714508,
"step": 400,
"valid_targets_mean": 5202.1,
"valid_targets_min": 443
},
{
"epoch": 4.1331058020477816,
"grad_norm": 0.24243405252900668,
"learning_rate": 1.7309568350516376e-05,
"loss": 0.254,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07671861350536346,
"step": 405,
"valid_targets_mean": 5088.6,
"valid_targets_min": 2001
},
{
"epoch": 4.184300341296928,
"grad_norm": 0.24082013214852604,
"learning_rate": 1.680595050208296e-05,
"loss": 0.2549,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08671650290489197,
"step": 410,
"valid_targets_mean": 5681.1,
"valid_targets_min": 2280
},
{
"epoch": 4.235494880546075,
"grad_norm": 0.2705939041983527,
"learning_rate": 1.630440273594455e-05,
"loss": 0.2647,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08631773293018341,
"step": 415,
"valid_targets_mean": 4975.5,
"valid_targets_min": 446
},
{
"epoch": 4.286689419795222,
"grad_norm": 0.2647801079481516,
"learning_rate": 1.5805250108162898e-05,
"loss": 0.2493,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08514288067817688,
"step": 420,
"valid_targets_mean": 5125.4,
"valid_targets_min": 2016
},
{
"epoch": 4.337883959044369,
"grad_norm": 0.24156132010147113,
"learning_rate": 1.530881612249646e-05,
"loss": 0.2581,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0757707729935646,
"step": 425,
"valid_targets_mean": 5020.7,
"valid_targets_min": 2842
},
{
"epoch": 4.389078498293515,
"grad_norm": 0.24248555906848412,
"learning_rate": 1.4815422520735735e-05,
"loss": 0.2536,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07050696760416031,
"step": 430,
"valid_targets_mean": 4395.3,
"valid_targets_min": 2210
},
{
"epoch": 4.440273037542662,
"grad_norm": 0.24171942435675195,
"learning_rate": 1.432538907418047e-05,
"loss": 0.2548,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08460260927677155,
"step": 435,
"valid_targets_mean": 5636.2,
"valid_targets_min": 2623
},
{
"epoch": 4.491467576791809,
"grad_norm": 0.31663158719733203,
"learning_rate": 1.3839033376394082e-05,
"loss": 0.257,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08407416194677353,
"step": 440,
"valid_targets_mean": 4772.5,
"valid_targets_min": 134
},
{
"epoch": 4.5426621160409555,
"grad_norm": 0.23417315461999244,
"learning_rate": 1.33566706373693e-05,
"loss": 0.2599,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08053170144557953,
"step": 445,
"valid_targets_mean": 5175.6,
"valid_targets_min": 281
},
{
"epoch": 4.593856655290102,
"grad_norm": 0.24575926696014141,
"learning_rate": 1.2878613479238774e-05,
"loss": 0.2541,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08464860171079636,
"step": 450,
"valid_targets_mean": 5122.8,
"valid_targets_min": 1882
},
{
"epoch": 4.645051194539249,
"grad_norm": 0.23884179028036426,
"learning_rate": 1.2405171733662822e-05,
"loss": 0.2519,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07469707727432251,
"step": 455,
"valid_targets_mean": 4802.0,
"valid_targets_min": 1730
},
{
"epoch": 4.696245733788396,
"grad_norm": 0.2622151398636782,
"learning_rate": 1.1936652241025679e-05,
"loss": 0.2499,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07871793210506439,
"step": 460,
"valid_targets_mean": 5118.5,
"valid_targets_min": 389
},
{
"epoch": 4.747440273037543,
"grad_norm": 0.24354224502364916,
"learning_rate": 1.1473358651570479e-05,
"loss": 0.2589,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08825745433568954,
"step": 465,
"valid_targets_mean": 5207.9,
"valid_targets_min": 487
},
{
"epoch": 4.798634812286689,
"grad_norm": 0.26717838407309835,
"learning_rate": 1.1015591228601692e-05,
"loss": 0.2523,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09401947259902954,
"step": 470,
"valid_targets_mean": 5151.4,
"valid_targets_min": 2040
},
{
"epoch": 4.849829351535837,
"grad_norm": 0.6288828196111563,
"learning_rate": 1.0563646653882755e-05,
"loss": 0.2543,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08458419144153595,
"step": 475,
"valid_targets_mean": 5222.2,
"valid_targets_min": 2375
},
{
"epoch": 4.901023890784983,
"grad_norm": 0.2388380799823632,
"learning_rate": 1.0117817835354851e-05,
"loss": 0.2545,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10258987545967102,
"step": 480,
"valid_targets_mean": 5773.7,
"valid_targets_min": 273
},
{
"epoch": 4.952218430034129,
"grad_norm": 0.23524178853837582,
"learning_rate": 9.678393717301526e-06,
"loss": 0.2609,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08160092681646347,
"step": 485,
"valid_targets_mean": 4906.0,
"valid_targets_min": 2117
},
{
"epoch": 5.0,
"grad_norm": 0.2815721527054231,
"learning_rate": 9.245659093082243e-06,
"loss": 0.2541,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11172492057085037,
"step": 490,
"valid_targets_mean": 4392.0,
"valid_targets_min": 519
},
{
"epoch": 5.051194539249146,
"grad_norm": 0.23545881110678396,
"learning_rate": 8.819894420556112e-06,
"loss": 0.246,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09946560859680176,
"step": 495,
"valid_targets_mean": 5775.0,
"valid_targets_min": 1809
},
{
"epoch": 5.102389078498294,
"grad_norm": 0.2332984410262101,
"learning_rate": 8.40137564031547e-06,
"loss": 0.2511,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08911414444446564,
"step": 500,
"valid_targets_mean": 4858.6,
"valid_targets_min": 424
},
{
"epoch": 5.15358361774744,
"grad_norm": 0.2406516993513017,
"learning_rate": 7.990373996847194e-06,
"loss": 0.2511,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0798487663269043,
"step": 505,
"valid_targets_mean": 5106.6,
"valid_targets_min": 414
},
{
"epoch": 5.204778156996587,
"grad_norm": 0.25632115440474484,
"learning_rate": 7.5871558627375295e-06,
"loss": 0.2481,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08686481416225433,
"step": 510,
"valid_targets_mean": 4846.7,
"valid_targets_min": 492
},
{
"epoch": 5.255972696245734,
"grad_norm": 0.23873965803262287,
"learning_rate": 7.1919825660344696e-06,
"loss": 0.255,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08521182835102081,
"step": 515,
"valid_targets_mean": 5219.3,
"valid_targets_min": 1881
},
{
"epoch": 5.30716723549488,
"grad_norm": 0.23815246828270364,
"learning_rate": 6.805110220879459e-06,
"loss": 0.2543,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07839064300060272,
"step": 520,
"valid_targets_mean": 4840.9,
"valid_targets_min": 2536
},
{
"epoch": 5.3583617747440275,
"grad_norm": 0.2975224705505978,
"learning_rate": 6.4267895615183915e-06,
"loss": 0.2547,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08646044880151749,
"step": 525,
"valid_targets_mean": 4866.6,
"valid_targets_min": 497
},
{
"epoch": 5.409556313993174,
"grad_norm": 0.23640902606037387,
"learning_rate": 6.057265779799193e-06,
"loss": 0.2553,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10723152756690979,
"step": 530,
"valid_targets_mean": 5736.0,
"valid_targets_min": 299
},
{
"epoch": 5.460750853242321,
"grad_norm": 0.2307913484720284,
"learning_rate": 5.696778366261575e-06,
"loss": 0.2459,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08867950737476349,
"step": 535,
"valid_targets_mean": 5571.2,
"valid_targets_min": 1468
},
{
"epoch": 5.511945392491468,
"grad_norm": 0.23315915412363156,
"learning_rate": 5.345560954921802e-06,
"loss": 0.2492,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08251780271530151,
"step": 540,
"valid_targets_mean": 5138.9,
"valid_targets_min": 428
},
{
"epoch": 5.563139931740614,
"grad_norm": 0.22322887600839503,
"learning_rate": 5.00384117185311e-06,
"loss": 0.2506,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09144619107246399,
"step": 545,
"valid_targets_mean": 5855.6,
"valid_targets_min": 1850
},
{
"epoch": 5.614334470989761,
"grad_norm": 0.24296113504336472,
"learning_rate": 4.671840487659882e-06,
"loss": 0.2509,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09224293380975723,
"step": 550,
"valid_targets_mean": 6035.7,
"valid_targets_min": 2794
},
{
"epoch": 5.665529010238908,
"grad_norm": 0.2501891462512511,
"learning_rate": 4.3497740739413015e-06,
"loss": 0.2492,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07294757664203644,
"step": 555,
"valid_targets_mean": 4596.5,
"valid_targets_min": 453
},
{
"epoch": 5.716723549488055,
"grad_norm": 0.23660123212684395,
"learning_rate": 4.037850663837315e-06,
"loss": 0.2534,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07551420480012894,
"step": 560,
"valid_targets_mean": 4695.0,
"valid_targets_min": 1834
},
{
"epoch": 5.7679180887372015,
"grad_norm": 0.2298824544771145,
"learning_rate": 3.7362724167474774e-06,
"loss": 0.2558,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0779377892613411,
"step": 565,
"valid_targets_mean": 4954.0,
"valid_targets_min": 2135
},
{
"epoch": 5.819112627986348,
"grad_norm": 0.21382794578335707,
"learning_rate": 3.4452347873102565e-06,
"loss": 0.2451,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07789836078882217,
"step": 570,
"valid_targets_mean": 5438.0,
"valid_targets_min": 2658
},
{
"epoch": 5.870307167235495,
"grad_norm": 0.21625982549299644,
"learning_rate": 3.1649263987277303e-06,
"loss": 0.247,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0796944797039032,
"step": 575,
"valid_targets_mean": 5126.0,
"valid_targets_min": 1466
},
{
"epoch": 5.921501706484642,
"grad_norm": 0.21966123329170228,
"learning_rate": 2.8955289205177696e-06,
"loss": 0.2462,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08940281718969345,
"step": 580,
"valid_targets_mean": 5365.1,
"valid_targets_min": 542
},
{
"epoch": 5.972696245733788,
"grad_norm": 0.23264769437080063,
"learning_rate": 2.6372169507729627e-06,
"loss": 0.2588,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08277872949838638,
"step": 585,
"valid_targets_mean": 5623.2,
"valid_targets_min": 2509
},
{
"epoch": 6.020477815699659,
"grad_norm": 0.23969753992511295,
"learning_rate": 2.3901579030025566e-06,
"loss": 0.2553,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0817054808139801,
"step": 590,
"valid_targets_mean": 5169.0,
"valid_targets_min": 2446
},
{
"epoch": 6.071672354948806,
"grad_norm": 0.235094543609173,
"learning_rate": 2.15451189763078e-06,
"loss": 0.2474,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09182994067668915,
"step": 595,
"valid_targets_mean": 5807.4,
"valid_targets_min": 2348
},
{
"epoch": 6.122866894197952,
"grad_norm": 0.2427230093710648,
"learning_rate": 1.930431658221854e-06,
"loss": 0.2486,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08008411526679993,
"step": 600,
"valid_targets_mean": 4879.1,
"valid_targets_min": 446
},
{
"epoch": 6.174061433447099,
"grad_norm": 0.23142147399386198,
"learning_rate": 1.7180624124989398e-06,
"loss": 0.2543,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0793958306312561,
"step": 605,
"valid_targets_mean": 5092.9,
"valid_targets_min": 2197
},
{
"epoch": 6.225255972696246,
"grad_norm": 0.2280686494846546,
"learning_rate": 1.5175417982212138e-06,
"loss": 0.2475,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07384900748729706,
"step": 610,
"valid_targets_mean": 4928.0,
"valid_targets_min": 1860
},
{
"epoch": 6.276450511945392,
"grad_norm": 0.24929297314875484,
"learning_rate": 1.3289997739800108e-06,
"loss": 0.2481,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0833367109298706,
"step": 615,
"valid_targets_mean": 5088.8,
"valid_targets_min": 2764
},
{
"epoch": 6.327645051194539,
"grad_norm": 0.22000964241336962,
"learning_rate": 1.1525585349718948e-06,
"loss": 0.2465,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07332947105169296,
"step": 620,
"valid_targets_mean": 4677.8,
"valid_targets_min": 220
},
{
"epoch": 6.378839590443686,
"grad_norm": 0.21275459726601303,
"learning_rate": 9.883324338032474e-07,
"loss": 0.2525,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07395243644714355,
"step": 625,
"valid_targets_mean": 5172.9,
"valid_targets_min": 2763
},
{
"epoch": 6.4300341296928325,
"grad_norm": 0.22175862203200505,
"learning_rate": 8.364279063776526e-07,
"loss": 0.2458,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0932476595044136,
"step": 630,
"valid_targets_mean": 5806.0,
"valid_targets_min": 2569
},
{
"epoch": 6.48122866894198,
"grad_norm": 0.2230388965834993,
"learning_rate": 6.969434029141676e-07,
"loss": 0.2456,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07441005110740662,
"step": 635,
"valid_targets_mean": 4756.5,
"valid_targets_min": 2402
},
{
"epoch": 6.532423208191126,
"grad_norm": 0.349252390093201,
"learning_rate": 5.699693241411619e-07,
"loss": 0.248,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09055956453084946,
"step": 640,
"valid_targets_mean": 5269.7,
"valid_targets_min": 222
},
{
"epoch": 6.5836177474402735,
"grad_norm": 0.22602023572874996,
"learning_rate": 4.5558796270706254e-07,
"loss": 0.2469,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09014368057250977,
"step": 645,
"valid_targets_mean": 5677.8,
"valid_targets_min": 2555
},
{
"epoch": 6.63481228668942,
"grad_norm": 0.24090264289449,
"learning_rate": 3.5387344984600946e-07,
"loss": 0.2454,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08535467833280563,
"step": 650,
"valid_targets_mean": 4994.6,
"valid_targets_min": 2472
},
{
"epoch": 6.686006825938566,
"grad_norm": 0.20794321625955012,
"learning_rate": 2.64891707332966e-07,
"loss": 0.2494,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09163576364517212,
"step": 655,
"valid_targets_mean": 5890.2,
"valid_targets_min": 2276
},
{
"epoch": 6.737201365187714,
"grad_norm": 0.22007475356230136,
"learning_rate": 1.887004047594232e-07,
"loss": 0.239,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08739501237869263,
"step": 660,
"valid_targets_mean": 5521.7,
"valid_targets_min": 1730
},
{
"epoch": 6.78839590443686,
"grad_norm": 0.25286299855724226,
"learning_rate": 1.2534892215740667e-07,
"loss": 0.2517,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10239039361476898,
"step": 665,
"valid_targets_mean": 5944.1,
"valid_targets_min": 393
},
{
"epoch": 6.839590443686006,
"grad_norm": 0.24536425918506533,
"learning_rate": 7.487831799597889e-08,
"loss": 0.2463,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08520665764808655,
"step": 670,
"valid_targets_mean": 5079.4,
"valid_targets_min": 1871
},
{
"epoch": 6.890784982935154,
"grad_norm": 0.22126404698942398,
"learning_rate": 3.73213025710073e-08,
"loss": 0.2511,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08079634606838226,
"step": 675,
"valid_targets_mean": 5323.9,
"valid_targets_min": 1875
},
{
"epoch": 6.9419795221843,
"grad_norm": 0.23165959369746986,
"learning_rate": 1.2702216805431377e-08,
"loss": 0.257,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08266259729862213,
"step": 680,
"valid_targets_mean": 5277.4,
"valid_targets_min": 439
},
{
"epoch": 6.993174061433447,
"grad_norm": 0.2427793642183869,
"learning_rate": 1.037016473757202e-09,
"loss": 0.2553,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08131192624568939,
"step": 685,
"valid_targets_mean": 5002.9,
"valid_targets_min": 281
},
{
"epoch": 7.0,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11108480393886566,
"step": 686,
"total_flos": 2.601520842818978e+18,
"train_loss": 0.2925965665424183,
"train_runtime": 15951.2631,
"train_samples_per_second": 4.106,
"train_steps_per_second": 0.043,
"valid_targets_mean": 4441.2,
"valid_targets_min": 1797
}
],
"logging_steps": 5,
"max_steps": 686,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.601520842818978e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}