Files
alfworld-swesmith-r2egym-sw…/trainer_state.json

2600 lines
69 KiB
JSON
Raw Permalink Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.597039473684211,
"eval_steps": 500,
"global_step": 1421,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.024671052631578948,
"grad_norm": 5.377997875213623,
"learning_rate": 1.118881118881119e-06,
"loss": 0.5476,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.47340989112854004,
"step": 5
},
{
"epoch": 0.049342105263157895,
"grad_norm": 5.392073154449463,
"learning_rate": 2.517482517482518e-06,
"loss": 0.5429,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.48170506954193115,
"step": 10
},
{
"epoch": 0.07401315789473684,
"grad_norm": 4.068439960479736,
"learning_rate": 3.916083916083917e-06,
"loss": 0.5108,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5030127763748169,
"step": 15
},
{
"epoch": 0.09868421052631579,
"grad_norm": 1.50242280960083,
"learning_rate": 5.314685314685315e-06,
"loss": 0.4949,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4839881658554077,
"step": 20
},
{
"epoch": 0.12335526315789473,
"grad_norm": 0.9494503736495972,
"learning_rate": 6.713286713286714e-06,
"loss": 0.4425,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4382355809211731,
"step": 25
},
{
"epoch": 0.14802631578947367,
"grad_norm": 0.8043321967124939,
"learning_rate": 8.111888111888112e-06,
"loss": 0.4344,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3997962474822998,
"step": 30
},
{
"epoch": 0.17269736842105263,
"grad_norm": 0.7665994763374329,
"learning_rate": 9.510489510489511e-06,
"loss": 0.4256,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4312514364719391,
"step": 35
},
{
"epoch": 0.19736842105263158,
"grad_norm": 0.5361006855964661,
"learning_rate": 1.0909090909090909e-05,
"loss": 0.3893,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33488091826438904,
"step": 40
},
{
"epoch": 0.22203947368421054,
"grad_norm": 0.4154154062271118,
"learning_rate": 1.230769230769231e-05,
"loss": 0.3565,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2907222509384155,
"step": 45
},
{
"epoch": 0.24671052631578946,
"grad_norm": 0.38533732295036316,
"learning_rate": 1.3706293706293707e-05,
"loss": 0.3607,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.38802745938301086,
"step": 50
},
{
"epoch": 0.2713815789473684,
"grad_norm": 0.6358391642570496,
"learning_rate": 1.5104895104895105e-05,
"loss": 0.336,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35459989309310913,
"step": 55
},
{
"epoch": 0.29605263157894735,
"grad_norm": 0.7225229740142822,
"learning_rate": 1.6503496503496507e-05,
"loss": 0.3261,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32752037048339844,
"step": 60
},
{
"epoch": 0.3207236842105263,
"grad_norm": 0.4612922668457031,
"learning_rate": 1.7902097902097903e-05,
"loss": 0.3038,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29034003615379333,
"step": 65
},
{
"epoch": 0.34539473684210525,
"grad_norm": 0.41292667388916016,
"learning_rate": 1.9300699300699302e-05,
"loss": 0.3026,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29919901490211487,
"step": 70
},
{
"epoch": 0.37006578947368424,
"grad_norm": 0.3574488162994385,
"learning_rate": 2.06993006993007e-05,
"loss": 0.2916,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2877832353115082,
"step": 75
},
{
"epoch": 0.39473684210526316,
"grad_norm": 0.4183422923088074,
"learning_rate": 2.2097902097902097e-05,
"loss": 0.2951,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2495700865983963,
"step": 80
},
{
"epoch": 0.4194078947368421,
"grad_norm": 0.27113303542137146,
"learning_rate": 2.3496503496503496e-05,
"loss": 0.2694,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2706011235713959,
"step": 85
},
{
"epoch": 0.4440789473684211,
"grad_norm": 0.32469940185546875,
"learning_rate": 2.48951048951049e-05,
"loss": 0.2666,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2613554000854492,
"step": 90
},
{
"epoch": 0.46875,
"grad_norm": 0.3434113562107086,
"learning_rate": 2.6293706293706294e-05,
"loss": 0.2516,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2635467052459717,
"step": 95
},
{
"epoch": 0.4934210526315789,
"grad_norm": 0.44404661655426025,
"learning_rate": 2.7692307692307694e-05,
"loss": 0.2534,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2347426861524582,
"step": 100
},
{
"epoch": 0.024671052631578948,
"grad_norm": 0.3727414608001709,
"learning_rate": 2.9090909090909093e-05,
"loss": 0.3065,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25368186831474304,
"step": 105
},
{
"epoch": 0.049342105263157895,
"grad_norm": 0.3068452477455139,
"learning_rate": 3.048951048951049e-05,
"loss": 0.2958,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2523610293865204,
"step": 110
},
{
"epoch": 0.07401315789473684,
"grad_norm": 0.29999276995658875,
"learning_rate": 3.188811188811189e-05,
"loss": 0.2856,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27986133098602295,
"step": 115
},
{
"epoch": 0.09868421052631579,
"grad_norm": 0.28988873958587646,
"learning_rate": 3.328671328671329e-05,
"loss": 0.2982,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30054572224617004,
"step": 120
},
{
"epoch": 0.12335526315789473,
"grad_norm": 0.2404654622077942,
"learning_rate": 3.468531468531469e-05,
"loss": 0.2746,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27615606784820557,
"step": 125
},
{
"epoch": 0.14802631578947367,
"grad_norm": 0.24244281649589539,
"learning_rate": 3.608391608391609e-05,
"loss": 0.2767,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25839051604270935,
"step": 130
},
{
"epoch": 0.17269736842105263,
"grad_norm": 0.25900793075561523,
"learning_rate": 3.748251748251749e-05,
"loss": 0.2763,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2887214124202728,
"step": 135
},
{
"epoch": 0.19736842105263158,
"grad_norm": 0.23659999668598175,
"learning_rate": 3.888111888111888e-05,
"loss": 0.2626,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23153053224086761,
"step": 140
},
{
"epoch": 0.22203947368421054,
"grad_norm": 0.22117386758327484,
"learning_rate": 3.999993957205587e-05,
"loss": 0.2494,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20298953354358673,
"step": 145
},
{
"epoch": 0.24671052631578946,
"grad_norm": 0.26892349123954773,
"learning_rate": 3.999782463235198e-05,
"loss": 0.2604,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28457778692245483,
"step": 150
},
{
"epoch": 0.2713815789473684,
"grad_norm": 0.32646119594573975,
"learning_rate": 3.999268866058499e-05,
"loss": 0.246,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2623623311519623,
"step": 155
},
{
"epoch": 0.29605263157894735,
"grad_norm": 0.32743313908576965,
"learning_rate": 3.9984532432636075e-05,
"loss": 0.2436,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25287550687789917,
"step": 160
},
{
"epoch": 0.3207236842105263,
"grad_norm": 0.2839692234992981,
"learning_rate": 3.997335718065055e-05,
"loss": 0.2348,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22406595945358276,
"step": 165
},
{
"epoch": 0.34539473684210525,
"grad_norm": 0.27507659792900085,
"learning_rate": 3.995916459285176e-05,
"loss": 0.2395,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24255843460559845,
"step": 170
},
{
"epoch": 0.37006578947368424,
"grad_norm": 0.27469953894615173,
"learning_rate": 3.994195681328607e-05,
"loss": 0.235,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23514795303344727,
"step": 175
},
{
"epoch": 0.39473684210526316,
"grad_norm": 0.30293864011764526,
"learning_rate": 3.99217364414989e-05,
"loss": 0.2436,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20414546132087708,
"step": 180
},
{
"epoch": 0.4194078947368421,
"grad_norm": 0.22480283677577972,
"learning_rate": 3.989850653214208e-05,
"loss": 0.2274,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23089763522148132,
"step": 185
},
{
"epoch": 0.4440789473684211,
"grad_norm": 0.2698158621788025,
"learning_rate": 3.987227059451237e-05,
"loss": 0.2289,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.224493145942688,
"step": 190
},
{
"epoch": 0.46875,
"grad_norm": 0.30755510926246643,
"learning_rate": 3.984303259202129e-05,
"loss": 0.2179,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2319842278957367,
"step": 195
},
{
"epoch": 0.4934210526315789,
"grad_norm": 0.31990331411361694,
"learning_rate": 3.9810796941596414e-05,
"loss": 0.2229,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20815354585647583,
"step": 200
},
{
"epoch": 0.024671052631578948,
"grad_norm": 0.2590547502040863,
"learning_rate": 3.97755685130141e-05,
"loss": 0.2447,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2057332992553711,
"step": 205
},
{
"epoch": 0.049342105263157895,
"grad_norm": 0.2779129147529602,
"learning_rate": 3.973735262816381e-05,
"loss": 0.2418,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2053648829460144,
"step": 210
},
{
"epoch": 0.07401315789473684,
"grad_norm": 0.2121449112892151,
"learning_rate": 3.9696155060244166e-05,
"loss": 0.238,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23251962661743164,
"step": 215
},
{
"epoch": 0.09868421052631579,
"grad_norm": 0.2546617090702057,
"learning_rate": 3.9651982032890774e-05,
"loss": 0.2522,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25815051794052124,
"step": 220
},
{
"epoch": 0.12335526315789473,
"grad_norm": 0.20591332018375397,
"learning_rate": 3.960484021923606e-05,
"loss": 0.2365,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23984409868717194,
"step": 225
},
{
"epoch": 0.14802631578947367,
"grad_norm": 0.22549138963222504,
"learning_rate": 3.9554736740901163e-05,
"loss": 0.2417,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22766338288784027,
"step": 230
},
{
"epoch": 0.17269736842105263,
"grad_norm": 0.22996200621128082,
"learning_rate": 3.950167916692008e-05,
"loss": 0.2442,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25690600275993347,
"step": 235
},
{
"epoch": 0.19736842105263158,
"grad_norm": 0.23695601522922516,
"learning_rate": 3.9445675512596224e-05,
"loss": 0.2347,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20780086517333984,
"step": 240
},
{
"epoch": 0.22203947368421054,
"grad_norm": 0.1984836608171463,
"learning_rate": 3.938673423829159e-05,
"loss": 0.2254,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18308115005493164,
"step": 245
},
{
"epoch": 0.24671052631578946,
"grad_norm": 0.23254786431789398,
"learning_rate": 3.932486424814865e-05,
"loss": 0.2369,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2599466145038605,
"step": 250
},
{
"epoch": 0.2713815789473684,
"grad_norm": 0.2990603446960449,
"learning_rate": 3.92600748887452e-05,
"loss": 0.2234,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23988397419452667,
"step": 255
},
{
"epoch": 0.29605263157894735,
"grad_norm": 0.3279288411140442,
"learning_rate": 3.9192375947682436e-05,
"loss": 0.2225,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23210851848125458,
"step": 260
},
{
"epoch": 0.3207236842105263,
"grad_norm": 0.28320780396461487,
"learning_rate": 3.9121777652106325e-05,
"loss": 0.2157,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20393934845924377,
"step": 265
},
{
"epoch": 0.34539473684210525,
"grad_norm": 0.2532626986503601,
"learning_rate": 3.904829066716263e-05,
"loss": 0.2206,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22491160035133362,
"step": 270
},
{
"epoch": 0.37006578947368424,
"grad_norm": 0.24180759489536285,
"learning_rate": 3.8971926094385725e-05,
"loss": 0.217,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21774451434612274,
"step": 275
},
{
"epoch": 0.39473684210526316,
"grad_norm": 0.28742486238479614,
"learning_rate": 3.889269547002153e-05,
"loss": 0.2258,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18878665566444397,
"step": 280
},
{
"epoch": 0.4194078947368421,
"grad_norm": 0.2189558744430542,
"learning_rate": 3.881061076328475e-05,
"loss": 0.2121,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2159784585237503,
"step": 285
},
{
"epoch": 0.4440789473684211,
"grad_norm": 0.24447618424892426,
"learning_rate": 3.872568437455071e-05,
"loss": 0.2141,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20933416485786438,
"step": 290
},
{
"epoch": 0.46875,
"grad_norm": 0.2725847661495209,
"learning_rate": 3.863792913348202e-05,
"loss": 0.2045,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21849879622459412,
"step": 295
},
{
"epoch": 0.4934210526315789,
"grad_norm": 0.2945365011692047,
"learning_rate": 3.854735829709049e-05,
"loss": 0.2099,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1962527334690094,
"step": 300
},
{
"epoch": 1.024671052631579,
"grad_norm": 0.2862054705619812,
"learning_rate": 3.8453985547734364e-05,
"loss": 0.2265,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19139519333839417,
"step": 305
},
{
"epoch": 1.049342105263158,
"grad_norm": 0.2504815459251404,
"learning_rate": 3.835782499105136e-05,
"loss": 0.2244,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19045579433441162,
"step": 310
},
{
"epoch": 1.0740131578947367,
"grad_norm": 0.22405609488487244,
"learning_rate": 3.825889115382777e-05,
"loss": 0.2215,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2161717265844345,
"step": 315
},
{
"epoch": 1.0986842105263157,
"grad_norm": 0.23503802716732025,
"learning_rate": 3.815719898180397e-05,
"loss": 0.2353,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24189823865890503,
"step": 320
},
{
"epoch": 1.1233552631578947,
"grad_norm": 0.19855502247810364,
"learning_rate": 3.8052763837416496e-05,
"loss": 0.221,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22518810629844666,
"step": 325
},
{
"epoch": 1.1480263157894737,
"grad_norm": 0.22038273513317108,
"learning_rate": 3.794560149747736e-05,
"loss": 0.2268,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21357837319374084,
"step": 330
},
{
"epoch": 1.1726973684210527,
"grad_norm": 0.2181052416563034,
"learning_rate": 3.7835728150790626e-05,
"loss": 0.2297,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24232840538024902,
"step": 335
},
{
"epoch": 1.1973684210526316,
"grad_norm": 0.21558205783367157,
"learning_rate": 3.7723160395706846e-05,
"loss": 0.2213,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19611139595508575,
"step": 340
},
{
"epoch": 1.2220394736842106,
"grad_norm": 0.19245308637619019,
"learning_rate": 3.760791523761553e-05,
"loss": 0.213,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17266516387462616,
"step": 345
},
{
"epoch": 1.2467105263157894,
"grad_norm": 0.23584093153476715,
"learning_rate": 3.749001008637621e-05,
"loss": 0.2247,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24593010544776917,
"step": 350
},
{
"epoch": 1.2713815789473684,
"grad_norm": 0.2926468253135681,
"learning_rate": 3.736946275368834e-05,
"loss": 0.2116,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22750172019004822,
"step": 355
},
{
"epoch": 1.2960526315789473,
"grad_norm": 0.26042640209198,
"learning_rate": 3.724629145040056e-05,
"loss": 0.2112,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22006699442863464,
"step": 360
},
{
"epoch": 1.3207236842105263,
"grad_norm": 0.26961809396743774,
"learning_rate": 3.7120514783759555e-05,
"loss": 0.206,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19352692365646362,
"step": 365
},
{
"epoch": 1.3453947368421053,
"grad_norm": 0.25605323910713196,
"learning_rate": 3.699215175459917e-05,
"loss": 0.2105,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21544437110424042,
"step": 370
},
{
"epoch": 1.3700657894736843,
"grad_norm": 0.28638923168182373,
"learning_rate": 3.686122175446992e-05,
"loss": 0.2072,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20685678720474243,
"step": 375
},
{
"epoch": 1.3947368421052633,
"grad_norm": 0.26736754179000854,
"learning_rate": 3.672774456270959e-05,
"loss": 0.215,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1788450926542282,
"step": 380
},
{
"epoch": 1.419407894736842,
"grad_norm": 0.19807708263397217,
"learning_rate": 3.659174034345522e-05,
"loss": 0.2027,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20726953446865082,
"step": 385
},
{
"epoch": 1.444078947368421,
"grad_norm": 0.24334271252155304,
"learning_rate": 3.645322964259689e-05,
"loss": 0.2047,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1996752917766571,
"step": 390
},
{
"epoch": 1.46875,
"grad_norm": 0.2998853623867035,
"learning_rate": 3.631223338467394e-05,
"loss": 0.1961,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20962709188461304,
"step": 395
},
{
"epoch": 1.493421052631579,
"grad_norm": 0.23867939412593842,
"learning_rate": 3.616877286971396e-05,
"loss": 0.2018,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18793398141860962,
"step": 400
},
{
"epoch": 1.024671052631579,
"grad_norm": 0.2809722423553467,
"learning_rate": 3.6022869770014964e-05,
"loss": 0.2166,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18363182246685028,
"step": 405
},
{
"epoch": 1.049342105263158,
"grad_norm": 0.22357912361621857,
"learning_rate": 3.587454612687148e-05,
"loss": 0.2144,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18171587586402893,
"step": 410
},
{
"epoch": 1.0740131578947367,
"grad_norm": 0.20701156556606293,
"learning_rate": 3.5723824347244745e-05,
"loss": 0.2119,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2063273936510086,
"step": 415
},
{
"epoch": 1.0986842105263157,
"grad_norm": 0.23529016971588135,
"learning_rate": 3.557072720037779e-05,
"loss": 0.225,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23247161507606506,
"step": 420
},
{
"epoch": 1.1233552631578947,
"grad_norm": 0.2022576779127121,
"learning_rate": 3.541527781435568e-05,
"loss": 0.2115,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2152596414089203,
"step": 425
},
{
"epoch": 1.1480263157894737,
"grad_norm": 0.20009814202785492,
"learning_rate": 3.525749967261164e-05,
"loss": 0.2173,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20451240241527557,
"step": 430
},
{
"epoch": 1.1726973684210527,
"grad_norm": 0.2168785035610199,
"learning_rate": 3.509741661037945e-05,
"loss": 0.2202,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2329305112361908,
"step": 435
},
{
"epoch": 1.1973684210526316,
"grad_norm": 0.22165916860103607,
"learning_rate": 3.493505281109269e-05,
"loss": 0.2125,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18825937807559967,
"step": 440
},
{
"epoch": 1.2220394736842106,
"grad_norm": 0.1891016662120819,
"learning_rate": 3.477043280273139e-05,
"loss": 0.2048,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16563668847084045,
"step": 445
},
{
"epoch": 1.2467105263157894,
"grad_norm": 0.21851275861263275,
"learning_rate": 3.460358145411669e-05,
"loss": 0.2163,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2363748550415039,
"step": 450
},
{
"epoch": 1.2713815789473684,
"grad_norm": 0.30250489711761475,
"learning_rate": 3.4434523971153876e-05,
"loss": 0.2038,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21931317448616028,
"step": 455
},
{
"epoch": 1.2960526315789473,
"grad_norm": 0.25916534662246704,
"learning_rate": 3.426328589302463e-05,
"loss": 0.2034,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21159030497074127,
"step": 460
},
{
"epoch": 1.3207236842105263,
"grad_norm": 0.28234443068504333,
"learning_rate": 3.408989308832887e-05,
"loss": 0.1997,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18619588017463684,
"step": 465
},
{
"epoch": 1.3453947368421053,
"grad_norm": 0.25823721289634705,
"learning_rate": 3.3914371751176806e-05,
"loss": 0.2034,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20875269174575806,
"step": 470
},
{
"epoch": 1.3700657894736843,
"grad_norm": 0.25747162103652954,
"learning_rate": 3.3736748397231865e-05,
"loss": 0.2001,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20162513852119446,
"step": 475
},
{
"epoch": 1.3947368421052633,
"grad_norm": 0.2906784415245056,
"learning_rate": 3.3557049859705026e-05,
"loss": 0.208,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1721152365207672,
"step": 480
},
{
"epoch": 1.419407894736842,
"grad_norm": 0.19228465855121613,
"learning_rate": 3.3375303285301175e-05,
"loss": 0.1964,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2005109190940857,
"step": 485
},
{
"epoch": 1.444078947368421,
"grad_norm": 0.22863629460334778,
"learning_rate": 3.31915361301181e-05,
"loss": 0.1981,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19252851605415344,
"step": 490
},
{
"epoch": 1.46875,
"grad_norm": 0.26857441663742065,
"learning_rate": 3.300577615549874e-05,
"loss": 0.1899,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2054862082004547,
"step": 495
},
{
"epoch": 1.493421052631579,
"grad_norm": 0.23823519051074982,
"learning_rate": 3.281805142383738e-05,
"loss": 0.1957,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1814895123243332,
"step": 500
},
{
"epoch": 2.0246710526315788,
"grad_norm": 0.29881370067596436,
"learning_rate": 3.262839029434026e-05,
"loss": 0.2102,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1785247027873993,
"step": 505
},
{
"epoch": 2.049342105263158,
"grad_norm": 0.24525128304958344,
"learning_rate": 3.243682141874147e-05,
"loss": 0.2074,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17585539817810059,
"step": 510
},
{
"epoch": 2.0740131578947367,
"grad_norm": 0.20698322355747223,
"learning_rate": 3.2243373736974524e-05,
"loss": 0.2051,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1997726708650589,
"step": 515
},
{
"epoch": 2.098684210526316,
"grad_norm": 0.22929632663726807,
"learning_rate": 3.204807647280049e-05,
"loss": 0.2176,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22522705793380737,
"step": 520
},
{
"epoch": 2.1233552631578947,
"grad_norm": 0.18781138956546783,
"learning_rate": 3.185095912939324e-05,
"loss": 0.2046,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20897895097732544,
"step": 525
},
{
"epoch": 2.1480263157894735,
"grad_norm": 0.2008383721113205,
"learning_rate": 3.165205148488242e-05,
"loss": 0.2105,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19787193834781647,
"step": 530
},
{
"epoch": 2.1726973684210527,
"grad_norm": 0.21053168177604675,
"learning_rate": 3.145138358785494e-05,
"loss": 0.2131,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2260877788066864,
"step": 535
},
{
"epoch": 2.1973684210526314,
"grad_norm": 0.2918646037578583,
"learning_rate": 3.124898575281562e-05,
"loss": 0.2057,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1822304129600525,
"step": 540
},
{
"epoch": 2.2220394736842106,
"grad_norm": 0.18496881425380707,
"learning_rate": 3.1044888555607594e-05,
"loss": 0.1985,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16007639467716217,
"step": 545
},
{
"epoch": 2.2467105263157894,
"grad_norm": 0.2168571650981903,
"learning_rate": 3.0839122828793314e-05,
"loss": 0.2098,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22958366572856903,
"step": 550
},
{
"epoch": 2.2713815789473686,
"grad_norm": 0.29614129662513733,
"learning_rate": 3.0631719656996707e-05,
"loss": 0.1979,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21339645981788635,
"step": 555
},
{
"epoch": 2.2960526315789473,
"grad_norm": 0.2810326814651489,
"learning_rate": 3.042271037220731e-05,
"loss": 0.1972,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20447814464569092,
"step": 560
},
{
"epoch": 2.3207236842105265,
"grad_norm": 0.2408124804496765,
"learning_rate": 3.0212126549046986e-05,
"loss": 0.1923,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17865577340126038,
"step": 565
},
{
"epoch": 2.3453947368421053,
"grad_norm": 0.2610262334346771,
"learning_rate": 3.0000000000000004e-05,
"loss": 0.1969,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20306739211082458,
"step": 570
},
{
"epoch": 2.370065789473684,
"grad_norm": 0.4123166501522064,
"learning_rate": 2.978636277060722e-05,
"loss": 0.1942,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2014990746974945,
"step": 575
},
{
"epoch": 2.3947368421052633,
"grad_norm": 0.277736634016037,
"learning_rate": 2.9571247134624985e-05,
"loss": 0.2033,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16646766662597656,
"step": 580
},
{
"epoch": 2.419407894736842,
"grad_norm": 0.2018052637577057,
"learning_rate": 2.9354685589149637e-05,
"loss": 0.1911,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19513532519340515,
"step": 585
},
{
"epoch": 2.4440789473684212,
"grad_norm": 0.24421605467796326,
"learning_rate": 2.9136710849708225e-05,
"loss": 0.1925,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18745818734169006,
"step": 590
},
{
"epoch": 2.46875,
"grad_norm": 0.26889941096305847,
"learning_rate": 2.8917355845316214e-05,
"loss": 0.1844,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20007860660552979,
"step": 595
},
{
"epoch": 2.4934210526315788,
"grad_norm": 0.22898580133914948,
"learning_rate": 2.869665371350299e-05,
"loss": 0.1907,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17558446526527405,
"step": 600
},
{
"epoch": 2.0246710526315788,
"grad_norm": 0.29629117250442505,
"learning_rate": 2.8474637795305842e-05,
"loss": 0.2053,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1745256781578064,
"step": 605
},
{
"epoch": 2.049342105263158,
"grad_norm": 0.22614240646362305,
"learning_rate": 2.825134163023318e-05,
"loss": 0.2023,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17126205563545227,
"step": 610
},
{
"epoch": 2.0740131578947367,
"grad_norm": 0.20102806389331818,
"learning_rate": 2.802679895119778e-05,
"loss": 0.2,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19442611932754517,
"step": 615
},
{
"epoch": 2.098684210526316,
"grad_norm": 0.2268202155828476,
"learning_rate": 2.7801043679420856e-05,
"loss": 0.2119,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21963992714881897,
"step": 620
},
{
"epoch": 2.1233552631578947,
"grad_norm": 0.1838330328464508,
"learning_rate": 2.75741099193076e-05,
"loss": 0.1992,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20316563546657562,
"step": 625
},
{
"epoch": 2.1480263157894735,
"grad_norm": 0.20691487193107605,
"learning_rate": 2.734603195329514e-05,
"loss": 0.205,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1925206482410431,
"step": 630
},
{
"epoch": 2.1726973684210527,
"grad_norm": 0.21233633160591125,
"learning_rate": 2.711684423667353e-05,
"loss": 0.2073,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22030431032180786,
"step": 635
},
{
"epoch": 2.1973684210526314,
"grad_norm": 0.2087518721818924,
"learning_rate": 2.688658139238067e-05,
"loss": 0.2004,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17837481200695038,
"step": 640
},
{
"epoch": 2.2220394736842106,
"grad_norm": 0.18232280015945435,
"learning_rate": 2.6655278205771877e-05,
"loss": 0.1934,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15586256980895996,
"step": 645
},
{
"epoch": 2.2467105263157894,
"grad_norm": 0.2109103947877884,
"learning_rate": 2.6422969619364965e-05,
"loss": 0.2045,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.223219633102417,
"step": 650
},
{
"epoch": 2.2713815789473686,
"grad_norm": 0.27833494544029236,
"learning_rate": 2.6189690727561478e-05,
"loss": 0.1929,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20833665132522583,
"step": 655
},
{
"epoch": 2.2960526315789473,
"grad_norm": 0.2569701075553894,
"learning_rate": 2.5955476771345116e-05,
"loss": 0.1928,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2008177787065506,
"step": 660
},
{
"epoch": 2.3207236842105265,
"grad_norm": 0.24047020077705383,
"learning_rate": 2.5720363132957915e-05,
"loss": 0.1874,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1724005788564682,
"step": 665
},
{
"epoch": 2.3453947368421053,
"grad_norm": 0.2805600166320801,
"learning_rate": 2.5484385330555138e-05,
"loss": 0.1915,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19802047312259674,
"step": 670
},
{
"epoch": 2.370065789473684,
"grad_norm": 0.3280099332332611,
"learning_rate": 2.5247579012839584e-05,
"loss": 0.1895,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19793689250946045,
"step": 675
},
{
"epoch": 2.3947368421052633,
"grad_norm": 0.27804508805274963,
"learning_rate": 2.500997995367626e-05,
"loss": 0.2007,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16160696744918823,
"step": 680
},
{
"epoch": 2.419407894736842,
"grad_norm": 0.2003459334373474,
"learning_rate": 2.4771624046688043e-05,
"loss": 0.1865,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19053032994270325,
"step": 685
},
{
"epoch": 2.4440789473684212,
"grad_norm": 0.2516365945339203,
"learning_rate": 2.4532547299833337e-05,
"loss": 0.1876,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1822153627872467,
"step": 690
},
{
"epoch": 2.46875,
"grad_norm": 0.2790171205997467,
"learning_rate": 2.4292785829966407e-05,
"loss": 0.1798,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19594642519950867,
"step": 695
},
{
"epoch": 2.4934210526315788,
"grad_norm": 0.2231920063495636,
"learning_rate": 2.405237585738126e-05,
"loss": 0.1859,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1701628416776657,
"step": 700
},
{
"epoch": 3.0246710526315788,
"grad_norm": 0.28956133127212524,
"learning_rate": 2.381135370033996e-05,
"loss": 0.2012,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1710219532251358,
"step": 705
},
{
"epoch": 3.049342105263158,
"grad_norm": 0.23850497603416443,
"learning_rate": 2.356975576958606e-05,
"loss": 0.1977,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16769038140773773,
"step": 710
},
{
"epoch": 3.0740131578947367,
"grad_norm": 0.19639335572719574,
"learning_rate": 2.3327618562844116e-05,
"loss": 0.1954,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18999743461608887,
"step": 715
},
{
"epoch": 3.098684210526316,
"grad_norm": 0.22467830777168274,
"learning_rate": 2.3084978659306048e-05,
"loss": 0.2069,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21505983173847198,
"step": 720
},
{
"epoch": 3.1233552631578947,
"grad_norm": 0.19431814551353455,
"learning_rate": 2.2841872714105196e-05,
"loss": 0.1944,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19865034520626068,
"step": 725
},
{
"epoch": 3.1480263157894735,
"grad_norm": 0.2027537077665329,
"learning_rate": 2.25983374527789e-05,
"loss": 0.2002,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18814553320407867,
"step": 730
},
{
"epoch": 3.1726973684210527,
"grad_norm": 0.21179619431495667,
"learning_rate": 2.2354409665720427e-05,
"loss": 0.2024,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21532279253005981,
"step": 735
},
{
"epoch": 3.1973684210526314,
"grad_norm": 0.20413529872894287,
"learning_rate": 2.2110126202621162e-05,
"loss": 0.1957,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17406898736953735,
"step": 740
},
{
"epoch": 3.2220394736842106,
"grad_norm": 0.19533619284629822,
"learning_rate": 2.1865523966903758e-05,
"loss": 0.1889,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15199615061283112,
"step": 745
},
{
"epoch": 3.2467105263157894,
"grad_norm": 0.212092787027359,
"learning_rate": 2.16206399101472e-05,
"loss": 0.1998,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21798661351203918,
"step": 750
},
{
"epoch": 3.2713815789473686,
"grad_norm": 0.2671966552734375,
"learning_rate": 2.1375511026504653e-05,
"loss": 0.1885,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20311211049556732,
"step": 755
},
{
"epoch": 3.2960526315789473,
"grad_norm": 0.24363179504871368,
"learning_rate": 2.113017434711479e-05,
"loss": 0.1875,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1942652463912964,
"step": 760
},
{
"epoch": 3.3207236842105265,
"grad_norm": 0.22911496460437775,
"learning_rate": 2.088466693450758e-05,
"loss": 0.1823,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1669602394104004,
"step": 765
},
{
"epoch": 3.3453947368421053,
"grad_norm": 0.24698033928871155,
"learning_rate": 2.0639025877005308e-05,
"loss": 0.1863,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19326898455619812,
"step": 770
},
{
"epoch": 3.370065789473684,
"grad_norm": 0.5782654881477356,
"learning_rate": 2.039328828311976e-05,
"loss": 0.1854,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20133379101753235,
"step": 775
},
{
"epoch": 3.3947368421052633,
"grad_norm": 0.29828956723213196,
"learning_rate": 2.014749127594625e-05,
"loss": 0.1983,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1581995040178299,
"step": 780
},
{
"epoch": 3.419407894736842,
"grad_norm": 0.20889180898666382,
"learning_rate": 1.9901671987555568e-05,
"loss": 0.1828,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18696996569633484,
"step": 785
},
{
"epoch": 3.4440789473684212,
"grad_norm": 0.24528741836547852,
"learning_rate": 1.9655867553384472e-05,
"loss": 0.1834,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17848661541938782,
"step": 790
},
{
"epoch": 3.46875,
"grad_norm": 0.24264982342720032,
"learning_rate": 1.9410115106625714e-05,
"loss": 0.1754,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18969368934631348,
"step": 795
},
{
"epoch": 3.4934210526315788,
"grad_norm": 0.2114223688840866,
"learning_rate": 1.9164451772618435e-05,
"loss": 0.1812,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16628679633140564,
"step": 800
},
{
"epoch": 3.0246710526315788,
"grad_norm": 0.308517187833786,
"learning_rate": 1.891891466323966e-05,
"loss": 0.1979,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16870887577533722,
"step": 805
},
{
"epoch": 3.049342105263158,
"grad_norm": 0.24148114025592804,
"learning_rate": 1.8673540871297927e-05,
"loss": 0.1942,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16438522934913635,
"step": 810
},
{
"epoch": 3.0740131578947367,
"grad_norm": 0.1998043656349182,
"learning_rate": 1.842836746492971e-05,
"loss": 0.1916,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18565362691879272,
"step": 815
},
{
"epoch": 3.098684210526316,
"grad_norm": 0.22963786125183105,
"learning_rate": 1.8183431481999658e-05,
"loss": 0.2026,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21126440167427063,
"step": 820
},
{
"epoch": 3.1233552631578947,
"grad_norm": 0.1848352700471878,
"learning_rate": 1.793876992450529e-05,
"loss": 0.1905,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1943422555923462,
"step": 825
},
{
"epoch": 3.1480263157894735,
"grad_norm": 0.20099779963493347,
"learning_rate": 1.769441975298726e-05,
"loss": 0.1958,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1838373839855194,
"step": 830
},
{
"epoch": 3.1726973684210527,
"grad_norm": 0.23807059228420258,
"learning_rate": 1.7450417880945705e-05,
"loss": 0.198,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21098263561725616,
"step": 835
},
{
"epoch": 3.1973684210526314,
"grad_norm": 0.2139764130115509,
"learning_rate": 1.720680116926388e-05,
"loss": 0.1914,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17009641230106354,
"step": 840
},
{
"epoch": 3.2220394736842106,
"grad_norm": 0.18776723742485046,
"learning_rate": 1.6963606420639602e-05,
"loss": 0.185,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14875054359436035,
"step": 845
},
{
"epoch": 3.2467105263157894,
"grad_norm": 0.21373049914836884,
"learning_rate": 1.6720870374025578e-05,
"loss": 0.1957,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.212870791554451,
"step": 850
},
{
"epoch": 3.2713815789473686,
"grad_norm": 0.2705714702606201,
"learning_rate": 1.6478629699079278e-05,
"loss": 0.1846,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19945034384727478,
"step": 855
},
{
"epoch": 3.2960526315789473,
"grad_norm": 0.25683969259262085,
"learning_rate": 1.6236920990623374e-05,
"loss": 0.1833,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19059477746486664,
"step": 860
},
{
"epoch": 3.3207236842105265,
"grad_norm": 0.2478659451007843,
"learning_rate": 1.5995780763117382e-05,
"loss": 0.1781,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16162365674972534,
"step": 865
},
{
"epoch": 3.3453947368421053,
"grad_norm": 0.24454790353775024,
"learning_rate": 1.5755245445141544e-05,
"loss": 0.182,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1889064908027649,
"step": 870
},
{
"epoch": 3.370065789473684,
"grad_norm": 0.32232385873794556,
"learning_rate": 1.5515351373893573e-05,
"loss": 0.1804,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19105729460716248,
"step": 875
},
{
"epoch": 3.3947368421052633,
"grad_norm": 0.2774730324745178,
"learning_rate": 1.5276134789699344e-05,
"loss": 0.1936,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15349537134170532,
"step": 880
},
{
"epoch": 3.419407894736842,
"grad_norm": 0.19822268187999725,
"learning_rate": 1.503763183053805e-05,
"loss": 0.1787,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18199126422405243,
"step": 885
},
{
"epoch": 3.4440789473684212,
"grad_norm": 0.2223716378211975,
"learning_rate": 1.4799878526582987e-05,
"loss": 0.1788,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17311102151870728,
"step": 890
},
{
"epoch": 3.46875,
"grad_norm": 0.2528238594532013,
"learning_rate": 1.4562910794758488e-05,
"loss": 0.171,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1867274045944214,
"step": 895
},
{
"epoch": 3.4934210526315788,
"grad_norm": 0.21180278062820435,
"learning_rate": 1.4326764433314066e-05,
"loss": 0.1771,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16158296167850494,
"step": 900
},
{
"epoch": 4.024671052631579,
"grad_norm": 0.2938304543495178,
"learning_rate": 1.4091475116416415e-05,
"loss": 0.195,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1667296290397644,
"step": 905
},
{
"epoch": 4.0493421052631575,
"grad_norm": 0.24434833228588104,
"learning_rate": 1.3857078388760203e-05,
"loss": 0.1909,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16151553392410278,
"step": 910
},
{
"epoch": 4.074013157894737,
"grad_norm": 0.19539859890937805,
"learning_rate": 1.3623609660198373e-05,
"loss": 0.188,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18197977542877197,
"step": 915
},
{
"epoch": 4.098684210526316,
"grad_norm": 0.23001670837402344,
"learning_rate": 1.3391104200392905e-05,
"loss": 0.1987,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2076358199119568,
"step": 920
},
{
"epoch": 4.123355263157895,
"grad_norm": 0.18925786018371582,
"learning_rate": 1.3159597133486628e-05,
"loss": 0.1865,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19061236083507538,
"step": 925
},
{
"epoch": 4.1480263157894735,
"grad_norm": 0.20448650419712067,
"learning_rate": 1.292912343279713e-05,
"loss": 0.1918,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18021315336227417,
"step": 930
},
{
"epoch": 4.172697368421052,
"grad_norm": 0.2233712524175644,
"learning_rate": 1.2699717915533402e-05,
"loss": 0.1942,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20732900500297546,
"step": 935
},
{
"epoch": 4.197368421052632,
"grad_norm": 0.20618166029453278,
"learning_rate": 1.2471415237536065e-05,
"loss": 0.1874,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16642165184020996,
"step": 940
},
{
"epoch": 4.222039473684211,
"grad_norm": 0.19250109791755676,
"learning_rate": 1.2244249888041955e-05,
"loss": 0.1813,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1461605280637741,
"step": 945
},
{
"epoch": 4.246710526315789,
"grad_norm": 0.20326243340969086,
"learning_rate": 1.2018256184473967e-05,
"loss": 0.1919,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20845885574817657,
"step": 950
},
{
"epoch": 4.271381578947368,
"grad_norm": 0.28202906250953674,
"learning_rate": 1.1793468267256709e-05,
"loss": 0.1804,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19517214596271515,
"step": 955
},
{
"epoch": 4.296052631578947,
"grad_norm": 0.23696176707744598,
"learning_rate": 1.156992009465904e-05,
"loss": 0.1788,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18528538942337036,
"step": 960
},
{
"epoch": 4.3207236842105265,
"grad_norm": 0.2820415794849396,
"learning_rate": 1.1347645437664032e-05,
"loss": 0.1738,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15721410512924194,
"step": 965
},
{
"epoch": 4.345394736842105,
"grad_norm": 0.2320161908864975,
"learning_rate": 1.1126677874867245e-05,
"loss": 0.1776,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1845950335264206,
"step": 970
},
{
"epoch": 4.370065789473684,
"grad_norm": 0.33977606892585754,
"learning_rate": 1.0907050787404105e-05,
"loss": 0.1757,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18589606881141663,
"step": 975
},
{
"epoch": 4.394736842105263,
"grad_norm": 0.2689703404903412,
"learning_rate": 1.0688797353907052e-05,
"loss": 0.1882,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14957153797149658,
"step": 980
},
{
"epoch": 4.4194078947368425,
"grad_norm": 0.2037592977285385,
"learning_rate": 1.0471950545493328e-05,
"loss": 0.1753,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17800632119178772,
"step": 985
},
{
"epoch": 4.444078947368421,
"grad_norm": 0.2365749329328537,
"learning_rate": 1.0256543120784074e-05,
"loss": 0.1746,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16894236207008362,
"step": 990
},
{
"epoch": 4.46875,
"grad_norm": 0.2567404806613922,
"learning_rate": 1.0042607620955592e-05,
"loss": 0.1669,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18090274930000305,
"step": 995
},
{
"epoch": 4.493421052631579,
"grad_norm": 0.2108602672815323,
"learning_rate": 9.830176364823349e-06,
"loss": 0.1729,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15751853585243225,
"step": 1000
},
{
"epoch": 4.024671052631579,
"grad_norm": 0.30531227588653564,
"learning_rate": 9.619281443959711e-06,
"loss": 0.1925,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16471046209335327,
"step": 1005
},
{
"epoch": 4.0493421052631575,
"grad_norm": 0.2457159012556076,
"learning_rate": 9.409954717845861e-06,
"loss": 0.188,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15918654203414917,
"step": 1010
},
{
"epoch": 4.074013157894737,
"grad_norm": 0.2185594141483307,
"learning_rate": 9.202227809058912e-06,
"loss": 0.1848,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17858435213565826,
"step": 1015
},
{
"epoch": 4.098684210526316,
"grad_norm": 0.31074318289756775,
"learning_rate": 8.996132098494688e-06,
"loss": 0.1951,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20442083477973938,
"step": 1020
},
{
"epoch": 4.123355263157895,
"grad_norm": 0.1880805939435959,
"learning_rate": 8.791698720627138e-06,
"loss": 0.183,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18682318925857544,
"step": 1025
},
{
"epoch": 4.1480263157894735,
"grad_norm": 0.1928797960281372,
"learning_rate": 8.58895855880484e-06,
"loss": 0.1881,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17656448483467102,
"step": 1030
},
{
"epoch": 4.172697368421052,
"grad_norm": 0.22544586658477783,
"learning_rate": 8.387942240585587e-06,
"loss": 0.1905,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20401433110237122,
"step": 1035
},
{
"epoch": 4.197368421052632,
"grad_norm": 0.20387957990169525,
"learning_rate": 8.188680133109485e-06,
"loss": 0.1838,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16293783485889435,
"step": 1040
},
{
"epoch": 4.222039473684211,
"grad_norm": 0.22418615221977234,
"learning_rate": 7.991202338511477e-06,
"loss": 0.1779,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14349329471588135,
"step": 1045
},
{
"epoch": 4.246710526315789,
"grad_norm": 0.21125830709934235,
"learning_rate": 7.795538689373859e-06,
"loss": 0.1881,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20435020327568054,
"step": 1050
},
{
"epoch": 4.271381578947368,
"grad_norm": 0.256409227848053,
"learning_rate": 7.601718744219555e-06,
"loss": 0.1768,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19160351157188416,
"step": 1055
},
{
"epoch": 4.296052631578947,
"grad_norm": 0.23899729549884796,
"learning_rate": 7.409771783046733e-06,
"loss": 0.1747,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18137173354625702,
"step": 1060
},
{
"epoch": 4.3207236842105265,
"grad_norm": 0.2511383295059204,
"learning_rate": 7.219726802905573e-06,
"loss": 0.17,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15333089232444763,
"step": 1065
},
{
"epoch": 4.345394736842105,
"grad_norm": 0.24497000873088837,
"learning_rate": 7.0316125135176935e-06,
"loss": 0.1735,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17997264862060547,
"step": 1070
},
{
"epoch": 4.370065789473684,
"grad_norm": 0.2674829661846161,
"learning_rate": 6.845457332939083e-06,
"loss": 0.1717,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18120409548282623,
"step": 1075
},
{
"epoch": 4.394736842105263,
"grad_norm": 0.253813773393631,
"learning_rate": 6.661289383266984e-06,
"loss": 0.1848,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14498010277748108,
"step": 1080
},
{
"epoch": 4.4194078947368425,
"grad_norm": 0.20171727240085602,
"learning_rate": 6.479136486391599e-06,
"loss": 0.1709,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1741921305656433,
"step": 1085
},
{
"epoch": 4.444078947368421,
"grad_norm": 0.265755832195282,
"learning_rate": 6.299026159793042e-06,
"loss": 0.1704,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16434627771377563,
"step": 1090
},
{
"epoch": 4.46875,
"grad_norm": 0.24092736840248108,
"learning_rate": 6.120985612384369e-06,
"loss": 0.1636,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17743858695030212,
"step": 1095
},
{
"epoch": 4.493421052631579,
"grad_norm": 0.22309978306293488,
"learning_rate": 5.945041740401147e-06,
"loss": 0.1691,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1535460352897644,
"step": 1100
},
{
"epoch": 5.024671052631579,
"grad_norm": 0.296979695558548,
"learning_rate": 5.7712211233383104e-06,
"loss": 0.1901,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16269783675670624,
"step": 1105
},
{
"epoch": 5.0493421052631575,
"grad_norm": 0.2464631199836731,
"learning_rate": 5.5995500199348565e-06,
"loss": 0.1852,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15707314014434814,
"step": 1110
},
{
"epoch": 5.074013157894737,
"grad_norm": 0.19684258103370667,
"learning_rate": 5.430054364206965e-06,
"loss": 0.182,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17553085088729858,
"step": 1115
},
{
"epoch": 5.098684210526316,
"grad_norm": 0.2454098016023636,
"learning_rate": 5.262759761530214e-06,
"loss": 0.1921,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20213395357131958,
"step": 1120
},
{
"epoch": 5.123355263157895,
"grad_norm": 0.19603319466114044,
"learning_rate": 5.097691484771434e-06,
"loss": 0.1797,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1835222691297531,
"step": 1125
},
{
"epoch": 5.1480263157894735,
"grad_norm": 0.19945049285888672,
"learning_rate": 4.934874470470756e-06,
"loss": 0.1847,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17295250296592712,
"step": 1130
},
{
"epoch": 5.172697368421052,
"grad_norm": 0.23703482747077942,
"learning_rate": 4.77433331507454e-06,
"loss": 0.187,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2007107436656952,
"step": 1135
},
{
"epoch": 5.197368421052632,
"grad_norm": 0.19850043952465057,
"learning_rate": 4.6160922712195875e-06,
"loss": 0.1803,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15952754020690918,
"step": 1140
},
{
"epoch": 5.222039473684211,
"grad_norm": 0.19503454864025116,
"learning_rate": 4.460175244069395e-06,
"loss": 0.1748,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14143893122673035,
"step": 1145
},
{
"epoch": 5.246710526315789,
"grad_norm": 0.21830520033836365,
"learning_rate": 4.306605787702802e-06,
"loss": 0.1846,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20023545622825623,
"step": 1150
},
{
"epoch": 5.271381578947368,
"grad_norm": 0.2543354332447052,
"learning_rate": 4.155407101555764e-06,
"loss": 0.1731,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18734650313854218,
"step": 1155
},
{
"epoch": 5.296052631578947,
"grad_norm": 0.2577175796031952,
"learning_rate": 4.006602026916617e-06,
"loss": 0.1708,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17666634917259216,
"step": 1160
},
{
"epoch": 5.3207236842105265,
"grad_norm": 0.2335437387228012,
"learning_rate": 3.860213043475531e-06,
"loss": 0.1663,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1490708291530609,
"step": 1165
},
{
"epoch": 5.345394736842105,
"grad_norm": 0.23638561367988586,
"learning_rate": 3.7162622659285185e-06,
"loss": 0.1694,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1765957921743393,
"step": 1170
},
{
"epoch": 5.370065789473684,
"grad_norm": 0.3142367899417877,
"learning_rate": 3.5747714406366154e-06,
"loss": 0.1677,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17734456062316895,
"step": 1175
},
{
"epoch": 5.394736842105263,
"grad_norm": 0.2611706852912903,
"learning_rate": 3.435761942340705e-06,
"loss": 0.1807,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14173588156700134,
"step": 1180
},
{
"epoch": 5.4194078947368425,
"grad_norm": 0.20892195403575897,
"learning_rate": 3.2992547709324964e-06,
"loss": 0.168,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17047154903411865,
"step": 1185
},
{
"epoch": 5.444078947368421,
"grad_norm": 0.24029605090618134,
"learning_rate": 3.1652705482820665e-06,
"loss": 0.167,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16115230321884155,
"step": 1190
},
{
"epoch": 5.46875,
"grad_norm": 0.2471027821302414,
"learning_rate": 3.033829515122608e-06,
"loss": 0.1598,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1730382740497589,
"step": 1195
},
{
"epoch": 5.493421052631579,
"grad_norm": 0.22252005338668823,
"learning_rate": 2.904951527992652e-06,
"loss": 0.1656,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1501745879650116,
"step": 1200
},
{
"epoch": 5.024671052631579,
"grad_norm": 0.2984672784805298,
"learning_rate": 2.7786560562364285e-06,
"loss": 0.1878,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16104725003242493,
"step": 1205
},
{
"epoch": 5.0493421052631575,
"grad_norm": 0.24322442710399628,
"learning_rate": 2.6549621790626166e-06,
"loss": 0.1825,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1545785665512085,
"step": 1210
},
{
"epoch": 5.074013157894737,
"grad_norm": 0.21504734456539154,
"learning_rate": 2.533888582662145e-06,
"loss": 0.1791,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17252680659294128,
"step": 1215
},
{
"epoch": 5.098684210526316,
"grad_norm": 0.2362941950559616,
"learning_rate": 2.41545355738525e-06,
"loss": 0.189,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19943520426750183,
"step": 1220
},
{
"epoch": 5.123355263157895,
"grad_norm": 0.1955908238887787,
"learning_rate": 2.299674994978436e-06,
"loss": 0.1765,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1803397238254547,
"step": 1225
},
{
"epoch": 5.1480263157894735,
"grad_norm": 0.19484035670757294,
"learning_rate": 2.1865703858815656e-06,
"loss": 0.1813,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1700981855392456,
"step": 1230
},
{
"epoch": 5.172697368421052,
"grad_norm": 0.2399033159017563,
"learning_rate": 2.076156816585639e-06,
"loss": 0.1836,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19748282432556152,
"step": 1235
},
{
"epoch": 5.197368421052632,
"grad_norm": 0.20060895383358002,
"learning_rate": 1.9684509670515585e-06,
"loss": 0.177,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1564170867204666,
"step": 1240
},
{
"epoch": 5.222039473684211,
"grad_norm": 0.2051486074924469,
"learning_rate": 1.86346910819033e-06,
"loss": 0.1718,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13873499631881714,
"step": 1245
},
{
"epoch": 5.246710526315789,
"grad_norm": 0.21883392333984375,
"learning_rate": 1.7612270994050362e-06,
"loss": 0.1812,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19669045507907867,
"step": 1250
},
{
"epoch": 5.271381578947368,
"grad_norm": 0.27726781368255615,
"learning_rate": 1.6617403861949898e-06,
"loss": 0.1702,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18430817127227783,
"step": 1255
},
{
"epoch": 5.296052631578947,
"grad_norm": 0.25286152958869934,
"learning_rate": 1.5650239978224346e-06,
"loss": 0.1672,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17335672676563263,
"step": 1260
},
{
"epoch": 5.3207236842105265,
"grad_norm": 0.24115750193595886,
"learning_rate": 1.4710925450420632e-06,
"loss": 0.1629,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14512015879154205,
"step": 1265
},
{
"epoch": 5.345394736842105,
"grad_norm": 0.36607709527015686,
"learning_rate": 1.379960217893841e-06,
"loss": 0.1659,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1726306825876236,
"step": 1270
},
{
"epoch": 5.370065789473684,
"grad_norm": 0.2740156054496765,
"learning_rate": 1.2916407835593093e-06,
"loss": 0.1641,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1730458289384842,
"step": 1275
},
{
"epoch": 5.394736842105263,
"grad_norm": 0.2575497627258301,
"learning_rate": 1.2061475842818337e-06,
"loss": 0.1772,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13783614337444305,
"step": 1280
},
{
"epoch": 5.4194078947368425,
"grad_norm": 0.19674454629421234,
"learning_rate": 1.1234935353509946e-06,
"loss": 0.1638,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16719526052474976,
"step": 1285
},
{
"epoch": 5.444078947368421,
"grad_norm": 0.25193727016448975,
"learning_rate": 1.0436911231515202e-06,
"loss": 0.1631,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15700435638427734,
"step": 1290
},
{
"epoch": 5.46875,
"grad_norm": 0.2387438416481018,
"learning_rate": 9.667524032769715e-07,
"loss": 0.1565,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16894632577896118,
"step": 1295
},
{
"epoch": 5.493421052631579,
"grad_norm": 0.21962200105190277,
"learning_rate": 8.926889987085441e-07,
"loss": 0.1619,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1462545394897461,
"step": 1300
},
{
"epoch": 6.024671052631579,
"grad_norm": 0.294238418340683,
"learning_rate": 8.215120980591984e-07,
"loss": 0.1856,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15981429815292358,
"step": 1305
},
{
"epoch": 6.0493421052631575,
"grad_norm": 0.2525791823863983,
"learning_rate": 7.532324538834279e-07,
"loss": 0.1802,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15260854363441467,
"step": 1310
},
{
"epoch": 6.074013157894737,
"grad_norm": 0.1967260092496872,
"learning_rate": 6.878603810528739e-07,
"loss": 0.1765,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1697266399860382,
"step": 1315
},
{
"epoch": 6.098684210526316,
"grad_norm": 0.24107760190963745,
"learning_rate": 6.25405755198103e-07,
"loss": 0.1861,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19656933844089508,
"step": 1320
},
{
"epoch": 6.123355263157895,
"grad_norm": 0.21563208103179932,
"learning_rate": 5.658780112166872e-07,
"loss": 0.1735,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17719779908657074,
"step": 1325
},
{
"epoch": 6.1480263157894735,
"grad_norm": 0.20149841904640198,
"learning_rate": 5.092861418479156e-07,
"loss": 0.1781,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16660000383853912,
"step": 1330
},
{
"epoch": 6.172697368421052,
"grad_norm": 0.24279265105724335,
"learning_rate": 4.556386963142645e-07,
"loss": 0.1805,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19446003437042236,
"step": 1335
},
{
"epoch": 6.197368421052632,
"grad_norm": 0.2047039270401001,
"learning_rate": 4.04943779029896e-07,
"loss": 0.1738,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1529483050107956,
"step": 1340
},
{
"epoch": 6.222039473684211,
"grad_norm": 0.20605064928531647,
"learning_rate": 3.5720904837632355e-07,
"loss": 0.1688,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13626405596733093,
"step": 1345
},
{
"epoch": 6.246710526315789,
"grad_norm": 0.20880380272865295,
"learning_rate": 3.124417155454884e-07,
"loss": 0.178,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19269785284996033,
"step": 1350
},
{
"epoch": 6.271381578947368,
"grad_norm": 0.2428048700094223,
"learning_rate": 2.7064854345037585e-07,
"loss": 0.167,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18093225359916687,
"step": 1355
},
{
"epoch": 6.296052631578947,
"grad_norm": 0.2846923768520355,
"learning_rate": 2.3183584570335205e-07,
"loss": 0.1636,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16989687085151672,
"step": 1360
},
{
"epoch": 6.3207236842105265,
"grad_norm": 0.2494208961725235,
"learning_rate": 1.9600948566238287e-07,
"loss": 0.1596,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1410118043422699,
"step": 1365
},
{
"epoch": 6.345394736842105,
"grad_norm": 0.25421878695487976,
"learning_rate": 1.631748755452667e-07,
"loss": 0.1629,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17008930444717407,
"step": 1370
},
{
"epoch": 6.370065789473684,
"grad_norm": 0.30556827783584595,
"learning_rate": 1.3333697561201732e-07,
"loss": 0.1603,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1695536971092224,
"step": 1375
},
{
"epoch": 6.394736842105263,
"grad_norm": 0.2617764472961426,
"learning_rate": 1.0650029341553902e-07,
"loss": 0.1734,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13440750539302826,
"step": 1380
},
{
"epoch": 6.4194078947368425,
"grad_norm": 0.22152091562747955,
"learning_rate": 8.266888312066013e-08,
"loss": 0.1609,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16376778483390808,
"step": 1385
},
{
"epoch": 6.444078947368421,
"grad_norm": 0.24168169498443604,
"learning_rate": 6.184634489169838e-08,
"loss": 0.1598,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15376853942871094,
"step": 1390
},
{
"epoch": 6.46875,
"grad_norm": 0.27041196823120117,
"learning_rate": 4.403582434857834e-08,
"loss": 0.1529,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1645251214504242,
"step": 1395
},
{
"epoch": 6.493421052631579,
"grad_norm": 0.23931093513965607,
"learning_rate": 2.924001209163363e-08,
"loss": 0.1583,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14250212907791138,
"step": 1400
},
{
"epoch": 6.5180921052631575,
"grad_norm": 0.2812100648880005,
"learning_rate": 1.7461143295141036e-08,
"loss": 0.1821,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16714885830879211,
"step": 1405
},
{
"epoch": 6.542763157894737,
"grad_norm": 0.21569399535655975,
"learning_rate": 8.700997369659459e-09,
"loss": 0.1983,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21874967217445374,
"step": 1410
},
{
"epoch": 6.567434210526316,
"grad_norm": 0.23459841310977936,
"learning_rate": 2.9608976932182788e-09,
"loss": 0.2006,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18229544162750244,
"step": 1415
},
{
"epoch": 6.592105263157895,
"grad_norm": 0.22248035669326782,
"learning_rate": 2.4171141139284204e-10,
"loss": 0.1941,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18813243508338928,
"step": 1420
},
{
"epoch": 6.597039473684211,
"step": 1421,
"total_flos": 7.249602429950362e+16,
"train_loss": 0.0,
"train_runtime": 1.0653,
"train_samples_per_second": 7990.221,
"train_steps_per_second": 1333.894
}
],
"logging_steps": 5,
"max_steps": 1421,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.249602429950362e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}