{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.999116607773852, "eval_steps": 500, "global_step": 7918, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00441696113074205, "grad_norm": 24.109052658081055, "learning_rate": 2.0176544766708703e-07, "loss": 0.8392, "loss_nan_ranks": 0, "loss_rank_avg": 0.7250818014144897, "step": 5 }, { "epoch": 0.0088339222614841, "grad_norm": 23.32025718688965, "learning_rate": 4.5397225725094586e-07, "loss": 0.8118, "loss_nan_ranks": 0, "loss_rank_avg": 0.8858845233917236, "step": 10 }, { "epoch": 0.013250883392226149, "grad_norm": 19.11461639404297, "learning_rate": 7.061790668348046e-07, "loss": 0.8166, "loss_nan_ranks": 0, "loss_rank_avg": 1.0077567100524902, "step": 15 }, { "epoch": 0.0176678445229682, "grad_norm": 18.66092300415039, "learning_rate": 9.583858764186634e-07, "loss": 0.7684, "loss_nan_ranks": 0, "loss_rank_avg": 0.7495265007019043, "step": 20 }, { "epoch": 0.022084805653710248, "grad_norm": 11.922789573669434, "learning_rate": 1.210592686002522e-06, "loss": 0.7393, "loss_nan_ranks": 0, "loss_rank_avg": 0.6014531850814819, "step": 25 }, { "epoch": 0.026501766784452298, "grad_norm": 9.920490264892578, "learning_rate": 1.4627994955863808e-06, "loss": 0.7164, "loss_nan_ranks": 0, "loss_rank_avg": 0.6873708367347717, "step": 30 }, { "epoch": 0.030918727915194347, "grad_norm": 6.4108171463012695, "learning_rate": 1.7150063051702399e-06, "loss": 0.6782, "loss_nan_ranks": 0, "loss_rank_avg": 0.628339946269989, "step": 35 }, { "epoch": 0.0353356890459364, "grad_norm": 4.962175369262695, "learning_rate": 1.9672131147540985e-06, "loss": 0.6596, "loss_nan_ranks": 0, "loss_rank_avg": 0.6136308908462524, "step": 40 }, { "epoch": 0.03975265017667844, "grad_norm": 2.925511121749878, "learning_rate": 2.2194199243379574e-06, "loss": 0.6625, "loss_nan_ranks": 0, "loss_rank_avg": 0.7212941646575928, "step": 45 }, { "epoch": 0.044169611307420496, "grad_norm": 1.815821886062622, "learning_rate": 2.4716267339218163e-06, "loss": 0.6164, "loss_nan_ranks": 0, "loss_rank_avg": 0.6185051202774048, "step": 50 }, { "epoch": 0.04858657243816254, "grad_norm": 1.6444969177246094, "learning_rate": 2.723833543505675e-06, "loss": 0.5692, "loss_nan_ranks": 0, "loss_rank_avg": 0.5030768513679504, "step": 55 }, { "epoch": 0.053003533568904596, "grad_norm": 1.4677263498306274, "learning_rate": 2.9760403530895336e-06, "loss": 0.5737, "loss_nan_ranks": 0, "loss_rank_avg": 0.6769396066665649, "step": 60 }, { "epoch": 0.05742049469964664, "grad_norm": 1.4194655418395996, "learning_rate": 3.2282471626733925e-06, "loss": 0.5621, "loss_nan_ranks": 0, "loss_rank_avg": 0.48633405566215515, "step": 65 }, { "epoch": 0.061837455830388695, "grad_norm": 1.1492063999176025, "learning_rate": 3.480453972257251e-06, "loss": 0.5322, "loss_nan_ranks": 0, "loss_rank_avg": 0.5070392489433289, "step": 70 }, { "epoch": 0.06625441696113074, "grad_norm": 1.0129790306091309, "learning_rate": 3.73266078184111e-06, "loss": 0.5497, "loss_nan_ranks": 0, "loss_rank_avg": 0.4552308917045593, "step": 75 }, { "epoch": 0.0706713780918728, "grad_norm": 0.9223603010177612, "learning_rate": 3.984867591424969e-06, "loss": 0.5376, "loss_nan_ranks": 0, "loss_rank_avg": 0.4579840302467346, "step": 80 }, { "epoch": 0.07508833922261485, "grad_norm": 1.050498366355896, "learning_rate": 4.237074401008828e-06, "loss": 0.5411, "loss_nan_ranks": 0, "loss_rank_avg": 0.5621368885040283, "step": 85 }, { "epoch": 0.07950530035335689, "grad_norm": 0.8152749538421631, "learning_rate": 4.4892812105926865e-06, "loss": 0.5246, "loss_nan_ranks": 0, "loss_rank_avg": 0.49742865562438965, "step": 90 }, { "epoch": 0.08392226148409894, "grad_norm": 0.9410054087638855, "learning_rate": 4.741488020176545e-06, "loss": 0.5438, "loss_nan_ranks": 0, "loss_rank_avg": 0.5632932186126709, "step": 95 }, { "epoch": 0.08833922261484099, "grad_norm": 0.7807851433753967, "learning_rate": 4.993694829760403e-06, "loss": 0.5607, "loss_nan_ranks": 0, "loss_rank_avg": 0.5178039073944092, "step": 100 }, { "epoch": 0.09275618374558305, "grad_norm": 0.9543706178665161, "learning_rate": 5.245901639344263e-06, "loss": 0.5857, "loss_nan_ranks": 0, "loss_rank_avg": 0.4983155429363251, "step": 105 }, { "epoch": 0.09717314487632508, "grad_norm": 1.1576862335205078, "learning_rate": 5.498108448928121e-06, "loss": 0.5065, "loss_nan_ranks": 0, "loss_rank_avg": 0.5359828472137451, "step": 110 }, { "epoch": 0.10159010600706714, "grad_norm": 0.8757576942443848, "learning_rate": 5.7503152585119805e-06, "loss": 0.5241, "loss_nan_ranks": 0, "loss_rank_avg": 0.4543844163417816, "step": 115 }, { "epoch": 0.10600706713780919, "grad_norm": 0.8769079446792603, "learning_rate": 6.00252206809584e-06, "loss": 0.4923, "loss_nan_ranks": 0, "loss_rank_avg": 0.4466102123260498, "step": 120 }, { "epoch": 0.11042402826855123, "grad_norm": 0.7672253251075745, "learning_rate": 6.254728877679697e-06, "loss": 0.4789, "loss_nan_ranks": 0, "loss_rank_avg": 0.5950910449028015, "step": 125 }, { "epoch": 0.11484098939929328, "grad_norm": 0.7976667284965515, "learning_rate": 6.506935687263557e-06, "loss": 0.4813, "loss_nan_ranks": 0, "loss_rank_avg": 0.46099185943603516, "step": 130 }, { "epoch": 0.11925795053003534, "grad_norm": 0.8457517623901367, "learning_rate": 6.759142496847415e-06, "loss": 0.5338, "loss_nan_ranks": 0, "loss_rank_avg": 0.4921356439590454, "step": 135 }, { "epoch": 0.12367491166077739, "grad_norm": 0.8651091456413269, "learning_rate": 7.0113493064312745e-06, "loss": 0.4892, "loss_nan_ranks": 0, "loss_rank_avg": 0.4262227714061737, "step": 140 }, { "epoch": 0.12809187279151943, "grad_norm": 0.7235690355300903, "learning_rate": 7.263556116015134e-06, "loss": 0.459, "loss_nan_ranks": 0, "loss_rank_avg": 0.46743807196617126, "step": 145 }, { "epoch": 0.13250883392226148, "grad_norm": 0.7583150267601013, "learning_rate": 7.515762925598991e-06, "loss": 0.4925, "loss_nan_ranks": 0, "loss_rank_avg": 0.4664328396320343, "step": 150 }, { "epoch": 0.13692579505300354, "grad_norm": 0.9911392331123352, "learning_rate": 7.76796973518285e-06, "loss": 0.4904, "loss_nan_ranks": 0, "loss_rank_avg": 0.5170290470123291, "step": 155 }, { "epoch": 0.1413427561837456, "grad_norm": 0.7634608745574951, "learning_rate": 8.020176544766708e-06, "loss": 0.484, "loss_nan_ranks": 0, "loss_rank_avg": 0.45101815462112427, "step": 160 }, { "epoch": 0.14575971731448764, "grad_norm": 0.8441954255104065, "learning_rate": 8.272383354350568e-06, "loss": 0.4711, "loss_nan_ranks": 0, "loss_rank_avg": 0.527004599571228, "step": 165 }, { "epoch": 0.1501766784452297, "grad_norm": 0.8853057026863098, "learning_rate": 8.524590163934427e-06, "loss": 0.4725, "loss_nan_ranks": 0, "loss_rank_avg": 0.40662050247192383, "step": 170 }, { "epoch": 0.15459363957597172, "grad_norm": 0.7503329515457153, "learning_rate": 8.776796973518286e-06, "loss": 0.4435, "loss_nan_ranks": 0, "loss_rank_avg": 0.45254433155059814, "step": 175 }, { "epoch": 0.15901060070671377, "grad_norm": 0.811824381351471, "learning_rate": 9.029003783102146e-06, "loss": 0.4582, "loss_nan_ranks": 0, "loss_rank_avg": 0.44835376739501953, "step": 180 }, { "epoch": 0.16342756183745583, "grad_norm": 0.8182066679000854, "learning_rate": 9.281210592686003e-06, "loss": 0.4924, "loss_nan_ranks": 0, "loss_rank_avg": 0.5891055464744568, "step": 185 }, { "epoch": 0.16784452296819788, "grad_norm": 0.7815266251564026, "learning_rate": 9.533417402269862e-06, "loss": 0.4778, "loss_nan_ranks": 0, "loss_rank_avg": 0.44031378626823425, "step": 190 }, { "epoch": 0.17226148409893993, "grad_norm": 0.8124738931655884, "learning_rate": 9.78562421185372e-06, "loss": 0.4345, "loss_nan_ranks": 0, "loss_rank_avg": 0.45023810863494873, "step": 195 }, { "epoch": 0.17667844522968199, "grad_norm": 0.8434866666793823, "learning_rate": 1.0037831021437581e-05, "loss": 0.4199, "loss_nan_ranks": 0, "loss_rank_avg": 0.44062817096710205, "step": 200 }, { "epoch": 0.18109540636042404, "grad_norm": 0.8283309936523438, "learning_rate": 1.0290037831021437e-05, "loss": 0.4342, "loss_nan_ranks": 0, "loss_rank_avg": 0.4788593649864197, "step": 205 }, { "epoch": 0.1855123674911661, "grad_norm": 0.6613907814025879, "learning_rate": 1.0542244640605296e-05, "loss": 0.4274, "loss_nan_ranks": 0, "loss_rank_avg": 0.4137096405029297, "step": 210 }, { "epoch": 0.18992932862190812, "grad_norm": 0.9123347401618958, "learning_rate": 1.0794451450189156e-05, "loss": 0.4314, "loss_nan_ranks": 0, "loss_rank_avg": 0.4231804609298706, "step": 215 }, { "epoch": 0.19434628975265017, "grad_norm": 0.8312901854515076, "learning_rate": 1.1046658259773015e-05, "loss": 0.4208, "loss_nan_ranks": 0, "loss_rank_avg": 0.4278219938278198, "step": 220 }, { "epoch": 0.19876325088339222, "grad_norm": 0.6793373823165894, "learning_rate": 1.1298865069356874e-05, "loss": 0.4253, "loss_nan_ranks": 0, "loss_rank_avg": 0.39295411109924316, "step": 225 }, { "epoch": 0.20318021201413428, "grad_norm": 0.8161110877990723, "learning_rate": 1.1551071878940732e-05, "loss": 0.4605, "loss_nan_ranks": 0, "loss_rank_avg": 0.4723338186740875, "step": 230 }, { "epoch": 0.20759717314487633, "grad_norm": 0.7220736145973206, "learning_rate": 1.1803278688524591e-05, "loss": 0.4244, "loss_nan_ranks": 0, "loss_rank_avg": 0.44514936208724976, "step": 235 }, { "epoch": 0.21201413427561838, "grad_norm": 0.780061662197113, "learning_rate": 1.205548549810845e-05, "loss": 0.472, "loss_nan_ranks": 0, "loss_rank_avg": 0.4039708971977234, "step": 240 }, { "epoch": 0.21643109540636044, "grad_norm": 0.6663079857826233, "learning_rate": 1.230769230769231e-05, "loss": 0.4287, "loss_nan_ranks": 0, "loss_rank_avg": 0.35989290475845337, "step": 245 }, { "epoch": 0.22084805653710246, "grad_norm": 0.8617585897445679, "learning_rate": 1.2559899117276166e-05, "loss": 0.4589, "loss_nan_ranks": 0, "loss_rank_avg": 0.5611141324043274, "step": 250 }, { "epoch": 0.2252650176678445, "grad_norm": 0.8636696934700012, "learning_rate": 1.2812105926860025e-05, "loss": 0.393, "loss_nan_ranks": 0, "loss_rank_avg": 0.4264186918735504, "step": 255 }, { "epoch": 0.22968197879858657, "grad_norm": 0.884749174118042, "learning_rate": 1.3064312736443884e-05, "loss": 0.441, "loss_nan_ranks": 0, "loss_rank_avg": 0.5420808792114258, "step": 260 }, { "epoch": 0.23409893992932862, "grad_norm": 0.7348568439483643, "learning_rate": 1.3316519546027744e-05, "loss": 0.357, "loss_nan_ranks": 0, "loss_rank_avg": 0.34996867179870605, "step": 265 }, { "epoch": 0.23851590106007067, "grad_norm": 0.6985581517219543, "learning_rate": 1.3568726355611603e-05, "loss": 0.4148, "loss_nan_ranks": 0, "loss_rank_avg": 0.44080445170402527, "step": 270 }, { "epoch": 0.24293286219081273, "grad_norm": 0.8112905025482178, "learning_rate": 1.382093316519546e-05, "loss": 0.387, "loss_nan_ranks": 0, "loss_rank_avg": 0.46948182582855225, "step": 275 }, { "epoch": 0.24734982332155478, "grad_norm": 0.7835463881492615, "learning_rate": 1.407313997477932e-05, "loss": 0.4286, "loss_nan_ranks": 0, "loss_rank_avg": 0.39757293462753296, "step": 280 }, { "epoch": 0.25176678445229683, "grad_norm": 0.8474117517471313, "learning_rate": 1.4325346784363179e-05, "loss": 0.4184, "loss_nan_ranks": 0, "loss_rank_avg": 0.386111855506897, "step": 285 }, { "epoch": 0.25618374558303886, "grad_norm": 0.7113578915596008, "learning_rate": 1.4577553593947038e-05, "loss": 0.3587, "loss_nan_ranks": 0, "loss_rank_avg": 0.32697296142578125, "step": 290 }, { "epoch": 0.26060070671378094, "grad_norm": 0.8683375716209412, "learning_rate": 1.4829760403530898e-05, "loss": 0.3728, "loss_nan_ranks": 0, "loss_rank_avg": 0.3836482763290405, "step": 295 }, { "epoch": 0.26501766784452296, "grad_norm": 0.732476532459259, "learning_rate": 1.5081967213114754e-05, "loss": 0.4082, "loss_nan_ranks": 0, "loss_rank_avg": 0.39730846881866455, "step": 300 }, { "epoch": 0.26943462897526504, "grad_norm": 0.8139944076538086, "learning_rate": 1.5334174022698615e-05, "loss": 0.4319, "loss_nan_ranks": 0, "loss_rank_avg": 0.44922178983688354, "step": 305 }, { "epoch": 0.27385159010600707, "grad_norm": 0.7223174571990967, "learning_rate": 1.5586380832282474e-05, "loss": 0.3937, "loss_nan_ranks": 0, "loss_rank_avg": 0.39554718136787415, "step": 310 }, { "epoch": 0.2782685512367491, "grad_norm": 0.7935890555381775, "learning_rate": 1.5838587641866333e-05, "loss": 0.3971, "loss_nan_ranks": 0, "loss_rank_avg": 0.38846322894096375, "step": 315 }, { "epoch": 0.2826855123674912, "grad_norm": 0.8385109305381775, "learning_rate": 1.6090794451450193e-05, "loss": 0.3842, "loss_nan_ranks": 0, "loss_rank_avg": 0.3368198275566101, "step": 320 }, { "epoch": 0.2871024734982332, "grad_norm": 0.7849225401878357, "learning_rate": 1.634300126103405e-05, "loss": 0.4017, "loss_nan_ranks": 0, "loss_rank_avg": 0.4689519703388214, "step": 325 }, { "epoch": 0.2915194346289753, "grad_norm": 0.9184194207191467, "learning_rate": 1.6595208070617908e-05, "loss": 0.4222, "loss_nan_ranks": 0, "loss_rank_avg": 0.451241135597229, "step": 330 }, { "epoch": 0.2959363957597173, "grad_norm": 0.7168762683868408, "learning_rate": 1.6847414880201767e-05, "loss": 0.3989, "loss_nan_ranks": 0, "loss_rank_avg": 0.3456907868385315, "step": 335 }, { "epoch": 0.3003533568904594, "grad_norm": 0.7282963395118713, "learning_rate": 1.7099621689785626e-05, "loss": 0.4091, "loss_nan_ranks": 0, "loss_rank_avg": 0.3524041771888733, "step": 340 }, { "epoch": 0.3047703180212014, "grad_norm": 0.6994873285293579, "learning_rate": 1.7351828499369486e-05, "loss": 0.4219, "loss_nan_ranks": 0, "loss_rank_avg": 0.3284667134284973, "step": 345 }, { "epoch": 0.30918727915194344, "grad_norm": 0.6103523969650269, "learning_rate": 1.760403530895334e-05, "loss": 0.4224, "loss_nan_ranks": 0, "loss_rank_avg": 0.3484126925468445, "step": 350 }, { "epoch": 0.3136042402826855, "grad_norm": 0.7844368815422058, "learning_rate": 1.78562421185372e-05, "loss": 0.4439, "loss_nan_ranks": 0, "loss_rank_avg": 0.47252100706100464, "step": 355 }, { "epoch": 0.31802120141342755, "grad_norm": 0.6682479381561279, "learning_rate": 1.810844892812106e-05, "loss": 0.3943, "loss_nan_ranks": 0, "loss_rank_avg": 0.4642447829246521, "step": 360 }, { "epoch": 0.3224381625441696, "grad_norm": 0.7882423996925354, "learning_rate": 1.836065573770492e-05, "loss": 0.3837, "loss_nan_ranks": 0, "loss_rank_avg": 0.4364447593688965, "step": 365 }, { "epoch": 0.32685512367491165, "grad_norm": 0.6892913579940796, "learning_rate": 1.861286254728878e-05, "loss": 0.403, "loss_nan_ranks": 0, "loss_rank_avg": 0.3730366826057434, "step": 370 }, { "epoch": 0.33127208480565373, "grad_norm": 0.890687882900238, "learning_rate": 1.8865069356872635e-05, "loss": 0.4094, "loss_nan_ranks": 0, "loss_rank_avg": 0.4724777638912201, "step": 375 }, { "epoch": 0.33568904593639576, "grad_norm": 0.7592324018478394, "learning_rate": 1.9117276166456494e-05, "loss": 0.4166, "loss_nan_ranks": 0, "loss_rank_avg": 0.38363948464393616, "step": 380 }, { "epoch": 0.3401060070671378, "grad_norm": 0.8239976763725281, "learning_rate": 1.9369482976040353e-05, "loss": 0.3977, "loss_nan_ranks": 0, "loss_rank_avg": 0.37950828671455383, "step": 385 }, { "epoch": 0.34452296819787986, "grad_norm": 1.0034205913543701, "learning_rate": 1.9621689785624213e-05, "loss": 0.3811, "loss_nan_ranks": 0, "loss_rank_avg": 0.31529074907302856, "step": 390 }, { "epoch": 0.3489399293286219, "grad_norm": 0.7281728982925415, "learning_rate": 1.9873896595208072e-05, "loss": 0.4184, "loss_nan_ranks": 0, "loss_rank_avg": 0.35957640409469604, "step": 395 }, { "epoch": 0.35335689045936397, "grad_norm": 0.8345057368278503, "learning_rate": 2.012610340479193e-05, "loss": 0.3603, "loss_nan_ranks": 0, "loss_rank_avg": 0.38588133454322815, "step": 400 }, { "epoch": 0.357773851590106, "grad_norm": 0.881252110004425, "learning_rate": 2.037831021437579e-05, "loss": 0.4153, "loss_nan_ranks": 0, "loss_rank_avg": 0.385145366191864, "step": 405 }, { "epoch": 0.3621908127208481, "grad_norm": 0.7450293302536011, "learning_rate": 2.063051702395965e-05, "loss": 0.43, "loss_nan_ranks": 0, "loss_rank_avg": 0.4830145835876465, "step": 410 }, { "epoch": 0.3666077738515901, "grad_norm": 0.7826328873634338, "learning_rate": 2.0882723833543506e-05, "loss": 0.3932, "loss_nan_ranks": 0, "loss_rank_avg": 0.34232282638549805, "step": 415 }, { "epoch": 0.3710247349823322, "grad_norm": 0.7054056525230408, "learning_rate": 2.113493064312737e-05, "loss": 0.3558, "loss_nan_ranks": 0, "loss_rank_avg": 0.2948620617389679, "step": 420 }, { "epoch": 0.3754416961130742, "grad_norm": 0.770078718662262, "learning_rate": 2.1387137452711224e-05, "loss": 0.382, "loss_nan_ranks": 0, "loss_rank_avg": 0.4264480471611023, "step": 425 }, { "epoch": 0.37985865724381623, "grad_norm": 0.7419948577880859, "learning_rate": 2.1639344262295087e-05, "loss": 0.3874, "loss_nan_ranks": 0, "loss_rank_avg": 0.4408531188964844, "step": 430 }, { "epoch": 0.3842756183745583, "grad_norm": 0.7609454989433289, "learning_rate": 2.1891551071878943e-05, "loss": 0.3913, "loss_nan_ranks": 0, "loss_rank_avg": 0.35307884216308594, "step": 435 }, { "epoch": 0.38869257950530034, "grad_norm": 1.1566354036331177, "learning_rate": 2.21437578814628e-05, "loss": 0.3921, "loss_nan_ranks": 0, "loss_rank_avg": 0.2975735664367676, "step": 440 }, { "epoch": 0.3931095406360424, "grad_norm": 0.8143091201782227, "learning_rate": 2.239596469104666e-05, "loss": 0.377, "loss_nan_ranks": 0, "loss_rank_avg": 0.3160606920719147, "step": 445 }, { "epoch": 0.39752650176678445, "grad_norm": 0.6456040143966675, "learning_rate": 2.2648171500630518e-05, "loss": 0.3703, "loss_nan_ranks": 0, "loss_rank_avg": 0.37065446376800537, "step": 450 }, { "epoch": 0.4019434628975265, "grad_norm": 0.6718341708183289, "learning_rate": 2.290037831021438e-05, "loss": 0.3584, "loss_nan_ranks": 0, "loss_rank_avg": 0.32244637608528137, "step": 455 }, { "epoch": 0.40636042402826855, "grad_norm": 0.862759530544281, "learning_rate": 2.3152585119798236e-05, "loss": 0.4145, "loss_nan_ranks": 0, "loss_rank_avg": 0.4153875708580017, "step": 460 }, { "epoch": 0.4107773851590106, "grad_norm": 0.7327967882156372, "learning_rate": 2.3404791929382092e-05, "loss": 0.3628, "loss_nan_ranks": 0, "loss_rank_avg": 0.3565199077129364, "step": 465 }, { "epoch": 0.41519434628975266, "grad_norm": 0.863936722278595, "learning_rate": 2.3656998738965955e-05, "loss": 0.405, "loss_nan_ranks": 0, "loss_rank_avg": 0.40698155760765076, "step": 470 }, { "epoch": 0.4196113074204947, "grad_norm": 0.8501296639442444, "learning_rate": 2.390920554854981e-05, "loss": 0.3567, "loss_nan_ranks": 0, "loss_rank_avg": 0.36806195974349976, "step": 475 }, { "epoch": 0.42402826855123676, "grad_norm": 1.3909848928451538, "learning_rate": 2.4161412358133673e-05, "loss": 0.4706, "loss_nan_ranks": 0, "loss_rank_avg": 0.4072020649909973, "step": 480 }, { "epoch": 0.4284452296819788, "grad_norm": 0.7283811569213867, "learning_rate": 2.441361916771753e-05, "loss": 0.3921, "loss_nan_ranks": 0, "loss_rank_avg": 0.37325534224510193, "step": 485 }, { "epoch": 0.43286219081272087, "grad_norm": 0.7579320669174194, "learning_rate": 2.466582597730139e-05, "loss": 0.3424, "loss_nan_ranks": 0, "loss_rank_avg": 0.33884212374687195, "step": 490 }, { "epoch": 0.4372791519434629, "grad_norm": 0.8202218413352966, "learning_rate": 2.4918032786885248e-05, "loss": 0.3737, "loss_nan_ranks": 0, "loss_rank_avg": 0.34762677550315857, "step": 495 }, { "epoch": 0.4416961130742049, "grad_norm": 0.7608699798583984, "learning_rate": 2.5170239596469107e-05, "loss": 0.3843, "loss_nan_ranks": 0, "loss_rank_avg": 0.306643545627594, "step": 500 }, { "epoch": 0.446113074204947, "grad_norm": 0.7544461488723755, "learning_rate": 2.5422446406052967e-05, "loss": 0.3745, "loss_nan_ranks": 0, "loss_rank_avg": 0.39631327986717224, "step": 505 }, { "epoch": 0.450530035335689, "grad_norm": 0.6755616664886475, "learning_rate": 2.5674653215636826e-05, "loss": 0.3803, "loss_nan_ranks": 0, "loss_rank_avg": 0.3355293869972229, "step": 510 }, { "epoch": 0.4549469964664311, "grad_norm": 0.6714569330215454, "learning_rate": 2.5926860025220682e-05, "loss": 0.349, "loss_nan_ranks": 0, "loss_rank_avg": 0.344030499458313, "step": 515 }, { "epoch": 0.45936395759717313, "grad_norm": 0.7619994878768921, "learning_rate": 2.6179066834804544e-05, "loss": 0.3474, "loss_nan_ranks": 0, "loss_rank_avg": 0.331425279378891, "step": 520 }, { "epoch": 0.4637809187279152, "grad_norm": 0.745580792427063, "learning_rate": 2.64312736443884e-05, "loss": 0.3563, "loss_nan_ranks": 0, "loss_rank_avg": 0.39198458194732666, "step": 525 }, { "epoch": 0.46819787985865724, "grad_norm": 0.823861837387085, "learning_rate": 2.668348045397226e-05, "loss": 0.3697, "loss_nan_ranks": 0, "loss_rank_avg": 0.3569245934486389, "step": 530 }, { "epoch": 0.4726148409893993, "grad_norm": 0.7193405628204346, "learning_rate": 2.693568726355612e-05, "loss": 0.3745, "loss_nan_ranks": 0, "loss_rank_avg": 0.4549163579940796, "step": 535 }, { "epoch": 0.47703180212014135, "grad_norm": 0.7041448354721069, "learning_rate": 2.7187894073139975e-05, "loss": 0.3817, "loss_nan_ranks": 0, "loss_rank_avg": 0.43728315830230713, "step": 540 }, { "epoch": 0.48144876325088337, "grad_norm": 0.8459624648094177, "learning_rate": 2.7440100882723838e-05, "loss": 0.3744, "loss_nan_ranks": 0, "loss_rank_avg": 0.3709990084171295, "step": 545 }, { "epoch": 0.48586572438162545, "grad_norm": 0.8668114542961121, "learning_rate": 2.7692307692307694e-05, "loss": 0.3612, "loss_nan_ranks": 0, "loss_rank_avg": 0.32689058780670166, "step": 550 }, { "epoch": 0.4902826855123675, "grad_norm": 0.668462872505188, "learning_rate": 2.7944514501891556e-05, "loss": 0.3848, "loss_nan_ranks": 0, "loss_rank_avg": 0.30886298418045044, "step": 555 }, { "epoch": 0.49469964664310956, "grad_norm": 0.7367919683456421, "learning_rate": 2.8196721311475412e-05, "loss": 0.3591, "loss_nan_ranks": 0, "loss_rank_avg": 0.3122621476650238, "step": 560 }, { "epoch": 0.4991166077738516, "grad_norm": 0.7553625106811523, "learning_rate": 2.8448928121059268e-05, "loss": 0.362, "loss_nan_ranks": 0, "loss_rank_avg": 0.3086245059967041, "step": 565 }, { "epoch": 0.5035335689045937, "grad_norm": 0.6816399097442627, "learning_rate": 2.870113493064313e-05, "loss": 0.3565, "loss_nan_ranks": 0, "loss_rank_avg": 0.3278921842575073, "step": 570 }, { "epoch": 0.5079505300353356, "grad_norm": 0.7400028109550476, "learning_rate": 2.8953341740226987e-05, "loss": 0.3903, "loss_nan_ranks": 0, "loss_rank_avg": 0.33623063564300537, "step": 575 }, { "epoch": 0.5123674911660777, "grad_norm": 0.6778237819671631, "learning_rate": 2.920554854981085e-05, "loss": 0.3745, "loss_nan_ranks": 0, "loss_rank_avg": 0.3270686864852905, "step": 580 }, { "epoch": 0.5167844522968198, "grad_norm": 0.7293447256088257, "learning_rate": 2.9457755359394705e-05, "loss": 0.427, "loss_nan_ranks": 0, "loss_rank_avg": 0.47877180576324463, "step": 585 }, { "epoch": 0.5212014134275619, "grad_norm": 0.7676773071289062, "learning_rate": 2.9709962168978565e-05, "loss": 0.4404, "loss_nan_ranks": 0, "loss_rank_avg": 0.4877261817455292, "step": 590 }, { "epoch": 0.5256183745583038, "grad_norm": 0.6538991332054138, "learning_rate": 2.9962168978562424e-05, "loss": 0.3604, "loss_nan_ranks": 0, "loss_rank_avg": 0.3789966106414795, "step": 595 }, { "epoch": 0.5300353356890459, "grad_norm": 0.7705276012420654, "learning_rate": 3.0214375788146283e-05, "loss": 0.3947, "loss_nan_ranks": 0, "loss_rank_avg": 0.33382245898246765, "step": 600 }, { "epoch": 0.534452296819788, "grad_norm": 0.7710214853286743, "learning_rate": 3.0466582597730143e-05, "loss": 0.3543, "loss_nan_ranks": 0, "loss_rank_avg": 0.38197773694992065, "step": 605 }, { "epoch": 0.5388692579505301, "grad_norm": 0.67430579662323, "learning_rate": 3.0718789407314e-05, "loss": 0.4353, "loss_nan_ranks": 0, "loss_rank_avg": 0.4609474241733551, "step": 610 }, { "epoch": 0.5432862190812721, "grad_norm": 0.7167083621025085, "learning_rate": 3.097099621689786e-05, "loss": 0.3326, "loss_nan_ranks": 0, "loss_rank_avg": 0.2994512915611267, "step": 615 }, { "epoch": 0.5477031802120141, "grad_norm": 0.9038445949554443, "learning_rate": 3.122320302648172e-05, "loss": 0.3872, "loss_nan_ranks": 0, "loss_rank_avg": 0.36574289202690125, "step": 620 }, { "epoch": 0.5521201413427562, "grad_norm": 0.7607389092445374, "learning_rate": 3.1475409836065576e-05, "loss": 0.384, "loss_nan_ranks": 0, "loss_rank_avg": 0.4068770706653595, "step": 625 }, { "epoch": 0.5565371024734982, "grad_norm": 0.8022470474243164, "learning_rate": 3.1727616645649436e-05, "loss": 0.3579, "loss_nan_ranks": 0, "loss_rank_avg": 0.34602364897727966, "step": 630 }, { "epoch": 0.5609540636042403, "grad_norm": 0.7999392747879028, "learning_rate": 3.1979823455233295e-05, "loss": 0.3784, "loss_nan_ranks": 0, "loss_rank_avg": 0.4875880479812622, "step": 635 }, { "epoch": 0.5653710247349824, "grad_norm": 0.7946346402168274, "learning_rate": 3.2232030264817154e-05, "loss": 0.3777, "loss_nan_ranks": 0, "loss_rank_avg": 0.4051263630390167, "step": 640 }, { "epoch": 0.5697879858657244, "grad_norm": 0.6947000622749329, "learning_rate": 3.2484237074401014e-05, "loss": 0.3565, "loss_nan_ranks": 0, "loss_rank_avg": 0.4333820044994354, "step": 645 }, { "epoch": 0.5742049469964664, "grad_norm": 0.7858723402023315, "learning_rate": 3.273644388398487e-05, "loss": 0.3576, "loss_nan_ranks": 0, "loss_rank_avg": 0.33583977818489075, "step": 650 }, { "epoch": 0.5786219081272085, "grad_norm": 0.817613959312439, "learning_rate": 3.298865069356873e-05, "loss": 0.3129, "loss_nan_ranks": 0, "loss_rank_avg": 0.27572858333587646, "step": 655 }, { "epoch": 0.5830388692579506, "grad_norm": 0.7588901519775391, "learning_rate": 3.324085750315259e-05, "loss": 0.367, "loss_nan_ranks": 0, "loss_rank_avg": 0.29186704754829407, "step": 660 }, { "epoch": 0.5874558303886925, "grad_norm": 0.7261371612548828, "learning_rate": 3.3493064312736444e-05, "loss": 0.3133, "loss_nan_ranks": 0, "loss_rank_avg": 0.3395135998725891, "step": 665 }, { "epoch": 0.5918727915194346, "grad_norm": 0.6673470735549927, "learning_rate": 3.37452711223203e-05, "loss": 0.404, "loss_nan_ranks": 0, "loss_rank_avg": 0.41102349758148193, "step": 670 }, { "epoch": 0.5962897526501767, "grad_norm": 0.6788419485092163, "learning_rate": 3.399747793190416e-05, "loss": 0.3931, "loss_nan_ranks": 0, "loss_rank_avg": 0.3825211226940155, "step": 675 }, { "epoch": 0.6007067137809188, "grad_norm": 0.861670970916748, "learning_rate": 3.424968474148802e-05, "loss": 0.3875, "loss_nan_ranks": 0, "loss_rank_avg": 0.3033771514892578, "step": 680 }, { "epoch": 0.6051236749116607, "grad_norm": 0.6976490616798401, "learning_rate": 3.450189155107188e-05, "loss": 0.3206, "loss_nan_ranks": 0, "loss_rank_avg": 0.3370228409767151, "step": 685 }, { "epoch": 0.6095406360424028, "grad_norm": 0.6630620956420898, "learning_rate": 3.475409836065574e-05, "loss": 0.3294, "loss_nan_ranks": 0, "loss_rank_avg": 0.34097903966903687, "step": 690 }, { "epoch": 0.6139575971731449, "grad_norm": 0.6962843537330627, "learning_rate": 3.50063051702396e-05, "loss": 0.3731, "loss_nan_ranks": 0, "loss_rank_avg": 0.40441685914993286, "step": 695 }, { "epoch": 0.6183745583038869, "grad_norm": 0.6727367639541626, "learning_rate": 3.525851197982346e-05, "loss": 0.3226, "loss_nan_ranks": 0, "loss_rank_avg": 0.32211965322494507, "step": 700 }, { "epoch": 0.622791519434629, "grad_norm": 0.7818762063980103, "learning_rate": 3.551071878940732e-05, "loss": 0.3474, "loss_nan_ranks": 0, "loss_rank_avg": 0.4583301544189453, "step": 705 }, { "epoch": 0.627208480565371, "grad_norm": 0.7723045349121094, "learning_rate": 3.576292559899118e-05, "loss": 0.3719, "loss_nan_ranks": 0, "loss_rank_avg": 0.28167077898979187, "step": 710 }, { "epoch": 0.6316254416961131, "grad_norm": 0.6072138547897339, "learning_rate": 3.601513240857503e-05, "loss": 0.3479, "loss_nan_ranks": 0, "loss_rank_avg": 0.2791953682899475, "step": 715 }, { "epoch": 0.6360424028268551, "grad_norm": 0.8396653532981873, "learning_rate": 3.6267339218158896e-05, "loss": 0.4271, "loss_nan_ranks": 0, "loss_rank_avg": 0.5739034414291382, "step": 720 }, { "epoch": 0.6404593639575972, "grad_norm": 0.7916384935379028, "learning_rate": 3.651954602774275e-05, "loss": 0.4105, "loss_nan_ranks": 0, "loss_rank_avg": 0.42675089836120605, "step": 725 }, { "epoch": 0.6448763250883393, "grad_norm": 0.7917523980140686, "learning_rate": 3.677175283732661e-05, "loss": 0.3365, "loss_nan_ranks": 0, "loss_rank_avg": 0.30214524269104004, "step": 730 }, { "epoch": 0.6492932862190812, "grad_norm": 0.6911900639533997, "learning_rate": 3.702395964691047e-05, "loss": 0.3602, "loss_nan_ranks": 0, "loss_rank_avg": 0.3707190155982971, "step": 735 }, { "epoch": 0.6537102473498233, "grad_norm": 0.7061692476272583, "learning_rate": 3.727616645649433e-05, "loss": 0.3479, "loss_nan_ranks": 0, "loss_rank_avg": 0.28617987036705017, "step": 740 }, { "epoch": 0.6581272084805654, "grad_norm": 0.6829811334609985, "learning_rate": 3.7528373266078186e-05, "loss": 0.4159, "loss_nan_ranks": 0, "loss_rank_avg": 0.4385426640510559, "step": 745 }, { "epoch": 0.6625441696113075, "grad_norm": 0.688046395778656, "learning_rate": 3.7780580075662045e-05, "loss": 0.3727, "loss_nan_ranks": 0, "loss_rank_avg": 0.3491179943084717, "step": 750 }, { "epoch": 0.6669611307420494, "grad_norm": 0.6785129308700562, "learning_rate": 3.8032786885245905e-05, "loss": 0.3583, "loss_nan_ranks": 0, "loss_rank_avg": 0.369488000869751, "step": 755 }, { "epoch": 0.6713780918727915, "grad_norm": 0.7109005451202393, "learning_rate": 3.8284993694829764e-05, "loss": 0.3344, "loss_nan_ranks": 0, "loss_rank_avg": 0.354184627532959, "step": 760 }, { "epoch": 0.6757950530035336, "grad_norm": 0.7314112782478333, "learning_rate": 3.853720050441362e-05, "loss": 0.3582, "loss_nan_ranks": 0, "loss_rank_avg": 0.37824511528015137, "step": 765 }, { "epoch": 0.6802120141342756, "grad_norm": 1.165858268737793, "learning_rate": 3.878940731399748e-05, "loss": 0.384, "loss_nan_ranks": 0, "loss_rank_avg": 0.32820677757263184, "step": 770 }, { "epoch": 0.6846289752650176, "grad_norm": 0.8004192113876343, "learning_rate": 3.904161412358134e-05, "loss": 0.3607, "loss_nan_ranks": 0, "loss_rank_avg": 0.43373507261276245, "step": 775 }, { "epoch": 0.6890459363957597, "grad_norm": 0.6773238182067871, "learning_rate": 3.9293820933165195e-05, "loss": 0.3786, "loss_nan_ranks": 0, "loss_rank_avg": 0.3990272581577301, "step": 780 }, { "epoch": 0.6934628975265018, "grad_norm": 0.676603376865387, "learning_rate": 3.954602774274906e-05, "loss": 0.3336, "loss_nan_ranks": 0, "loss_rank_avg": 0.34566766023635864, "step": 785 }, { "epoch": 0.6978798586572438, "grad_norm": 0.7312802672386169, "learning_rate": 3.979823455233291e-05, "loss": 0.3405, "loss_nan_ranks": 0, "loss_rank_avg": 0.37781059741973877, "step": 790 }, { "epoch": 0.7022968197879859, "grad_norm": 0.7477230429649353, "learning_rate": 3.99999980591192e-05, "loss": 0.345, "loss_nan_ranks": 0, "loss_rank_avg": 0.29296875, "step": 795 }, { "epoch": 0.7067137809187279, "grad_norm": 0.6933770179748535, "learning_rate": 3.99999301283305e-05, "loss": 0.4408, "loss_nan_ranks": 0, "loss_rank_avg": 0.4305647313594818, "step": 800 }, { "epoch": 0.7111307420494699, "grad_norm": 0.6644602417945862, "learning_rate": 3.999976515387813e-05, "loss": 0.3571, "loss_nan_ranks": 0, "loss_rank_avg": 0.2757279574871063, "step": 805 }, { "epoch": 0.715547703180212, "grad_norm": 0.6703394651412964, "learning_rate": 3.9999503136562586e-05, "loss": 0.3417, "loss_nan_ranks": 0, "loss_rank_avg": 0.3425188660621643, "step": 810 }, { "epoch": 0.7199646643109541, "grad_norm": 0.6245801448822021, "learning_rate": 3.999914407765523e-05, "loss": 0.3524, "loss_nan_ranks": 0, "loss_rank_avg": 0.29982197284698486, "step": 815 }, { "epoch": 0.7243816254416962, "grad_norm": 0.701495885848999, "learning_rate": 3.999868797889828e-05, "loss": 0.3204, "loss_nan_ranks": 0, "loss_rank_avg": 0.3113703429698944, "step": 820 }, { "epoch": 0.7287985865724381, "grad_norm": 0.8265374302864075, "learning_rate": 3.999813484250483e-05, "loss": 0.3488, "loss_nan_ranks": 0, "loss_rank_avg": 0.3571431338787079, "step": 825 }, { "epoch": 0.7332155477031802, "grad_norm": 0.8132041096687317, "learning_rate": 3.99974846711588e-05, "loss": 0.3718, "loss_nan_ranks": 0, "loss_rank_avg": 0.30748432874679565, "step": 830 }, { "epoch": 0.7376325088339223, "grad_norm": 0.6265267133712769, "learning_rate": 3.9996737468014954e-05, "loss": 0.3123, "loss_nan_ranks": 0, "loss_rank_avg": 0.3108974099159241, "step": 835 }, { "epoch": 0.7420494699646644, "grad_norm": 0.7385701537132263, "learning_rate": 3.999589323669887e-05, "loss": 0.359, "loss_nan_ranks": 0, "loss_rank_avg": 0.40513014793395996, "step": 840 }, { "epoch": 0.7464664310954063, "grad_norm": 0.6594541668891907, "learning_rate": 3.9994951981306926e-05, "loss": 0.3511, "loss_nan_ranks": 0, "loss_rank_avg": 0.2959279716014862, "step": 845 }, { "epoch": 0.7508833922261484, "grad_norm": 0.7326868176460266, "learning_rate": 3.9993913706406287e-05, "loss": 0.349, "loss_nan_ranks": 0, "loss_rank_avg": 0.31566479802131653, "step": 850 }, { "epoch": 0.7553003533568905, "grad_norm": 0.798692524433136, "learning_rate": 3.999277841703486e-05, "loss": 0.347, "loss_nan_ranks": 0, "loss_rank_avg": 0.31945452094078064, "step": 855 }, { "epoch": 0.7597173144876325, "grad_norm": 0.6340591907501221, "learning_rate": 3.999154611870131e-05, "loss": 0.3524, "loss_nan_ranks": 0, "loss_rank_avg": 0.3852103352546692, "step": 860 }, { "epoch": 0.7641342756183745, "grad_norm": 0.7896412014961243, "learning_rate": 3.999021681738499e-05, "loss": 0.3417, "loss_nan_ranks": 0, "loss_rank_avg": 0.32461148500442505, "step": 865 }, { "epoch": 0.7685512367491166, "grad_norm": 0.6427087187767029, "learning_rate": 3.998879051953593e-05, "loss": 0.3073, "loss_nan_ranks": 0, "loss_rank_avg": 0.28621137142181396, "step": 870 }, { "epoch": 0.7729681978798587, "grad_norm": 0.6806996464729309, "learning_rate": 3.9987267232074816e-05, "loss": 0.3812, "loss_nan_ranks": 0, "loss_rank_avg": 0.3365304470062256, "step": 875 }, { "epoch": 0.7773851590106007, "grad_norm": 0.6693117618560791, "learning_rate": 3.998564696239295e-05, "loss": 0.3718, "loss_nan_ranks": 0, "loss_rank_avg": 0.3166996240615845, "step": 880 }, { "epoch": 0.7818021201413428, "grad_norm": 0.719115674495697, "learning_rate": 3.99839297183522e-05, "loss": 0.3356, "loss_nan_ranks": 0, "loss_rank_avg": 0.33166027069091797, "step": 885 }, { "epoch": 0.7862190812720848, "grad_norm": 0.6326349973678589, "learning_rate": 3.998211550828497e-05, "loss": 0.3528, "loss_nan_ranks": 0, "loss_rank_avg": 0.3603453040122986, "step": 890 }, { "epoch": 0.7906360424028268, "grad_norm": 0.8190131187438965, "learning_rate": 3.998020434099418e-05, "loss": 0.3497, "loss_nan_ranks": 0, "loss_rank_avg": 0.38982582092285156, "step": 895 }, { "epoch": 0.7950530035335689, "grad_norm": 0.6838703751564026, "learning_rate": 3.997819622575319e-05, "loss": 0.3586, "loss_nan_ranks": 0, "loss_rank_avg": 0.3148457705974579, "step": 900 }, { "epoch": 0.799469964664311, "grad_norm": 0.6027899384498596, "learning_rate": 3.9976091172305794e-05, "loss": 0.3576, "loss_nan_ranks": 0, "loss_rank_avg": 0.3718492388725281, "step": 905 }, { "epoch": 0.803886925795053, "grad_norm": 1.1394686698913574, "learning_rate": 3.9973889190866105e-05, "loss": 0.3383, "loss_nan_ranks": 0, "loss_rank_avg": 0.32566916942596436, "step": 910 }, { "epoch": 0.808303886925795, "grad_norm": 0.6600670218467712, "learning_rate": 3.99715902921186e-05, "loss": 0.355, "loss_nan_ranks": 0, "loss_rank_avg": 0.32827770709991455, "step": 915 }, { "epoch": 0.8127208480565371, "grad_norm": 0.8769943714141846, "learning_rate": 3.9969194487217987e-05, "loss": 0.3669, "loss_nan_ranks": 0, "loss_rank_avg": 0.3776477575302124, "step": 920 }, { "epoch": 0.8171378091872792, "grad_norm": 0.6823641657829285, "learning_rate": 3.9966701787789194e-05, "loss": 0.3431, "loss_nan_ranks": 0, "loss_rank_avg": 0.31834328174591064, "step": 925 }, { "epoch": 0.8215547703180212, "grad_norm": 0.7511164546012878, "learning_rate": 3.996411220592729e-05, "loss": 0.3553, "loss_nan_ranks": 0, "loss_rank_avg": 0.3393649458885193, "step": 930 }, { "epoch": 0.8259717314487632, "grad_norm": 0.6989418268203735, "learning_rate": 3.996142575419745e-05, "loss": 0.3087, "loss_nan_ranks": 0, "loss_rank_avg": 0.36577892303466797, "step": 935 }, { "epoch": 0.8303886925795053, "grad_norm": 0.6358893513679504, "learning_rate": 3.995864244563487e-05, "loss": 0.3472, "loss_nan_ranks": 0, "loss_rank_avg": 0.29227280616760254, "step": 940 }, { "epoch": 0.8348056537102474, "grad_norm": 0.6637855768203735, "learning_rate": 3.9955762293744735e-05, "loss": 0.3563, "loss_nan_ranks": 0, "loss_rank_avg": 0.3354935348033905, "step": 945 }, { "epoch": 0.8392226148409894, "grad_norm": 1.028828740119934, "learning_rate": 3.9952785312502107e-05, "loss": 0.3675, "loss_nan_ranks": 0, "loss_rank_avg": 0.3541829586029053, "step": 950 }, { "epoch": 0.8436395759717314, "grad_norm": 0.660925030708313, "learning_rate": 3.99497115163519e-05, "loss": 0.4159, "loss_nan_ranks": 0, "loss_rank_avg": 0.3134467601776123, "step": 955 }, { "epoch": 0.8480565371024735, "grad_norm": 0.6419395208358765, "learning_rate": 3.994654092020877e-05, "loss": 0.3492, "loss_nan_ranks": 0, "loss_rank_avg": 0.36793047189712524, "step": 960 }, { "epoch": 0.8524734982332155, "grad_norm": 0.6670768857002258, "learning_rate": 3.994327353945712e-05, "loss": 0.3413, "loss_nan_ranks": 0, "loss_rank_avg": 0.35276514291763306, "step": 965 }, { "epoch": 0.8568904593639576, "grad_norm": 0.7899559736251831, "learning_rate": 3.9939909389950894e-05, "loss": 0.3682, "loss_nan_ranks": 0, "loss_rank_avg": 0.4707202911376953, "step": 970 }, { "epoch": 0.8613074204946997, "grad_norm": 0.8200883865356445, "learning_rate": 3.9936448488013646e-05, "loss": 0.3363, "loss_nan_ranks": 0, "loss_rank_avg": 0.33861371874809265, "step": 975 }, { "epoch": 0.8657243816254417, "grad_norm": 0.7544311285018921, "learning_rate": 3.9932890850438356e-05, "loss": 0.3754, "loss_nan_ranks": 0, "loss_rank_avg": 0.4074310064315796, "step": 980 }, { "epoch": 0.8701413427561837, "grad_norm": 0.8232197165489197, "learning_rate": 3.9929236494487395e-05, "loss": 0.359, "loss_nan_ranks": 0, "loss_rank_avg": 0.39264535903930664, "step": 985 }, { "epoch": 0.8745583038869258, "grad_norm": 0.6976638436317444, "learning_rate": 3.9925485437892434e-05, "loss": 0.3726, "loss_nan_ranks": 0, "loss_rank_avg": 0.34916025400161743, "step": 990 }, { "epoch": 0.8789752650176679, "grad_norm": 0.7832766771316528, "learning_rate": 3.992163769885435e-05, "loss": 0.3198, "loss_nan_ranks": 0, "loss_rank_avg": 0.33082062005996704, "step": 995 }, { "epoch": 0.8833922261484098, "grad_norm": 0.6496185064315796, "learning_rate": 3.9917693296043124e-05, "loss": 0.3586, "loss_nan_ranks": 0, "loss_rank_avg": 0.3823724091053009, "step": 1000 }, { "epoch": 0.8878091872791519, "grad_norm": 1.394060492515564, "learning_rate": 3.9913652248597806e-05, "loss": 0.3653, "loss_nan_ranks": 0, "loss_rank_avg": 0.3977188169956207, "step": 1005 }, { "epoch": 0.892226148409894, "grad_norm": 2.0863211154937744, "learning_rate": 3.990951457612637e-05, "loss": 0.3364, "loss_nan_ranks": 0, "loss_rank_avg": 0.3450695276260376, "step": 1010 }, { "epoch": 0.8966431095406361, "grad_norm": 0.9185066223144531, "learning_rate": 3.9905280298705624e-05, "loss": 0.3569, "loss_nan_ranks": 0, "loss_rank_avg": 0.3249596357345581, "step": 1015 }, { "epoch": 0.901060070671378, "grad_norm": 0.9138262271881104, "learning_rate": 3.9900949436881126e-05, "loss": 0.3507, "loss_nan_ranks": 0, "loss_rank_avg": 0.3235431909561157, "step": 1020 }, { "epoch": 0.9054770318021201, "grad_norm": 0.663921058177948, "learning_rate": 3.989652201166709e-05, "loss": 0.3224, "loss_nan_ranks": 0, "loss_rank_avg": 0.30882376432418823, "step": 1025 }, { "epoch": 0.9098939929328622, "grad_norm": 1.4138214588165283, "learning_rate": 3.989199804454627e-05, "loss": 0.3297, "loss_nan_ranks": 0, "loss_rank_avg": 0.41032299399375916, "step": 1030 }, { "epoch": 0.9143109540636042, "grad_norm": 0.7750231027603149, "learning_rate": 3.988737755746986e-05, "loss": 0.3366, "loss_nan_ranks": 0, "loss_rank_avg": 0.35097378492355347, "step": 1035 }, { "epoch": 0.9187279151943463, "grad_norm": 0.691072940826416, "learning_rate": 3.9882660572857375e-05, "loss": 0.3495, "loss_nan_ranks": 0, "loss_rank_avg": 0.318118155002594, "step": 1040 }, { "epoch": 0.9231448763250883, "grad_norm": 1.0037578344345093, "learning_rate": 3.987784711359658e-05, "loss": 0.3272, "loss_nan_ranks": 0, "loss_rank_avg": 0.2796628773212433, "step": 1045 }, { "epoch": 0.9275618374558304, "grad_norm": 0.667972981929779, "learning_rate": 3.987293720304335e-05, "loss": 0.3611, "loss_nan_ranks": 0, "loss_rank_avg": 0.41761964559555054, "step": 1050 }, { "epoch": 0.9319787985865724, "grad_norm": 0.6739106178283691, "learning_rate": 3.9867930865021535e-05, "loss": 0.3379, "loss_nan_ranks": 0, "loss_rank_avg": 0.28799429535865784, "step": 1055 }, { "epoch": 0.9363957597173145, "grad_norm": 0.8230948448181152, "learning_rate": 3.9862828123822905e-05, "loss": 0.3756, "loss_nan_ranks": 0, "loss_rank_avg": 0.3900887966156006, "step": 1060 }, { "epoch": 0.9408127208480566, "grad_norm": 0.6671487092971802, "learning_rate": 3.985762900420698e-05, "loss": 0.3687, "loss_nan_ranks": 0, "loss_rank_avg": 0.36852848529815674, "step": 1065 }, { "epoch": 0.9452296819787986, "grad_norm": 0.6791719198226929, "learning_rate": 3.985233353140092e-05, "loss": 0.2972, "loss_nan_ranks": 0, "loss_rank_avg": 0.28970006108283997, "step": 1070 }, { "epoch": 0.9496466431095406, "grad_norm": 0.6565694212913513, "learning_rate": 3.984694173109942e-05, "loss": 0.3508, "loss_nan_ranks": 0, "loss_rank_avg": 0.3562009036540985, "step": 1075 }, { "epoch": 0.9540636042402827, "grad_norm": 0.6499453186988831, "learning_rate": 3.984145362946458e-05, "loss": 0.361, "loss_nan_ranks": 0, "loss_rank_avg": 0.4719471335411072, "step": 1080 }, { "epoch": 0.9584805653710248, "grad_norm": 0.6347289085388184, "learning_rate": 3.983586925312576e-05, "loss": 0.3525, "loss_nan_ranks": 0, "loss_rank_avg": 0.30270689725875854, "step": 1085 }, { "epoch": 0.9628975265017667, "grad_norm": 0.7031768560409546, "learning_rate": 3.983018862917948e-05, "loss": 0.3245, "loss_nan_ranks": 0, "loss_rank_avg": 0.28910696506500244, "step": 1090 }, { "epoch": 0.9673144876325088, "grad_norm": 0.6593021750450134, "learning_rate": 3.9824411785189264e-05, "loss": 0.3461, "loss_nan_ranks": 0, "loss_rank_avg": 0.2953903079032898, "step": 1095 }, { "epoch": 0.9717314487632509, "grad_norm": 0.7052675485610962, "learning_rate": 3.9818538749185506e-05, "loss": 0.3357, "loss_nan_ranks": 0, "loss_rank_avg": 0.38916873931884766, "step": 1100 }, { "epoch": 0.976148409893993, "grad_norm": 0.781073808670044, "learning_rate": 3.981256954966536e-05, "loss": 0.3559, "loss_nan_ranks": 0, "loss_rank_avg": 0.42383044958114624, "step": 1105 }, { "epoch": 0.980565371024735, "grad_norm": 0.8780611157417297, "learning_rate": 3.9806504215592575e-05, "loss": 0.345, "loss_nan_ranks": 0, "loss_rank_avg": 0.3717604875564575, "step": 1110 }, { "epoch": 0.984982332155477, "grad_norm": 0.6577640771865845, "learning_rate": 3.980034277639737e-05, "loss": 0.3427, "loss_nan_ranks": 0, "loss_rank_avg": 0.3323565423488617, "step": 1115 }, { "epoch": 0.9893992932862191, "grad_norm": 0.6743144392967224, "learning_rate": 3.979408526197628e-05, "loss": 0.3845, "loss_nan_ranks": 0, "loss_rank_avg": 0.37875691056251526, "step": 1120 }, { "epoch": 0.9938162544169611, "grad_norm": 0.6501055359840393, "learning_rate": 3.9787731702692004e-05, "loss": 0.3406, "loss_nan_ranks": 0, "loss_rank_avg": 0.3506585955619812, "step": 1125 }, { "epoch": 0.9982332155477032, "grad_norm": 0.7022161483764648, "learning_rate": 3.9781282129373294e-05, "loss": 0.3353, "loss_nan_ranks": 0, "loss_rank_avg": 0.41503027081489563, "step": 1130 }, { "epoch": 1.0026501766784452, "grad_norm": 0.6163008213043213, "learning_rate": 3.9774736573314774e-05, "loss": 0.3603, "loss_nan_ranks": 0, "loss_rank_avg": 0.41754186153411865, "step": 1135 }, { "epoch": 1.0070671378091873, "grad_norm": 0.7156196236610413, "learning_rate": 3.9768095066276794e-05, "loss": 0.3576, "loss_nan_ranks": 0, "loss_rank_avg": 0.275761216878891, "step": 1140 }, { "epoch": 1.0114840989399294, "grad_norm": 0.6655539274215698, "learning_rate": 3.9761357640485255e-05, "loss": 0.3423, "loss_nan_ranks": 0, "loss_rank_avg": 0.33661192655563354, "step": 1145 }, { "epoch": 1.0159010600706713, "grad_norm": 0.673190712928772, "learning_rate": 3.975452432863152e-05, "loss": 0.317, "loss_nan_ranks": 0, "loss_rank_avg": 0.3377433717250824, "step": 1150 }, { "epoch": 1.0203180212014133, "grad_norm": 0.6982813477516174, "learning_rate": 3.974759516387216e-05, "loss": 0.3091, "loss_nan_ranks": 0, "loss_rank_avg": 0.2920030951499939, "step": 1155 }, { "epoch": 1.0247349823321554, "grad_norm": 0.6755205988883972, "learning_rate": 3.9740570179828905e-05, "loss": 0.3357, "loss_nan_ranks": 0, "loss_rank_avg": 0.28413355350494385, "step": 1160 }, { "epoch": 1.0291519434628975, "grad_norm": 0.7137518525123596, "learning_rate": 3.9733449410588354e-05, "loss": 0.3105, "loss_nan_ranks": 0, "loss_rank_avg": 0.2865508198738098, "step": 1165 }, { "epoch": 1.0335689045936396, "grad_norm": 0.6695932149887085, "learning_rate": 3.972623289070191e-05, "loss": 0.329, "loss_nan_ranks": 0, "loss_rank_avg": 0.3236057758331299, "step": 1170 }, { "epoch": 1.0379858657243817, "grad_norm": 0.8682721257209778, "learning_rate": 3.971892065518557e-05, "loss": 0.2882, "loss_nan_ranks": 0, "loss_rank_avg": 0.2601502239704132, "step": 1175 }, { "epoch": 1.0424028268551238, "grad_norm": 0.6531527638435364, "learning_rate": 3.971151273951979e-05, "loss": 0.2812, "loss_nan_ranks": 0, "loss_rank_avg": 0.30173492431640625, "step": 1180 }, { "epoch": 1.0468197879858656, "grad_norm": 0.752144992351532, "learning_rate": 3.970400917964922e-05, "loss": 0.354, "loss_nan_ranks": 0, "loss_rank_avg": 0.2773906886577606, "step": 1185 }, { "epoch": 1.0512367491166077, "grad_norm": 0.6246238350868225, "learning_rate": 3.969641001198266e-05, "loss": 0.32, "loss_nan_ranks": 0, "loss_rank_avg": 0.2532247304916382, "step": 1190 }, { "epoch": 1.0556537102473498, "grad_norm": 0.7625958919525146, "learning_rate": 3.9688715273392785e-05, "loss": 0.294, "loss_nan_ranks": 0, "loss_rank_avg": 0.28142765164375305, "step": 1195 }, { "epoch": 1.0600706713780919, "grad_norm": 0.6404998302459717, "learning_rate": 3.9680925001216e-05, "loss": 0.3253, "loss_nan_ranks": 0, "loss_rank_avg": 0.30740100145339966, "step": 1200 }, { "epoch": 1.064487632508834, "grad_norm": 0.7395120859146118, "learning_rate": 3.967303923325228e-05, "loss": 0.3327, "loss_nan_ranks": 0, "loss_rank_avg": 0.34588027000427246, "step": 1205 }, { "epoch": 1.068904593639576, "grad_norm": 0.6187570691108704, "learning_rate": 3.966505800776493e-05, "loss": 0.3793, "loss_nan_ranks": 0, "loss_rank_avg": 0.33034655451774597, "step": 1210 }, { "epoch": 1.073321554770318, "grad_norm": 0.6744672656059265, "learning_rate": 3.965698136348048e-05, "loss": 0.3273, "loss_nan_ranks": 0, "loss_rank_avg": 0.41641291975975037, "step": 1215 }, { "epoch": 1.0777385159010602, "grad_norm": 0.6118738055229187, "learning_rate": 3.96488093395884e-05, "loss": 0.3072, "loss_nan_ranks": 0, "loss_rank_avg": 0.306530237197876, "step": 1220 }, { "epoch": 1.082155477031802, "grad_norm": 0.5981642603874207, "learning_rate": 3.964054197574099e-05, "loss": 0.3266, "loss_nan_ranks": 0, "loss_rank_avg": 0.34041914343833923, "step": 1225 }, { "epoch": 1.0865724381625441, "grad_norm": 0.649811863899231, "learning_rate": 3.963217931205317e-05, "loss": 0.3013, "loss_nan_ranks": 0, "loss_rank_avg": 0.25741928815841675, "step": 1230 }, { "epoch": 1.0909893992932862, "grad_norm": 0.6270660161972046, "learning_rate": 3.962372138910223e-05, "loss": 0.32, "loss_nan_ranks": 0, "loss_rank_avg": 0.3462643325328827, "step": 1235 }, { "epoch": 1.0954063604240283, "grad_norm": 0.75999915599823, "learning_rate": 3.9615168247927735e-05, "loss": 0.3129, "loss_nan_ranks": 0, "loss_rank_avg": 0.31946852803230286, "step": 1240 }, { "epoch": 1.0998233215547704, "grad_norm": 0.7573233246803284, "learning_rate": 3.9606519930031225e-05, "loss": 0.3373, "loss_nan_ranks": 0, "loss_rank_avg": 0.3408510386943817, "step": 1245 }, { "epoch": 1.1042402826855124, "grad_norm": 0.6537328362464905, "learning_rate": 3.959777647737606e-05, "loss": 0.3615, "loss_nan_ranks": 0, "loss_rank_avg": 0.34509575366973877, "step": 1250 }, { "epoch": 1.1086572438162543, "grad_norm": 0.6237077713012695, "learning_rate": 3.958893793238723e-05, "loss": 0.3505, "loss_nan_ranks": 0, "loss_rank_avg": 0.365169882774353, "step": 1255 }, { "epoch": 1.1130742049469964, "grad_norm": 0.6630376577377319, "learning_rate": 3.958000433795113e-05, "loss": 0.3865, "loss_nan_ranks": 0, "loss_rank_avg": 0.35122057795524597, "step": 1260 }, { "epoch": 1.1174911660777385, "grad_norm": 0.6241370439529419, "learning_rate": 3.957097573741534e-05, "loss": 0.3463, "loss_nan_ranks": 0, "loss_rank_avg": 0.30081939697265625, "step": 1265 }, { "epoch": 1.1219081272084805, "grad_norm": 0.7064021229743958, "learning_rate": 3.956185217458843e-05, "loss": 0.3429, "loss_nan_ranks": 0, "loss_rank_avg": 0.2906290888786316, "step": 1270 }, { "epoch": 1.1263250883392226, "grad_norm": 0.7379579544067383, "learning_rate": 3.955263369373977e-05, "loss": 0.306, "loss_nan_ranks": 0, "loss_rank_avg": 0.2594120502471924, "step": 1275 }, { "epoch": 1.1307420494699647, "grad_norm": 0.6167639493942261, "learning_rate": 3.9543320339599266e-05, "loss": 0.3344, "loss_nan_ranks": 0, "loss_rank_avg": 0.27128392457962036, "step": 1280 }, { "epoch": 1.1351590106007068, "grad_norm": 0.6793010234832764, "learning_rate": 3.953391215735718e-05, "loss": 0.3495, "loss_nan_ranks": 0, "loss_rank_avg": 0.3618358373641968, "step": 1285 }, { "epoch": 1.1395759717314489, "grad_norm": 0.8995165824890137, "learning_rate": 3.952440919266389e-05, "loss": 0.3221, "loss_nan_ranks": 0, "loss_rank_avg": 0.43430811166763306, "step": 1290 }, { "epoch": 1.1439929328621907, "grad_norm": 0.746021568775177, "learning_rate": 3.951481149162968e-05, "loss": 0.3149, "loss_nan_ranks": 0, "loss_rank_avg": 0.27596035599708557, "step": 1295 }, { "epoch": 1.1484098939929328, "grad_norm": 0.6076446175575256, "learning_rate": 3.950511910082452e-05, "loss": 0.3011, "loss_nan_ranks": 0, "loss_rank_avg": 0.2746957242488861, "step": 1300 }, { "epoch": 1.1528268551236749, "grad_norm": 0.692255973815918, "learning_rate": 3.949533206727784e-05, "loss": 0.3092, "loss_nan_ranks": 0, "loss_rank_avg": 0.3611806333065033, "step": 1305 }, { "epoch": 1.157243816254417, "grad_norm": 0.7213220000267029, "learning_rate": 3.948545043847826e-05, "loss": 0.3042, "loss_nan_ranks": 0, "loss_rank_avg": 0.3065589368343353, "step": 1310 }, { "epoch": 1.161660777385159, "grad_norm": 0.6529719829559326, "learning_rate": 3.947547426237344e-05, "loss": 0.3432, "loss_nan_ranks": 0, "loss_rank_avg": 0.3932499289512634, "step": 1315 }, { "epoch": 1.1660777385159011, "grad_norm": 0.683671236038208, "learning_rate": 3.9465403587369784e-05, "loss": 0.3098, "loss_nan_ranks": 0, "loss_rank_avg": 0.274165540933609, "step": 1320 }, { "epoch": 1.170494699646643, "grad_norm": 0.8071700930595398, "learning_rate": 3.945523846233222e-05, "loss": 0.3043, "loss_nan_ranks": 0, "loss_rank_avg": 0.3191944658756256, "step": 1325 }, { "epoch": 1.174911660777385, "grad_norm": 0.6285055875778198, "learning_rate": 3.944497893658396e-05, "loss": 0.3261, "loss_nan_ranks": 0, "loss_rank_avg": 0.3105619251728058, "step": 1330 }, { "epoch": 1.1793286219081272, "grad_norm": 0.8717606663703918, "learning_rate": 3.943462505990629e-05, "loss": 0.3588, "loss_nan_ranks": 0, "loss_rank_avg": 0.3164404630661011, "step": 1335 }, { "epoch": 1.1837455830388692, "grad_norm": 0.7031919956207275, "learning_rate": 3.942417688253827e-05, "loss": 0.3394, "loss_nan_ranks": 0, "loss_rank_avg": 0.3558098077774048, "step": 1340 }, { "epoch": 1.1881625441696113, "grad_norm": 0.5943769216537476, "learning_rate": 3.9413634455176584e-05, "loss": 0.3199, "loss_nan_ranks": 0, "loss_rank_avg": 0.31143832206726074, "step": 1345 }, { "epoch": 1.1925795053003534, "grad_norm": 0.6784660220146179, "learning_rate": 3.940299782897517e-05, "loss": 0.3039, "loss_nan_ranks": 0, "loss_rank_avg": 0.31072232127189636, "step": 1350 }, { "epoch": 1.1969964664310955, "grad_norm": 0.6783735752105713, "learning_rate": 3.939226705554507e-05, "loss": 0.3124, "loss_nan_ranks": 0, "loss_rank_avg": 0.3142765462398529, "step": 1355 }, { "epoch": 1.2014134275618376, "grad_norm": 0.6520729660987854, "learning_rate": 3.9381442186954155e-05, "loss": 0.3508, "loss_nan_ranks": 0, "loss_rank_avg": 0.2647053599357605, "step": 1360 }, { "epoch": 1.2058303886925794, "grad_norm": 0.6096318960189819, "learning_rate": 3.9370523275726844e-05, "loss": 0.3369, "loss_nan_ranks": 0, "loss_rank_avg": 0.27471020817756653, "step": 1365 }, { "epoch": 1.2102473498233215, "grad_norm": 0.6824337840080261, "learning_rate": 3.935951037484388e-05, "loss": 0.3035, "loss_nan_ranks": 0, "loss_rank_avg": 0.3124973177909851, "step": 1370 }, { "epoch": 1.2146643109540636, "grad_norm": 0.7245553135871887, "learning_rate": 3.934840353774208e-05, "loss": 0.3162, "loss_nan_ranks": 0, "loss_rank_avg": 0.27780479192733765, "step": 1375 }, { "epoch": 1.2190812720848057, "grad_norm": 0.8077725172042847, "learning_rate": 3.9337202818314016e-05, "loss": 0.2926, "loss_nan_ranks": 0, "loss_rank_avg": 0.36201488971710205, "step": 1380 }, { "epoch": 1.2234982332155477, "grad_norm": 0.6361654996871948, "learning_rate": 3.932590827090783e-05, "loss": 0.3642, "loss_nan_ranks": 0, "loss_rank_avg": 0.40312081575393677, "step": 1385 }, { "epoch": 1.2279151943462898, "grad_norm": 0.6152886748313904, "learning_rate": 3.931451995032693e-05, "loss": 0.3168, "loss_nan_ranks": 0, "loss_rank_avg": 0.3805939853191376, "step": 1390 }, { "epoch": 1.232332155477032, "grad_norm": 0.7459203004837036, "learning_rate": 3.930303791182972e-05, "loss": 0.3519, "loss_nan_ranks": 0, "loss_rank_avg": 0.34478020668029785, "step": 1395 }, { "epoch": 1.2367491166077738, "grad_norm": 0.8537634015083313, "learning_rate": 3.929146221112936e-05, "loss": 0.3215, "loss_nan_ranks": 0, "loss_rank_avg": 0.281982421875, "step": 1400 }, { "epoch": 1.2411660777385158, "grad_norm": 0.6000507473945618, "learning_rate": 3.927979290439346e-05, "loss": 0.3281, "loss_nan_ranks": 0, "loss_rank_avg": 0.2675206661224365, "step": 1405 }, { "epoch": 1.245583038869258, "grad_norm": 0.640716016292572, "learning_rate": 3.926803004824382e-05, "loss": 0.3312, "loss_nan_ranks": 0, "loss_rank_avg": 0.2979744076728821, "step": 1410 }, { "epoch": 1.25, "grad_norm": 0.7331620454788208, "learning_rate": 3.925617369975619e-05, "loss": 0.3385, "loss_nan_ranks": 0, "loss_rank_avg": 0.335235059261322, "step": 1415 }, { "epoch": 1.254416961130742, "grad_norm": 0.6532949805259705, "learning_rate": 3.924422391645994e-05, "loss": 0.3509, "loss_nan_ranks": 0, "loss_rank_avg": 0.3822665214538574, "step": 1420 }, { "epoch": 1.2588339222614842, "grad_norm": 0.7220327854156494, "learning_rate": 3.923218075633781e-05, "loss": 0.3268, "loss_nan_ranks": 0, "loss_rank_avg": 0.3238363265991211, "step": 1425 }, { "epoch": 1.2632508833922262, "grad_norm": 0.727918803691864, "learning_rate": 3.9220044277825615e-05, "loss": 0.3149, "loss_nan_ranks": 0, "loss_rank_avg": 0.31373223662376404, "step": 1430 }, { "epoch": 1.2676678445229683, "grad_norm": 0.633305013179779, "learning_rate": 3.920781453981199e-05, "loss": 0.2994, "loss_nan_ranks": 0, "loss_rank_avg": 0.3962780237197876, "step": 1435 }, { "epoch": 1.2720848056537102, "grad_norm": 0.6449732184410095, "learning_rate": 3.919549160163806e-05, "loss": 0.3217, "loss_nan_ranks": 0, "loss_rank_avg": 0.2957836985588074, "step": 1440 }, { "epoch": 1.2765017667844523, "grad_norm": 0.8494489789009094, "learning_rate": 3.91830755230972e-05, "loss": 0.3579, "loss_nan_ranks": 0, "loss_rank_avg": 0.47561344504356384, "step": 1445 }, { "epoch": 1.2809187279151943, "grad_norm": 0.6150957942008972, "learning_rate": 3.91705663644347e-05, "loss": 0.3226, "loss_nan_ranks": 0, "loss_rank_avg": 0.3158280849456787, "step": 1450 }, { "epoch": 1.2853356890459364, "grad_norm": 0.5343297719955444, "learning_rate": 3.91579641863475e-05, "loss": 0.323, "loss_nan_ranks": 0, "loss_rank_avg": 0.30349069833755493, "step": 1455 }, { "epoch": 1.2897526501766785, "grad_norm": 0.8276622295379639, "learning_rate": 3.91452690499839e-05, "loss": 0.3446, "loss_nan_ranks": 0, "loss_rank_avg": 0.4098215103149414, "step": 1460 }, { "epoch": 1.2941696113074204, "grad_norm": 0.6456217765808105, "learning_rate": 3.913248101694323e-05, "loss": 0.333, "loss_nan_ranks": 0, "loss_rank_avg": 0.40201887488365173, "step": 1465 }, { "epoch": 1.2985865724381624, "grad_norm": 0.5984911322593689, "learning_rate": 3.911960014927559e-05, "loss": 0.3269, "loss_nan_ranks": 0, "loss_rank_avg": 0.3077358603477478, "step": 1470 }, { "epoch": 1.3030035335689045, "grad_norm": 0.6234849691390991, "learning_rate": 3.910662650948153e-05, "loss": 0.3081, "loss_nan_ranks": 0, "loss_rank_avg": 0.3293282687664032, "step": 1475 }, { "epoch": 1.3074204946996466, "grad_norm": 0.6392715573310852, "learning_rate": 3.9093560160511746e-05, "loss": 0.3063, "loss_nan_ranks": 0, "loss_rank_avg": 0.34439730644226074, "step": 1480 }, { "epoch": 1.3118374558303887, "grad_norm": 0.5880979299545288, "learning_rate": 3.9080401165766776e-05, "loss": 0.316, "loss_nan_ranks": 0, "loss_rank_avg": 0.2801549434661865, "step": 1485 }, { "epoch": 1.3162544169611308, "grad_norm": 0.5844652652740479, "learning_rate": 3.9067149589096695e-05, "loss": 0.2849, "loss_nan_ranks": 0, "loss_rank_avg": 0.2638784646987915, "step": 1490 }, { "epoch": 1.3206713780918728, "grad_norm": 0.5854594707489014, "learning_rate": 3.905380549480081e-05, "loss": 0.3029, "loss_nan_ranks": 0, "loss_rank_avg": 0.30802667140960693, "step": 1495 }, { "epoch": 1.325088339222615, "grad_norm": 0.5132575035095215, "learning_rate": 3.904036894762734e-05, "loss": 0.3015, "loss_nan_ranks": 0, "loss_rank_avg": 0.2757795453071594, "step": 1500 }, { "epoch": 1.329505300353357, "grad_norm": 0.790227472782135, "learning_rate": 3.9026840012773094e-05, "loss": 0.3119, "loss_nan_ranks": 0, "loss_rank_avg": 0.3685351610183716, "step": 1505 }, { "epoch": 1.3339222614840989, "grad_norm": 0.5928642153739929, "learning_rate": 3.901321875588317e-05, "loss": 0.3241, "loss_nan_ranks": 0, "loss_rank_avg": 0.3497644364833832, "step": 1510 }, { "epoch": 1.338339222614841, "grad_norm": 0.6434882283210754, "learning_rate": 3.899950524305064e-05, "loss": 0.3218, "loss_nan_ranks": 0, "loss_rank_avg": 0.3292595148086548, "step": 1515 }, { "epoch": 1.342756183745583, "grad_norm": 0.7256221771240234, "learning_rate": 3.898569954081621e-05, "loss": 0.3332, "loss_nan_ranks": 0, "loss_rank_avg": 0.3023999333381653, "step": 1520 }, { "epoch": 1.3471731448763251, "grad_norm": 0.7445887327194214, "learning_rate": 3.897180171616791e-05, "loss": 0.3047, "loss_nan_ranks": 0, "loss_rank_avg": 0.27664875984191895, "step": 1525 }, { "epoch": 1.3515901060070672, "grad_norm": 0.6363182663917542, "learning_rate": 3.895781183654076e-05, "loss": 0.348, "loss_nan_ranks": 0, "loss_rank_avg": 0.2896704077720642, "step": 1530 }, { "epoch": 1.356007067137809, "grad_norm": 0.7220079898834229, "learning_rate": 3.894372996981647e-05, "loss": 0.3056, "loss_nan_ranks": 0, "loss_rank_avg": 0.3187897503376007, "step": 1535 }, { "epoch": 1.3604240282685511, "grad_norm": 0.9932358264923096, "learning_rate": 3.892955618432306e-05, "loss": 0.2863, "loss_nan_ranks": 0, "loss_rank_avg": 0.3777332007884979, "step": 1540 }, { "epoch": 1.3648409893992932, "grad_norm": 0.6612488627433777, "learning_rate": 3.891529054883458e-05, "loss": 0.3671, "loss_nan_ranks": 0, "loss_rank_avg": 0.32506632804870605, "step": 1545 }, { "epoch": 1.3692579505300353, "grad_norm": 0.809368371963501, "learning_rate": 3.8900933132570755e-05, "loss": 0.3164, "loss_nan_ranks": 0, "loss_rank_avg": 0.2754046618938446, "step": 1550 }, { "epoch": 1.3736749116607774, "grad_norm": 0.6561906933784485, "learning_rate": 3.888648400519663e-05, "loss": 0.364, "loss_nan_ranks": 0, "loss_rank_avg": 0.27140358090400696, "step": 1555 }, { "epoch": 1.3780918727915195, "grad_norm": 0.6149983406066895, "learning_rate": 3.8871943236822274e-05, "loss": 0.2918, "loss_nan_ranks": 0, "loss_rank_avg": 0.3111530542373657, "step": 1560 }, { "epoch": 1.3825088339222615, "grad_norm": 0.788455605506897, "learning_rate": 3.88573108980024e-05, "loss": 0.3015, "loss_nan_ranks": 0, "loss_rank_avg": 0.31705474853515625, "step": 1565 }, { "epoch": 1.3869257950530036, "grad_norm": 0.8068515062332153, "learning_rate": 3.8842587059736054e-05, "loss": 0.2891, "loss_nan_ranks": 0, "loss_rank_avg": 0.27609729766845703, "step": 1570 }, { "epoch": 1.3913427561837457, "grad_norm": 0.5502995252609253, "learning_rate": 3.882777179346622e-05, "loss": 0.3524, "loss_nan_ranks": 0, "loss_rank_avg": 0.286032497882843, "step": 1575 }, { "epoch": 1.3957597173144876, "grad_norm": 0.5802372694015503, "learning_rate": 3.881286517107957e-05, "loss": 0.343, "loss_nan_ranks": 0, "loss_rank_avg": 0.24839898943901062, "step": 1580 }, { "epoch": 1.4001766784452296, "grad_norm": 0.614653468132019, "learning_rate": 3.879786726490599e-05, "loss": 0.3196, "loss_nan_ranks": 0, "loss_rank_avg": 0.29347753524780273, "step": 1585 }, { "epoch": 1.4045936395759717, "grad_norm": 0.5936715006828308, "learning_rate": 3.8782778147718335e-05, "loss": 0.329, "loss_nan_ranks": 0, "loss_rank_avg": 0.3896068036556244, "step": 1590 }, { "epoch": 1.4090106007067138, "grad_norm": 5.28199577331543, "learning_rate": 3.876759789273202e-05, "loss": 0.3, "loss_nan_ranks": 0, "loss_rank_avg": 0.26368066668510437, "step": 1595 }, { "epoch": 1.4134275618374559, "grad_norm": 0.7651248574256897, "learning_rate": 3.8752326573604684e-05, "loss": 0.3075, "loss_nan_ranks": 0, "loss_rank_avg": 0.30806493759155273, "step": 1600 }, { "epoch": 1.417844522968198, "grad_norm": 0.6337783932685852, "learning_rate": 3.873696426443581e-05, "loss": 0.3195, "loss_nan_ranks": 0, "loss_rank_avg": 0.34544122219085693, "step": 1605 }, { "epoch": 1.4222614840989398, "grad_norm": 0.7464025616645813, "learning_rate": 3.872151103976642e-05, "loss": 0.3251, "loss_nan_ranks": 0, "loss_rank_avg": 0.3165499269962311, "step": 1610 }, { "epoch": 1.426678445229682, "grad_norm": 0.5613967180252075, "learning_rate": 3.870596697457863e-05, "loss": 0.3442, "loss_nan_ranks": 0, "loss_rank_avg": 0.27536633610725403, "step": 1615 }, { "epoch": 1.431095406360424, "grad_norm": 0.6180069446563721, "learning_rate": 3.8690332144295375e-05, "loss": 0.3426, "loss_nan_ranks": 0, "loss_rank_avg": 0.39397120475769043, "step": 1620 }, { "epoch": 1.435512367491166, "grad_norm": 0.628078818321228, "learning_rate": 3.867460662477996e-05, "loss": 0.332, "loss_nan_ranks": 0, "loss_rank_avg": 0.3936801254749298, "step": 1625 }, { "epoch": 1.4399293286219081, "grad_norm": 0.6523563265800476, "learning_rate": 3.865879049233577e-05, "loss": 0.3076, "loss_nan_ranks": 0, "loss_rank_avg": 0.31615036725997925, "step": 1630 }, { "epoch": 1.4443462897526502, "grad_norm": 0.6801185607910156, "learning_rate": 3.864288382370584e-05, "loss": 0.3124, "loss_nan_ranks": 0, "loss_rank_avg": 0.27253082394599915, "step": 1635 }, { "epoch": 1.4487632508833923, "grad_norm": 0.6551727056503296, "learning_rate": 3.8626886696072495e-05, "loss": 0.3393, "loss_nan_ranks": 0, "loss_rank_avg": 0.3144484758377075, "step": 1640 }, { "epoch": 1.4531802120141344, "grad_norm": 0.6799625158309937, "learning_rate": 3.8610799187057025e-05, "loss": 0.3086, "loss_nan_ranks": 0, "loss_rank_avg": 0.3325912356376648, "step": 1645 }, { "epoch": 1.4575971731448762, "grad_norm": 0.6419410705566406, "learning_rate": 3.8594621374719226e-05, "loss": 0.3026, "loss_nan_ranks": 0, "loss_rank_avg": 0.28254279494285583, "step": 1650 }, { "epoch": 1.4620141342756183, "grad_norm": 0.6062602996826172, "learning_rate": 3.857835333755709e-05, "loss": 0.3182, "loss_nan_ranks": 0, "loss_rank_avg": 0.32770591974258423, "step": 1655 }, { "epoch": 1.4664310954063604, "grad_norm": 0.6399083137512207, "learning_rate": 3.856199515450638e-05, "loss": 0.3236, "loss_nan_ranks": 0, "loss_rank_avg": 0.38535135984420776, "step": 1660 }, { "epoch": 1.4708480565371025, "grad_norm": 0.6881480813026428, "learning_rate": 3.8545546904940285e-05, "loss": 0.3233, "loss_nan_ranks": 0, "loss_rank_avg": 0.35274213552474976, "step": 1665 }, { "epoch": 1.4752650176678446, "grad_norm": 0.708899199962616, "learning_rate": 3.8529008668668996e-05, "loss": 0.3243, "loss_nan_ranks": 0, "loss_rank_avg": 0.29779568314552307, "step": 1670 }, { "epoch": 1.4796819787985867, "grad_norm": 0.5962197780609131, "learning_rate": 3.851238052593935e-05, "loss": 0.3054, "loss_nan_ranks": 0, "loss_rank_avg": 0.3215206265449524, "step": 1675 }, { "epoch": 1.4840989399293285, "grad_norm": 0.6873301863670349, "learning_rate": 3.849566255743442e-05, "loss": 0.3252, "loss_nan_ranks": 0, "loss_rank_avg": 0.29667988419532776, "step": 1680 }, { "epoch": 1.4885159010600706, "grad_norm": 0.7549837827682495, "learning_rate": 3.8478854844273134e-05, "loss": 0.3139, "loss_nan_ranks": 0, "loss_rank_avg": 0.32202067971229553, "step": 1685 }, { "epoch": 1.4929328621908127, "grad_norm": 0.6532432436943054, "learning_rate": 3.846195746800988e-05, "loss": 0.2846, "loss_nan_ranks": 0, "loss_rank_avg": 0.2808791995048523, "step": 1690 }, { "epoch": 1.4973498233215548, "grad_norm": 0.6817176342010498, "learning_rate": 3.8444970510634124e-05, "loss": 0.3371, "loss_nan_ranks": 0, "loss_rank_avg": 0.29034101963043213, "step": 1695 }, { "epoch": 1.5017667844522968, "grad_norm": 0.6129101514816284, "learning_rate": 3.842789405456996e-05, "loss": 0.3295, "loss_nan_ranks": 0, "loss_rank_avg": 0.3373889923095703, "step": 1700 }, { "epoch": 1.506183745583039, "grad_norm": 0.5987979769706726, "learning_rate": 3.841072818267578e-05, "loss": 0.3237, "loss_nan_ranks": 0, "loss_rank_avg": 0.36195623874664307, "step": 1705 }, { "epoch": 1.510600706713781, "grad_norm": 0.6188389658927917, "learning_rate": 3.839347297824383e-05, "loss": 0.3196, "loss_nan_ranks": 0, "loss_rank_avg": 0.28901487588882446, "step": 1710 }, { "epoch": 1.515017667844523, "grad_norm": 0.605717658996582, "learning_rate": 3.837612852499982e-05, "loss": 0.352, "loss_nan_ranks": 0, "loss_rank_avg": 0.31816282868385315, "step": 1715 }, { "epoch": 1.5194346289752652, "grad_norm": 0.5934823751449585, "learning_rate": 3.8358694907102504e-05, "loss": 0.3625, "loss_nan_ranks": 0, "loss_rank_avg": 0.3863886594772339, "step": 1720 }, { "epoch": 1.523851590106007, "grad_norm": 0.578649640083313, "learning_rate": 3.834117220914328e-05, "loss": 0.3449, "loss_nan_ranks": 0, "loss_rank_avg": 0.2828061580657959, "step": 1725 }, { "epoch": 1.528268551236749, "grad_norm": 0.6291191577911377, "learning_rate": 3.832356051614579e-05, "loss": 0.307, "loss_nan_ranks": 0, "loss_rank_avg": 0.28863656520843506, "step": 1730 }, { "epoch": 1.5326855123674912, "grad_norm": 0.6461930274963379, "learning_rate": 3.8305859913565505e-05, "loss": 0.3011, "loss_nan_ranks": 0, "loss_rank_avg": 0.3028411865234375, "step": 1735 }, { "epoch": 1.5371024734982333, "grad_norm": 0.7606573104858398, "learning_rate": 3.8288070487289274e-05, "loss": 0.3087, "loss_nan_ranks": 0, "loss_rank_avg": 0.3565424084663391, "step": 1740 }, { "epoch": 1.5415194346289751, "grad_norm": 0.6665891408920288, "learning_rate": 3.827019232363496e-05, "loss": 0.3183, "loss_nan_ranks": 0, "loss_rank_avg": 0.3757513165473938, "step": 1745 }, { "epoch": 1.5459363957597172, "grad_norm": 0.6307830214500427, "learning_rate": 3.8252225509350985e-05, "loss": 0.3252, "loss_nan_ranks": 0, "loss_rank_avg": 0.322690486907959, "step": 1750 }, { "epoch": 1.5503533568904593, "grad_norm": 0.6449663043022156, "learning_rate": 3.823417013161594e-05, "loss": 0.3276, "loss_nan_ranks": 0, "loss_rank_avg": 0.3264492154121399, "step": 1755 }, { "epoch": 1.5547703180212014, "grad_norm": 0.6309463381767273, "learning_rate": 3.821602627803813e-05, "loss": 0.3399, "loss_nan_ranks": 0, "loss_rank_avg": 0.3071059584617615, "step": 1760 }, { "epoch": 1.5591872791519434, "grad_norm": 0.5758494138717651, "learning_rate": 3.819779403665515e-05, "loss": 0.3248, "loss_nan_ranks": 0, "loss_rank_avg": 0.3266148567199707, "step": 1765 }, { "epoch": 1.5636042402826855, "grad_norm": 0.6446824669837952, "learning_rate": 3.8179473495933497e-05, "loss": 0.3323, "loss_nan_ranks": 0, "loss_rank_avg": 0.3177984952926636, "step": 1770 }, { "epoch": 1.5680212014134276, "grad_norm": 0.5853697657585144, "learning_rate": 3.8161064744768096e-05, "loss": 0.2712, "loss_nan_ranks": 0, "loss_rank_avg": 0.24256934225559235, "step": 1775 }, { "epoch": 1.5724381625441697, "grad_norm": 0.722345232963562, "learning_rate": 3.814256787248189e-05, "loss": 0.3833, "loss_nan_ranks": 0, "loss_rank_avg": 0.4055064916610718, "step": 1780 }, { "epoch": 1.5768551236749118, "grad_norm": 0.7835103869438171, "learning_rate": 3.81239829688254e-05, "loss": 0.3099, "loss_nan_ranks": 0, "loss_rank_avg": 0.3148956596851349, "step": 1785 }, { "epoch": 1.5812720848056538, "grad_norm": 0.6101419925689697, "learning_rate": 3.810531012397632e-05, "loss": 0.3416, "loss_nan_ranks": 0, "loss_rank_avg": 0.3856297433376312, "step": 1790 }, { "epoch": 1.585689045936396, "grad_norm": 0.607524037361145, "learning_rate": 3.8086549428539016e-05, "loss": 0.3393, "loss_nan_ranks": 0, "loss_rank_avg": 0.3431060314178467, "step": 1795 }, { "epoch": 1.5901060070671378, "grad_norm": 0.700236976146698, "learning_rate": 3.806770097354413e-05, "loss": 0.2922, "loss_nan_ranks": 0, "loss_rank_avg": 0.30610954761505127, "step": 1800 }, { "epoch": 1.5945229681978799, "grad_norm": 0.5721810460090637, "learning_rate": 3.8048764850448146e-05, "loss": 0.3178, "loss_nan_ranks": 0, "loss_rank_avg": 0.2738623023033142, "step": 1805 }, { "epoch": 1.598939929328622, "grad_norm": 0.591039776802063, "learning_rate": 3.802974115113292e-05, "loss": 0.3071, "loss_nan_ranks": 0, "loss_rank_avg": 0.3155909776687622, "step": 1810 }, { "epoch": 1.6033568904593638, "grad_norm": 0.6222483515739441, "learning_rate": 3.801062996790526e-05, "loss": 0.3603, "loss_nan_ranks": 0, "loss_rank_avg": 0.40520355105400085, "step": 1815 }, { "epoch": 1.6077738515901059, "grad_norm": 0.6669710874557495, "learning_rate": 3.7991431393496435e-05, "loss": 0.3065, "loss_nan_ranks": 0, "loss_rank_avg": 0.3122641444206238, "step": 1820 }, { "epoch": 1.612190812720848, "grad_norm": 0.628384530544281, "learning_rate": 3.797214552106178e-05, "loss": 0.2951, "loss_nan_ranks": 0, "loss_rank_avg": 0.29349058866500854, "step": 1825 }, { "epoch": 1.61660777385159, "grad_norm": 0.6003797054290771, "learning_rate": 3.7952772444180205e-05, "loss": 0.3327, "loss_nan_ranks": 0, "loss_rank_avg": 0.28449106216430664, "step": 1830 }, { "epoch": 1.6210247349823321, "grad_norm": 0.6813213229179382, "learning_rate": 3.793331225685376e-05, "loss": 0.3209, "loss_nan_ranks": 0, "loss_rank_avg": 0.247023805975914, "step": 1835 }, { "epoch": 1.6254416961130742, "grad_norm": 0.6331183314323425, "learning_rate": 3.791376505350716e-05, "loss": 0.2859, "loss_nan_ranks": 0, "loss_rank_avg": 0.2805844843387604, "step": 1840 }, { "epoch": 1.6298586572438163, "grad_norm": 0.5931798815727234, "learning_rate": 3.789413092898735e-05, "loss": 0.2862, "loss_nan_ranks": 0, "loss_rank_avg": 0.2882406711578369, "step": 1845 }, { "epoch": 1.6342756183745584, "grad_norm": 0.6270744204521179, "learning_rate": 3.7874409978563045e-05, "loss": 0.2997, "loss_nan_ranks": 0, "loss_rank_avg": 0.27691859006881714, "step": 1850 }, { "epoch": 1.6386925795053005, "grad_norm": 0.6561160087585449, "learning_rate": 3.785460229792422e-05, "loss": 0.2763, "loss_nan_ranks": 0, "loss_rank_avg": 0.2742835283279419, "step": 1855 }, { "epoch": 1.6431095406360425, "grad_norm": 0.6197009086608887, "learning_rate": 3.783470798318173e-05, "loss": 0.3189, "loss_nan_ranks": 0, "loss_rank_avg": 0.2708655297756195, "step": 1860 }, { "epoch": 1.6475265017667846, "grad_norm": 0.7104551196098328, "learning_rate": 3.7814727130866756e-05, "loss": 0.3393, "loss_nan_ranks": 0, "loss_rank_avg": 0.3953385353088379, "step": 1865 }, { "epoch": 1.6519434628975265, "grad_norm": 0.6067651510238647, "learning_rate": 3.779465983793039e-05, "loss": 0.3433, "loss_nan_ranks": 0, "loss_rank_avg": 0.31828904151916504, "step": 1870 }, { "epoch": 1.6563604240282686, "grad_norm": 0.6345065236091614, "learning_rate": 3.7774506201743175e-05, "loss": 0.3252, "loss_nan_ranks": 0, "loss_rank_avg": 0.3190929591655731, "step": 1875 }, { "epoch": 1.6607773851590106, "grad_norm": 0.5989964604377747, "learning_rate": 3.775426632009456e-05, "loss": 0.3066, "loss_nan_ranks": 0, "loss_rank_avg": 0.2885761857032776, "step": 1880 }, { "epoch": 1.6651943462897525, "grad_norm": 0.6218124032020569, "learning_rate": 3.7733940291192516e-05, "loss": 0.3205, "loss_nan_ranks": 0, "loss_rank_avg": 0.2709357738494873, "step": 1885 }, { "epoch": 1.6696113074204946, "grad_norm": 0.6491353511810303, "learning_rate": 3.771352821366301e-05, "loss": 0.3574, "loss_nan_ranks": 0, "loss_rank_avg": 0.31962040066719055, "step": 1890 }, { "epoch": 1.6740282685512367, "grad_norm": 0.6486796736717224, "learning_rate": 3.769303018654951e-05, "loss": 0.3129, "loss_nan_ranks": 0, "loss_rank_avg": 0.27561670541763306, "step": 1895 }, { "epoch": 1.6784452296819787, "grad_norm": 0.8267950415611267, "learning_rate": 3.7672446309312554e-05, "loss": 0.3588, "loss_nan_ranks": 0, "loss_rank_avg": 0.2873364984989166, "step": 1900 }, { "epoch": 1.6828621908127208, "grad_norm": 0.6081082820892334, "learning_rate": 3.765177668182923e-05, "loss": 0.3609, "loss_nan_ranks": 0, "loss_rank_avg": 0.35057562589645386, "step": 1905 }, { "epoch": 1.687279151943463, "grad_norm": 0.5976192951202393, "learning_rate": 3.763102140439272e-05, "loss": 0.3016, "loss_nan_ranks": 0, "loss_rank_avg": 0.2694868743419647, "step": 1910 }, { "epoch": 1.691696113074205, "grad_norm": 0.7123092412948608, "learning_rate": 3.7610180577711774e-05, "loss": 0.2874, "loss_nan_ranks": 0, "loss_rank_avg": 0.2789933681488037, "step": 1915 }, { "epoch": 1.696113074204947, "grad_norm": 0.741333544254303, "learning_rate": 3.758925430291025e-05, "loss": 0.3251, "loss_nan_ranks": 0, "loss_rank_avg": 0.35715097188949585, "step": 1920 }, { "epoch": 1.7005300353356891, "grad_norm": 0.6544567942619324, "learning_rate": 3.756824268152663e-05, "loss": 0.326, "loss_nan_ranks": 0, "loss_rank_avg": 0.2953934669494629, "step": 1925 }, { "epoch": 1.7049469964664312, "grad_norm": 1.1364296674728394, "learning_rate": 3.7547145815513504e-05, "loss": 0.3568, "loss_nan_ranks": 0, "loss_rank_avg": 0.43960511684417725, "step": 1930 }, { "epoch": 1.7093639575971733, "grad_norm": 0.5804359912872314, "learning_rate": 3.752596380723709e-05, "loss": 0.3531, "loss_nan_ranks": 0, "loss_rank_avg": 0.4059427082538605, "step": 1935 }, { "epoch": 1.7137809187279152, "grad_norm": 0.6707079410552979, "learning_rate": 3.750469675947672e-05, "loss": 0.3044, "loss_nan_ranks": 0, "loss_rank_avg": 0.308398962020874, "step": 1940 }, { "epoch": 1.7181978798586572, "grad_norm": 0.6289849281311035, "learning_rate": 3.7483344775424376e-05, "loss": 0.3225, "loss_nan_ranks": 0, "loss_rank_avg": 0.35997170209884644, "step": 1945 }, { "epoch": 1.7226148409893993, "grad_norm": 0.5992223620414734, "learning_rate": 3.746190795868416e-05, "loss": 0.3168, "loss_nan_ranks": 0, "loss_rank_avg": 0.36869436502456665, "step": 1950 }, { "epoch": 1.7270318021201412, "grad_norm": 0.5985012054443359, "learning_rate": 3.7440386413271796e-05, "loss": 0.2932, "loss_nan_ranks": 0, "loss_rank_avg": 0.3474159240722656, "step": 1955 }, { "epoch": 1.7314487632508833, "grad_norm": 0.6145229339599609, "learning_rate": 3.741878024361412e-05, "loss": 0.3082, "loss_nan_ranks": 0, "loss_rank_avg": 0.34130439162254333, "step": 1960 }, { "epoch": 1.7358657243816253, "grad_norm": 0.5978900790214539, "learning_rate": 3.7397089554548606e-05, "loss": 0.2994, "loss_nan_ranks": 0, "loss_rank_avg": 0.3122338354587555, "step": 1965 }, { "epoch": 1.7402826855123674, "grad_norm": 0.6474099159240723, "learning_rate": 3.73753144513228e-05, "loss": 0.2718, "loss_nan_ranks": 0, "loss_rank_avg": 0.2959476113319397, "step": 1970 }, { "epoch": 1.7446996466431095, "grad_norm": 0.5508759021759033, "learning_rate": 3.735345503959388e-05, "loss": 0.3195, "loss_nan_ranks": 0, "loss_rank_avg": 0.25343257188796997, "step": 1975 }, { "epoch": 1.7491166077738516, "grad_norm": 0.5979631543159485, "learning_rate": 3.7331511425428075e-05, "loss": 0.307, "loss_nan_ranks": 0, "loss_rank_avg": 0.3401501476764679, "step": 1980 }, { "epoch": 1.7535335689045937, "grad_norm": 0.6378216743469238, "learning_rate": 3.73094837153002e-05, "loss": 0.3163, "loss_nan_ranks": 0, "loss_rank_avg": 0.3658217787742615, "step": 1985 }, { "epoch": 1.7579505300353357, "grad_norm": 0.623346745967865, "learning_rate": 3.7287372016093106e-05, "loss": 0.3476, "loss_nan_ranks": 0, "loss_rank_avg": 0.3634029030799866, "step": 1990 }, { "epoch": 1.7623674911660778, "grad_norm": 0.548507034778595, "learning_rate": 3.726517643509718e-05, "loss": 0.3238, "loss_nan_ranks": 0, "loss_rank_avg": 0.34876665472984314, "step": 1995 }, { "epoch": 1.76678445229682, "grad_norm": 0.7020362615585327, "learning_rate": 3.724289708000984e-05, "loss": 0.313, "loss_nan_ranks": 0, "loss_rank_avg": 0.29458093643188477, "step": 2000 }, { "epoch": 1.771201413427562, "grad_norm": 1.0174129009246826, "learning_rate": 3.722053405893495e-05, "loss": 0.3045, "loss_nan_ranks": 0, "loss_rank_avg": 0.2385835349559784, "step": 2005 }, { "epoch": 1.7756183745583038, "grad_norm": 0.6126503348350525, "learning_rate": 3.7198087480382386e-05, "loss": 0.3038, "loss_nan_ranks": 0, "loss_rank_avg": 0.24906566739082336, "step": 2010 }, { "epoch": 1.780035335689046, "grad_norm": 0.6186851263046265, "learning_rate": 3.7175557453267435e-05, "loss": 0.3153, "loss_nan_ranks": 0, "loss_rank_avg": 0.36772221326828003, "step": 2015 }, { "epoch": 1.784452296819788, "grad_norm": 0.5845491886138916, "learning_rate": 3.715294408691029e-05, "loss": 0.3231, "loss_nan_ranks": 0, "loss_rank_avg": 0.31444716453552246, "step": 2020 }, { "epoch": 1.78886925795053, "grad_norm": 0.5485044121742249, "learning_rate": 3.713024749103554e-05, "loss": 0.3279, "loss_nan_ranks": 0, "loss_rank_avg": 0.28981852531433105, "step": 2025 }, { "epoch": 1.793286219081272, "grad_norm": 0.7004613280296326, "learning_rate": 3.71074677757716e-05, "loss": 0.3089, "loss_nan_ranks": 0, "loss_rank_avg": 0.2541000247001648, "step": 2030 }, { "epoch": 1.797703180212014, "grad_norm": 0.7733515501022339, "learning_rate": 3.708460505165021e-05, "loss": 0.3438, "loss_nan_ranks": 0, "loss_rank_avg": 0.3459468185901642, "step": 2035 }, { "epoch": 1.802120141342756, "grad_norm": 0.5577183961868286, "learning_rate": 3.706165942960589e-05, "loss": 0.3271, "loss_nan_ranks": 0, "loss_rank_avg": 0.3305257558822632, "step": 2040 }, { "epoch": 1.8065371024734982, "grad_norm": 0.6522884964942932, "learning_rate": 3.703863102097538e-05, "loss": 0.3168, "loss_nan_ranks": 0, "loss_rank_avg": 0.3232169449329376, "step": 2045 }, { "epoch": 1.8109540636042403, "grad_norm": 0.6660712361335754, "learning_rate": 3.701551993749714e-05, "loss": 0.3165, "loss_nan_ranks": 0, "loss_rank_avg": 0.2644830346107483, "step": 2050 }, { "epoch": 1.8153710247349824, "grad_norm": 0.7599813938140869, "learning_rate": 3.6992326291310764e-05, "loss": 0.3048, "loss_nan_ranks": 0, "loss_rank_avg": 0.30041027069091797, "step": 2055 }, { "epoch": 1.8197879858657244, "grad_norm": 0.6655240058898926, "learning_rate": 3.696905019495647e-05, "loss": 0.2975, "loss_nan_ranks": 0, "loss_rank_avg": 0.2828328609466553, "step": 2060 }, { "epoch": 1.8242049469964665, "grad_norm": 0.608040988445282, "learning_rate": 3.6945691761374535e-05, "loss": 0.3234, "loss_nan_ranks": 0, "loss_rank_avg": 0.42636048793792725, "step": 2065 }, { "epoch": 1.8286219081272086, "grad_norm": 0.6305931806564331, "learning_rate": 3.692225110390474e-05, "loss": 0.3236, "loss_nan_ranks": 0, "loss_rank_avg": 0.3188191056251526, "step": 2070 }, { "epoch": 1.8330388692579507, "grad_norm": 0.7485166788101196, "learning_rate": 3.689872833628587e-05, "loss": 0.3203, "loss_nan_ranks": 0, "loss_rank_avg": 0.42439842224121094, "step": 2075 }, { "epoch": 1.8374558303886925, "grad_norm": 0.5816894769668579, "learning_rate": 3.687512357265509e-05, "loss": 0.3268, "loss_nan_ranks": 0, "loss_rank_avg": 0.37391436100006104, "step": 2080 }, { "epoch": 1.8418727915194346, "grad_norm": 0.6705328822135925, "learning_rate": 3.685143692754743e-05, "loss": 0.3141, "loss_nan_ranks": 0, "loss_rank_avg": 0.361337274312973, "step": 2085 }, { "epoch": 1.8462897526501767, "grad_norm": 0.6142212152481079, "learning_rate": 3.6827668515895234e-05, "loss": 0.3092, "loss_nan_ranks": 0, "loss_rank_avg": 0.3198060691356659, "step": 2090 }, { "epoch": 1.8507067137809188, "grad_norm": 0.6588479280471802, "learning_rate": 3.68038184530276e-05, "loss": 0.3024, "loss_nan_ranks": 0, "loss_rank_avg": 0.3107324540615082, "step": 2095 }, { "epoch": 1.8551236749116606, "grad_norm": 0.6271264553070068, "learning_rate": 3.6779886854669815e-05, "loss": 0.2935, "loss_nan_ranks": 0, "loss_rank_avg": 0.3267250657081604, "step": 2100 }, { "epoch": 1.8595406360424027, "grad_norm": 0.6405203938484192, "learning_rate": 3.6755873836942756e-05, "loss": 0.3282, "loss_nan_ranks": 0, "loss_rank_avg": 0.32828402519226074, "step": 2105 }, { "epoch": 1.8639575971731448, "grad_norm": 0.6557453274726868, "learning_rate": 3.673177951636242e-05, "loss": 0.3506, "loss_nan_ranks": 0, "loss_rank_avg": 0.28758811950683594, "step": 2110 }, { "epoch": 1.8683745583038869, "grad_norm": 1.1797688007354736, "learning_rate": 3.670760400983925e-05, "loss": 0.3433, "loss_nan_ranks": 0, "loss_rank_avg": 0.39800888299942017, "step": 2115 }, { "epoch": 1.872791519434629, "grad_norm": 0.6395815014839172, "learning_rate": 3.6683347434677654e-05, "loss": 0.3342, "loss_nan_ranks": 0, "loss_rank_avg": 0.3299906253814697, "step": 2120 }, { "epoch": 1.877208480565371, "grad_norm": 0.6777175664901733, "learning_rate": 3.6659009908575394e-05, "loss": 0.2953, "loss_nan_ranks": 0, "loss_rank_avg": 0.2692110538482666, "step": 2125 }, { "epoch": 1.8816254416961131, "grad_norm": 0.5334305763244629, "learning_rate": 3.663459154962301e-05, "loss": 0.3263, "loss_nan_ranks": 0, "loss_rank_avg": 0.3176087439060211, "step": 2130 }, { "epoch": 1.8860424028268552, "grad_norm": 0.5658344626426697, "learning_rate": 3.661009247630326e-05, "loss": 0.3149, "loss_nan_ranks": 0, "loss_rank_avg": 0.36185193061828613, "step": 2135 }, { "epoch": 1.8904593639575973, "grad_norm": 0.6579681634902954, "learning_rate": 3.658551280749055e-05, "loss": 0.2887, "loss_nan_ranks": 0, "loss_rank_avg": 0.25398847460746765, "step": 2140 }, { "epoch": 1.8948763250883394, "grad_norm": 0.6341381072998047, "learning_rate": 3.656085266245038e-05, "loss": 0.2892, "loss_nan_ranks": 0, "loss_rank_avg": 0.3196604251861572, "step": 2145 }, { "epoch": 1.8992932862190812, "grad_norm": 0.6079564094543457, "learning_rate": 3.653611216083867e-05, "loss": 0.3093, "loss_nan_ranks": 0, "loss_rank_avg": 0.41236889362335205, "step": 2150 }, { "epoch": 1.9037102473498233, "grad_norm": 0.5752854347229004, "learning_rate": 3.651129142270132e-05, "loss": 0.3111, "loss_nan_ranks": 0, "loss_rank_avg": 0.32420551776885986, "step": 2155 }, { "epoch": 1.9081272084805654, "grad_norm": 0.7557041645050049, "learning_rate": 3.6486390568473494e-05, "loss": 0.3236, "loss_nan_ranks": 0, "loss_rank_avg": 0.40055620670318604, "step": 2160 }, { "epoch": 1.9125441696113075, "grad_norm": 0.6140500903129578, "learning_rate": 3.646140971897914e-05, "loss": 0.2967, "loss_nan_ranks": 0, "loss_rank_avg": 0.3187151551246643, "step": 2165 }, { "epoch": 1.9169611307420493, "grad_norm": 0.5381097793579102, "learning_rate": 3.6436348995430314e-05, "loss": 0.3371, "loss_nan_ranks": 0, "loss_rank_avg": 0.20654772222042084, "step": 2170 }, { "epoch": 1.9213780918727914, "grad_norm": 0.6211098432540894, "learning_rate": 3.641120851942669e-05, "loss": 0.3121, "loss_nan_ranks": 0, "loss_rank_avg": 0.27154093980789185, "step": 2175 }, { "epoch": 1.9257950530035335, "grad_norm": 0.6953615546226501, "learning_rate": 3.638598841295487e-05, "loss": 0.3264, "loss_nan_ranks": 0, "loss_rank_avg": 0.31072691082954407, "step": 2180 }, { "epoch": 1.9302120141342756, "grad_norm": 0.579765796661377, "learning_rate": 3.6360688798387865e-05, "loss": 0.3463, "loss_nan_ranks": 0, "loss_rank_avg": 0.3020220398902893, "step": 2185 }, { "epoch": 1.9346289752650176, "grad_norm": 0.6398031115531921, "learning_rate": 3.633530979848446e-05, "loss": 0.2941, "loss_nan_ranks": 0, "loss_rank_avg": 0.2853238880634308, "step": 2190 }, { "epoch": 1.9390459363957597, "grad_norm": 0.6654336452484131, "learning_rate": 3.6309851536388664e-05, "loss": 0.3671, "loss_nan_ranks": 0, "loss_rank_avg": 0.35676461458206177, "step": 2195 }, { "epoch": 1.9434628975265018, "grad_norm": 0.5575515627861023, "learning_rate": 3.6284314135629036e-05, "loss": 0.3231, "loss_nan_ranks": 0, "loss_rank_avg": 0.4140787720680237, "step": 2200 }, { "epoch": 1.947879858657244, "grad_norm": 0.6344410181045532, "learning_rate": 3.625869772011816e-05, "loss": 0.3538, "loss_nan_ranks": 0, "loss_rank_avg": 0.34461867809295654, "step": 2205 }, { "epoch": 1.952296819787986, "grad_norm": 0.6593054533004761, "learning_rate": 3.6233002414152025e-05, "loss": 0.3141, "loss_nan_ranks": 0, "loss_rank_avg": 0.3200452923774719, "step": 2210 }, { "epoch": 1.956713780918728, "grad_norm": 0.5528004765510559, "learning_rate": 3.620722834240939e-05, "loss": 0.3353, "loss_nan_ranks": 0, "loss_rank_avg": 0.31394898891448975, "step": 2215 }, { "epoch": 1.96113074204947, "grad_norm": 0.5560632944107056, "learning_rate": 3.61813756299512e-05, "loss": 0.3328, "loss_nan_ranks": 0, "loss_rank_avg": 0.3282029330730438, "step": 2220 }, { "epoch": 1.965547703180212, "grad_norm": 0.6172696352005005, "learning_rate": 3.6155444402219995e-05, "loss": 0.3315, "loss_nan_ranks": 0, "loss_rank_avg": 0.2924641966819763, "step": 2225 }, { "epoch": 1.969964664310954, "grad_norm": 0.6023048758506775, "learning_rate": 3.612943478503929e-05, "loss": 0.313, "loss_nan_ranks": 0, "loss_rank_avg": 0.33808523416519165, "step": 2230 }, { "epoch": 1.9743816254416962, "grad_norm": 0.5986247658729553, "learning_rate": 3.610334690461295e-05, "loss": 0.2988, "loss_nan_ranks": 0, "loss_rank_avg": 0.3043588399887085, "step": 2235 }, { "epoch": 1.978798586572438, "grad_norm": 0.6460138559341431, "learning_rate": 3.6077180887524584e-05, "loss": 0.3008, "loss_nan_ranks": 0, "loss_rank_avg": 0.3652576506137848, "step": 2240 }, { "epoch": 1.98321554770318, "grad_norm": 0.6833009123802185, "learning_rate": 3.605093686073694e-05, "loss": 0.3143, "loss_nan_ranks": 0, "loss_rank_avg": 0.3827962875366211, "step": 2245 }, { "epoch": 1.9876325088339222, "grad_norm": 0.5557314157485962, "learning_rate": 3.602461495159131e-05, "loss": 0.3122, "loss_nan_ranks": 0, "loss_rank_avg": 0.28441256284713745, "step": 2250 }, { "epoch": 1.9920494699646643, "grad_norm": 0.6646502017974854, "learning_rate": 3.5998215287806845e-05, "loss": 0.3075, "loss_nan_ranks": 0, "loss_rank_avg": 0.2972768545150757, "step": 2255 }, { "epoch": 1.9964664310954063, "grad_norm": 0.6511381268501282, "learning_rate": 3.597173799748001e-05, "loss": 0.3088, "loss_nan_ranks": 0, "loss_rank_avg": 0.3291865587234497, "step": 2260 }, { "epoch": 2.001766784452297, "grad_norm": 0.5447190999984741, "learning_rate": 3.594518320908391e-05, "loss": 0.2974, "loss_nan_ranks": 0, "loss_rank_avg": 0.28889337182044983, "step": 2265 }, { "epoch": 2.006183745583039, "grad_norm": 0.6198117733001709, "learning_rate": 3.591855105146769e-05, "loss": 0.2677, "loss_nan_ranks": 0, "loss_rank_avg": 0.23140525817871094, "step": 2270 }, { "epoch": 2.010600706713781, "grad_norm": 0.5488595366477966, "learning_rate": 3.589184165385592e-05, "loss": 0.2922, "loss_nan_ranks": 0, "loss_rank_avg": 0.23664027452468872, "step": 2275 }, { "epoch": 2.015017667844523, "grad_norm": 0.5935027599334717, "learning_rate": 3.586505514584793e-05, "loss": 0.2791, "loss_nan_ranks": 0, "loss_rank_avg": 0.3317275643348694, "step": 2280 }, { "epoch": 2.019434628975265, "grad_norm": 0.5912279486656189, "learning_rate": 3.583819165741722e-05, "loss": 0.3277, "loss_nan_ranks": 0, "loss_rank_avg": 0.3340924382209778, "step": 2285 }, { "epoch": 2.0238515901060072, "grad_norm": 0.6450473070144653, "learning_rate": 3.581125131891082e-05, "loss": 0.2556, "loss_nan_ranks": 0, "loss_rank_avg": 0.2265785187482834, "step": 2290 }, { "epoch": 2.0282685512367493, "grad_norm": 0.605987548828125, "learning_rate": 3.578423426104864e-05, "loss": 0.3267, "loss_nan_ranks": 0, "loss_rank_avg": 0.38664746284484863, "step": 2295 }, { "epoch": 2.032685512367491, "grad_norm": 0.6688462495803833, "learning_rate": 3.5757140614922846e-05, "loss": 0.2794, "loss_nan_ranks": 0, "loss_rank_avg": 0.29907870292663574, "step": 2300 }, { "epoch": 2.037102473498233, "grad_norm": 0.6601260900497437, "learning_rate": 3.572997051199724e-05, "loss": 0.3039, "loss_nan_ranks": 0, "loss_rank_avg": 0.4096679091453552, "step": 2305 }, { "epoch": 2.041519434628975, "grad_norm": 0.6559783220291138, "learning_rate": 3.5702724084106596e-05, "loss": 0.2865, "loss_nan_ranks": 0, "loss_rank_avg": 0.2515143156051636, "step": 2310 }, { "epoch": 2.045936395759717, "grad_norm": 0.8877668380737305, "learning_rate": 3.567540146345604e-05, "loss": 0.3152, "loss_nan_ranks": 0, "loss_rank_avg": 0.324682354927063, "step": 2315 }, { "epoch": 2.0503533568904593, "grad_norm": 0.6168680787086487, "learning_rate": 3.5648002782620375e-05, "loss": 0.3071, "loss_nan_ranks": 0, "loss_rank_avg": 0.28638938069343567, "step": 2320 }, { "epoch": 2.0547703180212014, "grad_norm": 0.6127181649208069, "learning_rate": 3.562052817454351e-05, "loss": 0.2835, "loss_nan_ranks": 0, "loss_rank_avg": 0.3344689905643463, "step": 2325 }, { "epoch": 2.0591872791519434, "grad_norm": 0.6770476698875427, "learning_rate": 3.5592977772537734e-05, "loss": 0.2967, "loss_nan_ranks": 0, "loss_rank_avg": 0.257055401802063, "step": 2330 }, { "epoch": 2.0636042402826855, "grad_norm": 0.5903377532958984, "learning_rate": 3.55653517102831e-05, "loss": 0.2975, "loss_nan_ranks": 0, "loss_rank_avg": 0.32297080755233765, "step": 2335 }, { "epoch": 2.0680212014134276, "grad_norm": 0.6872009634971619, "learning_rate": 3.5537650121826804e-05, "loss": 0.2931, "loss_nan_ranks": 0, "loss_rank_avg": 0.3144655227661133, "step": 2340 }, { "epoch": 2.0724381625441697, "grad_norm": 0.6215624213218689, "learning_rate": 3.550987314158249e-05, "loss": 0.3258, "loss_nan_ranks": 0, "loss_rank_avg": 0.35112127661705017, "step": 2345 }, { "epoch": 2.0768551236749118, "grad_norm": 0.6877519488334656, "learning_rate": 3.5482020904329635e-05, "loss": 0.2963, "loss_nan_ranks": 0, "loss_rank_avg": 0.26411569118499756, "step": 2350 }, { "epoch": 2.081272084805654, "grad_norm": 0.8698825240135193, "learning_rate": 3.545409354521286e-05, "loss": 0.3224, "loss_nan_ranks": 0, "loss_rank_avg": 0.33345749974250793, "step": 2355 }, { "epoch": 2.085689045936396, "grad_norm": 0.6604434251785278, "learning_rate": 3.542609119974129e-05, "loss": 0.2875, "loss_nan_ranks": 0, "loss_rank_avg": 0.3097860515117645, "step": 2360 }, { "epoch": 2.090106007067138, "grad_norm": 0.5723004341125488, "learning_rate": 3.539801400378793e-05, "loss": 0.2737, "loss_nan_ranks": 0, "loss_rank_avg": 0.23669950664043427, "step": 2365 }, { "epoch": 2.0945229681978796, "grad_norm": 0.7183104157447815, "learning_rate": 3.5369862093588946e-05, "loss": 0.2733, "loss_nan_ranks": 0, "loss_rank_avg": 0.3131876289844513, "step": 2370 }, { "epoch": 2.0989399293286217, "grad_norm": 0.649411141872406, "learning_rate": 3.534163560574304e-05, "loss": 0.3283, "loss_nan_ranks": 0, "loss_rank_avg": 0.31314218044281006, "step": 2375 }, { "epoch": 2.103356890459364, "grad_norm": 0.659487247467041, "learning_rate": 3.531333467721078e-05, "loss": 0.3206, "loss_nan_ranks": 0, "loss_rank_avg": 0.31890353560447693, "step": 2380 }, { "epoch": 2.107773851590106, "grad_norm": 0.6342830657958984, "learning_rate": 3.5284959445313945e-05, "loss": 0.2959, "loss_nan_ranks": 0, "loss_rank_avg": 0.2846304178237915, "step": 2385 }, { "epoch": 2.112190812720848, "grad_norm": 0.6259862780570984, "learning_rate": 3.525651004773481e-05, "loss": 0.3114, "loss_nan_ranks": 0, "loss_rank_avg": 0.35213717818260193, "step": 2390 }, { "epoch": 2.11660777385159, "grad_norm": 0.6828689575195312, "learning_rate": 3.522798662251558e-05, "loss": 0.3066, "loss_nan_ranks": 0, "loss_rank_avg": 0.35810428857803345, "step": 2395 }, { "epoch": 2.121024734982332, "grad_norm": 0.621423602104187, "learning_rate": 3.51993893080576e-05, "loss": 0.288, "loss_nan_ranks": 0, "loss_rank_avg": 0.23874174058437347, "step": 2400 }, { "epoch": 2.125441696113074, "grad_norm": 0.6099359393119812, "learning_rate": 3.517071824312077e-05, "loss": 0.3052, "loss_nan_ranks": 0, "loss_rank_avg": 0.2990003228187561, "step": 2405 }, { "epoch": 2.1298586572438163, "grad_norm": 0.6000734567642212, "learning_rate": 3.5141973566822843e-05, "loss": 0.2777, "loss_nan_ranks": 0, "loss_rank_avg": 0.2534373700618744, "step": 2410 }, { "epoch": 2.1342756183745584, "grad_norm": 0.5893080830574036, "learning_rate": 3.511315541863873e-05, "loss": 0.2757, "loss_nan_ranks": 0, "loss_rank_avg": 0.2416633665561676, "step": 2415 }, { "epoch": 2.1386925795053005, "grad_norm": 0.6631132960319519, "learning_rate": 3.508426393839986e-05, "loss": 0.3008, "loss_nan_ranks": 0, "loss_rank_avg": 0.28256604075431824, "step": 2420 }, { "epoch": 2.1431095406360425, "grad_norm": 0.5691207647323608, "learning_rate": 3.505529926629348e-05, "loss": 0.2822, "loss_nan_ranks": 0, "loss_rank_avg": 0.2603251338005066, "step": 2425 }, { "epoch": 2.1475265017667846, "grad_norm": 0.632331371307373, "learning_rate": 3.502626154286196e-05, "loss": 0.2722, "loss_nan_ranks": 0, "loss_rank_avg": 0.25206872820854187, "step": 2430 }, { "epoch": 2.1519434628975267, "grad_norm": 0.5492742657661438, "learning_rate": 3.4997150909002156e-05, "loss": 0.2772, "loss_nan_ranks": 0, "loss_rank_avg": 0.21242624521255493, "step": 2435 }, { "epoch": 2.1563604240282688, "grad_norm": 0.6887109279632568, "learning_rate": 3.496796750596469e-05, "loss": 0.3005, "loss_nan_ranks": 0, "loss_rank_avg": 0.33346956968307495, "step": 2440 }, { "epoch": 2.1607773851590104, "grad_norm": 0.6558607816696167, "learning_rate": 3.4938711475353286e-05, "loss": 0.262, "loss_nan_ranks": 0, "loss_rank_avg": 0.22680602967739105, "step": 2445 }, { "epoch": 2.1651943462897525, "grad_norm": 0.6331962943077087, "learning_rate": 3.490938295912404e-05, "loss": 0.3254, "loss_nan_ranks": 0, "loss_rank_avg": 0.319640576839447, "step": 2450 }, { "epoch": 2.1696113074204946, "grad_norm": 0.6158129572868347, "learning_rate": 3.487998209958479e-05, "loss": 0.2815, "loss_nan_ranks": 0, "loss_rank_avg": 0.31720587611198425, "step": 2455 }, { "epoch": 2.1740282685512367, "grad_norm": 0.6651401519775391, "learning_rate": 3.485050903939439e-05, "loss": 0.2918, "loss_nan_ranks": 0, "loss_rank_avg": 0.33852121233940125, "step": 2460 }, { "epoch": 2.1784452296819787, "grad_norm": 0.7507199048995972, "learning_rate": 3.482096392156203e-05, "loss": 0.3105, "loss_nan_ranks": 0, "loss_rank_avg": 0.3842792212963104, "step": 2465 }, { "epoch": 2.182862190812721, "grad_norm": 0.5813544988632202, "learning_rate": 3.4791346889446536e-05, "loss": 0.2912, "loss_nan_ranks": 0, "loss_rank_avg": 0.2993150055408478, "step": 2470 }, { "epoch": 2.187279151943463, "grad_norm": 0.5870715975761414, "learning_rate": 3.476165808675567e-05, "loss": 0.2811, "loss_nan_ranks": 0, "loss_rank_avg": 0.22926348447799683, "step": 2475 }, { "epoch": 2.191696113074205, "grad_norm": 0.6749504208564758, "learning_rate": 3.473189765754544e-05, "loss": 0.3342, "loss_nan_ranks": 0, "loss_rank_avg": 0.2661892771720886, "step": 2480 }, { "epoch": 2.196113074204947, "grad_norm": 0.7036541104316711, "learning_rate": 3.4702065746219416e-05, "loss": 0.3031, "loss_nan_ranks": 0, "loss_rank_avg": 0.28539180755615234, "step": 2485 }, { "epoch": 2.200530035335689, "grad_norm": 0.7815271019935608, "learning_rate": 3.467216249752799e-05, "loss": 0.3215, "loss_nan_ranks": 0, "loss_rank_avg": 0.35631412267684937, "step": 2490 }, { "epoch": 2.204946996466431, "grad_norm": 0.6628273129463196, "learning_rate": 3.4642188056567726e-05, "loss": 0.2966, "loss_nan_ranks": 0, "loss_rank_avg": 0.27998021245002747, "step": 2495 }, { "epoch": 2.2093639575971733, "grad_norm": 0.968639075756073, "learning_rate": 3.461214256878059e-05, "loss": 0.2672, "loss_nan_ranks": 0, "loss_rank_avg": 0.23727664351463318, "step": 2500 }, { "epoch": 2.2137809187279154, "grad_norm": 0.6167461276054382, "learning_rate": 3.458202617995332e-05, "loss": 0.293, "loss_nan_ranks": 0, "loss_rank_avg": 0.30183541774749756, "step": 2505 }, { "epoch": 2.218197879858657, "grad_norm": 0.6023343801498413, "learning_rate": 3.4551839036216645e-05, "loss": 0.2677, "loss_nan_ranks": 0, "loss_rank_avg": 0.2949235141277313, "step": 2510 }, { "epoch": 2.222614840989399, "grad_norm": 0.6811277866363525, "learning_rate": 3.452158128404465e-05, "loss": 0.3118, "loss_nan_ranks": 0, "loss_rank_avg": 0.3465280532836914, "step": 2515 }, { "epoch": 2.227031802120141, "grad_norm": 0.6505920886993408, "learning_rate": 3.449125307025399e-05, "loss": 0.2811, "loss_nan_ranks": 0, "loss_rank_avg": 0.2278938889503479, "step": 2520 }, { "epoch": 2.2314487632508833, "grad_norm": 0.5918983221054077, "learning_rate": 3.446085454200322e-05, "loss": 0.2657, "loss_nan_ranks": 0, "loss_rank_avg": 0.2987041771411896, "step": 2525 }, { "epoch": 2.2358657243816253, "grad_norm": 0.5385729670524597, "learning_rate": 3.44303858467921e-05, "loss": 0.294, "loss_nan_ranks": 0, "loss_rank_avg": 0.25492385029792786, "step": 2530 }, { "epoch": 2.2402826855123674, "grad_norm": 0.6151769757270813, "learning_rate": 3.4399847132460826e-05, "loss": 0.3009, "loss_nan_ranks": 0, "loss_rank_avg": 0.2648699879646301, "step": 2535 }, { "epoch": 2.2446996466431095, "grad_norm": 0.6031373739242554, "learning_rate": 3.436923854718935e-05, "loss": 0.2864, "loss_nan_ranks": 0, "loss_rank_avg": 0.23283502459526062, "step": 2540 }, { "epoch": 2.2491166077738516, "grad_norm": 0.6123268008232117, "learning_rate": 3.433856023949666e-05, "loss": 0.3324, "loss_nan_ranks": 0, "loss_rank_avg": 0.2949381172657013, "step": 2545 }, { "epoch": 2.2535335689045937, "grad_norm": 0.63086998462677, "learning_rate": 3.430781235824006e-05, "loss": 0.3372, "loss_nan_ranks": 0, "loss_rank_avg": 0.3198104500770569, "step": 2550 }, { "epoch": 2.2579505300353357, "grad_norm": 0.5796027779579163, "learning_rate": 3.427699505261439e-05, "loss": 0.276, "loss_nan_ranks": 0, "loss_rank_avg": 0.2592904567718506, "step": 2555 }, { "epoch": 2.262367491166078, "grad_norm": 0.6514537930488586, "learning_rate": 3.4246108472151404e-05, "loss": 0.3106, "loss_nan_ranks": 0, "loss_rank_avg": 0.24681276082992554, "step": 2560 }, { "epoch": 2.26678445229682, "grad_norm": 0.6188172698020935, "learning_rate": 3.421515276671897e-05, "loss": 0.3131, "loss_nan_ranks": 0, "loss_rank_avg": 0.25234729051589966, "step": 2565 }, { "epoch": 2.271201413427562, "grad_norm": 0.614671528339386, "learning_rate": 3.418412808652037e-05, "loss": 0.2993, "loss_nan_ranks": 0, "loss_rank_avg": 0.2817681133747101, "step": 2570 }, { "epoch": 2.275618374558304, "grad_norm": 0.6262189745903015, "learning_rate": 3.4153034582093546e-05, "loss": 0.2756, "loss_nan_ranks": 0, "loss_rank_avg": 0.2680363059043884, "step": 2575 }, { "epoch": 2.280035335689046, "grad_norm": 0.6415190100669861, "learning_rate": 3.412187240431043e-05, "loss": 0.3148, "loss_nan_ranks": 0, "loss_rank_avg": 0.40983670949935913, "step": 2580 }, { "epoch": 2.2844522968197882, "grad_norm": 0.5524982213973999, "learning_rate": 3.409064170437612e-05, "loss": 0.3066, "loss_nan_ranks": 0, "loss_rank_avg": 0.26829278469085693, "step": 2585 }, { "epoch": 2.28886925795053, "grad_norm": 0.5553908944129944, "learning_rate": 3.405934263382824e-05, "loss": 0.2891, "loss_nan_ranks": 0, "loss_rank_avg": 0.24250522255897522, "step": 2590 }, { "epoch": 2.293286219081272, "grad_norm": 0.5957083106040955, "learning_rate": 3.4027975344536125e-05, "loss": 0.2759, "loss_nan_ranks": 0, "loss_rank_avg": 0.2847854495048523, "step": 2595 }, { "epoch": 2.297703180212014, "grad_norm": 0.5808223485946655, "learning_rate": 3.399653998870016e-05, "loss": 0.3083, "loss_nan_ranks": 0, "loss_rank_avg": 0.3035164773464203, "step": 2600 }, { "epoch": 2.302120141342756, "grad_norm": 0.5876525640487671, "learning_rate": 3.396503671885098e-05, "loss": 0.2856, "loss_nan_ranks": 0, "loss_rank_avg": 0.3480638265609741, "step": 2605 }, { "epoch": 2.306537102473498, "grad_norm": 0.5925856232643127, "learning_rate": 3.3933465687848745e-05, "loss": 0.2716, "loss_nan_ranks": 0, "loss_rank_avg": 0.2727441191673279, "step": 2610 }, { "epoch": 2.3109540636042403, "grad_norm": 0.6211825013160706, "learning_rate": 3.390182704888242e-05, "loss": 0.2747, "loss_nan_ranks": 0, "loss_rank_avg": 0.26210927963256836, "step": 2615 }, { "epoch": 2.3153710247349824, "grad_norm": 0.6318957805633545, "learning_rate": 3.387012095546903e-05, "loss": 0.2731, "loss_nan_ranks": 0, "loss_rank_avg": 0.2508987486362457, "step": 2620 }, { "epoch": 2.3197879858657244, "grad_norm": 0.5794128775596619, "learning_rate": 3.3838347561452854e-05, "loss": 0.2676, "loss_nan_ranks": 0, "loss_rank_avg": 0.2536904811859131, "step": 2625 }, { "epoch": 2.3242049469964665, "grad_norm": 0.7687904238700867, "learning_rate": 3.380650702100478e-05, "loss": 0.3206, "loss_nan_ranks": 0, "loss_rank_avg": 0.2745317220687866, "step": 2630 }, { "epoch": 2.3286219081272086, "grad_norm": 0.6696126461029053, "learning_rate": 3.3774599488621477e-05, "loss": 0.2588, "loss_nan_ranks": 0, "loss_rank_avg": 0.34656822681427, "step": 2635 }, { "epoch": 2.3330388692579507, "grad_norm": 0.6203848719596863, "learning_rate": 3.374262511912468e-05, "loss": 0.287, "loss_nan_ranks": 0, "loss_rank_avg": 0.27805572748184204, "step": 2640 }, { "epoch": 2.3374558303886928, "grad_norm": 0.6597334742546082, "learning_rate": 3.371058406766043e-05, "loss": 0.3141, "loss_nan_ranks": 0, "loss_rank_avg": 0.2802439332008362, "step": 2645 }, { "epoch": 2.3418727915194344, "grad_norm": 0.5888747572898865, "learning_rate": 3.3678476489698316e-05, "loss": 0.2838, "loss_nan_ranks": 0, "loss_rank_avg": 0.28658509254455566, "step": 2650 }, { "epoch": 2.3462897526501765, "grad_norm": 0.6427140831947327, "learning_rate": 3.364630254103073e-05, "loss": 0.2916, "loss_nan_ranks": 0, "loss_rank_avg": 0.2904638648033142, "step": 2655 }, { "epoch": 2.3507067137809186, "grad_norm": 0.6213993430137634, "learning_rate": 3.3614062377772124e-05, "loss": 0.2578, "loss_nan_ranks": 0, "loss_rank_avg": 0.25421738624572754, "step": 2660 }, { "epoch": 2.3551236749116606, "grad_norm": 0.6171494722366333, "learning_rate": 3.358175615635821e-05, "loss": 0.2999, "loss_nan_ranks": 0, "loss_rank_avg": 0.2982363998889923, "step": 2665 }, { "epoch": 2.3595406360424027, "grad_norm": 0.6194586157798767, "learning_rate": 3.354938403354524e-05, "loss": 0.2791, "loss_nan_ranks": 0, "loss_rank_avg": 0.27266404032707214, "step": 2670 }, { "epoch": 2.363957597173145, "grad_norm": 0.5762106776237488, "learning_rate": 3.351694616640924e-05, "loss": 0.2736, "loss_nan_ranks": 0, "loss_rank_avg": 0.264001727104187, "step": 2675 }, { "epoch": 2.368374558303887, "grad_norm": 0.7091301679611206, "learning_rate": 3.348444271234523e-05, "loss": 0.2929, "loss_nan_ranks": 0, "loss_rank_avg": 0.24949441850185394, "step": 2680 }, { "epoch": 2.372791519434629, "grad_norm": 0.5496478080749512, "learning_rate": 3.3451873829066474e-05, "loss": 0.2975, "loss_nan_ranks": 0, "loss_rank_avg": 0.23808279633522034, "step": 2685 }, { "epoch": 2.377208480565371, "grad_norm": 0.6888132095336914, "learning_rate": 3.341923967460371e-05, "loss": 0.2851, "loss_nan_ranks": 0, "loss_rank_avg": 0.24560776352882385, "step": 2690 }, { "epoch": 2.381625441696113, "grad_norm": 0.5535579323768616, "learning_rate": 3.338654040730439e-05, "loss": 0.3136, "loss_nan_ranks": 0, "loss_rank_avg": 0.30382639169692993, "step": 2695 }, { "epoch": 2.386042402826855, "grad_norm": 0.5726889967918396, "learning_rate": 3.335377618583191e-05, "loss": 0.3478, "loss_nan_ranks": 0, "loss_rank_avg": 0.355141818523407, "step": 2700 }, { "epoch": 2.3904593639575973, "grad_norm": 0.5942392349243164, "learning_rate": 3.332094716916481e-05, "loss": 0.2869, "loss_nan_ranks": 0, "loss_rank_avg": 0.2685794234275818, "step": 2705 }, { "epoch": 2.3948763250883394, "grad_norm": 0.6522179245948792, "learning_rate": 3.328805351659606e-05, "loss": 0.2872, "loss_nan_ranks": 0, "loss_rank_avg": 0.26475298404693604, "step": 2710 }, { "epoch": 2.3992932862190814, "grad_norm": 0.6337575912475586, "learning_rate": 3.3255095387732245e-05, "loss": 0.2787, "loss_nan_ranks": 0, "loss_rank_avg": 0.304879754781723, "step": 2715 }, { "epoch": 2.4037102473498235, "grad_norm": 0.6819047331809998, "learning_rate": 3.3222072942492807e-05, "loss": 0.283, "loss_nan_ranks": 0, "loss_rank_avg": 0.2330174744129181, "step": 2720 }, { "epoch": 2.4081272084805656, "grad_norm": 0.7528138160705566, "learning_rate": 3.318898634110925e-05, "loss": 0.3262, "loss_nan_ranks": 0, "loss_rank_avg": 0.43806248903274536, "step": 2725 }, { "epoch": 2.4125441696113072, "grad_norm": 0.6072879433631897, "learning_rate": 3.31558357441244e-05, "loss": 0.2902, "loss_nan_ranks": 0, "loss_rank_avg": 0.2504928410053253, "step": 2730 }, { "epoch": 2.4169611307420493, "grad_norm": 0.6802332401275635, "learning_rate": 3.312262131239157e-05, "loss": 0.3299, "loss_nan_ranks": 0, "loss_rank_avg": 0.26598912477493286, "step": 2735 }, { "epoch": 2.4213780918727914, "grad_norm": 0.6019970774650574, "learning_rate": 3.308934320707385e-05, "loss": 0.2914, "loss_nan_ranks": 0, "loss_rank_avg": 0.2811354398727417, "step": 2740 }, { "epoch": 2.4257950530035335, "grad_norm": 0.5732802748680115, "learning_rate": 3.305600158964325e-05, "loss": 0.3168, "loss_nan_ranks": 0, "loss_rank_avg": 0.36934155225753784, "step": 2745 }, { "epoch": 2.4302120141342756, "grad_norm": 0.6369587182998657, "learning_rate": 3.3022596621879976e-05, "loss": 0.3298, "loss_nan_ranks": 0, "loss_rank_avg": 0.3334657549858093, "step": 2750 }, { "epoch": 2.4346289752650176, "grad_norm": 0.5502805709838867, "learning_rate": 3.298912846587162e-05, "loss": 0.2721, "loss_nan_ranks": 0, "loss_rank_avg": 0.21960023045539856, "step": 2755 }, { "epoch": 2.4390459363957597, "grad_norm": 0.5598191618919373, "learning_rate": 3.2955597284012375e-05, "loss": 0.304, "loss_nan_ranks": 0, "loss_rank_avg": 0.28418734669685364, "step": 2760 }, { "epoch": 2.443462897526502, "grad_norm": 1.0691636800765991, "learning_rate": 3.2922003239002234e-05, "loss": 0.322, "loss_nan_ranks": 0, "loss_rank_avg": 0.4324992001056671, "step": 2765 }, { "epoch": 2.447879858657244, "grad_norm": 0.5500887036323547, "learning_rate": 3.288834649384624e-05, "loss": 0.2833, "loss_nan_ranks": 0, "loss_rank_avg": 0.30279844999313354, "step": 2770 }, { "epoch": 2.452296819787986, "grad_norm": 0.6789073348045349, "learning_rate": 3.2854627211853656e-05, "loss": 0.329, "loss_nan_ranks": 0, "loss_rank_avg": 0.3540169894695282, "step": 2775 }, { "epoch": 2.456713780918728, "grad_norm": 0.6984388828277588, "learning_rate": 3.2820845556637173e-05, "loss": 0.3262, "loss_nan_ranks": 0, "loss_rank_avg": 0.26223987340927124, "step": 2780 }, { "epoch": 2.46113074204947, "grad_norm": 0.6759300827980042, "learning_rate": 3.278700169211216e-05, "loss": 0.2892, "loss_nan_ranks": 0, "loss_rank_avg": 0.2541850209236145, "step": 2785 }, { "epoch": 2.4655477031802118, "grad_norm": 0.5834370255470276, "learning_rate": 3.275309578249581e-05, "loss": 0.2874, "loss_nan_ranks": 0, "loss_rank_avg": 0.27517110109329224, "step": 2790 }, { "epoch": 2.469964664310954, "grad_norm": 0.6928835511207581, "learning_rate": 3.2719127992306386e-05, "loss": 0.2761, "loss_nan_ranks": 0, "loss_rank_avg": 0.34850040078163147, "step": 2795 }, { "epoch": 2.474381625441696, "grad_norm": 0.5541810989379883, "learning_rate": 3.26850984863624e-05, "loss": 0.2861, "loss_nan_ranks": 0, "loss_rank_avg": 0.25448691844940186, "step": 2800 }, { "epoch": 2.478798586572438, "grad_norm": 0.5982546210289001, "learning_rate": 3.265100742978183e-05, "loss": 0.2931, "loss_nan_ranks": 0, "loss_rank_avg": 0.21053552627563477, "step": 2805 }, { "epoch": 2.48321554770318, "grad_norm": 0.5980389714241028, "learning_rate": 3.261685498798131e-05, "loss": 0.2993, "loss_nan_ranks": 0, "loss_rank_avg": 0.2833639681339264, "step": 2810 }, { "epoch": 2.487632508833922, "grad_norm": 0.5997810959815979, "learning_rate": 3.258264132667531e-05, "loss": 0.2439, "loss_nan_ranks": 0, "loss_rank_avg": 0.2284521758556366, "step": 2815 }, { "epoch": 2.4920494699646643, "grad_norm": 0.5549051761627197, "learning_rate": 3.254836661187537e-05, "loss": 0.2679, "loss_nan_ranks": 0, "loss_rank_avg": 0.31983861327171326, "step": 2820 }, { "epoch": 2.4964664310954063, "grad_norm": 0.5757570862770081, "learning_rate": 3.2514031009889264e-05, "loss": 0.2843, "loss_nan_ranks": 0, "loss_rank_avg": 0.27677464485168457, "step": 2825 }, { "epoch": 2.5008833922261484, "grad_norm": 0.5566675662994385, "learning_rate": 3.247963468732021e-05, "loss": 0.2926, "loss_nan_ranks": 0, "loss_rank_avg": 0.23760895431041718, "step": 2830 }, { "epoch": 2.5053003533568905, "grad_norm": 0.5576279163360596, "learning_rate": 3.244517781106604e-05, "loss": 0.2898, "loss_nan_ranks": 0, "loss_rank_avg": 0.3000289797782898, "step": 2835 }, { "epoch": 2.5097173144876326, "grad_norm": 0.5720181465148926, "learning_rate": 3.241066054831842e-05, "loss": 0.2766, "loss_nan_ranks": 0, "loss_rank_avg": 0.2972480058670044, "step": 2840 }, { "epoch": 2.5141342756183747, "grad_norm": 0.5405129194259644, "learning_rate": 3.237608306656201e-05, "loss": 0.2677, "loss_nan_ranks": 0, "loss_rank_avg": 0.23669353127479553, "step": 2845 }, { "epoch": 2.5185512367491167, "grad_norm": 0.5755829215049744, "learning_rate": 3.234144553357368e-05, "loss": 0.3027, "loss_nan_ranks": 0, "loss_rank_avg": 0.3481435775756836, "step": 2850 }, { "epoch": 2.522968197879859, "grad_norm": 0.7123222947120667, "learning_rate": 3.230674811742167e-05, "loss": 0.2627, "loss_nan_ranks": 0, "loss_rank_avg": 0.2425704300403595, "step": 2855 }, { "epoch": 2.527385159010601, "grad_norm": 0.6053310632705688, "learning_rate": 3.227199098646479e-05, "loss": 0.2944, "loss_nan_ranks": 0, "loss_rank_avg": 0.3263585567474365, "step": 2860 }, { "epoch": 2.531802120141343, "grad_norm": 0.6052203178405762, "learning_rate": 3.223717430935158e-05, "loss": 0.3646, "loss_nan_ranks": 0, "loss_rank_avg": 0.30958011746406555, "step": 2865 }, { "epoch": 2.536219081272085, "grad_norm": 0.6099016666412354, "learning_rate": 3.2202298255019546e-05, "loss": 0.2613, "loss_nan_ranks": 0, "loss_rank_avg": 0.2909597158432007, "step": 2870 }, { "epoch": 2.5406360424028267, "grad_norm": 0.6662100553512573, "learning_rate": 3.216736299269427e-05, "loss": 0.2772, "loss_nan_ranks": 0, "loss_rank_avg": 0.2982367277145386, "step": 2875 }, { "epoch": 2.545053003533569, "grad_norm": 0.7286942005157471, "learning_rate": 3.213236869188864e-05, "loss": 0.3025, "loss_nan_ranks": 0, "loss_rank_avg": 0.2656930088996887, "step": 2880 }, { "epoch": 2.549469964664311, "grad_norm": 0.5937227606773376, "learning_rate": 3.209731552240201e-05, "loss": 0.2999, "loss_nan_ranks": 0, "loss_rank_avg": 0.34423115849494934, "step": 2885 }, { "epoch": 2.553886925795053, "grad_norm": 0.6218231320381165, "learning_rate": 3.206220365431937e-05, "loss": 0.3097, "loss_nan_ranks": 0, "loss_rank_avg": 0.3695404529571533, "step": 2890 }, { "epoch": 2.558303886925795, "grad_norm": 0.6536095142364502, "learning_rate": 3.202703325801054e-05, "loss": 0.2994, "loss_nan_ranks": 0, "loss_rank_avg": 0.36360839009284973, "step": 2895 }, { "epoch": 2.562720848056537, "grad_norm": 0.6371568441390991, "learning_rate": 3.19918045041293e-05, "loss": 0.3254, "loss_nan_ranks": 0, "loss_rank_avg": 0.3120114803314209, "step": 2900 }, { "epoch": 2.567137809187279, "grad_norm": 0.6116867065429688, "learning_rate": 3.1956517563612645e-05, "loss": 0.3266, "loss_nan_ranks": 0, "loss_rank_avg": 0.31710511445999146, "step": 2905 }, { "epoch": 2.5715547703180213, "grad_norm": 0.6612317562103271, "learning_rate": 3.1921172607679846e-05, "loss": 0.3028, "loss_nan_ranks": 0, "loss_rank_avg": 0.3122076690196991, "step": 2910 }, { "epoch": 2.5759717314487633, "grad_norm": 0.6160558462142944, "learning_rate": 3.1885769807831714e-05, "loss": 0.29, "loss_nan_ranks": 0, "loss_rank_avg": 0.27517688274383545, "step": 2915 }, { "epoch": 2.5803886925795054, "grad_norm": 0.600016176700592, "learning_rate": 3.185030933584972e-05, "loss": 0.3317, "loss_nan_ranks": 0, "loss_rank_avg": 0.3114628791809082, "step": 2920 }, { "epoch": 2.5848056537102475, "grad_norm": 1.3786065578460693, "learning_rate": 3.181479136379518e-05, "loss": 0.314, "loss_nan_ranks": 0, "loss_rank_avg": 0.39571282267570496, "step": 2925 }, { "epoch": 2.589222614840989, "grad_norm": 0.6324872970581055, "learning_rate": 3.177921606400838e-05, "loss": 0.2583, "loss_nan_ranks": 0, "loss_rank_avg": 0.26827818155288696, "step": 2930 }, { "epoch": 2.5936395759717312, "grad_norm": 0.7281625270843506, "learning_rate": 3.1743583609107815e-05, "loss": 0.2935, "loss_nan_ranks": 0, "loss_rank_avg": 0.2841928005218506, "step": 2935 }, { "epoch": 2.5980565371024733, "grad_norm": 0.5587540864944458, "learning_rate": 3.1707894171989266e-05, "loss": 0.2884, "loss_nan_ranks": 0, "loss_rank_avg": 0.2770228981971741, "step": 2940 }, { "epoch": 2.6024734982332154, "grad_norm": 0.6678293347358704, "learning_rate": 3.167214792582505e-05, "loss": 0.3147, "loss_nan_ranks": 0, "loss_rank_avg": 0.30559659004211426, "step": 2945 }, { "epoch": 2.6068904593639575, "grad_norm": 0.6080886125564575, "learning_rate": 3.163634504406309e-05, "loss": 0.2943, "loss_nan_ranks": 0, "loss_rank_avg": 0.2588590383529663, "step": 2950 }, { "epoch": 2.6113074204946995, "grad_norm": 0.5840783715248108, "learning_rate": 3.160048570042614e-05, "loss": 0.2724, "loss_nan_ranks": 0, "loss_rank_avg": 0.2337406575679779, "step": 2955 }, { "epoch": 2.6157243816254416, "grad_norm": 0.5808059573173523, "learning_rate": 3.1564570068910905e-05, "loss": 0.2943, "loss_nan_ranks": 0, "loss_rank_avg": 0.304470419883728, "step": 2960 }, { "epoch": 2.6201413427561837, "grad_norm": 0.5165686011314392, "learning_rate": 3.152859832378723e-05, "loss": 0.2963, "loss_nan_ranks": 0, "loss_rank_avg": 0.3160091042518616, "step": 2965 }, { "epoch": 2.624558303886926, "grad_norm": 0.5811315774917603, "learning_rate": 3.1492570639597216e-05, "loss": 0.2916, "loss_nan_ranks": 0, "loss_rank_avg": 0.22409474849700928, "step": 2970 }, { "epoch": 2.628975265017668, "grad_norm": 0.901731014251709, "learning_rate": 3.145648719115439e-05, "loss": 0.2875, "loss_nan_ranks": 0, "loss_rank_avg": 0.2565080225467682, "step": 2975 }, { "epoch": 2.63339222614841, "grad_norm": 0.6577340364456177, "learning_rate": 3.1420348153542875e-05, "loss": 0.3208, "loss_nan_ranks": 0, "loss_rank_avg": 0.33042359352111816, "step": 2980 }, { "epoch": 2.637809187279152, "grad_norm": 0.5609688758850098, "learning_rate": 3.138415370211651e-05, "loss": 0.3028, "loss_nan_ranks": 0, "loss_rank_avg": 0.30433666706085205, "step": 2985 }, { "epoch": 2.642226148409894, "grad_norm": 0.5504027605056763, "learning_rate": 3.1347904012498015e-05, "loss": 0.2762, "loss_nan_ranks": 0, "loss_rank_avg": 0.27242234349250793, "step": 2990 }, { "epoch": 2.646643109540636, "grad_norm": 0.679768443107605, "learning_rate": 3.1311599260578144e-05, "loss": 0.2736, "loss_nan_ranks": 0, "loss_rank_avg": 0.26140671968460083, "step": 2995 }, { "epoch": 2.6510600706713783, "grad_norm": 1.4311226606369019, "learning_rate": 3.1275239622514805e-05, "loss": 0.2793, "loss_nan_ranks": 0, "loss_rank_avg": 0.25825434923171997, "step": 3000 }, { "epoch": 2.6554770318021204, "grad_norm": 0.6120150685310364, "learning_rate": 3.123882527473226e-05, "loss": 0.2988, "loss_nan_ranks": 0, "loss_rank_avg": 0.33173418045043945, "step": 3005 }, { "epoch": 2.6598939929328624, "grad_norm": 0.7447654008865356, "learning_rate": 3.1202356393920205e-05, "loss": 0.2891, "loss_nan_ranks": 0, "loss_rank_avg": 0.2618862986564636, "step": 3010 }, { "epoch": 2.664310954063604, "grad_norm": 0.6641417145729065, "learning_rate": 3.1165833157032945e-05, "loss": 0.3026, "loss_nan_ranks": 0, "loss_rank_avg": 0.32111161947250366, "step": 3015 }, { "epoch": 2.668727915194346, "grad_norm": 0.5896281599998474, "learning_rate": 3.112925574128853e-05, "loss": 0.3406, "loss_nan_ranks": 0, "loss_rank_avg": 0.3454618453979492, "step": 3020 }, { "epoch": 2.6731448763250882, "grad_norm": 0.5626996159553528, "learning_rate": 3.109262432416791e-05, "loss": 0.2728, "loss_nan_ranks": 0, "loss_rank_avg": 0.25294023752212524, "step": 3025 }, { "epoch": 2.6775618374558303, "grad_norm": 0.6241391897201538, "learning_rate": 3.105593908341405e-05, "loss": 0.3298, "loss_nan_ranks": 0, "loss_rank_avg": 0.4321364164352417, "step": 3030 }, { "epoch": 2.6819787985865724, "grad_norm": 0.5742759704589844, "learning_rate": 3.1019200197031074e-05, "loss": 0.2969, "loss_nan_ranks": 0, "loss_rank_avg": 0.29776108264923096, "step": 3035 }, { "epoch": 2.6863957597173145, "grad_norm": 0.5870106220245361, "learning_rate": 3.098240784328342e-05, "loss": 0.2909, "loss_nan_ranks": 0, "loss_rank_avg": 0.3504473567008972, "step": 3040 }, { "epoch": 2.6908127208480566, "grad_norm": 0.7754055261611938, "learning_rate": 3.094556220069495e-05, "loss": 0.296, "loss_nan_ranks": 0, "loss_rank_avg": 0.3014984726905823, "step": 3045 }, { "epoch": 2.6952296819787986, "grad_norm": 0.6273770928382874, "learning_rate": 3.09086634480481e-05, "loss": 0.3134, "loss_nan_ranks": 0, "loss_rank_avg": 0.26273688673973083, "step": 3050 }, { "epoch": 2.6996466431095407, "grad_norm": 0.6451196074485779, "learning_rate": 3.087171176438299e-05, "loss": 0.2997, "loss_nan_ranks": 0, "loss_rank_avg": 0.32904472947120667, "step": 3055 }, { "epoch": 2.704063604240283, "grad_norm": 0.5997614860534668, "learning_rate": 3.083470732899659e-05, "loss": 0.2636, "loss_nan_ranks": 0, "loss_rank_avg": 0.27674105763435364, "step": 3060 }, { "epoch": 2.708480565371025, "grad_norm": 0.6042181253433228, "learning_rate": 3.0797650321441836e-05, "loss": 0.2955, "loss_nan_ranks": 0, "loss_rank_avg": 0.3244817852973938, "step": 3065 }, { "epoch": 2.7128975265017665, "grad_norm": 0.6343804597854614, "learning_rate": 3.076054092152673e-05, "loss": 0.3018, "loss_nan_ranks": 0, "loss_rank_avg": 0.25237613916397095, "step": 3070 }, { "epoch": 2.7173144876325086, "grad_norm": 0.5651307106018066, "learning_rate": 3.072337930931351e-05, "loss": 0.3081, "loss_nan_ranks": 0, "loss_rank_avg": 0.2834140658378601, "step": 3075 }, { "epoch": 2.7217314487632507, "grad_norm": 0.5973877906799316, "learning_rate": 3.068616566511777e-05, "loss": 0.2835, "loss_nan_ranks": 0, "loss_rank_avg": 0.3114064633846283, "step": 3080 }, { "epoch": 2.7261484098939928, "grad_norm": 0.6461395621299744, "learning_rate": 3.0648900169507546e-05, "loss": 0.3086, "loss_nan_ranks": 0, "loss_rank_avg": 0.3663422465324402, "step": 3085 }, { "epoch": 2.730565371024735, "grad_norm": 0.7704973220825195, "learning_rate": 3.0611583003302483e-05, "loss": 0.2973, "loss_nan_ranks": 0, "loss_rank_avg": 0.324812650680542, "step": 3090 }, { "epoch": 2.734982332155477, "grad_norm": 0.6625204086303711, "learning_rate": 3.0574214347572944e-05, "loss": 0.2868, "loss_nan_ranks": 0, "loss_rank_avg": 0.2915918827056885, "step": 3095 }, { "epoch": 2.739399293286219, "grad_norm": 0.7523000240325928, "learning_rate": 3.0536794383639124e-05, "loss": 0.315, "loss_nan_ranks": 0, "loss_rank_avg": 0.267653226852417, "step": 3100 }, { "epoch": 2.743816254416961, "grad_norm": 0.6316024661064148, "learning_rate": 3.0499323293070168e-05, "loss": 0.2744, "loss_nan_ranks": 0, "loss_rank_avg": 0.2455906867980957, "step": 3105 }, { "epoch": 2.748233215547703, "grad_norm": 0.5501362681388855, "learning_rate": 3.0461801257683316e-05, "loss": 0.2514, "loss_nan_ranks": 0, "loss_rank_avg": 0.24123415350914001, "step": 3110 }, { "epoch": 2.7526501766784452, "grad_norm": 0.5820156335830688, "learning_rate": 3.0424228459542996e-05, "loss": 0.3227, "loss_nan_ranks": 0, "loss_rank_avg": 0.2901195287704468, "step": 3115 }, { "epoch": 2.7570671378091873, "grad_norm": 0.5763217806816101, "learning_rate": 3.0386605080959933e-05, "loss": 0.3368, "loss_nan_ranks": 0, "loss_rank_avg": 0.2486405223608017, "step": 3120 }, { "epoch": 2.7614840989399294, "grad_norm": 0.6018761396408081, "learning_rate": 3.0348931304490308e-05, "loss": 0.3192, "loss_nan_ranks": 0, "loss_rank_avg": 0.3718198835849762, "step": 3125 }, { "epoch": 2.7659010600706715, "grad_norm": 0.6464835405349731, "learning_rate": 3.0311207312934802e-05, "loss": 0.3052, "loss_nan_ranks": 0, "loss_rank_avg": 0.2733161747455597, "step": 3130 }, { "epoch": 2.7703180212014136, "grad_norm": 0.6074162125587463, "learning_rate": 3.0273433289337782e-05, "loss": 0.3438, "loss_nan_ranks": 0, "loss_rank_avg": 0.3934619128704071, "step": 3135 }, { "epoch": 2.7747349823321557, "grad_norm": 0.6861649751663208, "learning_rate": 3.0235609416986382e-05, "loss": 0.3097, "loss_nan_ranks": 0, "loss_rank_avg": 0.3455376625061035, "step": 3140 }, { "epoch": 2.7791519434628977, "grad_norm": 0.546413242816925, "learning_rate": 3.0197735879409582e-05, "loss": 0.2465, "loss_nan_ranks": 0, "loss_rank_avg": 0.3003658056259155, "step": 3145 }, { "epoch": 2.78356890459364, "grad_norm": 0.5838289260864258, "learning_rate": 3.015981286037737e-05, "loss": 0.2401, "loss_nan_ranks": 0, "loss_rank_avg": 0.24027827382087708, "step": 3150 }, { "epoch": 2.787985865724382, "grad_norm": 0.6270075440406799, "learning_rate": 3.0121840543899828e-05, "loss": 0.2884, "loss_nan_ranks": 0, "loss_rank_avg": 0.26620614528656006, "step": 3155 }, { "epoch": 2.7924028268551235, "grad_norm": 0.6166526079177856, "learning_rate": 3.008381911422624e-05, "loss": 0.3056, "loss_nan_ranks": 0, "loss_rank_avg": 0.2833024263381958, "step": 3160 }, { "epoch": 2.7968197879858656, "grad_norm": 0.7598279714584351, "learning_rate": 3.0045748755844183e-05, "loss": 0.2683, "loss_nan_ranks": 0, "loss_rank_avg": 0.23250682651996613, "step": 3165 }, { "epoch": 2.8012367491166077, "grad_norm": 0.5732672810554504, "learning_rate": 3.000762965347866e-05, "loss": 0.3035, "loss_nan_ranks": 0, "loss_rank_avg": 0.28198108077049255, "step": 3170 }, { "epoch": 2.8056537102473498, "grad_norm": 0.6021406650543213, "learning_rate": 2.9969461992091187e-05, "loss": 0.3052, "loss_nan_ranks": 0, "loss_rank_avg": 0.4037885367870331, "step": 3175 }, { "epoch": 2.810070671378092, "grad_norm": 0.5615018606185913, "learning_rate": 2.9931245956878892e-05, "loss": 0.2972, "loss_nan_ranks": 0, "loss_rank_avg": 0.35477539896965027, "step": 3180 }, { "epoch": 2.814487632508834, "grad_norm": 0.6810048818588257, "learning_rate": 2.9892981733273622e-05, "loss": 0.2809, "loss_nan_ranks": 0, "loss_rank_avg": 0.32894670963287354, "step": 3185 }, { "epoch": 2.818904593639576, "grad_norm": 0.6966601014137268, "learning_rate": 2.9854669506941056e-05, "loss": 0.3045, "loss_nan_ranks": 0, "loss_rank_avg": 0.2800629138946533, "step": 3190 }, { "epoch": 2.823321554770318, "grad_norm": 0.6326582431793213, "learning_rate": 2.9816309463779777e-05, "loss": 0.3209, "loss_nan_ranks": 0, "loss_rank_avg": 0.2518727481365204, "step": 3195 }, { "epoch": 2.82773851590106, "grad_norm": 0.6759814023971558, "learning_rate": 2.9777901789920393e-05, "loss": 0.3045, "loss_nan_ranks": 0, "loss_rank_avg": 0.3975909948348999, "step": 3200 }, { "epoch": 2.8321554770318023, "grad_norm": 0.6340915560722351, "learning_rate": 2.9739446671724633e-05, "loss": 0.2956, "loss_nan_ranks": 0, "loss_rank_avg": 0.25144162774086, "step": 3205 }, { "epoch": 2.836572438162544, "grad_norm": 0.6134033203125, "learning_rate": 2.9700944295784416e-05, "loss": 0.3104, "loss_nan_ranks": 0, "loss_rank_avg": 0.3317497670650482, "step": 3210 }, { "epoch": 2.840989399293286, "grad_norm": 0.5460602045059204, "learning_rate": 2.9662394848920976e-05, "loss": 0.247, "loss_nan_ranks": 0, "loss_rank_avg": 0.27905362844467163, "step": 3215 }, { "epoch": 2.845406360424028, "grad_norm": 0.6375061273574829, "learning_rate": 2.962379851818396e-05, "loss": 0.3235, "loss_nan_ranks": 0, "loss_rank_avg": 0.36598873138427734, "step": 3220 }, { "epoch": 2.84982332155477, "grad_norm": 0.5567300319671631, "learning_rate": 2.9585155490850463e-05, "loss": 0.2588, "loss_nan_ranks": 0, "loss_rank_avg": 0.18971426784992218, "step": 3225 }, { "epoch": 2.854240282685512, "grad_norm": 0.6028186678886414, "learning_rate": 2.954646595442421e-05, "loss": 0.2845, "loss_nan_ranks": 0, "loss_rank_avg": 0.3407897353172302, "step": 3230 }, { "epoch": 2.8586572438162543, "grad_norm": 0.5787585377693176, "learning_rate": 2.9507730096634558e-05, "loss": 0.2964, "loss_nan_ranks": 0, "loss_rank_avg": 0.3281695246696472, "step": 3235 }, { "epoch": 2.8630742049469964, "grad_norm": 0.6009767055511475, "learning_rate": 2.9468948105435652e-05, "loss": 0.2745, "loss_nan_ranks": 0, "loss_rank_avg": 0.2663469910621643, "step": 3240 }, { "epoch": 2.8674911660777385, "grad_norm": 0.5785790085792542, "learning_rate": 2.943012016900548e-05, "loss": 0.2944, "loss_nan_ranks": 0, "loss_rank_avg": 0.23294416069984436, "step": 3245 }, { "epoch": 2.8719081272084805, "grad_norm": 0.5688700079917908, "learning_rate": 2.9391246475744952e-05, "loss": 0.303, "loss_nan_ranks": 0, "loss_rank_avg": 0.23604430258274078, "step": 3250 }, { "epoch": 2.8763250883392226, "grad_norm": 0.5440213084220886, "learning_rate": 2.9352327214277002e-05, "loss": 0.3017, "loss_nan_ranks": 0, "loss_rank_avg": 0.29112064838409424, "step": 3255 }, { "epoch": 2.8807420494699647, "grad_norm": 0.5312842726707458, "learning_rate": 2.931336257344569e-05, "loss": 0.2971, "loss_nan_ranks": 0, "loss_rank_avg": 0.25097882747650146, "step": 3260 }, { "epoch": 2.885159010600707, "grad_norm": 0.658263623714447, "learning_rate": 2.9274352742315234e-05, "loss": 0.2663, "loss_nan_ranks": 0, "loss_rank_avg": 0.2677016854286194, "step": 3265 }, { "epoch": 2.889575971731449, "grad_norm": 0.6275796890258789, "learning_rate": 2.923529791016916e-05, "loss": 0.2654, "loss_nan_ranks": 0, "loss_rank_avg": 0.26214462518692017, "step": 3270 }, { "epoch": 2.893992932862191, "grad_norm": 0.6369099020957947, "learning_rate": 2.919619826650932e-05, "loss": 0.3023, "loss_nan_ranks": 0, "loss_rank_avg": 0.27498164772987366, "step": 3275 }, { "epoch": 2.898409893992933, "grad_norm": 0.6062490344047546, "learning_rate": 2.9157054001055007e-05, "loss": 0.2484, "loss_nan_ranks": 0, "loss_rank_avg": 0.35193073749542236, "step": 3280 }, { "epoch": 2.902826855123675, "grad_norm": 0.6960145831108093, "learning_rate": 2.9117865303742043e-05, "loss": 0.2768, "loss_nan_ranks": 0, "loss_rank_avg": 0.1947738081216812, "step": 3285 }, { "epoch": 2.907243816254417, "grad_norm": 0.610952615737915, "learning_rate": 2.9078632364721813e-05, "loss": 0.2925, "loss_nan_ranks": 0, "loss_rank_avg": 0.3309261202812195, "step": 3290 }, { "epoch": 2.9116607773851593, "grad_norm": 0.696971595287323, "learning_rate": 2.903935537436041e-05, "loss": 0.3046, "loss_nan_ranks": 0, "loss_rank_avg": 0.3597220778465271, "step": 3295 }, { "epoch": 2.916077738515901, "grad_norm": 0.5838300585746765, "learning_rate": 2.900003452323764e-05, "loss": 0.2693, "loss_nan_ranks": 0, "loss_rank_avg": 0.2813507318496704, "step": 3300 }, { "epoch": 2.920494699646643, "grad_norm": 0.5652858018875122, "learning_rate": 2.8960670002146138e-05, "loss": 0.2469, "loss_nan_ranks": 0, "loss_rank_avg": 0.2683357000350952, "step": 3305 }, { "epoch": 2.924911660777385, "grad_norm": 0.6993016004562378, "learning_rate": 2.8921262002090443e-05, "loss": 0.2897, "loss_nan_ranks": 0, "loss_rank_avg": 0.26220783591270447, "step": 3310 }, { "epoch": 2.929328621908127, "grad_norm": 0.6500904560089111, "learning_rate": 2.888181071428607e-05, "loss": 0.2631, "loss_nan_ranks": 0, "loss_rank_avg": 0.27039170265197754, "step": 3315 }, { "epoch": 2.9337455830388692, "grad_norm": 0.6717191934585571, "learning_rate": 2.884231633015854e-05, "loss": 0.3335, "loss_nan_ranks": 0, "loss_rank_avg": 0.31322067975997925, "step": 3320 }, { "epoch": 2.9381625441696113, "grad_norm": 0.6734561920166016, "learning_rate": 2.8802779041342527e-05, "loss": 0.2989, "loss_nan_ranks": 0, "loss_rank_avg": 0.28675904870033264, "step": 3325 }, { "epoch": 2.9425795053003534, "grad_norm": 0.5548768043518066, "learning_rate": 2.876319903968086e-05, "loss": 0.2627, "loss_nan_ranks": 0, "loss_rank_avg": 0.23984147608280182, "step": 3330 }, { "epoch": 2.9469964664310955, "grad_norm": 0.5678575038909912, "learning_rate": 2.8723576517223635e-05, "loss": 0.27, "loss_nan_ranks": 0, "loss_rank_avg": 0.26975440979003906, "step": 3335 }, { "epoch": 2.9514134275618376, "grad_norm": 0.5872315764427185, "learning_rate": 2.8683911666227254e-05, "loss": 0.2604, "loss_nan_ranks": 0, "loss_rank_avg": 0.24881170690059662, "step": 3340 }, { "epoch": 2.9558303886925796, "grad_norm": 0.5842449069023132, "learning_rate": 2.864420467915352e-05, "loss": 0.2799, "loss_nan_ranks": 0, "loss_rank_avg": 0.24471619725227356, "step": 3345 }, { "epoch": 2.9602473498233217, "grad_norm": 0.6217790842056274, "learning_rate": 2.8604455748668675e-05, "loss": 0.2745, "loss_nan_ranks": 0, "loss_rank_avg": 0.3837500810623169, "step": 3350 }, { "epoch": 2.9646643109540634, "grad_norm": 0.7379738092422485, "learning_rate": 2.8564665067642485e-05, "loss": 0.3111, "loss_nan_ranks": 0, "loss_rank_avg": 0.3087494373321533, "step": 3355 }, { "epoch": 2.9690812720848054, "grad_norm": 0.6993508338928223, "learning_rate": 2.8524832829147297e-05, "loss": 0.3157, "loss_nan_ranks": 0, "loss_rank_avg": 0.2241922914981842, "step": 3360 }, { "epoch": 2.9734982332155475, "grad_norm": 0.6891131401062012, "learning_rate": 2.8484959226457115e-05, "loss": 0.2835, "loss_nan_ranks": 0, "loss_rank_avg": 0.25775182247161865, "step": 3365 }, { "epoch": 2.9779151943462896, "grad_norm": 0.5844883322715759, "learning_rate": 2.8445044453046624e-05, "loss": 0.2942, "loss_nan_ranks": 0, "loss_rank_avg": 0.285645067691803, "step": 3370 }, { "epoch": 2.9823321554770317, "grad_norm": 0.546260416507721, "learning_rate": 2.8405088702590296e-05, "loss": 0.2498, "loss_nan_ranks": 0, "loss_rank_avg": 0.26289626955986023, "step": 3375 }, { "epoch": 2.9867491166077738, "grad_norm": 0.682931125164032, "learning_rate": 2.8365092168961442e-05, "loss": 0.2906, "loss_nan_ranks": 0, "loss_rank_avg": 0.23991906642913818, "step": 3380 }, { "epoch": 2.991166077738516, "grad_norm": 0.600168764591217, "learning_rate": 2.8325055046231232e-05, "loss": 0.2954, "loss_nan_ranks": 0, "loss_rank_avg": 0.37289607524871826, "step": 3385 }, { "epoch": 2.995583038869258, "grad_norm": 0.6167921423912048, "learning_rate": 2.8284977528667806e-05, "loss": 0.3104, "loss_nan_ranks": 0, "loss_rank_avg": 0.2748357951641083, "step": 3390 }, { "epoch": 3.0008833922261484, "grad_norm": 0.5644031763076782, "learning_rate": 2.8244859810735304e-05, "loss": 0.2734, "loss_nan_ranks": 0, "loss_rank_avg": 0.31137675046920776, "step": 3395 }, { "epoch": 3.0053003533568905, "grad_norm": 0.5274640917778015, "learning_rate": 2.8204702087092907e-05, "loss": 0.2752, "loss_nan_ranks": 0, "loss_rank_avg": 0.24842819571495056, "step": 3400 }, { "epoch": 3.0097173144876326, "grad_norm": 0.6265926361083984, "learning_rate": 2.8164504552593946e-05, "loss": 0.2768, "loss_nan_ranks": 0, "loss_rank_avg": 0.27338293194770813, "step": 3405 }, { "epoch": 3.0141342756183747, "grad_norm": 0.6399211287498474, "learning_rate": 2.8124267402284892e-05, "loss": 0.2868, "loss_nan_ranks": 0, "loss_rank_avg": 0.3242674469947815, "step": 3410 }, { "epoch": 3.0185512367491167, "grad_norm": 0.5678868889808655, "learning_rate": 2.808399083140445e-05, "loss": 0.2472, "loss_nan_ranks": 0, "loss_rank_avg": 0.2346222996711731, "step": 3415 }, { "epoch": 3.022968197879859, "grad_norm": 0.6776866912841797, "learning_rate": 2.804367503538261e-05, "loss": 0.2434, "loss_nan_ranks": 0, "loss_rank_avg": 0.3134090304374695, "step": 3420 }, { "epoch": 3.027385159010601, "grad_norm": 0.6301928758621216, "learning_rate": 2.800332020983968e-05, "loss": 0.297, "loss_nan_ranks": 0, "loss_rank_avg": 0.26638519763946533, "step": 3425 }, { "epoch": 3.0318021201413425, "grad_norm": 0.6153246164321899, "learning_rate": 2.796292655058535e-05, "loss": 0.2712, "loss_nan_ranks": 0, "loss_rank_avg": 0.2534346878528595, "step": 3430 }, { "epoch": 3.0362190812720846, "grad_norm": 0.6437485814094543, "learning_rate": 2.792249425361773e-05, "loss": 0.2873, "loss_nan_ranks": 0, "loss_rank_avg": 0.3017566204071045, "step": 3435 }, { "epoch": 3.0406360424028267, "grad_norm": 0.5970696210861206, "learning_rate": 2.788202351512243e-05, "loss": 0.2605, "loss_nan_ranks": 0, "loss_rank_avg": 0.27321383357048035, "step": 3440 }, { "epoch": 3.045053003533569, "grad_norm": 0.6142945885658264, "learning_rate": 2.7841514531471574e-05, "loss": 0.2623, "loss_nan_ranks": 0, "loss_rank_avg": 0.2881008982658386, "step": 3445 }, { "epoch": 3.049469964664311, "grad_norm": 0.6773906350135803, "learning_rate": 2.7800967499222845e-05, "loss": 0.262, "loss_nan_ranks": 0, "loss_rank_avg": 0.2629165053367615, "step": 3450 }, { "epoch": 3.053886925795053, "grad_norm": 0.545464038848877, "learning_rate": 2.7760382615118562e-05, "loss": 0.25, "loss_nan_ranks": 0, "loss_rank_avg": 0.27945563197135925, "step": 3455 }, { "epoch": 3.058303886925795, "grad_norm": 0.6101946234703064, "learning_rate": 2.7719760076084713e-05, "loss": 0.2938, "loss_nan_ranks": 0, "loss_rank_avg": 0.268838107585907, "step": 3460 }, { "epoch": 3.062720848056537, "grad_norm": 2.510591506958008, "learning_rate": 2.7679100079229982e-05, "loss": 0.2674, "loss_nan_ranks": 0, "loss_rank_avg": 0.20418402552604675, "step": 3465 }, { "epoch": 3.067137809187279, "grad_norm": 0.5778970122337341, "learning_rate": 2.7638402821844808e-05, "loss": 0.281, "loss_nan_ranks": 0, "loss_rank_avg": 0.33357682824134827, "step": 3470 }, { "epoch": 3.0715547703180213, "grad_norm": 0.630087673664093, "learning_rate": 2.7597668501400436e-05, "loss": 0.2591, "loss_nan_ranks": 0, "loss_rank_avg": 0.28215503692626953, "step": 3475 }, { "epoch": 3.0759717314487633, "grad_norm": 0.6290969848632812, "learning_rate": 2.7556897315547934e-05, "loss": 0.29, "loss_nan_ranks": 0, "loss_rank_avg": 0.2777438461780548, "step": 3480 }, { "epoch": 3.0803886925795054, "grad_norm": 0.5843200087547302, "learning_rate": 2.7516089462117265e-05, "loss": 0.2386, "loss_nan_ranks": 0, "loss_rank_avg": 0.195445254445076, "step": 3485 }, { "epoch": 3.0848056537102475, "grad_norm": 0.6632335186004639, "learning_rate": 2.747524513911629e-05, "loss": 0.259, "loss_nan_ranks": 0, "loss_rank_avg": 0.3661026358604431, "step": 3490 }, { "epoch": 3.0892226148409896, "grad_norm": 0.6285350918769836, "learning_rate": 2.7434364544729844e-05, "loss": 0.2747, "loss_nan_ranks": 0, "loss_rank_avg": 0.32909315824508667, "step": 3495 }, { "epoch": 3.0936395759717312, "grad_norm": 0.5703840255737305, "learning_rate": 2.7393447877318756e-05, "loss": 0.2907, "loss_nan_ranks": 0, "loss_rank_avg": 0.34857630729675293, "step": 3500 }, { "epoch": 3.0980565371024733, "grad_norm": 0.5537542104721069, "learning_rate": 2.735249533541888e-05, "loss": 0.2578, "loss_nan_ranks": 0, "loss_rank_avg": 0.3032943606376648, "step": 3505 }, { "epoch": 3.1024734982332154, "grad_norm": 0.6345096826553345, "learning_rate": 2.7311507117740138e-05, "loss": 0.261, "loss_nan_ranks": 0, "loss_rank_avg": 0.2263367921113968, "step": 3510 }, { "epoch": 3.1068904593639575, "grad_norm": 0.671589195728302, "learning_rate": 2.7270483423165578e-05, "loss": 0.2604, "loss_nan_ranks": 0, "loss_rank_avg": 0.23606635630130768, "step": 3515 }, { "epoch": 3.1113074204946995, "grad_norm": 0.6208762526512146, "learning_rate": 2.7229424450750378e-05, "loss": 0.2418, "loss_nan_ranks": 0, "loss_rank_avg": 0.26206400990486145, "step": 3520 }, { "epoch": 3.1157243816254416, "grad_norm": 0.6254077553749084, "learning_rate": 2.7188330399720883e-05, "loss": 0.2593, "loss_nan_ranks": 0, "loss_rank_avg": 0.21936869621276855, "step": 3525 }, { "epoch": 3.1201413427561837, "grad_norm": 0.6420286893844604, "learning_rate": 2.7147201469473645e-05, "loss": 0.2697, "loss_nan_ranks": 0, "loss_rank_avg": 0.28934288024902344, "step": 3530 }, { "epoch": 3.124558303886926, "grad_norm": 0.8817894458770752, "learning_rate": 2.7106037859574482e-05, "loss": 0.27, "loss_nan_ranks": 0, "loss_rank_avg": 0.23228083550930023, "step": 3535 }, { "epoch": 3.128975265017668, "grad_norm": 0.686221718788147, "learning_rate": 2.706483976975746e-05, "loss": 0.2552, "loss_nan_ranks": 0, "loss_rank_avg": 0.2681628465652466, "step": 3540 }, { "epoch": 3.13339222614841, "grad_norm": 0.6326223611831665, "learning_rate": 2.702360739992395e-05, "loss": 0.2812, "loss_nan_ranks": 0, "loss_rank_avg": 0.2811989486217499, "step": 3545 }, { "epoch": 3.137809187279152, "grad_norm": 0.7377673387527466, "learning_rate": 2.698234095014167e-05, "loss": 0.2648, "loss_nan_ranks": 0, "loss_rank_avg": 0.26155680418014526, "step": 3550 }, { "epoch": 3.142226148409894, "grad_norm": 0.6575599312782288, "learning_rate": 2.6941040620643685e-05, "loss": 0.2887, "loss_nan_ranks": 0, "loss_rank_avg": 0.31103795766830444, "step": 3555 }, { "epoch": 3.146643109540636, "grad_norm": 0.6150068044662476, "learning_rate": 2.689970661182747e-05, "loss": 0.2926, "loss_nan_ranks": 0, "loss_rank_avg": 0.25528043508529663, "step": 3560 }, { "epoch": 3.1510600706713783, "grad_norm": 0.5725194215774536, "learning_rate": 2.6858339124253902e-05, "loss": 0.288, "loss_nan_ranks": 0, "loss_rank_avg": 0.25137802958488464, "step": 3565 }, { "epoch": 3.1554770318021204, "grad_norm": 0.5649983286857605, "learning_rate": 2.681693835864631e-05, "loss": 0.2707, "loss_nan_ranks": 0, "loss_rank_avg": 0.2684163451194763, "step": 3570 }, { "epoch": 3.159893992932862, "grad_norm": 0.6405424475669861, "learning_rate": 2.6775504515889498e-05, "loss": 0.2665, "loss_nan_ranks": 0, "loss_rank_avg": 0.27231472730636597, "step": 3575 }, { "epoch": 3.164310954063604, "grad_norm": 0.5784630179405212, "learning_rate": 2.6734037797028764e-05, "loss": 0.298, "loss_nan_ranks": 0, "loss_rank_avg": 0.25174811482429504, "step": 3580 }, { "epoch": 3.168727915194346, "grad_norm": 0.572640061378479, "learning_rate": 2.6692538403268916e-05, "loss": 0.2867, "loss_nan_ranks": 0, "loss_rank_avg": 0.22904187440872192, "step": 3585 }, { "epoch": 3.1731448763250882, "grad_norm": 0.6837023496627808, "learning_rate": 2.6651006535973327e-05, "loss": 0.3015, "loss_nan_ranks": 0, "loss_rank_avg": 0.2275552749633789, "step": 3590 }, { "epoch": 3.1775618374558303, "grad_norm": 0.5804446935653687, "learning_rate": 2.660944239666293e-05, "loss": 0.2832, "loss_nan_ranks": 0, "loss_rank_avg": 0.29134559631347656, "step": 3595 }, { "epoch": 3.1819787985865724, "grad_norm": 0.6767000555992126, "learning_rate": 2.6567846187015245e-05, "loss": 0.2804, "loss_nan_ranks": 0, "loss_rank_avg": 0.3343573808670044, "step": 3600 }, { "epoch": 3.1863957597173145, "grad_norm": 0.615800142288208, "learning_rate": 2.6526218108863408e-05, "loss": 0.3103, "loss_nan_ranks": 0, "loss_rank_avg": 0.42924535274505615, "step": 3605 }, { "epoch": 3.1908127208480566, "grad_norm": 0.6070627570152283, "learning_rate": 2.648455836419518e-05, "loss": 0.262, "loss_nan_ranks": 0, "loss_rank_avg": 0.2295074164867401, "step": 3610 }, { "epoch": 3.1952296819787986, "grad_norm": 0.6150861382484436, "learning_rate": 2.6442867155151984e-05, "loss": 0.2611, "loss_nan_ranks": 0, "loss_rank_avg": 0.24106386303901672, "step": 3615 }, { "epoch": 3.1996466431095407, "grad_norm": 0.6103044748306274, "learning_rate": 2.6401144684027915e-05, "loss": 0.2458, "loss_nan_ranks": 0, "loss_rank_avg": 0.26913753151893616, "step": 3620 }, { "epoch": 3.204063604240283, "grad_norm": 0.6718603372573853, "learning_rate": 2.635939115326874e-05, "loss": 0.2722, "loss_nan_ranks": 0, "loss_rank_avg": 0.28498226404190063, "step": 3625 }, { "epoch": 3.208480565371025, "grad_norm": 0.6954282522201538, "learning_rate": 2.631760676547096e-05, "loss": 0.3019, "loss_nan_ranks": 0, "loss_rank_avg": 0.22322580218315125, "step": 3630 }, { "epoch": 3.212897526501767, "grad_norm": 0.6374308466911316, "learning_rate": 2.6275791723380772e-05, "loss": 0.2825, "loss_nan_ranks": 0, "loss_rank_avg": 0.25081831216812134, "step": 3635 }, { "epoch": 3.2173144876325086, "grad_norm": 0.6096289753913879, "learning_rate": 2.6233946229893147e-05, "loss": 0.2657, "loss_nan_ranks": 0, "loss_rank_avg": 0.23818659782409668, "step": 3640 }, { "epoch": 3.2217314487632507, "grad_norm": 0.6019396185874939, "learning_rate": 2.6192070488050783e-05, "loss": 0.2425, "loss_nan_ranks": 0, "loss_rank_avg": 0.26665595173835754, "step": 3645 }, { "epoch": 3.2261484098939928, "grad_norm": 0.6398298144340515, "learning_rate": 2.615016470104316e-05, "loss": 0.2519, "loss_nan_ranks": 0, "loss_rank_avg": 0.21210452914237976, "step": 3650 }, { "epoch": 3.230565371024735, "grad_norm": 0.5565531253814697, "learning_rate": 2.6108229072205545e-05, "loss": 0.31, "loss_nan_ranks": 0, "loss_rank_avg": 0.2653741240501404, "step": 3655 }, { "epoch": 3.234982332155477, "grad_norm": 0.6310369968414307, "learning_rate": 2.606626380501801e-05, "loss": 0.2921, "loss_nan_ranks": 0, "loss_rank_avg": 0.3396562337875366, "step": 3660 }, { "epoch": 3.239399293286219, "grad_norm": 0.6153047680854797, "learning_rate": 2.6024269103104417e-05, "loss": 0.2716, "loss_nan_ranks": 0, "loss_rank_avg": 0.28231340646743774, "step": 3665 }, { "epoch": 3.243816254416961, "grad_norm": 0.667517900466919, "learning_rate": 2.5982245170231467e-05, "loss": 0.264, "loss_nan_ranks": 0, "loss_rank_avg": 0.32096004486083984, "step": 3670 }, { "epoch": 3.248233215547703, "grad_norm": 0.6767958998680115, "learning_rate": 2.5940192210307697e-05, "loss": 0.2544, "loss_nan_ranks": 0, "loss_rank_avg": 0.3398870825767517, "step": 3675 }, { "epoch": 3.2526501766784452, "grad_norm": 0.6170216798782349, "learning_rate": 2.5898110427382487e-05, "loss": 0.2543, "loss_nan_ranks": 0, "loss_rank_avg": 0.2500012516975403, "step": 3680 }, { "epoch": 3.2570671378091873, "grad_norm": 0.5803675055503845, "learning_rate": 2.5856000025645065e-05, "loss": 0.2803, "loss_nan_ranks": 0, "loss_rank_avg": 0.2736228108406067, "step": 3685 }, { "epoch": 3.2614840989399294, "grad_norm": 0.6387597322463989, "learning_rate": 2.581386120942353e-05, "loss": 0.3275, "loss_nan_ranks": 0, "loss_rank_avg": 0.28693830966949463, "step": 3690 }, { "epoch": 3.2659010600706715, "grad_norm": 0.6041744351387024, "learning_rate": 2.577169418318385e-05, "loss": 0.309, "loss_nan_ranks": 0, "loss_rank_avg": 0.3677545189857483, "step": 3695 }, { "epoch": 3.2703180212014136, "grad_norm": 0.591244637966156, "learning_rate": 2.5729499151528877e-05, "loss": 0.2956, "loss_nan_ranks": 0, "loss_rank_avg": 0.3903142213821411, "step": 3700 }, { "epoch": 3.2747349823321557, "grad_norm": 0.5570688247680664, "learning_rate": 2.568727631919735e-05, "loss": 0.3056, "loss_nan_ranks": 0, "loss_rank_avg": 0.32988813519477844, "step": 3705 }, { "epoch": 3.2791519434628977, "grad_norm": 0.7770934104919434, "learning_rate": 2.5645025891062897e-05, "loss": 0.2645, "loss_nan_ranks": 0, "loss_rank_avg": 0.2805140018463135, "step": 3710 }, { "epoch": 3.28356890459364, "grad_norm": 0.5827252864837646, "learning_rate": 2.5602748072133054e-05, "loss": 0.2863, "loss_nan_ranks": 0, "loss_rank_avg": 0.25772303342819214, "step": 3715 }, { "epoch": 3.2879858657243815, "grad_norm": 0.6013981103897095, "learning_rate": 2.5560443067548263e-05, "loss": 0.2814, "loss_nan_ranks": 0, "loss_rank_avg": 0.24229644238948822, "step": 3720 }, { "epoch": 3.2924028268551235, "grad_norm": 0.5573179721832275, "learning_rate": 2.5518111082580873e-05, "loss": 0.251, "loss_nan_ranks": 0, "loss_rank_avg": 0.2687339782714844, "step": 3725 }, { "epoch": 3.2968197879858656, "grad_norm": 0.5730259418487549, "learning_rate": 2.547575232263414e-05, "loss": 0.2566, "loss_nan_ranks": 0, "loss_rank_avg": 0.2892979681491852, "step": 3730 }, { "epoch": 3.3012367491166077, "grad_norm": 0.578188955783844, "learning_rate": 2.5433366993241252e-05, "loss": 0.2739, "loss_nan_ranks": 0, "loss_rank_avg": 0.25793546438217163, "step": 3735 }, { "epoch": 3.3056537102473498, "grad_norm": 0.6442872881889343, "learning_rate": 2.5390955300064306e-05, "loss": 0.2552, "loss_nan_ranks": 0, "loss_rank_avg": 0.23315435647964478, "step": 3740 }, { "epoch": 3.310070671378092, "grad_norm": 0.6832812428474426, "learning_rate": 2.5348517448893323e-05, "loss": 0.283, "loss_nan_ranks": 0, "loss_rank_avg": 0.28596487641334534, "step": 3745 }, { "epoch": 3.314487632508834, "grad_norm": 0.5652126669883728, "learning_rate": 2.530605364564526e-05, "loss": 0.306, "loss_nan_ranks": 0, "loss_rank_avg": 0.22419892251491547, "step": 3750 }, { "epoch": 3.318904593639576, "grad_norm": 0.5962374806404114, "learning_rate": 2.5263564096362972e-05, "loss": 0.2711, "loss_nan_ranks": 0, "loss_rank_avg": 0.23518124222755432, "step": 3755 }, { "epoch": 3.323321554770318, "grad_norm": 0.5664314031600952, "learning_rate": 2.5221049007214276e-05, "loss": 0.2561, "loss_nan_ranks": 0, "loss_rank_avg": 0.2558237910270691, "step": 3760 }, { "epoch": 3.32773851590106, "grad_norm": 0.5694194436073303, "learning_rate": 2.5178508584490882e-05, "loss": 0.2672, "loss_nan_ranks": 0, "loss_rank_avg": 0.2523389458656311, "step": 3765 }, { "epoch": 3.3321554770318023, "grad_norm": 0.926784873008728, "learning_rate": 2.5135943034607434e-05, "loss": 0.3056, "loss_nan_ranks": 0, "loss_rank_avg": 0.3541930913925171, "step": 3770 }, { "epoch": 3.3365724381625443, "grad_norm": 0.65824294090271, "learning_rate": 2.50933525641005e-05, "loss": 0.2076, "loss_nan_ranks": 0, "loss_rank_avg": 0.19914916157722473, "step": 3775 }, { "epoch": 3.340989399293286, "grad_norm": 0.62986159324646, "learning_rate": 2.5050737379627575e-05, "loss": 0.2822, "loss_nan_ranks": 0, "loss_rank_avg": 0.285235196352005, "step": 3780 }, { "epoch": 3.345406360424028, "grad_norm": 0.5901169776916504, "learning_rate": 2.5008097687966052e-05, "loss": 0.2573, "loss_nan_ranks": 0, "loss_rank_avg": 0.3077230453491211, "step": 3785 }, { "epoch": 3.34982332155477, "grad_norm": 0.6803475618362427, "learning_rate": 2.4965433696012255e-05, "loss": 0.3068, "loss_nan_ranks": 0, "loss_rank_avg": 0.3818191885948181, "step": 3790 }, { "epoch": 3.354240282685512, "grad_norm": 0.5957804918289185, "learning_rate": 2.49227456107804e-05, "loss": 0.3033, "loss_nan_ranks": 0, "loss_rank_avg": 0.3599034249782562, "step": 3795 }, { "epoch": 3.3586572438162543, "grad_norm": 0.5898772478103638, "learning_rate": 2.488003363940163e-05, "loss": 0.2727, "loss_nan_ranks": 0, "loss_rank_avg": 0.275057315826416, "step": 3800 }, { "epoch": 3.3630742049469964, "grad_norm": 0.6106370687484741, "learning_rate": 2.4837297989122987e-05, "loss": 0.2831, "loss_nan_ranks": 0, "loss_rank_avg": 0.31545543670654297, "step": 3805 }, { "epoch": 3.3674911660777385, "grad_norm": 0.5764063596725464, "learning_rate": 2.4794538867306385e-05, "loss": 0.2837, "loss_nan_ranks": 0, "loss_rank_avg": 0.30026909708976746, "step": 3810 }, { "epoch": 3.3719081272084805, "grad_norm": 0.5482817888259888, "learning_rate": 2.4751756481427637e-05, "loss": 0.2699, "loss_nan_ranks": 0, "loss_rank_avg": 0.23257222771644592, "step": 3815 }, { "epoch": 3.3763250883392226, "grad_norm": 0.6209834218025208, "learning_rate": 2.4708951039075462e-05, "loss": 0.2375, "loss_nan_ranks": 0, "loss_rank_avg": 0.21792644262313843, "step": 3820 }, { "epoch": 3.3807420494699647, "grad_norm": 0.5752567648887634, "learning_rate": 2.4666122747950416e-05, "loss": 0.257, "loss_nan_ranks": 0, "loss_rank_avg": 0.2612408995628357, "step": 3825 }, { "epoch": 3.385159010600707, "grad_norm": 0.594068706035614, "learning_rate": 2.4623271815863943e-05, "loss": 0.2552, "loss_nan_ranks": 0, "loss_rank_avg": 0.26158034801483154, "step": 3830 }, { "epoch": 3.389575971731449, "grad_norm": 0.6304320693016052, "learning_rate": 2.4580398450737338e-05, "loss": 0.3036, "loss_nan_ranks": 0, "loss_rank_avg": 0.3264097571372986, "step": 3835 }, { "epoch": 3.393992932862191, "grad_norm": 0.7010661959648132, "learning_rate": 2.4537502860600754e-05, "loss": 0.2842, "loss_nan_ranks": 0, "loss_rank_avg": 0.2843078374862671, "step": 3840 }, { "epoch": 3.398409893992933, "grad_norm": 0.6101694107055664, "learning_rate": 2.4494585253592184e-05, "loss": 0.2717, "loss_nan_ranks": 0, "loss_rank_avg": 0.2982633113861084, "step": 3845 }, { "epoch": 3.402826855123675, "grad_norm": 0.5913658738136292, "learning_rate": 2.445164583795643e-05, "loss": 0.2567, "loss_nan_ranks": 0, "loss_rank_avg": 0.3931722640991211, "step": 3850 }, { "epoch": 3.407243816254417, "grad_norm": 0.6326600313186646, "learning_rate": 2.4408684822044152e-05, "loss": 0.2485, "loss_nan_ranks": 0, "loss_rank_avg": 0.1966112107038498, "step": 3855 }, { "epoch": 3.411660777385159, "grad_norm": 0.5460976958274841, "learning_rate": 2.4365702414310786e-05, "loss": 0.2891, "loss_nan_ranks": 0, "loss_rank_avg": 0.2495535910129547, "step": 3860 }, { "epoch": 3.416077738515901, "grad_norm": 0.5842785835266113, "learning_rate": 2.4322698823315572e-05, "loss": 0.2936, "loss_nan_ranks": 0, "loss_rank_avg": 0.22732970118522644, "step": 3865 }, { "epoch": 3.420494699646643, "grad_norm": 0.5686548948287964, "learning_rate": 2.4279674257720548e-05, "loss": 0.2531, "loss_nan_ranks": 0, "loss_rank_avg": 0.24848207831382751, "step": 3870 }, { "epoch": 3.424911660777385, "grad_norm": 0.6319994926452637, "learning_rate": 2.4236628926289506e-05, "loss": 0.2879, "loss_nan_ranks": 0, "loss_rank_avg": 0.2527256906032562, "step": 3875 }, { "epoch": 3.429328621908127, "grad_norm": 0.9111738204956055, "learning_rate": 2.4193563037887025e-05, "loss": 0.2488, "loss_nan_ranks": 0, "loss_rank_avg": 0.27606528997421265, "step": 3880 }, { "epoch": 3.4337455830388692, "grad_norm": 0.9722045063972473, "learning_rate": 2.4150476801477404e-05, "loss": 0.2452, "loss_nan_ranks": 0, "loss_rank_avg": 0.24321609735488892, "step": 3885 }, { "epoch": 3.4381625441696113, "grad_norm": 0.5848979949951172, "learning_rate": 2.4107370426123685e-05, "loss": 0.2405, "loss_nan_ranks": 0, "loss_rank_avg": 0.2286883294582367, "step": 3890 }, { "epoch": 3.4425795053003534, "grad_norm": 0.7078244686126709, "learning_rate": 2.406424412098664e-05, "loss": 0.2926, "loss_nan_ranks": 0, "loss_rank_avg": 0.2944197654724121, "step": 3895 }, { "epoch": 3.4469964664310955, "grad_norm": 0.516459584236145, "learning_rate": 2.4021098095323713e-05, "loss": 0.2537, "loss_nan_ranks": 0, "loss_rank_avg": 0.23038452863693237, "step": 3900 }, { "epoch": 3.4514134275618376, "grad_norm": 0.6297348737716675, "learning_rate": 2.3977932558488074e-05, "loss": 0.2725, "loss_nan_ranks": 0, "loss_rank_avg": 0.28293001651763916, "step": 3905 }, { "epoch": 3.4558303886925796, "grad_norm": 0.6394320726394653, "learning_rate": 2.3934747719927534e-05, "loss": 0.2794, "loss_nan_ranks": 0, "loss_rank_avg": 0.29240959882736206, "step": 3910 }, { "epoch": 3.4602473498233217, "grad_norm": 0.5845627188682556, "learning_rate": 2.3891543789183573e-05, "loss": 0.2999, "loss_nan_ranks": 0, "loss_rank_avg": 0.34419286251068115, "step": 3915 }, { "epoch": 3.464664310954064, "grad_norm": 0.5385047197341919, "learning_rate": 2.3848320975890316e-05, "loss": 0.2683, "loss_nan_ranks": 0, "loss_rank_avg": 0.23887717723846436, "step": 3920 }, { "epoch": 3.4690812720848054, "grad_norm": 0.5908883810043335, "learning_rate": 2.3805079489773508e-05, "loss": 0.2442, "loss_nan_ranks": 0, "loss_rank_avg": 0.2729918360710144, "step": 3925 }, { "epoch": 3.4734982332155475, "grad_norm": 0.6180974841117859, "learning_rate": 2.376181954064948e-05, "loss": 0.2982, "loss_nan_ranks": 0, "loss_rank_avg": 0.2929500937461853, "step": 3930 }, { "epoch": 3.4779151943462896, "grad_norm": 0.5845738649368286, "learning_rate": 2.3718541338424176e-05, "loss": 0.2994, "loss_nan_ranks": 0, "loss_rank_avg": 0.2808518409729004, "step": 3935 }, { "epoch": 3.4823321554770317, "grad_norm": 0.7396969795227051, "learning_rate": 2.3675245093092082e-05, "loss": 0.2486, "loss_nan_ranks": 0, "loss_rank_avg": 0.2241256982088089, "step": 3940 }, { "epoch": 3.4867491166077738, "grad_norm": 0.6327768564224243, "learning_rate": 2.3631931014735258e-05, "loss": 0.2812, "loss_nan_ranks": 0, "loss_rank_avg": 0.2858206033706665, "step": 3945 }, { "epoch": 3.491166077738516, "grad_norm": 0.6281271576881409, "learning_rate": 2.358859931352227e-05, "loss": 0.2562, "loss_nan_ranks": 0, "loss_rank_avg": 0.22891634702682495, "step": 3950 }, { "epoch": 3.495583038869258, "grad_norm": 0.636397659778595, "learning_rate": 2.3545250199707207e-05, "loss": 0.2416, "loss_nan_ranks": 0, "loss_rank_avg": 0.2959528863430023, "step": 3955 }, { "epoch": 3.5, "grad_norm": 0.6934898495674133, "learning_rate": 2.350188388362865e-05, "loss": 0.2578, "loss_nan_ranks": 0, "loss_rank_avg": 0.3115631937980652, "step": 3960 }, { "epoch": 3.504416961130742, "grad_norm": 0.5889595150947571, "learning_rate": 2.3458500575708642e-05, "loss": 0.2683, "loss_nan_ranks": 0, "loss_rank_avg": 0.31395983695983887, "step": 3965 }, { "epoch": 3.508833922261484, "grad_norm": 0.6764131188392639, "learning_rate": 2.341510048645167e-05, "loss": 0.2803, "loss_nan_ranks": 0, "loss_rank_avg": 0.301558256149292, "step": 3970 }, { "epoch": 3.5132508833922262, "grad_norm": 0.6070583462715149, "learning_rate": 2.337168382644367e-05, "loss": 0.3018, "loss_nan_ranks": 0, "loss_rank_avg": 0.3173370063304901, "step": 3975 }, { "epoch": 3.5176678445229683, "grad_norm": 0.5840939283370972, "learning_rate": 2.332825080635094e-05, "loss": 0.3153, "loss_nan_ranks": 0, "loss_rank_avg": 0.3090516924858093, "step": 3980 }, { "epoch": 3.5220848056537104, "grad_norm": 0.7202004790306091, "learning_rate": 2.3284801636919205e-05, "loss": 0.2874, "loss_nan_ranks": 0, "loss_rank_avg": 0.41266167163848877, "step": 3985 }, { "epoch": 3.5265017667844525, "grad_norm": 0.6619871258735657, "learning_rate": 2.3241336528972522e-05, "loss": 0.2659, "loss_nan_ranks": 0, "loss_rank_avg": 0.2852621078491211, "step": 3990 }, { "epoch": 3.5309187279151946, "grad_norm": 0.5971029996871948, "learning_rate": 2.3197855693412295e-05, "loss": 0.2901, "loss_nan_ranks": 0, "loss_rank_avg": 0.24810791015625, "step": 3995 }, { "epoch": 3.5353356890459366, "grad_norm": 0.5984452962875366, "learning_rate": 2.3154359341216243e-05, "loss": 0.2733, "loss_nan_ranks": 0, "loss_rank_avg": 0.23727092146873474, "step": 4000 }, { "epoch": 3.5397526501766783, "grad_norm": 0.5858767628669739, "learning_rate": 2.311084768343737e-05, "loss": 0.2487, "loss_nan_ranks": 0, "loss_rank_avg": 0.27304431796073914, "step": 4005 }, { "epoch": 3.5441696113074204, "grad_norm": 0.6337104439735413, "learning_rate": 2.306732093120295e-05, "loss": 0.2892, "loss_nan_ranks": 0, "loss_rank_avg": 0.37451407313346863, "step": 4010 }, { "epoch": 3.5485865724381624, "grad_norm": 0.6166836023330688, "learning_rate": 2.3023779295713497e-05, "loss": 0.2833, "loss_nan_ranks": 0, "loss_rank_avg": 0.25973236560821533, "step": 4015 }, { "epoch": 3.5530035335689045, "grad_norm": 0.6111301183700562, "learning_rate": 2.2980222988241733e-05, "loss": 0.2633, "loss_nan_ranks": 0, "loss_rank_avg": 0.25971752405166626, "step": 4020 }, { "epoch": 3.5574204946996466, "grad_norm": 0.6352372765541077, "learning_rate": 2.293665222013158e-05, "loss": 0.2422, "loss_nan_ranks": 0, "loss_rank_avg": 0.21252036094665527, "step": 4025 }, { "epoch": 3.5618374558303887, "grad_norm": 0.6189213395118713, "learning_rate": 2.2893067202797136e-05, "loss": 0.2314, "loss_nan_ranks": 0, "loss_rank_avg": 0.22057734429836273, "step": 4030 }, { "epoch": 3.5662544169611308, "grad_norm": 0.5966265201568604, "learning_rate": 2.2849468147721615e-05, "loss": 0.27, "loss_nan_ranks": 0, "loss_rank_avg": 0.26105162501335144, "step": 4035 }, { "epoch": 3.570671378091873, "grad_norm": 0.7433478832244873, "learning_rate": 2.280585526645637e-05, "loss": 0.272, "loss_nan_ranks": 0, "loss_rank_avg": 0.2277737855911255, "step": 4040 }, { "epoch": 3.575088339222615, "grad_norm": 0.6259749531745911, "learning_rate": 2.2762228770619815e-05, "loss": 0.2872, "loss_nan_ranks": 0, "loss_rank_avg": 0.1985919028520584, "step": 4045 }, { "epoch": 3.579505300353357, "grad_norm": 0.6301651000976562, "learning_rate": 2.2718588871896454e-05, "loss": 0.2657, "loss_nan_ranks": 0, "loss_rank_avg": 0.3158514201641083, "step": 4050 }, { "epoch": 3.583922261484099, "grad_norm": 0.556121289730072, "learning_rate": 2.2674935782035804e-05, "loss": 0.2521, "loss_nan_ranks": 0, "loss_rank_avg": 0.23911920189857483, "step": 4055 }, { "epoch": 3.5883392226148407, "grad_norm": 0.5992854833602905, "learning_rate": 2.2631269712851385e-05, "loss": 0.2349, "loss_nan_ranks": 0, "loss_rank_avg": 0.23733431100845337, "step": 4060 }, { "epoch": 3.592756183745583, "grad_norm": 0.6029264330863953, "learning_rate": 2.258759087621971e-05, "loss": 0.26, "loss_nan_ranks": 0, "loss_rank_avg": 0.30784931778907776, "step": 4065 }, { "epoch": 3.597173144876325, "grad_norm": 0.6338687539100647, "learning_rate": 2.2543899484079245e-05, "loss": 0.299, "loss_nan_ranks": 0, "loss_rank_avg": 0.26149436831474304, "step": 4070 }, { "epoch": 3.601590106007067, "grad_norm": 0.5709251761436462, "learning_rate": 2.2500195748429352e-05, "loss": 0.2602, "loss_nan_ranks": 0, "loss_rank_avg": 0.26341015100479126, "step": 4075 }, { "epoch": 3.606007067137809, "grad_norm": 0.6511635780334473, "learning_rate": 2.2456479881329315e-05, "loss": 0.2457, "loss_nan_ranks": 0, "loss_rank_avg": 0.2105947583913803, "step": 4080 }, { "epoch": 3.610424028268551, "grad_norm": 0.5920666456222534, "learning_rate": 2.2412752094897267e-05, "loss": 0.2718, "loss_nan_ranks": 0, "loss_rank_avg": 0.25122541189193726, "step": 4085 }, { "epoch": 3.614840989399293, "grad_norm": 0.6184494495391846, "learning_rate": 2.236901260130918e-05, "loss": 0.2614, "loss_nan_ranks": 0, "loss_rank_avg": 0.19458022713661194, "step": 4090 }, { "epoch": 3.6192579505300353, "grad_norm": 0.5893781781196594, "learning_rate": 2.2325261612797832e-05, "loss": 0.2518, "loss_nan_ranks": 0, "loss_rank_avg": 0.26221320033073425, "step": 4095 }, { "epoch": 3.6236749116607774, "grad_norm": 0.6632538437843323, "learning_rate": 2.2281499341651767e-05, "loss": 0.268, "loss_nan_ranks": 0, "loss_rank_avg": 0.2836889624595642, "step": 4100 }, { "epoch": 3.6280918727915195, "grad_norm": 0.6132526993751526, "learning_rate": 2.223772600021429e-05, "loss": 0.2728, "loss_nan_ranks": 0, "loss_rank_avg": 0.3212493658065796, "step": 4105 }, { "epoch": 3.6325088339222615, "grad_norm": 0.5957457423210144, "learning_rate": 2.2193941800882418e-05, "loss": 0.3153, "loss_nan_ranks": 0, "loss_rank_avg": 0.4252464175224304, "step": 4110 }, { "epoch": 3.6369257950530036, "grad_norm": 0.6638500094413757, "learning_rate": 2.2150146956105836e-05, "loss": 0.3003, "loss_nan_ranks": 0, "loss_rank_avg": 0.2801547646522522, "step": 4115 }, { "epoch": 3.6413427561837457, "grad_norm": 0.6362109184265137, "learning_rate": 2.210634167838591e-05, "loss": 0.2801, "loss_nan_ranks": 0, "loss_rank_avg": 0.36497941613197327, "step": 4120 }, { "epoch": 3.645759717314488, "grad_norm": 0.7648757100105286, "learning_rate": 2.2062526180274607e-05, "loss": 0.2378, "loss_nan_ranks": 0, "loss_rank_avg": 0.2305619865655899, "step": 4125 }, { "epoch": 3.65017667844523, "grad_norm": 0.5891234278678894, "learning_rate": 2.2018700674373487e-05, "loss": 0.2642, "loss_nan_ranks": 0, "loss_rank_avg": 0.23286129534244537, "step": 4130 }, { "epoch": 3.654593639575972, "grad_norm": 0.5884703993797302, "learning_rate": 2.1974865373332695e-05, "loss": 0.281, "loss_nan_ranks": 0, "loss_rank_avg": 0.2888106107711792, "step": 4135 }, { "epoch": 3.659010600706714, "grad_norm": 0.5825828313827515, "learning_rate": 2.1931020489849865e-05, "loss": 0.2649, "loss_nan_ranks": 0, "loss_rank_avg": 0.2594233453273773, "step": 4140 }, { "epoch": 3.663427561837456, "grad_norm": 0.6837796568870544, "learning_rate": 2.1887166236669154e-05, "loss": 0.2716, "loss_nan_ranks": 0, "loss_rank_avg": 0.23577167093753815, "step": 4145 }, { "epoch": 3.6678445229681977, "grad_norm": 0.558169960975647, "learning_rate": 2.184330282658018e-05, "loss": 0.2425, "loss_nan_ranks": 0, "loss_rank_avg": 0.22032378613948822, "step": 4150 }, { "epoch": 3.67226148409894, "grad_norm": 0.5991285443305969, "learning_rate": 2.1799430472416975e-05, "loss": 0.31, "loss_nan_ranks": 0, "loss_rank_avg": 0.24205946922302246, "step": 4155 }, { "epoch": 3.676678445229682, "grad_norm": 0.5339157581329346, "learning_rate": 2.1755549387056997e-05, "loss": 0.2795, "loss_nan_ranks": 0, "loss_rank_avg": 0.24257515370845795, "step": 4160 }, { "epoch": 3.681095406360424, "grad_norm": 0.5839236378669739, "learning_rate": 2.1711659783420043e-05, "loss": 0.2469, "loss_nan_ranks": 0, "loss_rank_avg": 0.3155287802219391, "step": 4165 }, { "epoch": 3.685512367491166, "grad_norm": 0.6157808303833008, "learning_rate": 2.1667761874467256e-05, "loss": 0.282, "loss_nan_ranks": 0, "loss_rank_avg": 0.3029802739620209, "step": 4170 }, { "epoch": 3.689929328621908, "grad_norm": 0.6158527731895447, "learning_rate": 2.162385587320008e-05, "loss": 0.2547, "loss_nan_ranks": 0, "loss_rank_avg": 0.24819602072238922, "step": 4175 }, { "epoch": 3.6943462897526502, "grad_norm": 0.6458650231361389, "learning_rate": 2.1579941992659214e-05, "loss": 0.2515, "loss_nan_ranks": 0, "loss_rank_avg": 0.2597864866256714, "step": 4180 }, { "epoch": 3.6987632508833923, "grad_norm": 0.6469552516937256, "learning_rate": 2.1536020445923595e-05, "loss": 0.2546, "loss_nan_ranks": 0, "loss_rank_avg": 0.21913045644760132, "step": 4185 }, { "epoch": 3.7031802120141344, "grad_norm": 0.6282652616500854, "learning_rate": 2.1492091446109372e-05, "loss": 0.2705, "loss_nan_ranks": 0, "loss_rank_avg": 0.23371529579162598, "step": 4190 }, { "epoch": 3.7075971731448765, "grad_norm": 0.5427141785621643, "learning_rate": 2.1448155206368823e-05, "loss": 0.2806, "loss_nan_ranks": 0, "loss_rank_avg": 0.26559561491012573, "step": 4195 }, { "epoch": 3.712014134275618, "grad_norm": 0.6435478329658508, "learning_rate": 2.1404211939889392e-05, "loss": 0.2435, "loss_nan_ranks": 0, "loss_rank_avg": 0.22901174426078796, "step": 4200 }, { "epoch": 3.71643109540636, "grad_norm": 0.553631067276001, "learning_rate": 2.1360261859892594e-05, "loss": 0.3053, "loss_nan_ranks": 0, "loss_rank_avg": 0.3381679654121399, "step": 4205 }, { "epoch": 3.7208480565371023, "grad_norm": 0.6866286993026733, "learning_rate": 2.1316305179633016e-05, "loss": 0.3006, "loss_nan_ranks": 0, "loss_rank_avg": 0.3275683522224426, "step": 4210 }, { "epoch": 3.7252650176678443, "grad_norm": 0.6093161106109619, "learning_rate": 2.1272342112397272e-05, "loss": 0.2419, "loss_nan_ranks": 0, "loss_rank_avg": 0.21553654968738556, "step": 4215 }, { "epoch": 3.7296819787985864, "grad_norm": 0.5793571472167969, "learning_rate": 2.1228372871502955e-05, "loss": 0.2842, "loss_nan_ranks": 0, "loss_rank_avg": 0.24574121832847595, "step": 4220 }, { "epoch": 3.7340989399293285, "grad_norm": 0.6067283153533936, "learning_rate": 2.1184397670297624e-05, "loss": 0.2336, "loss_nan_ranks": 0, "loss_rank_avg": 0.2613888382911682, "step": 4225 }, { "epoch": 3.7385159010600706, "grad_norm": 0.6006746292114258, "learning_rate": 2.1140416722157765e-05, "loss": 0.2854, "loss_nan_ranks": 0, "loss_rank_avg": 0.38783538341522217, "step": 4230 }, { "epoch": 3.7429328621908127, "grad_norm": 0.5855950117111206, "learning_rate": 2.1096430240487723e-05, "loss": 0.253, "loss_nan_ranks": 0, "loss_rank_avg": 0.2570192515850067, "step": 4235 }, { "epoch": 3.7473498233215548, "grad_norm": 0.632363498210907, "learning_rate": 2.105243843871873e-05, "loss": 0.2336, "loss_nan_ranks": 0, "loss_rank_avg": 0.24486814439296722, "step": 4240 }, { "epoch": 3.751766784452297, "grad_norm": 0.6137506365776062, "learning_rate": 2.100844153030779e-05, "loss": 0.2863, "loss_nan_ranks": 0, "loss_rank_avg": 0.21526217460632324, "step": 4245 }, { "epoch": 3.756183745583039, "grad_norm": 0.567816972732544, "learning_rate": 2.096443972873673e-05, "loss": 0.2202, "loss_nan_ranks": 0, "loss_rank_avg": 0.21241986751556396, "step": 4250 }, { "epoch": 3.760600706713781, "grad_norm": 0.6972650289535522, "learning_rate": 2.0920433247511092e-05, "loss": 0.2904, "loss_nan_ranks": 0, "loss_rank_avg": 0.3464621901512146, "step": 4255 }, { "epoch": 3.765017667844523, "grad_norm": 0.5928968191146851, "learning_rate": 2.087642230015912e-05, "loss": 0.2641, "loss_nan_ranks": 0, "loss_rank_avg": 0.24683159589767456, "step": 4260 }, { "epoch": 3.769434628975265, "grad_norm": 0.8208529353141785, "learning_rate": 2.0832407100230747e-05, "loss": 0.2684, "loss_nan_ranks": 0, "loss_rank_avg": 0.30278170108795166, "step": 4265 }, { "epoch": 3.7738515901060072, "grad_norm": 0.5523584485054016, "learning_rate": 2.078838786129653e-05, "loss": 0.276, "loss_nan_ranks": 0, "loss_rank_avg": 0.28194743394851685, "step": 4270 }, { "epoch": 3.7782685512367493, "grad_norm": 0.6361742615699768, "learning_rate": 2.0744364796946624e-05, "loss": 0.2752, "loss_nan_ranks": 0, "loss_rank_avg": 0.3716488778591156, "step": 4275 }, { "epoch": 3.7826855123674914, "grad_norm": 0.6412866711616516, "learning_rate": 2.0700338120789754e-05, "loss": 0.2855, "loss_nan_ranks": 0, "loss_rank_avg": 0.24867978692054749, "step": 4280 }, { "epoch": 3.7871024734982335, "grad_norm": 0.6166922450065613, "learning_rate": 2.0656308046452157e-05, "loss": 0.2542, "loss_nan_ranks": 0, "loss_rank_avg": 0.25477123260498047, "step": 4285 }, { "epoch": 3.791519434628975, "grad_norm": 0.5771932601928711, "learning_rate": 2.0612274787576565e-05, "loss": 0.2917, "loss_nan_ranks": 0, "loss_rank_avg": 0.2894698977470398, "step": 4290 }, { "epoch": 3.795936395759717, "grad_norm": 0.645137369632721, "learning_rate": 2.0568238557821175e-05, "loss": 0.2617, "loss_nan_ranks": 0, "loss_rank_avg": 0.2622634470462799, "step": 4295 }, { "epoch": 3.8003533568904593, "grad_norm": 0.6164206266403198, "learning_rate": 2.0524199570858573e-05, "loss": 0.2591, "loss_nan_ranks": 0, "loss_rank_avg": 0.28416186571121216, "step": 4300 }, { "epoch": 3.8047703180212014, "grad_norm": 0.6087357401847839, "learning_rate": 2.048015804037474e-05, "loss": 0.2536, "loss_nan_ranks": 0, "loss_rank_avg": 0.22203269600868225, "step": 4305 }, { "epoch": 3.8091872791519434, "grad_norm": 0.6240975260734558, "learning_rate": 2.0436114180068008e-05, "loss": 0.2946, "loss_nan_ranks": 0, "loss_rank_avg": 0.3069062829017639, "step": 4310 }, { "epoch": 3.8136042402826855, "grad_norm": 0.5895731449127197, "learning_rate": 2.039206820364798e-05, "loss": 0.2882, "loss_nan_ranks": 0, "loss_rank_avg": 0.3107955753803253, "step": 4315 }, { "epoch": 3.8180212014134276, "grad_norm": 0.6357214450836182, "learning_rate": 2.034802032483457e-05, "loss": 0.2684, "loss_nan_ranks": 0, "loss_rank_avg": 0.2024916559457779, "step": 4320 }, { "epoch": 3.8224381625441697, "grad_norm": 0.6543165445327759, "learning_rate": 2.0303970757356894e-05, "loss": 0.251, "loss_nan_ranks": 0, "loss_rank_avg": 0.2537229657173157, "step": 4325 }, { "epoch": 3.8268551236749118, "grad_norm": 0.6982588768005371, "learning_rate": 2.025991971495226e-05, "loss": 0.2544, "loss_nan_ranks": 0, "loss_rank_avg": 0.2701827883720398, "step": 4330 }, { "epoch": 3.831272084805654, "grad_norm": 0.5591413378715515, "learning_rate": 2.021586741136516e-05, "loss": 0.2919, "loss_nan_ranks": 0, "loss_rank_avg": 0.3260684907436371, "step": 4335 }, { "epoch": 3.835689045936396, "grad_norm": 0.7472929954528809, "learning_rate": 2.017181406034617e-05, "loss": 0.2375, "loss_nan_ranks": 0, "loss_rank_avg": 0.20827150344848633, "step": 4340 }, { "epoch": 3.8401060070671376, "grad_norm": 0.8127717971801758, "learning_rate": 2.0127759875650974e-05, "loss": 0.2724, "loss_nan_ranks": 0, "loss_rank_avg": 0.18227070569992065, "step": 4345 }, { "epoch": 3.8445229681978796, "grad_norm": 0.5871464610099792, "learning_rate": 2.0083705071039297e-05, "loss": 0.2649, "loss_nan_ranks": 0, "loss_rank_avg": 0.307786226272583, "step": 4350 }, { "epoch": 3.8489399293286217, "grad_norm": 0.6023159623146057, "learning_rate": 2.0039649860273855e-05, "loss": 0.2563, "loss_nan_ranks": 0, "loss_rank_avg": 0.27349045872688293, "step": 4355 }, { "epoch": 3.853356890459364, "grad_norm": 1.6955772638320923, "learning_rate": 1.9995594457119364e-05, "loss": 0.2609, "loss_nan_ranks": 0, "loss_rank_avg": 0.24877923727035522, "step": 4360 }, { "epoch": 3.857773851590106, "grad_norm": 0.6980836987495422, "learning_rate": 1.995153907534145e-05, "loss": 0.3256, "loss_nan_ranks": 0, "loss_rank_avg": 0.317715585231781, "step": 4365 }, { "epoch": 3.862190812720848, "grad_norm": 0.6564183831214905, "learning_rate": 1.990748392870563e-05, "loss": 0.2733, "loss_nan_ranks": 0, "loss_rank_avg": 0.3203606605529785, "step": 4370 }, { "epoch": 3.86660777385159, "grad_norm": 0.5388532876968384, "learning_rate": 1.986342923097631e-05, "loss": 0.3132, "loss_nan_ranks": 0, "loss_rank_avg": 0.2857687771320343, "step": 4375 }, { "epoch": 3.871024734982332, "grad_norm": 0.5364903211593628, "learning_rate": 1.98193751959157e-05, "loss": 0.2478, "loss_nan_ranks": 0, "loss_rank_avg": 0.21258722245693207, "step": 4380 }, { "epoch": 3.875441696113074, "grad_norm": 0.636022686958313, "learning_rate": 1.977532203728278e-05, "loss": 0.3042, "loss_nan_ranks": 0, "loss_rank_avg": 0.355568528175354, "step": 4385 }, { "epoch": 3.8798586572438163, "grad_norm": 0.5986488461494446, "learning_rate": 1.9731269968832305e-05, "loss": 0.2595, "loss_nan_ranks": 0, "loss_rank_avg": 0.2478679120540619, "step": 4390 }, { "epoch": 3.8842756183745584, "grad_norm": 0.5643515586853027, "learning_rate": 1.9687219204313717e-05, "loss": 0.2887, "loss_nan_ranks": 0, "loss_rank_avg": 0.21933361887931824, "step": 4395 }, { "epoch": 3.8886925795053005, "grad_norm": 0.6250163316726685, "learning_rate": 1.9643169957470157e-05, "loss": 0.2878, "loss_nan_ranks": 0, "loss_rank_avg": 0.29644590616226196, "step": 4400 }, { "epoch": 3.8931095406360425, "grad_norm": 0.6132106781005859, "learning_rate": 1.959912244203737e-05, "loss": 0.256, "loss_nan_ranks": 0, "loss_rank_avg": 0.24946492910385132, "step": 4405 }, { "epoch": 3.8975265017667846, "grad_norm": 0.596892774105072, "learning_rate": 1.9555076871742734e-05, "loss": 0.2952, "loss_nan_ranks": 0, "loss_rank_avg": 0.24082526564598083, "step": 4410 }, { "epoch": 3.9019434628975267, "grad_norm": 0.5782345533370972, "learning_rate": 1.951103346030415e-05, "loss": 0.2534, "loss_nan_ranks": 0, "loss_rank_avg": 0.2679441571235657, "step": 4415 }, { "epoch": 3.9063604240282688, "grad_norm": 0.5399941205978394, "learning_rate": 1.9466992421429076e-05, "loss": 0.2328, "loss_nan_ranks": 0, "loss_rank_avg": 0.18268732726573944, "step": 4420 }, { "epoch": 3.910777385159011, "grad_norm": 0.6017982959747314, "learning_rate": 1.9422953968813454e-05, "loss": 0.2697, "loss_nan_ranks": 0, "loss_rank_avg": 0.27265119552612305, "step": 4425 }, { "epoch": 3.9151943462897525, "grad_norm": 0.7943002581596375, "learning_rate": 1.937891831614066e-05, "loss": 0.256, "loss_nan_ranks": 0, "loss_rank_avg": 0.2352999448776245, "step": 4430 }, { "epoch": 3.9196113074204946, "grad_norm": 0.6090646386146545, "learning_rate": 1.93348856770805e-05, "loss": 0.2937, "loss_nan_ranks": 0, "loss_rank_avg": 0.3457292914390564, "step": 4435 }, { "epoch": 3.9240282685512367, "grad_norm": 0.584635317325592, "learning_rate": 1.929085626528814e-05, "loss": 0.2579, "loss_nan_ranks": 0, "loss_rank_avg": 0.30109745264053345, "step": 4440 }, { "epoch": 3.9284452296819787, "grad_norm": 0.6093024611473083, "learning_rate": 1.9246830294403108e-05, "loss": 0.2978, "loss_nan_ranks": 0, "loss_rank_avg": 0.3744407594203949, "step": 4445 }, { "epoch": 3.932862190812721, "grad_norm": 0.594610333442688, "learning_rate": 1.920280797804822e-05, "loss": 0.2748, "loss_nan_ranks": 0, "loss_rank_avg": 0.25274011492729187, "step": 4450 }, { "epoch": 3.937279151943463, "grad_norm": 0.5948688387870789, "learning_rate": 1.915878952982857e-05, "loss": 0.3056, "loss_nan_ranks": 0, "loss_rank_avg": 0.2250659167766571, "step": 4455 }, { "epoch": 3.941696113074205, "grad_norm": 0.608898937702179, "learning_rate": 1.911477516333048e-05, "loss": 0.2758, "loss_nan_ranks": 0, "loss_rank_avg": 0.2589750289916992, "step": 4460 }, { "epoch": 3.946113074204947, "grad_norm": 0.710386335849762, "learning_rate": 1.907076509212046e-05, "loss": 0.2521, "loss_nan_ranks": 0, "loss_rank_avg": 0.19801661372184753, "step": 4465 }, { "epoch": 3.950530035335689, "grad_norm": 0.6093432307243347, "learning_rate": 1.9026759529744187e-05, "loss": 0.3113, "loss_nan_ranks": 0, "loss_rank_avg": 0.26044270396232605, "step": 4470 }, { "epoch": 3.954946996466431, "grad_norm": 0.546137809753418, "learning_rate": 1.8982758689725447e-05, "loss": 0.2627, "loss_nan_ranks": 0, "loss_rank_avg": 0.19887375831604004, "step": 4475 }, { "epoch": 3.9593639575971733, "grad_norm": 0.5550782084465027, "learning_rate": 1.8938762785565137e-05, "loss": 0.2416, "loss_nan_ranks": 0, "loss_rank_avg": 0.2474510371685028, "step": 4480 }, { "epoch": 3.963780918727915, "grad_norm": 0.5750576853752136, "learning_rate": 1.8894772030740182e-05, "loss": 0.2648, "loss_nan_ranks": 0, "loss_rank_avg": 0.26418790221214294, "step": 4485 }, { "epoch": 3.968197879858657, "grad_norm": 0.6318395137786865, "learning_rate": 1.8850786638702528e-05, "loss": 0.29, "loss_nan_ranks": 0, "loss_rank_avg": 0.2796365022659302, "step": 4490 }, { "epoch": 3.972614840989399, "grad_norm": 0.6644579172134399, "learning_rate": 1.88068068228781e-05, "loss": 0.2798, "loss_nan_ranks": 0, "loss_rank_avg": 0.20628443360328674, "step": 4495 }, { "epoch": 3.977031802120141, "grad_norm": 0.6782865524291992, "learning_rate": 1.876283279666576e-05, "loss": 0.2907, "loss_nan_ranks": 0, "loss_rank_avg": 0.3842836320400238, "step": 4500 }, { "epoch": 3.9814487632508833, "grad_norm": 0.5653501152992249, "learning_rate": 1.87188647734363e-05, "loss": 0.2882, "loss_nan_ranks": 0, "loss_rank_avg": 0.2926984429359436, "step": 4505 }, { "epoch": 3.9858657243816253, "grad_norm": 0.6898596286773682, "learning_rate": 1.8674902966531354e-05, "loss": 0.2813, "loss_nan_ranks": 0, "loss_rank_avg": 0.2633523941040039, "step": 4510 }, { "epoch": 3.9902826855123674, "grad_norm": 0.5879707336425781, "learning_rate": 1.8630947589262417e-05, "loss": 0.2905, "loss_nan_ranks": 0, "loss_rank_avg": 0.27371013164520264, "step": 4515 }, { "epoch": 3.9946996466431095, "grad_norm": 0.6024365425109863, "learning_rate": 1.858699885490977e-05, "loss": 0.2682, "loss_nan_ranks": 0, "loss_rank_avg": 0.26837724447250366, "step": 4520 }, { "epoch": 3.9991166077738516, "grad_norm": 0.7211329340934753, "learning_rate": 1.8543056976721472e-05, "loss": 0.2526, "loss_nan_ranks": 0, "loss_rank_avg": 0.221034437417984, "step": 4525 }, { "epoch": 4.004416961130742, "grad_norm": 0.5379669070243835, "learning_rate": 1.84991221679123e-05, "loss": 0.2219, "loss_nan_ranks": 0, "loss_rank_avg": 0.1902405172586441, "step": 4530 }, { "epoch": 4.008833922261484, "grad_norm": 0.546558141708374, "learning_rate": 1.845519464166275e-05, "loss": 0.2414, "loss_nan_ranks": 0, "loss_rank_avg": 0.19735684990882874, "step": 4535 }, { "epoch": 4.013250883392226, "grad_norm": 0.6087609529495239, "learning_rate": 1.8411274611117974e-05, "loss": 0.2617, "loss_nan_ranks": 0, "loss_rank_avg": 0.2241283804178238, "step": 4540 }, { "epoch": 4.017667844522968, "grad_norm": 0.5435627698898315, "learning_rate": 1.836736228938674e-05, "loss": 0.2354, "loss_nan_ranks": 0, "loss_rank_avg": 0.20003776252269745, "step": 4545 }, { "epoch": 4.02208480565371, "grad_norm": 0.7864016890525818, "learning_rate": 1.832345788954043e-05, "loss": 0.2476, "loss_nan_ranks": 0, "loss_rank_avg": 0.20103882253170013, "step": 4550 }, { "epoch": 4.0265017667844525, "grad_norm": 0.6416686773300171, "learning_rate": 1.8279561624611962e-05, "loss": 0.2612, "loss_nan_ranks": 0, "loss_rank_avg": 0.22663789987564087, "step": 4555 }, { "epoch": 4.030918727915195, "grad_norm": 0.5910441875457764, "learning_rate": 1.8235673707594822e-05, "loss": 0.2545, "loss_nan_ranks": 0, "loss_rank_avg": 0.24896244704723358, "step": 4560 }, { "epoch": 4.035335689045937, "grad_norm": 0.6123059988021851, "learning_rate": 1.819179435144195e-05, "loss": 0.2485, "loss_nan_ranks": 0, "loss_rank_avg": 0.3180444836616516, "step": 4565 }, { "epoch": 4.039752650176679, "grad_norm": 0.6581359505653381, "learning_rate": 1.8147923769064776e-05, "loss": 0.2517, "loss_nan_ranks": 0, "loss_rank_avg": 0.23575004935264587, "step": 4570 }, { "epoch": 4.044169611307421, "grad_norm": 0.6589792370796204, "learning_rate": 1.8104062173332134e-05, "loss": 0.242, "loss_nan_ranks": 0, "loss_rank_avg": 0.2652082145214081, "step": 4575 }, { "epoch": 4.048586572438163, "grad_norm": 0.6208878755569458, "learning_rate": 1.8060209777069267e-05, "loss": 0.2391, "loss_nan_ranks": 0, "loss_rank_avg": 0.2510373592376709, "step": 4580 }, { "epoch": 4.053003533568905, "grad_norm": 0.7314044833183289, "learning_rate": 1.801636679305679e-05, "loss": 0.2537, "loss_nan_ranks": 0, "loss_rank_avg": 0.23933479189872742, "step": 4585 }, { "epoch": 4.057420494699647, "grad_norm": 0.7164866328239441, "learning_rate": 1.797253343402962e-05, "loss": 0.2101, "loss_nan_ranks": 0, "loss_rank_avg": 0.2189275324344635, "step": 4590 }, { "epoch": 4.061837455830389, "grad_norm": 0.7242477536201477, "learning_rate": 1.7928709912676e-05, "loss": 0.2666, "loss_nan_ranks": 0, "loss_rank_avg": 0.23622474074363708, "step": 4595 }, { "epoch": 4.06625441696113, "grad_norm": 0.6429215669631958, "learning_rate": 1.788489644163642e-05, "loss": 0.2344, "loss_nan_ranks": 0, "loss_rank_avg": 0.2727344036102295, "step": 4600 }, { "epoch": 4.070671378091872, "grad_norm": 0.6153692603111267, "learning_rate": 1.784109323350261e-05, "loss": 0.2724, "loss_nan_ranks": 0, "loss_rank_avg": 0.3209673762321472, "step": 4605 }, { "epoch": 4.0750883392226145, "grad_norm": 0.6157684922218323, "learning_rate": 1.77973005008165e-05, "loss": 0.2666, "loss_nan_ranks": 0, "loss_rank_avg": 0.25213688611984253, "step": 4610 }, { "epoch": 4.079505300353357, "grad_norm": 0.5954070687294006, "learning_rate": 1.7753518456069198e-05, "loss": 0.2325, "loss_nan_ranks": 0, "loss_rank_avg": 0.23194274306297302, "step": 4615 }, { "epoch": 4.083922261484099, "grad_norm": 0.6085749864578247, "learning_rate": 1.770974731169995e-05, "loss": 0.2456, "loss_nan_ranks": 0, "loss_rank_avg": 0.23410743474960327, "step": 4620 }, { "epoch": 4.088339222614841, "grad_norm": 0.5981534123420715, "learning_rate": 1.76659872800951e-05, "loss": 0.2454, "loss_nan_ranks": 0, "loss_rank_avg": 0.2934369444847107, "step": 4625 }, { "epoch": 4.092756183745583, "grad_norm": 0.6048264503479004, "learning_rate": 1.7622238573587093e-05, "loss": 0.2482, "loss_nan_ranks": 0, "loss_rank_avg": 0.3395196795463562, "step": 4630 }, { "epoch": 4.097173144876325, "grad_norm": 0.5977709293365479, "learning_rate": 1.7578501404453388e-05, "loss": 0.281, "loss_nan_ranks": 0, "loss_rank_avg": 0.25631198287010193, "step": 4635 }, { "epoch": 4.101590106007067, "grad_norm": 0.6589481234550476, "learning_rate": 1.7534775984915503e-05, "loss": 0.2383, "loss_nan_ranks": 0, "loss_rank_avg": 0.2767007350921631, "step": 4640 }, { "epoch": 4.106007067137809, "grad_norm": 0.6230020523071289, "learning_rate": 1.7491062527137912e-05, "loss": 0.2795, "loss_nan_ranks": 0, "loss_rank_avg": 0.27050620317459106, "step": 4645 }, { "epoch": 4.110424028268551, "grad_norm": 0.6690418720245361, "learning_rate": 1.744736124322707e-05, "loss": 0.2497, "loss_nan_ranks": 0, "loss_rank_avg": 0.26453179121017456, "step": 4650 }, { "epoch": 4.114840989399293, "grad_norm": 0.5822317004203796, "learning_rate": 1.7403672345230342e-05, "loss": 0.2322, "loss_nan_ranks": 0, "loss_rank_avg": 0.24094851315021515, "step": 4655 }, { "epoch": 4.119257950530035, "grad_norm": 0.6514151692390442, "learning_rate": 1.7359996045135007e-05, "loss": 0.2192, "loss_nan_ranks": 0, "loss_rank_avg": 0.1770108938217163, "step": 4660 }, { "epoch": 4.123674911660777, "grad_norm": 0.5683082938194275, "learning_rate": 1.7316332554867224e-05, "loss": 0.2665, "loss_nan_ranks": 0, "loss_rank_avg": 0.2293209284543991, "step": 4665 }, { "epoch": 4.1280918727915195, "grad_norm": 0.6644694805145264, "learning_rate": 1.7272682086290982e-05, "loss": 0.2602, "loss_nan_ranks": 0, "loss_rank_avg": 0.411828875541687, "step": 4670 }, { "epoch": 4.1325088339222615, "grad_norm": 0.6769957542419434, "learning_rate": 1.722904485120709e-05, "loss": 0.2498, "loss_nan_ranks": 0, "loss_rank_avg": 0.23401038348674774, "step": 4675 }, { "epoch": 4.136925795053004, "grad_norm": 0.615993857383728, "learning_rate": 1.7185421061352135e-05, "loss": 0.2403, "loss_nan_ranks": 0, "loss_rank_avg": 0.2185857892036438, "step": 4680 }, { "epoch": 4.141342756183746, "grad_norm": 0.680887758731842, "learning_rate": 1.7141810928397495e-05, "loss": 0.2512, "loss_nan_ranks": 0, "loss_rank_avg": 0.2617225646972656, "step": 4685 }, { "epoch": 4.145759717314488, "grad_norm": 0.6120012998580933, "learning_rate": 1.7098214663948243e-05, "loss": 0.2467, "loss_nan_ranks": 0, "loss_rank_avg": 0.24213114380836487, "step": 4690 }, { "epoch": 4.15017667844523, "grad_norm": 0.655899703502655, "learning_rate": 1.7054632479542196e-05, "loss": 0.2392, "loss_nan_ranks": 0, "loss_rank_avg": 0.2701648473739624, "step": 4695 }, { "epoch": 4.154593639575972, "grad_norm": 0.6427193880081177, "learning_rate": 1.7011064586648828e-05, "loss": 0.2549, "loss_nan_ranks": 0, "loss_rank_avg": 0.36555686593055725, "step": 4700 }, { "epoch": 4.159010600706714, "grad_norm": 0.77866530418396, "learning_rate": 1.6967511196668277e-05, "loss": 0.2433, "loss_nan_ranks": 0, "loss_rank_avg": 0.2713695466518402, "step": 4705 }, { "epoch": 4.163427561837456, "grad_norm": 0.6055473685264587, "learning_rate": 1.6923972520930307e-05, "loss": 0.2595, "loss_nan_ranks": 0, "loss_rank_avg": 0.2517254948616028, "step": 4710 }, { "epoch": 4.167844522968198, "grad_norm": 0.6935552358627319, "learning_rate": 1.688044877069328e-05, "loss": 0.2606, "loss_nan_ranks": 0, "loss_rank_avg": 0.2867121696472168, "step": 4715 }, { "epoch": 4.17226148409894, "grad_norm": 0.5792904496192932, "learning_rate": 1.6836940157143152e-05, "loss": 0.2477, "loss_nan_ranks": 0, "loss_rank_avg": 0.26725780963897705, "step": 4720 }, { "epoch": 4.176678445229682, "grad_norm": 0.6815539002418518, "learning_rate": 1.6793446891392422e-05, "loss": 0.2758, "loss_nan_ranks": 0, "loss_rank_avg": 0.31988525390625, "step": 4725 }, { "epoch": 4.181095406360424, "grad_norm": 0.6169085502624512, "learning_rate": 1.6749969184479116e-05, "loss": 0.2304, "loss_nan_ranks": 0, "loss_rank_avg": 0.24607014656066895, "step": 4730 }, { "epoch": 4.1855123674911665, "grad_norm": 0.6831942200660706, "learning_rate": 1.670650724736577e-05, "loss": 0.2323, "loss_nan_ranks": 0, "loss_rank_avg": 0.23659402132034302, "step": 4735 }, { "epoch": 4.189929328621908, "grad_norm": 0.6516426205635071, "learning_rate": 1.66630612909384e-05, "loss": 0.2365, "loss_nan_ranks": 0, "loss_rank_avg": 0.2049470990896225, "step": 4740 }, { "epoch": 4.19434628975265, "grad_norm": 0.7550818920135498, "learning_rate": 1.661963152600549e-05, "loss": 0.2397, "loss_nan_ranks": 0, "loss_rank_avg": 0.22731956839561462, "step": 4745 }, { "epoch": 4.198763250883392, "grad_norm": 0.641147255897522, "learning_rate": 1.657621816329694e-05, "loss": 0.2502, "loss_nan_ranks": 0, "loss_rank_avg": 0.20782433450222015, "step": 4750 }, { "epoch": 4.203180212014134, "grad_norm": 0.6107928156852722, "learning_rate": 1.6532821413463083e-05, "loss": 0.2476, "loss_nan_ranks": 0, "loss_rank_avg": 0.23882174491882324, "step": 4755 }, { "epoch": 4.207597173144876, "grad_norm": 0.621147096157074, "learning_rate": 1.648944148707363e-05, "loss": 0.2452, "loss_nan_ranks": 0, "loss_rank_avg": 0.2622299790382385, "step": 4760 }, { "epoch": 4.212014134275618, "grad_norm": 0.7022213339805603, "learning_rate": 1.6446078594616666e-05, "loss": 0.2463, "loss_nan_ranks": 0, "loss_rank_avg": 0.18493574857711792, "step": 4765 }, { "epoch": 4.21643109540636, "grad_norm": 0.6667992472648621, "learning_rate": 1.640273294649762e-05, "loss": 0.2593, "loss_nan_ranks": 0, "loss_rank_avg": 0.3022603392601013, "step": 4770 }, { "epoch": 4.220848056537102, "grad_norm": 0.6353417038917542, "learning_rate": 1.635940475303826e-05, "loss": 0.2516, "loss_nan_ranks": 0, "loss_rank_avg": 0.28547731041908264, "step": 4775 }, { "epoch": 4.225265017667844, "grad_norm": 0.703382670879364, "learning_rate": 1.631609422447565e-05, "loss": 0.2362, "loss_nan_ranks": 0, "loss_rank_avg": 0.18541069328784943, "step": 4780 }, { "epoch": 4.229681978798586, "grad_norm": 0.5781343579292297, "learning_rate": 1.6272801570961136e-05, "loss": 0.2179, "loss_nan_ranks": 0, "loss_rank_avg": 0.21327394247055054, "step": 4785 }, { "epoch": 4.2340989399293285, "grad_norm": 0.628350019454956, "learning_rate": 1.6229527002559346e-05, "loss": 0.2669, "loss_nan_ranks": 0, "loss_rank_avg": 0.22581510245800018, "step": 4790 }, { "epoch": 4.238515901060071, "grad_norm": 0.6018873453140259, "learning_rate": 1.6186270729247137e-05, "loss": 0.2395, "loss_nan_ranks": 0, "loss_rank_avg": 0.2645873725414276, "step": 4795 }, { "epoch": 4.242932862190813, "grad_norm": 0.6109693646430969, "learning_rate": 1.614303296091262e-05, "loss": 0.2536, "loss_nan_ranks": 0, "loss_rank_avg": 0.23807910084724426, "step": 4800 }, { "epoch": 4.247349823321555, "grad_norm": 0.6540105938911438, "learning_rate": 1.6099813907354077e-05, "loss": 0.2925, "loss_nan_ranks": 0, "loss_rank_avg": 0.3849636912345886, "step": 4805 }, { "epoch": 4.251766784452297, "grad_norm": 0.603500485420227, "learning_rate": 1.6056613778279026e-05, "loss": 0.253, "loss_nan_ranks": 0, "loss_rank_avg": 0.2598439157009125, "step": 4810 }, { "epoch": 4.256183745583039, "grad_norm": 0.6518641710281372, "learning_rate": 1.6013432783303133e-05, "loss": 0.2903, "loss_nan_ranks": 0, "loss_rank_avg": 0.30824732780456543, "step": 4815 }, { "epoch": 4.260600706713781, "grad_norm": 0.6308322548866272, "learning_rate": 1.5970271131949213e-05, "loss": 0.2959, "loss_nan_ranks": 0, "loss_rank_avg": 0.22987233102321625, "step": 4820 }, { "epoch": 4.265017667844523, "grad_norm": 0.5965448617935181, "learning_rate": 1.5927129033646264e-05, "loss": 0.2509, "loss_nan_ranks": 0, "loss_rank_avg": 0.3175223171710968, "step": 4825 }, { "epoch": 4.269434628975265, "grad_norm": 0.6295101642608643, "learning_rate": 1.588400669772836e-05, "loss": 0.2623, "loss_nan_ranks": 0, "loss_rank_avg": 0.2827831506729126, "step": 4830 }, { "epoch": 4.273851590106007, "grad_norm": 0.592918872833252, "learning_rate": 1.5840904333433717e-05, "loss": 0.2439, "loss_nan_ranks": 0, "loss_rank_avg": 0.25347912311553955, "step": 4835 }, { "epoch": 4.278268551236749, "grad_norm": 0.6142850518226624, "learning_rate": 1.5797822149903625e-05, "loss": 0.2262, "loss_nan_ranks": 0, "loss_rank_avg": 0.2786719501018524, "step": 4840 }, { "epoch": 4.282685512367491, "grad_norm": 0.6769436597824097, "learning_rate": 1.575476035618147e-05, "loss": 0.253, "loss_nan_ranks": 0, "loss_rank_avg": 0.27305734157562256, "step": 4845 }, { "epoch": 4.2871024734982335, "grad_norm": 0.6391955018043518, "learning_rate": 1.5711719161211674e-05, "loss": 0.2378, "loss_nan_ranks": 0, "loss_rank_avg": 0.2296718955039978, "step": 4850 }, { "epoch": 4.291519434628976, "grad_norm": 0.6884132623672485, "learning_rate": 1.5668698773838746e-05, "loss": 0.2877, "loss_nan_ranks": 0, "loss_rank_avg": 0.24689459800720215, "step": 4855 }, { "epoch": 4.295936395759718, "grad_norm": 0.6045219898223877, "learning_rate": 1.562569940280622e-05, "loss": 0.229, "loss_nan_ranks": 0, "loss_rank_avg": 0.24978384375572205, "step": 4860 }, { "epoch": 4.30035335689046, "grad_norm": 0.8001757860183716, "learning_rate": 1.5582721256755632e-05, "loss": 0.243, "loss_nan_ranks": 0, "loss_rank_avg": 0.16442768275737762, "step": 4865 }, { "epoch": 4.304770318021202, "grad_norm": 0.6849569082260132, "learning_rate": 1.5539764544225565e-05, "loss": 0.2499, "loss_nan_ranks": 0, "loss_rank_avg": 0.1961660087108612, "step": 4870 }, { "epoch": 4.309187279151944, "grad_norm": 0.6818974614143372, "learning_rate": 1.5496829473650568e-05, "loss": 0.2427, "loss_nan_ranks": 0, "loss_rank_avg": 0.24043825268745422, "step": 4875 }, { "epoch": 4.313604240282686, "grad_norm": 0.5977321863174438, "learning_rate": 1.5453916253360218e-05, "loss": 0.2614, "loss_nan_ranks": 0, "loss_rank_avg": 0.2922767102718353, "step": 4880 }, { "epoch": 4.318021201413427, "grad_norm": 0.6175761818885803, "learning_rate": 1.5411025091578025e-05, "loss": 0.2386, "loss_nan_ranks": 0, "loss_rank_avg": 0.21561937034130096, "step": 4885 }, { "epoch": 4.322438162544169, "grad_norm": 0.6610462069511414, "learning_rate": 1.5368156196420506e-05, "loss": 0.3025, "loss_nan_ranks": 0, "loss_rank_avg": 0.3540098965167999, "step": 4890 }, { "epoch": 4.326855123674911, "grad_norm": 0.5722936987876892, "learning_rate": 1.5325309775896117e-05, "loss": 0.2698, "loss_nan_ranks": 0, "loss_rank_avg": 0.2856179475784302, "step": 4895 }, { "epoch": 4.331272084805653, "grad_norm": 0.5998113751411438, "learning_rate": 1.5282486037904253e-05, "loss": 0.2487, "loss_nan_ranks": 0, "loss_rank_avg": 0.23048731684684753, "step": 4900 }, { "epoch": 4.3356890459363955, "grad_norm": 0.5948532819747925, "learning_rate": 1.5239685190234287e-05, "loss": 0.2564, "loss_nan_ranks": 0, "loss_rank_avg": 0.24808341264724731, "step": 4905 }, { "epoch": 4.340106007067138, "grad_norm": 0.6422690749168396, "learning_rate": 1.519690744056447e-05, "loss": 0.2617, "loss_nan_ranks": 0, "loss_rank_avg": 0.20361298322677612, "step": 4910 }, { "epoch": 4.34452296819788, "grad_norm": 0.6684097051620483, "learning_rate": 1.5154152996461026e-05, "loss": 0.2334, "loss_nan_ranks": 0, "loss_rank_avg": 0.17725619673728943, "step": 4915 }, { "epoch": 4.348939929328622, "grad_norm": 0.6210095286369324, "learning_rate": 1.5111422065377062e-05, "loss": 0.2756, "loss_nan_ranks": 0, "loss_rank_avg": 0.34692680835723877, "step": 4920 }, { "epoch": 4.353356890459364, "grad_norm": 0.617612898349762, "learning_rate": 1.5068714854651614e-05, "loss": 0.2544, "loss_nan_ranks": 0, "loss_rank_avg": 0.3084735870361328, "step": 4925 }, { "epoch": 4.357773851590106, "grad_norm": 0.6277985572814941, "learning_rate": 1.5026031571508606e-05, "loss": 0.2548, "loss_nan_ranks": 0, "loss_rank_avg": 0.2271774411201477, "step": 4930 }, { "epoch": 4.362190812720848, "grad_norm": 0.6329308152198792, "learning_rate": 1.498337242305588e-05, "loss": 0.239, "loss_nan_ranks": 0, "loss_rank_avg": 0.20148731768131256, "step": 4935 }, { "epoch": 4.36660777385159, "grad_norm": 0.7278103232383728, "learning_rate": 1.4940737616284163e-05, "loss": 0.2697, "loss_nan_ranks": 0, "loss_rank_avg": 0.24024207890033722, "step": 4940 }, { "epoch": 4.371024734982332, "grad_norm": 0.7309764623641968, "learning_rate": 1.4898127358066061e-05, "loss": 0.2612, "loss_nan_ranks": 0, "loss_rank_avg": 0.26143181324005127, "step": 4945 }, { "epoch": 4.375441696113074, "grad_norm": 0.6814038157463074, "learning_rate": 1.4855541855155086e-05, "loss": 0.2499, "loss_nan_ranks": 0, "loss_rank_avg": 0.26438629627227783, "step": 4950 }, { "epoch": 4.379858657243816, "grad_norm": 0.6568551063537598, "learning_rate": 1.4812981314184607e-05, "loss": 0.2344, "loss_nan_ranks": 0, "loss_rank_avg": 0.26820051670074463, "step": 4955 }, { "epoch": 4.384275618374558, "grad_norm": 0.6261412501335144, "learning_rate": 1.4770445941666905e-05, "loss": 0.2311, "loss_nan_ranks": 0, "loss_rank_avg": 0.282004714012146, "step": 4960 }, { "epoch": 4.3886925795053005, "grad_norm": 0.8228702545166016, "learning_rate": 1.4727935943992098e-05, "loss": 0.244, "loss_nan_ranks": 0, "loss_rank_avg": 0.24492314457893372, "step": 4965 }, { "epoch": 4.3931095406360425, "grad_norm": 0.6519868969917297, "learning_rate": 1.4685451527427224e-05, "loss": 0.2287, "loss_nan_ranks": 0, "loss_rank_avg": 0.2917710840702057, "step": 4970 }, { "epoch": 4.397526501766785, "grad_norm": 0.5828742980957031, "learning_rate": 1.4642992898115158e-05, "loss": 0.246, "loss_nan_ranks": 0, "loss_rank_avg": 0.21987062692642212, "step": 4975 }, { "epoch": 4.401943462897527, "grad_norm": 0.5715969204902649, "learning_rate": 1.460056026207367e-05, "loss": 0.2758, "loss_nan_ranks": 0, "loss_rank_avg": 0.3032934367656708, "step": 4980 }, { "epoch": 4.406360424028269, "grad_norm": 0.7088951468467712, "learning_rate": 1.4558153825194419e-05, "loss": 0.2615, "loss_nan_ranks": 0, "loss_rank_avg": 0.2616375982761383, "step": 4985 }, { "epoch": 4.410777385159011, "grad_norm": 0.5688840746879578, "learning_rate": 1.4515773793241898e-05, "loss": 0.2407, "loss_nan_ranks": 0, "loss_rank_avg": 0.28225451707839966, "step": 4990 }, { "epoch": 4.415194346289753, "grad_norm": 0.6520532369613647, "learning_rate": 1.4473420371852526e-05, "loss": 0.2543, "loss_nan_ranks": 0, "loss_rank_avg": 0.302867591381073, "step": 4995 }, { "epoch": 4.419611307420495, "grad_norm": 0.6046358942985535, "learning_rate": 1.4431093766533567e-05, "loss": 0.2606, "loss_nan_ranks": 0, "loss_rank_avg": 0.26887935400009155, "step": 5000 }, { "epoch": 4.424028268551237, "grad_norm": 0.6057432889938354, "learning_rate": 1.4388794182662186e-05, "loss": 0.2842, "loss_nan_ranks": 0, "loss_rank_avg": 0.3114582598209381, "step": 5005 }, { "epoch": 4.428445229681979, "grad_norm": 0.616709291934967, "learning_rate": 1.4346521825484424e-05, "loss": 0.2327, "loss_nan_ranks": 0, "loss_rank_avg": 0.2023238241672516, "step": 5010 }, { "epoch": 4.432862190812721, "grad_norm": 0.5935223698616028, "learning_rate": 1.4304276900114222e-05, "loss": 0.248, "loss_nan_ranks": 0, "loss_rank_avg": 0.3148966431617737, "step": 5015 }, { "epoch": 4.4372791519434625, "grad_norm": 0.5329979658126831, "learning_rate": 1.4262059611532419e-05, "loss": 0.2224, "loss_nan_ranks": 0, "loss_rank_avg": 0.20250526070594788, "step": 5020 }, { "epoch": 4.4416961130742045, "grad_norm": 0.6398733854293823, "learning_rate": 1.4219870164585739e-05, "loss": 0.2735, "loss_nan_ranks": 0, "loss_rank_avg": 0.3195938467979431, "step": 5025 }, { "epoch": 4.446113074204947, "grad_norm": 1.2401176691055298, "learning_rate": 1.417770876398583e-05, "loss": 0.2832, "loss_nan_ranks": 0, "loss_rank_avg": 0.27838078141212463, "step": 5030 }, { "epoch": 4.450530035335689, "grad_norm": 0.8684335947036743, "learning_rate": 1.4135575614308232e-05, "loss": 0.2552, "loss_nan_ranks": 0, "loss_rank_avg": 0.29576045274734497, "step": 5035 }, { "epoch": 4.454946996466431, "grad_norm": 0.709621012210846, "learning_rate": 1.4093470919991442e-05, "loss": 0.2892, "loss_nan_ranks": 0, "loss_rank_avg": 0.29040542244911194, "step": 5040 }, { "epoch": 4.459363957597173, "grad_norm": 0.5984362363815308, "learning_rate": 1.4051394885335836e-05, "loss": 0.2772, "loss_nan_ranks": 0, "loss_rank_avg": 0.21487905085086823, "step": 5045 }, { "epoch": 4.463780918727915, "grad_norm": 0.5817950963973999, "learning_rate": 1.4009347714502778e-05, "loss": 0.2243, "loss_nan_ranks": 0, "loss_rank_avg": 0.2756061553955078, "step": 5050 }, { "epoch": 4.468197879858657, "grad_norm": 0.6032534837722778, "learning_rate": 1.3967329611513543e-05, "loss": 0.243, "loss_nan_ranks": 0, "loss_rank_avg": 0.34259840846061707, "step": 5055 }, { "epoch": 4.472614840989399, "grad_norm": 0.5521541833877563, "learning_rate": 1.3925340780248373e-05, "loss": 0.2629, "loss_nan_ranks": 0, "loss_rank_avg": 0.2480975240468979, "step": 5060 }, { "epoch": 4.477031802120141, "grad_norm": 0.5998513102531433, "learning_rate": 1.3883381424445506e-05, "loss": 0.2168, "loss_nan_ranks": 0, "loss_rank_avg": 0.17923498153686523, "step": 5065 }, { "epoch": 4.481448763250883, "grad_norm": 0.6650380492210388, "learning_rate": 1.3841451747700098e-05, "loss": 0.2603, "loss_nan_ranks": 0, "loss_rank_avg": 0.2855014503002167, "step": 5070 }, { "epoch": 4.485865724381625, "grad_norm": 0.5877347588539124, "learning_rate": 1.3799551953463362e-05, "loss": 0.2619, "loss_nan_ranks": 0, "loss_rank_avg": 0.21608762443065643, "step": 5075 }, { "epoch": 4.490282685512367, "grad_norm": 0.5865316390991211, "learning_rate": 1.3757682245041466e-05, "loss": 0.2375, "loss_nan_ranks": 0, "loss_rank_avg": 0.22935955226421356, "step": 5080 }, { "epoch": 4.4946996466431095, "grad_norm": 0.5989511013031006, "learning_rate": 1.3715842825594628e-05, "loss": 0.2427, "loss_nan_ranks": 0, "loss_rank_avg": 0.2101748287677765, "step": 5085 }, { "epoch": 4.499116607773852, "grad_norm": 0.6382969617843628, "learning_rate": 1.3674033898136071e-05, "loss": 0.278, "loss_nan_ranks": 0, "loss_rank_avg": 0.2421863079071045, "step": 5090 }, { "epoch": 4.503533568904594, "grad_norm": 0.638594388961792, "learning_rate": 1.3632255665531088e-05, "loss": 0.2619, "loss_nan_ranks": 0, "loss_rank_avg": 0.22802847623825073, "step": 5095 }, { "epoch": 4.507950530035336, "grad_norm": 0.6239488124847412, "learning_rate": 1.3590508330496027e-05, "loss": 0.2318, "loss_nan_ranks": 0, "loss_rank_avg": 0.2688833773136139, "step": 5100 }, { "epoch": 4.512367491166078, "grad_norm": 0.6269590258598328, "learning_rate": 1.3548792095597305e-05, "loss": 0.2235, "loss_nan_ranks": 0, "loss_rank_avg": 0.23217537999153137, "step": 5105 }, { "epoch": 4.51678445229682, "grad_norm": 0.6065074801445007, "learning_rate": 1.3507107163250453e-05, "loss": 0.2419, "loss_nan_ranks": 0, "loss_rank_avg": 0.2584981918334961, "step": 5110 }, { "epoch": 4.521201413427562, "grad_norm": 0.6206178665161133, "learning_rate": 1.3465453735719087e-05, "loss": 0.2439, "loss_nan_ranks": 0, "loss_rank_avg": 0.2482869029045105, "step": 5115 }, { "epoch": 4.525618374558304, "grad_norm": 0.6238287091255188, "learning_rate": 1.3423832015114e-05, "loss": 0.2832, "loss_nan_ranks": 0, "loss_rank_avg": 0.31270158290863037, "step": 5120 }, { "epoch": 4.530035335689046, "grad_norm": 0.5761831998825073, "learning_rate": 1.3382242203392083e-05, "loss": 0.2112, "loss_nan_ranks": 0, "loss_rank_avg": 0.1838235706090927, "step": 5125 }, { "epoch": 4.534452296819788, "grad_norm": 0.636617124080658, "learning_rate": 1.3340684502355443e-05, "loss": 0.2808, "loss_nan_ranks": 0, "loss_rank_avg": 0.3398388922214508, "step": 5130 }, { "epoch": 4.53886925795053, "grad_norm": 0.6107141971588135, "learning_rate": 1.3299159113650357e-05, "loss": 0.2751, "loss_nan_ranks": 0, "loss_rank_avg": 0.26246610283851624, "step": 5135 }, { "epoch": 4.543286219081272, "grad_norm": 0.6003983020782471, "learning_rate": 1.325766623876632e-05, "loss": 0.2733, "loss_nan_ranks": 0, "loss_rank_avg": 0.27694079279899597, "step": 5140 }, { "epoch": 4.5477031802120145, "grad_norm": 0.6244370937347412, "learning_rate": 1.321620607903508e-05, "loss": 0.261, "loss_nan_ranks": 0, "loss_rank_avg": 0.26806601881980896, "step": 5145 }, { "epoch": 4.5521201413427566, "grad_norm": 0.6757631897926331, "learning_rate": 1.3174778835629605e-05, "loss": 0.2636, "loss_nan_ranks": 0, "loss_rank_avg": 0.27561718225479126, "step": 5150 }, { "epoch": 4.556537102473499, "grad_norm": 0.6886018514633179, "learning_rate": 1.3133384709563188e-05, "loss": 0.2509, "loss_nan_ranks": 0, "loss_rank_avg": 0.24703247845172882, "step": 5155 }, { "epoch": 4.560954063604241, "grad_norm": 0.6107934713363647, "learning_rate": 1.309202390168841e-05, "loss": 0.2404, "loss_nan_ranks": 0, "loss_rank_avg": 0.28265172243118286, "step": 5160 }, { "epoch": 4.565371024734983, "grad_norm": 0.7151978015899658, "learning_rate": 1.3050696612696188e-05, "loss": 0.2432, "loss_nan_ranks": 0, "loss_rank_avg": 0.25835981965065, "step": 5165 }, { "epoch": 4.569787985865725, "grad_norm": 0.5881211161613464, "learning_rate": 1.3009403043114796e-05, "loss": 0.26, "loss_nan_ranks": 0, "loss_rank_avg": 0.2113930583000183, "step": 5170 }, { "epoch": 4.574204946996466, "grad_norm": 0.7579076886177063, "learning_rate": 1.2968143393308897e-05, "loss": 0.2324, "loss_nan_ranks": 0, "loss_rank_avg": 0.3218274712562561, "step": 5175 }, { "epoch": 4.578621908127208, "grad_norm": 0.5710316896438599, "learning_rate": 1.2926917863478581e-05, "loss": 0.2566, "loss_nan_ranks": 0, "loss_rank_avg": 0.2202773243188858, "step": 5180 }, { "epoch": 4.58303886925795, "grad_norm": 0.6459076404571533, "learning_rate": 1.2885726653658355e-05, "loss": 0.2738, "loss_nan_ranks": 0, "loss_rank_avg": 0.2878916263580322, "step": 5185 }, { "epoch": 4.587455830388692, "grad_norm": 0.5926949381828308, "learning_rate": 1.2844569963716222e-05, "loss": 0.2545, "loss_nan_ranks": 0, "loss_rank_avg": 0.23166950047016144, "step": 5190 }, { "epoch": 4.591872791519434, "grad_norm": 0.6707189083099365, "learning_rate": 1.280344799335267e-05, "loss": 0.2543, "loss_nan_ranks": 0, "loss_rank_avg": 0.25186440348625183, "step": 5195 }, { "epoch": 4.5962897526501765, "grad_norm": 0.573384165763855, "learning_rate": 1.2762360942099745e-05, "loss": 0.2415, "loss_nan_ranks": 0, "loss_rank_avg": 0.24173548817634583, "step": 5200 }, { "epoch": 4.6007067137809186, "grad_norm": 0.6182858347892761, "learning_rate": 1.2721309009320021e-05, "loss": 0.2502, "loss_nan_ranks": 0, "loss_rank_avg": 0.23106129467487335, "step": 5205 }, { "epoch": 4.605123674911661, "grad_norm": 0.6368361115455627, "learning_rate": 1.268029239420571e-05, "loss": 0.295, "loss_nan_ranks": 0, "loss_rank_avg": 0.2934111952781677, "step": 5210 }, { "epoch": 4.609540636042403, "grad_norm": 0.6030951738357544, "learning_rate": 1.2639311295777632e-05, "loss": 0.2495, "loss_nan_ranks": 0, "loss_rank_avg": 0.2604959309101105, "step": 5215 }, { "epoch": 4.613957597173145, "grad_norm": 0.6463732123374939, "learning_rate": 1.2598365912884267e-05, "loss": 0.2556, "loss_nan_ranks": 0, "loss_rank_avg": 0.25189077854156494, "step": 5220 }, { "epoch": 4.618374558303887, "grad_norm": 0.5990574359893799, "learning_rate": 1.2557456444200831e-05, "loss": 0.296, "loss_nan_ranks": 0, "loss_rank_avg": 0.2886963486671448, "step": 5225 }, { "epoch": 4.622791519434629, "grad_norm": 0.6595236659049988, "learning_rate": 1.2516583088228224e-05, "loss": 0.2777, "loss_nan_ranks": 0, "loss_rank_avg": 0.17959409952163696, "step": 5230 }, { "epoch": 4.627208480565371, "grad_norm": 0.5914446711540222, "learning_rate": 1.2475746043292176e-05, "loss": 0.2595, "loss_nan_ranks": 0, "loss_rank_avg": 0.2113860547542572, "step": 5235 }, { "epoch": 4.631625441696113, "grad_norm": 0.6367279887199402, "learning_rate": 1.243494550754219e-05, "loss": 0.2763, "loss_nan_ranks": 0, "loss_rank_avg": 0.2789275348186493, "step": 5240 }, { "epoch": 4.636042402826855, "grad_norm": 0.6216886043548584, "learning_rate": 1.239418167895063e-05, "loss": 0.2777, "loss_nan_ranks": 0, "loss_rank_avg": 0.2759518623352051, "step": 5245 }, { "epoch": 4.640459363957597, "grad_norm": 0.689033031463623, "learning_rate": 1.2353454755311751e-05, "loss": 0.2444, "loss_nan_ranks": 0, "loss_rank_avg": 0.25160372257232666, "step": 5250 }, { "epoch": 4.644876325088339, "grad_norm": 0.5909837484359741, "learning_rate": 1.2312764934240735e-05, "loss": 0.2706, "loss_nan_ranks": 0, "loss_rank_avg": 0.26029708981513977, "step": 5255 }, { "epoch": 4.6492932862190814, "grad_norm": 0.6285930275917053, "learning_rate": 1.227211241317275e-05, "loss": 0.2523, "loss_nan_ranks": 0, "loss_rank_avg": 0.27858561277389526, "step": 5260 }, { "epoch": 4.6537102473498235, "grad_norm": 0.6383758783340454, "learning_rate": 1.223149738936195e-05, "loss": 0.2539, "loss_nan_ranks": 0, "loss_rank_avg": 0.2381717562675476, "step": 5265 }, { "epoch": 4.658127208480566, "grad_norm": 0.6008221507072449, "learning_rate": 1.219092005988057e-05, "loss": 0.2527, "loss_nan_ranks": 0, "loss_rank_avg": 0.2692645490169525, "step": 5270 }, { "epoch": 4.662544169611308, "grad_norm": 0.6864904761314392, "learning_rate": 1.215038062161792e-05, "loss": 0.2383, "loss_nan_ranks": 0, "loss_rank_avg": 0.24149903655052185, "step": 5275 }, { "epoch": 4.66696113074205, "grad_norm": 0.6051673889160156, "learning_rate": 1.2109879271279486e-05, "loss": 0.2567, "loss_nan_ranks": 0, "loss_rank_avg": 0.2649880647659302, "step": 5280 }, { "epoch": 4.671378091872792, "grad_norm": 0.6154433488845825, "learning_rate": 1.2069416205385902e-05, "loss": 0.2271, "loss_nan_ranks": 0, "loss_rank_avg": 0.19507455825805664, "step": 5285 }, { "epoch": 4.675795053003534, "grad_norm": 0.6651699542999268, "learning_rate": 1.2028991620272081e-05, "loss": 0.2139, "loss_nan_ranks": 0, "loss_rank_avg": 0.19150885939598083, "step": 5290 }, { "epoch": 4.680212014134275, "grad_norm": 0.6139252185821533, "learning_rate": 1.1988605712086199e-05, "loss": 0.2503, "loss_nan_ranks": 0, "loss_rank_avg": 0.201002836227417, "step": 5295 }, { "epoch": 4.684628975265017, "grad_norm": 0.6537069082260132, "learning_rate": 1.1948258676788751e-05, "loss": 0.269, "loss_nan_ranks": 0, "loss_rank_avg": 0.33721426129341125, "step": 5300 }, { "epoch": 4.689045936395759, "grad_norm": 0.6756031513214111, "learning_rate": 1.190795071015165e-05, "loss": 0.2719, "loss_nan_ranks": 0, "loss_rank_avg": 0.34285324811935425, "step": 5305 }, { "epoch": 4.693462897526501, "grad_norm": 0.5824704170227051, "learning_rate": 1.1867682007757191e-05, "loss": 0.2423, "loss_nan_ranks": 0, "loss_rank_avg": 0.2109375149011612, "step": 5310 }, { "epoch": 4.6978798586572434, "grad_norm": 0.6546387076377869, "learning_rate": 1.1827452764997198e-05, "loss": 0.2419, "loss_nan_ranks": 0, "loss_rank_avg": 0.17242984473705292, "step": 5315 }, { "epoch": 4.7022968197879855, "grad_norm": 0.698542058467865, "learning_rate": 1.1787263177071997e-05, "loss": 0.2423, "loss_nan_ranks": 0, "loss_rank_avg": 0.18797524273395538, "step": 5320 }, { "epoch": 4.706713780918728, "grad_norm": 0.7340909242630005, "learning_rate": 1.174711343898952e-05, "loss": 0.234, "loss_nan_ranks": 0, "loss_rank_avg": 0.19447045028209686, "step": 5325 }, { "epoch": 4.71113074204947, "grad_norm": 0.5774402022361755, "learning_rate": 1.1707003745564319e-05, "loss": 0.232, "loss_nan_ranks": 0, "loss_rank_avg": 0.24991655349731445, "step": 5330 }, { "epoch": 4.715547703180212, "grad_norm": 0.6813011169433594, "learning_rate": 1.1666934291416666e-05, "loss": 0.2505, "loss_nan_ranks": 0, "loss_rank_avg": 0.2309049814939499, "step": 5335 }, { "epoch": 4.719964664310954, "grad_norm": 0.6249779462814331, "learning_rate": 1.1626905270971563e-05, "loss": 0.2342, "loss_nan_ranks": 0, "loss_rank_avg": 0.20291580259799957, "step": 5340 }, { "epoch": 4.724381625441696, "grad_norm": 0.6181374788284302, "learning_rate": 1.1586916878457837e-05, "loss": 0.235, "loss_nan_ranks": 0, "loss_rank_avg": 0.34734612703323364, "step": 5345 }, { "epoch": 4.728798586572438, "grad_norm": 0.5957716107368469, "learning_rate": 1.1546969307907162e-05, "loss": 0.2824, "loss_nan_ranks": 0, "loss_rank_avg": 0.21913906931877136, "step": 5350 }, { "epoch": 4.73321554770318, "grad_norm": 0.6221516132354736, "learning_rate": 1.1507062753153155e-05, "loss": 0.2466, "loss_nan_ranks": 0, "loss_rank_avg": 0.20802852511405945, "step": 5355 }, { "epoch": 4.737632508833922, "grad_norm": 0.6126749515533447, "learning_rate": 1.1467197407830409e-05, "loss": 0.2835, "loss_nan_ranks": 0, "loss_rank_avg": 0.3353455662727356, "step": 5360 }, { "epoch": 4.742049469964664, "grad_norm": 0.6599173545837402, "learning_rate": 1.1427373465373541e-05, "loss": 0.2764, "loss_nan_ranks": 0, "loss_rank_avg": 0.289046049118042, "step": 5365 }, { "epoch": 4.746466431095406, "grad_norm": 0.5880971550941467, "learning_rate": 1.1387591119016292e-05, "loss": 0.2267, "loss_nan_ranks": 0, "loss_rank_avg": 0.2312006950378418, "step": 5370 }, { "epoch": 4.750883392226148, "grad_norm": 0.6444661617279053, "learning_rate": 1.1347850561790594e-05, "loss": 0.2895, "loss_nan_ranks": 0, "loss_rank_avg": 0.2494499385356903, "step": 5375 }, { "epoch": 4.7553003533568905, "grad_norm": 0.5412120819091797, "learning_rate": 1.1308151986525557e-05, "loss": 0.2552, "loss_nan_ranks": 0, "loss_rank_avg": 0.20912671089172363, "step": 5380 }, { "epoch": 4.759717314487633, "grad_norm": 0.5995916724205017, "learning_rate": 1.1268495585846621e-05, "loss": 0.2509, "loss_nan_ranks": 0, "loss_rank_avg": 0.21118956804275513, "step": 5385 }, { "epoch": 4.764134275618375, "grad_norm": 0.6803336143493652, "learning_rate": 1.1228881552174585e-05, "loss": 0.2416, "loss_nan_ranks": 0, "loss_rank_avg": 0.2551300525665283, "step": 5390 }, { "epoch": 4.768551236749117, "grad_norm": 0.6087960600852966, "learning_rate": 1.1189310077724667e-05, "loss": 0.2682, "loss_nan_ranks": 0, "loss_rank_avg": 0.23086312413215637, "step": 5395 }, { "epoch": 4.772968197879859, "grad_norm": 0.6534774303436279, "learning_rate": 1.1149781354505565e-05, "loss": 0.2789, "loss_nan_ranks": 0, "loss_rank_avg": 0.2714969515800476, "step": 5400 }, { "epoch": 4.777385159010601, "grad_norm": 0.5906716585159302, "learning_rate": 1.111029557431858e-05, "loss": 0.2106, "loss_nan_ranks": 0, "loss_rank_avg": 0.25104430317878723, "step": 5405 }, { "epoch": 4.781802120141343, "grad_norm": 0.6094644665718079, "learning_rate": 1.1070852928756598e-05, "loss": 0.2423, "loss_nan_ranks": 0, "loss_rank_avg": 0.34287014603614807, "step": 5410 }, { "epoch": 4.786219081272085, "grad_norm": 0.6480569243431091, "learning_rate": 1.1031453609203244e-05, "loss": 0.2544, "loss_nan_ranks": 0, "loss_rank_avg": 0.24471309781074524, "step": 5415 }, { "epoch": 4.790636042402827, "grad_norm": 0.6177884936332703, "learning_rate": 1.0992097806831894e-05, "loss": 0.2405, "loss_nan_ranks": 0, "loss_rank_avg": 0.2734081745147705, "step": 5420 }, { "epoch": 4.795053003533569, "grad_norm": 0.666823148727417, "learning_rate": 1.0952785712604777e-05, "loss": 0.2846, "loss_nan_ranks": 0, "loss_rank_avg": 0.23860061168670654, "step": 5425 }, { "epoch": 4.799469964664311, "grad_norm": 0.6771073937416077, "learning_rate": 1.0913517517272057e-05, "loss": 0.2523, "loss_nan_ranks": 0, "loss_rank_avg": 0.2811965048313141, "step": 5430 }, { "epoch": 4.803886925795053, "grad_norm": 0.6511576771736145, "learning_rate": 1.0874293411370847e-05, "loss": 0.251, "loss_nan_ranks": 0, "loss_rank_avg": 0.2048361599445343, "step": 5435 }, { "epoch": 4.8083038869257955, "grad_norm": 0.622969925403595, "learning_rate": 1.083511358522439e-05, "loss": 0.2611, "loss_nan_ranks": 0, "loss_rank_avg": 0.25874412059783936, "step": 5440 }, { "epoch": 4.8127208480565375, "grad_norm": 0.6246639490127563, "learning_rate": 1.0795978228941025e-05, "loss": 0.2497, "loss_nan_ranks": 0, "loss_rank_avg": 0.28046876192092896, "step": 5445 }, { "epoch": 4.81713780918728, "grad_norm": 0.5713747143745422, "learning_rate": 1.0756887532413328e-05, "loss": 0.2686, "loss_nan_ranks": 0, "loss_rank_avg": 0.20135197043418884, "step": 5450 }, { "epoch": 4.821554770318021, "grad_norm": 0.6463353037834167, "learning_rate": 1.0717841685317207e-05, "loss": 0.258, "loss_nan_ranks": 0, "loss_rank_avg": 0.24732375144958496, "step": 5455 }, { "epoch": 4.825971731448763, "grad_norm": 0.6025271415710449, "learning_rate": 1.0678840877110906e-05, "loss": 0.2883, "loss_nan_ranks": 0, "loss_rank_avg": 0.2834791839122772, "step": 5460 }, { "epoch": 4.830388692579505, "grad_norm": 0.5726755261421204, "learning_rate": 1.0639885297034157e-05, "loss": 0.2302, "loss_nan_ranks": 0, "loss_rank_avg": 0.23436738550662994, "step": 5465 }, { "epoch": 4.834805653710247, "grad_norm": 0.6157001852989197, "learning_rate": 1.060097513410723e-05, "loss": 0.2783, "loss_nan_ranks": 0, "loss_rank_avg": 0.25192394852638245, "step": 5470 }, { "epoch": 4.839222614840989, "grad_norm": 0.6347540020942688, "learning_rate": 1.0562110577130031e-05, "loss": 0.2754, "loss_nan_ranks": 0, "loss_rank_avg": 0.324943482875824, "step": 5475 }, { "epoch": 4.843639575971731, "grad_norm": 0.5740549564361572, "learning_rate": 1.0523291814681149e-05, "loss": 0.2622, "loss_nan_ranks": 0, "loss_rank_avg": 0.25427937507629395, "step": 5480 }, { "epoch": 4.848056537102473, "grad_norm": 0.5769616961479187, "learning_rate": 1.0484519035117015e-05, "loss": 0.2484, "loss_nan_ranks": 0, "loss_rank_avg": 0.2660577595233917, "step": 5485 }, { "epoch": 4.852473498233215, "grad_norm": 0.7008361220359802, "learning_rate": 1.0445792426570894e-05, "loss": 0.2689, "loss_nan_ranks": 0, "loss_rank_avg": 0.2324357032775879, "step": 5490 }, { "epoch": 4.8568904593639575, "grad_norm": 0.6129425764083862, "learning_rate": 1.040711217695205e-05, "loss": 0.2246, "loss_nan_ranks": 0, "loss_rank_avg": 0.245370551943779, "step": 5495 }, { "epoch": 4.8613074204946995, "grad_norm": 0.6054858565330505, "learning_rate": 1.0368478473944792e-05, "loss": 0.2404, "loss_nan_ranks": 0, "loss_rank_avg": 0.23680360615253448, "step": 5500 }, { "epoch": 4.865724381625442, "grad_norm": 0.5556029081344604, "learning_rate": 1.0329891505007582e-05, "loss": 0.242, "loss_nan_ranks": 0, "loss_rank_avg": 0.21621742844581604, "step": 5505 }, { "epoch": 4.870141342756184, "grad_norm": 0.5951539874076843, "learning_rate": 1.029135145737212e-05, "loss": 0.2418, "loss_nan_ranks": 0, "loss_rank_avg": 0.24147434532642365, "step": 5510 }, { "epoch": 4.874558303886926, "grad_norm": 0.6369734406471252, "learning_rate": 1.0252858518042413e-05, "loss": 0.253, "loss_nan_ranks": 0, "loss_rank_avg": 0.2957211136817932, "step": 5515 }, { "epoch": 4.878975265017668, "grad_norm": 0.6428771018981934, "learning_rate": 1.0214412873793931e-05, "loss": 0.2393, "loss_nan_ranks": 0, "loss_rank_avg": 0.20931799709796906, "step": 5520 }, { "epoch": 4.88339222614841, "grad_norm": 0.591044008731842, "learning_rate": 1.0176014711172615e-05, "loss": 0.2694, "loss_nan_ranks": 0, "loss_rank_avg": 0.25282272696495056, "step": 5525 }, { "epoch": 4.887809187279152, "grad_norm": 0.5807618498802185, "learning_rate": 1.0137664216494035e-05, "loss": 0.2504, "loss_nan_ranks": 0, "loss_rank_avg": 0.26002442836761475, "step": 5530 }, { "epoch": 4.892226148409894, "grad_norm": 0.5614008903503418, "learning_rate": 1.0099361575842486e-05, "loss": 0.2173, "loss_nan_ranks": 0, "loss_rank_avg": 0.2629309296607971, "step": 5535 }, { "epoch": 4.896643109540636, "grad_norm": 0.5964021682739258, "learning_rate": 1.0061106975070025e-05, "loss": 0.2604, "loss_nan_ranks": 0, "loss_rank_avg": 0.3080860376358032, "step": 5540 }, { "epoch": 4.901060070671378, "grad_norm": 0.6463096141815186, "learning_rate": 1.0022900599795641e-05, "loss": 0.3287, "loss_nan_ranks": 0, "loss_rank_avg": 0.2719125747680664, "step": 5545 }, { "epoch": 4.90547703180212, "grad_norm": 0.6948069334030151, "learning_rate": 9.984742635404313e-06, "loss": 0.2694, "loss_nan_ranks": 0, "loss_rank_avg": 0.2654094398021698, "step": 5550 }, { "epoch": 4.909893992932862, "grad_norm": 0.9675594568252563, "learning_rate": 9.946633267046125e-06, "loss": 0.2607, "loss_nan_ranks": 0, "loss_rank_avg": 0.2373758852481842, "step": 5555 }, { "epoch": 4.9143109540636045, "grad_norm": 0.6239486336708069, "learning_rate": 9.908572679635337e-06, "loss": 0.2684, "loss_nan_ranks": 0, "loss_rank_avg": 0.19304318726062775, "step": 5560 }, { "epoch": 4.918727915194347, "grad_norm": 0.6223797798156738, "learning_rate": 9.87056105784957e-06, "loss": 0.2712, "loss_nan_ranks": 0, "loss_rank_avg": 0.2559204697608948, "step": 5565 }, { "epoch": 4.923144876325089, "grad_norm": 0.5596070885658264, "learning_rate": 9.832598586128796e-06, "loss": 0.3051, "loss_nan_ranks": 0, "loss_rank_avg": 0.24063706398010254, "step": 5570 }, { "epoch": 4.927561837455831, "grad_norm": 0.7187504768371582, "learning_rate": 9.794685448674533e-06, "loss": 0.2447, "loss_nan_ranks": 0, "loss_rank_avg": 0.2679745554924011, "step": 5575 }, { "epoch": 4.931978798586572, "grad_norm": 0.6528657078742981, "learning_rate": 9.756821829448911e-06, "loss": 0.2278, "loss_nan_ranks": 0, "loss_rank_avg": 0.2918229103088379, "step": 5580 }, { "epoch": 4.936395759717314, "grad_norm": 0.5671543478965759, "learning_rate": 9.719007912173786e-06, "loss": 0.2551, "loss_nan_ranks": 0, "loss_rank_avg": 0.24266092479228973, "step": 5585 }, { "epoch": 4.940812720848056, "grad_norm": 0.5972075462341309, "learning_rate": 9.681243880329864e-06, "loss": 0.2973, "loss_nan_ranks": 0, "loss_rank_avg": 0.26102444529533386, "step": 5590 }, { "epoch": 4.945229681978798, "grad_norm": 0.6930065751075745, "learning_rate": 9.643529917155765e-06, "loss": 0.2431, "loss_nan_ranks": 0, "loss_rank_avg": 0.15023840963840485, "step": 5595 }, { "epoch": 4.94964664310954, "grad_norm": 0.5619693994522095, "learning_rate": 9.60586620564721e-06, "loss": 0.2326, "loss_nan_ranks": 0, "loss_rank_avg": 0.17770014703273773, "step": 5600 }, { "epoch": 4.954063604240282, "grad_norm": 0.683238685131073, "learning_rate": 9.568252928556045e-06, "loss": 0.2577, "loss_nan_ranks": 0, "loss_rank_avg": 0.27513328194618225, "step": 5605 }, { "epoch": 4.958480565371024, "grad_norm": 0.6224611401557922, "learning_rate": 9.530690268389419e-06, "loss": 0.2536, "loss_nan_ranks": 0, "loss_rank_avg": 0.2843214273452759, "step": 5610 }, { "epoch": 4.9628975265017665, "grad_norm": 0.6748574376106262, "learning_rate": 9.493178407408898e-06, "loss": 0.2502, "loss_nan_ranks": 0, "loss_rank_avg": 0.3191365599632263, "step": 5615 }, { "epoch": 4.967314487632509, "grad_norm": 0.6514118313789368, "learning_rate": 9.45571752762952e-06, "loss": 0.2612, "loss_nan_ranks": 0, "loss_rank_avg": 0.27559196949005127, "step": 5620 }, { "epoch": 4.971731448763251, "grad_norm": 0.5978466868400574, "learning_rate": 9.418307810818974e-06, "loss": 0.2224, "loss_nan_ranks": 0, "loss_rank_avg": 0.2235470712184906, "step": 5625 }, { "epoch": 4.976148409893993, "grad_norm": 0.8135108351707458, "learning_rate": 9.380949438496694e-06, "loss": 0.2443, "loss_nan_ranks": 0, "loss_rank_avg": 0.21574482321739197, "step": 5630 }, { "epoch": 4.980565371024735, "grad_norm": 0.5374975204467773, "learning_rate": 9.343642591932986e-06, "loss": 0.2481, "loss_nan_ranks": 0, "loss_rank_avg": 0.24713656306266785, "step": 5635 }, { "epoch": 4.984982332155477, "grad_norm": 0.6835610866546631, "learning_rate": 9.306387452148117e-06, "loss": 0.2555, "loss_nan_ranks": 0, "loss_rank_avg": 0.2892993092536926, "step": 5640 }, { "epoch": 4.989399293286219, "grad_norm": 0.6098293662071228, "learning_rate": 9.269184199911507e-06, "loss": 0.2758, "loss_nan_ranks": 0, "loss_rank_avg": 0.2980913519859314, "step": 5645 }, { "epoch": 4.993816254416961, "grad_norm": 0.5952140092849731, "learning_rate": 9.232033015740765e-06, "loss": 0.283, "loss_nan_ranks": 0, "loss_rank_avg": 0.3264719247817993, "step": 5650 }, { "epoch": 4.998233215547703, "grad_norm": 0.6798433661460876, "learning_rate": 9.19493407990087e-06, "loss": 0.2749, "loss_nan_ranks": 0, "loss_rank_avg": 0.2580171823501587, "step": 5655 }, { "epoch": 5.003533568904594, "grad_norm": 0.5702685117721558, "learning_rate": 9.157887572403292e-06, "loss": 0.2212, "loss_nan_ranks": 0, "loss_rank_avg": 0.20382651686668396, "step": 5660 }, { "epoch": 5.007950530035336, "grad_norm": 0.692338764667511, "learning_rate": 9.120893673005095e-06, "loss": 0.2174, "loss_nan_ranks": 0, "loss_rank_avg": 0.19380773603916168, "step": 5665 }, { "epoch": 5.012367491166078, "grad_norm": 0.5917826890945435, "learning_rate": 9.083952561208093e-06, "loss": 0.2525, "loss_nan_ranks": 0, "loss_rank_avg": 0.21118654310703278, "step": 5670 }, { "epoch": 5.01678445229682, "grad_norm": 0.6775484681129456, "learning_rate": 9.04706441625793e-06, "loss": 0.2458, "loss_nan_ranks": 0, "loss_rank_avg": 0.3707857131958008, "step": 5675 }, { "epoch": 5.021201413427562, "grad_norm": 0.6675564646720886, "learning_rate": 9.010229417143298e-06, "loss": 0.2348, "loss_nan_ranks": 0, "loss_rank_avg": 0.18382994830608368, "step": 5680 }, { "epoch": 5.025618374558304, "grad_norm": 0.6618363261222839, "learning_rate": 8.973447742594959e-06, "loss": 0.2264, "loss_nan_ranks": 0, "loss_rank_avg": 0.22905008494853973, "step": 5685 }, { "epoch": 5.030035335689046, "grad_norm": 0.6834167242050171, "learning_rate": 8.936719571084964e-06, "loss": 0.2468, "loss_nan_ranks": 0, "loss_rank_avg": 0.16894429922103882, "step": 5690 }, { "epoch": 5.034452296819788, "grad_norm": 1.0125937461853027, "learning_rate": 8.900045080825772e-06, "loss": 0.1942, "loss_nan_ranks": 0, "loss_rank_avg": 0.1643485128879547, "step": 5695 }, { "epoch": 5.03886925795053, "grad_norm": 0.66016685962677, "learning_rate": 8.863424449769326e-06, "loss": 0.2063, "loss_nan_ranks": 0, "loss_rank_avg": 0.19492757320404053, "step": 5700 }, { "epoch": 5.043286219081272, "grad_norm": 0.6835595369338989, "learning_rate": 8.826857855606268e-06, "loss": 0.2236, "loss_nan_ranks": 0, "loss_rank_avg": 0.2136615514755249, "step": 5705 }, { "epoch": 5.0477031802120145, "grad_norm": 0.6571291089057922, "learning_rate": 8.790345475765028e-06, "loss": 0.2325, "loss_nan_ranks": 0, "loss_rank_avg": 0.31727343797683716, "step": 5710 }, { "epoch": 5.0521201413427566, "grad_norm": 0.6805379390716553, "learning_rate": 8.753887487410988e-06, "loss": 0.244, "loss_nan_ranks": 0, "loss_rank_avg": 0.2703685164451599, "step": 5715 }, { "epoch": 5.056537102473499, "grad_norm": 0.6848156452178955, "learning_rate": 8.71748406744559e-06, "loss": 0.2635, "loss_nan_ranks": 0, "loss_rank_avg": 0.2362714409828186, "step": 5720 }, { "epoch": 5.060954063604241, "grad_norm": 0.6533686518669128, "learning_rate": 8.681135392505521e-06, "loss": 0.2934, "loss_nan_ranks": 0, "loss_rank_avg": 0.33763277530670166, "step": 5725 }, { "epoch": 5.065371024734982, "grad_norm": 0.7272844910621643, "learning_rate": 8.644841638961827e-06, "loss": 0.2103, "loss_nan_ranks": 0, "loss_rank_avg": 0.16482684016227722, "step": 5730 }, { "epoch": 5.069787985865724, "grad_norm": 0.6510602831840515, "learning_rate": 8.608602982919061e-06, "loss": 0.2306, "loss_nan_ranks": 0, "loss_rank_avg": 0.1856457144021988, "step": 5735 }, { "epoch": 5.074204946996466, "grad_norm": 0.7189889550209045, "learning_rate": 8.57241960021444e-06, "loss": 0.2622, "loss_nan_ranks": 0, "loss_rank_avg": 0.23383861780166626, "step": 5740 }, { "epoch": 5.078621908127208, "grad_norm": 0.6912586092948914, "learning_rate": 8.536291666416971e-06, "loss": 0.2268, "loss_nan_ranks": 0, "loss_rank_avg": 0.1808895766735077, "step": 5745 }, { "epoch": 5.08303886925795, "grad_norm": 0.6760678291320801, "learning_rate": 8.500219356826633e-06, "loss": 0.2813, "loss_nan_ranks": 0, "loss_rank_avg": 0.3097551465034485, "step": 5750 }, { "epoch": 5.087455830388692, "grad_norm": 0.6680203080177307, "learning_rate": 8.464202846473467e-06, "loss": 0.2059, "loss_nan_ranks": 0, "loss_rank_avg": 0.20965927839279175, "step": 5755 }, { "epoch": 5.091872791519434, "grad_norm": 0.6519942879676819, "learning_rate": 8.428242310116817e-06, "loss": 0.2579, "loss_nan_ranks": 0, "loss_rank_avg": 0.28410807251930237, "step": 5760 }, { "epoch": 5.0962897526501765, "grad_norm": 0.7109984755516052, "learning_rate": 8.392337922244383e-06, "loss": 0.2401, "loss_nan_ranks": 0, "loss_rank_avg": 0.19458754360675812, "step": 5765 }, { "epoch": 5.1007067137809186, "grad_norm": 4.666537284851074, "learning_rate": 8.35648985707144e-06, "loss": 0.2129, "loss_nan_ranks": 0, "loss_rank_avg": 0.21463000774383545, "step": 5770 }, { "epoch": 5.105123674911661, "grad_norm": 0.7026737928390503, "learning_rate": 8.320698288539997e-06, "loss": 0.2755, "loss_nan_ranks": 0, "loss_rank_avg": 0.1861979365348816, "step": 5775 }, { "epoch": 5.109540636042403, "grad_norm": 0.7229135632514954, "learning_rate": 8.284963390317885e-06, "loss": 0.1896, "loss_nan_ranks": 0, "loss_rank_avg": 0.17001771926879883, "step": 5780 }, { "epoch": 5.113957597173145, "grad_norm": 0.7189328670501709, "learning_rate": 8.24928533579799e-06, "loss": 0.2427, "loss_nan_ranks": 0, "loss_rank_avg": 0.2957168519496918, "step": 5785 }, { "epoch": 5.118374558303887, "grad_norm": 0.6980794668197632, "learning_rate": 8.21366429809737e-06, "loss": 0.2294, "loss_nan_ranks": 0, "loss_rank_avg": 0.27361002564430237, "step": 5790 }, { "epoch": 5.122791519434629, "grad_norm": 0.6661799550056458, "learning_rate": 8.17810045005644e-06, "loss": 0.2844, "loss_nan_ranks": 0, "loss_rank_avg": 0.26608705520629883, "step": 5795 }, { "epoch": 5.127208480565371, "grad_norm": 0.6025856733322144, "learning_rate": 8.142593964238092e-06, "loss": 0.2243, "loss_nan_ranks": 0, "loss_rank_avg": 0.23461788892745972, "step": 5800 }, { "epoch": 5.131625441696113, "grad_norm": 0.6775685548782349, "learning_rate": 8.107145012926909e-06, "loss": 0.2261, "loss_nan_ranks": 0, "loss_rank_avg": 0.20733845233917236, "step": 5805 }, { "epoch": 5.136042402826855, "grad_norm": 0.6828117966651917, "learning_rate": 8.071753768128299e-06, "loss": 0.2198, "loss_nan_ranks": 0, "loss_rank_avg": 0.19489261507987976, "step": 5810 }, { "epoch": 5.140459363957597, "grad_norm": 0.6730659604072571, "learning_rate": 8.036420401567662e-06, "loss": 0.236, "loss_nan_ranks": 0, "loss_rank_avg": 0.2441500872373581, "step": 5815 }, { "epoch": 5.144876325088339, "grad_norm": 0.6313605904579163, "learning_rate": 8.001145084689563e-06, "loss": 0.2842, "loss_nan_ranks": 0, "loss_rank_avg": 0.26540714502334595, "step": 5820 }, { "epoch": 5.1492932862190814, "grad_norm": 0.6368901133537292, "learning_rate": 7.965927988656903e-06, "loss": 0.2422, "loss_nan_ranks": 0, "loss_rank_avg": 0.2015400528907776, "step": 5825 }, { "epoch": 5.1537102473498235, "grad_norm": 0.7787892818450928, "learning_rate": 7.930769284350084e-06, "loss": 0.2294, "loss_nan_ranks": 0, "loss_rank_avg": 0.21253234148025513, "step": 5830 }, { "epoch": 5.158127208480566, "grad_norm": 0.7237471342086792, "learning_rate": 7.895669142366159e-06, "loss": 0.2181, "loss_nan_ranks": 0, "loss_rank_avg": 0.29186874628067017, "step": 5835 }, { "epoch": 5.162544169611308, "grad_norm": 0.677948534488678, "learning_rate": 7.860627733018065e-06, "loss": 0.2292, "loss_nan_ranks": 0, "loss_rank_avg": 0.22216159105300903, "step": 5840 }, { "epoch": 5.16696113074205, "grad_norm": 0.6752915382385254, "learning_rate": 7.825645226333714e-06, "loss": 0.219, "loss_nan_ranks": 0, "loss_rank_avg": 0.18272480368614197, "step": 5845 }, { "epoch": 5.171378091872792, "grad_norm": 0.6883508563041687, "learning_rate": 7.79072179205523e-06, "loss": 0.2321, "loss_nan_ranks": 0, "loss_rank_avg": 0.3108995854854584, "step": 5850 }, { "epoch": 5.175795053003534, "grad_norm": 0.6497143507003784, "learning_rate": 7.755857599638124e-06, "loss": 0.2032, "loss_nan_ranks": 0, "loss_rank_avg": 0.24391454458236694, "step": 5855 }, { "epoch": 5.180212014134276, "grad_norm": 0.6400555372238159, "learning_rate": 7.721052818250419e-06, "loss": 0.2759, "loss_nan_ranks": 0, "loss_rank_avg": 0.252552330493927, "step": 5860 }, { "epoch": 5.184628975265018, "grad_norm": 0.624398946762085, "learning_rate": 7.686307616771883e-06, "loss": 0.2458, "loss_nan_ranks": 0, "loss_rank_avg": 0.2461286336183548, "step": 5865 }, { "epoch": 5.189045936395759, "grad_norm": 0.6464707255363464, "learning_rate": 7.651622163793189e-06, "loss": 0.247, "loss_nan_ranks": 0, "loss_rank_avg": 0.23976178467273712, "step": 5870 }, { "epoch": 5.193462897526501, "grad_norm": 0.6472598314285278, "learning_rate": 7.616996627615103e-06, "loss": 0.2295, "loss_nan_ranks": 0, "loss_rank_avg": 0.2011461853981018, "step": 5875 }, { "epoch": 5.1978798586572434, "grad_norm": 0.7136189341545105, "learning_rate": 7.582431176247642e-06, "loss": 0.2714, "loss_nan_ranks": 0, "loss_rank_avg": 0.19472795724868774, "step": 5880 }, { "epoch": 5.2022968197879855, "grad_norm": 0.6750161051750183, "learning_rate": 7.547925977409301e-06, "loss": 0.2119, "loss_nan_ranks": 0, "loss_rank_avg": 0.20839814841747284, "step": 5885 }, { "epoch": 5.206713780918728, "grad_norm": 0.6782904267311096, "learning_rate": 7.5134811985262115e-06, "loss": 0.2842, "loss_nan_ranks": 0, "loss_rank_avg": 0.19967156648635864, "step": 5890 }, { "epoch": 5.21113074204947, "grad_norm": 0.5999016761779785, "learning_rate": 7.479097006731333e-06, "loss": 0.2569, "loss_nan_ranks": 0, "loss_rank_avg": 0.2858957052230835, "step": 5895 }, { "epoch": 5.215547703180212, "grad_norm": 0.7441349625587463, "learning_rate": 7.444773568863646e-06, "loss": 0.2368, "loss_nan_ranks": 0, "loss_rank_avg": 0.2590683400630951, "step": 5900 }, { "epoch": 5.219964664310954, "grad_norm": 0.7110154032707214, "learning_rate": 7.410511051467339e-06, "loss": 0.247, "loss_nan_ranks": 0, "loss_rank_avg": 0.17946434020996094, "step": 5905 }, { "epoch": 5.224381625441696, "grad_norm": 0.6729068756103516, "learning_rate": 7.376309620791016e-06, "loss": 0.1984, "loss_nan_ranks": 0, "loss_rank_avg": 0.24539992213249207, "step": 5910 }, { "epoch": 5.228798586572438, "grad_norm": 0.7076547145843506, "learning_rate": 7.342169442786835e-06, "loss": 0.2352, "loss_nan_ranks": 0, "loss_rank_avg": 0.22367143630981445, "step": 5915 }, { "epoch": 5.23321554770318, "grad_norm": 0.6758275032043457, "learning_rate": 7.308090683109803e-06, "loss": 0.2557, "loss_nan_ranks": 0, "loss_rank_avg": 0.222344771027565, "step": 5920 }, { "epoch": 5.237632508833922, "grad_norm": 0.5967568755149841, "learning_rate": 7.274073507116865e-06, "loss": 0.2537, "loss_nan_ranks": 0, "loss_rank_avg": 0.29815196990966797, "step": 5925 }, { "epoch": 5.242049469964664, "grad_norm": 0.5832816958427429, "learning_rate": 7.240118079866163e-06, "loss": 0.2208, "loss_nan_ranks": 0, "loss_rank_avg": 0.2399141788482666, "step": 5930 }, { "epoch": 5.246466431095406, "grad_norm": 0.6074890494346619, "learning_rate": 7.206224566116247e-06, "loss": 0.2618, "loss_nan_ranks": 0, "loss_rank_avg": 0.19510237872600555, "step": 5935 }, { "epoch": 5.250883392226148, "grad_norm": 0.6020046472549438, "learning_rate": 7.172393130325208e-06, "loss": 0.2298, "loss_nan_ranks": 0, "loss_rank_avg": 0.22199654579162598, "step": 5940 }, { "epoch": 5.2553003533568905, "grad_norm": 0.6055787801742554, "learning_rate": 7.138623936649951e-06, "loss": 0.2273, "loss_nan_ranks": 0, "loss_rank_avg": 0.2283298373222351, "step": 5945 }, { "epoch": 5.259717314487633, "grad_norm": 0.6854278445243835, "learning_rate": 7.104917148945363e-06, "loss": 0.2597, "loss_nan_ranks": 0, "loss_rank_avg": 0.23151074349880219, "step": 5950 }, { "epoch": 5.264134275618375, "grad_norm": 0.6667275428771973, "learning_rate": 7.0712729307635284e-06, "loss": 0.2464, "loss_nan_ranks": 0, "loss_rank_avg": 0.24305643141269684, "step": 5955 }, { "epoch": 5.268551236749117, "grad_norm": 0.6479029655456543, "learning_rate": 7.037691445352917e-06, "loss": 0.2446, "loss_nan_ranks": 0, "loss_rank_avg": 0.2675766944885254, "step": 5960 }, { "epoch": 5.272968197879859, "grad_norm": 0.6230666041374207, "learning_rate": 7.00417285565762e-06, "loss": 0.2521, "loss_nan_ranks": 0, "loss_rank_avg": 0.20548516511917114, "step": 5965 }, { "epoch": 5.277385159010601, "grad_norm": 0.6456372737884521, "learning_rate": 6.970717324316545e-06, "loss": 0.2666, "loss_nan_ranks": 0, "loss_rank_avg": 0.2712244689464569, "step": 5970 }, { "epoch": 5.281802120141343, "grad_norm": 0.6794562339782715, "learning_rate": 6.937325013662623e-06, "loss": 0.2322, "loss_nan_ranks": 0, "loss_rank_avg": 0.29143932461738586, "step": 5975 }, { "epoch": 5.286219081272085, "grad_norm": 0.6608504056930542, "learning_rate": 6.903996085722033e-06, "loss": 0.233, "loss_nan_ranks": 0, "loss_rank_avg": 0.20360919833183289, "step": 5980 }, { "epoch": 5.290636042402827, "grad_norm": 0.6932647824287415, "learning_rate": 6.8707307022134e-06, "loss": 0.2401, "loss_nan_ranks": 0, "loss_rank_avg": 0.2070590704679489, "step": 5985 }, { "epoch": 5.295053003533569, "grad_norm": 0.631776750087738, "learning_rate": 6.8375290245470296e-06, "loss": 0.2582, "loss_nan_ranks": 0, "loss_rank_avg": 0.2469078004360199, "step": 5990 }, { "epoch": 5.299469964664311, "grad_norm": 0.6876675486564636, "learning_rate": 6.804391213824087e-06, "loss": 0.2549, "loss_nan_ranks": 0, "loss_rank_avg": 0.2884756028652191, "step": 5995 }, { "epoch": 5.303886925795053, "grad_norm": 0.6492973566055298, "learning_rate": 6.771317430835888e-06, "loss": 0.2322, "loss_nan_ranks": 0, "loss_rank_avg": 0.1926867663860321, "step": 6000 }, { "epoch": 5.3083038869257955, "grad_norm": 0.6198075413703918, "learning_rate": 6.73830783606303e-06, "loss": 0.2568, "loss_nan_ranks": 0, "loss_rank_avg": 0.3009505867958069, "step": 6005 }, { "epoch": 5.3127208480565375, "grad_norm": 0.6509351134300232, "learning_rate": 6.705362589674667e-06, "loss": 0.2372, "loss_nan_ranks": 0, "loss_rank_avg": 0.22769448161125183, "step": 6010 }, { "epoch": 5.317137809187279, "grad_norm": 0.6019710302352905, "learning_rate": 6.6724818515277544e-06, "loss": 0.2299, "loss_nan_ranks": 0, "loss_rank_avg": 0.2702226936817169, "step": 6015 }, { "epoch": 5.321554770318021, "grad_norm": 0.6558071970939636, "learning_rate": 6.639665781166189e-06, "loss": 0.2595, "loss_nan_ranks": 0, "loss_rank_avg": 0.2750241160392761, "step": 6020 }, { "epoch": 5.325971731448763, "grad_norm": 0.5591468811035156, "learning_rate": 6.606914537820122e-06, "loss": 0.2572, "loss_nan_ranks": 0, "loss_rank_avg": 0.2233850359916687, "step": 6025 }, { "epoch": 5.330388692579505, "grad_norm": 0.6942284107208252, "learning_rate": 6.574228280405139e-06, "loss": 0.2525, "loss_nan_ranks": 0, "loss_rank_avg": 0.19524750113487244, "step": 6030 }, { "epoch": 5.334805653710247, "grad_norm": 0.6840284466743469, "learning_rate": 6.5416071675215136e-06, "loss": 0.233, "loss_nan_ranks": 0, "loss_rank_avg": 0.2810319662094116, "step": 6035 }, { "epoch": 5.339222614840989, "grad_norm": 0.6279734373092651, "learning_rate": 6.509051357453393e-06, "loss": 0.2251, "loss_nan_ranks": 0, "loss_rank_avg": 0.24808743596076965, "step": 6040 }, { "epoch": 5.343639575971731, "grad_norm": 0.6400632262229919, "learning_rate": 6.476561008168096e-06, "loss": 0.2538, "loss_nan_ranks": 0, "loss_rank_avg": 0.21455033123493195, "step": 6045 }, { "epoch": 5.348056537102473, "grad_norm": 0.6099967360496521, "learning_rate": 6.444136277315296e-06, "loss": 0.2336, "loss_nan_ranks": 0, "loss_rank_avg": 0.21934688091278076, "step": 6050 }, { "epoch": 5.352473498233215, "grad_norm": 0.9150162935256958, "learning_rate": 6.4117773222262805e-06, "loss": 0.2358, "loss_nan_ranks": 0, "loss_rank_avg": 0.17225447297096252, "step": 6055 }, { "epoch": 5.3568904593639575, "grad_norm": 0.631752073764801, "learning_rate": 6.379484299913172e-06, "loss": 0.2533, "loss_nan_ranks": 0, "loss_rank_avg": 0.23191042244434357, "step": 6060 }, { "epoch": 5.3613074204946995, "grad_norm": 0.6716257929801941, "learning_rate": 6.3472573670681805e-06, "loss": 0.2593, "loss_nan_ranks": 0, "loss_rank_avg": 0.3306858539581299, "step": 6065 }, { "epoch": 5.365724381625442, "grad_norm": 0.6031466126441956, "learning_rate": 6.315096680062838e-06, "loss": 0.2525, "loss_nan_ranks": 0, "loss_rank_avg": 0.2807907164096832, "step": 6070 }, { "epoch": 5.370141342756184, "grad_norm": 0.6290670037269592, "learning_rate": 6.283002394947216e-06, "loss": 0.2355, "loss_nan_ranks": 0, "loss_rank_avg": 0.22744987905025482, "step": 6075 }, { "epoch": 5.374558303886926, "grad_norm": 0.6004354357719421, "learning_rate": 6.2509746674492346e-06, "loss": 0.2552, "loss_nan_ranks": 0, "loss_rank_avg": 0.23005065321922302, "step": 6080 }, { "epoch": 5.378975265017668, "grad_norm": 0.6632710099220276, "learning_rate": 6.21901365297382e-06, "loss": 0.2316, "loss_nan_ranks": 0, "loss_rank_avg": 0.17470410466194153, "step": 6085 }, { "epoch": 5.38339222614841, "grad_norm": 0.6784233450889587, "learning_rate": 6.187119506602215e-06, "loss": 0.2751, "loss_nan_ranks": 0, "loss_rank_avg": 0.2649574875831604, "step": 6090 }, { "epoch": 5.387809187279152, "grad_norm": 0.6669394969940186, "learning_rate": 6.1552923830912e-06, "loss": 0.2403, "loss_nan_ranks": 0, "loss_rank_avg": 0.3093627393245697, "step": 6095 }, { "epoch": 5.392226148409894, "grad_norm": 0.6822030544281006, "learning_rate": 6.123532436872353e-06, "loss": 0.2475, "loss_nan_ranks": 0, "loss_rank_avg": 0.2727193236351013, "step": 6100 }, { "epoch": 5.396643109540636, "grad_norm": 0.6640558838844299, "learning_rate": 6.091839822051284e-06, "loss": 0.2906, "loss_nan_ranks": 0, "loss_rank_avg": 0.3642868995666504, "step": 6105 }, { "epoch": 5.401060070671378, "grad_norm": 0.6346195936203003, "learning_rate": 6.060214692406905e-06, "loss": 0.2527, "loss_nan_ranks": 0, "loss_rank_avg": 0.32739025354385376, "step": 6110 }, { "epoch": 5.40547703180212, "grad_norm": 0.662929356098175, "learning_rate": 6.028657201390682e-06, "loss": 0.224, "loss_nan_ranks": 0, "loss_rank_avg": 0.2002319097518921, "step": 6115 }, { "epoch": 5.409893992932862, "grad_norm": 0.6548178195953369, "learning_rate": 5.99716750212586e-06, "loss": 0.2456, "loss_nan_ranks": 0, "loss_rank_avg": 0.2806054651737213, "step": 6120 }, { "epoch": 5.4143109540636045, "grad_norm": 0.6379974484443665, "learning_rate": 5.965745747406775e-06, "loss": 0.2818, "loss_nan_ranks": 0, "loss_rank_avg": 0.3060104548931122, "step": 6125 }, { "epoch": 5.418727915194347, "grad_norm": 0.6715204119682312, "learning_rate": 5.934392089698064e-06, "loss": 0.2124, "loss_nan_ranks": 0, "loss_rank_avg": 0.16398535668849945, "step": 6130 }, { "epoch": 5.423144876325089, "grad_norm": 0.6135415434837341, "learning_rate": 5.903106681133952e-06, "loss": 0.2021, "loss_nan_ranks": 0, "loss_rank_avg": 0.19793318212032318, "step": 6135 }, { "epoch": 5.427561837455831, "grad_norm": 0.7642617225646973, "learning_rate": 5.871889673517501e-06, "loss": 0.2439, "loss_nan_ranks": 0, "loss_rank_avg": 0.24503038823604584, "step": 6140 }, { "epoch": 5.431978798586573, "grad_norm": 0.6476492881774902, "learning_rate": 5.840741218319881e-06, "loss": 0.2563, "loss_nan_ranks": 0, "loss_rank_avg": 0.23620697855949402, "step": 6145 }, { "epoch": 5.436395759717314, "grad_norm": 0.6400974988937378, "learning_rate": 5.809661466679635e-06, "loss": 0.246, "loss_nan_ranks": 0, "loss_rank_avg": 0.21778175234794617, "step": 6150 }, { "epoch": 5.440812720848056, "grad_norm": 0.637679398059845, "learning_rate": 5.778650569401922e-06, "loss": 0.2374, "loss_nan_ranks": 0, "loss_rank_avg": 0.2865186929702759, "step": 6155 }, { "epoch": 5.445229681978798, "grad_norm": 0.6278441548347473, "learning_rate": 5.747708676957844e-06, "loss": 0.2198, "loss_nan_ranks": 0, "loss_rank_avg": 0.22370074689388275, "step": 6160 }, { "epoch": 5.44964664310954, "grad_norm": 0.6846042275428772, "learning_rate": 5.716835939483641e-06, "loss": 0.2423, "loss_nan_ranks": 0, "loss_rank_avg": 0.18103906512260437, "step": 6165 }, { "epoch": 5.454063604240282, "grad_norm": 0.765556812286377, "learning_rate": 5.686032506780015e-06, "loss": 0.2228, "loss_nan_ranks": 0, "loss_rank_avg": 0.20441153645515442, "step": 6170 }, { "epoch": 5.458480565371024, "grad_norm": 0.7388072609901428, "learning_rate": 5.655298528311388e-06, "loss": 0.243, "loss_nan_ranks": 0, "loss_rank_avg": 0.210384339094162, "step": 6175 }, { "epoch": 5.4628975265017665, "grad_norm": 0.6503342390060425, "learning_rate": 5.624634153205178e-06, "loss": 0.2485, "loss_nan_ranks": 0, "loss_rank_avg": 0.23481428623199463, "step": 6180 }, { "epoch": 5.467314487632509, "grad_norm": 0.6476783752441406, "learning_rate": 5.594039530251065e-06, "loss": 0.2386, "loss_nan_ranks": 0, "loss_rank_avg": 0.2961850166320801, "step": 6185 }, { "epoch": 5.471731448763251, "grad_norm": 0.7560742497444153, "learning_rate": 5.563514807900285e-06, "loss": 0.2508, "loss_nan_ranks": 0, "loss_rank_avg": 0.2309841811656952, "step": 6190 }, { "epoch": 5.476148409893993, "grad_norm": 0.5617563128471375, "learning_rate": 5.533060134264907e-06, "loss": 0.2262, "loss_nan_ranks": 0, "loss_rank_avg": 0.1815638542175293, "step": 6195 }, { "epoch": 5.480565371024735, "grad_norm": 0.7148119211196899, "learning_rate": 5.5026756571170896e-06, "loss": 0.2373, "loss_nan_ranks": 0, "loss_rank_avg": 0.2046487033367157, "step": 6200 }, { "epoch": 5.484982332155477, "grad_norm": 1.1636927127838135, "learning_rate": 5.472361523888401e-06, "loss": 0.2203, "loss_nan_ranks": 0, "loss_rank_avg": 0.16061300039291382, "step": 6205 }, { "epoch": 5.489399293286219, "grad_norm": 0.682188093662262, "learning_rate": 5.442117881669085e-06, "loss": 0.2265, "loss_nan_ranks": 0, "loss_rank_avg": 0.19478756189346313, "step": 6210 }, { "epoch": 5.493816254416961, "grad_norm": 0.6677638292312622, "learning_rate": 5.411944877207347e-06, "loss": 0.2506, "loss_nan_ranks": 0, "loss_rank_avg": 0.2209673374891281, "step": 6215 }, { "epoch": 5.498233215547703, "grad_norm": 0.6800921559333801, "learning_rate": 5.38184265690864e-06, "loss": 0.2008, "loss_nan_ranks": 0, "loss_rank_avg": 0.1875065267086029, "step": 6220 }, { "epoch": 5.502650176678445, "grad_norm": 0.7293074131011963, "learning_rate": 5.3518113668349645e-06, "loss": 0.2397, "loss_nan_ranks": 0, "loss_rank_avg": 0.31664156913757324, "step": 6225 }, { "epoch": 5.507067137809187, "grad_norm": 0.6156737208366394, "learning_rate": 5.321851152704154e-06, "loss": 0.2621, "loss_nan_ranks": 0, "loss_rank_avg": 0.17080801725387573, "step": 6230 }, { "epoch": 5.511484098939929, "grad_norm": 0.6778889298439026, "learning_rate": 5.291962159889148e-06, "loss": 0.2454, "loss_nan_ranks": 0, "loss_rank_avg": 0.24503864347934723, "step": 6235 }, { "epoch": 5.5159010600706715, "grad_norm": 0.7297868728637695, "learning_rate": 5.262144533417344e-06, "loss": 0.2354, "loss_nan_ranks": 0, "loss_rank_avg": 0.2277633249759674, "step": 6240 }, { "epoch": 5.520318021201414, "grad_norm": 0.6664875745773315, "learning_rate": 5.232398417969815e-06, "loss": 0.2233, "loss_nan_ranks": 0, "loss_rank_avg": 0.21163435280323029, "step": 6245 }, { "epoch": 5.524734982332156, "grad_norm": 0.6079632639884949, "learning_rate": 5.2027239578806734e-06, "loss": 0.2322, "loss_nan_ranks": 0, "loss_rank_avg": 0.2564762830734253, "step": 6250 }, { "epoch": 5.529151943462898, "grad_norm": 0.6662228107452393, "learning_rate": 5.173121297136337e-06, "loss": 0.201, "loss_nan_ranks": 0, "loss_rank_avg": 0.21402853727340698, "step": 6255 }, { "epoch": 5.53356890459364, "grad_norm": 0.6768613457679749, "learning_rate": 5.14359057937484e-06, "loss": 0.2965, "loss_nan_ranks": 0, "loss_rank_avg": 0.2002478837966919, "step": 6260 }, { "epoch": 5.537985865724382, "grad_norm": 0.6485766172409058, "learning_rate": 5.114131947885137e-06, "loss": 0.2873, "loss_nan_ranks": 0, "loss_rank_avg": 0.3273354470729828, "step": 6265 }, { "epoch": 5.542402826855124, "grad_norm": 0.6236052513122559, "learning_rate": 5.084745545606402e-06, "loss": 0.2543, "loss_nan_ranks": 0, "loss_rank_avg": 0.17392995953559875, "step": 6270 }, { "epoch": 5.546819787985866, "grad_norm": 0.613959789276123, "learning_rate": 5.055431515127349e-06, "loss": 0.2463, "loss_nan_ranks": 0, "loss_rank_avg": 0.2359083592891693, "step": 6275 }, { "epoch": 5.551236749116608, "grad_norm": 0.6324638724327087, "learning_rate": 5.026189998685504e-06, "loss": 0.2449, "loss_nan_ranks": 0, "loss_rank_avg": 0.24177028238773346, "step": 6280 }, { "epoch": 5.55565371024735, "grad_norm": 0.6312211155891418, "learning_rate": 4.9970211381665665e-06, "loss": 0.275, "loss_nan_ranks": 0, "loss_rank_avg": 0.22367584705352783, "step": 6285 }, { "epoch": 5.560070671378092, "grad_norm": 0.9709349274635315, "learning_rate": 4.967925075103685e-06, "loss": 0.2554, "loss_nan_ranks": 0, "loss_rank_avg": 0.32471510767936707, "step": 6290 }, { "epoch": 5.564487632508834, "grad_norm": 0.6403810977935791, "learning_rate": 4.93890195067678e-06, "loss": 0.2563, "loss_nan_ranks": 0, "loss_rank_avg": 0.2835839092731476, "step": 6295 }, { "epoch": 5.5689045936395765, "grad_norm": 0.6615350246429443, "learning_rate": 4.909951905711858e-06, "loss": 0.2297, "loss_nan_ranks": 0, "loss_rank_avg": 0.1782667636871338, "step": 6300 }, { "epoch": 5.573321554770318, "grad_norm": 0.7057219743728638, "learning_rate": 4.881075080680335e-06, "loss": 0.2317, "loss_nan_ranks": 0, "loss_rank_avg": 0.28443485498428345, "step": 6305 }, { "epoch": 5.57773851590106, "grad_norm": 0.6871824264526367, "learning_rate": 4.852271615698349e-06, "loss": 0.2183, "loss_nan_ranks": 0, "loss_rank_avg": 0.28328755497932434, "step": 6310 }, { "epoch": 5.582155477031802, "grad_norm": 0.6732246279716492, "learning_rate": 4.823541650526058e-06, "loss": 0.2652, "loss_nan_ranks": 0, "loss_rank_avg": 0.24324092268943787, "step": 6315 }, { "epoch": 5.586572438162544, "grad_norm": 0.6424944400787354, "learning_rate": 4.7948853245670294e-06, "loss": 0.2128, "loss_nan_ranks": 0, "loss_rank_avg": 0.17290586233139038, "step": 6320 }, { "epoch": 5.590989399293286, "grad_norm": 0.659040093421936, "learning_rate": 4.7663027768674705e-06, "loss": 0.2051, "loss_nan_ranks": 0, "loss_rank_avg": 0.15980809926986694, "step": 6325 }, { "epoch": 5.595406360424028, "grad_norm": 0.6853231191635132, "learning_rate": 4.737794146115633e-06, "loss": 0.2225, "loss_nan_ranks": 0, "loss_rank_avg": 0.19948209822177887, "step": 6330 }, { "epoch": 5.59982332155477, "grad_norm": 0.651915431022644, "learning_rate": 4.7093595706410945e-06, "loss": 0.2627, "loss_nan_ranks": 0, "loss_rank_avg": 0.36641934514045715, "step": 6335 }, { "epoch": 5.604240282685512, "grad_norm": 0.6209613084793091, "learning_rate": 4.680999188414108e-06, "loss": 0.24, "loss_nan_ranks": 0, "loss_rank_avg": 0.20791277289390564, "step": 6340 }, { "epoch": 5.608657243816254, "grad_norm": 0.7123491764068604, "learning_rate": 4.652713137044927e-06, "loss": 0.2189, "loss_nan_ranks": 0, "loss_rank_avg": 0.25710418820381165, "step": 6345 }, { "epoch": 5.613074204946996, "grad_norm": 0.7246097922325134, "learning_rate": 4.624501553783127e-06, "loss": 0.2295, "loss_nan_ranks": 0, "loss_rank_avg": 0.23944400250911713, "step": 6350 }, { "epoch": 5.6174911660777385, "grad_norm": 0.7621732354164124, "learning_rate": 4.596364575516969e-06, "loss": 0.2256, "loss_nan_ranks": 0, "loss_rank_avg": 0.26072970032691956, "step": 6355 }, { "epoch": 5.6219081272084805, "grad_norm": 0.6371416449546814, "learning_rate": 4.568302338772688e-06, "loss": 0.2364, "loss_nan_ranks": 0, "loss_rank_avg": 0.21054017543792725, "step": 6360 }, { "epoch": 5.626325088339223, "grad_norm": 0.6732529401779175, "learning_rate": 4.540314979713876e-06, "loss": 0.2299, "loss_nan_ranks": 0, "loss_rank_avg": 0.19560235738754272, "step": 6365 }, { "epoch": 5.630742049469965, "grad_norm": 0.6369305849075317, "learning_rate": 4.512402634140804e-06, "loss": 0.2584, "loss_nan_ranks": 0, "loss_rank_avg": 0.2767673134803772, "step": 6370 }, { "epoch": 5.635159010600707, "grad_norm": 0.6594879627227783, "learning_rate": 4.484565437489759e-06, "loss": 0.2553, "loss_nan_ranks": 0, "loss_rank_avg": 0.27891114354133606, "step": 6375 }, { "epoch": 5.639575971731449, "grad_norm": 0.6730323433876038, "learning_rate": 4.456803524832389e-06, "loss": 0.2364, "loss_nan_ranks": 0, "loss_rank_avg": 0.2509545385837555, "step": 6380 }, { "epoch": 5.643992932862191, "grad_norm": 0.6865994334220886, "learning_rate": 4.429117030875052e-06, "loss": 0.2052, "loss_nan_ranks": 0, "loss_rank_avg": 0.1903909146785736, "step": 6385 }, { "epoch": 5.648409893992933, "grad_norm": 0.6434392333030701, "learning_rate": 4.401506089958161e-06, "loss": 0.2322, "loss_nan_ranks": 0, "loss_rank_avg": 0.29577142000198364, "step": 6390 }, { "epoch": 5.652826855123675, "grad_norm": 0.6849035620689392, "learning_rate": 4.37397083605551e-06, "loss": 0.2162, "loss_nan_ranks": 0, "loss_rank_avg": 0.2336483895778656, "step": 6395 }, { "epoch": 5.657243816254417, "grad_norm": 0.6565053462982178, "learning_rate": 4.346511402773688e-06, "loss": 0.2306, "loss_nan_ranks": 0, "loss_rank_avg": 0.23053717613220215, "step": 6400 }, { "epoch": 5.661660777385159, "grad_norm": 0.7413392066955566, "learning_rate": 4.319127923351339e-06, "loss": 0.2713, "loss_nan_ranks": 0, "loss_rank_avg": 0.27822649478912354, "step": 6405 }, { "epoch": 5.666077738515901, "grad_norm": 0.613976776599884, "learning_rate": 4.291820530658595e-06, "loss": 0.2549, "loss_nan_ranks": 0, "loss_rank_avg": 0.2500184178352356, "step": 6410 }, { "epoch": 5.670494699646643, "grad_norm": 0.6709519028663635, "learning_rate": 4.264589357196389e-06, "loss": 0.2342, "loss_nan_ranks": 0, "loss_rank_avg": 0.2182079702615738, "step": 6415 }, { "epoch": 5.6749116607773855, "grad_norm": 0.618645966053009, "learning_rate": 4.2374345350958256e-06, "loss": 0.2329, "loss_nan_ranks": 0, "loss_rank_avg": 0.16464778780937195, "step": 6420 }, { "epoch": 5.679328621908128, "grad_norm": 0.6145651936531067, "learning_rate": 4.2103561961175354e-06, "loss": 0.2168, "loss_nan_ranks": 0, "loss_rank_avg": 0.25252848863601685, "step": 6425 }, { "epoch": 5.683745583038869, "grad_norm": 0.7100237607955933, "learning_rate": 4.183354471651037e-06, "loss": 0.2357, "loss_nan_ranks": 0, "loss_rank_avg": 0.21529585123062134, "step": 6430 }, { "epoch": 5.688162544169611, "grad_norm": 0.6366299986839294, "learning_rate": 4.156429492714109e-06, "loss": 0.2213, "loss_nan_ranks": 0, "loss_rank_avg": 0.22616274654865265, "step": 6435 }, { "epoch": 5.692579505300353, "grad_norm": 0.6993550658226013, "learning_rate": 4.129581389952129e-06, "loss": 0.2259, "loss_nan_ranks": 0, "loss_rank_avg": 0.16478785872459412, "step": 6440 }, { "epoch": 5.696996466431095, "grad_norm": 0.6645349860191345, "learning_rate": 4.102810293637465e-06, "loss": 0.2262, "loss_nan_ranks": 0, "loss_rank_avg": 0.22474364936351776, "step": 6445 }, { "epoch": 5.701413427561837, "grad_norm": 0.6140891313552856, "learning_rate": 4.076116333668838e-06, "loss": 0.2337, "loss_nan_ranks": 0, "loss_rank_avg": 0.23928777873516083, "step": 6450 }, { "epoch": 5.705830388692579, "grad_norm": 0.6906691193580627, "learning_rate": 4.049499639570682e-06, "loss": 0.2503, "loss_nan_ranks": 0, "loss_rank_avg": 0.206419438123703, "step": 6455 }, { "epoch": 5.710247349823321, "grad_norm": 0.8086975812911987, "learning_rate": 4.022960340492525e-06, "loss": 0.2277, "loss_nan_ranks": 0, "loss_rank_avg": 0.2215789258480072, "step": 6460 }, { "epoch": 5.714664310954063, "grad_norm": 0.6342015266418457, "learning_rate": 3.996498565208358e-06, "loss": 0.2277, "loss_nan_ranks": 0, "loss_rank_avg": 0.20739787817001343, "step": 6465 }, { "epoch": 5.719081272084805, "grad_norm": 0.6628281474113464, "learning_rate": 3.970114442116013e-06, "loss": 0.1905, "loss_nan_ranks": 0, "loss_rank_avg": 0.18304401636123657, "step": 6470 }, { "epoch": 5.7234982332155475, "grad_norm": 0.6513307690620422, "learning_rate": 3.943808099236524e-06, "loss": 0.2257, "loss_nan_ranks": 0, "loss_rank_avg": 0.21980533003807068, "step": 6475 }, { "epoch": 5.72791519434629, "grad_norm": 0.6264829635620117, "learning_rate": 3.917579664213549e-06, "loss": 0.2471, "loss_nan_ranks": 0, "loss_rank_avg": 0.3289884328842163, "step": 6480 }, { "epoch": 5.732332155477032, "grad_norm": 0.6363914012908936, "learning_rate": 3.8914292643126915e-06, "loss": 0.225, "loss_nan_ranks": 0, "loss_rank_avg": 0.2795798182487488, "step": 6485 }, { "epoch": 5.736749116607774, "grad_norm": 0.693565309047699, "learning_rate": 3.865357026420926e-06, "loss": 0.2457, "loss_nan_ranks": 0, "loss_rank_avg": 0.2867644131183624, "step": 6490 }, { "epoch": 5.741166077738516, "grad_norm": 0.6560927629470825, "learning_rate": 3.839363077045974e-06, "loss": 0.229, "loss_nan_ranks": 0, "loss_rank_avg": 0.2569573223590851, "step": 6495 }, { "epoch": 5.745583038869258, "grad_norm": 0.6350461840629578, "learning_rate": 3.8134475423156757e-06, "loss": 0.2428, "loss_nan_ranks": 0, "loss_rank_avg": 0.2617478370666504, "step": 6500 }, { "epoch": 5.75, "grad_norm": 0.7265409827232361, "learning_rate": 3.787610547977396e-06, "loss": 0.2413, "loss_nan_ranks": 0, "loss_rank_avg": 0.18774326145648956, "step": 6505 }, { "epoch": 5.754416961130742, "grad_norm": 0.6530236005783081, "learning_rate": 3.7618522193973994e-06, "loss": 0.2527, "loss_nan_ranks": 0, "loss_rank_avg": 0.277637243270874, "step": 6510 }, { "epoch": 5.758833922261484, "grad_norm": 0.684080958366394, "learning_rate": 3.7361726815602596e-06, "loss": 0.2515, "loss_nan_ranks": 0, "loss_rank_avg": 0.2156839370727539, "step": 6515 }, { "epoch": 5.763250883392226, "grad_norm": 0.6697428822517395, "learning_rate": 3.710572059068218e-06, "loss": 0.2364, "loss_nan_ranks": 0, "loss_rank_avg": 0.26217517256736755, "step": 6520 }, { "epoch": 5.767667844522968, "grad_norm": 0.6842355132102966, "learning_rate": 3.6850504761406282e-06, "loss": 0.291, "loss_nan_ranks": 0, "loss_rank_avg": 0.34840089082717896, "step": 6525 }, { "epoch": 5.77208480565371, "grad_norm": 0.7177510261535645, "learning_rate": 3.6596080566133176e-06, "loss": 0.2566, "loss_nan_ranks": 0, "loss_rank_avg": 0.25881335139274597, "step": 6530 }, { "epoch": 5.7765017667844525, "grad_norm": 0.6908156275749207, "learning_rate": 3.6342449239379974e-06, "loss": 0.2514, "loss_nan_ranks": 0, "loss_rank_avg": 0.25273647904396057, "step": 6535 }, { "epoch": 5.780918727915195, "grad_norm": 0.6992905139923096, "learning_rate": 3.608961201181662e-06, "loss": 0.2722, "loss_nan_ranks": 0, "loss_rank_avg": 0.27057141065597534, "step": 6540 }, { "epoch": 5.785335689045937, "grad_norm": 0.708998441696167, "learning_rate": 3.5837570110259945e-06, "loss": 0.2274, "loss_nan_ranks": 0, "loss_rank_avg": 0.2736978530883789, "step": 6545 }, { "epoch": 5.789752650176679, "grad_norm": 0.5654464364051819, "learning_rate": 3.558632475766777e-06, "loss": 0.237, "loss_nan_ranks": 0, "loss_rank_avg": 0.23132240772247314, "step": 6550 }, { "epoch": 5.794169611307421, "grad_norm": 0.7154576182365417, "learning_rate": 3.5335877173132672e-06, "loss": 0.2623, "loss_nan_ranks": 0, "loss_rank_avg": 0.29507291316986084, "step": 6555 }, { "epoch": 5.798586572438163, "grad_norm": 0.6882935762405396, "learning_rate": 3.5086228571876622e-06, "loss": 0.2782, "loss_nan_ranks": 0, "loss_rank_avg": 0.2825637757778168, "step": 6560 }, { "epoch": 5.803003533568905, "grad_norm": 0.6030979156494141, "learning_rate": 3.4837380165244494e-06, "loss": 0.2596, "loss_nan_ranks": 0, "loss_rank_avg": 0.29193153977394104, "step": 6565 }, { "epoch": 5.807420494699647, "grad_norm": 0.6919524073600769, "learning_rate": 3.4589333160698592e-06, "loss": 0.2359, "loss_nan_ranks": 0, "loss_rank_avg": 0.2584839463233948, "step": 6570 }, { "epoch": 5.811837455830389, "grad_norm": 0.6700971722602844, "learning_rate": 3.434208876181262e-06, "loss": 0.244, "loss_nan_ranks": 0, "loss_rank_avg": 0.2972269654273987, "step": 6575 }, { "epoch": 5.816254416961131, "grad_norm": 0.6191151142120361, "learning_rate": 3.409564816826587e-06, "loss": 0.2667, "loss_nan_ranks": 0, "loss_rank_avg": 0.31485503911972046, "step": 6580 }, { "epoch": 5.820671378091872, "grad_norm": 0.627770185470581, "learning_rate": 3.385001257583744e-06, "loss": 0.2082, "loss_nan_ranks": 0, "loss_rank_avg": 0.27181103825569153, "step": 6585 }, { "epoch": 5.8250883392226145, "grad_norm": 0.6451102495193481, "learning_rate": 3.3605183176400402e-06, "loss": 0.2312, "loss_nan_ranks": 0, "loss_rank_avg": 0.3111085295677185, "step": 6590 }, { "epoch": 5.829505300353357, "grad_norm": 0.6402949094772339, "learning_rate": 3.3361161157916012e-06, "loss": 0.2148, "loss_nan_ranks": 0, "loss_rank_avg": 0.17799827456474304, "step": 6595 }, { "epoch": 5.833922261484099, "grad_norm": 0.6498696804046631, "learning_rate": 3.3117947704427866e-06, "loss": 0.2404, "loss_nan_ranks": 0, "loss_rank_avg": 0.2313275933265686, "step": 6600 }, { "epoch": 5.838339222614841, "grad_norm": 0.6034268736839294, "learning_rate": 3.287554399605637e-06, "loss": 0.2114, "loss_nan_ranks": 0, "loss_rank_avg": 0.18082204461097717, "step": 6605 }, { "epoch": 5.842756183745583, "grad_norm": 0.6362025141716003, "learning_rate": 3.2633951208992797e-06, "loss": 0.2358, "loss_nan_ranks": 0, "loss_rank_avg": 0.22728809714317322, "step": 6610 }, { "epoch": 5.847173144876325, "grad_norm": 0.6251878142356873, "learning_rate": 3.2393170515493756e-06, "loss": 0.2204, "loss_nan_ranks": 0, "loss_rank_avg": 0.24366095662117004, "step": 6615 }, { "epoch": 5.851590106007067, "grad_norm": 0.6515399813652039, "learning_rate": 3.2153203083875306e-06, "loss": 0.2544, "loss_nan_ranks": 0, "loss_rank_avg": 0.21778994798660278, "step": 6620 }, { "epoch": 5.856007067137809, "grad_norm": 0.8303614854812622, "learning_rate": 3.19140500785075e-06, "loss": 0.2382, "loss_nan_ranks": 0, "loss_rank_avg": 0.14264947175979614, "step": 6625 }, { "epoch": 5.860424028268551, "grad_norm": 0.6612645387649536, "learning_rate": 3.1675712659808576e-06, "loss": 0.2356, "loss_nan_ranks": 0, "loss_rank_avg": 0.22965016961097717, "step": 6630 }, { "epoch": 5.864840989399293, "grad_norm": 0.6375709176063538, "learning_rate": 3.1438191984239297e-06, "loss": 0.226, "loss_nan_ranks": 0, "loss_rank_avg": 0.24031785130500793, "step": 6635 }, { "epoch": 5.869257950530035, "grad_norm": 0.6527304649353027, "learning_rate": 3.1201489204297663e-06, "loss": 0.2465, "loss_nan_ranks": 0, "loss_rank_avg": 0.18436655402183533, "step": 6640 }, { "epoch": 5.873674911660777, "grad_norm": 0.6133697628974915, "learning_rate": 3.0965605468512837e-06, "loss": 0.2645, "loss_nan_ranks": 0, "loss_rank_avg": 0.2529999315738678, "step": 6645 }, { "epoch": 5.8780918727915195, "grad_norm": 0.6855648756027222, "learning_rate": 3.0730541921439936e-06, "loss": 0.2243, "loss_nan_ranks": 0, "loss_rank_avg": 0.2065194845199585, "step": 6650 }, { "epoch": 5.8825088339222615, "grad_norm": 0.7099367380142212, "learning_rate": 3.049629970365433e-06, "loss": 0.2309, "loss_nan_ranks": 0, "loss_rank_avg": 0.20094117522239685, "step": 6655 }, { "epoch": 5.886925795053004, "grad_norm": 0.6442746520042419, "learning_rate": 3.026287995174615e-06, "loss": 0.2356, "loss_nan_ranks": 0, "loss_rank_avg": 0.22805841267108917, "step": 6660 }, { "epoch": 5.891342756183746, "grad_norm": 0.6667559146881104, "learning_rate": 3.0030283798314785e-06, "loss": 0.2445, "loss_nan_ranks": 0, "loss_rank_avg": 0.19497643411159515, "step": 6665 }, { "epoch": 5.895759717314488, "grad_norm": 0.6932128071784973, "learning_rate": 2.9798512371963207e-06, "loss": 0.2535, "loss_nan_ranks": 0, "loss_rank_avg": 0.2080661952495575, "step": 6670 }, { "epoch": 5.90017667844523, "grad_norm": 0.6642143130302429, "learning_rate": 2.9567566797292914e-06, "loss": 0.2605, "loss_nan_ranks": 0, "loss_rank_avg": 0.26065370440483093, "step": 6675 }, { "epoch": 5.904593639575972, "grad_norm": 0.7001204490661621, "learning_rate": 2.9337448194897943e-06, "loss": 0.2583, "loss_nan_ranks": 0, "loss_rank_avg": 0.25018325448036194, "step": 6680 }, { "epoch": 5.909010600706714, "grad_norm": 0.6896331310272217, "learning_rate": 2.9108157681359837e-06, "loss": 0.2428, "loss_nan_ranks": 0, "loss_rank_avg": 0.317201167345047, "step": 6685 }, { "epoch": 5.913427561837456, "grad_norm": 0.6316036581993103, "learning_rate": 2.8879696369242062e-06, "loss": 0.2704, "loss_nan_ranks": 0, "loss_rank_avg": 0.2903493642807007, "step": 6690 }, { "epoch": 5.917844522968198, "grad_norm": 0.5617479681968689, "learning_rate": 2.8652065367084627e-06, "loss": 0.2479, "loss_nan_ranks": 0, "loss_rank_avg": 0.25086894631385803, "step": 6695 }, { "epoch": 5.92226148409894, "grad_norm": 0.6145947575569153, "learning_rate": 2.8425265779398704e-06, "loss": 0.251, "loss_nan_ranks": 0, "loss_rank_avg": 0.2544384002685547, "step": 6700 }, { "epoch": 5.926678445229682, "grad_norm": 0.718999981880188, "learning_rate": 2.819929870666129e-06, "loss": 0.2146, "loss_nan_ranks": 0, "loss_rank_avg": 0.155266672372818, "step": 6705 }, { "epoch": 5.9310954063604235, "grad_norm": 0.8380730748176575, "learning_rate": 2.7974165245309913e-06, "loss": 0.2275, "loss_nan_ranks": 0, "loss_rank_avg": 0.1611081063747406, "step": 6710 }, { "epoch": 5.935512367491166, "grad_norm": 0.7612493634223938, "learning_rate": 2.774986648773701e-06, "loss": 0.2487, "loss_nan_ranks": 0, "loss_rank_avg": 0.23470436036586761, "step": 6715 }, { "epoch": 5.939929328621908, "grad_norm": 0.6034536361694336, "learning_rate": 2.752640352228524e-06, "loss": 0.254, "loss_nan_ranks": 0, "loss_rank_avg": 0.2653493583202362, "step": 6720 }, { "epoch": 5.94434628975265, "grad_norm": 0.6654173731803894, "learning_rate": 2.7303777433241506e-06, "loss": 0.2321, "loss_nan_ranks": 0, "loss_rank_avg": 0.25490570068359375, "step": 6725 }, { "epoch": 5.948763250883392, "grad_norm": 0.6058556437492371, "learning_rate": 2.708198930083219e-06, "loss": 0.2504, "loss_nan_ranks": 0, "loss_rank_avg": 0.2901003956794739, "step": 6730 }, { "epoch": 5.953180212014134, "grad_norm": 0.6939293146133423, "learning_rate": 2.6861040201217692e-06, "loss": 0.2246, "loss_nan_ranks": 0, "loss_rank_avg": 0.260385125875473, "step": 6735 }, { "epoch": 5.957597173144876, "grad_norm": 0.6245858669281006, "learning_rate": 2.6640931206487252e-06, "loss": 0.2539, "loss_nan_ranks": 0, "loss_rank_avg": 0.25670942664146423, "step": 6740 }, { "epoch": 5.962014134275618, "grad_norm": 0.6986921429634094, "learning_rate": 2.642166338465384e-06, "loss": 0.2416, "loss_nan_ranks": 0, "loss_rank_avg": 0.21145933866500854, "step": 6745 }, { "epoch": 5.96643109540636, "grad_norm": 0.6841897964477539, "learning_rate": 2.6203237799648663e-06, "loss": 0.213, "loss_nan_ranks": 0, "loss_rank_avg": 0.18843623995780945, "step": 6750 }, { "epoch": 5.970848056537102, "grad_norm": 0.6969452500343323, "learning_rate": 2.598565551131653e-06, "loss": 0.2295, "loss_nan_ranks": 0, "loss_rank_avg": 0.3311152458190918, "step": 6755 }, { "epoch": 5.975265017667844, "grad_norm": 0.6374683380126953, "learning_rate": 2.5768917575410134e-06, "loss": 0.2332, "loss_nan_ranks": 0, "loss_rank_avg": 0.2040780782699585, "step": 6760 }, { "epoch": 5.979681978798586, "grad_norm": 0.7229990363121033, "learning_rate": 2.555302504358537e-06, "loss": 0.2225, "loss_nan_ranks": 0, "loss_rank_avg": 0.21656137704849243, "step": 6765 }, { "epoch": 5.9840989399293285, "grad_norm": 0.7268469333648682, "learning_rate": 2.5337978963396003e-06, "loss": 0.227, "loss_nan_ranks": 0, "loss_rank_avg": 0.1876240074634552, "step": 6770 }, { "epoch": 5.988515901060071, "grad_norm": 0.642284631729126, "learning_rate": 2.5123780378288642e-06, "loss": 0.2404, "loss_nan_ranks": 0, "loss_rank_avg": 0.24313439428806305, "step": 6775 }, { "epoch": 5.992932862190813, "grad_norm": 0.6617910265922546, "learning_rate": 2.49104303275977e-06, "loss": 0.2628, "loss_nan_ranks": 0, "loss_rank_avg": 0.25242412090301514, "step": 6780 }, { "epoch": 5.997349823321555, "grad_norm": 0.6622885465621948, "learning_rate": 2.4697929846540335e-06, "loss": 0.2371, "loss_nan_ranks": 0, "loss_rank_avg": 0.28016042709350586, "step": 6785 }, { "epoch": 6.002650176678445, "grad_norm": 0.6871181726455688, "learning_rate": 2.4486279966211425e-06, "loss": 0.2211, "loss_nan_ranks": 0, "loss_rank_avg": 0.21785372495651245, "step": 6790 }, { "epoch": 6.007067137809187, "grad_norm": 0.634680986404419, "learning_rate": 2.427548171357843e-06, "loss": 0.2488, "loss_nan_ranks": 0, "loss_rank_avg": 0.23458047211170197, "step": 6795 }, { "epoch": 6.011484098939929, "grad_norm": 0.7124053835868835, "learning_rate": 2.406553611147684e-06, "loss": 0.2631, "loss_nan_ranks": 0, "loss_rank_avg": 0.1963014453649521, "step": 6800 }, { "epoch": 6.0159010600706715, "grad_norm": 0.6288027763366699, "learning_rate": 2.38564441786046e-06, "loss": 0.2463, "loss_nan_ranks": 0, "loss_rank_avg": 0.22201281785964966, "step": 6805 }, { "epoch": 6.020318021201414, "grad_norm": 0.8096534013748169, "learning_rate": 2.364820692951766e-06, "loss": 0.2418, "loss_nan_ranks": 0, "loss_rank_avg": 0.2866779565811157, "step": 6810 }, { "epoch": 6.024734982332156, "grad_norm": 0.7279224395751953, "learning_rate": 2.3440825374624798e-06, "loss": 0.2504, "loss_nan_ranks": 0, "loss_rank_avg": 0.1597205102443695, "step": 6815 }, { "epoch": 6.029151943462898, "grad_norm": 0.6755273938179016, "learning_rate": 2.3234300520182873e-06, "loss": 0.2535, "loss_nan_ranks": 0, "loss_rank_avg": 0.2171267867088318, "step": 6820 }, { "epoch": 6.03356890459364, "grad_norm": 0.6738324761390686, "learning_rate": 2.3028633368291843e-06, "loss": 0.2408, "loss_nan_ranks": 0, "loss_rank_avg": 0.31061723828315735, "step": 6825 }, { "epoch": 6.037985865724382, "grad_norm": 0.6739371418952942, "learning_rate": 2.2823824916889724e-06, "loss": 0.2133, "loss_nan_ranks": 0, "loss_rank_avg": 0.19004075229167938, "step": 6830 }, { "epoch": 6.042402826855124, "grad_norm": 0.7035517692565918, "learning_rate": 2.261987615974832e-06, "loss": 0.2345, "loss_nan_ranks": 0, "loss_rank_avg": 0.29125821590423584, "step": 6835 }, { "epoch": 6.046819787985866, "grad_norm": 0.718480110168457, "learning_rate": 2.241678808646768e-06, "loss": 0.227, "loss_nan_ranks": 0, "loss_rank_avg": 0.21620148420333862, "step": 6840 }, { "epoch": 6.051236749116608, "grad_norm": 0.6759348511695862, "learning_rate": 2.2214561682471825e-06, "loss": 0.2263, "loss_nan_ranks": 0, "loss_rank_avg": 0.22231999039649963, "step": 6845 }, { "epoch": 6.05565371024735, "grad_norm": 0.6698014140129089, "learning_rate": 2.201319792900374e-06, "loss": 0.2305, "loss_nan_ranks": 0, "loss_rank_avg": 0.20235413312911987, "step": 6850 }, { "epoch": 6.060070671378092, "grad_norm": 0.6121541261672974, "learning_rate": 2.181269780312063e-06, "loss": 0.2177, "loss_nan_ranks": 0, "loss_rank_avg": 0.20862017571926117, "step": 6855 }, { "epoch": 6.0644876325088335, "grad_norm": 0.6756522059440613, "learning_rate": 2.1613062277689266e-06, "loss": 0.2205, "loss_nan_ranks": 0, "loss_rank_avg": 0.17809954285621643, "step": 6860 }, { "epoch": 6.068904593639576, "grad_norm": 0.7342451810836792, "learning_rate": 2.141429232138117e-06, "loss": 0.2207, "loss_nan_ranks": 0, "loss_rank_avg": 0.2191966474056244, "step": 6865 }, { "epoch": 6.073321554770318, "grad_norm": 0.7150316834449768, "learning_rate": 2.1216388898667973e-06, "loss": 0.2352, "loss_nan_ranks": 0, "loss_rank_avg": 0.23332220315933228, "step": 6870 }, { "epoch": 6.07773851590106, "grad_norm": 0.6240957379341125, "learning_rate": 2.1019352969816585e-06, "loss": 0.265, "loss_nan_ranks": 0, "loss_rank_avg": 0.24282801151275635, "step": 6875 }, { "epoch": 6.082155477031802, "grad_norm": 0.6066805720329285, "learning_rate": 2.082318549088491e-06, "loss": 0.2542, "loss_nan_ranks": 0, "loss_rank_avg": 0.26603370904922485, "step": 6880 }, { "epoch": 6.086572438162544, "grad_norm": 0.7509016990661621, "learning_rate": 2.062788741371673e-06, "loss": 0.2227, "loss_nan_ranks": 0, "loss_rank_avg": 0.3612693250179291, "step": 6885 }, { "epoch": 6.090989399293286, "grad_norm": 0.701538622379303, "learning_rate": 2.0433459685937395e-06, "loss": 0.2593, "loss_nan_ranks": 0, "loss_rank_avg": 0.318620890378952, "step": 6890 }, { "epoch": 6.095406360424028, "grad_norm": 0.6263982653617859, "learning_rate": 2.0239903250949176e-06, "loss": 0.2221, "loss_nan_ranks": 0, "loss_rank_avg": 0.23397664725780487, "step": 6895 }, { "epoch": 6.09982332155477, "grad_norm": 0.644121527671814, "learning_rate": 2.0047219047926614e-06, "loss": 0.2221, "loss_nan_ranks": 0, "loss_rank_avg": 0.22818706929683685, "step": 6900 }, { "epoch": 6.104240282685512, "grad_norm": 0.6542277336120605, "learning_rate": 1.9855408011812117e-06, "loss": 0.2456, "loss_nan_ranks": 0, "loss_rank_avg": 0.2627973258495331, "step": 6905 }, { "epoch": 6.108657243816254, "grad_norm": 0.6295666694641113, "learning_rate": 1.966447107331104e-06, "loss": 0.2238, "loss_nan_ranks": 0, "loss_rank_avg": 0.27365830540657043, "step": 6910 }, { "epoch": 6.113074204946996, "grad_norm": 0.6320794224739075, "learning_rate": 1.9474409158887807e-06, "loss": 0.233, "loss_nan_ranks": 0, "loss_rank_avg": 0.23829936981201172, "step": 6915 }, { "epoch": 6.1174911660777385, "grad_norm": 0.7099300622940063, "learning_rate": 1.9285223190760737e-06, "loss": 0.2178, "loss_nan_ranks": 0, "loss_rank_avg": 0.16763125360012054, "step": 6920 }, { "epoch": 6.1219081272084805, "grad_norm": 0.5849335193634033, "learning_rate": 1.9096914086898087e-06, "loss": 0.2098, "loss_nan_ranks": 0, "loss_rank_avg": 0.2682895064353943, "step": 6925 }, { "epoch": 6.126325088339223, "grad_norm": 0.5837305784225464, "learning_rate": 1.8909482761013254e-06, "loss": 0.2236, "loss_nan_ranks": 0, "loss_rank_avg": 0.20228040218353271, "step": 6930 }, { "epoch": 6.130742049469965, "grad_norm": 0.7193915247917175, "learning_rate": 1.872293012256059e-06, "loss": 0.2034, "loss_nan_ranks": 0, "loss_rank_avg": 0.24561667442321777, "step": 6935 }, { "epoch": 6.135159010600707, "grad_norm": 0.7574843168258667, "learning_rate": 1.853725707673082e-06, "loss": 0.2399, "loss_nan_ranks": 0, "loss_rank_avg": 0.26278358697891235, "step": 6940 }, { "epoch": 6.139575971731449, "grad_norm": 0.7833778262138367, "learning_rate": 1.8352464524446724e-06, "loss": 0.2054, "loss_nan_ranks": 0, "loss_rank_avg": 0.18190684914588928, "step": 6945 }, { "epoch": 6.143992932862191, "grad_norm": 0.6389424204826355, "learning_rate": 1.8168553362358787e-06, "loss": 0.2165, "loss_nan_ranks": 0, "loss_rank_avg": 0.228424072265625, "step": 6950 }, { "epoch": 6.148409893992933, "grad_norm": 0.7503507733345032, "learning_rate": 1.7985524482840676e-06, "loss": 0.2483, "loss_nan_ranks": 0, "loss_rank_avg": 0.3514091968536377, "step": 6955 }, { "epoch": 6.152826855123675, "grad_norm": 0.6644186973571777, "learning_rate": 1.7803378773985214e-06, "loss": 0.2445, "loss_nan_ranks": 0, "loss_rank_avg": 0.23452533781528473, "step": 6960 }, { "epoch": 6.157243816254417, "grad_norm": 0.6842535138130188, "learning_rate": 1.7622117119599802e-06, "loss": 0.2313, "loss_nan_ranks": 0, "loss_rank_avg": 0.2150776982307434, "step": 6965 }, { "epoch": 6.161660777385159, "grad_norm": 0.6151363849639893, "learning_rate": 1.74417403992023e-06, "loss": 0.2473, "loss_nan_ranks": 0, "loss_rank_avg": 0.25320905447006226, "step": 6970 }, { "epoch": 6.166077738515901, "grad_norm": 0.8280779719352722, "learning_rate": 1.7262249488016648e-06, "loss": 0.276, "loss_nan_ranks": 0, "loss_rank_avg": 0.3232913315296173, "step": 6975 }, { "epoch": 6.170494699646643, "grad_norm": 0.6878020167350769, "learning_rate": 1.708364525696864e-06, "loss": 0.2731, "loss_nan_ranks": 0, "loss_rank_avg": 0.3126068711280823, "step": 6980 }, { "epoch": 6.1749116607773855, "grad_norm": 0.7097288966178894, "learning_rate": 1.6905928572681806e-06, "loss": 0.2509, "loss_nan_ranks": 0, "loss_rank_avg": 0.3269100785255432, "step": 6985 }, { "epoch": 6.179328621908128, "grad_norm": 0.6415253281593323, "learning_rate": 1.6729100297472967e-06, "loss": 0.2125, "loss_nan_ranks": 0, "loss_rank_avg": 0.2540552020072937, "step": 6990 }, { "epoch": 6.18374558303887, "grad_norm": 0.6544218063354492, "learning_rate": 1.6553161289348429e-06, "loss": 0.2182, "loss_nan_ranks": 0, "loss_rank_avg": 0.21726730465888977, "step": 6995 }, { "epoch": 6.188162544169611, "grad_norm": 0.6272726655006409, "learning_rate": 1.637811240199938e-06, "loss": 0.2144, "loss_nan_ranks": 0, "loss_rank_avg": 0.27637243270874023, "step": 7000 }, { "epoch": 6.192579505300353, "grad_norm": 0.651709258556366, "learning_rate": 1.620395448479808e-06, "loss": 0.2198, "loss_nan_ranks": 0, "loss_rank_avg": 0.15154890716075897, "step": 7005 }, { "epoch": 6.196996466431095, "grad_norm": 0.6224712133407593, "learning_rate": 1.603068838279358e-06, "loss": 0.2222, "loss_nan_ranks": 0, "loss_rank_avg": 0.2673448920249939, "step": 7010 }, { "epoch": 6.201413427561837, "grad_norm": 0.6794725060462952, "learning_rate": 1.5858314936707731e-06, "loss": 0.2144, "loss_nan_ranks": 0, "loss_rank_avg": 0.26523715257644653, "step": 7015 }, { "epoch": 6.205830388692579, "grad_norm": 0.6605962514877319, "learning_rate": 1.5686834982930954e-06, "loss": 0.2489, "loss_nan_ranks": 0, "loss_rank_avg": 0.24990308284759521, "step": 7020 }, { "epoch": 6.210247349823321, "grad_norm": 0.6467421054840088, "learning_rate": 1.551624935351832e-06, "loss": 0.2235, "loss_nan_ranks": 0, "loss_rank_avg": 0.21919971704483032, "step": 7025 }, { "epoch": 6.214664310954063, "grad_norm": 0.6687191724777222, "learning_rate": 1.5346558876185459e-06, "loss": 0.2176, "loss_nan_ranks": 0, "loss_rank_avg": 0.19604460895061493, "step": 7030 }, { "epoch": 6.219081272084805, "grad_norm": 0.6332302689552307, "learning_rate": 1.5177764374304493e-06, "loss": 0.2437, "loss_nan_ranks": 0, "loss_rank_avg": 0.2982211112976074, "step": 7035 }, { "epoch": 6.2234982332155475, "grad_norm": 0.6573311686515808, "learning_rate": 1.500986666690012e-06, "loss": 0.241, "loss_nan_ranks": 0, "loss_rank_avg": 0.2335302233695984, "step": 7040 }, { "epoch": 6.22791519434629, "grad_norm": 0.6834275126457214, "learning_rate": 1.4842866568645642e-06, "loss": 0.2073, "loss_nan_ranks": 0, "loss_rank_avg": 0.2586025595664978, "step": 7045 }, { "epoch": 6.232332155477032, "grad_norm": 0.6670604944229126, "learning_rate": 1.4676764889858964e-06, "loss": 0.2075, "loss_nan_ranks": 0, "loss_rank_avg": 0.15618737041950226, "step": 7050 }, { "epoch": 6.236749116607774, "grad_norm": 0.6588166356086731, "learning_rate": 1.4511562436498671e-06, "loss": 0.2369, "loss_nan_ranks": 0, "loss_rank_avg": 0.2003191113471985, "step": 7055 }, { "epoch": 6.241166077738516, "grad_norm": 0.6686882972717285, "learning_rate": 1.4347260010160112e-06, "loss": 0.2507, "loss_nan_ranks": 0, "loss_rank_avg": 0.3045847713947296, "step": 7060 }, { "epoch": 6.245583038869258, "grad_norm": 0.8178129196166992, "learning_rate": 1.418385840807157e-06, "loss": 0.2332, "loss_nan_ranks": 0, "loss_rank_avg": 0.307248055934906, "step": 7065 }, { "epoch": 6.25, "grad_norm": 0.7019538283348083, "learning_rate": 1.402135842309027e-06, "loss": 0.2203, "loss_nan_ranks": 0, "loss_rank_avg": 0.20041900873184204, "step": 7070 }, { "epoch": 6.254416961130742, "grad_norm": 0.6783828735351562, "learning_rate": 1.3859760843698733e-06, "loss": 0.2481, "loss_nan_ranks": 0, "loss_rank_avg": 0.20858903229236603, "step": 7075 }, { "epoch": 6.258833922261484, "grad_norm": 0.7637086510658264, "learning_rate": 1.3699066454000698e-06, "loss": 0.2303, "loss_nan_ranks": 0, "loss_rank_avg": 0.2005981206893921, "step": 7080 }, { "epoch": 6.263250883392226, "grad_norm": 0.7075777649879456, "learning_rate": 1.353927603371754e-06, "loss": 0.2606, "loss_nan_ranks": 0, "loss_rank_avg": 0.28827959299087524, "step": 7085 }, { "epoch": 6.267667844522968, "grad_norm": 0.6434727311134338, "learning_rate": 1.3380390358184324e-06, "loss": 0.2137, "loss_nan_ranks": 0, "loss_rank_avg": 0.19321520626544952, "step": 7090 }, { "epoch": 6.27208480565371, "grad_norm": 0.7264281511306763, "learning_rate": 1.322241019834616e-06, "loss": 0.2577, "loss_nan_ranks": 0, "loss_rank_avg": 0.30105140805244446, "step": 7095 }, { "epoch": 6.2765017667844525, "grad_norm": 0.6986357569694519, "learning_rate": 1.3065336320754418e-06, "loss": 0.232, "loss_nan_ranks": 0, "loss_rank_avg": 0.2179412990808487, "step": 7100 }, { "epoch": 6.280918727915195, "grad_norm": 0.6690407991409302, "learning_rate": 1.2909169487562978e-06, "loss": 0.2367, "loss_nan_ranks": 0, "loss_rank_avg": 0.27286529541015625, "step": 7105 }, { "epoch": 6.285335689045937, "grad_norm": 0.6591483354568481, "learning_rate": 1.2753910456524588e-06, "loss": 0.2241, "loss_nan_ranks": 0, "loss_rank_avg": 0.22066080570220947, "step": 7110 }, { "epoch": 6.289752650176679, "grad_norm": 0.825552225112915, "learning_rate": 1.2599559980987076e-06, "loss": 0.2708, "loss_nan_ranks": 0, "loss_rank_avg": 0.22613100707530975, "step": 7115 }, { "epoch": 6.294169611307421, "grad_norm": 0.6642948985099792, "learning_rate": 1.2446118809889906e-06, "loss": 0.2394, "loss_nan_ranks": 0, "loss_rank_avg": 0.1839137226343155, "step": 7120 }, { "epoch": 6.298586572438163, "grad_norm": 0.7041632533073425, "learning_rate": 1.22935876877603e-06, "loss": 0.2177, "loss_nan_ranks": 0, "loss_rank_avg": 0.19626665115356445, "step": 7125 }, { "epoch": 6.303003533568905, "grad_norm": 0.7177551984786987, "learning_rate": 1.214196735470985e-06, "loss": 0.2486, "loss_nan_ranks": 0, "loss_rank_avg": 0.276611328125, "step": 7130 }, { "epoch": 6.307420494699647, "grad_norm": 0.6658399701118469, "learning_rate": 1.1991258546430683e-06, "loss": 0.2368, "loss_nan_ranks": 0, "loss_rank_avg": 0.27585369348526, "step": 7135 }, { "epoch": 6.311837455830389, "grad_norm": 0.6692659258842468, "learning_rate": 1.184146199419216e-06, "loss": 0.2728, "loss_nan_ranks": 0, "loss_rank_avg": 0.2560042142868042, "step": 7140 }, { "epoch": 6.316254416961131, "grad_norm": 0.6785102486610413, "learning_rate": 1.1692578424837131e-06, "loss": 0.232, "loss_nan_ranks": 0, "loss_rank_avg": 0.3469223976135254, "step": 7145 }, { "epoch": 6.320671378091872, "grad_norm": 0.6885204911231995, "learning_rate": 1.1544608560778392e-06, "loss": 0.198, "loss_nan_ranks": 0, "loss_rank_avg": 0.1730516254901886, "step": 7150 }, { "epoch": 6.3250883392226145, "grad_norm": 0.7053970694541931, "learning_rate": 1.139755311999544e-06, "loss": 0.2155, "loss_nan_ranks": 0, "loss_rank_avg": 0.22853153944015503, "step": 7155 }, { "epoch": 6.329505300353357, "grad_norm": 0.6633711457252502, "learning_rate": 1.1251412816030637e-06, "loss": 0.2489, "loss_nan_ranks": 0, "loss_rank_avg": 0.18317949771881104, "step": 7160 }, { "epoch": 6.333922261484099, "grad_norm": 0.6200980544090271, "learning_rate": 1.1106188357986003e-06, "loss": 0.2302, "loss_nan_ranks": 0, "loss_rank_avg": 0.28622856736183167, "step": 7165 }, { "epoch": 6.338339222614841, "grad_norm": 0.7030071020126343, "learning_rate": 1.096188045051969e-06, "loss": 0.2349, "loss_nan_ranks": 0, "loss_rank_avg": 0.20836327970027924, "step": 7170 }, { "epoch": 6.342756183745583, "grad_norm": 0.6545681953430176, "learning_rate": 1.0818489793842523e-06, "loss": 0.2205, "loss_nan_ranks": 0, "loss_rank_avg": 0.21316681802272797, "step": 7175 }, { "epoch": 6.347173144876325, "grad_norm": 0.626930296421051, "learning_rate": 1.0676017083714684e-06, "loss": 0.2523, "loss_nan_ranks": 0, "loss_rank_avg": 0.33314627408981323, "step": 7180 }, { "epoch": 6.351590106007067, "grad_norm": 0.6591570973396301, "learning_rate": 1.0534463011442276e-06, "loss": 0.2251, "loss_nan_ranks": 0, "loss_rank_avg": 0.2925190329551697, "step": 7185 }, { "epoch": 6.356007067137809, "grad_norm": 0.6768887639045715, "learning_rate": 1.0393828263873985e-06, "loss": 0.2692, "loss_nan_ranks": 0, "loss_rank_avg": 0.20045700669288635, "step": 7190 }, { "epoch": 6.360424028268551, "grad_norm": 0.6835484504699707, "learning_rate": 1.0254113523397736e-06, "loss": 0.2218, "loss_nan_ranks": 0, "loss_rank_avg": 0.1818992793560028, "step": 7195 }, { "epoch": 6.364840989399293, "grad_norm": 0.6533926725387573, "learning_rate": 1.0115319467937402e-06, "loss": 0.242, "loss_nan_ranks": 0, "loss_rank_avg": 0.2500065565109253, "step": 7200 }, { "epoch": 6.369257950530035, "grad_norm": 0.6944401860237122, "learning_rate": 9.977446770949562e-07, "loss": 0.2294, "loss_nan_ranks": 0, "loss_rank_avg": 0.23091307282447815, "step": 7205 }, { "epoch": 6.373674911660777, "grad_norm": 0.6814969778060913, "learning_rate": 9.840496101420106e-07, "loss": 0.2407, "loss_nan_ranks": 0, "loss_rank_avg": 0.20970004796981812, "step": 7210 }, { "epoch": 6.3780918727915195, "grad_norm": 0.6713853478431702, "learning_rate": 9.704468123861077e-07, "loss": 0.2477, "loss_nan_ranks": 0, "loss_rank_avg": 0.24036778509616852, "step": 7215 }, { "epoch": 6.3825088339222615, "grad_norm": 0.6768038868904114, "learning_rate": 9.569363498307482e-07, "loss": 0.239, "loss_nan_ranks": 0, "loss_rank_avg": 0.2124941051006317, "step": 7220 }, { "epoch": 6.386925795053004, "grad_norm": 0.7332238554954529, "learning_rate": 9.43518288031402e-07, "loss": 0.2365, "loss_nan_ranks": 0, "loss_rank_avg": 0.25171586871147156, "step": 7225 }, { "epoch": 6.391342756183746, "grad_norm": 0.7168375849723816, "learning_rate": 9.301926920951798e-07, "loss": 0.192, "loss_nan_ranks": 0, "loss_rank_avg": 0.18527813255786896, "step": 7230 }, { "epoch": 6.395759717314488, "grad_norm": 0.7241307497024536, "learning_rate": 9.169596266805536e-07, "loss": 0.2508, "loss_nan_ranks": 0, "loss_rank_avg": 0.25133055448532104, "step": 7235 }, { "epoch": 6.40017667844523, "grad_norm": 0.6914727091789246, "learning_rate": 9.038191559969967e-07, "loss": 0.2597, "loss_nan_ranks": 0, "loss_rank_avg": 0.2576597332954407, "step": 7240 }, { "epoch": 6.404593639575972, "grad_norm": 0.6564986705780029, "learning_rate": 8.907713438047039e-07, "loss": 0.2343, "loss_nan_ranks": 0, "loss_rank_avg": 0.28039878606796265, "step": 7245 }, { "epoch": 6.409010600706714, "grad_norm": 0.690380334854126, "learning_rate": 8.77816253414272e-07, "loss": 0.2845, "loss_nan_ranks": 0, "loss_rank_avg": 0.19468779861927032, "step": 7250 }, { "epoch": 6.413427561837456, "grad_norm": 0.6827090978622437, "learning_rate": 8.649539476863933e-07, "loss": 0.1907, "loss_nan_ranks": 0, "loss_rank_avg": 0.1751917153596878, "step": 7255 }, { "epoch": 6.417844522968198, "grad_norm": 0.7905651926994324, "learning_rate": 8.521844890315489e-07, "loss": 0.2369, "loss_nan_ranks": 0, "loss_rank_avg": 0.22601011395454407, "step": 7260 }, { "epoch": 6.42226148409894, "grad_norm": 0.7389162182807922, "learning_rate": 8.395079394097072e-07, "loss": 0.2287, "loss_nan_ranks": 0, "loss_rank_avg": 0.18410909175872803, "step": 7265 }, { "epoch": 6.426678445229682, "grad_norm": 0.7305995225906372, "learning_rate": 8.269243603300259e-07, "loss": 0.2387, "loss_nan_ranks": 0, "loss_rank_avg": 0.22303178906440735, "step": 7270 }, { "epoch": 6.431095406360424, "grad_norm": 0.7176210284233093, "learning_rate": 8.144338128505458e-07, "loss": 0.2216, "loss_nan_ranks": 0, "loss_rank_avg": 0.2397802472114563, "step": 7275 }, { "epoch": 6.435512367491166, "grad_norm": 0.6935552954673767, "learning_rate": 8.020363575779044e-07, "loss": 0.2516, "loss_nan_ranks": 0, "loss_rank_avg": 0.2560634911060333, "step": 7280 }, { "epoch": 6.439929328621908, "grad_norm": 0.6029765009880066, "learning_rate": 7.897320546670362e-07, "loss": 0.2133, "loss_nan_ranks": 0, "loss_rank_avg": 0.1827746331691742, "step": 7285 }, { "epoch": 6.44434628975265, "grad_norm": 0.6512175798416138, "learning_rate": 7.775209638208814e-07, "loss": 0.2498, "loss_nan_ranks": 0, "loss_rank_avg": 0.3437669277191162, "step": 7290 }, { "epoch": 6.448763250883392, "grad_norm": 0.8887504935264587, "learning_rate": 7.654031442900978e-07, "loss": 0.1744, "loss_nan_ranks": 0, "loss_rank_avg": 0.12437689304351807, "step": 7295 }, { "epoch": 6.453180212014134, "grad_norm": 0.776211678981781, "learning_rate": 7.533786548727695e-07, "loss": 0.2215, "loss_nan_ranks": 0, "loss_rank_avg": 0.26376765966415405, "step": 7300 }, { "epoch": 6.457597173144876, "grad_norm": 0.6631959080696106, "learning_rate": 7.414475539141275e-07, "loss": 0.2707, "loss_nan_ranks": 0, "loss_rank_avg": 0.23870348930358887, "step": 7305 }, { "epoch": 6.462014134275618, "grad_norm": 0.6867117285728455, "learning_rate": 7.296098993062562e-07, "loss": 0.1844, "loss_nan_ranks": 0, "loss_rank_avg": 0.15025901794433594, "step": 7310 }, { "epoch": 6.46643109540636, "grad_norm": 0.6691762208938599, "learning_rate": 7.178657484878338e-07, "loss": 0.257, "loss_nan_ranks": 0, "loss_rank_avg": 0.25004538893699646, "step": 7315 }, { "epoch": 6.470848056537102, "grad_norm": 0.5934985876083374, "learning_rate": 7.062151584438215e-07, "loss": 0.2236, "loss_nan_ranks": 0, "loss_rank_avg": 0.2773967385292053, "step": 7320 }, { "epoch": 6.475265017667844, "grad_norm": 0.6796112656593323, "learning_rate": 6.946581857052192e-07, "loss": 0.2281, "loss_nan_ranks": 0, "loss_rank_avg": 0.2649118900299072, "step": 7325 }, { "epoch": 6.479681978798586, "grad_norm": 0.6687401533126831, "learning_rate": 6.831948863487703e-07, "loss": 0.2464, "loss_nan_ranks": 0, "loss_rank_avg": 0.24808388948440552, "step": 7330 }, { "epoch": 6.4840989399293285, "grad_norm": 0.7123255133628845, "learning_rate": 6.71825315996697e-07, "loss": 0.225, "loss_nan_ranks": 0, "loss_rank_avg": 0.28177204728126526, "step": 7335 }, { "epoch": 6.488515901060071, "grad_norm": 0.6031168103218079, "learning_rate": 6.605495298164299e-07, "loss": 0.2491, "loss_nan_ranks": 0, "loss_rank_avg": 0.21055057644844055, "step": 7340 }, { "epoch": 6.492932862190813, "grad_norm": 0.5988543033599854, "learning_rate": 6.493675825203416e-07, "loss": 0.1915, "loss_nan_ranks": 0, "loss_rank_avg": 0.18854458630084991, "step": 7345 }, { "epoch": 6.497349823321555, "grad_norm": 0.6946279406547546, "learning_rate": 6.382795283654796e-07, "loss": 0.2077, "loss_nan_ranks": 0, "loss_rank_avg": 0.2246820479631424, "step": 7350 }, { "epoch": 6.501766784452297, "grad_norm": 0.7605702877044678, "learning_rate": 6.272854211532964e-07, "loss": 0.2322, "loss_nan_ranks": 0, "loss_rank_avg": 0.16712304949760437, "step": 7355 }, { "epoch": 6.506183745583039, "grad_norm": 0.7368548512458801, "learning_rate": 6.163853142294041e-07, "loss": 0.2214, "loss_nan_ranks": 0, "loss_rank_avg": 0.1958334743976593, "step": 7360 }, { "epoch": 6.510600706713781, "grad_norm": 0.6829209923744202, "learning_rate": 6.055792604833022e-07, "loss": 0.2615, "loss_nan_ranks": 0, "loss_rank_avg": 0.2502579689025879, "step": 7365 }, { "epoch": 6.515017667844523, "grad_norm": 0.6418865323066711, "learning_rate": 5.948673123481286e-07, "loss": 0.249, "loss_nan_ranks": 0, "loss_rank_avg": 0.24650785326957703, "step": 7370 }, { "epoch": 6.519434628975265, "grad_norm": 0.6992244124412537, "learning_rate": 5.842495218003952e-07, "loss": 0.2199, "loss_nan_ranks": 0, "loss_rank_avg": 0.25637686252593994, "step": 7375 }, { "epoch": 6.523851590106007, "grad_norm": 0.6708112955093384, "learning_rate": 5.737259403597484e-07, "loss": 0.2334, "loss_nan_ranks": 0, "loss_rank_avg": 0.274458646774292, "step": 7380 }, { "epoch": 6.528268551236749, "grad_norm": 0.6802352070808411, "learning_rate": 5.632966190887157e-07, "loss": 0.2038, "loss_nan_ranks": 0, "loss_rank_avg": 0.2165306955575943, "step": 7385 }, { "epoch": 6.532685512367491, "grad_norm": 0.6572979688644409, "learning_rate": 5.529616085924439e-07, "loss": 0.233, "loss_nan_ranks": 0, "loss_rank_avg": 0.2785220146179199, "step": 7390 }, { "epoch": 6.5371024734982335, "grad_norm": 0.702512264251709, "learning_rate": 5.42720959018479e-07, "loss": 0.268, "loss_nan_ranks": 0, "loss_rank_avg": 0.32956579327583313, "step": 7395 }, { "epoch": 6.541519434628976, "grad_norm": 0.6240583658218384, "learning_rate": 5.325747200564979e-07, "loss": 0.2227, "loss_nan_ranks": 0, "loss_rank_avg": 0.28135302662849426, "step": 7400 }, { "epoch": 6.545936395759718, "grad_norm": 0.7530537247657776, "learning_rate": 5.225229409380839e-07, "loss": 0.2127, "loss_nan_ranks": 0, "loss_rank_avg": 0.2570284605026245, "step": 7405 }, { "epoch": 6.55035335689046, "grad_norm": 0.6967406868934631, "learning_rate": 5.125656704364801e-07, "loss": 0.2077, "loss_nan_ranks": 0, "loss_rank_avg": 0.1789890080690384, "step": 7410 }, { "epoch": 6.554770318021202, "grad_norm": 0.6931060552597046, "learning_rate": 5.027029568663566e-07, "loss": 0.2762, "loss_nan_ranks": 0, "loss_rank_avg": 0.2260131686925888, "step": 7415 }, { "epoch": 6.559187279151944, "grad_norm": 0.7548974752426147, "learning_rate": 4.929348480835749e-07, "loss": 0.212, "loss_nan_ranks": 0, "loss_rank_avg": 0.18322241306304932, "step": 7420 }, { "epoch": 6.563604240282686, "grad_norm": 0.6624125242233276, "learning_rate": 4.832613914849504e-07, "loss": 0.2257, "loss_nan_ranks": 0, "loss_rank_avg": 0.20556357502937317, "step": 7425 }, { "epoch": 6.568021201413428, "grad_norm": 0.7263450622558594, "learning_rate": 4.7368263400803693e-07, "loss": 0.2179, "loss_nan_ranks": 0, "loss_rank_avg": 0.22019252181053162, "step": 7430 }, { "epoch": 6.572438162544169, "grad_norm": 0.6411981582641602, "learning_rate": 4.6419862213087365e-07, "loss": 0.221, "loss_nan_ranks": 0, "loss_rank_avg": 0.2418275624513626, "step": 7435 }, { "epoch": 6.576855123674911, "grad_norm": 0.652529776096344, "learning_rate": 4.548094018717919e-07, "loss": 0.2338, "loss_nan_ranks": 0, "loss_rank_avg": 0.24279262125492096, "step": 7440 }, { "epoch": 6.581272084805653, "grad_norm": 0.7010001540184021, "learning_rate": 4.4551501878916214e-07, "loss": 0.2489, "loss_nan_ranks": 0, "loss_rank_avg": 0.38378775119781494, "step": 7445 }, { "epoch": 6.5856890459363955, "grad_norm": 0.6368486881256104, "learning_rate": 4.363155179811962e-07, "loss": 0.1945, "loss_nan_ranks": 0, "loss_rank_avg": 0.18857476115226746, "step": 7450 }, { "epoch": 6.590106007067138, "grad_norm": 0.7825067043304443, "learning_rate": 4.2721094408570974e-07, "loss": 0.2113, "loss_nan_ranks": 0, "loss_rank_avg": 0.2131972312927246, "step": 7455 }, { "epoch": 6.59452296819788, "grad_norm": 0.7557323575019836, "learning_rate": 4.1820134127991794e-07, "loss": 0.2024, "loss_nan_ranks": 0, "loss_rank_avg": 0.18626517057418823, "step": 7460 }, { "epoch": 6.598939929328622, "grad_norm": 0.6673694849014282, "learning_rate": 4.0928675328022027e-07, "loss": 0.2569, "loss_nan_ranks": 0, "loss_rank_avg": 0.25416773557662964, "step": 7465 }, { "epoch": 6.603356890459364, "grad_norm": 0.7011893391609192, "learning_rate": 4.0046722334197375e-07, "loss": 0.2523, "loss_nan_ranks": 0, "loss_rank_avg": 0.3195936977863312, "step": 7470 }, { "epoch": 6.607773851590106, "grad_norm": 0.8319687247276306, "learning_rate": 3.9174279425931105e-07, "loss": 0.2493, "loss_nan_ranks": 0, "loss_rank_avg": 0.3487233519554138, "step": 7475 }, { "epoch": 6.612190812720848, "grad_norm": 0.7716171145439148, "learning_rate": 3.8311350836490514e-07, "loss": 0.2072, "loss_nan_ranks": 0, "loss_rank_avg": 0.17440339922904968, "step": 7480 }, { "epoch": 6.61660777385159, "grad_norm": 0.7053916454315186, "learning_rate": 3.7457940752977594e-07, "loss": 0.2245, "loss_nan_ranks": 0, "loss_rank_avg": 0.21849893033504486, "step": 7485 }, { "epoch": 6.621024734982332, "grad_norm": 0.7948006987571716, "learning_rate": 3.6614053316309074e-07, "loss": 0.2484, "loss_nan_ranks": 0, "loss_rank_avg": 0.26136571168899536, "step": 7490 }, { "epoch": 6.625441696113074, "grad_norm": 0.6417169570922852, "learning_rate": 3.577969262119574e-07, "loss": 0.2471, "loss_nan_ranks": 0, "loss_rank_avg": 0.26517075300216675, "step": 7495 }, { "epoch": 6.629858657243816, "grad_norm": 0.7573156952857971, "learning_rate": 3.4954862716122473e-07, "loss": 0.245, "loss_nan_ranks": 0, "loss_rank_avg": 0.16177725791931152, "step": 7500 }, { "epoch": 6.634275618374558, "grad_norm": 0.6673710942268372, "learning_rate": 3.413956760332937e-07, "loss": 0.2486, "loss_nan_ranks": 0, "loss_rank_avg": 0.28934532403945923, "step": 7505 }, { "epoch": 6.6386925795053005, "grad_norm": 0.7154965400695801, "learning_rate": 3.3333811238791316e-07, "loss": 0.2397, "loss_nan_ranks": 0, "loss_rank_avg": 0.19894251227378845, "step": 7510 }, { "epoch": 6.6431095406360425, "grad_norm": 0.8006401658058167, "learning_rate": 3.2537597532199315e-07, "loss": 0.2319, "loss_nan_ranks": 0, "loss_rank_avg": 0.23368564248085022, "step": 7515 }, { "epoch": 6.647526501766785, "grad_norm": 0.6231130361557007, "learning_rate": 3.175093034694188e-07, "loss": 0.2462, "loss_nan_ranks": 0, "loss_rank_avg": 0.19901973009109497, "step": 7520 }, { "epoch": 6.651943462897527, "grad_norm": 0.6645359396934509, "learning_rate": 3.0973813500085215e-07, "loss": 0.2524, "loss_nan_ranks": 0, "loss_rank_avg": 0.23790386319160461, "step": 7525 }, { "epoch": 6.656360424028269, "grad_norm": 0.7502444386482239, "learning_rate": 3.0206250762356393e-07, "loss": 0.2229, "loss_nan_ranks": 0, "loss_rank_avg": 0.22129979729652405, "step": 7530 }, { "epoch": 6.660777385159011, "grad_norm": 0.6512811183929443, "learning_rate": 2.944824585812289e-07, "loss": 0.253, "loss_nan_ranks": 0, "loss_rank_avg": 0.24692480266094208, "step": 7535 }, { "epoch": 6.665194346289753, "grad_norm": 0.6338093876838684, "learning_rate": 2.86998024653764e-07, "loss": 0.2605, "loss_nan_ranks": 0, "loss_rank_avg": 0.21317501366138458, "step": 7540 }, { "epoch": 6.669611307420495, "grad_norm": 0.7195187211036682, "learning_rate": 2.7960924215714394e-07, "loss": 0.2675, "loss_nan_ranks": 0, "loss_rank_avg": 0.21621374785900116, "step": 7545 }, { "epoch": 6.674028268551237, "grad_norm": 0.750506579875946, "learning_rate": 2.723161469432123e-07, "loss": 0.1965, "loss_nan_ranks": 0, "loss_rank_avg": 0.15070626139640808, "step": 7550 }, { "epoch": 6.678445229681979, "grad_norm": 0.6777002811431885, "learning_rate": 2.6511877439953536e-07, "loss": 0.2459, "loss_nan_ranks": 0, "loss_rank_avg": 0.2585234045982361, "step": 7555 }, { "epoch": 6.68286219081272, "grad_norm": 0.6883443593978882, "learning_rate": 2.5801715944919983e-07, "loss": 0.2413, "loss_nan_ranks": 0, "loss_rank_avg": 0.20979022979736328, "step": 7560 }, { "epoch": 6.6872791519434625, "grad_norm": 1.0622401237487793, "learning_rate": 2.510113365506639e-07, "loss": 0.2139, "loss_nan_ranks": 0, "loss_rank_avg": 0.2046893984079361, "step": 7565 }, { "epoch": 6.6916961130742045, "grad_norm": 0.6978748440742493, "learning_rate": 2.441013396975822e-07, "loss": 0.2604, "loss_nan_ranks": 0, "loss_rank_avg": 0.24113841354846954, "step": 7570 }, { "epoch": 6.696113074204947, "grad_norm": 0.7417298555374146, "learning_rate": 2.3728720241864123e-07, "loss": 0.2582, "loss_nan_ranks": 0, "loss_rank_avg": 0.2803192436695099, "step": 7575 }, { "epoch": 6.700530035335689, "grad_norm": 0.6066814661026001, "learning_rate": 2.3056895777740174e-07, "loss": 0.2462, "loss_nan_ranks": 0, "loss_rank_avg": 0.2779116630554199, "step": 7580 }, { "epoch": 6.704946996466431, "grad_norm": 0.7119974493980408, "learning_rate": 2.2394663837213005e-07, "loss": 0.226, "loss_nan_ranks": 0, "loss_rank_avg": 0.18332818150520325, "step": 7585 }, { "epoch": 6.709363957597173, "grad_norm": 0.6575401425361633, "learning_rate": 2.1742027633564477e-07, "loss": 0.2058, "loss_nan_ranks": 0, "loss_rank_avg": 0.22709758579730988, "step": 7590 }, { "epoch": 6.713780918727915, "grad_norm": 0.6911765933036804, "learning_rate": 2.1098990333516144e-07, "loss": 0.2245, "loss_nan_ranks": 0, "loss_rank_avg": 0.1879327893257141, "step": 7595 }, { "epoch": 6.718197879858657, "grad_norm": 0.6780998110771179, "learning_rate": 2.0465555057213705e-07, "loss": 0.2762, "loss_nan_ranks": 0, "loss_rank_avg": 0.2876776456832886, "step": 7600 }, { "epoch": 6.722614840989399, "grad_norm": 0.6587400436401367, "learning_rate": 1.9841724878211676e-07, "loss": 0.2409, "loss_nan_ranks": 0, "loss_rank_avg": 0.2115318775177002, "step": 7605 }, { "epoch": 6.727031802120141, "grad_norm": 0.7173461318016052, "learning_rate": 1.9227502823458976e-07, "loss": 0.2223, "loss_nan_ranks": 0, "loss_rank_avg": 0.23369640111923218, "step": 7610 }, { "epoch": 6.731448763250883, "grad_norm": 1.5791888236999512, "learning_rate": 1.8622891873284254e-07, "loss": 0.2322, "loss_nan_ranks": 0, "loss_rank_avg": 0.1805000901222229, "step": 7615 }, { "epoch": 6.735865724381625, "grad_norm": 0.720446765422821, "learning_rate": 1.8027894961380353e-07, "loss": 0.237, "loss_nan_ranks": 0, "loss_rank_avg": 0.25251534581184387, "step": 7620 }, { "epoch": 6.740282685512367, "grad_norm": 0.8006609082221985, "learning_rate": 1.7442514974792103e-07, "loss": 0.2704, "loss_nan_ranks": 0, "loss_rank_avg": 0.23466309905052185, "step": 7625 }, { "epoch": 6.7446996466431095, "grad_norm": 0.6387749910354614, "learning_rate": 1.6866754753899429e-07, "loss": 0.2293, "loss_nan_ranks": 0, "loss_rank_avg": 0.29380500316619873, "step": 7630 }, { "epoch": 6.749116607773852, "grad_norm": 0.6837402582168579, "learning_rate": 1.6300617092406933e-07, "loss": 0.2852, "loss_nan_ranks": 0, "loss_rank_avg": 0.24347926676273346, "step": 7635 }, { "epoch": 6.753533568904594, "grad_norm": 0.6359254717826843, "learning_rate": 1.5744104737327458e-07, "loss": 0.2428, "loss_nan_ranks": 0, "loss_rank_avg": 0.2921152710914612, "step": 7640 }, { "epoch": 6.757950530035336, "grad_norm": 0.7265500426292419, "learning_rate": 1.5197220388970313e-07, "loss": 0.2053, "loss_nan_ranks": 0, "loss_rank_avg": 0.1532514989376068, "step": 7645 }, { "epoch": 6.762367491166078, "grad_norm": 0.6616162061691284, "learning_rate": 1.4659966700927952e-07, "loss": 0.2316, "loss_nan_ranks": 0, "loss_rank_avg": 0.23041373491287231, "step": 7650 }, { "epoch": 6.76678445229682, "grad_norm": 0.6630392670631409, "learning_rate": 1.413234628006288e-07, "loss": 0.2372, "loss_nan_ranks": 0, "loss_rank_avg": 0.2294270396232605, "step": 7655 }, { "epoch": 6.771201413427562, "grad_norm": 0.6096079349517822, "learning_rate": 1.3614361686494549e-07, "loss": 0.2375, "loss_nan_ranks": 0, "loss_rank_avg": 0.2563217282295227, "step": 7660 }, { "epoch": 6.775618374558304, "grad_norm": 0.6756962537765503, "learning_rate": 1.310601543358847e-07, "loss": 0.2402, "loss_nan_ranks": 0, "loss_rank_avg": 0.18047744035720825, "step": 7665 }, { "epoch": 6.780035335689046, "grad_norm": 0.855846107006073, "learning_rate": 1.260730998794202e-07, "loss": 0.2261, "loss_nan_ranks": 0, "loss_rank_avg": 0.21389582753181458, "step": 7670 }, { "epoch": 6.784452296819788, "grad_norm": 0.64863520860672, "learning_rate": 1.2118247769373758e-07, "loss": 0.2558, "loss_nan_ranks": 0, "loss_rank_avg": 0.3245598375797272, "step": 7675 }, { "epoch": 6.78886925795053, "grad_norm": 0.6336214542388916, "learning_rate": 1.163883115091169e-07, "loss": 0.2173, "loss_nan_ranks": 0, "loss_rank_avg": 0.2048630714416504, "step": 7680 }, { "epoch": 6.793286219081272, "grad_norm": 0.7539443373680115, "learning_rate": 1.1169062458781022e-07, "loss": 0.2131, "loss_nan_ranks": 0, "loss_rank_avg": 0.18259122967720032, "step": 7685 }, { "epoch": 6.7977031802120145, "grad_norm": 0.7678613662719727, "learning_rate": 1.0708943972393748e-07, "loss": 0.2261, "loss_nan_ranks": 0, "loss_rank_avg": 0.23812338709831238, "step": 7690 }, { "epoch": 6.8021201413427566, "grad_norm": 0.6185624003410339, "learning_rate": 1.025847792433643e-07, "loss": 0.2611, "loss_nan_ranks": 0, "loss_rank_avg": 0.2555660903453827, "step": 7695 }, { "epoch": 6.806537102473499, "grad_norm": 0.6969472169876099, "learning_rate": 9.817666500360867e-08, "loss": 0.2362, "loss_nan_ranks": 0, "loss_rank_avg": 0.20245115458965302, "step": 7700 }, { "epoch": 6.810954063604241, "grad_norm": 0.678084135055542, "learning_rate": 9.386511839372114e-08, "loss": 0.2119, "loss_nan_ranks": 0, "loss_rank_avg": 0.21555975079536438, "step": 7705 }, { "epoch": 6.815371024734983, "grad_norm": 0.6462898850440979, "learning_rate": 8.965016033418705e-08, "loss": 0.2257, "loss_nan_ranks": 0, "loss_rank_avg": 0.20813468098640442, "step": 7710 }, { "epoch": 6.819787985865725, "grad_norm": 0.6597393155097961, "learning_rate": 8.553181127683108e-08, "loss": 0.2653, "loss_nan_ranks": 0, "loss_rank_avg": 0.23082324862480164, "step": 7715 }, { "epoch": 6.824204946996466, "grad_norm": 0.6514454483985901, "learning_rate": 8.1510091204704e-08, "loss": 0.2372, "loss_nan_ranks": 0, "loss_rank_avg": 0.23393850028514862, "step": 7720 }, { "epoch": 6.828621908127208, "grad_norm": 0.7454639077186584, "learning_rate": 7.758501963199605e-08, "loss": 0.2673, "loss_nan_ranks": 0, "loss_rank_avg": 0.26989006996154785, "step": 7725 }, { "epoch": 6.83303886925795, "grad_norm": 0.6669009327888489, "learning_rate": 7.375661560394154e-08, "loss": 0.2844, "loss_nan_ranks": 0, "loss_rank_avg": 0.2976769208908081, "step": 7730 }, { "epoch": 6.837455830388692, "grad_norm": 0.7060198783874512, "learning_rate": 7.002489769672105e-08, "loss": 0.2247, "loss_nan_ranks": 0, "loss_rank_avg": 0.265421986579895, "step": 7735 }, { "epoch": 6.841872791519434, "grad_norm": 0.7038555145263672, "learning_rate": 6.638988401737933e-08, "loss": 0.2383, "loss_nan_ranks": 0, "loss_rank_avg": 0.2049991488456726, "step": 7740 }, { "epoch": 6.8462897526501765, "grad_norm": 0.7277244925498962, "learning_rate": 6.285159220372982e-08, "loss": 0.2929, "loss_nan_ranks": 0, "loss_rank_avg": 0.20731091499328613, "step": 7745 }, { "epoch": 6.8507067137809186, "grad_norm": 0.6668598651885986, "learning_rate": 5.941003942427026e-08, "loss": 0.2092, "loss_nan_ranks": 0, "loss_rank_avg": 0.23843859136104584, "step": 7750 }, { "epoch": 6.855123674911661, "grad_norm": 0.6584317684173584, "learning_rate": 5.6065242378104957e-08, "loss": 0.2357, "loss_nan_ranks": 0, "loss_rank_avg": 0.24361775815486908, "step": 7755 }, { "epoch": 6.859540636042403, "grad_norm": 0.6468645334243774, "learning_rate": 5.281721729486044e-08, "loss": 0.2196, "loss_nan_ranks": 0, "loss_rank_avg": 0.2681673467159271, "step": 7760 }, { "epoch": 6.863957597173145, "grad_norm": 0.5975419282913208, "learning_rate": 4.966597993460109e-08, "loss": 0.2458, "loss_nan_ranks": 0, "loss_rank_avg": 0.2951905131340027, "step": 7765 }, { "epoch": 6.868374558303887, "grad_norm": 0.7025241255760193, "learning_rate": 4.6611545587762486e-08, "loss": 0.1947, "loss_nan_ranks": 0, "loss_rank_avg": 0.22322210669517517, "step": 7770 }, { "epoch": 6.872791519434629, "grad_norm": 0.7678356170654297, "learning_rate": 4.365392907507149e-08, "loss": 0.1886, "loss_nan_ranks": 0, "loss_rank_avg": 0.15035316348075867, "step": 7775 }, { "epoch": 6.877208480565371, "grad_norm": 0.6374990344047546, "learning_rate": 4.079314474747742e-08, "loss": 0.2243, "loss_nan_ranks": 0, "loss_rank_avg": 0.24768120050430298, "step": 7780 }, { "epoch": 6.881625441696113, "grad_norm": 0.6044369339942932, "learning_rate": 3.802920648607433e-08, "loss": 0.2157, "loss_nan_ranks": 0, "loss_rank_avg": 0.22181479632854462, "step": 7785 }, { "epoch": 6.886042402826855, "grad_norm": 0.6744352579116821, "learning_rate": 3.536212770204772e-08, "loss": 0.2375, "loss_nan_ranks": 0, "loss_rank_avg": 0.29637134075164795, "step": 7790 }, { "epoch": 6.890459363957597, "grad_norm": 0.6962430477142334, "learning_rate": 3.279192133659459e-08, "loss": 0.2164, "loss_nan_ranks": 0, "loss_rank_avg": 0.2750842869281769, "step": 7795 }, { "epoch": 6.894876325088339, "grad_norm": 0.6851283311843872, "learning_rate": 3.0318599860872377e-08, "loss": 0.2213, "loss_nan_ranks": 0, "loss_rank_avg": 0.29073143005371094, "step": 7800 }, { "epoch": 6.8992932862190814, "grad_norm": 0.7150095701217651, "learning_rate": 2.7942175275932347e-08, "loss": 0.2358, "loss_nan_ranks": 0, "loss_rank_avg": 0.22673749923706055, "step": 7805 }, { "epoch": 6.9037102473498235, "grad_norm": 0.7254000306129456, "learning_rate": 2.5662659112659637e-08, "loss": 0.2383, "loss_nan_ranks": 0, "loss_rank_avg": 0.2616110146045685, "step": 7810 }, { "epoch": 6.908127208480566, "grad_norm": 0.613750696182251, "learning_rate": 2.3480062431724404e-08, "loss": 0.2359, "loss_nan_ranks": 0, "loss_rank_avg": 0.17015941441059113, "step": 7815 }, { "epoch": 6.912544169611308, "grad_norm": 0.6614463925361633, "learning_rate": 2.1394395823524093e-08, "loss": 0.198, "loss_nan_ranks": 0, "loss_rank_avg": 0.182722270488739, "step": 7820 }, { "epoch": 6.91696113074205, "grad_norm": 0.7142705917358398, "learning_rate": 1.9405669408127935e-08, "loss": 0.2227, "loss_nan_ranks": 0, "loss_rank_avg": 0.23320986330509186, "step": 7825 }, { "epoch": 6.921378091872792, "grad_norm": 0.701187252998352, "learning_rate": 1.7513892835236967e-08, "loss": 0.2251, "loss_nan_ranks": 0, "loss_rank_avg": 0.23433920741081238, "step": 7830 }, { "epoch": 6.925795053003534, "grad_norm": 0.6869240999221802, "learning_rate": 1.5719075284126307e-08, "loss": 0.2325, "loss_nan_ranks": 0, "loss_rank_avg": 0.24257785081863403, "step": 7835 }, { "epoch": 6.930212014134275, "grad_norm": 0.656295657157898, "learning_rate": 1.4021225463614063e-08, "loss": 0.2759, "loss_nan_ranks": 0, "loss_rank_avg": 0.3004852533340454, "step": 7840 }, { "epoch": 6.934628975265017, "grad_norm": 0.6697101593017578, "learning_rate": 1.2420351612003611e-08, "loss": 0.2478, "loss_nan_ranks": 0, "loss_rank_avg": 0.24285230040550232, "step": 7845 }, { "epoch": 6.939045936395759, "grad_norm": 0.6110602617263794, "learning_rate": 1.0916461497059161e-08, "loss": 0.1851, "loss_nan_ranks": 0, "loss_rank_avg": 0.1842736005783081, "step": 7850 }, { "epoch": 6.943462897526501, "grad_norm": 0.7281805276870728, "learning_rate": 9.509562415952468e-09, "loss": 0.2303, "loss_nan_ranks": 0, "loss_rank_avg": 0.19850802421569824, "step": 7855 }, { "epoch": 6.9478798586572434, "grad_norm": 0.6979923248291016, "learning_rate": 8.199661195240626e-09, "loss": 0.2786, "loss_nan_ranks": 0, "loss_rank_avg": 0.3593960404396057, "step": 7860 }, { "epoch": 6.9522968197879855, "grad_norm": 0.7403237223625183, "learning_rate": 6.9867641908305524e-09, "loss": 0.2273, "loss_nan_ranks": 0, "loss_rank_avg": 0.32437562942504883, "step": 7865 }, { "epoch": 6.956713780918728, "grad_norm": 0.6508459448814392, "learning_rate": 5.870877287934562e-09, "loss": 0.2254, "loss_nan_ranks": 0, "loss_rank_avg": 0.27816981077194214, "step": 7870 }, { "epoch": 6.96113074204947, "grad_norm": 0.8912989497184753, "learning_rate": 4.852005901063717e-09, "loss": 0.207, "loss_nan_ranks": 0, "loss_rank_avg": 0.17091882228851318, "step": 7875 }, { "epoch": 6.965547703180212, "grad_norm": 0.6817123889923096, "learning_rate": 3.930154973985634e-09, "loss": 0.2314, "loss_nan_ranks": 0, "loss_rank_avg": 0.20704488456249237, "step": 7880 }, { "epoch": 6.969964664310954, "grad_norm": 0.6807764172554016, "learning_rate": 3.1053289797022825e-09, "loss": 0.2361, "loss_nan_ranks": 0, "loss_rank_avg": 0.19280683994293213, "step": 7885 }, { "epoch": 6.974381625441696, "grad_norm": 0.7215080261230469, "learning_rate": 2.37753192043888e-09, "loss": 0.2094, "loss_nan_ranks": 0, "loss_rank_avg": 0.20111405849456787, "step": 7890 }, { "epoch": 6.978798586572438, "grad_norm": 0.7553413510322571, "learning_rate": 1.746767327610588e-09, "loss": 0.2498, "loss_nan_ranks": 0, "loss_rank_avg": 0.21126879751682281, "step": 7895 }, { "epoch": 6.98321554770318, "grad_norm": 0.6956353187561035, "learning_rate": 1.2130382618114057e-09, "loss": 0.2267, "loss_nan_ranks": 0, "loss_rank_avg": 0.26631930470466614, "step": 7900 }, { "epoch": 6.987632508833922, "grad_norm": 0.9583781361579895, "learning_rate": 7.763473128052923e-10, "loss": 0.2211, "loss_nan_ranks": 0, "loss_rank_avg": 0.20020142197608948, "step": 7905 }, { "epoch": 6.992049469964664, "grad_norm": 0.7983410358428955, "learning_rate": 4.366965994995198e-10, "loss": 0.2046, "loss_nan_ranks": 0, "loss_rank_avg": 0.17487777769565582, "step": 7910 }, { "epoch": 6.996466431095406, "grad_norm": 0.7559086084365845, "learning_rate": 1.9408776995355483e-10, "loss": 0.1849, "loss_nan_ranks": 0, "loss_rank_avg": 0.1718912273645401, "step": 7915 }, { "epoch": 6.999116607773852, "loss_nan_ranks": 0, "loss_rank_avg": 0.2685817778110504, "step": 7918, "total_flos": 7677204649476096.0, "train_loss": 0.2887290850240313, "train_runtime": 73070.9484, "train_samples_per_second": 0.108, "train_steps_per_second": 0.108 } ], "logging_steps": 5, "max_steps": 7924, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 1500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7677204649476096.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }