Files
GLM-4_7-r2egym_sandboxes-ma…/trainer_state.json
ModelHub XC 8c7ae21563 初始化项目,由ModelHub XC社区提供模型
Model: laion/GLM-4_7-r2egym_sandboxes-maxeps-131k-lc
Source: Original Platform
2026-05-16 16:35:55 +08:00

14293 lines
378 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.999116607773852,
"eval_steps": 500,
"global_step": 7918,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00441696113074205,
"grad_norm": 24.109052658081055,
"learning_rate": 2.0176544766708703e-07,
"loss": 0.8392,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.7250818014144897,
"step": 5
},
{
"epoch": 0.0088339222614841,
"grad_norm": 23.32025718688965,
"learning_rate": 4.5397225725094586e-07,
"loss": 0.8118,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.8858845233917236,
"step": 10
},
{
"epoch": 0.013250883392226149,
"grad_norm": 19.11461639404297,
"learning_rate": 7.061790668348046e-07,
"loss": 0.8166,
"loss_nan_ranks": 0,
"loss_rank_avg": 1.0077567100524902,
"step": 15
},
{
"epoch": 0.0176678445229682,
"grad_norm": 18.66092300415039,
"learning_rate": 9.583858764186634e-07,
"loss": 0.7684,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.7495265007019043,
"step": 20
},
{
"epoch": 0.022084805653710248,
"grad_norm": 11.922789573669434,
"learning_rate": 1.210592686002522e-06,
"loss": 0.7393,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.6014531850814819,
"step": 25
},
{
"epoch": 0.026501766784452298,
"grad_norm": 9.920490264892578,
"learning_rate": 1.4627994955863808e-06,
"loss": 0.7164,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.6873708367347717,
"step": 30
},
{
"epoch": 0.030918727915194347,
"grad_norm": 6.4108171463012695,
"learning_rate": 1.7150063051702399e-06,
"loss": 0.6782,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.628339946269989,
"step": 35
},
{
"epoch": 0.0353356890459364,
"grad_norm": 4.962175369262695,
"learning_rate": 1.9672131147540985e-06,
"loss": 0.6596,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.6136308908462524,
"step": 40
},
{
"epoch": 0.03975265017667844,
"grad_norm": 2.925511121749878,
"learning_rate": 2.2194199243379574e-06,
"loss": 0.6625,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.7212941646575928,
"step": 45
},
{
"epoch": 0.044169611307420496,
"grad_norm": 1.815821886062622,
"learning_rate": 2.4716267339218163e-06,
"loss": 0.6164,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.6185051202774048,
"step": 50
},
{
"epoch": 0.04858657243816254,
"grad_norm": 1.6444969177246094,
"learning_rate": 2.723833543505675e-06,
"loss": 0.5692,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5030768513679504,
"step": 55
},
{
"epoch": 0.053003533568904596,
"grad_norm": 1.4677263498306274,
"learning_rate": 2.9760403530895336e-06,
"loss": 0.5737,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.6769396066665649,
"step": 60
},
{
"epoch": 0.05742049469964664,
"grad_norm": 1.4194655418395996,
"learning_rate": 3.2282471626733925e-06,
"loss": 0.5621,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.48633405566215515,
"step": 65
},
{
"epoch": 0.061837455830388695,
"grad_norm": 1.1492063999176025,
"learning_rate": 3.480453972257251e-06,
"loss": 0.5322,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5070392489433289,
"step": 70
},
{
"epoch": 0.06625441696113074,
"grad_norm": 1.0129790306091309,
"learning_rate": 3.73266078184111e-06,
"loss": 0.5497,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4552308917045593,
"step": 75
},
{
"epoch": 0.0706713780918728,
"grad_norm": 0.9223603010177612,
"learning_rate": 3.984867591424969e-06,
"loss": 0.5376,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4579840302467346,
"step": 80
},
{
"epoch": 0.07508833922261485,
"grad_norm": 1.050498366355896,
"learning_rate": 4.237074401008828e-06,
"loss": 0.5411,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5621368885040283,
"step": 85
},
{
"epoch": 0.07950530035335689,
"grad_norm": 0.8152749538421631,
"learning_rate": 4.4892812105926865e-06,
"loss": 0.5246,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.49742865562438965,
"step": 90
},
{
"epoch": 0.08392226148409894,
"grad_norm": 0.9410054087638855,
"learning_rate": 4.741488020176545e-06,
"loss": 0.5438,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5632932186126709,
"step": 95
},
{
"epoch": 0.08833922261484099,
"grad_norm": 0.7807851433753967,
"learning_rate": 4.993694829760403e-06,
"loss": 0.5607,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5178039073944092,
"step": 100
},
{
"epoch": 0.09275618374558305,
"grad_norm": 0.9543706178665161,
"learning_rate": 5.245901639344263e-06,
"loss": 0.5857,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4983155429363251,
"step": 105
},
{
"epoch": 0.09717314487632508,
"grad_norm": 1.1576862335205078,
"learning_rate": 5.498108448928121e-06,
"loss": 0.5065,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5359828472137451,
"step": 110
},
{
"epoch": 0.10159010600706714,
"grad_norm": 0.8757576942443848,
"learning_rate": 5.7503152585119805e-06,
"loss": 0.5241,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4543844163417816,
"step": 115
},
{
"epoch": 0.10600706713780919,
"grad_norm": 0.8769079446792603,
"learning_rate": 6.00252206809584e-06,
"loss": 0.4923,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4466102123260498,
"step": 120
},
{
"epoch": 0.11042402826855123,
"grad_norm": 0.7672253251075745,
"learning_rate": 6.254728877679697e-06,
"loss": 0.4789,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5950910449028015,
"step": 125
},
{
"epoch": 0.11484098939929328,
"grad_norm": 0.7976667284965515,
"learning_rate": 6.506935687263557e-06,
"loss": 0.4813,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.46099185943603516,
"step": 130
},
{
"epoch": 0.11925795053003534,
"grad_norm": 0.8457517623901367,
"learning_rate": 6.759142496847415e-06,
"loss": 0.5338,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4921356439590454,
"step": 135
},
{
"epoch": 0.12367491166077739,
"grad_norm": 0.8651091456413269,
"learning_rate": 7.0113493064312745e-06,
"loss": 0.4892,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4262227714061737,
"step": 140
},
{
"epoch": 0.12809187279151943,
"grad_norm": 0.7235690355300903,
"learning_rate": 7.263556116015134e-06,
"loss": 0.459,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.46743807196617126,
"step": 145
},
{
"epoch": 0.13250883392226148,
"grad_norm": 0.7583150267601013,
"learning_rate": 7.515762925598991e-06,
"loss": 0.4925,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4664328396320343,
"step": 150
},
{
"epoch": 0.13692579505300354,
"grad_norm": 0.9911392331123352,
"learning_rate": 7.76796973518285e-06,
"loss": 0.4904,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5170290470123291,
"step": 155
},
{
"epoch": 0.1413427561837456,
"grad_norm": 0.7634608745574951,
"learning_rate": 8.020176544766708e-06,
"loss": 0.484,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.45101815462112427,
"step": 160
},
{
"epoch": 0.14575971731448764,
"grad_norm": 0.8441954255104065,
"learning_rate": 8.272383354350568e-06,
"loss": 0.4711,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.527004599571228,
"step": 165
},
{
"epoch": 0.1501766784452297,
"grad_norm": 0.8853057026863098,
"learning_rate": 8.524590163934427e-06,
"loss": 0.4725,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.40662050247192383,
"step": 170
},
{
"epoch": 0.15459363957597172,
"grad_norm": 0.7503329515457153,
"learning_rate": 8.776796973518286e-06,
"loss": 0.4435,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.45254433155059814,
"step": 175
},
{
"epoch": 0.15901060070671377,
"grad_norm": 0.811824381351471,
"learning_rate": 9.029003783102146e-06,
"loss": 0.4582,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.44835376739501953,
"step": 180
},
{
"epoch": 0.16342756183745583,
"grad_norm": 0.8182066679000854,
"learning_rate": 9.281210592686003e-06,
"loss": 0.4924,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5891055464744568,
"step": 185
},
{
"epoch": 0.16784452296819788,
"grad_norm": 0.7815266251564026,
"learning_rate": 9.533417402269862e-06,
"loss": 0.4778,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.44031378626823425,
"step": 190
},
{
"epoch": 0.17226148409893993,
"grad_norm": 0.8124738931655884,
"learning_rate": 9.78562421185372e-06,
"loss": 0.4345,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.45023810863494873,
"step": 195
},
{
"epoch": 0.17667844522968199,
"grad_norm": 0.8434866666793823,
"learning_rate": 1.0037831021437581e-05,
"loss": 0.4199,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.44062817096710205,
"step": 200
},
{
"epoch": 0.18109540636042404,
"grad_norm": 0.8283309936523438,
"learning_rate": 1.0290037831021437e-05,
"loss": 0.4342,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4788593649864197,
"step": 205
},
{
"epoch": 0.1855123674911661,
"grad_norm": 0.6613907814025879,
"learning_rate": 1.0542244640605296e-05,
"loss": 0.4274,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4137096405029297,
"step": 210
},
{
"epoch": 0.18992932862190812,
"grad_norm": 0.9123347401618958,
"learning_rate": 1.0794451450189156e-05,
"loss": 0.4314,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4231804609298706,
"step": 215
},
{
"epoch": 0.19434628975265017,
"grad_norm": 0.8312901854515076,
"learning_rate": 1.1046658259773015e-05,
"loss": 0.4208,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4278219938278198,
"step": 220
},
{
"epoch": 0.19876325088339222,
"grad_norm": 0.6793373823165894,
"learning_rate": 1.1298865069356874e-05,
"loss": 0.4253,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.39295411109924316,
"step": 225
},
{
"epoch": 0.20318021201413428,
"grad_norm": 0.8161110877990723,
"learning_rate": 1.1551071878940732e-05,
"loss": 0.4605,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4723338186740875,
"step": 230
},
{
"epoch": 0.20759717314487633,
"grad_norm": 0.7220736145973206,
"learning_rate": 1.1803278688524591e-05,
"loss": 0.4244,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.44514936208724976,
"step": 235
},
{
"epoch": 0.21201413427561838,
"grad_norm": 0.780061662197113,
"learning_rate": 1.205548549810845e-05,
"loss": 0.472,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4039708971977234,
"step": 240
},
{
"epoch": 0.21643109540636044,
"grad_norm": 0.6663079857826233,
"learning_rate": 1.230769230769231e-05,
"loss": 0.4287,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35989290475845337,
"step": 245
},
{
"epoch": 0.22084805653710246,
"grad_norm": 0.8617585897445679,
"learning_rate": 1.2559899117276166e-05,
"loss": 0.4589,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5611141324043274,
"step": 250
},
{
"epoch": 0.2252650176678445,
"grad_norm": 0.8636696934700012,
"learning_rate": 1.2812105926860025e-05,
"loss": 0.393,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4264186918735504,
"step": 255
},
{
"epoch": 0.22968197879858657,
"grad_norm": 0.884749174118042,
"learning_rate": 1.3064312736443884e-05,
"loss": 0.441,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5420808792114258,
"step": 260
},
{
"epoch": 0.23409893992932862,
"grad_norm": 0.7348568439483643,
"learning_rate": 1.3316519546027744e-05,
"loss": 0.357,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34996867179870605,
"step": 265
},
{
"epoch": 0.23851590106007067,
"grad_norm": 0.6985581517219543,
"learning_rate": 1.3568726355611603e-05,
"loss": 0.4148,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.44080445170402527,
"step": 270
},
{
"epoch": 0.24293286219081273,
"grad_norm": 0.8112905025482178,
"learning_rate": 1.382093316519546e-05,
"loss": 0.387,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.46948182582855225,
"step": 275
},
{
"epoch": 0.24734982332155478,
"grad_norm": 0.7835463881492615,
"learning_rate": 1.407313997477932e-05,
"loss": 0.4286,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.39757293462753296,
"step": 280
},
{
"epoch": 0.25176678445229683,
"grad_norm": 0.8474117517471313,
"learning_rate": 1.4325346784363179e-05,
"loss": 0.4184,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.386111855506897,
"step": 285
},
{
"epoch": 0.25618374558303886,
"grad_norm": 0.7113578915596008,
"learning_rate": 1.4577553593947038e-05,
"loss": 0.3587,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32697296142578125,
"step": 290
},
{
"epoch": 0.26060070671378094,
"grad_norm": 0.8683375716209412,
"learning_rate": 1.4829760403530898e-05,
"loss": 0.3728,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3836482763290405,
"step": 295
},
{
"epoch": 0.26501766784452296,
"grad_norm": 0.732476532459259,
"learning_rate": 1.5081967213114754e-05,
"loss": 0.4082,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.39730846881866455,
"step": 300
},
{
"epoch": 0.26943462897526504,
"grad_norm": 0.8139944076538086,
"learning_rate": 1.5334174022698615e-05,
"loss": 0.4319,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.44922178983688354,
"step": 305
},
{
"epoch": 0.27385159010600707,
"grad_norm": 0.7223174571990967,
"learning_rate": 1.5586380832282474e-05,
"loss": 0.3937,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.39554718136787415,
"step": 310
},
{
"epoch": 0.2782685512367491,
"grad_norm": 0.7935890555381775,
"learning_rate": 1.5838587641866333e-05,
"loss": 0.3971,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.38846322894096375,
"step": 315
},
{
"epoch": 0.2826855123674912,
"grad_norm": 0.8385109305381775,
"learning_rate": 1.6090794451450193e-05,
"loss": 0.3842,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3368198275566101,
"step": 320
},
{
"epoch": 0.2871024734982332,
"grad_norm": 0.7849225401878357,
"learning_rate": 1.634300126103405e-05,
"loss": 0.4017,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4689519703388214,
"step": 325
},
{
"epoch": 0.2915194346289753,
"grad_norm": 0.9184194207191467,
"learning_rate": 1.6595208070617908e-05,
"loss": 0.4222,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.451241135597229,
"step": 330
},
{
"epoch": 0.2959363957597173,
"grad_norm": 0.7168762683868408,
"learning_rate": 1.6847414880201767e-05,
"loss": 0.3989,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3456907868385315,
"step": 335
},
{
"epoch": 0.3003533568904594,
"grad_norm": 0.7282963395118713,
"learning_rate": 1.7099621689785626e-05,
"loss": 0.4091,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3524041771888733,
"step": 340
},
{
"epoch": 0.3047703180212014,
"grad_norm": 0.6994873285293579,
"learning_rate": 1.7351828499369486e-05,
"loss": 0.4219,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3284667134284973,
"step": 345
},
{
"epoch": 0.30918727915194344,
"grad_norm": 0.6103523969650269,
"learning_rate": 1.760403530895334e-05,
"loss": 0.4224,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3484126925468445,
"step": 350
},
{
"epoch": 0.3136042402826855,
"grad_norm": 0.7844368815422058,
"learning_rate": 1.78562421185372e-05,
"loss": 0.4439,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.47252100706100464,
"step": 355
},
{
"epoch": 0.31802120141342755,
"grad_norm": 0.6682479381561279,
"learning_rate": 1.810844892812106e-05,
"loss": 0.3943,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4642447829246521,
"step": 360
},
{
"epoch": 0.3224381625441696,
"grad_norm": 0.7882423996925354,
"learning_rate": 1.836065573770492e-05,
"loss": 0.3837,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4364447593688965,
"step": 365
},
{
"epoch": 0.32685512367491165,
"grad_norm": 0.6892913579940796,
"learning_rate": 1.861286254728878e-05,
"loss": 0.403,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3730366826057434,
"step": 370
},
{
"epoch": 0.33127208480565373,
"grad_norm": 0.890687882900238,
"learning_rate": 1.8865069356872635e-05,
"loss": 0.4094,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4724777638912201,
"step": 375
},
{
"epoch": 0.33568904593639576,
"grad_norm": 0.7592324018478394,
"learning_rate": 1.9117276166456494e-05,
"loss": 0.4166,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.38363948464393616,
"step": 380
},
{
"epoch": 0.3401060070671378,
"grad_norm": 0.8239976763725281,
"learning_rate": 1.9369482976040353e-05,
"loss": 0.3977,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.37950828671455383,
"step": 385
},
{
"epoch": 0.34452296819787986,
"grad_norm": 1.0034205913543701,
"learning_rate": 1.9621689785624213e-05,
"loss": 0.3811,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31529074907302856,
"step": 390
},
{
"epoch": 0.3489399293286219,
"grad_norm": 0.7281728982925415,
"learning_rate": 1.9873896595208072e-05,
"loss": 0.4184,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35957640409469604,
"step": 395
},
{
"epoch": 0.35335689045936397,
"grad_norm": 0.8345057368278503,
"learning_rate": 2.012610340479193e-05,
"loss": 0.3603,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.38588133454322815,
"step": 400
},
{
"epoch": 0.357773851590106,
"grad_norm": 0.881252110004425,
"learning_rate": 2.037831021437579e-05,
"loss": 0.4153,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.385145366191864,
"step": 405
},
{
"epoch": 0.3621908127208481,
"grad_norm": 0.7450293302536011,
"learning_rate": 2.063051702395965e-05,
"loss": 0.43,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4830145835876465,
"step": 410
},
{
"epoch": 0.3666077738515901,
"grad_norm": 0.7826328873634338,
"learning_rate": 2.0882723833543506e-05,
"loss": 0.3932,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34232282638549805,
"step": 415
},
{
"epoch": 0.3710247349823322,
"grad_norm": 0.7054056525230408,
"learning_rate": 2.113493064312737e-05,
"loss": 0.3558,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2948620617389679,
"step": 420
},
{
"epoch": 0.3754416961130742,
"grad_norm": 0.770078718662262,
"learning_rate": 2.1387137452711224e-05,
"loss": 0.382,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4264480471611023,
"step": 425
},
{
"epoch": 0.37985865724381623,
"grad_norm": 0.7419948577880859,
"learning_rate": 2.1639344262295087e-05,
"loss": 0.3874,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4408531188964844,
"step": 430
},
{
"epoch": 0.3842756183745583,
"grad_norm": 0.7609454989433289,
"learning_rate": 2.1891551071878943e-05,
"loss": 0.3913,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35307884216308594,
"step": 435
},
{
"epoch": 0.38869257950530034,
"grad_norm": 1.1566354036331177,
"learning_rate": 2.21437578814628e-05,
"loss": 0.3921,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2975735664367676,
"step": 440
},
{
"epoch": 0.3931095406360424,
"grad_norm": 0.8143091201782227,
"learning_rate": 2.239596469104666e-05,
"loss": 0.377,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3160606920719147,
"step": 445
},
{
"epoch": 0.39752650176678445,
"grad_norm": 0.6456040143966675,
"learning_rate": 2.2648171500630518e-05,
"loss": 0.3703,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.37065446376800537,
"step": 450
},
{
"epoch": 0.4019434628975265,
"grad_norm": 0.6718341708183289,
"learning_rate": 2.290037831021438e-05,
"loss": 0.3584,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32244637608528137,
"step": 455
},
{
"epoch": 0.40636042402826855,
"grad_norm": 0.862759530544281,
"learning_rate": 2.3152585119798236e-05,
"loss": 0.4145,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4153875708580017,
"step": 460
},
{
"epoch": 0.4107773851590106,
"grad_norm": 0.7327967882156372,
"learning_rate": 2.3404791929382092e-05,
"loss": 0.3628,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3565199077129364,
"step": 465
},
{
"epoch": 0.41519434628975266,
"grad_norm": 0.863936722278595,
"learning_rate": 2.3656998738965955e-05,
"loss": 0.405,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.40698155760765076,
"step": 470
},
{
"epoch": 0.4196113074204947,
"grad_norm": 0.8501296639442444,
"learning_rate": 2.390920554854981e-05,
"loss": 0.3567,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.36806195974349976,
"step": 475
},
{
"epoch": 0.42402826855123676,
"grad_norm": 1.3909848928451538,
"learning_rate": 2.4161412358133673e-05,
"loss": 0.4706,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4072020649909973,
"step": 480
},
{
"epoch": 0.4284452296819788,
"grad_norm": 0.7283811569213867,
"learning_rate": 2.441361916771753e-05,
"loss": 0.3921,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.37325534224510193,
"step": 485
},
{
"epoch": 0.43286219081272087,
"grad_norm": 0.7579320669174194,
"learning_rate": 2.466582597730139e-05,
"loss": 0.3424,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33884212374687195,
"step": 490
},
{
"epoch": 0.4372791519434629,
"grad_norm": 0.8202218413352966,
"learning_rate": 2.4918032786885248e-05,
"loss": 0.3737,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34762677550315857,
"step": 495
},
{
"epoch": 0.4416961130742049,
"grad_norm": 0.7608699798583984,
"learning_rate": 2.5170239596469107e-05,
"loss": 0.3843,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.306643545627594,
"step": 500
},
{
"epoch": 0.446113074204947,
"grad_norm": 0.7544461488723755,
"learning_rate": 2.5422446406052967e-05,
"loss": 0.3745,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.39631327986717224,
"step": 505
},
{
"epoch": 0.450530035335689,
"grad_norm": 0.6755616664886475,
"learning_rate": 2.5674653215636826e-05,
"loss": 0.3803,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3355293869972229,
"step": 510
},
{
"epoch": 0.4549469964664311,
"grad_norm": 0.6714569330215454,
"learning_rate": 2.5926860025220682e-05,
"loss": 0.349,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.344030499458313,
"step": 515
},
{
"epoch": 0.45936395759717313,
"grad_norm": 0.7619994878768921,
"learning_rate": 2.6179066834804544e-05,
"loss": 0.3474,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.331425279378891,
"step": 520
},
{
"epoch": 0.4637809187279152,
"grad_norm": 0.745580792427063,
"learning_rate": 2.64312736443884e-05,
"loss": 0.3563,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.39198458194732666,
"step": 525
},
{
"epoch": 0.46819787985865724,
"grad_norm": 0.823861837387085,
"learning_rate": 2.668348045397226e-05,
"loss": 0.3697,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3569245934486389,
"step": 530
},
{
"epoch": 0.4726148409893993,
"grad_norm": 0.7193405628204346,
"learning_rate": 2.693568726355612e-05,
"loss": 0.3745,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4549163579940796,
"step": 535
},
{
"epoch": 0.47703180212014135,
"grad_norm": 0.7041448354721069,
"learning_rate": 2.7187894073139975e-05,
"loss": 0.3817,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.43728315830230713,
"step": 540
},
{
"epoch": 0.48144876325088337,
"grad_norm": 0.8459624648094177,
"learning_rate": 2.7440100882723838e-05,
"loss": 0.3744,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3709990084171295,
"step": 545
},
{
"epoch": 0.48586572438162545,
"grad_norm": 0.8668114542961121,
"learning_rate": 2.7692307692307694e-05,
"loss": 0.3612,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32689058780670166,
"step": 550
},
{
"epoch": 0.4902826855123675,
"grad_norm": 0.668462872505188,
"learning_rate": 2.7944514501891556e-05,
"loss": 0.3848,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30886298418045044,
"step": 555
},
{
"epoch": 0.49469964664310956,
"grad_norm": 0.7367919683456421,
"learning_rate": 2.8196721311475412e-05,
"loss": 0.3591,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3122621476650238,
"step": 560
},
{
"epoch": 0.4991166077738516,
"grad_norm": 0.7553625106811523,
"learning_rate": 2.8448928121059268e-05,
"loss": 0.362,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3086245059967041,
"step": 565
},
{
"epoch": 0.5035335689045937,
"grad_norm": 0.6816399097442627,
"learning_rate": 2.870113493064313e-05,
"loss": 0.3565,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3278921842575073,
"step": 570
},
{
"epoch": 0.5079505300353356,
"grad_norm": 0.7400028109550476,
"learning_rate": 2.8953341740226987e-05,
"loss": 0.3903,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33623063564300537,
"step": 575
},
{
"epoch": 0.5123674911660777,
"grad_norm": 0.6778237819671631,
"learning_rate": 2.920554854981085e-05,
"loss": 0.3745,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3270686864852905,
"step": 580
},
{
"epoch": 0.5167844522968198,
"grad_norm": 0.7293447256088257,
"learning_rate": 2.9457755359394705e-05,
"loss": 0.427,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.47877180576324463,
"step": 585
},
{
"epoch": 0.5212014134275619,
"grad_norm": 0.7676773071289062,
"learning_rate": 2.9709962168978565e-05,
"loss": 0.4404,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4877261817455292,
"step": 590
},
{
"epoch": 0.5256183745583038,
"grad_norm": 0.6538991332054138,
"learning_rate": 2.9962168978562424e-05,
"loss": 0.3604,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3789966106414795,
"step": 595
},
{
"epoch": 0.5300353356890459,
"grad_norm": 0.7705276012420654,
"learning_rate": 3.0214375788146283e-05,
"loss": 0.3947,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33382245898246765,
"step": 600
},
{
"epoch": 0.534452296819788,
"grad_norm": 0.7710214853286743,
"learning_rate": 3.0466582597730143e-05,
"loss": 0.3543,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.38197773694992065,
"step": 605
},
{
"epoch": 0.5388692579505301,
"grad_norm": 0.67430579662323,
"learning_rate": 3.0718789407314e-05,
"loss": 0.4353,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4609474241733551,
"step": 610
},
{
"epoch": 0.5432862190812721,
"grad_norm": 0.7167083621025085,
"learning_rate": 3.097099621689786e-05,
"loss": 0.3326,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2994512915611267,
"step": 615
},
{
"epoch": 0.5477031802120141,
"grad_norm": 0.9038445949554443,
"learning_rate": 3.122320302648172e-05,
"loss": 0.3872,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.36574289202690125,
"step": 620
},
{
"epoch": 0.5521201413427562,
"grad_norm": 0.7607389092445374,
"learning_rate": 3.1475409836065576e-05,
"loss": 0.384,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4068770706653595,
"step": 625
},
{
"epoch": 0.5565371024734982,
"grad_norm": 0.8022470474243164,
"learning_rate": 3.1727616645649436e-05,
"loss": 0.3579,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34602364897727966,
"step": 630
},
{
"epoch": 0.5609540636042403,
"grad_norm": 0.7999392747879028,
"learning_rate": 3.1979823455233295e-05,
"loss": 0.3784,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4875880479812622,
"step": 635
},
{
"epoch": 0.5653710247349824,
"grad_norm": 0.7946346402168274,
"learning_rate": 3.2232030264817154e-05,
"loss": 0.3777,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4051263630390167,
"step": 640
},
{
"epoch": 0.5697879858657244,
"grad_norm": 0.6947000622749329,
"learning_rate": 3.2484237074401014e-05,
"loss": 0.3565,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4333820044994354,
"step": 645
},
{
"epoch": 0.5742049469964664,
"grad_norm": 0.7858723402023315,
"learning_rate": 3.273644388398487e-05,
"loss": 0.3576,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33583977818489075,
"step": 650
},
{
"epoch": 0.5786219081272085,
"grad_norm": 0.817613959312439,
"learning_rate": 3.298865069356873e-05,
"loss": 0.3129,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27572858333587646,
"step": 655
},
{
"epoch": 0.5830388692579506,
"grad_norm": 0.7588901519775391,
"learning_rate": 3.324085750315259e-05,
"loss": 0.367,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29186704754829407,
"step": 660
},
{
"epoch": 0.5874558303886925,
"grad_norm": 0.7261371612548828,
"learning_rate": 3.3493064312736444e-05,
"loss": 0.3133,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3395135998725891,
"step": 665
},
{
"epoch": 0.5918727915194346,
"grad_norm": 0.6673470735549927,
"learning_rate": 3.37452711223203e-05,
"loss": 0.404,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.41102349758148193,
"step": 670
},
{
"epoch": 0.5962897526501767,
"grad_norm": 0.6788419485092163,
"learning_rate": 3.399747793190416e-05,
"loss": 0.3931,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3825211226940155,
"step": 675
},
{
"epoch": 0.6007067137809188,
"grad_norm": 0.861670970916748,
"learning_rate": 3.424968474148802e-05,
"loss": 0.3875,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3033771514892578,
"step": 680
},
{
"epoch": 0.6051236749116607,
"grad_norm": 0.6976490616798401,
"learning_rate": 3.450189155107188e-05,
"loss": 0.3206,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3370228409767151,
"step": 685
},
{
"epoch": 0.6095406360424028,
"grad_norm": 0.6630620956420898,
"learning_rate": 3.475409836065574e-05,
"loss": 0.3294,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34097903966903687,
"step": 690
},
{
"epoch": 0.6139575971731449,
"grad_norm": 0.6962843537330627,
"learning_rate": 3.50063051702396e-05,
"loss": 0.3731,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.40441685914993286,
"step": 695
},
{
"epoch": 0.6183745583038869,
"grad_norm": 0.6727367639541626,
"learning_rate": 3.525851197982346e-05,
"loss": 0.3226,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32211965322494507,
"step": 700
},
{
"epoch": 0.622791519434629,
"grad_norm": 0.7818762063980103,
"learning_rate": 3.551071878940732e-05,
"loss": 0.3474,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4583301544189453,
"step": 705
},
{
"epoch": 0.627208480565371,
"grad_norm": 0.7723045349121094,
"learning_rate": 3.576292559899118e-05,
"loss": 0.3719,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28167077898979187,
"step": 710
},
{
"epoch": 0.6316254416961131,
"grad_norm": 0.6072138547897339,
"learning_rate": 3.601513240857503e-05,
"loss": 0.3479,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2791953682899475,
"step": 715
},
{
"epoch": 0.6360424028268551,
"grad_norm": 0.8396653532981873,
"learning_rate": 3.6267339218158896e-05,
"loss": 0.4271,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5739034414291382,
"step": 720
},
{
"epoch": 0.6404593639575972,
"grad_norm": 0.7916384935379028,
"learning_rate": 3.651954602774275e-05,
"loss": 0.4105,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.42675089836120605,
"step": 725
},
{
"epoch": 0.6448763250883393,
"grad_norm": 0.7917523980140686,
"learning_rate": 3.677175283732661e-05,
"loss": 0.3365,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30214524269104004,
"step": 730
},
{
"epoch": 0.6492932862190812,
"grad_norm": 0.6911900639533997,
"learning_rate": 3.702395964691047e-05,
"loss": 0.3602,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3707190155982971,
"step": 735
},
{
"epoch": 0.6537102473498233,
"grad_norm": 0.7061692476272583,
"learning_rate": 3.727616645649433e-05,
"loss": 0.3479,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28617987036705017,
"step": 740
},
{
"epoch": 0.6581272084805654,
"grad_norm": 0.6829811334609985,
"learning_rate": 3.7528373266078186e-05,
"loss": 0.4159,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4385426640510559,
"step": 745
},
{
"epoch": 0.6625441696113075,
"grad_norm": 0.688046395778656,
"learning_rate": 3.7780580075662045e-05,
"loss": 0.3727,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3491179943084717,
"step": 750
},
{
"epoch": 0.6669611307420494,
"grad_norm": 0.6785129308700562,
"learning_rate": 3.8032786885245905e-05,
"loss": 0.3583,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.369488000869751,
"step": 755
},
{
"epoch": 0.6713780918727915,
"grad_norm": 0.7109005451202393,
"learning_rate": 3.8284993694829764e-05,
"loss": 0.3344,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.354184627532959,
"step": 760
},
{
"epoch": 0.6757950530035336,
"grad_norm": 0.7314112782478333,
"learning_rate": 3.853720050441362e-05,
"loss": 0.3582,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.37824511528015137,
"step": 765
},
{
"epoch": 0.6802120141342756,
"grad_norm": 1.165858268737793,
"learning_rate": 3.878940731399748e-05,
"loss": 0.384,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32820677757263184,
"step": 770
},
{
"epoch": 0.6846289752650176,
"grad_norm": 0.8004192113876343,
"learning_rate": 3.904161412358134e-05,
"loss": 0.3607,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.43373507261276245,
"step": 775
},
{
"epoch": 0.6890459363957597,
"grad_norm": 0.6773238182067871,
"learning_rate": 3.9293820933165195e-05,
"loss": 0.3786,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3990272581577301,
"step": 780
},
{
"epoch": 0.6934628975265018,
"grad_norm": 0.676603376865387,
"learning_rate": 3.954602774274906e-05,
"loss": 0.3336,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34566766023635864,
"step": 785
},
{
"epoch": 0.6978798586572438,
"grad_norm": 0.7312802672386169,
"learning_rate": 3.979823455233291e-05,
"loss": 0.3405,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.37781059741973877,
"step": 790
},
{
"epoch": 0.7022968197879859,
"grad_norm": 0.7477230429649353,
"learning_rate": 3.99999980591192e-05,
"loss": 0.345,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29296875,
"step": 795
},
{
"epoch": 0.7067137809187279,
"grad_norm": 0.6933770179748535,
"learning_rate": 3.99999301283305e-05,
"loss": 0.4408,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4305647313594818,
"step": 800
},
{
"epoch": 0.7111307420494699,
"grad_norm": 0.6644602417945862,
"learning_rate": 3.999976515387813e-05,
"loss": 0.3571,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2757279574871063,
"step": 805
},
{
"epoch": 0.715547703180212,
"grad_norm": 0.6703394651412964,
"learning_rate": 3.9999503136562586e-05,
"loss": 0.3417,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3425188660621643,
"step": 810
},
{
"epoch": 0.7199646643109541,
"grad_norm": 0.6245801448822021,
"learning_rate": 3.999914407765523e-05,
"loss": 0.3524,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29982197284698486,
"step": 815
},
{
"epoch": 0.7243816254416962,
"grad_norm": 0.701495885848999,
"learning_rate": 3.999868797889828e-05,
"loss": 0.3204,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3113703429698944,
"step": 820
},
{
"epoch": 0.7287985865724381,
"grad_norm": 0.8265374302864075,
"learning_rate": 3.999813484250483e-05,
"loss": 0.3488,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3571431338787079,
"step": 825
},
{
"epoch": 0.7332155477031802,
"grad_norm": 0.8132041096687317,
"learning_rate": 3.99974846711588e-05,
"loss": 0.3718,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30748432874679565,
"step": 830
},
{
"epoch": 0.7376325088339223,
"grad_norm": 0.6265267133712769,
"learning_rate": 3.9996737468014954e-05,
"loss": 0.3123,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3108974099159241,
"step": 835
},
{
"epoch": 0.7420494699646644,
"grad_norm": 0.7385701537132263,
"learning_rate": 3.999589323669887e-05,
"loss": 0.359,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.40513014793395996,
"step": 840
},
{
"epoch": 0.7464664310954063,
"grad_norm": 0.6594541668891907,
"learning_rate": 3.9994951981306926e-05,
"loss": 0.3511,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2959279716014862,
"step": 845
},
{
"epoch": 0.7508833922261484,
"grad_norm": 0.7326868176460266,
"learning_rate": 3.9993913706406287e-05,
"loss": 0.349,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31566479802131653,
"step": 850
},
{
"epoch": 0.7553003533568905,
"grad_norm": 0.798692524433136,
"learning_rate": 3.999277841703486e-05,
"loss": 0.347,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31945452094078064,
"step": 855
},
{
"epoch": 0.7597173144876325,
"grad_norm": 0.6340591907501221,
"learning_rate": 3.999154611870131e-05,
"loss": 0.3524,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3852103352546692,
"step": 860
},
{
"epoch": 0.7641342756183745,
"grad_norm": 0.7896412014961243,
"learning_rate": 3.999021681738499e-05,
"loss": 0.3417,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32461148500442505,
"step": 865
},
{
"epoch": 0.7685512367491166,
"grad_norm": 0.6427087187767029,
"learning_rate": 3.998879051953593e-05,
"loss": 0.3073,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28621137142181396,
"step": 870
},
{
"epoch": 0.7729681978798587,
"grad_norm": 0.6806996464729309,
"learning_rate": 3.9987267232074816e-05,
"loss": 0.3812,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3365304470062256,
"step": 875
},
{
"epoch": 0.7773851590106007,
"grad_norm": 0.6693117618560791,
"learning_rate": 3.998564696239295e-05,
"loss": 0.3718,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3166996240615845,
"step": 880
},
{
"epoch": 0.7818021201413428,
"grad_norm": 0.719115674495697,
"learning_rate": 3.99839297183522e-05,
"loss": 0.3356,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33166027069091797,
"step": 885
},
{
"epoch": 0.7862190812720848,
"grad_norm": 0.6326349973678589,
"learning_rate": 3.998211550828497e-05,
"loss": 0.3528,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3603453040122986,
"step": 890
},
{
"epoch": 0.7906360424028268,
"grad_norm": 0.8190131187438965,
"learning_rate": 3.998020434099418e-05,
"loss": 0.3497,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.38982582092285156,
"step": 895
},
{
"epoch": 0.7950530035335689,
"grad_norm": 0.6838703751564026,
"learning_rate": 3.997819622575319e-05,
"loss": 0.3586,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3148457705974579,
"step": 900
},
{
"epoch": 0.799469964664311,
"grad_norm": 0.6027899384498596,
"learning_rate": 3.9976091172305794e-05,
"loss": 0.3576,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3718492388725281,
"step": 905
},
{
"epoch": 0.803886925795053,
"grad_norm": 1.1394686698913574,
"learning_rate": 3.9973889190866105e-05,
"loss": 0.3383,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32566916942596436,
"step": 910
},
{
"epoch": 0.808303886925795,
"grad_norm": 0.6600670218467712,
"learning_rate": 3.99715902921186e-05,
"loss": 0.355,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32827770709991455,
"step": 915
},
{
"epoch": 0.8127208480565371,
"grad_norm": 0.8769943714141846,
"learning_rate": 3.9969194487217987e-05,
"loss": 0.3669,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3776477575302124,
"step": 920
},
{
"epoch": 0.8171378091872792,
"grad_norm": 0.6823641657829285,
"learning_rate": 3.9966701787789194e-05,
"loss": 0.3431,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31834328174591064,
"step": 925
},
{
"epoch": 0.8215547703180212,
"grad_norm": 0.7511164546012878,
"learning_rate": 3.996411220592729e-05,
"loss": 0.3553,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3393649458885193,
"step": 930
},
{
"epoch": 0.8259717314487632,
"grad_norm": 0.6989418268203735,
"learning_rate": 3.996142575419745e-05,
"loss": 0.3087,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.36577892303466797,
"step": 935
},
{
"epoch": 0.8303886925795053,
"grad_norm": 0.6358893513679504,
"learning_rate": 3.995864244563487e-05,
"loss": 0.3472,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29227280616760254,
"step": 940
},
{
"epoch": 0.8348056537102474,
"grad_norm": 0.6637855768203735,
"learning_rate": 3.9955762293744735e-05,
"loss": 0.3563,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3354935348033905,
"step": 945
},
{
"epoch": 0.8392226148409894,
"grad_norm": 1.028828740119934,
"learning_rate": 3.9952785312502107e-05,
"loss": 0.3675,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3541829586029053,
"step": 950
},
{
"epoch": 0.8436395759717314,
"grad_norm": 0.660925030708313,
"learning_rate": 3.99497115163519e-05,
"loss": 0.4159,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3134467601776123,
"step": 955
},
{
"epoch": 0.8480565371024735,
"grad_norm": 0.6419395208358765,
"learning_rate": 3.994654092020877e-05,
"loss": 0.3492,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.36793047189712524,
"step": 960
},
{
"epoch": 0.8524734982332155,
"grad_norm": 0.6670768857002258,
"learning_rate": 3.994327353945712e-05,
"loss": 0.3413,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35276514291763306,
"step": 965
},
{
"epoch": 0.8568904593639576,
"grad_norm": 0.7899559736251831,
"learning_rate": 3.9939909389950894e-05,
"loss": 0.3682,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4707202911376953,
"step": 970
},
{
"epoch": 0.8613074204946997,
"grad_norm": 0.8200883865356445,
"learning_rate": 3.9936448488013646e-05,
"loss": 0.3363,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33861371874809265,
"step": 975
},
{
"epoch": 0.8657243816254417,
"grad_norm": 0.7544311285018921,
"learning_rate": 3.9932890850438356e-05,
"loss": 0.3754,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4074310064315796,
"step": 980
},
{
"epoch": 0.8701413427561837,
"grad_norm": 0.8232197165489197,
"learning_rate": 3.9929236494487395e-05,
"loss": 0.359,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.39264535903930664,
"step": 985
},
{
"epoch": 0.8745583038869258,
"grad_norm": 0.6976638436317444,
"learning_rate": 3.9925485437892434e-05,
"loss": 0.3726,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34916025400161743,
"step": 990
},
{
"epoch": 0.8789752650176679,
"grad_norm": 0.7832766771316528,
"learning_rate": 3.992163769885435e-05,
"loss": 0.3198,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33082062005996704,
"step": 995
},
{
"epoch": 0.8833922261484098,
"grad_norm": 0.6496185064315796,
"learning_rate": 3.9917693296043124e-05,
"loss": 0.3586,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3823724091053009,
"step": 1000
},
{
"epoch": 0.8878091872791519,
"grad_norm": 1.394060492515564,
"learning_rate": 3.9913652248597806e-05,
"loss": 0.3653,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3977188169956207,
"step": 1005
},
{
"epoch": 0.892226148409894,
"grad_norm": 2.0863211154937744,
"learning_rate": 3.990951457612637e-05,
"loss": 0.3364,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3450695276260376,
"step": 1010
},
{
"epoch": 0.8966431095406361,
"grad_norm": 0.9185066223144531,
"learning_rate": 3.9905280298705624e-05,
"loss": 0.3569,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3249596357345581,
"step": 1015
},
{
"epoch": 0.901060070671378,
"grad_norm": 0.9138262271881104,
"learning_rate": 3.9900949436881126e-05,
"loss": 0.3507,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3235431909561157,
"step": 1020
},
{
"epoch": 0.9054770318021201,
"grad_norm": 0.663921058177948,
"learning_rate": 3.989652201166709e-05,
"loss": 0.3224,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30882376432418823,
"step": 1025
},
{
"epoch": 0.9098939929328622,
"grad_norm": 1.4138214588165283,
"learning_rate": 3.989199804454627e-05,
"loss": 0.3297,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.41032299399375916,
"step": 1030
},
{
"epoch": 0.9143109540636042,
"grad_norm": 0.7750231027603149,
"learning_rate": 3.988737755746986e-05,
"loss": 0.3366,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35097378492355347,
"step": 1035
},
{
"epoch": 0.9187279151943463,
"grad_norm": 0.691072940826416,
"learning_rate": 3.9882660572857375e-05,
"loss": 0.3495,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.318118155002594,
"step": 1040
},
{
"epoch": 0.9231448763250883,
"grad_norm": 1.0037578344345093,
"learning_rate": 3.987784711359658e-05,
"loss": 0.3272,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2796628773212433,
"step": 1045
},
{
"epoch": 0.9275618374558304,
"grad_norm": 0.667972981929779,
"learning_rate": 3.987293720304335e-05,
"loss": 0.3611,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.41761964559555054,
"step": 1050
},
{
"epoch": 0.9319787985865724,
"grad_norm": 0.6739106178283691,
"learning_rate": 3.9867930865021535e-05,
"loss": 0.3379,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28799429535865784,
"step": 1055
},
{
"epoch": 0.9363957597173145,
"grad_norm": 0.8230948448181152,
"learning_rate": 3.9862828123822905e-05,
"loss": 0.3756,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3900887966156006,
"step": 1060
},
{
"epoch": 0.9408127208480566,
"grad_norm": 0.6671487092971802,
"learning_rate": 3.985762900420698e-05,
"loss": 0.3687,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.36852848529815674,
"step": 1065
},
{
"epoch": 0.9452296819787986,
"grad_norm": 0.6791719198226929,
"learning_rate": 3.985233353140092e-05,
"loss": 0.2972,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28970006108283997,
"step": 1070
},
{
"epoch": 0.9496466431095406,
"grad_norm": 0.6565694212913513,
"learning_rate": 3.984694173109942e-05,
"loss": 0.3508,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3562009036540985,
"step": 1075
},
{
"epoch": 0.9540636042402827,
"grad_norm": 0.6499453186988831,
"learning_rate": 3.984145362946458e-05,
"loss": 0.361,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4719471335411072,
"step": 1080
},
{
"epoch": 0.9584805653710248,
"grad_norm": 0.6347289085388184,
"learning_rate": 3.983586925312576e-05,
"loss": 0.3525,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30270689725875854,
"step": 1085
},
{
"epoch": 0.9628975265017667,
"grad_norm": 0.7031768560409546,
"learning_rate": 3.983018862917948e-05,
"loss": 0.3245,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28910696506500244,
"step": 1090
},
{
"epoch": 0.9673144876325088,
"grad_norm": 0.6593021750450134,
"learning_rate": 3.9824411785189264e-05,
"loss": 0.3461,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2953903079032898,
"step": 1095
},
{
"epoch": 0.9717314487632509,
"grad_norm": 0.7052675485610962,
"learning_rate": 3.9818538749185506e-05,
"loss": 0.3357,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.38916873931884766,
"step": 1100
},
{
"epoch": 0.976148409893993,
"grad_norm": 0.781073808670044,
"learning_rate": 3.981256954966536e-05,
"loss": 0.3559,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.42383044958114624,
"step": 1105
},
{
"epoch": 0.980565371024735,
"grad_norm": 0.8780611157417297,
"learning_rate": 3.9806504215592575e-05,
"loss": 0.345,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3717604875564575,
"step": 1110
},
{
"epoch": 0.984982332155477,
"grad_norm": 0.6577640771865845,
"learning_rate": 3.980034277639737e-05,
"loss": 0.3427,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3323565423488617,
"step": 1115
},
{
"epoch": 0.9893992932862191,
"grad_norm": 0.6743144392967224,
"learning_rate": 3.979408526197628e-05,
"loss": 0.3845,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.37875691056251526,
"step": 1120
},
{
"epoch": 0.9938162544169611,
"grad_norm": 0.6501055359840393,
"learning_rate": 3.9787731702692004e-05,
"loss": 0.3406,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3506585955619812,
"step": 1125
},
{
"epoch": 0.9982332155477032,
"grad_norm": 0.7022161483764648,
"learning_rate": 3.9781282129373294e-05,
"loss": 0.3353,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.41503027081489563,
"step": 1130
},
{
"epoch": 1.0026501766784452,
"grad_norm": 0.6163008213043213,
"learning_rate": 3.9774736573314774e-05,
"loss": 0.3603,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.41754186153411865,
"step": 1135
},
{
"epoch": 1.0070671378091873,
"grad_norm": 0.7156196236610413,
"learning_rate": 3.9768095066276794e-05,
"loss": 0.3576,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.275761216878891,
"step": 1140
},
{
"epoch": 1.0114840989399294,
"grad_norm": 0.6655539274215698,
"learning_rate": 3.9761357640485255e-05,
"loss": 0.3423,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33661192655563354,
"step": 1145
},
{
"epoch": 1.0159010600706713,
"grad_norm": 0.673190712928772,
"learning_rate": 3.975452432863152e-05,
"loss": 0.317,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3377433717250824,
"step": 1150
},
{
"epoch": 1.0203180212014133,
"grad_norm": 0.6982813477516174,
"learning_rate": 3.974759516387216e-05,
"loss": 0.3091,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2920030951499939,
"step": 1155
},
{
"epoch": 1.0247349823321554,
"grad_norm": 0.6755205988883972,
"learning_rate": 3.9740570179828905e-05,
"loss": 0.3357,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28413355350494385,
"step": 1160
},
{
"epoch": 1.0291519434628975,
"grad_norm": 0.7137518525123596,
"learning_rate": 3.9733449410588354e-05,
"loss": 0.3105,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2865508198738098,
"step": 1165
},
{
"epoch": 1.0335689045936396,
"grad_norm": 0.6695932149887085,
"learning_rate": 3.972623289070191e-05,
"loss": 0.329,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3236057758331299,
"step": 1170
},
{
"epoch": 1.0379858657243817,
"grad_norm": 0.8682721257209778,
"learning_rate": 3.971892065518557e-05,
"loss": 0.2882,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2601502239704132,
"step": 1175
},
{
"epoch": 1.0424028268551238,
"grad_norm": 0.6531527638435364,
"learning_rate": 3.971151273951979e-05,
"loss": 0.2812,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30173492431640625,
"step": 1180
},
{
"epoch": 1.0468197879858656,
"grad_norm": 0.752144992351532,
"learning_rate": 3.970400917964922e-05,
"loss": 0.354,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2773906886577606,
"step": 1185
},
{
"epoch": 1.0512367491166077,
"grad_norm": 0.6246238350868225,
"learning_rate": 3.969641001198266e-05,
"loss": 0.32,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2532247304916382,
"step": 1190
},
{
"epoch": 1.0556537102473498,
"grad_norm": 0.7625958919525146,
"learning_rate": 3.9688715273392785e-05,
"loss": 0.294,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28142765164375305,
"step": 1195
},
{
"epoch": 1.0600706713780919,
"grad_norm": 0.6404998302459717,
"learning_rate": 3.9680925001216e-05,
"loss": 0.3253,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30740100145339966,
"step": 1200
},
{
"epoch": 1.064487632508834,
"grad_norm": 0.7395120859146118,
"learning_rate": 3.967303923325228e-05,
"loss": 0.3327,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34588027000427246,
"step": 1205
},
{
"epoch": 1.068904593639576,
"grad_norm": 0.6187570691108704,
"learning_rate": 3.966505800776493e-05,
"loss": 0.3793,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33034655451774597,
"step": 1210
},
{
"epoch": 1.073321554770318,
"grad_norm": 0.6744672656059265,
"learning_rate": 3.965698136348048e-05,
"loss": 0.3273,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.41641291975975037,
"step": 1215
},
{
"epoch": 1.0777385159010602,
"grad_norm": 0.6118738055229187,
"learning_rate": 3.96488093395884e-05,
"loss": 0.3072,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.306530237197876,
"step": 1220
},
{
"epoch": 1.082155477031802,
"grad_norm": 0.5981642603874207,
"learning_rate": 3.964054197574099e-05,
"loss": 0.3266,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34041914343833923,
"step": 1225
},
{
"epoch": 1.0865724381625441,
"grad_norm": 0.649811863899231,
"learning_rate": 3.963217931205317e-05,
"loss": 0.3013,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25741928815841675,
"step": 1230
},
{
"epoch": 1.0909893992932862,
"grad_norm": 0.6270660161972046,
"learning_rate": 3.962372138910223e-05,
"loss": 0.32,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3462643325328827,
"step": 1235
},
{
"epoch": 1.0954063604240283,
"grad_norm": 0.75999915599823,
"learning_rate": 3.9615168247927735e-05,
"loss": 0.3129,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31946852803230286,
"step": 1240
},
{
"epoch": 1.0998233215547704,
"grad_norm": 0.7573233246803284,
"learning_rate": 3.9606519930031225e-05,
"loss": 0.3373,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3408510386943817,
"step": 1245
},
{
"epoch": 1.1042402826855124,
"grad_norm": 0.6537328362464905,
"learning_rate": 3.959777647737606e-05,
"loss": 0.3615,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34509575366973877,
"step": 1250
},
{
"epoch": 1.1086572438162543,
"grad_norm": 0.6237077713012695,
"learning_rate": 3.958893793238723e-05,
"loss": 0.3505,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.365169882774353,
"step": 1255
},
{
"epoch": 1.1130742049469964,
"grad_norm": 0.6630376577377319,
"learning_rate": 3.958000433795113e-05,
"loss": 0.3865,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35122057795524597,
"step": 1260
},
{
"epoch": 1.1174911660777385,
"grad_norm": 0.6241370439529419,
"learning_rate": 3.957097573741534e-05,
"loss": 0.3463,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30081939697265625,
"step": 1265
},
{
"epoch": 1.1219081272084805,
"grad_norm": 0.7064021229743958,
"learning_rate": 3.956185217458843e-05,
"loss": 0.3429,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2906290888786316,
"step": 1270
},
{
"epoch": 1.1263250883392226,
"grad_norm": 0.7379579544067383,
"learning_rate": 3.955263369373977e-05,
"loss": 0.306,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2594120502471924,
"step": 1275
},
{
"epoch": 1.1307420494699647,
"grad_norm": 0.6167639493942261,
"learning_rate": 3.9543320339599266e-05,
"loss": 0.3344,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27128392457962036,
"step": 1280
},
{
"epoch": 1.1351590106007068,
"grad_norm": 0.6793010234832764,
"learning_rate": 3.953391215735718e-05,
"loss": 0.3495,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3618358373641968,
"step": 1285
},
{
"epoch": 1.1395759717314489,
"grad_norm": 0.8995165824890137,
"learning_rate": 3.952440919266389e-05,
"loss": 0.3221,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.43430811166763306,
"step": 1290
},
{
"epoch": 1.1439929328621907,
"grad_norm": 0.746021568775177,
"learning_rate": 3.951481149162968e-05,
"loss": 0.3149,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27596035599708557,
"step": 1295
},
{
"epoch": 1.1484098939929328,
"grad_norm": 0.6076446175575256,
"learning_rate": 3.950511910082452e-05,
"loss": 0.3011,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2746957242488861,
"step": 1300
},
{
"epoch": 1.1528268551236749,
"grad_norm": 0.692255973815918,
"learning_rate": 3.949533206727784e-05,
"loss": 0.3092,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3611806333065033,
"step": 1305
},
{
"epoch": 1.157243816254417,
"grad_norm": 0.7213220000267029,
"learning_rate": 3.948545043847826e-05,
"loss": 0.3042,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3065589368343353,
"step": 1310
},
{
"epoch": 1.161660777385159,
"grad_norm": 0.6529719829559326,
"learning_rate": 3.947547426237344e-05,
"loss": 0.3432,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3932499289512634,
"step": 1315
},
{
"epoch": 1.1660777385159011,
"grad_norm": 0.683671236038208,
"learning_rate": 3.9465403587369784e-05,
"loss": 0.3098,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.274165540933609,
"step": 1320
},
{
"epoch": 1.170494699646643,
"grad_norm": 0.8071700930595398,
"learning_rate": 3.945523846233222e-05,
"loss": 0.3043,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3191944658756256,
"step": 1325
},
{
"epoch": 1.174911660777385,
"grad_norm": 0.6285055875778198,
"learning_rate": 3.944497893658396e-05,
"loss": 0.3261,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3105619251728058,
"step": 1330
},
{
"epoch": 1.1793286219081272,
"grad_norm": 0.8717606663703918,
"learning_rate": 3.943462505990629e-05,
"loss": 0.3588,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3164404630661011,
"step": 1335
},
{
"epoch": 1.1837455830388692,
"grad_norm": 0.7031919956207275,
"learning_rate": 3.942417688253827e-05,
"loss": 0.3394,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3558098077774048,
"step": 1340
},
{
"epoch": 1.1881625441696113,
"grad_norm": 0.5943769216537476,
"learning_rate": 3.9413634455176584e-05,
"loss": 0.3199,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31143832206726074,
"step": 1345
},
{
"epoch": 1.1925795053003534,
"grad_norm": 0.6784660220146179,
"learning_rate": 3.940299782897517e-05,
"loss": 0.3039,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31072232127189636,
"step": 1350
},
{
"epoch": 1.1969964664310955,
"grad_norm": 0.6783735752105713,
"learning_rate": 3.939226705554507e-05,
"loss": 0.3124,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3142765462398529,
"step": 1355
},
{
"epoch": 1.2014134275618376,
"grad_norm": 0.6520729660987854,
"learning_rate": 3.9381442186954155e-05,
"loss": 0.3508,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2647053599357605,
"step": 1360
},
{
"epoch": 1.2058303886925794,
"grad_norm": 0.6096318960189819,
"learning_rate": 3.9370523275726844e-05,
"loss": 0.3369,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27471020817756653,
"step": 1365
},
{
"epoch": 1.2102473498233215,
"grad_norm": 0.6824337840080261,
"learning_rate": 3.935951037484388e-05,
"loss": 0.3035,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3124973177909851,
"step": 1370
},
{
"epoch": 1.2146643109540636,
"grad_norm": 0.7245553135871887,
"learning_rate": 3.934840353774208e-05,
"loss": 0.3162,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27780479192733765,
"step": 1375
},
{
"epoch": 1.2190812720848057,
"grad_norm": 0.8077725172042847,
"learning_rate": 3.9337202818314016e-05,
"loss": 0.2926,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.36201488971710205,
"step": 1380
},
{
"epoch": 1.2234982332155477,
"grad_norm": 0.6361654996871948,
"learning_rate": 3.932590827090783e-05,
"loss": 0.3642,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.40312081575393677,
"step": 1385
},
{
"epoch": 1.2279151943462898,
"grad_norm": 0.6152886748313904,
"learning_rate": 3.931451995032693e-05,
"loss": 0.3168,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3805939853191376,
"step": 1390
},
{
"epoch": 1.232332155477032,
"grad_norm": 0.7459203004837036,
"learning_rate": 3.930303791182972e-05,
"loss": 0.3519,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34478020668029785,
"step": 1395
},
{
"epoch": 1.2367491166077738,
"grad_norm": 0.8537634015083313,
"learning_rate": 3.929146221112936e-05,
"loss": 0.3215,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.281982421875,
"step": 1400
},
{
"epoch": 1.2411660777385158,
"grad_norm": 0.6000507473945618,
"learning_rate": 3.927979290439346e-05,
"loss": 0.3281,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2675206661224365,
"step": 1405
},
{
"epoch": 1.245583038869258,
"grad_norm": 0.640716016292572,
"learning_rate": 3.926803004824382e-05,
"loss": 0.3312,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2979744076728821,
"step": 1410
},
{
"epoch": 1.25,
"grad_norm": 0.7331620454788208,
"learning_rate": 3.925617369975619e-05,
"loss": 0.3385,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.335235059261322,
"step": 1415
},
{
"epoch": 1.254416961130742,
"grad_norm": 0.6532949805259705,
"learning_rate": 3.924422391645994e-05,
"loss": 0.3509,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3822665214538574,
"step": 1420
},
{
"epoch": 1.2588339222614842,
"grad_norm": 0.7220327854156494,
"learning_rate": 3.923218075633781e-05,
"loss": 0.3268,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3238363265991211,
"step": 1425
},
{
"epoch": 1.2632508833922262,
"grad_norm": 0.727918803691864,
"learning_rate": 3.9220044277825615e-05,
"loss": 0.3149,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31373223662376404,
"step": 1430
},
{
"epoch": 1.2676678445229683,
"grad_norm": 0.633305013179779,
"learning_rate": 3.920781453981199e-05,
"loss": 0.2994,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3962780237197876,
"step": 1435
},
{
"epoch": 1.2720848056537102,
"grad_norm": 0.6449732184410095,
"learning_rate": 3.919549160163806e-05,
"loss": 0.3217,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2957836985588074,
"step": 1440
},
{
"epoch": 1.2765017667844523,
"grad_norm": 0.8494489789009094,
"learning_rate": 3.91830755230972e-05,
"loss": 0.3579,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.47561344504356384,
"step": 1445
},
{
"epoch": 1.2809187279151943,
"grad_norm": 0.6150957942008972,
"learning_rate": 3.91705663644347e-05,
"loss": 0.3226,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3158280849456787,
"step": 1450
},
{
"epoch": 1.2853356890459364,
"grad_norm": 0.5343297719955444,
"learning_rate": 3.91579641863475e-05,
"loss": 0.323,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30349069833755493,
"step": 1455
},
{
"epoch": 1.2897526501766785,
"grad_norm": 0.8276622295379639,
"learning_rate": 3.91452690499839e-05,
"loss": 0.3446,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4098215103149414,
"step": 1460
},
{
"epoch": 1.2941696113074204,
"grad_norm": 0.6456217765808105,
"learning_rate": 3.913248101694323e-05,
"loss": 0.333,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.40201887488365173,
"step": 1465
},
{
"epoch": 1.2985865724381624,
"grad_norm": 0.5984911322593689,
"learning_rate": 3.911960014927559e-05,
"loss": 0.3269,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3077358603477478,
"step": 1470
},
{
"epoch": 1.3030035335689045,
"grad_norm": 0.6234849691390991,
"learning_rate": 3.910662650948153e-05,
"loss": 0.3081,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3293282687664032,
"step": 1475
},
{
"epoch": 1.3074204946996466,
"grad_norm": 0.6392715573310852,
"learning_rate": 3.9093560160511746e-05,
"loss": 0.3063,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34439730644226074,
"step": 1480
},
{
"epoch": 1.3118374558303887,
"grad_norm": 0.5880979299545288,
"learning_rate": 3.9080401165766776e-05,
"loss": 0.316,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2801549434661865,
"step": 1485
},
{
"epoch": 1.3162544169611308,
"grad_norm": 0.5844652652740479,
"learning_rate": 3.9067149589096695e-05,
"loss": 0.2849,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2638784646987915,
"step": 1490
},
{
"epoch": 1.3206713780918728,
"grad_norm": 0.5854594707489014,
"learning_rate": 3.905380549480081e-05,
"loss": 0.3029,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30802667140960693,
"step": 1495
},
{
"epoch": 1.325088339222615,
"grad_norm": 0.5132575035095215,
"learning_rate": 3.904036894762734e-05,
"loss": 0.3015,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2757795453071594,
"step": 1500
},
{
"epoch": 1.329505300353357,
"grad_norm": 0.790227472782135,
"learning_rate": 3.9026840012773094e-05,
"loss": 0.3119,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3685351610183716,
"step": 1505
},
{
"epoch": 1.3339222614840989,
"grad_norm": 0.5928642153739929,
"learning_rate": 3.901321875588317e-05,
"loss": 0.3241,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3497644364833832,
"step": 1510
},
{
"epoch": 1.338339222614841,
"grad_norm": 0.6434882283210754,
"learning_rate": 3.899950524305064e-05,
"loss": 0.3218,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3292595148086548,
"step": 1515
},
{
"epoch": 1.342756183745583,
"grad_norm": 0.7256221771240234,
"learning_rate": 3.898569954081621e-05,
"loss": 0.3332,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3023999333381653,
"step": 1520
},
{
"epoch": 1.3471731448763251,
"grad_norm": 0.7445887327194214,
"learning_rate": 3.897180171616791e-05,
"loss": 0.3047,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27664875984191895,
"step": 1525
},
{
"epoch": 1.3515901060070672,
"grad_norm": 0.6363182663917542,
"learning_rate": 3.895781183654076e-05,
"loss": 0.348,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2896704077720642,
"step": 1530
},
{
"epoch": 1.356007067137809,
"grad_norm": 0.7220079898834229,
"learning_rate": 3.894372996981647e-05,
"loss": 0.3056,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3187897503376007,
"step": 1535
},
{
"epoch": 1.3604240282685511,
"grad_norm": 0.9932358264923096,
"learning_rate": 3.892955618432306e-05,
"loss": 0.2863,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3777332007884979,
"step": 1540
},
{
"epoch": 1.3648409893992932,
"grad_norm": 0.6612488627433777,
"learning_rate": 3.891529054883458e-05,
"loss": 0.3671,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32506632804870605,
"step": 1545
},
{
"epoch": 1.3692579505300353,
"grad_norm": 0.809368371963501,
"learning_rate": 3.8900933132570755e-05,
"loss": 0.3164,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2754046618938446,
"step": 1550
},
{
"epoch": 1.3736749116607774,
"grad_norm": 0.6561906933784485,
"learning_rate": 3.888648400519663e-05,
"loss": 0.364,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27140358090400696,
"step": 1555
},
{
"epoch": 1.3780918727915195,
"grad_norm": 0.6149983406066895,
"learning_rate": 3.8871943236822274e-05,
"loss": 0.2918,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3111530542373657,
"step": 1560
},
{
"epoch": 1.3825088339222615,
"grad_norm": 0.788455605506897,
"learning_rate": 3.88573108980024e-05,
"loss": 0.3015,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31705474853515625,
"step": 1565
},
{
"epoch": 1.3869257950530036,
"grad_norm": 0.8068515062332153,
"learning_rate": 3.8842587059736054e-05,
"loss": 0.2891,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27609729766845703,
"step": 1570
},
{
"epoch": 1.3913427561837457,
"grad_norm": 0.5502995252609253,
"learning_rate": 3.882777179346622e-05,
"loss": 0.3524,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.286032497882843,
"step": 1575
},
{
"epoch": 1.3957597173144876,
"grad_norm": 0.5802372694015503,
"learning_rate": 3.881286517107957e-05,
"loss": 0.343,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24839898943901062,
"step": 1580
},
{
"epoch": 1.4001766784452296,
"grad_norm": 0.614653468132019,
"learning_rate": 3.879786726490599e-05,
"loss": 0.3196,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29347753524780273,
"step": 1585
},
{
"epoch": 1.4045936395759717,
"grad_norm": 0.5936715006828308,
"learning_rate": 3.8782778147718335e-05,
"loss": 0.329,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3896068036556244,
"step": 1590
},
{
"epoch": 1.4090106007067138,
"grad_norm": 5.28199577331543,
"learning_rate": 3.876759789273202e-05,
"loss": 0.3,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26368066668510437,
"step": 1595
},
{
"epoch": 1.4134275618374559,
"grad_norm": 0.7651248574256897,
"learning_rate": 3.8752326573604684e-05,
"loss": 0.3075,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30806493759155273,
"step": 1600
},
{
"epoch": 1.417844522968198,
"grad_norm": 0.6337783932685852,
"learning_rate": 3.873696426443581e-05,
"loss": 0.3195,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34544122219085693,
"step": 1605
},
{
"epoch": 1.4222614840989398,
"grad_norm": 0.7464025616645813,
"learning_rate": 3.872151103976642e-05,
"loss": 0.3251,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3165499269962311,
"step": 1610
},
{
"epoch": 1.426678445229682,
"grad_norm": 0.5613967180252075,
"learning_rate": 3.870596697457863e-05,
"loss": 0.3442,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27536633610725403,
"step": 1615
},
{
"epoch": 1.431095406360424,
"grad_norm": 0.6180069446563721,
"learning_rate": 3.8690332144295375e-05,
"loss": 0.3426,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.39397120475769043,
"step": 1620
},
{
"epoch": 1.435512367491166,
"grad_norm": 0.628078818321228,
"learning_rate": 3.867460662477996e-05,
"loss": 0.332,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3936801254749298,
"step": 1625
},
{
"epoch": 1.4399293286219081,
"grad_norm": 0.6523563265800476,
"learning_rate": 3.865879049233577e-05,
"loss": 0.3076,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31615036725997925,
"step": 1630
},
{
"epoch": 1.4443462897526502,
"grad_norm": 0.6801185607910156,
"learning_rate": 3.864288382370584e-05,
"loss": 0.3124,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27253082394599915,
"step": 1635
},
{
"epoch": 1.4487632508833923,
"grad_norm": 0.6551727056503296,
"learning_rate": 3.8626886696072495e-05,
"loss": 0.3393,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3144484758377075,
"step": 1640
},
{
"epoch": 1.4531802120141344,
"grad_norm": 0.6799625158309937,
"learning_rate": 3.8610799187057025e-05,
"loss": 0.3086,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3325912356376648,
"step": 1645
},
{
"epoch": 1.4575971731448762,
"grad_norm": 0.6419410705566406,
"learning_rate": 3.8594621374719226e-05,
"loss": 0.3026,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28254279494285583,
"step": 1650
},
{
"epoch": 1.4620141342756183,
"grad_norm": 0.6062602996826172,
"learning_rate": 3.857835333755709e-05,
"loss": 0.3182,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32770591974258423,
"step": 1655
},
{
"epoch": 1.4664310954063604,
"grad_norm": 0.6399083137512207,
"learning_rate": 3.856199515450638e-05,
"loss": 0.3236,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.38535135984420776,
"step": 1660
},
{
"epoch": 1.4708480565371025,
"grad_norm": 0.6881480813026428,
"learning_rate": 3.8545546904940285e-05,
"loss": 0.3233,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35274213552474976,
"step": 1665
},
{
"epoch": 1.4752650176678446,
"grad_norm": 0.708899199962616,
"learning_rate": 3.8529008668668996e-05,
"loss": 0.3243,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29779568314552307,
"step": 1670
},
{
"epoch": 1.4796819787985867,
"grad_norm": 0.5962197780609131,
"learning_rate": 3.851238052593935e-05,
"loss": 0.3054,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3215206265449524,
"step": 1675
},
{
"epoch": 1.4840989399293285,
"grad_norm": 0.6873301863670349,
"learning_rate": 3.849566255743442e-05,
"loss": 0.3252,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29667988419532776,
"step": 1680
},
{
"epoch": 1.4885159010600706,
"grad_norm": 0.7549837827682495,
"learning_rate": 3.8478854844273134e-05,
"loss": 0.3139,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32202067971229553,
"step": 1685
},
{
"epoch": 1.4929328621908127,
"grad_norm": 0.6532432436943054,
"learning_rate": 3.846195746800988e-05,
"loss": 0.2846,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2808791995048523,
"step": 1690
},
{
"epoch": 1.4973498233215548,
"grad_norm": 0.6817176342010498,
"learning_rate": 3.8444970510634124e-05,
"loss": 0.3371,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29034101963043213,
"step": 1695
},
{
"epoch": 1.5017667844522968,
"grad_norm": 0.6129101514816284,
"learning_rate": 3.842789405456996e-05,
"loss": 0.3295,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3373889923095703,
"step": 1700
},
{
"epoch": 1.506183745583039,
"grad_norm": 0.5987979769706726,
"learning_rate": 3.841072818267578e-05,
"loss": 0.3237,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.36195623874664307,
"step": 1705
},
{
"epoch": 1.510600706713781,
"grad_norm": 0.6188389658927917,
"learning_rate": 3.839347297824383e-05,
"loss": 0.3196,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28901487588882446,
"step": 1710
},
{
"epoch": 1.515017667844523,
"grad_norm": 0.605717658996582,
"learning_rate": 3.837612852499982e-05,
"loss": 0.352,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31816282868385315,
"step": 1715
},
{
"epoch": 1.5194346289752652,
"grad_norm": 0.5934823751449585,
"learning_rate": 3.8358694907102504e-05,
"loss": 0.3625,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3863886594772339,
"step": 1720
},
{
"epoch": 1.523851590106007,
"grad_norm": 0.578649640083313,
"learning_rate": 3.834117220914328e-05,
"loss": 0.3449,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2828061580657959,
"step": 1725
},
{
"epoch": 1.528268551236749,
"grad_norm": 0.6291191577911377,
"learning_rate": 3.832356051614579e-05,
"loss": 0.307,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28863656520843506,
"step": 1730
},
{
"epoch": 1.5326855123674912,
"grad_norm": 0.6461930274963379,
"learning_rate": 3.8305859913565505e-05,
"loss": 0.3011,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3028411865234375,
"step": 1735
},
{
"epoch": 1.5371024734982333,
"grad_norm": 0.7606573104858398,
"learning_rate": 3.8288070487289274e-05,
"loss": 0.3087,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3565424084663391,
"step": 1740
},
{
"epoch": 1.5415194346289751,
"grad_norm": 0.6665891408920288,
"learning_rate": 3.827019232363496e-05,
"loss": 0.3183,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3757513165473938,
"step": 1745
},
{
"epoch": 1.5459363957597172,
"grad_norm": 0.6307830214500427,
"learning_rate": 3.8252225509350985e-05,
"loss": 0.3252,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.322690486907959,
"step": 1750
},
{
"epoch": 1.5503533568904593,
"grad_norm": 0.6449663043022156,
"learning_rate": 3.823417013161594e-05,
"loss": 0.3276,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3264492154121399,
"step": 1755
},
{
"epoch": 1.5547703180212014,
"grad_norm": 0.6309463381767273,
"learning_rate": 3.821602627803813e-05,
"loss": 0.3399,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3071059584617615,
"step": 1760
},
{
"epoch": 1.5591872791519434,
"grad_norm": 0.5758494138717651,
"learning_rate": 3.819779403665515e-05,
"loss": 0.3248,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3266148567199707,
"step": 1765
},
{
"epoch": 1.5636042402826855,
"grad_norm": 0.6446824669837952,
"learning_rate": 3.8179473495933497e-05,
"loss": 0.3323,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3177984952926636,
"step": 1770
},
{
"epoch": 1.5680212014134276,
"grad_norm": 0.5853697657585144,
"learning_rate": 3.8161064744768096e-05,
"loss": 0.2712,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24256934225559235,
"step": 1775
},
{
"epoch": 1.5724381625441697,
"grad_norm": 0.722345232963562,
"learning_rate": 3.814256787248189e-05,
"loss": 0.3833,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4055064916610718,
"step": 1780
},
{
"epoch": 1.5768551236749118,
"grad_norm": 0.7835103869438171,
"learning_rate": 3.81239829688254e-05,
"loss": 0.3099,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3148956596851349,
"step": 1785
},
{
"epoch": 1.5812720848056538,
"grad_norm": 0.6101419925689697,
"learning_rate": 3.810531012397632e-05,
"loss": 0.3416,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3856297433376312,
"step": 1790
},
{
"epoch": 1.585689045936396,
"grad_norm": 0.607524037361145,
"learning_rate": 3.8086549428539016e-05,
"loss": 0.3393,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3431060314178467,
"step": 1795
},
{
"epoch": 1.5901060070671378,
"grad_norm": 0.700236976146698,
"learning_rate": 3.806770097354413e-05,
"loss": 0.2922,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30610954761505127,
"step": 1800
},
{
"epoch": 1.5945229681978799,
"grad_norm": 0.5721810460090637,
"learning_rate": 3.8048764850448146e-05,
"loss": 0.3178,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2738623023033142,
"step": 1805
},
{
"epoch": 1.598939929328622,
"grad_norm": 0.591039776802063,
"learning_rate": 3.802974115113292e-05,
"loss": 0.3071,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3155909776687622,
"step": 1810
},
{
"epoch": 1.6033568904593638,
"grad_norm": 0.6222483515739441,
"learning_rate": 3.801062996790526e-05,
"loss": 0.3603,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.40520355105400085,
"step": 1815
},
{
"epoch": 1.6077738515901059,
"grad_norm": 0.6669710874557495,
"learning_rate": 3.7991431393496435e-05,
"loss": 0.3065,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3122641444206238,
"step": 1820
},
{
"epoch": 1.612190812720848,
"grad_norm": 0.628384530544281,
"learning_rate": 3.797214552106178e-05,
"loss": 0.2951,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29349058866500854,
"step": 1825
},
{
"epoch": 1.61660777385159,
"grad_norm": 0.6003797054290771,
"learning_rate": 3.7952772444180205e-05,
"loss": 0.3327,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28449106216430664,
"step": 1830
},
{
"epoch": 1.6210247349823321,
"grad_norm": 0.6813213229179382,
"learning_rate": 3.793331225685376e-05,
"loss": 0.3209,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.247023805975914,
"step": 1835
},
{
"epoch": 1.6254416961130742,
"grad_norm": 0.6331183314323425,
"learning_rate": 3.791376505350716e-05,
"loss": 0.2859,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2805844843387604,
"step": 1840
},
{
"epoch": 1.6298586572438163,
"grad_norm": 0.5931798815727234,
"learning_rate": 3.789413092898735e-05,
"loss": 0.2862,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2882406711578369,
"step": 1845
},
{
"epoch": 1.6342756183745584,
"grad_norm": 0.6270744204521179,
"learning_rate": 3.7874409978563045e-05,
"loss": 0.2997,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27691859006881714,
"step": 1850
},
{
"epoch": 1.6386925795053005,
"grad_norm": 0.6561160087585449,
"learning_rate": 3.785460229792422e-05,
"loss": 0.2763,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2742835283279419,
"step": 1855
},
{
"epoch": 1.6431095406360425,
"grad_norm": 0.6197009086608887,
"learning_rate": 3.783470798318173e-05,
"loss": 0.3189,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2708655297756195,
"step": 1860
},
{
"epoch": 1.6475265017667846,
"grad_norm": 0.7104551196098328,
"learning_rate": 3.7814727130866756e-05,
"loss": 0.3393,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3953385353088379,
"step": 1865
},
{
"epoch": 1.6519434628975265,
"grad_norm": 0.6067651510238647,
"learning_rate": 3.779465983793039e-05,
"loss": 0.3433,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31828904151916504,
"step": 1870
},
{
"epoch": 1.6563604240282686,
"grad_norm": 0.6345065236091614,
"learning_rate": 3.7774506201743175e-05,
"loss": 0.3252,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3190929591655731,
"step": 1875
},
{
"epoch": 1.6607773851590106,
"grad_norm": 0.5989964604377747,
"learning_rate": 3.775426632009456e-05,
"loss": 0.3066,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2885761857032776,
"step": 1880
},
{
"epoch": 1.6651943462897525,
"grad_norm": 0.6218124032020569,
"learning_rate": 3.7733940291192516e-05,
"loss": 0.3205,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2709357738494873,
"step": 1885
},
{
"epoch": 1.6696113074204946,
"grad_norm": 0.6491353511810303,
"learning_rate": 3.771352821366301e-05,
"loss": 0.3574,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31962040066719055,
"step": 1890
},
{
"epoch": 1.6740282685512367,
"grad_norm": 0.6486796736717224,
"learning_rate": 3.769303018654951e-05,
"loss": 0.3129,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27561670541763306,
"step": 1895
},
{
"epoch": 1.6784452296819787,
"grad_norm": 0.8267950415611267,
"learning_rate": 3.7672446309312554e-05,
"loss": 0.3588,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2873364984989166,
"step": 1900
},
{
"epoch": 1.6828621908127208,
"grad_norm": 0.6081082820892334,
"learning_rate": 3.765177668182923e-05,
"loss": 0.3609,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35057562589645386,
"step": 1905
},
{
"epoch": 1.687279151943463,
"grad_norm": 0.5976192951202393,
"learning_rate": 3.763102140439272e-05,
"loss": 0.3016,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2694868743419647,
"step": 1910
},
{
"epoch": 1.691696113074205,
"grad_norm": 0.7123092412948608,
"learning_rate": 3.7610180577711774e-05,
"loss": 0.2874,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2789933681488037,
"step": 1915
},
{
"epoch": 1.696113074204947,
"grad_norm": 0.741333544254303,
"learning_rate": 3.758925430291025e-05,
"loss": 0.3251,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35715097188949585,
"step": 1920
},
{
"epoch": 1.7005300353356891,
"grad_norm": 0.6544567942619324,
"learning_rate": 3.756824268152663e-05,
"loss": 0.326,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2953934669494629,
"step": 1925
},
{
"epoch": 1.7049469964664312,
"grad_norm": 1.1364296674728394,
"learning_rate": 3.7547145815513504e-05,
"loss": 0.3568,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.43960511684417725,
"step": 1930
},
{
"epoch": 1.7093639575971733,
"grad_norm": 0.5804359912872314,
"learning_rate": 3.752596380723709e-05,
"loss": 0.3531,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4059427082538605,
"step": 1935
},
{
"epoch": 1.7137809187279152,
"grad_norm": 0.6707079410552979,
"learning_rate": 3.750469675947672e-05,
"loss": 0.3044,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.308398962020874,
"step": 1940
},
{
"epoch": 1.7181978798586572,
"grad_norm": 0.6289849281311035,
"learning_rate": 3.7483344775424376e-05,
"loss": 0.3225,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35997170209884644,
"step": 1945
},
{
"epoch": 1.7226148409893993,
"grad_norm": 0.5992223620414734,
"learning_rate": 3.746190795868416e-05,
"loss": 0.3168,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.36869436502456665,
"step": 1950
},
{
"epoch": 1.7270318021201412,
"grad_norm": 0.5985012054443359,
"learning_rate": 3.7440386413271796e-05,
"loss": 0.2932,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3474159240722656,
"step": 1955
},
{
"epoch": 1.7314487632508833,
"grad_norm": 0.6145229339599609,
"learning_rate": 3.741878024361412e-05,
"loss": 0.3082,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34130439162254333,
"step": 1960
},
{
"epoch": 1.7358657243816253,
"grad_norm": 0.5978900790214539,
"learning_rate": 3.7397089554548606e-05,
"loss": 0.2994,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3122338354587555,
"step": 1965
},
{
"epoch": 1.7402826855123674,
"grad_norm": 0.6474099159240723,
"learning_rate": 3.73753144513228e-05,
"loss": 0.2718,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2959476113319397,
"step": 1970
},
{
"epoch": 1.7446996466431095,
"grad_norm": 0.5508759021759033,
"learning_rate": 3.735345503959388e-05,
"loss": 0.3195,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25343257188796997,
"step": 1975
},
{
"epoch": 1.7491166077738516,
"grad_norm": 0.5979631543159485,
"learning_rate": 3.7331511425428075e-05,
"loss": 0.307,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3401501476764679,
"step": 1980
},
{
"epoch": 1.7535335689045937,
"grad_norm": 0.6378216743469238,
"learning_rate": 3.73094837153002e-05,
"loss": 0.3163,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3658217787742615,
"step": 1985
},
{
"epoch": 1.7579505300353357,
"grad_norm": 0.623346745967865,
"learning_rate": 3.7287372016093106e-05,
"loss": 0.3476,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3634029030799866,
"step": 1990
},
{
"epoch": 1.7623674911660778,
"grad_norm": 0.548507034778595,
"learning_rate": 3.726517643509718e-05,
"loss": 0.3238,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34876665472984314,
"step": 1995
},
{
"epoch": 1.76678445229682,
"grad_norm": 0.7020362615585327,
"learning_rate": 3.724289708000984e-05,
"loss": 0.313,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29458093643188477,
"step": 2000
},
{
"epoch": 1.771201413427562,
"grad_norm": 1.0174129009246826,
"learning_rate": 3.722053405893495e-05,
"loss": 0.3045,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2385835349559784,
"step": 2005
},
{
"epoch": 1.7756183745583038,
"grad_norm": 0.6126503348350525,
"learning_rate": 3.7198087480382386e-05,
"loss": 0.3038,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24906566739082336,
"step": 2010
},
{
"epoch": 1.780035335689046,
"grad_norm": 0.6186851263046265,
"learning_rate": 3.7175557453267435e-05,
"loss": 0.3153,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.36772221326828003,
"step": 2015
},
{
"epoch": 1.784452296819788,
"grad_norm": 0.5845491886138916,
"learning_rate": 3.715294408691029e-05,
"loss": 0.3231,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31444716453552246,
"step": 2020
},
{
"epoch": 1.78886925795053,
"grad_norm": 0.5485044121742249,
"learning_rate": 3.713024749103554e-05,
"loss": 0.3279,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28981852531433105,
"step": 2025
},
{
"epoch": 1.793286219081272,
"grad_norm": 0.7004613280296326,
"learning_rate": 3.71074677757716e-05,
"loss": 0.3089,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2541000247001648,
"step": 2030
},
{
"epoch": 1.797703180212014,
"grad_norm": 0.7733515501022339,
"learning_rate": 3.708460505165021e-05,
"loss": 0.3438,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3459468185901642,
"step": 2035
},
{
"epoch": 1.802120141342756,
"grad_norm": 0.5577183961868286,
"learning_rate": 3.706165942960589e-05,
"loss": 0.3271,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3305257558822632,
"step": 2040
},
{
"epoch": 1.8065371024734982,
"grad_norm": 0.6522884964942932,
"learning_rate": 3.703863102097538e-05,
"loss": 0.3168,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3232169449329376,
"step": 2045
},
{
"epoch": 1.8109540636042403,
"grad_norm": 0.6660712361335754,
"learning_rate": 3.701551993749714e-05,
"loss": 0.3165,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2644830346107483,
"step": 2050
},
{
"epoch": 1.8153710247349824,
"grad_norm": 0.7599813938140869,
"learning_rate": 3.6992326291310764e-05,
"loss": 0.3048,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30041027069091797,
"step": 2055
},
{
"epoch": 1.8197879858657244,
"grad_norm": 0.6655240058898926,
"learning_rate": 3.696905019495647e-05,
"loss": 0.2975,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2828328609466553,
"step": 2060
},
{
"epoch": 1.8242049469964665,
"grad_norm": 0.608040988445282,
"learning_rate": 3.6945691761374535e-05,
"loss": 0.3234,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.42636048793792725,
"step": 2065
},
{
"epoch": 1.8286219081272086,
"grad_norm": 0.6305931806564331,
"learning_rate": 3.692225110390474e-05,
"loss": 0.3236,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3188191056251526,
"step": 2070
},
{
"epoch": 1.8330388692579507,
"grad_norm": 0.7485166788101196,
"learning_rate": 3.689872833628587e-05,
"loss": 0.3203,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.42439842224121094,
"step": 2075
},
{
"epoch": 1.8374558303886925,
"grad_norm": 0.5816894769668579,
"learning_rate": 3.687512357265509e-05,
"loss": 0.3268,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.37391436100006104,
"step": 2080
},
{
"epoch": 1.8418727915194346,
"grad_norm": 0.6705328822135925,
"learning_rate": 3.685143692754743e-05,
"loss": 0.3141,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.361337274312973,
"step": 2085
},
{
"epoch": 1.8462897526501767,
"grad_norm": 0.6142212152481079,
"learning_rate": 3.6827668515895234e-05,
"loss": 0.3092,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3198060691356659,
"step": 2090
},
{
"epoch": 1.8507067137809188,
"grad_norm": 0.6588479280471802,
"learning_rate": 3.68038184530276e-05,
"loss": 0.3024,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3107324540615082,
"step": 2095
},
{
"epoch": 1.8551236749116606,
"grad_norm": 0.6271264553070068,
"learning_rate": 3.6779886854669815e-05,
"loss": 0.2935,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3267250657081604,
"step": 2100
},
{
"epoch": 1.8595406360424027,
"grad_norm": 0.6405203938484192,
"learning_rate": 3.6755873836942756e-05,
"loss": 0.3282,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32828402519226074,
"step": 2105
},
{
"epoch": 1.8639575971731448,
"grad_norm": 0.6557453274726868,
"learning_rate": 3.673177951636242e-05,
"loss": 0.3506,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28758811950683594,
"step": 2110
},
{
"epoch": 1.8683745583038869,
"grad_norm": 1.1797688007354736,
"learning_rate": 3.670760400983925e-05,
"loss": 0.3433,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.39800888299942017,
"step": 2115
},
{
"epoch": 1.872791519434629,
"grad_norm": 0.6395815014839172,
"learning_rate": 3.6683347434677654e-05,
"loss": 0.3342,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3299906253814697,
"step": 2120
},
{
"epoch": 1.877208480565371,
"grad_norm": 0.6777175664901733,
"learning_rate": 3.6659009908575394e-05,
"loss": 0.2953,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2692110538482666,
"step": 2125
},
{
"epoch": 1.8816254416961131,
"grad_norm": 0.5334305763244629,
"learning_rate": 3.663459154962301e-05,
"loss": 0.3263,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3176087439060211,
"step": 2130
},
{
"epoch": 1.8860424028268552,
"grad_norm": 0.5658344626426697,
"learning_rate": 3.661009247630326e-05,
"loss": 0.3149,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.36185193061828613,
"step": 2135
},
{
"epoch": 1.8904593639575973,
"grad_norm": 0.6579681634902954,
"learning_rate": 3.658551280749055e-05,
"loss": 0.2887,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25398847460746765,
"step": 2140
},
{
"epoch": 1.8948763250883394,
"grad_norm": 0.6341381072998047,
"learning_rate": 3.656085266245038e-05,
"loss": 0.2892,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3196604251861572,
"step": 2145
},
{
"epoch": 1.8992932862190812,
"grad_norm": 0.6079564094543457,
"learning_rate": 3.653611216083867e-05,
"loss": 0.3093,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.41236889362335205,
"step": 2150
},
{
"epoch": 1.9037102473498233,
"grad_norm": 0.5752854347229004,
"learning_rate": 3.651129142270132e-05,
"loss": 0.3111,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32420551776885986,
"step": 2155
},
{
"epoch": 1.9081272084805654,
"grad_norm": 0.7557041645050049,
"learning_rate": 3.6486390568473494e-05,
"loss": 0.3236,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.40055620670318604,
"step": 2160
},
{
"epoch": 1.9125441696113075,
"grad_norm": 0.6140500903129578,
"learning_rate": 3.646140971897914e-05,
"loss": 0.2967,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3187151551246643,
"step": 2165
},
{
"epoch": 1.9169611307420493,
"grad_norm": 0.5381097793579102,
"learning_rate": 3.6436348995430314e-05,
"loss": 0.3371,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20654772222042084,
"step": 2170
},
{
"epoch": 1.9213780918727914,
"grad_norm": 0.6211098432540894,
"learning_rate": 3.641120851942669e-05,
"loss": 0.3121,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27154093980789185,
"step": 2175
},
{
"epoch": 1.9257950530035335,
"grad_norm": 0.6953615546226501,
"learning_rate": 3.638598841295487e-05,
"loss": 0.3264,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31072691082954407,
"step": 2180
},
{
"epoch": 1.9302120141342756,
"grad_norm": 0.579765796661377,
"learning_rate": 3.6360688798387865e-05,
"loss": 0.3463,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3020220398902893,
"step": 2185
},
{
"epoch": 1.9346289752650176,
"grad_norm": 0.6398031115531921,
"learning_rate": 3.633530979848446e-05,
"loss": 0.2941,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2853238880634308,
"step": 2190
},
{
"epoch": 1.9390459363957597,
"grad_norm": 0.6654336452484131,
"learning_rate": 3.6309851536388664e-05,
"loss": 0.3671,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35676461458206177,
"step": 2195
},
{
"epoch": 1.9434628975265018,
"grad_norm": 0.5575515627861023,
"learning_rate": 3.6284314135629036e-05,
"loss": 0.3231,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4140787720680237,
"step": 2200
},
{
"epoch": 1.947879858657244,
"grad_norm": 0.6344410181045532,
"learning_rate": 3.625869772011816e-05,
"loss": 0.3538,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34461867809295654,
"step": 2205
},
{
"epoch": 1.952296819787986,
"grad_norm": 0.6593054533004761,
"learning_rate": 3.6233002414152025e-05,
"loss": 0.3141,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3200452923774719,
"step": 2210
},
{
"epoch": 1.956713780918728,
"grad_norm": 0.5528004765510559,
"learning_rate": 3.620722834240939e-05,
"loss": 0.3353,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31394898891448975,
"step": 2215
},
{
"epoch": 1.96113074204947,
"grad_norm": 0.5560632944107056,
"learning_rate": 3.61813756299512e-05,
"loss": 0.3328,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3282029330730438,
"step": 2220
},
{
"epoch": 1.965547703180212,
"grad_norm": 0.6172696352005005,
"learning_rate": 3.6155444402219995e-05,
"loss": 0.3315,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2924641966819763,
"step": 2225
},
{
"epoch": 1.969964664310954,
"grad_norm": 0.6023048758506775,
"learning_rate": 3.612943478503929e-05,
"loss": 0.313,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33808523416519165,
"step": 2230
},
{
"epoch": 1.9743816254416962,
"grad_norm": 0.5986247658729553,
"learning_rate": 3.610334690461295e-05,
"loss": 0.2988,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3043588399887085,
"step": 2235
},
{
"epoch": 1.978798586572438,
"grad_norm": 0.6460138559341431,
"learning_rate": 3.6077180887524584e-05,
"loss": 0.3008,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3652576506137848,
"step": 2240
},
{
"epoch": 1.98321554770318,
"grad_norm": 0.6833009123802185,
"learning_rate": 3.605093686073694e-05,
"loss": 0.3143,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3827962875366211,
"step": 2245
},
{
"epoch": 1.9876325088339222,
"grad_norm": 0.5557314157485962,
"learning_rate": 3.602461495159131e-05,
"loss": 0.3122,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28441256284713745,
"step": 2250
},
{
"epoch": 1.9920494699646643,
"grad_norm": 0.6646502017974854,
"learning_rate": 3.5998215287806845e-05,
"loss": 0.3075,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2972768545150757,
"step": 2255
},
{
"epoch": 1.9964664310954063,
"grad_norm": 0.6511381268501282,
"learning_rate": 3.597173799748001e-05,
"loss": 0.3088,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3291865587234497,
"step": 2260
},
{
"epoch": 2.001766784452297,
"grad_norm": 0.5447190999984741,
"learning_rate": 3.594518320908391e-05,
"loss": 0.2974,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28889337182044983,
"step": 2265
},
{
"epoch": 2.006183745583039,
"grad_norm": 0.6198117733001709,
"learning_rate": 3.591855105146769e-05,
"loss": 0.2677,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23140525817871094,
"step": 2270
},
{
"epoch": 2.010600706713781,
"grad_norm": 0.5488595366477966,
"learning_rate": 3.589184165385592e-05,
"loss": 0.2922,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23664027452468872,
"step": 2275
},
{
"epoch": 2.015017667844523,
"grad_norm": 0.5935027599334717,
"learning_rate": 3.586505514584793e-05,
"loss": 0.2791,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3317275643348694,
"step": 2280
},
{
"epoch": 2.019434628975265,
"grad_norm": 0.5912279486656189,
"learning_rate": 3.583819165741722e-05,
"loss": 0.3277,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3340924382209778,
"step": 2285
},
{
"epoch": 2.0238515901060072,
"grad_norm": 0.6450473070144653,
"learning_rate": 3.581125131891082e-05,
"loss": 0.2556,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2265785187482834,
"step": 2290
},
{
"epoch": 2.0282685512367493,
"grad_norm": 0.605987548828125,
"learning_rate": 3.578423426104864e-05,
"loss": 0.3267,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.38664746284484863,
"step": 2295
},
{
"epoch": 2.032685512367491,
"grad_norm": 0.6688462495803833,
"learning_rate": 3.5757140614922846e-05,
"loss": 0.2794,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29907870292663574,
"step": 2300
},
{
"epoch": 2.037102473498233,
"grad_norm": 0.6601260900497437,
"learning_rate": 3.572997051199724e-05,
"loss": 0.3039,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4096679091453552,
"step": 2305
},
{
"epoch": 2.041519434628975,
"grad_norm": 0.6559783220291138,
"learning_rate": 3.5702724084106596e-05,
"loss": 0.2865,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2515143156051636,
"step": 2310
},
{
"epoch": 2.045936395759717,
"grad_norm": 0.8877668380737305,
"learning_rate": 3.567540146345604e-05,
"loss": 0.3152,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.324682354927063,
"step": 2315
},
{
"epoch": 2.0503533568904593,
"grad_norm": 0.6168680787086487,
"learning_rate": 3.5648002782620375e-05,
"loss": 0.3071,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28638938069343567,
"step": 2320
},
{
"epoch": 2.0547703180212014,
"grad_norm": 0.6127181649208069,
"learning_rate": 3.562052817454351e-05,
"loss": 0.2835,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3344689905643463,
"step": 2325
},
{
"epoch": 2.0591872791519434,
"grad_norm": 0.6770476698875427,
"learning_rate": 3.5592977772537734e-05,
"loss": 0.2967,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.257055401802063,
"step": 2330
},
{
"epoch": 2.0636042402826855,
"grad_norm": 0.5903377532958984,
"learning_rate": 3.55653517102831e-05,
"loss": 0.2975,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32297080755233765,
"step": 2335
},
{
"epoch": 2.0680212014134276,
"grad_norm": 0.6872009634971619,
"learning_rate": 3.5537650121826804e-05,
"loss": 0.2931,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3144655227661133,
"step": 2340
},
{
"epoch": 2.0724381625441697,
"grad_norm": 0.6215624213218689,
"learning_rate": 3.550987314158249e-05,
"loss": 0.3258,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35112127661705017,
"step": 2345
},
{
"epoch": 2.0768551236749118,
"grad_norm": 0.6877519488334656,
"learning_rate": 3.5482020904329635e-05,
"loss": 0.2963,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26411569118499756,
"step": 2350
},
{
"epoch": 2.081272084805654,
"grad_norm": 0.8698825240135193,
"learning_rate": 3.545409354521286e-05,
"loss": 0.3224,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33345749974250793,
"step": 2355
},
{
"epoch": 2.085689045936396,
"grad_norm": 0.6604434251785278,
"learning_rate": 3.542609119974129e-05,
"loss": 0.2875,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3097860515117645,
"step": 2360
},
{
"epoch": 2.090106007067138,
"grad_norm": 0.5723004341125488,
"learning_rate": 3.539801400378793e-05,
"loss": 0.2737,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23669950664043427,
"step": 2365
},
{
"epoch": 2.0945229681978796,
"grad_norm": 0.7183104157447815,
"learning_rate": 3.5369862093588946e-05,
"loss": 0.2733,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3131876289844513,
"step": 2370
},
{
"epoch": 2.0989399293286217,
"grad_norm": 0.649411141872406,
"learning_rate": 3.534163560574304e-05,
"loss": 0.3283,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31314218044281006,
"step": 2375
},
{
"epoch": 2.103356890459364,
"grad_norm": 0.659487247467041,
"learning_rate": 3.531333467721078e-05,
"loss": 0.3206,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31890353560447693,
"step": 2380
},
{
"epoch": 2.107773851590106,
"grad_norm": 0.6342830657958984,
"learning_rate": 3.5284959445313945e-05,
"loss": 0.2959,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2846304178237915,
"step": 2385
},
{
"epoch": 2.112190812720848,
"grad_norm": 0.6259862780570984,
"learning_rate": 3.525651004773481e-05,
"loss": 0.3114,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35213717818260193,
"step": 2390
},
{
"epoch": 2.11660777385159,
"grad_norm": 0.6828689575195312,
"learning_rate": 3.522798662251558e-05,
"loss": 0.3066,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35810428857803345,
"step": 2395
},
{
"epoch": 2.121024734982332,
"grad_norm": 0.621423602104187,
"learning_rate": 3.51993893080576e-05,
"loss": 0.288,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23874174058437347,
"step": 2400
},
{
"epoch": 2.125441696113074,
"grad_norm": 0.6099359393119812,
"learning_rate": 3.517071824312077e-05,
"loss": 0.3052,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2990003228187561,
"step": 2405
},
{
"epoch": 2.1298586572438163,
"grad_norm": 0.6000734567642212,
"learning_rate": 3.5141973566822843e-05,
"loss": 0.2777,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2534373700618744,
"step": 2410
},
{
"epoch": 2.1342756183745584,
"grad_norm": 0.5893080830574036,
"learning_rate": 3.511315541863873e-05,
"loss": 0.2757,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2416633665561676,
"step": 2415
},
{
"epoch": 2.1386925795053005,
"grad_norm": 0.6631132960319519,
"learning_rate": 3.508426393839986e-05,
"loss": 0.3008,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28256604075431824,
"step": 2420
},
{
"epoch": 2.1431095406360425,
"grad_norm": 0.5691207647323608,
"learning_rate": 3.505529926629348e-05,
"loss": 0.2822,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2603251338005066,
"step": 2425
},
{
"epoch": 2.1475265017667846,
"grad_norm": 0.632331371307373,
"learning_rate": 3.502626154286196e-05,
"loss": 0.2722,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25206872820854187,
"step": 2430
},
{
"epoch": 2.1519434628975267,
"grad_norm": 0.5492742657661438,
"learning_rate": 3.4997150909002156e-05,
"loss": 0.2772,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21242624521255493,
"step": 2435
},
{
"epoch": 2.1563604240282688,
"grad_norm": 0.6887109279632568,
"learning_rate": 3.496796750596469e-05,
"loss": 0.3005,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33346956968307495,
"step": 2440
},
{
"epoch": 2.1607773851590104,
"grad_norm": 0.6558607816696167,
"learning_rate": 3.4938711475353286e-05,
"loss": 0.262,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22680602967739105,
"step": 2445
},
{
"epoch": 2.1651943462897525,
"grad_norm": 0.6331962943077087,
"learning_rate": 3.490938295912404e-05,
"loss": 0.3254,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.319640576839447,
"step": 2450
},
{
"epoch": 2.1696113074204946,
"grad_norm": 0.6158129572868347,
"learning_rate": 3.487998209958479e-05,
"loss": 0.2815,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31720587611198425,
"step": 2455
},
{
"epoch": 2.1740282685512367,
"grad_norm": 0.6651401519775391,
"learning_rate": 3.485050903939439e-05,
"loss": 0.2918,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33852121233940125,
"step": 2460
},
{
"epoch": 2.1784452296819787,
"grad_norm": 0.7507199048995972,
"learning_rate": 3.482096392156203e-05,
"loss": 0.3105,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3842792212963104,
"step": 2465
},
{
"epoch": 2.182862190812721,
"grad_norm": 0.5813544988632202,
"learning_rate": 3.4791346889446536e-05,
"loss": 0.2912,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2993150055408478,
"step": 2470
},
{
"epoch": 2.187279151943463,
"grad_norm": 0.5870715975761414,
"learning_rate": 3.476165808675567e-05,
"loss": 0.2811,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22926348447799683,
"step": 2475
},
{
"epoch": 2.191696113074205,
"grad_norm": 0.6749504208564758,
"learning_rate": 3.473189765754544e-05,
"loss": 0.3342,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2661892771720886,
"step": 2480
},
{
"epoch": 2.196113074204947,
"grad_norm": 0.7036541104316711,
"learning_rate": 3.4702065746219416e-05,
"loss": 0.3031,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28539180755615234,
"step": 2485
},
{
"epoch": 2.200530035335689,
"grad_norm": 0.7815271019935608,
"learning_rate": 3.467216249752799e-05,
"loss": 0.3215,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35631412267684937,
"step": 2490
},
{
"epoch": 2.204946996466431,
"grad_norm": 0.6628273129463196,
"learning_rate": 3.4642188056567726e-05,
"loss": 0.2966,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27998021245002747,
"step": 2495
},
{
"epoch": 2.2093639575971733,
"grad_norm": 0.968639075756073,
"learning_rate": 3.461214256878059e-05,
"loss": 0.2672,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23727664351463318,
"step": 2500
},
{
"epoch": 2.2137809187279154,
"grad_norm": 0.6167461276054382,
"learning_rate": 3.458202617995332e-05,
"loss": 0.293,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30183541774749756,
"step": 2505
},
{
"epoch": 2.218197879858657,
"grad_norm": 0.6023343801498413,
"learning_rate": 3.4551839036216645e-05,
"loss": 0.2677,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2949235141277313,
"step": 2510
},
{
"epoch": 2.222614840989399,
"grad_norm": 0.6811277866363525,
"learning_rate": 3.452158128404465e-05,
"loss": 0.3118,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3465280532836914,
"step": 2515
},
{
"epoch": 2.227031802120141,
"grad_norm": 0.6505920886993408,
"learning_rate": 3.449125307025399e-05,
"loss": 0.2811,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2278938889503479,
"step": 2520
},
{
"epoch": 2.2314487632508833,
"grad_norm": 0.5918983221054077,
"learning_rate": 3.446085454200322e-05,
"loss": 0.2657,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2987041771411896,
"step": 2525
},
{
"epoch": 2.2358657243816253,
"grad_norm": 0.5385729670524597,
"learning_rate": 3.44303858467921e-05,
"loss": 0.294,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25492385029792786,
"step": 2530
},
{
"epoch": 2.2402826855123674,
"grad_norm": 0.6151769757270813,
"learning_rate": 3.4399847132460826e-05,
"loss": 0.3009,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2648699879646301,
"step": 2535
},
{
"epoch": 2.2446996466431095,
"grad_norm": 0.6031373739242554,
"learning_rate": 3.436923854718935e-05,
"loss": 0.2864,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23283502459526062,
"step": 2540
},
{
"epoch": 2.2491166077738516,
"grad_norm": 0.6123268008232117,
"learning_rate": 3.433856023949666e-05,
"loss": 0.3324,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2949381172657013,
"step": 2545
},
{
"epoch": 2.2535335689045937,
"grad_norm": 0.63086998462677,
"learning_rate": 3.430781235824006e-05,
"loss": 0.3372,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3198104500770569,
"step": 2550
},
{
"epoch": 2.2579505300353357,
"grad_norm": 0.5796027779579163,
"learning_rate": 3.427699505261439e-05,
"loss": 0.276,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2592904567718506,
"step": 2555
},
{
"epoch": 2.262367491166078,
"grad_norm": 0.6514537930488586,
"learning_rate": 3.4246108472151404e-05,
"loss": 0.3106,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24681276082992554,
"step": 2560
},
{
"epoch": 2.26678445229682,
"grad_norm": 0.6188172698020935,
"learning_rate": 3.421515276671897e-05,
"loss": 0.3131,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25234729051589966,
"step": 2565
},
{
"epoch": 2.271201413427562,
"grad_norm": 0.614671528339386,
"learning_rate": 3.418412808652037e-05,
"loss": 0.2993,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2817681133747101,
"step": 2570
},
{
"epoch": 2.275618374558304,
"grad_norm": 0.6262189745903015,
"learning_rate": 3.4153034582093546e-05,
"loss": 0.2756,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2680363059043884,
"step": 2575
},
{
"epoch": 2.280035335689046,
"grad_norm": 0.6415190100669861,
"learning_rate": 3.412187240431043e-05,
"loss": 0.3148,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.40983670949935913,
"step": 2580
},
{
"epoch": 2.2844522968197882,
"grad_norm": 0.5524982213973999,
"learning_rate": 3.409064170437612e-05,
"loss": 0.3066,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26829278469085693,
"step": 2585
},
{
"epoch": 2.28886925795053,
"grad_norm": 0.5553908944129944,
"learning_rate": 3.405934263382824e-05,
"loss": 0.2891,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24250522255897522,
"step": 2590
},
{
"epoch": 2.293286219081272,
"grad_norm": 0.5957083106040955,
"learning_rate": 3.4027975344536125e-05,
"loss": 0.2759,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2847854495048523,
"step": 2595
},
{
"epoch": 2.297703180212014,
"grad_norm": 0.5808223485946655,
"learning_rate": 3.399653998870016e-05,
"loss": 0.3083,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3035164773464203,
"step": 2600
},
{
"epoch": 2.302120141342756,
"grad_norm": 0.5876525640487671,
"learning_rate": 3.396503671885098e-05,
"loss": 0.2856,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3480638265609741,
"step": 2605
},
{
"epoch": 2.306537102473498,
"grad_norm": 0.5925856232643127,
"learning_rate": 3.3933465687848745e-05,
"loss": 0.2716,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2727441191673279,
"step": 2610
},
{
"epoch": 2.3109540636042403,
"grad_norm": 0.6211825013160706,
"learning_rate": 3.390182704888242e-05,
"loss": 0.2747,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26210927963256836,
"step": 2615
},
{
"epoch": 2.3153710247349824,
"grad_norm": 0.6318957805633545,
"learning_rate": 3.387012095546903e-05,
"loss": 0.2731,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2508987486362457,
"step": 2620
},
{
"epoch": 2.3197879858657244,
"grad_norm": 0.5794128775596619,
"learning_rate": 3.3838347561452854e-05,
"loss": 0.2676,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2536904811859131,
"step": 2625
},
{
"epoch": 2.3242049469964665,
"grad_norm": 0.7687904238700867,
"learning_rate": 3.380650702100478e-05,
"loss": 0.3206,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2745317220687866,
"step": 2630
},
{
"epoch": 2.3286219081272086,
"grad_norm": 0.6696126461029053,
"learning_rate": 3.3774599488621477e-05,
"loss": 0.2588,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34656822681427,
"step": 2635
},
{
"epoch": 2.3330388692579507,
"grad_norm": 0.6203848719596863,
"learning_rate": 3.374262511912468e-05,
"loss": 0.287,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27805572748184204,
"step": 2640
},
{
"epoch": 2.3374558303886928,
"grad_norm": 0.6597334742546082,
"learning_rate": 3.371058406766043e-05,
"loss": 0.3141,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2802439332008362,
"step": 2645
},
{
"epoch": 2.3418727915194344,
"grad_norm": 0.5888747572898865,
"learning_rate": 3.3678476489698316e-05,
"loss": 0.2838,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28658509254455566,
"step": 2650
},
{
"epoch": 2.3462897526501765,
"grad_norm": 0.6427140831947327,
"learning_rate": 3.364630254103073e-05,
"loss": 0.2916,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2904638648033142,
"step": 2655
},
{
"epoch": 2.3507067137809186,
"grad_norm": 0.6213993430137634,
"learning_rate": 3.3614062377772124e-05,
"loss": 0.2578,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25421738624572754,
"step": 2660
},
{
"epoch": 2.3551236749116606,
"grad_norm": 0.6171494722366333,
"learning_rate": 3.358175615635821e-05,
"loss": 0.2999,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2982363998889923,
"step": 2665
},
{
"epoch": 2.3595406360424027,
"grad_norm": 0.6194586157798767,
"learning_rate": 3.354938403354524e-05,
"loss": 0.2791,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27266404032707214,
"step": 2670
},
{
"epoch": 2.363957597173145,
"grad_norm": 0.5762106776237488,
"learning_rate": 3.351694616640924e-05,
"loss": 0.2736,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.264001727104187,
"step": 2675
},
{
"epoch": 2.368374558303887,
"grad_norm": 0.7091301679611206,
"learning_rate": 3.348444271234523e-05,
"loss": 0.2929,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24949441850185394,
"step": 2680
},
{
"epoch": 2.372791519434629,
"grad_norm": 0.5496478080749512,
"learning_rate": 3.3451873829066474e-05,
"loss": 0.2975,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23808279633522034,
"step": 2685
},
{
"epoch": 2.377208480565371,
"grad_norm": 0.6888132095336914,
"learning_rate": 3.341923967460371e-05,
"loss": 0.2851,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24560776352882385,
"step": 2690
},
{
"epoch": 2.381625441696113,
"grad_norm": 0.5535579323768616,
"learning_rate": 3.338654040730439e-05,
"loss": 0.3136,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30382639169692993,
"step": 2695
},
{
"epoch": 2.386042402826855,
"grad_norm": 0.5726889967918396,
"learning_rate": 3.335377618583191e-05,
"loss": 0.3478,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.355141818523407,
"step": 2700
},
{
"epoch": 2.3904593639575973,
"grad_norm": 0.5942392349243164,
"learning_rate": 3.332094716916481e-05,
"loss": 0.2869,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2685794234275818,
"step": 2705
},
{
"epoch": 2.3948763250883394,
"grad_norm": 0.6522179245948792,
"learning_rate": 3.328805351659606e-05,
"loss": 0.2872,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26475298404693604,
"step": 2710
},
{
"epoch": 2.3992932862190814,
"grad_norm": 0.6337575912475586,
"learning_rate": 3.3255095387732245e-05,
"loss": 0.2787,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.304879754781723,
"step": 2715
},
{
"epoch": 2.4037102473498235,
"grad_norm": 0.6819047331809998,
"learning_rate": 3.3222072942492807e-05,
"loss": 0.283,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2330174744129181,
"step": 2720
},
{
"epoch": 2.4081272084805656,
"grad_norm": 0.7528138160705566,
"learning_rate": 3.318898634110925e-05,
"loss": 0.3262,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.43806248903274536,
"step": 2725
},
{
"epoch": 2.4125441696113072,
"grad_norm": 0.6072879433631897,
"learning_rate": 3.31558357441244e-05,
"loss": 0.2902,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2504928410053253,
"step": 2730
},
{
"epoch": 2.4169611307420493,
"grad_norm": 0.6802332401275635,
"learning_rate": 3.312262131239157e-05,
"loss": 0.3299,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26598912477493286,
"step": 2735
},
{
"epoch": 2.4213780918727914,
"grad_norm": 0.6019970774650574,
"learning_rate": 3.308934320707385e-05,
"loss": 0.2914,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2811354398727417,
"step": 2740
},
{
"epoch": 2.4257950530035335,
"grad_norm": 0.5732802748680115,
"learning_rate": 3.305600158964325e-05,
"loss": 0.3168,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.36934155225753784,
"step": 2745
},
{
"epoch": 2.4302120141342756,
"grad_norm": 0.6369587182998657,
"learning_rate": 3.3022596621879976e-05,
"loss": 0.3298,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3334657549858093,
"step": 2750
},
{
"epoch": 2.4346289752650176,
"grad_norm": 0.5502805709838867,
"learning_rate": 3.298912846587162e-05,
"loss": 0.2721,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21960023045539856,
"step": 2755
},
{
"epoch": 2.4390459363957597,
"grad_norm": 0.5598191618919373,
"learning_rate": 3.2955597284012375e-05,
"loss": 0.304,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28418734669685364,
"step": 2760
},
{
"epoch": 2.443462897526502,
"grad_norm": 1.0691636800765991,
"learning_rate": 3.2922003239002234e-05,
"loss": 0.322,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4324992001056671,
"step": 2765
},
{
"epoch": 2.447879858657244,
"grad_norm": 0.5500887036323547,
"learning_rate": 3.288834649384624e-05,
"loss": 0.2833,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30279844999313354,
"step": 2770
},
{
"epoch": 2.452296819787986,
"grad_norm": 0.6789073348045349,
"learning_rate": 3.2854627211853656e-05,
"loss": 0.329,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3540169894695282,
"step": 2775
},
{
"epoch": 2.456713780918728,
"grad_norm": 0.6984388828277588,
"learning_rate": 3.2820845556637173e-05,
"loss": 0.3262,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26223987340927124,
"step": 2780
},
{
"epoch": 2.46113074204947,
"grad_norm": 0.6759300827980042,
"learning_rate": 3.278700169211216e-05,
"loss": 0.2892,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2541850209236145,
"step": 2785
},
{
"epoch": 2.4655477031802118,
"grad_norm": 0.5834370255470276,
"learning_rate": 3.275309578249581e-05,
"loss": 0.2874,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27517110109329224,
"step": 2790
},
{
"epoch": 2.469964664310954,
"grad_norm": 0.6928835511207581,
"learning_rate": 3.2719127992306386e-05,
"loss": 0.2761,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34850040078163147,
"step": 2795
},
{
"epoch": 2.474381625441696,
"grad_norm": 0.5541810989379883,
"learning_rate": 3.26850984863624e-05,
"loss": 0.2861,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25448691844940186,
"step": 2800
},
{
"epoch": 2.478798586572438,
"grad_norm": 0.5982546210289001,
"learning_rate": 3.265100742978183e-05,
"loss": 0.2931,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21053552627563477,
"step": 2805
},
{
"epoch": 2.48321554770318,
"grad_norm": 0.5980389714241028,
"learning_rate": 3.261685498798131e-05,
"loss": 0.2993,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2833639681339264,
"step": 2810
},
{
"epoch": 2.487632508833922,
"grad_norm": 0.5997810959815979,
"learning_rate": 3.258264132667531e-05,
"loss": 0.2439,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2284521758556366,
"step": 2815
},
{
"epoch": 2.4920494699646643,
"grad_norm": 0.5549051761627197,
"learning_rate": 3.254836661187537e-05,
"loss": 0.2679,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31983861327171326,
"step": 2820
},
{
"epoch": 2.4964664310954063,
"grad_norm": 0.5757570862770081,
"learning_rate": 3.2514031009889264e-05,
"loss": 0.2843,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27677464485168457,
"step": 2825
},
{
"epoch": 2.5008833922261484,
"grad_norm": 0.5566675662994385,
"learning_rate": 3.247963468732021e-05,
"loss": 0.2926,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23760895431041718,
"step": 2830
},
{
"epoch": 2.5053003533568905,
"grad_norm": 0.5576279163360596,
"learning_rate": 3.244517781106604e-05,
"loss": 0.2898,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3000289797782898,
"step": 2835
},
{
"epoch": 2.5097173144876326,
"grad_norm": 0.5720181465148926,
"learning_rate": 3.241066054831842e-05,
"loss": 0.2766,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2972480058670044,
"step": 2840
},
{
"epoch": 2.5141342756183747,
"grad_norm": 0.5405129194259644,
"learning_rate": 3.237608306656201e-05,
"loss": 0.2677,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23669353127479553,
"step": 2845
},
{
"epoch": 2.5185512367491167,
"grad_norm": 0.5755829215049744,
"learning_rate": 3.234144553357368e-05,
"loss": 0.3027,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3481435775756836,
"step": 2850
},
{
"epoch": 2.522968197879859,
"grad_norm": 0.7123222947120667,
"learning_rate": 3.230674811742167e-05,
"loss": 0.2627,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2425704300403595,
"step": 2855
},
{
"epoch": 2.527385159010601,
"grad_norm": 0.6053310632705688,
"learning_rate": 3.227199098646479e-05,
"loss": 0.2944,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3263585567474365,
"step": 2860
},
{
"epoch": 2.531802120141343,
"grad_norm": 0.6052203178405762,
"learning_rate": 3.223717430935158e-05,
"loss": 0.3646,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30958011746406555,
"step": 2865
},
{
"epoch": 2.536219081272085,
"grad_norm": 0.6099016666412354,
"learning_rate": 3.2202298255019546e-05,
"loss": 0.2613,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2909597158432007,
"step": 2870
},
{
"epoch": 2.5406360424028267,
"grad_norm": 0.6662100553512573,
"learning_rate": 3.216736299269427e-05,
"loss": 0.2772,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2982367277145386,
"step": 2875
},
{
"epoch": 2.545053003533569,
"grad_norm": 0.7286942005157471,
"learning_rate": 3.213236869188864e-05,
"loss": 0.3025,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2656930088996887,
"step": 2880
},
{
"epoch": 2.549469964664311,
"grad_norm": 0.5937227606773376,
"learning_rate": 3.209731552240201e-05,
"loss": 0.2999,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34423115849494934,
"step": 2885
},
{
"epoch": 2.553886925795053,
"grad_norm": 0.6218231320381165,
"learning_rate": 3.206220365431937e-05,
"loss": 0.3097,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3695404529571533,
"step": 2890
},
{
"epoch": 2.558303886925795,
"grad_norm": 0.6536095142364502,
"learning_rate": 3.202703325801054e-05,
"loss": 0.2994,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.36360839009284973,
"step": 2895
},
{
"epoch": 2.562720848056537,
"grad_norm": 0.6371568441390991,
"learning_rate": 3.19918045041293e-05,
"loss": 0.3254,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3120114803314209,
"step": 2900
},
{
"epoch": 2.567137809187279,
"grad_norm": 0.6116867065429688,
"learning_rate": 3.1956517563612645e-05,
"loss": 0.3266,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31710511445999146,
"step": 2905
},
{
"epoch": 2.5715547703180213,
"grad_norm": 0.6612317562103271,
"learning_rate": 3.1921172607679846e-05,
"loss": 0.3028,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3122076690196991,
"step": 2910
},
{
"epoch": 2.5759717314487633,
"grad_norm": 0.6160558462142944,
"learning_rate": 3.1885769807831714e-05,
"loss": 0.29,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27517688274383545,
"step": 2915
},
{
"epoch": 2.5803886925795054,
"grad_norm": 0.600016176700592,
"learning_rate": 3.185030933584972e-05,
"loss": 0.3317,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3114628791809082,
"step": 2920
},
{
"epoch": 2.5848056537102475,
"grad_norm": 1.3786065578460693,
"learning_rate": 3.181479136379518e-05,
"loss": 0.314,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.39571282267570496,
"step": 2925
},
{
"epoch": 2.589222614840989,
"grad_norm": 0.6324872970581055,
"learning_rate": 3.177921606400838e-05,
"loss": 0.2583,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26827818155288696,
"step": 2930
},
{
"epoch": 2.5936395759717312,
"grad_norm": 0.7281625270843506,
"learning_rate": 3.1743583609107815e-05,
"loss": 0.2935,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2841928005218506,
"step": 2935
},
{
"epoch": 2.5980565371024733,
"grad_norm": 0.5587540864944458,
"learning_rate": 3.1707894171989266e-05,
"loss": 0.2884,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2770228981971741,
"step": 2940
},
{
"epoch": 2.6024734982332154,
"grad_norm": 0.6678293347358704,
"learning_rate": 3.167214792582505e-05,
"loss": 0.3147,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30559659004211426,
"step": 2945
},
{
"epoch": 2.6068904593639575,
"grad_norm": 0.6080886125564575,
"learning_rate": 3.163634504406309e-05,
"loss": 0.2943,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2588590383529663,
"step": 2950
},
{
"epoch": 2.6113074204946995,
"grad_norm": 0.5840783715248108,
"learning_rate": 3.160048570042614e-05,
"loss": 0.2724,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2337406575679779,
"step": 2955
},
{
"epoch": 2.6157243816254416,
"grad_norm": 0.5808059573173523,
"learning_rate": 3.1564570068910905e-05,
"loss": 0.2943,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.304470419883728,
"step": 2960
},
{
"epoch": 2.6201413427561837,
"grad_norm": 0.5165686011314392,
"learning_rate": 3.152859832378723e-05,
"loss": 0.2963,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3160091042518616,
"step": 2965
},
{
"epoch": 2.624558303886926,
"grad_norm": 0.5811315774917603,
"learning_rate": 3.1492570639597216e-05,
"loss": 0.2916,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22409474849700928,
"step": 2970
},
{
"epoch": 2.628975265017668,
"grad_norm": 0.901731014251709,
"learning_rate": 3.145648719115439e-05,
"loss": 0.2875,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2565080225467682,
"step": 2975
},
{
"epoch": 2.63339222614841,
"grad_norm": 0.6577340364456177,
"learning_rate": 3.1420348153542875e-05,
"loss": 0.3208,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33042359352111816,
"step": 2980
},
{
"epoch": 2.637809187279152,
"grad_norm": 0.5609688758850098,
"learning_rate": 3.138415370211651e-05,
"loss": 0.3028,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30433666706085205,
"step": 2985
},
{
"epoch": 2.642226148409894,
"grad_norm": 0.5504027605056763,
"learning_rate": 3.1347904012498015e-05,
"loss": 0.2762,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27242234349250793,
"step": 2990
},
{
"epoch": 2.646643109540636,
"grad_norm": 0.679768443107605,
"learning_rate": 3.1311599260578144e-05,
"loss": 0.2736,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26140671968460083,
"step": 2995
},
{
"epoch": 2.6510600706713783,
"grad_norm": 1.4311226606369019,
"learning_rate": 3.1275239622514805e-05,
"loss": 0.2793,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25825434923171997,
"step": 3000
},
{
"epoch": 2.6554770318021204,
"grad_norm": 0.6120150685310364,
"learning_rate": 3.123882527473226e-05,
"loss": 0.2988,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33173418045043945,
"step": 3005
},
{
"epoch": 2.6598939929328624,
"grad_norm": 0.7447654008865356,
"learning_rate": 3.1202356393920205e-05,
"loss": 0.2891,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2618862986564636,
"step": 3010
},
{
"epoch": 2.664310954063604,
"grad_norm": 0.6641417145729065,
"learning_rate": 3.1165833157032945e-05,
"loss": 0.3026,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32111161947250366,
"step": 3015
},
{
"epoch": 2.668727915194346,
"grad_norm": 0.5896281599998474,
"learning_rate": 3.112925574128853e-05,
"loss": 0.3406,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3454618453979492,
"step": 3020
},
{
"epoch": 2.6731448763250882,
"grad_norm": 0.5626996159553528,
"learning_rate": 3.109262432416791e-05,
"loss": 0.2728,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25294023752212524,
"step": 3025
},
{
"epoch": 2.6775618374558303,
"grad_norm": 0.6241391897201538,
"learning_rate": 3.105593908341405e-05,
"loss": 0.3298,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4321364164352417,
"step": 3030
},
{
"epoch": 2.6819787985865724,
"grad_norm": 0.5742759704589844,
"learning_rate": 3.1019200197031074e-05,
"loss": 0.2969,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29776108264923096,
"step": 3035
},
{
"epoch": 2.6863957597173145,
"grad_norm": 0.5870106220245361,
"learning_rate": 3.098240784328342e-05,
"loss": 0.2909,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3504473567008972,
"step": 3040
},
{
"epoch": 2.6908127208480566,
"grad_norm": 0.7754055261611938,
"learning_rate": 3.094556220069495e-05,
"loss": 0.296,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3014984726905823,
"step": 3045
},
{
"epoch": 2.6952296819787986,
"grad_norm": 0.6273770928382874,
"learning_rate": 3.09086634480481e-05,
"loss": 0.3134,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26273688673973083,
"step": 3050
},
{
"epoch": 2.6996466431095407,
"grad_norm": 0.6451196074485779,
"learning_rate": 3.087171176438299e-05,
"loss": 0.2997,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32904472947120667,
"step": 3055
},
{
"epoch": 2.704063604240283,
"grad_norm": 0.5997614860534668,
"learning_rate": 3.083470732899659e-05,
"loss": 0.2636,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27674105763435364,
"step": 3060
},
{
"epoch": 2.708480565371025,
"grad_norm": 0.6042181253433228,
"learning_rate": 3.0797650321441836e-05,
"loss": 0.2955,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3244817852973938,
"step": 3065
},
{
"epoch": 2.7128975265017665,
"grad_norm": 0.6343804597854614,
"learning_rate": 3.076054092152673e-05,
"loss": 0.3018,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25237613916397095,
"step": 3070
},
{
"epoch": 2.7173144876325086,
"grad_norm": 0.5651307106018066,
"learning_rate": 3.072337930931351e-05,
"loss": 0.3081,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2834140658378601,
"step": 3075
},
{
"epoch": 2.7217314487632507,
"grad_norm": 0.5973877906799316,
"learning_rate": 3.068616566511777e-05,
"loss": 0.2835,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3114064633846283,
"step": 3080
},
{
"epoch": 2.7261484098939928,
"grad_norm": 0.6461395621299744,
"learning_rate": 3.0648900169507546e-05,
"loss": 0.3086,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3663422465324402,
"step": 3085
},
{
"epoch": 2.730565371024735,
"grad_norm": 0.7704973220825195,
"learning_rate": 3.0611583003302483e-05,
"loss": 0.2973,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.324812650680542,
"step": 3090
},
{
"epoch": 2.734982332155477,
"grad_norm": 0.6625204086303711,
"learning_rate": 3.0574214347572944e-05,
"loss": 0.2868,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2915918827056885,
"step": 3095
},
{
"epoch": 2.739399293286219,
"grad_norm": 0.7523000240325928,
"learning_rate": 3.0536794383639124e-05,
"loss": 0.315,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.267653226852417,
"step": 3100
},
{
"epoch": 2.743816254416961,
"grad_norm": 0.6316024661064148,
"learning_rate": 3.0499323293070168e-05,
"loss": 0.2744,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2455906867980957,
"step": 3105
},
{
"epoch": 2.748233215547703,
"grad_norm": 0.5501362681388855,
"learning_rate": 3.0461801257683316e-05,
"loss": 0.2514,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24123415350914001,
"step": 3110
},
{
"epoch": 2.7526501766784452,
"grad_norm": 0.5820156335830688,
"learning_rate": 3.0424228459542996e-05,
"loss": 0.3227,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2901195287704468,
"step": 3115
},
{
"epoch": 2.7570671378091873,
"grad_norm": 0.5763217806816101,
"learning_rate": 3.0386605080959933e-05,
"loss": 0.3368,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2486405223608017,
"step": 3120
},
{
"epoch": 2.7614840989399294,
"grad_norm": 0.6018761396408081,
"learning_rate": 3.0348931304490308e-05,
"loss": 0.3192,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3718198835849762,
"step": 3125
},
{
"epoch": 2.7659010600706715,
"grad_norm": 0.6464835405349731,
"learning_rate": 3.0311207312934802e-05,
"loss": 0.3052,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2733161747455597,
"step": 3130
},
{
"epoch": 2.7703180212014136,
"grad_norm": 0.6074162125587463,
"learning_rate": 3.0273433289337782e-05,
"loss": 0.3438,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3934619128704071,
"step": 3135
},
{
"epoch": 2.7747349823321557,
"grad_norm": 0.6861649751663208,
"learning_rate": 3.0235609416986382e-05,
"loss": 0.3097,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3455376625061035,
"step": 3140
},
{
"epoch": 2.7791519434628977,
"grad_norm": 0.546413242816925,
"learning_rate": 3.0197735879409582e-05,
"loss": 0.2465,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3003658056259155,
"step": 3145
},
{
"epoch": 2.78356890459364,
"grad_norm": 0.5838289260864258,
"learning_rate": 3.015981286037737e-05,
"loss": 0.2401,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24027827382087708,
"step": 3150
},
{
"epoch": 2.787985865724382,
"grad_norm": 0.6270075440406799,
"learning_rate": 3.0121840543899828e-05,
"loss": 0.2884,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26620614528656006,
"step": 3155
},
{
"epoch": 2.7924028268551235,
"grad_norm": 0.6166526079177856,
"learning_rate": 3.008381911422624e-05,
"loss": 0.3056,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2833024263381958,
"step": 3160
},
{
"epoch": 2.7968197879858656,
"grad_norm": 0.7598279714584351,
"learning_rate": 3.0045748755844183e-05,
"loss": 0.2683,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23250682651996613,
"step": 3165
},
{
"epoch": 2.8012367491166077,
"grad_norm": 0.5732672810554504,
"learning_rate": 3.000762965347866e-05,
"loss": 0.3035,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28198108077049255,
"step": 3170
},
{
"epoch": 2.8056537102473498,
"grad_norm": 0.6021406650543213,
"learning_rate": 2.9969461992091187e-05,
"loss": 0.3052,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4037885367870331,
"step": 3175
},
{
"epoch": 2.810070671378092,
"grad_norm": 0.5615018606185913,
"learning_rate": 2.9931245956878892e-05,
"loss": 0.2972,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35477539896965027,
"step": 3180
},
{
"epoch": 2.814487632508834,
"grad_norm": 0.6810048818588257,
"learning_rate": 2.9892981733273622e-05,
"loss": 0.2809,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32894670963287354,
"step": 3185
},
{
"epoch": 2.818904593639576,
"grad_norm": 0.6966601014137268,
"learning_rate": 2.9854669506941056e-05,
"loss": 0.3045,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2800629138946533,
"step": 3190
},
{
"epoch": 2.823321554770318,
"grad_norm": 0.6326582431793213,
"learning_rate": 2.9816309463779777e-05,
"loss": 0.3209,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2518727481365204,
"step": 3195
},
{
"epoch": 2.82773851590106,
"grad_norm": 0.6759814023971558,
"learning_rate": 2.9777901789920393e-05,
"loss": 0.3045,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3975909948348999,
"step": 3200
},
{
"epoch": 2.8321554770318023,
"grad_norm": 0.6340915560722351,
"learning_rate": 2.9739446671724633e-05,
"loss": 0.2956,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25144162774086,
"step": 3205
},
{
"epoch": 2.836572438162544,
"grad_norm": 0.6134033203125,
"learning_rate": 2.9700944295784416e-05,
"loss": 0.3104,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3317497670650482,
"step": 3210
},
{
"epoch": 2.840989399293286,
"grad_norm": 0.5460602045059204,
"learning_rate": 2.9662394848920976e-05,
"loss": 0.247,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27905362844467163,
"step": 3215
},
{
"epoch": 2.845406360424028,
"grad_norm": 0.6375061273574829,
"learning_rate": 2.962379851818396e-05,
"loss": 0.3235,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.36598873138427734,
"step": 3220
},
{
"epoch": 2.84982332155477,
"grad_norm": 0.5567300319671631,
"learning_rate": 2.9585155490850463e-05,
"loss": 0.2588,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18971426784992218,
"step": 3225
},
{
"epoch": 2.854240282685512,
"grad_norm": 0.6028186678886414,
"learning_rate": 2.954646595442421e-05,
"loss": 0.2845,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3407897353172302,
"step": 3230
},
{
"epoch": 2.8586572438162543,
"grad_norm": 0.5787585377693176,
"learning_rate": 2.9507730096634558e-05,
"loss": 0.2964,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3281695246696472,
"step": 3235
},
{
"epoch": 2.8630742049469964,
"grad_norm": 0.6009767055511475,
"learning_rate": 2.9468948105435652e-05,
"loss": 0.2745,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2663469910621643,
"step": 3240
},
{
"epoch": 2.8674911660777385,
"grad_norm": 0.5785790085792542,
"learning_rate": 2.943012016900548e-05,
"loss": 0.2944,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23294416069984436,
"step": 3245
},
{
"epoch": 2.8719081272084805,
"grad_norm": 0.5688700079917908,
"learning_rate": 2.9391246475744952e-05,
"loss": 0.303,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23604430258274078,
"step": 3250
},
{
"epoch": 2.8763250883392226,
"grad_norm": 0.5440213084220886,
"learning_rate": 2.9352327214277002e-05,
"loss": 0.3017,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29112064838409424,
"step": 3255
},
{
"epoch": 2.8807420494699647,
"grad_norm": 0.5312842726707458,
"learning_rate": 2.931336257344569e-05,
"loss": 0.2971,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25097882747650146,
"step": 3260
},
{
"epoch": 2.885159010600707,
"grad_norm": 0.658263623714447,
"learning_rate": 2.9274352742315234e-05,
"loss": 0.2663,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2677016854286194,
"step": 3265
},
{
"epoch": 2.889575971731449,
"grad_norm": 0.6275796890258789,
"learning_rate": 2.923529791016916e-05,
"loss": 0.2654,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26214462518692017,
"step": 3270
},
{
"epoch": 2.893992932862191,
"grad_norm": 0.6369099020957947,
"learning_rate": 2.919619826650932e-05,
"loss": 0.3023,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27498164772987366,
"step": 3275
},
{
"epoch": 2.898409893992933,
"grad_norm": 0.6062490344047546,
"learning_rate": 2.9157054001055007e-05,
"loss": 0.2484,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35193073749542236,
"step": 3280
},
{
"epoch": 2.902826855123675,
"grad_norm": 0.6960145831108093,
"learning_rate": 2.9117865303742043e-05,
"loss": 0.2768,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1947738081216812,
"step": 3285
},
{
"epoch": 2.907243816254417,
"grad_norm": 0.610952615737915,
"learning_rate": 2.9078632364721813e-05,
"loss": 0.2925,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3309261202812195,
"step": 3290
},
{
"epoch": 2.9116607773851593,
"grad_norm": 0.696971595287323,
"learning_rate": 2.903935537436041e-05,
"loss": 0.3046,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3597220778465271,
"step": 3295
},
{
"epoch": 2.916077738515901,
"grad_norm": 0.5838300585746765,
"learning_rate": 2.900003452323764e-05,
"loss": 0.2693,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2813507318496704,
"step": 3300
},
{
"epoch": 2.920494699646643,
"grad_norm": 0.5652858018875122,
"learning_rate": 2.8960670002146138e-05,
"loss": 0.2469,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2683357000350952,
"step": 3305
},
{
"epoch": 2.924911660777385,
"grad_norm": 0.6993016004562378,
"learning_rate": 2.8921262002090443e-05,
"loss": 0.2897,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26220783591270447,
"step": 3310
},
{
"epoch": 2.929328621908127,
"grad_norm": 0.6500904560089111,
"learning_rate": 2.888181071428607e-05,
"loss": 0.2631,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27039170265197754,
"step": 3315
},
{
"epoch": 2.9337455830388692,
"grad_norm": 0.6717191934585571,
"learning_rate": 2.884231633015854e-05,
"loss": 0.3335,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31322067975997925,
"step": 3320
},
{
"epoch": 2.9381625441696113,
"grad_norm": 0.6734561920166016,
"learning_rate": 2.8802779041342527e-05,
"loss": 0.2989,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28675904870033264,
"step": 3325
},
{
"epoch": 2.9425795053003534,
"grad_norm": 0.5548768043518066,
"learning_rate": 2.876319903968086e-05,
"loss": 0.2627,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23984147608280182,
"step": 3330
},
{
"epoch": 2.9469964664310955,
"grad_norm": 0.5678575038909912,
"learning_rate": 2.8723576517223635e-05,
"loss": 0.27,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26975440979003906,
"step": 3335
},
{
"epoch": 2.9514134275618376,
"grad_norm": 0.5872315764427185,
"learning_rate": 2.8683911666227254e-05,
"loss": 0.2604,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24881170690059662,
"step": 3340
},
{
"epoch": 2.9558303886925796,
"grad_norm": 0.5842449069023132,
"learning_rate": 2.864420467915352e-05,
"loss": 0.2799,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24471619725227356,
"step": 3345
},
{
"epoch": 2.9602473498233217,
"grad_norm": 0.6217790842056274,
"learning_rate": 2.8604455748668675e-05,
"loss": 0.2745,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3837500810623169,
"step": 3350
},
{
"epoch": 2.9646643109540634,
"grad_norm": 0.7379738092422485,
"learning_rate": 2.8564665067642485e-05,
"loss": 0.3111,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3087494373321533,
"step": 3355
},
{
"epoch": 2.9690812720848054,
"grad_norm": 0.6993508338928223,
"learning_rate": 2.8524832829147297e-05,
"loss": 0.3157,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2241922914981842,
"step": 3360
},
{
"epoch": 2.9734982332155475,
"grad_norm": 0.6891131401062012,
"learning_rate": 2.8484959226457115e-05,
"loss": 0.2835,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25775182247161865,
"step": 3365
},
{
"epoch": 2.9779151943462896,
"grad_norm": 0.5844883322715759,
"learning_rate": 2.8445044453046624e-05,
"loss": 0.2942,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.285645067691803,
"step": 3370
},
{
"epoch": 2.9823321554770317,
"grad_norm": 0.546260416507721,
"learning_rate": 2.8405088702590296e-05,
"loss": 0.2498,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26289626955986023,
"step": 3375
},
{
"epoch": 2.9867491166077738,
"grad_norm": 0.682931125164032,
"learning_rate": 2.8365092168961442e-05,
"loss": 0.2906,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23991906642913818,
"step": 3380
},
{
"epoch": 2.991166077738516,
"grad_norm": 0.600168764591217,
"learning_rate": 2.8325055046231232e-05,
"loss": 0.2954,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.37289607524871826,
"step": 3385
},
{
"epoch": 2.995583038869258,
"grad_norm": 0.6167921423912048,
"learning_rate": 2.8284977528667806e-05,
"loss": 0.3104,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2748357951641083,
"step": 3390
},
{
"epoch": 3.0008833922261484,
"grad_norm": 0.5644031763076782,
"learning_rate": 2.8244859810735304e-05,
"loss": 0.2734,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31137675046920776,
"step": 3395
},
{
"epoch": 3.0053003533568905,
"grad_norm": 0.5274640917778015,
"learning_rate": 2.8204702087092907e-05,
"loss": 0.2752,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24842819571495056,
"step": 3400
},
{
"epoch": 3.0097173144876326,
"grad_norm": 0.6265926361083984,
"learning_rate": 2.8164504552593946e-05,
"loss": 0.2768,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27338293194770813,
"step": 3405
},
{
"epoch": 3.0141342756183747,
"grad_norm": 0.6399211287498474,
"learning_rate": 2.8124267402284892e-05,
"loss": 0.2868,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3242674469947815,
"step": 3410
},
{
"epoch": 3.0185512367491167,
"grad_norm": 0.5678868889808655,
"learning_rate": 2.808399083140445e-05,
"loss": 0.2472,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2346222996711731,
"step": 3415
},
{
"epoch": 3.022968197879859,
"grad_norm": 0.6776866912841797,
"learning_rate": 2.804367503538261e-05,
"loss": 0.2434,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3134090304374695,
"step": 3420
},
{
"epoch": 3.027385159010601,
"grad_norm": 0.6301928758621216,
"learning_rate": 2.800332020983968e-05,
"loss": 0.297,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26638519763946533,
"step": 3425
},
{
"epoch": 3.0318021201413425,
"grad_norm": 0.6153246164321899,
"learning_rate": 2.796292655058535e-05,
"loss": 0.2712,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2534346878528595,
"step": 3430
},
{
"epoch": 3.0362190812720846,
"grad_norm": 0.6437485814094543,
"learning_rate": 2.792249425361773e-05,
"loss": 0.2873,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3017566204071045,
"step": 3435
},
{
"epoch": 3.0406360424028267,
"grad_norm": 0.5970696210861206,
"learning_rate": 2.788202351512243e-05,
"loss": 0.2605,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27321383357048035,
"step": 3440
},
{
"epoch": 3.045053003533569,
"grad_norm": 0.6142945885658264,
"learning_rate": 2.7841514531471574e-05,
"loss": 0.2623,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2881008982658386,
"step": 3445
},
{
"epoch": 3.049469964664311,
"grad_norm": 0.6773906350135803,
"learning_rate": 2.7800967499222845e-05,
"loss": 0.262,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2629165053367615,
"step": 3450
},
{
"epoch": 3.053886925795053,
"grad_norm": 0.545464038848877,
"learning_rate": 2.7760382615118562e-05,
"loss": 0.25,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27945563197135925,
"step": 3455
},
{
"epoch": 3.058303886925795,
"grad_norm": 0.6101946234703064,
"learning_rate": 2.7719760076084713e-05,
"loss": 0.2938,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.268838107585907,
"step": 3460
},
{
"epoch": 3.062720848056537,
"grad_norm": 2.510591506958008,
"learning_rate": 2.7679100079229982e-05,
"loss": 0.2674,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20418402552604675,
"step": 3465
},
{
"epoch": 3.067137809187279,
"grad_norm": 0.5778970122337341,
"learning_rate": 2.7638402821844808e-05,
"loss": 0.281,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33357682824134827,
"step": 3470
},
{
"epoch": 3.0715547703180213,
"grad_norm": 0.630087673664093,
"learning_rate": 2.7597668501400436e-05,
"loss": 0.2591,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28215503692626953,
"step": 3475
},
{
"epoch": 3.0759717314487633,
"grad_norm": 0.6290969848632812,
"learning_rate": 2.7556897315547934e-05,
"loss": 0.29,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2777438461780548,
"step": 3480
},
{
"epoch": 3.0803886925795054,
"grad_norm": 0.5843200087547302,
"learning_rate": 2.7516089462117265e-05,
"loss": 0.2386,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.195445254445076,
"step": 3485
},
{
"epoch": 3.0848056537102475,
"grad_norm": 0.6632335186004639,
"learning_rate": 2.747524513911629e-05,
"loss": 0.259,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3661026358604431,
"step": 3490
},
{
"epoch": 3.0892226148409896,
"grad_norm": 0.6285350918769836,
"learning_rate": 2.7434364544729844e-05,
"loss": 0.2747,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32909315824508667,
"step": 3495
},
{
"epoch": 3.0936395759717312,
"grad_norm": 0.5703840255737305,
"learning_rate": 2.7393447877318756e-05,
"loss": 0.2907,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34857630729675293,
"step": 3500
},
{
"epoch": 3.0980565371024733,
"grad_norm": 0.5537542104721069,
"learning_rate": 2.735249533541888e-05,
"loss": 0.2578,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3032943606376648,
"step": 3505
},
{
"epoch": 3.1024734982332154,
"grad_norm": 0.6345096826553345,
"learning_rate": 2.7311507117740138e-05,
"loss": 0.261,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2263367921113968,
"step": 3510
},
{
"epoch": 3.1068904593639575,
"grad_norm": 0.671589195728302,
"learning_rate": 2.7270483423165578e-05,
"loss": 0.2604,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23606635630130768,
"step": 3515
},
{
"epoch": 3.1113074204946995,
"grad_norm": 0.6208762526512146,
"learning_rate": 2.7229424450750378e-05,
"loss": 0.2418,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26206400990486145,
"step": 3520
},
{
"epoch": 3.1157243816254416,
"grad_norm": 0.6254077553749084,
"learning_rate": 2.7188330399720883e-05,
"loss": 0.2593,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21936869621276855,
"step": 3525
},
{
"epoch": 3.1201413427561837,
"grad_norm": 0.6420286893844604,
"learning_rate": 2.7147201469473645e-05,
"loss": 0.2697,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28934288024902344,
"step": 3530
},
{
"epoch": 3.124558303886926,
"grad_norm": 0.8817894458770752,
"learning_rate": 2.7106037859574482e-05,
"loss": 0.27,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23228083550930023,
"step": 3535
},
{
"epoch": 3.128975265017668,
"grad_norm": 0.686221718788147,
"learning_rate": 2.706483976975746e-05,
"loss": 0.2552,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2681628465652466,
"step": 3540
},
{
"epoch": 3.13339222614841,
"grad_norm": 0.6326223611831665,
"learning_rate": 2.702360739992395e-05,
"loss": 0.2812,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2811989486217499,
"step": 3545
},
{
"epoch": 3.137809187279152,
"grad_norm": 0.7377673387527466,
"learning_rate": 2.698234095014167e-05,
"loss": 0.2648,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26155680418014526,
"step": 3550
},
{
"epoch": 3.142226148409894,
"grad_norm": 0.6575599312782288,
"learning_rate": 2.6941040620643685e-05,
"loss": 0.2887,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31103795766830444,
"step": 3555
},
{
"epoch": 3.146643109540636,
"grad_norm": 0.6150068044662476,
"learning_rate": 2.689970661182747e-05,
"loss": 0.2926,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25528043508529663,
"step": 3560
},
{
"epoch": 3.1510600706713783,
"grad_norm": 0.5725194215774536,
"learning_rate": 2.6858339124253902e-05,
"loss": 0.288,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25137802958488464,
"step": 3565
},
{
"epoch": 3.1554770318021204,
"grad_norm": 0.5649983286857605,
"learning_rate": 2.681693835864631e-05,
"loss": 0.2707,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2684163451194763,
"step": 3570
},
{
"epoch": 3.159893992932862,
"grad_norm": 0.6405424475669861,
"learning_rate": 2.6775504515889498e-05,
"loss": 0.2665,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27231472730636597,
"step": 3575
},
{
"epoch": 3.164310954063604,
"grad_norm": 0.5784630179405212,
"learning_rate": 2.6734037797028764e-05,
"loss": 0.298,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25174811482429504,
"step": 3580
},
{
"epoch": 3.168727915194346,
"grad_norm": 0.572640061378479,
"learning_rate": 2.6692538403268916e-05,
"loss": 0.2867,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22904187440872192,
"step": 3585
},
{
"epoch": 3.1731448763250882,
"grad_norm": 0.6837023496627808,
"learning_rate": 2.6651006535973327e-05,
"loss": 0.3015,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2275552749633789,
"step": 3590
},
{
"epoch": 3.1775618374558303,
"grad_norm": 0.5804446935653687,
"learning_rate": 2.660944239666293e-05,
"loss": 0.2832,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29134559631347656,
"step": 3595
},
{
"epoch": 3.1819787985865724,
"grad_norm": 0.6767000555992126,
"learning_rate": 2.6567846187015245e-05,
"loss": 0.2804,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3343573808670044,
"step": 3600
},
{
"epoch": 3.1863957597173145,
"grad_norm": 0.615800142288208,
"learning_rate": 2.6526218108863408e-05,
"loss": 0.3103,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.42924535274505615,
"step": 3605
},
{
"epoch": 3.1908127208480566,
"grad_norm": 0.6070627570152283,
"learning_rate": 2.648455836419518e-05,
"loss": 0.262,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2295074164867401,
"step": 3610
},
{
"epoch": 3.1952296819787986,
"grad_norm": 0.6150861382484436,
"learning_rate": 2.6442867155151984e-05,
"loss": 0.2611,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24106386303901672,
"step": 3615
},
{
"epoch": 3.1996466431095407,
"grad_norm": 0.6103044748306274,
"learning_rate": 2.6401144684027915e-05,
"loss": 0.2458,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26913753151893616,
"step": 3620
},
{
"epoch": 3.204063604240283,
"grad_norm": 0.6718603372573853,
"learning_rate": 2.635939115326874e-05,
"loss": 0.2722,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28498226404190063,
"step": 3625
},
{
"epoch": 3.208480565371025,
"grad_norm": 0.6954282522201538,
"learning_rate": 2.631760676547096e-05,
"loss": 0.3019,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22322580218315125,
"step": 3630
},
{
"epoch": 3.212897526501767,
"grad_norm": 0.6374308466911316,
"learning_rate": 2.6275791723380772e-05,
"loss": 0.2825,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25081831216812134,
"step": 3635
},
{
"epoch": 3.2173144876325086,
"grad_norm": 0.6096289753913879,
"learning_rate": 2.6233946229893147e-05,
"loss": 0.2657,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23818659782409668,
"step": 3640
},
{
"epoch": 3.2217314487632507,
"grad_norm": 0.6019396185874939,
"learning_rate": 2.6192070488050783e-05,
"loss": 0.2425,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26665595173835754,
"step": 3645
},
{
"epoch": 3.2261484098939928,
"grad_norm": 0.6398298144340515,
"learning_rate": 2.615016470104316e-05,
"loss": 0.2519,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21210452914237976,
"step": 3650
},
{
"epoch": 3.230565371024735,
"grad_norm": 0.5565531253814697,
"learning_rate": 2.6108229072205545e-05,
"loss": 0.31,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2653741240501404,
"step": 3655
},
{
"epoch": 3.234982332155477,
"grad_norm": 0.6310369968414307,
"learning_rate": 2.606626380501801e-05,
"loss": 0.2921,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3396562337875366,
"step": 3660
},
{
"epoch": 3.239399293286219,
"grad_norm": 0.6153047680854797,
"learning_rate": 2.6024269103104417e-05,
"loss": 0.2716,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28231340646743774,
"step": 3665
},
{
"epoch": 3.243816254416961,
"grad_norm": 0.667517900466919,
"learning_rate": 2.5982245170231467e-05,
"loss": 0.264,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32096004486083984,
"step": 3670
},
{
"epoch": 3.248233215547703,
"grad_norm": 0.6767958998680115,
"learning_rate": 2.5940192210307697e-05,
"loss": 0.2544,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3398870825767517,
"step": 3675
},
{
"epoch": 3.2526501766784452,
"grad_norm": 0.6170216798782349,
"learning_rate": 2.5898110427382487e-05,
"loss": 0.2543,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2500012516975403,
"step": 3680
},
{
"epoch": 3.2570671378091873,
"grad_norm": 0.5803675055503845,
"learning_rate": 2.5856000025645065e-05,
"loss": 0.2803,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2736228108406067,
"step": 3685
},
{
"epoch": 3.2614840989399294,
"grad_norm": 0.6387597322463989,
"learning_rate": 2.581386120942353e-05,
"loss": 0.3275,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28693830966949463,
"step": 3690
},
{
"epoch": 3.2659010600706715,
"grad_norm": 0.6041744351387024,
"learning_rate": 2.577169418318385e-05,
"loss": 0.309,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3677545189857483,
"step": 3695
},
{
"epoch": 3.2703180212014136,
"grad_norm": 0.591244637966156,
"learning_rate": 2.5729499151528877e-05,
"loss": 0.2956,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3903142213821411,
"step": 3700
},
{
"epoch": 3.2747349823321557,
"grad_norm": 0.5570688247680664,
"learning_rate": 2.568727631919735e-05,
"loss": 0.3056,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32988813519477844,
"step": 3705
},
{
"epoch": 3.2791519434628977,
"grad_norm": 0.7770934104919434,
"learning_rate": 2.5645025891062897e-05,
"loss": 0.2645,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2805140018463135,
"step": 3710
},
{
"epoch": 3.28356890459364,
"grad_norm": 0.5827252864837646,
"learning_rate": 2.5602748072133054e-05,
"loss": 0.2863,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25772303342819214,
"step": 3715
},
{
"epoch": 3.2879858657243815,
"grad_norm": 0.6013981103897095,
"learning_rate": 2.5560443067548263e-05,
"loss": 0.2814,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24229644238948822,
"step": 3720
},
{
"epoch": 3.2924028268551235,
"grad_norm": 0.5573179721832275,
"learning_rate": 2.5518111082580873e-05,
"loss": 0.251,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2687339782714844,
"step": 3725
},
{
"epoch": 3.2968197879858656,
"grad_norm": 0.5730259418487549,
"learning_rate": 2.547575232263414e-05,
"loss": 0.2566,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2892979681491852,
"step": 3730
},
{
"epoch": 3.3012367491166077,
"grad_norm": 0.578188955783844,
"learning_rate": 2.5433366993241252e-05,
"loss": 0.2739,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25793546438217163,
"step": 3735
},
{
"epoch": 3.3056537102473498,
"grad_norm": 0.6442872881889343,
"learning_rate": 2.5390955300064306e-05,
"loss": 0.2552,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23315435647964478,
"step": 3740
},
{
"epoch": 3.310070671378092,
"grad_norm": 0.6832812428474426,
"learning_rate": 2.5348517448893323e-05,
"loss": 0.283,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28596487641334534,
"step": 3745
},
{
"epoch": 3.314487632508834,
"grad_norm": 0.5652126669883728,
"learning_rate": 2.530605364564526e-05,
"loss": 0.306,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22419892251491547,
"step": 3750
},
{
"epoch": 3.318904593639576,
"grad_norm": 0.5962374806404114,
"learning_rate": 2.5263564096362972e-05,
"loss": 0.2711,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23518124222755432,
"step": 3755
},
{
"epoch": 3.323321554770318,
"grad_norm": 0.5664314031600952,
"learning_rate": 2.5221049007214276e-05,
"loss": 0.2561,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2558237910270691,
"step": 3760
},
{
"epoch": 3.32773851590106,
"grad_norm": 0.5694194436073303,
"learning_rate": 2.5178508584490882e-05,
"loss": 0.2672,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2523389458656311,
"step": 3765
},
{
"epoch": 3.3321554770318023,
"grad_norm": 0.926784873008728,
"learning_rate": 2.5135943034607434e-05,
"loss": 0.3056,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3541930913925171,
"step": 3770
},
{
"epoch": 3.3365724381625443,
"grad_norm": 0.65824294090271,
"learning_rate": 2.50933525641005e-05,
"loss": 0.2076,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19914916157722473,
"step": 3775
},
{
"epoch": 3.340989399293286,
"grad_norm": 0.62986159324646,
"learning_rate": 2.5050737379627575e-05,
"loss": 0.2822,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.285235196352005,
"step": 3780
},
{
"epoch": 3.345406360424028,
"grad_norm": 0.5901169776916504,
"learning_rate": 2.5008097687966052e-05,
"loss": 0.2573,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3077230453491211,
"step": 3785
},
{
"epoch": 3.34982332155477,
"grad_norm": 0.6803475618362427,
"learning_rate": 2.4965433696012255e-05,
"loss": 0.3068,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3818191885948181,
"step": 3790
},
{
"epoch": 3.354240282685512,
"grad_norm": 0.5957804918289185,
"learning_rate": 2.49227456107804e-05,
"loss": 0.3033,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3599034249782562,
"step": 3795
},
{
"epoch": 3.3586572438162543,
"grad_norm": 0.5898772478103638,
"learning_rate": 2.488003363940163e-05,
"loss": 0.2727,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.275057315826416,
"step": 3800
},
{
"epoch": 3.3630742049469964,
"grad_norm": 0.6106370687484741,
"learning_rate": 2.4837297989122987e-05,
"loss": 0.2831,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31545543670654297,
"step": 3805
},
{
"epoch": 3.3674911660777385,
"grad_norm": 0.5764063596725464,
"learning_rate": 2.4794538867306385e-05,
"loss": 0.2837,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30026909708976746,
"step": 3810
},
{
"epoch": 3.3719081272084805,
"grad_norm": 0.5482817888259888,
"learning_rate": 2.4751756481427637e-05,
"loss": 0.2699,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23257222771644592,
"step": 3815
},
{
"epoch": 3.3763250883392226,
"grad_norm": 0.6209834218025208,
"learning_rate": 2.4708951039075462e-05,
"loss": 0.2375,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21792644262313843,
"step": 3820
},
{
"epoch": 3.3807420494699647,
"grad_norm": 0.5752567648887634,
"learning_rate": 2.4666122747950416e-05,
"loss": 0.257,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2612408995628357,
"step": 3825
},
{
"epoch": 3.385159010600707,
"grad_norm": 0.594068706035614,
"learning_rate": 2.4623271815863943e-05,
"loss": 0.2552,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26158034801483154,
"step": 3830
},
{
"epoch": 3.389575971731449,
"grad_norm": 0.6304320693016052,
"learning_rate": 2.4580398450737338e-05,
"loss": 0.3036,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3264097571372986,
"step": 3835
},
{
"epoch": 3.393992932862191,
"grad_norm": 0.7010661959648132,
"learning_rate": 2.4537502860600754e-05,
"loss": 0.2842,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2843078374862671,
"step": 3840
},
{
"epoch": 3.398409893992933,
"grad_norm": 0.6101694107055664,
"learning_rate": 2.4494585253592184e-05,
"loss": 0.2717,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2982633113861084,
"step": 3845
},
{
"epoch": 3.402826855123675,
"grad_norm": 0.5913658738136292,
"learning_rate": 2.445164583795643e-05,
"loss": 0.2567,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3931722640991211,
"step": 3850
},
{
"epoch": 3.407243816254417,
"grad_norm": 0.6326600313186646,
"learning_rate": 2.4408684822044152e-05,
"loss": 0.2485,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1966112107038498,
"step": 3855
},
{
"epoch": 3.411660777385159,
"grad_norm": 0.5460976958274841,
"learning_rate": 2.4365702414310786e-05,
"loss": 0.2891,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2495535910129547,
"step": 3860
},
{
"epoch": 3.416077738515901,
"grad_norm": 0.5842785835266113,
"learning_rate": 2.4322698823315572e-05,
"loss": 0.2936,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22732970118522644,
"step": 3865
},
{
"epoch": 3.420494699646643,
"grad_norm": 0.5686548948287964,
"learning_rate": 2.4279674257720548e-05,
"loss": 0.2531,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24848207831382751,
"step": 3870
},
{
"epoch": 3.424911660777385,
"grad_norm": 0.6319994926452637,
"learning_rate": 2.4236628926289506e-05,
"loss": 0.2879,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2527256906032562,
"step": 3875
},
{
"epoch": 3.429328621908127,
"grad_norm": 0.9111738204956055,
"learning_rate": 2.4193563037887025e-05,
"loss": 0.2488,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27606528997421265,
"step": 3880
},
{
"epoch": 3.4337455830388692,
"grad_norm": 0.9722045063972473,
"learning_rate": 2.4150476801477404e-05,
"loss": 0.2452,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24321609735488892,
"step": 3885
},
{
"epoch": 3.4381625441696113,
"grad_norm": 0.5848979949951172,
"learning_rate": 2.4107370426123685e-05,
"loss": 0.2405,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2286883294582367,
"step": 3890
},
{
"epoch": 3.4425795053003534,
"grad_norm": 0.7078244686126709,
"learning_rate": 2.406424412098664e-05,
"loss": 0.2926,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2944197654724121,
"step": 3895
},
{
"epoch": 3.4469964664310955,
"grad_norm": 0.516459584236145,
"learning_rate": 2.4021098095323713e-05,
"loss": 0.2537,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23038452863693237,
"step": 3900
},
{
"epoch": 3.4514134275618376,
"grad_norm": 0.6297348737716675,
"learning_rate": 2.3977932558488074e-05,
"loss": 0.2725,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28293001651763916,
"step": 3905
},
{
"epoch": 3.4558303886925796,
"grad_norm": 0.6394320726394653,
"learning_rate": 2.3934747719927534e-05,
"loss": 0.2794,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29240959882736206,
"step": 3910
},
{
"epoch": 3.4602473498233217,
"grad_norm": 0.5845627188682556,
"learning_rate": 2.3891543789183573e-05,
"loss": 0.2999,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34419286251068115,
"step": 3915
},
{
"epoch": 3.464664310954064,
"grad_norm": 0.5385047197341919,
"learning_rate": 2.3848320975890316e-05,
"loss": 0.2683,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23887717723846436,
"step": 3920
},
{
"epoch": 3.4690812720848054,
"grad_norm": 0.5908883810043335,
"learning_rate": 2.3805079489773508e-05,
"loss": 0.2442,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2729918360710144,
"step": 3925
},
{
"epoch": 3.4734982332155475,
"grad_norm": 0.6180974841117859,
"learning_rate": 2.376181954064948e-05,
"loss": 0.2982,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2929500937461853,
"step": 3930
},
{
"epoch": 3.4779151943462896,
"grad_norm": 0.5845738649368286,
"learning_rate": 2.3718541338424176e-05,
"loss": 0.2994,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2808518409729004,
"step": 3935
},
{
"epoch": 3.4823321554770317,
"grad_norm": 0.7396969795227051,
"learning_rate": 2.3675245093092082e-05,
"loss": 0.2486,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2241256982088089,
"step": 3940
},
{
"epoch": 3.4867491166077738,
"grad_norm": 0.6327768564224243,
"learning_rate": 2.3631931014735258e-05,
"loss": 0.2812,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2858206033706665,
"step": 3945
},
{
"epoch": 3.491166077738516,
"grad_norm": 0.6281271576881409,
"learning_rate": 2.358859931352227e-05,
"loss": 0.2562,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22891634702682495,
"step": 3950
},
{
"epoch": 3.495583038869258,
"grad_norm": 0.636397659778595,
"learning_rate": 2.3545250199707207e-05,
"loss": 0.2416,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2959528863430023,
"step": 3955
},
{
"epoch": 3.5,
"grad_norm": 0.6934898495674133,
"learning_rate": 2.350188388362865e-05,
"loss": 0.2578,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3115631937980652,
"step": 3960
},
{
"epoch": 3.504416961130742,
"grad_norm": 0.5889595150947571,
"learning_rate": 2.3458500575708642e-05,
"loss": 0.2683,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31395983695983887,
"step": 3965
},
{
"epoch": 3.508833922261484,
"grad_norm": 0.6764131188392639,
"learning_rate": 2.341510048645167e-05,
"loss": 0.2803,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.301558256149292,
"step": 3970
},
{
"epoch": 3.5132508833922262,
"grad_norm": 0.6070583462715149,
"learning_rate": 2.337168382644367e-05,
"loss": 0.3018,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3173370063304901,
"step": 3975
},
{
"epoch": 3.5176678445229683,
"grad_norm": 0.5840939283370972,
"learning_rate": 2.332825080635094e-05,
"loss": 0.3153,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3090516924858093,
"step": 3980
},
{
"epoch": 3.5220848056537104,
"grad_norm": 0.7202004790306091,
"learning_rate": 2.3284801636919205e-05,
"loss": 0.2874,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.41266167163848877,
"step": 3985
},
{
"epoch": 3.5265017667844525,
"grad_norm": 0.6619871258735657,
"learning_rate": 2.3241336528972522e-05,
"loss": 0.2659,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2852621078491211,
"step": 3990
},
{
"epoch": 3.5309187279151946,
"grad_norm": 0.5971029996871948,
"learning_rate": 2.3197855693412295e-05,
"loss": 0.2901,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24810791015625,
"step": 3995
},
{
"epoch": 3.5353356890459366,
"grad_norm": 0.5984452962875366,
"learning_rate": 2.3154359341216243e-05,
"loss": 0.2733,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23727092146873474,
"step": 4000
},
{
"epoch": 3.5397526501766783,
"grad_norm": 0.5858767628669739,
"learning_rate": 2.311084768343737e-05,
"loss": 0.2487,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27304431796073914,
"step": 4005
},
{
"epoch": 3.5441696113074204,
"grad_norm": 0.6337104439735413,
"learning_rate": 2.306732093120295e-05,
"loss": 0.2892,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.37451407313346863,
"step": 4010
},
{
"epoch": 3.5485865724381624,
"grad_norm": 0.6166836023330688,
"learning_rate": 2.3023779295713497e-05,
"loss": 0.2833,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25973236560821533,
"step": 4015
},
{
"epoch": 3.5530035335689045,
"grad_norm": 0.6111301183700562,
"learning_rate": 2.2980222988241733e-05,
"loss": 0.2633,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25971752405166626,
"step": 4020
},
{
"epoch": 3.5574204946996466,
"grad_norm": 0.6352372765541077,
"learning_rate": 2.293665222013158e-05,
"loss": 0.2422,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21252036094665527,
"step": 4025
},
{
"epoch": 3.5618374558303887,
"grad_norm": 0.6189213395118713,
"learning_rate": 2.2893067202797136e-05,
"loss": 0.2314,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22057734429836273,
"step": 4030
},
{
"epoch": 3.5662544169611308,
"grad_norm": 0.5966265201568604,
"learning_rate": 2.2849468147721615e-05,
"loss": 0.27,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26105162501335144,
"step": 4035
},
{
"epoch": 3.570671378091873,
"grad_norm": 0.7433478832244873,
"learning_rate": 2.280585526645637e-05,
"loss": 0.272,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2277737855911255,
"step": 4040
},
{
"epoch": 3.575088339222615,
"grad_norm": 0.6259749531745911,
"learning_rate": 2.2762228770619815e-05,
"loss": 0.2872,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1985919028520584,
"step": 4045
},
{
"epoch": 3.579505300353357,
"grad_norm": 0.6301651000976562,
"learning_rate": 2.2718588871896454e-05,
"loss": 0.2657,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3158514201641083,
"step": 4050
},
{
"epoch": 3.583922261484099,
"grad_norm": 0.556121289730072,
"learning_rate": 2.2674935782035804e-05,
"loss": 0.2521,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23911920189857483,
"step": 4055
},
{
"epoch": 3.5883392226148407,
"grad_norm": 0.5992854833602905,
"learning_rate": 2.2631269712851385e-05,
"loss": 0.2349,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23733431100845337,
"step": 4060
},
{
"epoch": 3.592756183745583,
"grad_norm": 0.6029264330863953,
"learning_rate": 2.258759087621971e-05,
"loss": 0.26,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30784931778907776,
"step": 4065
},
{
"epoch": 3.597173144876325,
"grad_norm": 0.6338687539100647,
"learning_rate": 2.2543899484079245e-05,
"loss": 0.299,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26149436831474304,
"step": 4070
},
{
"epoch": 3.601590106007067,
"grad_norm": 0.5709251761436462,
"learning_rate": 2.2500195748429352e-05,
"loss": 0.2602,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26341015100479126,
"step": 4075
},
{
"epoch": 3.606007067137809,
"grad_norm": 0.6511635780334473,
"learning_rate": 2.2456479881329315e-05,
"loss": 0.2457,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2105947583913803,
"step": 4080
},
{
"epoch": 3.610424028268551,
"grad_norm": 0.5920666456222534,
"learning_rate": 2.2412752094897267e-05,
"loss": 0.2718,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25122541189193726,
"step": 4085
},
{
"epoch": 3.614840989399293,
"grad_norm": 0.6184494495391846,
"learning_rate": 2.236901260130918e-05,
"loss": 0.2614,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19458022713661194,
"step": 4090
},
{
"epoch": 3.6192579505300353,
"grad_norm": 0.5893781781196594,
"learning_rate": 2.2325261612797832e-05,
"loss": 0.2518,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26221320033073425,
"step": 4095
},
{
"epoch": 3.6236749116607774,
"grad_norm": 0.6632538437843323,
"learning_rate": 2.2281499341651767e-05,
"loss": 0.268,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2836889624595642,
"step": 4100
},
{
"epoch": 3.6280918727915195,
"grad_norm": 0.6132526993751526,
"learning_rate": 2.223772600021429e-05,
"loss": 0.2728,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3212493658065796,
"step": 4105
},
{
"epoch": 3.6325088339222615,
"grad_norm": 0.5957457423210144,
"learning_rate": 2.2193941800882418e-05,
"loss": 0.3153,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4252464175224304,
"step": 4110
},
{
"epoch": 3.6369257950530036,
"grad_norm": 0.6638500094413757,
"learning_rate": 2.2150146956105836e-05,
"loss": 0.3003,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2801547646522522,
"step": 4115
},
{
"epoch": 3.6413427561837457,
"grad_norm": 0.6362109184265137,
"learning_rate": 2.210634167838591e-05,
"loss": 0.2801,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.36497941613197327,
"step": 4120
},
{
"epoch": 3.645759717314488,
"grad_norm": 0.7648757100105286,
"learning_rate": 2.2062526180274607e-05,
"loss": 0.2378,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2305619865655899,
"step": 4125
},
{
"epoch": 3.65017667844523,
"grad_norm": 0.5891234278678894,
"learning_rate": 2.2018700674373487e-05,
"loss": 0.2642,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23286129534244537,
"step": 4130
},
{
"epoch": 3.654593639575972,
"grad_norm": 0.5884703993797302,
"learning_rate": 2.1974865373332695e-05,
"loss": 0.281,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2888106107711792,
"step": 4135
},
{
"epoch": 3.659010600706714,
"grad_norm": 0.5825828313827515,
"learning_rate": 2.1931020489849865e-05,
"loss": 0.2649,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2594233453273773,
"step": 4140
},
{
"epoch": 3.663427561837456,
"grad_norm": 0.6837796568870544,
"learning_rate": 2.1887166236669154e-05,
"loss": 0.2716,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23577167093753815,
"step": 4145
},
{
"epoch": 3.6678445229681977,
"grad_norm": 0.558169960975647,
"learning_rate": 2.184330282658018e-05,
"loss": 0.2425,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22032378613948822,
"step": 4150
},
{
"epoch": 3.67226148409894,
"grad_norm": 0.5991285443305969,
"learning_rate": 2.1799430472416975e-05,
"loss": 0.31,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24205946922302246,
"step": 4155
},
{
"epoch": 3.676678445229682,
"grad_norm": 0.5339157581329346,
"learning_rate": 2.1755549387056997e-05,
"loss": 0.2795,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24257515370845795,
"step": 4160
},
{
"epoch": 3.681095406360424,
"grad_norm": 0.5839236378669739,
"learning_rate": 2.1711659783420043e-05,
"loss": 0.2469,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3155287802219391,
"step": 4165
},
{
"epoch": 3.685512367491166,
"grad_norm": 0.6157808303833008,
"learning_rate": 2.1667761874467256e-05,
"loss": 0.282,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3029802739620209,
"step": 4170
},
{
"epoch": 3.689929328621908,
"grad_norm": 0.6158527731895447,
"learning_rate": 2.162385587320008e-05,
"loss": 0.2547,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24819602072238922,
"step": 4175
},
{
"epoch": 3.6943462897526502,
"grad_norm": 0.6458650231361389,
"learning_rate": 2.1579941992659214e-05,
"loss": 0.2515,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2597864866256714,
"step": 4180
},
{
"epoch": 3.6987632508833923,
"grad_norm": 0.6469552516937256,
"learning_rate": 2.1536020445923595e-05,
"loss": 0.2546,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21913045644760132,
"step": 4185
},
{
"epoch": 3.7031802120141344,
"grad_norm": 0.6282652616500854,
"learning_rate": 2.1492091446109372e-05,
"loss": 0.2705,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23371529579162598,
"step": 4190
},
{
"epoch": 3.7075971731448765,
"grad_norm": 0.5427141785621643,
"learning_rate": 2.1448155206368823e-05,
"loss": 0.2806,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26559561491012573,
"step": 4195
},
{
"epoch": 3.712014134275618,
"grad_norm": 0.6435478329658508,
"learning_rate": 2.1404211939889392e-05,
"loss": 0.2435,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22901174426078796,
"step": 4200
},
{
"epoch": 3.71643109540636,
"grad_norm": 0.553631067276001,
"learning_rate": 2.1360261859892594e-05,
"loss": 0.3053,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3381679654121399,
"step": 4205
},
{
"epoch": 3.7208480565371023,
"grad_norm": 0.6866286993026733,
"learning_rate": 2.1316305179633016e-05,
"loss": 0.3006,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3275683522224426,
"step": 4210
},
{
"epoch": 3.7252650176678443,
"grad_norm": 0.6093161106109619,
"learning_rate": 2.1272342112397272e-05,
"loss": 0.2419,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21553654968738556,
"step": 4215
},
{
"epoch": 3.7296819787985864,
"grad_norm": 0.5793571472167969,
"learning_rate": 2.1228372871502955e-05,
"loss": 0.2842,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24574121832847595,
"step": 4220
},
{
"epoch": 3.7340989399293285,
"grad_norm": 0.6067283153533936,
"learning_rate": 2.1184397670297624e-05,
"loss": 0.2336,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2613888382911682,
"step": 4225
},
{
"epoch": 3.7385159010600706,
"grad_norm": 0.6006746292114258,
"learning_rate": 2.1140416722157765e-05,
"loss": 0.2854,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.38783538341522217,
"step": 4230
},
{
"epoch": 3.7429328621908127,
"grad_norm": 0.5855950117111206,
"learning_rate": 2.1096430240487723e-05,
"loss": 0.253,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2570192515850067,
"step": 4235
},
{
"epoch": 3.7473498233215548,
"grad_norm": 0.632363498210907,
"learning_rate": 2.105243843871873e-05,
"loss": 0.2336,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24486814439296722,
"step": 4240
},
{
"epoch": 3.751766784452297,
"grad_norm": 0.6137506365776062,
"learning_rate": 2.100844153030779e-05,
"loss": 0.2863,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21526217460632324,
"step": 4245
},
{
"epoch": 3.756183745583039,
"grad_norm": 0.567816972732544,
"learning_rate": 2.096443972873673e-05,
"loss": 0.2202,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21241986751556396,
"step": 4250
},
{
"epoch": 3.760600706713781,
"grad_norm": 0.6972650289535522,
"learning_rate": 2.0920433247511092e-05,
"loss": 0.2904,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3464621901512146,
"step": 4255
},
{
"epoch": 3.765017667844523,
"grad_norm": 0.5928968191146851,
"learning_rate": 2.087642230015912e-05,
"loss": 0.2641,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24683159589767456,
"step": 4260
},
{
"epoch": 3.769434628975265,
"grad_norm": 0.8208529353141785,
"learning_rate": 2.0832407100230747e-05,
"loss": 0.2684,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30278170108795166,
"step": 4265
},
{
"epoch": 3.7738515901060072,
"grad_norm": 0.5523584485054016,
"learning_rate": 2.078838786129653e-05,
"loss": 0.276,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28194743394851685,
"step": 4270
},
{
"epoch": 3.7782685512367493,
"grad_norm": 0.6361742615699768,
"learning_rate": 2.0744364796946624e-05,
"loss": 0.2752,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3716488778591156,
"step": 4275
},
{
"epoch": 3.7826855123674914,
"grad_norm": 0.6412866711616516,
"learning_rate": 2.0700338120789754e-05,
"loss": 0.2855,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24867978692054749,
"step": 4280
},
{
"epoch": 3.7871024734982335,
"grad_norm": 0.6166922450065613,
"learning_rate": 2.0656308046452157e-05,
"loss": 0.2542,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25477123260498047,
"step": 4285
},
{
"epoch": 3.791519434628975,
"grad_norm": 0.5771932601928711,
"learning_rate": 2.0612274787576565e-05,
"loss": 0.2917,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2894698977470398,
"step": 4290
},
{
"epoch": 3.795936395759717,
"grad_norm": 0.645137369632721,
"learning_rate": 2.0568238557821175e-05,
"loss": 0.2617,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2622634470462799,
"step": 4295
},
{
"epoch": 3.8003533568904593,
"grad_norm": 0.6164206266403198,
"learning_rate": 2.0524199570858573e-05,
"loss": 0.2591,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28416186571121216,
"step": 4300
},
{
"epoch": 3.8047703180212014,
"grad_norm": 0.6087357401847839,
"learning_rate": 2.048015804037474e-05,
"loss": 0.2536,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22203269600868225,
"step": 4305
},
{
"epoch": 3.8091872791519434,
"grad_norm": 0.6240975260734558,
"learning_rate": 2.0436114180068008e-05,
"loss": 0.2946,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3069062829017639,
"step": 4310
},
{
"epoch": 3.8136042402826855,
"grad_norm": 0.5895731449127197,
"learning_rate": 2.039206820364798e-05,
"loss": 0.2882,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3107955753803253,
"step": 4315
},
{
"epoch": 3.8180212014134276,
"grad_norm": 0.6357214450836182,
"learning_rate": 2.034802032483457e-05,
"loss": 0.2684,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2024916559457779,
"step": 4320
},
{
"epoch": 3.8224381625441697,
"grad_norm": 0.6543165445327759,
"learning_rate": 2.0303970757356894e-05,
"loss": 0.251,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2537229657173157,
"step": 4325
},
{
"epoch": 3.8268551236749118,
"grad_norm": 0.6982588768005371,
"learning_rate": 2.025991971495226e-05,
"loss": 0.2544,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2701827883720398,
"step": 4330
},
{
"epoch": 3.831272084805654,
"grad_norm": 0.5591413378715515,
"learning_rate": 2.021586741136516e-05,
"loss": 0.2919,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3260684907436371,
"step": 4335
},
{
"epoch": 3.835689045936396,
"grad_norm": 0.7472929954528809,
"learning_rate": 2.017181406034617e-05,
"loss": 0.2375,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20827150344848633,
"step": 4340
},
{
"epoch": 3.8401060070671376,
"grad_norm": 0.8127717971801758,
"learning_rate": 2.0127759875650974e-05,
"loss": 0.2724,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18227070569992065,
"step": 4345
},
{
"epoch": 3.8445229681978796,
"grad_norm": 0.5871464610099792,
"learning_rate": 2.0083705071039297e-05,
"loss": 0.2649,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.307786226272583,
"step": 4350
},
{
"epoch": 3.8489399293286217,
"grad_norm": 0.6023159623146057,
"learning_rate": 2.0039649860273855e-05,
"loss": 0.2563,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27349045872688293,
"step": 4355
},
{
"epoch": 3.853356890459364,
"grad_norm": 1.6955772638320923,
"learning_rate": 1.9995594457119364e-05,
"loss": 0.2609,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24877923727035522,
"step": 4360
},
{
"epoch": 3.857773851590106,
"grad_norm": 0.6980836987495422,
"learning_rate": 1.995153907534145e-05,
"loss": 0.3256,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.317715585231781,
"step": 4365
},
{
"epoch": 3.862190812720848,
"grad_norm": 0.6564183831214905,
"learning_rate": 1.990748392870563e-05,
"loss": 0.2733,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3203606605529785,
"step": 4370
},
{
"epoch": 3.86660777385159,
"grad_norm": 0.5388532876968384,
"learning_rate": 1.986342923097631e-05,
"loss": 0.3132,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2857687771320343,
"step": 4375
},
{
"epoch": 3.871024734982332,
"grad_norm": 0.5364903211593628,
"learning_rate": 1.98193751959157e-05,
"loss": 0.2478,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21258722245693207,
"step": 4380
},
{
"epoch": 3.875441696113074,
"grad_norm": 0.636022686958313,
"learning_rate": 1.977532203728278e-05,
"loss": 0.3042,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.355568528175354,
"step": 4385
},
{
"epoch": 3.8798586572438163,
"grad_norm": 0.5986488461494446,
"learning_rate": 1.9731269968832305e-05,
"loss": 0.2595,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2478679120540619,
"step": 4390
},
{
"epoch": 3.8842756183745584,
"grad_norm": 0.5643515586853027,
"learning_rate": 1.9687219204313717e-05,
"loss": 0.2887,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21933361887931824,
"step": 4395
},
{
"epoch": 3.8886925795053005,
"grad_norm": 0.6250163316726685,
"learning_rate": 1.9643169957470157e-05,
"loss": 0.2878,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29644590616226196,
"step": 4400
},
{
"epoch": 3.8931095406360425,
"grad_norm": 0.6132106781005859,
"learning_rate": 1.959912244203737e-05,
"loss": 0.256,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24946492910385132,
"step": 4405
},
{
"epoch": 3.8975265017667846,
"grad_norm": 0.596892774105072,
"learning_rate": 1.9555076871742734e-05,
"loss": 0.2952,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24082526564598083,
"step": 4410
},
{
"epoch": 3.9019434628975267,
"grad_norm": 0.5782345533370972,
"learning_rate": 1.951103346030415e-05,
"loss": 0.2534,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2679441571235657,
"step": 4415
},
{
"epoch": 3.9063604240282688,
"grad_norm": 0.5399941205978394,
"learning_rate": 1.9466992421429076e-05,
"loss": 0.2328,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18268732726573944,
"step": 4420
},
{
"epoch": 3.910777385159011,
"grad_norm": 0.6017982959747314,
"learning_rate": 1.9422953968813454e-05,
"loss": 0.2697,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27265119552612305,
"step": 4425
},
{
"epoch": 3.9151943462897525,
"grad_norm": 0.7943002581596375,
"learning_rate": 1.937891831614066e-05,
"loss": 0.256,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2352999448776245,
"step": 4430
},
{
"epoch": 3.9196113074204946,
"grad_norm": 0.6090646386146545,
"learning_rate": 1.93348856770805e-05,
"loss": 0.2937,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3457292914390564,
"step": 4435
},
{
"epoch": 3.9240282685512367,
"grad_norm": 0.584635317325592,
"learning_rate": 1.929085626528814e-05,
"loss": 0.2579,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30109745264053345,
"step": 4440
},
{
"epoch": 3.9284452296819787,
"grad_norm": 0.6093024611473083,
"learning_rate": 1.9246830294403108e-05,
"loss": 0.2978,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3744407594203949,
"step": 4445
},
{
"epoch": 3.932862190812721,
"grad_norm": 0.594610333442688,
"learning_rate": 1.920280797804822e-05,
"loss": 0.2748,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25274011492729187,
"step": 4450
},
{
"epoch": 3.937279151943463,
"grad_norm": 0.5948688387870789,
"learning_rate": 1.915878952982857e-05,
"loss": 0.3056,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2250659167766571,
"step": 4455
},
{
"epoch": 3.941696113074205,
"grad_norm": 0.608898937702179,
"learning_rate": 1.911477516333048e-05,
"loss": 0.2758,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2589750289916992,
"step": 4460
},
{
"epoch": 3.946113074204947,
"grad_norm": 0.710386335849762,
"learning_rate": 1.907076509212046e-05,
"loss": 0.2521,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19801661372184753,
"step": 4465
},
{
"epoch": 3.950530035335689,
"grad_norm": 0.6093432307243347,
"learning_rate": 1.9026759529744187e-05,
"loss": 0.3113,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26044270396232605,
"step": 4470
},
{
"epoch": 3.954946996466431,
"grad_norm": 0.546137809753418,
"learning_rate": 1.8982758689725447e-05,
"loss": 0.2627,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19887375831604004,
"step": 4475
},
{
"epoch": 3.9593639575971733,
"grad_norm": 0.5550782084465027,
"learning_rate": 1.8938762785565137e-05,
"loss": 0.2416,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2474510371685028,
"step": 4480
},
{
"epoch": 3.963780918727915,
"grad_norm": 0.5750576853752136,
"learning_rate": 1.8894772030740182e-05,
"loss": 0.2648,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26418790221214294,
"step": 4485
},
{
"epoch": 3.968197879858657,
"grad_norm": 0.6318395137786865,
"learning_rate": 1.8850786638702528e-05,
"loss": 0.29,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2796365022659302,
"step": 4490
},
{
"epoch": 3.972614840989399,
"grad_norm": 0.6644579172134399,
"learning_rate": 1.88068068228781e-05,
"loss": 0.2798,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20628443360328674,
"step": 4495
},
{
"epoch": 3.977031802120141,
"grad_norm": 0.6782865524291992,
"learning_rate": 1.876283279666576e-05,
"loss": 0.2907,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3842836320400238,
"step": 4500
},
{
"epoch": 3.9814487632508833,
"grad_norm": 0.5653501152992249,
"learning_rate": 1.87188647734363e-05,
"loss": 0.2882,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2926984429359436,
"step": 4505
},
{
"epoch": 3.9858657243816253,
"grad_norm": 0.6898596286773682,
"learning_rate": 1.8674902966531354e-05,
"loss": 0.2813,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2633523941040039,
"step": 4510
},
{
"epoch": 3.9902826855123674,
"grad_norm": 0.5879707336425781,
"learning_rate": 1.8630947589262417e-05,
"loss": 0.2905,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27371013164520264,
"step": 4515
},
{
"epoch": 3.9946996466431095,
"grad_norm": 0.6024365425109863,
"learning_rate": 1.858699885490977e-05,
"loss": 0.2682,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26837724447250366,
"step": 4520
},
{
"epoch": 3.9991166077738516,
"grad_norm": 0.7211329340934753,
"learning_rate": 1.8543056976721472e-05,
"loss": 0.2526,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.221034437417984,
"step": 4525
},
{
"epoch": 4.004416961130742,
"grad_norm": 0.5379669070243835,
"learning_rate": 1.84991221679123e-05,
"loss": 0.2219,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1902405172586441,
"step": 4530
},
{
"epoch": 4.008833922261484,
"grad_norm": 0.546558141708374,
"learning_rate": 1.845519464166275e-05,
"loss": 0.2414,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19735684990882874,
"step": 4535
},
{
"epoch": 4.013250883392226,
"grad_norm": 0.6087609529495239,
"learning_rate": 1.8411274611117974e-05,
"loss": 0.2617,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2241283804178238,
"step": 4540
},
{
"epoch": 4.017667844522968,
"grad_norm": 0.5435627698898315,
"learning_rate": 1.836736228938674e-05,
"loss": 0.2354,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20003776252269745,
"step": 4545
},
{
"epoch": 4.02208480565371,
"grad_norm": 0.7864016890525818,
"learning_rate": 1.832345788954043e-05,
"loss": 0.2476,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20103882253170013,
"step": 4550
},
{
"epoch": 4.0265017667844525,
"grad_norm": 0.6416686773300171,
"learning_rate": 1.8279561624611962e-05,
"loss": 0.2612,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22663789987564087,
"step": 4555
},
{
"epoch": 4.030918727915195,
"grad_norm": 0.5910441875457764,
"learning_rate": 1.8235673707594822e-05,
"loss": 0.2545,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24896244704723358,
"step": 4560
},
{
"epoch": 4.035335689045937,
"grad_norm": 0.6123059988021851,
"learning_rate": 1.819179435144195e-05,
"loss": 0.2485,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3180444836616516,
"step": 4565
},
{
"epoch": 4.039752650176679,
"grad_norm": 0.6581359505653381,
"learning_rate": 1.8147923769064776e-05,
"loss": 0.2517,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23575004935264587,
"step": 4570
},
{
"epoch": 4.044169611307421,
"grad_norm": 0.6589792370796204,
"learning_rate": 1.8104062173332134e-05,
"loss": 0.242,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2652082145214081,
"step": 4575
},
{
"epoch": 4.048586572438163,
"grad_norm": 0.6208878755569458,
"learning_rate": 1.8060209777069267e-05,
"loss": 0.2391,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2510373592376709,
"step": 4580
},
{
"epoch": 4.053003533568905,
"grad_norm": 0.7314044833183289,
"learning_rate": 1.801636679305679e-05,
"loss": 0.2537,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23933479189872742,
"step": 4585
},
{
"epoch": 4.057420494699647,
"grad_norm": 0.7164866328239441,
"learning_rate": 1.797253343402962e-05,
"loss": 0.2101,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2189275324344635,
"step": 4590
},
{
"epoch": 4.061837455830389,
"grad_norm": 0.7242477536201477,
"learning_rate": 1.7928709912676e-05,
"loss": 0.2666,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23622474074363708,
"step": 4595
},
{
"epoch": 4.06625441696113,
"grad_norm": 0.6429215669631958,
"learning_rate": 1.788489644163642e-05,
"loss": 0.2344,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2727344036102295,
"step": 4600
},
{
"epoch": 4.070671378091872,
"grad_norm": 0.6153692603111267,
"learning_rate": 1.784109323350261e-05,
"loss": 0.2724,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3209673762321472,
"step": 4605
},
{
"epoch": 4.0750883392226145,
"grad_norm": 0.6157684922218323,
"learning_rate": 1.77973005008165e-05,
"loss": 0.2666,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25213688611984253,
"step": 4610
},
{
"epoch": 4.079505300353357,
"grad_norm": 0.5954070687294006,
"learning_rate": 1.7753518456069198e-05,
"loss": 0.2325,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23194274306297302,
"step": 4615
},
{
"epoch": 4.083922261484099,
"grad_norm": 0.6085749864578247,
"learning_rate": 1.770974731169995e-05,
"loss": 0.2456,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23410743474960327,
"step": 4620
},
{
"epoch": 4.088339222614841,
"grad_norm": 0.5981534123420715,
"learning_rate": 1.76659872800951e-05,
"loss": 0.2454,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2934369444847107,
"step": 4625
},
{
"epoch": 4.092756183745583,
"grad_norm": 0.6048264503479004,
"learning_rate": 1.7622238573587093e-05,
"loss": 0.2482,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3395196795463562,
"step": 4630
},
{
"epoch": 4.097173144876325,
"grad_norm": 0.5977709293365479,
"learning_rate": 1.7578501404453388e-05,
"loss": 0.281,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25631198287010193,
"step": 4635
},
{
"epoch": 4.101590106007067,
"grad_norm": 0.6589481234550476,
"learning_rate": 1.7534775984915503e-05,
"loss": 0.2383,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2767007350921631,
"step": 4640
},
{
"epoch": 4.106007067137809,
"grad_norm": 0.6230020523071289,
"learning_rate": 1.7491062527137912e-05,
"loss": 0.2795,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27050620317459106,
"step": 4645
},
{
"epoch": 4.110424028268551,
"grad_norm": 0.6690418720245361,
"learning_rate": 1.744736124322707e-05,
"loss": 0.2497,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26453179121017456,
"step": 4650
},
{
"epoch": 4.114840989399293,
"grad_norm": 0.5822317004203796,
"learning_rate": 1.7403672345230342e-05,
"loss": 0.2322,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24094851315021515,
"step": 4655
},
{
"epoch": 4.119257950530035,
"grad_norm": 0.6514151692390442,
"learning_rate": 1.7359996045135007e-05,
"loss": 0.2192,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1770108938217163,
"step": 4660
},
{
"epoch": 4.123674911660777,
"grad_norm": 0.5683082938194275,
"learning_rate": 1.7316332554867224e-05,
"loss": 0.2665,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2293209284543991,
"step": 4665
},
{
"epoch": 4.1280918727915195,
"grad_norm": 0.6644694805145264,
"learning_rate": 1.7272682086290982e-05,
"loss": 0.2602,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.411828875541687,
"step": 4670
},
{
"epoch": 4.1325088339222615,
"grad_norm": 0.6769957542419434,
"learning_rate": 1.722904485120709e-05,
"loss": 0.2498,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23401038348674774,
"step": 4675
},
{
"epoch": 4.136925795053004,
"grad_norm": 0.615993857383728,
"learning_rate": 1.7185421061352135e-05,
"loss": 0.2403,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2185857892036438,
"step": 4680
},
{
"epoch": 4.141342756183746,
"grad_norm": 0.680887758731842,
"learning_rate": 1.7141810928397495e-05,
"loss": 0.2512,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2617225646972656,
"step": 4685
},
{
"epoch": 4.145759717314488,
"grad_norm": 0.6120012998580933,
"learning_rate": 1.7098214663948243e-05,
"loss": 0.2467,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24213114380836487,
"step": 4690
},
{
"epoch": 4.15017667844523,
"grad_norm": 0.655899703502655,
"learning_rate": 1.7054632479542196e-05,
"loss": 0.2392,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2701648473739624,
"step": 4695
},
{
"epoch": 4.154593639575972,
"grad_norm": 0.6427193880081177,
"learning_rate": 1.7011064586648828e-05,
"loss": 0.2549,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.36555686593055725,
"step": 4700
},
{
"epoch": 4.159010600706714,
"grad_norm": 0.77866530418396,
"learning_rate": 1.6967511196668277e-05,
"loss": 0.2433,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2713695466518402,
"step": 4705
},
{
"epoch": 4.163427561837456,
"grad_norm": 0.6055473685264587,
"learning_rate": 1.6923972520930307e-05,
"loss": 0.2595,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2517254948616028,
"step": 4710
},
{
"epoch": 4.167844522968198,
"grad_norm": 0.6935552358627319,
"learning_rate": 1.688044877069328e-05,
"loss": 0.2606,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2867121696472168,
"step": 4715
},
{
"epoch": 4.17226148409894,
"grad_norm": 0.5792904496192932,
"learning_rate": 1.6836940157143152e-05,
"loss": 0.2477,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26725780963897705,
"step": 4720
},
{
"epoch": 4.176678445229682,
"grad_norm": 0.6815539002418518,
"learning_rate": 1.6793446891392422e-05,
"loss": 0.2758,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31988525390625,
"step": 4725
},
{
"epoch": 4.181095406360424,
"grad_norm": 0.6169085502624512,
"learning_rate": 1.6749969184479116e-05,
"loss": 0.2304,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24607014656066895,
"step": 4730
},
{
"epoch": 4.1855123674911665,
"grad_norm": 0.6831942200660706,
"learning_rate": 1.670650724736577e-05,
"loss": 0.2323,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23659402132034302,
"step": 4735
},
{
"epoch": 4.189929328621908,
"grad_norm": 0.6516426205635071,
"learning_rate": 1.66630612909384e-05,
"loss": 0.2365,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2049470990896225,
"step": 4740
},
{
"epoch": 4.19434628975265,
"grad_norm": 0.7550818920135498,
"learning_rate": 1.661963152600549e-05,
"loss": 0.2397,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22731956839561462,
"step": 4745
},
{
"epoch": 4.198763250883392,
"grad_norm": 0.641147255897522,
"learning_rate": 1.657621816329694e-05,
"loss": 0.2502,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20782433450222015,
"step": 4750
},
{
"epoch": 4.203180212014134,
"grad_norm": 0.6107928156852722,
"learning_rate": 1.6532821413463083e-05,
"loss": 0.2476,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23882174491882324,
"step": 4755
},
{
"epoch": 4.207597173144876,
"grad_norm": 0.621147096157074,
"learning_rate": 1.648944148707363e-05,
"loss": 0.2452,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2622299790382385,
"step": 4760
},
{
"epoch": 4.212014134275618,
"grad_norm": 0.7022213339805603,
"learning_rate": 1.6446078594616666e-05,
"loss": 0.2463,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18493574857711792,
"step": 4765
},
{
"epoch": 4.21643109540636,
"grad_norm": 0.6667992472648621,
"learning_rate": 1.640273294649762e-05,
"loss": 0.2593,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3022603392601013,
"step": 4770
},
{
"epoch": 4.220848056537102,
"grad_norm": 0.6353417038917542,
"learning_rate": 1.635940475303826e-05,
"loss": 0.2516,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28547731041908264,
"step": 4775
},
{
"epoch": 4.225265017667844,
"grad_norm": 0.703382670879364,
"learning_rate": 1.631609422447565e-05,
"loss": 0.2362,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18541069328784943,
"step": 4780
},
{
"epoch": 4.229681978798586,
"grad_norm": 0.5781343579292297,
"learning_rate": 1.6272801570961136e-05,
"loss": 0.2179,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21327394247055054,
"step": 4785
},
{
"epoch": 4.2340989399293285,
"grad_norm": 0.628350019454956,
"learning_rate": 1.6229527002559346e-05,
"loss": 0.2669,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22581510245800018,
"step": 4790
},
{
"epoch": 4.238515901060071,
"grad_norm": 0.6018873453140259,
"learning_rate": 1.6186270729247137e-05,
"loss": 0.2395,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2645873725414276,
"step": 4795
},
{
"epoch": 4.242932862190813,
"grad_norm": 0.6109693646430969,
"learning_rate": 1.614303296091262e-05,
"loss": 0.2536,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23807910084724426,
"step": 4800
},
{
"epoch": 4.247349823321555,
"grad_norm": 0.6540105938911438,
"learning_rate": 1.6099813907354077e-05,
"loss": 0.2925,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3849636912345886,
"step": 4805
},
{
"epoch": 4.251766784452297,
"grad_norm": 0.603500485420227,
"learning_rate": 1.6056613778279026e-05,
"loss": 0.253,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2598439157009125,
"step": 4810
},
{
"epoch": 4.256183745583039,
"grad_norm": 0.6518641710281372,
"learning_rate": 1.6013432783303133e-05,
"loss": 0.2903,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30824732780456543,
"step": 4815
},
{
"epoch": 4.260600706713781,
"grad_norm": 0.6308322548866272,
"learning_rate": 1.5970271131949213e-05,
"loss": 0.2959,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22987233102321625,
"step": 4820
},
{
"epoch": 4.265017667844523,
"grad_norm": 0.5965448617935181,
"learning_rate": 1.5927129033646264e-05,
"loss": 0.2509,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3175223171710968,
"step": 4825
},
{
"epoch": 4.269434628975265,
"grad_norm": 0.6295101642608643,
"learning_rate": 1.588400669772836e-05,
"loss": 0.2623,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2827831506729126,
"step": 4830
},
{
"epoch": 4.273851590106007,
"grad_norm": 0.592918872833252,
"learning_rate": 1.5840904333433717e-05,
"loss": 0.2439,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25347912311553955,
"step": 4835
},
{
"epoch": 4.278268551236749,
"grad_norm": 0.6142850518226624,
"learning_rate": 1.5797822149903625e-05,
"loss": 0.2262,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2786719501018524,
"step": 4840
},
{
"epoch": 4.282685512367491,
"grad_norm": 0.6769436597824097,
"learning_rate": 1.575476035618147e-05,
"loss": 0.253,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27305734157562256,
"step": 4845
},
{
"epoch": 4.2871024734982335,
"grad_norm": 0.6391955018043518,
"learning_rate": 1.5711719161211674e-05,
"loss": 0.2378,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2296718955039978,
"step": 4850
},
{
"epoch": 4.291519434628976,
"grad_norm": 0.6884132623672485,
"learning_rate": 1.5668698773838746e-05,
"loss": 0.2877,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24689459800720215,
"step": 4855
},
{
"epoch": 4.295936395759718,
"grad_norm": 0.6045219898223877,
"learning_rate": 1.562569940280622e-05,
"loss": 0.229,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24978384375572205,
"step": 4860
},
{
"epoch": 4.30035335689046,
"grad_norm": 0.8001757860183716,
"learning_rate": 1.5582721256755632e-05,
"loss": 0.243,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16442768275737762,
"step": 4865
},
{
"epoch": 4.304770318021202,
"grad_norm": 0.6849569082260132,
"learning_rate": 1.5539764544225565e-05,
"loss": 0.2499,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1961660087108612,
"step": 4870
},
{
"epoch": 4.309187279151944,
"grad_norm": 0.6818974614143372,
"learning_rate": 1.5496829473650568e-05,
"loss": 0.2427,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24043825268745422,
"step": 4875
},
{
"epoch": 4.313604240282686,
"grad_norm": 0.5977321863174438,
"learning_rate": 1.5453916253360218e-05,
"loss": 0.2614,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2922767102718353,
"step": 4880
},
{
"epoch": 4.318021201413427,
"grad_norm": 0.6175761818885803,
"learning_rate": 1.5411025091578025e-05,
"loss": 0.2386,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21561937034130096,
"step": 4885
},
{
"epoch": 4.322438162544169,
"grad_norm": 0.6610462069511414,
"learning_rate": 1.5368156196420506e-05,
"loss": 0.3025,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3540098965167999,
"step": 4890
},
{
"epoch": 4.326855123674911,
"grad_norm": 0.5722936987876892,
"learning_rate": 1.5325309775896117e-05,
"loss": 0.2698,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2856179475784302,
"step": 4895
},
{
"epoch": 4.331272084805653,
"grad_norm": 0.5998113751411438,
"learning_rate": 1.5282486037904253e-05,
"loss": 0.2487,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23048731684684753,
"step": 4900
},
{
"epoch": 4.3356890459363955,
"grad_norm": 0.5948532819747925,
"learning_rate": 1.5239685190234287e-05,
"loss": 0.2564,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24808341264724731,
"step": 4905
},
{
"epoch": 4.340106007067138,
"grad_norm": 0.6422690749168396,
"learning_rate": 1.519690744056447e-05,
"loss": 0.2617,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20361298322677612,
"step": 4910
},
{
"epoch": 4.34452296819788,
"grad_norm": 0.6684097051620483,
"learning_rate": 1.5154152996461026e-05,
"loss": 0.2334,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17725619673728943,
"step": 4915
},
{
"epoch": 4.348939929328622,
"grad_norm": 0.6210095286369324,
"learning_rate": 1.5111422065377062e-05,
"loss": 0.2756,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34692680835723877,
"step": 4920
},
{
"epoch": 4.353356890459364,
"grad_norm": 0.617612898349762,
"learning_rate": 1.5068714854651614e-05,
"loss": 0.2544,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3084735870361328,
"step": 4925
},
{
"epoch": 4.357773851590106,
"grad_norm": 0.6277985572814941,
"learning_rate": 1.5026031571508606e-05,
"loss": 0.2548,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2271774411201477,
"step": 4930
},
{
"epoch": 4.362190812720848,
"grad_norm": 0.6329308152198792,
"learning_rate": 1.498337242305588e-05,
"loss": 0.239,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20148731768131256,
"step": 4935
},
{
"epoch": 4.36660777385159,
"grad_norm": 0.7278103232383728,
"learning_rate": 1.4940737616284163e-05,
"loss": 0.2697,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24024207890033722,
"step": 4940
},
{
"epoch": 4.371024734982332,
"grad_norm": 0.7309764623641968,
"learning_rate": 1.4898127358066061e-05,
"loss": 0.2612,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26143181324005127,
"step": 4945
},
{
"epoch": 4.375441696113074,
"grad_norm": 0.6814038157463074,
"learning_rate": 1.4855541855155086e-05,
"loss": 0.2499,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26438629627227783,
"step": 4950
},
{
"epoch": 4.379858657243816,
"grad_norm": 0.6568551063537598,
"learning_rate": 1.4812981314184607e-05,
"loss": 0.2344,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26820051670074463,
"step": 4955
},
{
"epoch": 4.384275618374558,
"grad_norm": 0.6261412501335144,
"learning_rate": 1.4770445941666905e-05,
"loss": 0.2311,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.282004714012146,
"step": 4960
},
{
"epoch": 4.3886925795053005,
"grad_norm": 0.8228702545166016,
"learning_rate": 1.4727935943992098e-05,
"loss": 0.244,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24492314457893372,
"step": 4965
},
{
"epoch": 4.3931095406360425,
"grad_norm": 0.6519868969917297,
"learning_rate": 1.4685451527427224e-05,
"loss": 0.2287,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2917710840702057,
"step": 4970
},
{
"epoch": 4.397526501766785,
"grad_norm": 0.5828742980957031,
"learning_rate": 1.4642992898115158e-05,
"loss": 0.246,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21987062692642212,
"step": 4975
},
{
"epoch": 4.401943462897527,
"grad_norm": 0.5715969204902649,
"learning_rate": 1.460056026207367e-05,
"loss": 0.2758,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3032934367656708,
"step": 4980
},
{
"epoch": 4.406360424028269,
"grad_norm": 0.7088951468467712,
"learning_rate": 1.4558153825194419e-05,
"loss": 0.2615,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2616375982761383,
"step": 4985
},
{
"epoch": 4.410777385159011,
"grad_norm": 0.5688840746879578,
"learning_rate": 1.4515773793241898e-05,
"loss": 0.2407,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28225451707839966,
"step": 4990
},
{
"epoch": 4.415194346289753,
"grad_norm": 0.6520532369613647,
"learning_rate": 1.4473420371852526e-05,
"loss": 0.2543,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.302867591381073,
"step": 4995
},
{
"epoch": 4.419611307420495,
"grad_norm": 0.6046358942985535,
"learning_rate": 1.4431093766533567e-05,
"loss": 0.2606,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26887935400009155,
"step": 5000
},
{
"epoch": 4.424028268551237,
"grad_norm": 0.6057432889938354,
"learning_rate": 1.4388794182662186e-05,
"loss": 0.2842,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3114582598209381,
"step": 5005
},
{
"epoch": 4.428445229681979,
"grad_norm": 0.616709291934967,
"learning_rate": 1.4346521825484424e-05,
"loss": 0.2327,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2023238241672516,
"step": 5010
},
{
"epoch": 4.432862190812721,
"grad_norm": 0.5935223698616028,
"learning_rate": 1.4304276900114222e-05,
"loss": 0.248,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3148966431617737,
"step": 5015
},
{
"epoch": 4.4372791519434625,
"grad_norm": 0.5329979658126831,
"learning_rate": 1.4262059611532419e-05,
"loss": 0.2224,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20250526070594788,
"step": 5020
},
{
"epoch": 4.4416961130742045,
"grad_norm": 0.6398733854293823,
"learning_rate": 1.4219870164585739e-05,
"loss": 0.2735,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3195938467979431,
"step": 5025
},
{
"epoch": 4.446113074204947,
"grad_norm": 1.2401176691055298,
"learning_rate": 1.417770876398583e-05,
"loss": 0.2832,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27838078141212463,
"step": 5030
},
{
"epoch": 4.450530035335689,
"grad_norm": 0.8684335947036743,
"learning_rate": 1.4135575614308232e-05,
"loss": 0.2552,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29576045274734497,
"step": 5035
},
{
"epoch": 4.454946996466431,
"grad_norm": 0.709621012210846,
"learning_rate": 1.4093470919991442e-05,
"loss": 0.2892,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29040542244911194,
"step": 5040
},
{
"epoch": 4.459363957597173,
"grad_norm": 0.5984362363815308,
"learning_rate": 1.4051394885335836e-05,
"loss": 0.2772,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21487905085086823,
"step": 5045
},
{
"epoch": 4.463780918727915,
"grad_norm": 0.5817950963973999,
"learning_rate": 1.4009347714502778e-05,
"loss": 0.2243,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2756061553955078,
"step": 5050
},
{
"epoch": 4.468197879858657,
"grad_norm": 0.6032534837722778,
"learning_rate": 1.3967329611513543e-05,
"loss": 0.243,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34259840846061707,
"step": 5055
},
{
"epoch": 4.472614840989399,
"grad_norm": 0.5521541833877563,
"learning_rate": 1.3925340780248373e-05,
"loss": 0.2629,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2480975240468979,
"step": 5060
},
{
"epoch": 4.477031802120141,
"grad_norm": 0.5998513102531433,
"learning_rate": 1.3883381424445506e-05,
"loss": 0.2168,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17923498153686523,
"step": 5065
},
{
"epoch": 4.481448763250883,
"grad_norm": 0.6650380492210388,
"learning_rate": 1.3841451747700098e-05,
"loss": 0.2603,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2855014503002167,
"step": 5070
},
{
"epoch": 4.485865724381625,
"grad_norm": 0.5877347588539124,
"learning_rate": 1.3799551953463362e-05,
"loss": 0.2619,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21608762443065643,
"step": 5075
},
{
"epoch": 4.490282685512367,
"grad_norm": 0.5865316390991211,
"learning_rate": 1.3757682245041466e-05,
"loss": 0.2375,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22935955226421356,
"step": 5080
},
{
"epoch": 4.4946996466431095,
"grad_norm": 0.5989511013031006,
"learning_rate": 1.3715842825594628e-05,
"loss": 0.2427,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2101748287677765,
"step": 5085
},
{
"epoch": 4.499116607773852,
"grad_norm": 0.6382969617843628,
"learning_rate": 1.3674033898136071e-05,
"loss": 0.278,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2421863079071045,
"step": 5090
},
{
"epoch": 4.503533568904594,
"grad_norm": 0.638594388961792,
"learning_rate": 1.3632255665531088e-05,
"loss": 0.2619,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22802847623825073,
"step": 5095
},
{
"epoch": 4.507950530035336,
"grad_norm": 0.6239488124847412,
"learning_rate": 1.3590508330496027e-05,
"loss": 0.2318,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2688833773136139,
"step": 5100
},
{
"epoch": 4.512367491166078,
"grad_norm": 0.6269590258598328,
"learning_rate": 1.3548792095597305e-05,
"loss": 0.2235,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23217537999153137,
"step": 5105
},
{
"epoch": 4.51678445229682,
"grad_norm": 0.6065074801445007,
"learning_rate": 1.3507107163250453e-05,
"loss": 0.2419,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2584981918334961,
"step": 5110
},
{
"epoch": 4.521201413427562,
"grad_norm": 0.6206178665161133,
"learning_rate": 1.3465453735719087e-05,
"loss": 0.2439,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2482869029045105,
"step": 5115
},
{
"epoch": 4.525618374558304,
"grad_norm": 0.6238287091255188,
"learning_rate": 1.3423832015114e-05,
"loss": 0.2832,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31270158290863037,
"step": 5120
},
{
"epoch": 4.530035335689046,
"grad_norm": 0.5761831998825073,
"learning_rate": 1.3382242203392083e-05,
"loss": 0.2112,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1838235706090927,
"step": 5125
},
{
"epoch": 4.534452296819788,
"grad_norm": 0.636617124080658,
"learning_rate": 1.3340684502355443e-05,
"loss": 0.2808,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3398388922214508,
"step": 5130
},
{
"epoch": 4.53886925795053,
"grad_norm": 0.6107141971588135,
"learning_rate": 1.3299159113650357e-05,
"loss": 0.2751,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26246610283851624,
"step": 5135
},
{
"epoch": 4.543286219081272,
"grad_norm": 0.6003983020782471,
"learning_rate": 1.325766623876632e-05,
"loss": 0.2733,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27694079279899597,
"step": 5140
},
{
"epoch": 4.5477031802120145,
"grad_norm": 0.6244370937347412,
"learning_rate": 1.321620607903508e-05,
"loss": 0.261,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26806601881980896,
"step": 5145
},
{
"epoch": 4.5521201413427566,
"grad_norm": 0.6757631897926331,
"learning_rate": 1.3174778835629605e-05,
"loss": 0.2636,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27561718225479126,
"step": 5150
},
{
"epoch": 4.556537102473499,
"grad_norm": 0.6886018514633179,
"learning_rate": 1.3133384709563188e-05,
"loss": 0.2509,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24703247845172882,
"step": 5155
},
{
"epoch": 4.560954063604241,
"grad_norm": 0.6107934713363647,
"learning_rate": 1.309202390168841e-05,
"loss": 0.2404,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28265172243118286,
"step": 5160
},
{
"epoch": 4.565371024734983,
"grad_norm": 0.7151978015899658,
"learning_rate": 1.3050696612696188e-05,
"loss": 0.2432,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25835981965065,
"step": 5165
},
{
"epoch": 4.569787985865725,
"grad_norm": 0.5881211161613464,
"learning_rate": 1.3009403043114796e-05,
"loss": 0.26,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2113930583000183,
"step": 5170
},
{
"epoch": 4.574204946996466,
"grad_norm": 0.7579076886177063,
"learning_rate": 1.2968143393308897e-05,
"loss": 0.2324,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3218274712562561,
"step": 5175
},
{
"epoch": 4.578621908127208,
"grad_norm": 0.5710316896438599,
"learning_rate": 1.2926917863478581e-05,
"loss": 0.2566,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2202773243188858,
"step": 5180
},
{
"epoch": 4.58303886925795,
"grad_norm": 0.6459076404571533,
"learning_rate": 1.2885726653658355e-05,
"loss": 0.2738,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2878916263580322,
"step": 5185
},
{
"epoch": 4.587455830388692,
"grad_norm": 0.5926949381828308,
"learning_rate": 1.2844569963716222e-05,
"loss": 0.2545,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23166950047016144,
"step": 5190
},
{
"epoch": 4.591872791519434,
"grad_norm": 0.6707189083099365,
"learning_rate": 1.280344799335267e-05,
"loss": 0.2543,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25186440348625183,
"step": 5195
},
{
"epoch": 4.5962897526501765,
"grad_norm": 0.573384165763855,
"learning_rate": 1.2762360942099745e-05,
"loss": 0.2415,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24173548817634583,
"step": 5200
},
{
"epoch": 4.6007067137809186,
"grad_norm": 0.6182858347892761,
"learning_rate": 1.2721309009320021e-05,
"loss": 0.2502,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23106129467487335,
"step": 5205
},
{
"epoch": 4.605123674911661,
"grad_norm": 0.6368361115455627,
"learning_rate": 1.268029239420571e-05,
"loss": 0.295,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2934111952781677,
"step": 5210
},
{
"epoch": 4.609540636042403,
"grad_norm": 0.6030951738357544,
"learning_rate": 1.2639311295777632e-05,
"loss": 0.2495,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2604959309101105,
"step": 5215
},
{
"epoch": 4.613957597173145,
"grad_norm": 0.6463732123374939,
"learning_rate": 1.2598365912884267e-05,
"loss": 0.2556,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25189077854156494,
"step": 5220
},
{
"epoch": 4.618374558303887,
"grad_norm": 0.5990574359893799,
"learning_rate": 1.2557456444200831e-05,
"loss": 0.296,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2886963486671448,
"step": 5225
},
{
"epoch": 4.622791519434629,
"grad_norm": 0.6595236659049988,
"learning_rate": 1.2516583088228224e-05,
"loss": 0.2777,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17959409952163696,
"step": 5230
},
{
"epoch": 4.627208480565371,
"grad_norm": 0.5914446711540222,
"learning_rate": 1.2475746043292176e-05,
"loss": 0.2595,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2113860547542572,
"step": 5235
},
{
"epoch": 4.631625441696113,
"grad_norm": 0.6367279887199402,
"learning_rate": 1.243494550754219e-05,
"loss": 0.2763,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2789275348186493,
"step": 5240
},
{
"epoch": 4.636042402826855,
"grad_norm": 0.6216886043548584,
"learning_rate": 1.239418167895063e-05,
"loss": 0.2777,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2759518623352051,
"step": 5245
},
{
"epoch": 4.640459363957597,
"grad_norm": 0.689033031463623,
"learning_rate": 1.2353454755311751e-05,
"loss": 0.2444,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25160372257232666,
"step": 5250
},
{
"epoch": 4.644876325088339,
"grad_norm": 0.5909837484359741,
"learning_rate": 1.2312764934240735e-05,
"loss": 0.2706,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26029708981513977,
"step": 5255
},
{
"epoch": 4.6492932862190814,
"grad_norm": 0.6285930275917053,
"learning_rate": 1.227211241317275e-05,
"loss": 0.2523,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27858561277389526,
"step": 5260
},
{
"epoch": 4.6537102473498235,
"grad_norm": 0.6383758783340454,
"learning_rate": 1.223149738936195e-05,
"loss": 0.2539,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2381717562675476,
"step": 5265
},
{
"epoch": 4.658127208480566,
"grad_norm": 0.6008221507072449,
"learning_rate": 1.219092005988057e-05,
"loss": 0.2527,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2692645490169525,
"step": 5270
},
{
"epoch": 4.662544169611308,
"grad_norm": 0.6864904761314392,
"learning_rate": 1.215038062161792e-05,
"loss": 0.2383,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24149903655052185,
"step": 5275
},
{
"epoch": 4.66696113074205,
"grad_norm": 0.6051673889160156,
"learning_rate": 1.2109879271279486e-05,
"loss": 0.2567,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2649880647659302,
"step": 5280
},
{
"epoch": 4.671378091872792,
"grad_norm": 0.6154433488845825,
"learning_rate": 1.2069416205385902e-05,
"loss": 0.2271,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19507455825805664,
"step": 5285
},
{
"epoch": 4.675795053003534,
"grad_norm": 0.6651699542999268,
"learning_rate": 1.2028991620272081e-05,
"loss": 0.2139,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19150885939598083,
"step": 5290
},
{
"epoch": 4.680212014134275,
"grad_norm": 0.6139252185821533,
"learning_rate": 1.1988605712086199e-05,
"loss": 0.2503,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.201002836227417,
"step": 5295
},
{
"epoch": 4.684628975265017,
"grad_norm": 0.6537069082260132,
"learning_rate": 1.1948258676788751e-05,
"loss": 0.269,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33721426129341125,
"step": 5300
},
{
"epoch": 4.689045936395759,
"grad_norm": 0.6756031513214111,
"learning_rate": 1.190795071015165e-05,
"loss": 0.2719,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34285324811935425,
"step": 5305
},
{
"epoch": 4.693462897526501,
"grad_norm": 0.5824704170227051,
"learning_rate": 1.1867682007757191e-05,
"loss": 0.2423,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2109375149011612,
"step": 5310
},
{
"epoch": 4.6978798586572434,
"grad_norm": 0.6546387076377869,
"learning_rate": 1.1827452764997198e-05,
"loss": 0.2419,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17242984473705292,
"step": 5315
},
{
"epoch": 4.7022968197879855,
"grad_norm": 0.698542058467865,
"learning_rate": 1.1787263177071997e-05,
"loss": 0.2423,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18797524273395538,
"step": 5320
},
{
"epoch": 4.706713780918728,
"grad_norm": 0.7340909242630005,
"learning_rate": 1.174711343898952e-05,
"loss": 0.234,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19447045028209686,
"step": 5325
},
{
"epoch": 4.71113074204947,
"grad_norm": 0.5774402022361755,
"learning_rate": 1.1707003745564319e-05,
"loss": 0.232,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24991655349731445,
"step": 5330
},
{
"epoch": 4.715547703180212,
"grad_norm": 0.6813011169433594,
"learning_rate": 1.1666934291416666e-05,
"loss": 0.2505,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2309049814939499,
"step": 5335
},
{
"epoch": 4.719964664310954,
"grad_norm": 0.6249779462814331,
"learning_rate": 1.1626905270971563e-05,
"loss": 0.2342,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20291580259799957,
"step": 5340
},
{
"epoch": 4.724381625441696,
"grad_norm": 0.6181374788284302,
"learning_rate": 1.1586916878457837e-05,
"loss": 0.235,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34734612703323364,
"step": 5345
},
{
"epoch": 4.728798586572438,
"grad_norm": 0.5957716107368469,
"learning_rate": 1.1546969307907162e-05,
"loss": 0.2824,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21913906931877136,
"step": 5350
},
{
"epoch": 4.73321554770318,
"grad_norm": 0.6221516132354736,
"learning_rate": 1.1507062753153155e-05,
"loss": 0.2466,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20802852511405945,
"step": 5355
},
{
"epoch": 4.737632508833922,
"grad_norm": 0.6126749515533447,
"learning_rate": 1.1467197407830409e-05,
"loss": 0.2835,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3353455662727356,
"step": 5360
},
{
"epoch": 4.742049469964664,
"grad_norm": 0.6599173545837402,
"learning_rate": 1.1427373465373541e-05,
"loss": 0.2764,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.289046049118042,
"step": 5365
},
{
"epoch": 4.746466431095406,
"grad_norm": 0.5880971550941467,
"learning_rate": 1.1387591119016292e-05,
"loss": 0.2267,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2312006950378418,
"step": 5370
},
{
"epoch": 4.750883392226148,
"grad_norm": 0.6444661617279053,
"learning_rate": 1.1347850561790594e-05,
"loss": 0.2895,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2494499385356903,
"step": 5375
},
{
"epoch": 4.7553003533568905,
"grad_norm": 0.5412120819091797,
"learning_rate": 1.1308151986525557e-05,
"loss": 0.2552,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20912671089172363,
"step": 5380
},
{
"epoch": 4.759717314487633,
"grad_norm": 0.5995916724205017,
"learning_rate": 1.1268495585846621e-05,
"loss": 0.2509,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21118956804275513,
"step": 5385
},
{
"epoch": 4.764134275618375,
"grad_norm": 0.6803336143493652,
"learning_rate": 1.1228881552174585e-05,
"loss": 0.2416,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2551300525665283,
"step": 5390
},
{
"epoch": 4.768551236749117,
"grad_norm": 0.6087960600852966,
"learning_rate": 1.1189310077724667e-05,
"loss": 0.2682,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23086312413215637,
"step": 5395
},
{
"epoch": 4.772968197879859,
"grad_norm": 0.6534774303436279,
"learning_rate": 1.1149781354505565e-05,
"loss": 0.2789,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2714969515800476,
"step": 5400
},
{
"epoch": 4.777385159010601,
"grad_norm": 0.5906716585159302,
"learning_rate": 1.111029557431858e-05,
"loss": 0.2106,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25104430317878723,
"step": 5405
},
{
"epoch": 4.781802120141343,
"grad_norm": 0.6094644665718079,
"learning_rate": 1.1070852928756598e-05,
"loss": 0.2423,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34287014603614807,
"step": 5410
},
{
"epoch": 4.786219081272085,
"grad_norm": 0.6480569243431091,
"learning_rate": 1.1031453609203244e-05,
"loss": 0.2544,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24471309781074524,
"step": 5415
},
{
"epoch": 4.790636042402827,
"grad_norm": 0.6177884936332703,
"learning_rate": 1.0992097806831894e-05,
"loss": 0.2405,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2734081745147705,
"step": 5420
},
{
"epoch": 4.795053003533569,
"grad_norm": 0.666823148727417,
"learning_rate": 1.0952785712604777e-05,
"loss": 0.2846,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23860061168670654,
"step": 5425
},
{
"epoch": 4.799469964664311,
"grad_norm": 0.6771073937416077,
"learning_rate": 1.0913517517272057e-05,
"loss": 0.2523,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2811965048313141,
"step": 5430
},
{
"epoch": 4.803886925795053,
"grad_norm": 0.6511576771736145,
"learning_rate": 1.0874293411370847e-05,
"loss": 0.251,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2048361599445343,
"step": 5435
},
{
"epoch": 4.8083038869257955,
"grad_norm": 0.622969925403595,
"learning_rate": 1.083511358522439e-05,
"loss": 0.2611,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25874412059783936,
"step": 5440
},
{
"epoch": 4.8127208480565375,
"grad_norm": 0.6246639490127563,
"learning_rate": 1.0795978228941025e-05,
"loss": 0.2497,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28046876192092896,
"step": 5445
},
{
"epoch": 4.81713780918728,
"grad_norm": 0.5713747143745422,
"learning_rate": 1.0756887532413328e-05,
"loss": 0.2686,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20135197043418884,
"step": 5450
},
{
"epoch": 4.821554770318021,
"grad_norm": 0.6463353037834167,
"learning_rate": 1.0717841685317207e-05,
"loss": 0.258,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24732375144958496,
"step": 5455
},
{
"epoch": 4.825971731448763,
"grad_norm": 0.6025271415710449,
"learning_rate": 1.0678840877110906e-05,
"loss": 0.2883,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2834791839122772,
"step": 5460
},
{
"epoch": 4.830388692579505,
"grad_norm": 0.5726755261421204,
"learning_rate": 1.0639885297034157e-05,
"loss": 0.2302,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23436738550662994,
"step": 5465
},
{
"epoch": 4.834805653710247,
"grad_norm": 0.6157001852989197,
"learning_rate": 1.060097513410723e-05,
"loss": 0.2783,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25192394852638245,
"step": 5470
},
{
"epoch": 4.839222614840989,
"grad_norm": 0.6347540020942688,
"learning_rate": 1.0562110577130031e-05,
"loss": 0.2754,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.324943482875824,
"step": 5475
},
{
"epoch": 4.843639575971731,
"grad_norm": 0.5740549564361572,
"learning_rate": 1.0523291814681149e-05,
"loss": 0.2622,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25427937507629395,
"step": 5480
},
{
"epoch": 4.848056537102473,
"grad_norm": 0.5769616961479187,
"learning_rate": 1.0484519035117015e-05,
"loss": 0.2484,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2660577595233917,
"step": 5485
},
{
"epoch": 4.852473498233215,
"grad_norm": 0.7008361220359802,
"learning_rate": 1.0445792426570894e-05,
"loss": 0.2689,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2324357032775879,
"step": 5490
},
{
"epoch": 4.8568904593639575,
"grad_norm": 0.6129425764083862,
"learning_rate": 1.040711217695205e-05,
"loss": 0.2246,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.245370551943779,
"step": 5495
},
{
"epoch": 4.8613074204946995,
"grad_norm": 0.6054858565330505,
"learning_rate": 1.0368478473944792e-05,
"loss": 0.2404,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23680360615253448,
"step": 5500
},
{
"epoch": 4.865724381625442,
"grad_norm": 0.5556029081344604,
"learning_rate": 1.0329891505007582e-05,
"loss": 0.242,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21621742844581604,
"step": 5505
},
{
"epoch": 4.870141342756184,
"grad_norm": 0.5951539874076843,
"learning_rate": 1.029135145737212e-05,
"loss": 0.2418,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24147434532642365,
"step": 5510
},
{
"epoch": 4.874558303886926,
"grad_norm": 0.6369734406471252,
"learning_rate": 1.0252858518042413e-05,
"loss": 0.253,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2957211136817932,
"step": 5515
},
{
"epoch": 4.878975265017668,
"grad_norm": 0.6428771018981934,
"learning_rate": 1.0214412873793931e-05,
"loss": 0.2393,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20931799709796906,
"step": 5520
},
{
"epoch": 4.88339222614841,
"grad_norm": 0.591044008731842,
"learning_rate": 1.0176014711172615e-05,
"loss": 0.2694,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25282272696495056,
"step": 5525
},
{
"epoch": 4.887809187279152,
"grad_norm": 0.5807618498802185,
"learning_rate": 1.0137664216494035e-05,
"loss": 0.2504,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26002442836761475,
"step": 5530
},
{
"epoch": 4.892226148409894,
"grad_norm": 0.5614008903503418,
"learning_rate": 1.0099361575842486e-05,
"loss": 0.2173,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2629309296607971,
"step": 5535
},
{
"epoch": 4.896643109540636,
"grad_norm": 0.5964021682739258,
"learning_rate": 1.0061106975070025e-05,
"loss": 0.2604,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3080860376358032,
"step": 5540
},
{
"epoch": 4.901060070671378,
"grad_norm": 0.6463096141815186,
"learning_rate": 1.0022900599795641e-05,
"loss": 0.3287,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2719125747680664,
"step": 5545
},
{
"epoch": 4.90547703180212,
"grad_norm": 0.6948069334030151,
"learning_rate": 9.984742635404313e-06,
"loss": 0.2694,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2654094398021698,
"step": 5550
},
{
"epoch": 4.909893992932862,
"grad_norm": 0.9675594568252563,
"learning_rate": 9.946633267046125e-06,
"loss": 0.2607,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2373758852481842,
"step": 5555
},
{
"epoch": 4.9143109540636045,
"grad_norm": 0.6239486336708069,
"learning_rate": 9.908572679635337e-06,
"loss": 0.2684,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19304318726062775,
"step": 5560
},
{
"epoch": 4.918727915194347,
"grad_norm": 0.6223797798156738,
"learning_rate": 9.87056105784957e-06,
"loss": 0.2712,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2559204697608948,
"step": 5565
},
{
"epoch": 4.923144876325089,
"grad_norm": 0.5596070885658264,
"learning_rate": 9.832598586128796e-06,
"loss": 0.3051,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24063706398010254,
"step": 5570
},
{
"epoch": 4.927561837455831,
"grad_norm": 0.7187504768371582,
"learning_rate": 9.794685448674533e-06,
"loss": 0.2447,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2679745554924011,
"step": 5575
},
{
"epoch": 4.931978798586572,
"grad_norm": 0.6528657078742981,
"learning_rate": 9.756821829448911e-06,
"loss": 0.2278,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2918229103088379,
"step": 5580
},
{
"epoch": 4.936395759717314,
"grad_norm": 0.5671543478965759,
"learning_rate": 9.719007912173786e-06,
"loss": 0.2551,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24266092479228973,
"step": 5585
},
{
"epoch": 4.940812720848056,
"grad_norm": 0.5972075462341309,
"learning_rate": 9.681243880329864e-06,
"loss": 0.2973,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26102444529533386,
"step": 5590
},
{
"epoch": 4.945229681978798,
"grad_norm": 0.6930065751075745,
"learning_rate": 9.643529917155765e-06,
"loss": 0.2431,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15023840963840485,
"step": 5595
},
{
"epoch": 4.94964664310954,
"grad_norm": 0.5619693994522095,
"learning_rate": 9.60586620564721e-06,
"loss": 0.2326,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17770014703273773,
"step": 5600
},
{
"epoch": 4.954063604240282,
"grad_norm": 0.683238685131073,
"learning_rate": 9.568252928556045e-06,
"loss": 0.2577,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27513328194618225,
"step": 5605
},
{
"epoch": 4.958480565371024,
"grad_norm": 0.6224611401557922,
"learning_rate": 9.530690268389419e-06,
"loss": 0.2536,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2843214273452759,
"step": 5610
},
{
"epoch": 4.9628975265017665,
"grad_norm": 0.6748574376106262,
"learning_rate": 9.493178407408898e-06,
"loss": 0.2502,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3191365599632263,
"step": 5615
},
{
"epoch": 4.967314487632509,
"grad_norm": 0.6514118313789368,
"learning_rate": 9.45571752762952e-06,
"loss": 0.2612,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27559196949005127,
"step": 5620
},
{
"epoch": 4.971731448763251,
"grad_norm": 0.5978466868400574,
"learning_rate": 9.418307810818974e-06,
"loss": 0.2224,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2235470712184906,
"step": 5625
},
{
"epoch": 4.976148409893993,
"grad_norm": 0.8135108351707458,
"learning_rate": 9.380949438496694e-06,
"loss": 0.2443,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21574482321739197,
"step": 5630
},
{
"epoch": 4.980565371024735,
"grad_norm": 0.5374975204467773,
"learning_rate": 9.343642591932986e-06,
"loss": 0.2481,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24713656306266785,
"step": 5635
},
{
"epoch": 4.984982332155477,
"grad_norm": 0.6835610866546631,
"learning_rate": 9.306387452148117e-06,
"loss": 0.2555,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2892993092536926,
"step": 5640
},
{
"epoch": 4.989399293286219,
"grad_norm": 0.6098293662071228,
"learning_rate": 9.269184199911507e-06,
"loss": 0.2758,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2980913519859314,
"step": 5645
},
{
"epoch": 4.993816254416961,
"grad_norm": 0.5952140092849731,
"learning_rate": 9.232033015740765e-06,
"loss": 0.283,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3264719247817993,
"step": 5650
},
{
"epoch": 4.998233215547703,
"grad_norm": 0.6798433661460876,
"learning_rate": 9.19493407990087e-06,
"loss": 0.2749,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2580171823501587,
"step": 5655
},
{
"epoch": 5.003533568904594,
"grad_norm": 0.5702685117721558,
"learning_rate": 9.157887572403292e-06,
"loss": 0.2212,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20382651686668396,
"step": 5660
},
{
"epoch": 5.007950530035336,
"grad_norm": 0.692338764667511,
"learning_rate": 9.120893673005095e-06,
"loss": 0.2174,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19380773603916168,
"step": 5665
},
{
"epoch": 5.012367491166078,
"grad_norm": 0.5917826890945435,
"learning_rate": 9.083952561208093e-06,
"loss": 0.2525,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21118654310703278,
"step": 5670
},
{
"epoch": 5.01678445229682,
"grad_norm": 0.6775484681129456,
"learning_rate": 9.04706441625793e-06,
"loss": 0.2458,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3707857131958008,
"step": 5675
},
{
"epoch": 5.021201413427562,
"grad_norm": 0.6675564646720886,
"learning_rate": 9.010229417143298e-06,
"loss": 0.2348,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18382994830608368,
"step": 5680
},
{
"epoch": 5.025618374558304,
"grad_norm": 0.6618363261222839,
"learning_rate": 8.973447742594959e-06,
"loss": 0.2264,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22905008494853973,
"step": 5685
},
{
"epoch": 5.030035335689046,
"grad_norm": 0.6834167242050171,
"learning_rate": 8.936719571084964e-06,
"loss": 0.2468,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16894429922103882,
"step": 5690
},
{
"epoch": 5.034452296819788,
"grad_norm": 1.0125937461853027,
"learning_rate": 8.900045080825772e-06,
"loss": 0.1942,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1643485128879547,
"step": 5695
},
{
"epoch": 5.03886925795053,
"grad_norm": 0.66016685962677,
"learning_rate": 8.863424449769326e-06,
"loss": 0.2063,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19492757320404053,
"step": 5700
},
{
"epoch": 5.043286219081272,
"grad_norm": 0.6835595369338989,
"learning_rate": 8.826857855606268e-06,
"loss": 0.2236,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2136615514755249,
"step": 5705
},
{
"epoch": 5.0477031802120145,
"grad_norm": 0.6571291089057922,
"learning_rate": 8.790345475765028e-06,
"loss": 0.2325,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31727343797683716,
"step": 5710
},
{
"epoch": 5.0521201413427566,
"grad_norm": 0.6805379390716553,
"learning_rate": 8.753887487410988e-06,
"loss": 0.244,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2703685164451599,
"step": 5715
},
{
"epoch": 5.056537102473499,
"grad_norm": 0.6848156452178955,
"learning_rate": 8.71748406744559e-06,
"loss": 0.2635,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2362714409828186,
"step": 5720
},
{
"epoch": 5.060954063604241,
"grad_norm": 0.6533686518669128,
"learning_rate": 8.681135392505521e-06,
"loss": 0.2934,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33763277530670166,
"step": 5725
},
{
"epoch": 5.065371024734982,
"grad_norm": 0.7272844910621643,
"learning_rate": 8.644841638961827e-06,
"loss": 0.2103,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16482684016227722,
"step": 5730
},
{
"epoch": 5.069787985865724,
"grad_norm": 0.6510602831840515,
"learning_rate": 8.608602982919061e-06,
"loss": 0.2306,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1856457144021988,
"step": 5735
},
{
"epoch": 5.074204946996466,
"grad_norm": 0.7189889550209045,
"learning_rate": 8.57241960021444e-06,
"loss": 0.2622,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23383861780166626,
"step": 5740
},
{
"epoch": 5.078621908127208,
"grad_norm": 0.6912586092948914,
"learning_rate": 8.536291666416971e-06,
"loss": 0.2268,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1808895766735077,
"step": 5745
},
{
"epoch": 5.08303886925795,
"grad_norm": 0.6760678291320801,
"learning_rate": 8.500219356826633e-06,
"loss": 0.2813,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3097551465034485,
"step": 5750
},
{
"epoch": 5.087455830388692,
"grad_norm": 0.6680203080177307,
"learning_rate": 8.464202846473467e-06,
"loss": 0.2059,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20965927839279175,
"step": 5755
},
{
"epoch": 5.091872791519434,
"grad_norm": 0.6519942879676819,
"learning_rate": 8.428242310116817e-06,
"loss": 0.2579,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28410807251930237,
"step": 5760
},
{
"epoch": 5.0962897526501765,
"grad_norm": 0.7109984755516052,
"learning_rate": 8.392337922244383e-06,
"loss": 0.2401,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19458754360675812,
"step": 5765
},
{
"epoch": 5.1007067137809186,
"grad_norm": 4.666537284851074,
"learning_rate": 8.35648985707144e-06,
"loss": 0.2129,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21463000774383545,
"step": 5770
},
{
"epoch": 5.105123674911661,
"grad_norm": 0.7026737928390503,
"learning_rate": 8.320698288539997e-06,
"loss": 0.2755,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1861979365348816,
"step": 5775
},
{
"epoch": 5.109540636042403,
"grad_norm": 0.7229135632514954,
"learning_rate": 8.284963390317885e-06,
"loss": 0.1896,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17001771926879883,
"step": 5780
},
{
"epoch": 5.113957597173145,
"grad_norm": 0.7189328670501709,
"learning_rate": 8.24928533579799e-06,
"loss": 0.2427,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2957168519496918,
"step": 5785
},
{
"epoch": 5.118374558303887,
"grad_norm": 0.6980794668197632,
"learning_rate": 8.21366429809737e-06,
"loss": 0.2294,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27361002564430237,
"step": 5790
},
{
"epoch": 5.122791519434629,
"grad_norm": 0.6661799550056458,
"learning_rate": 8.17810045005644e-06,
"loss": 0.2844,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26608705520629883,
"step": 5795
},
{
"epoch": 5.127208480565371,
"grad_norm": 0.6025856733322144,
"learning_rate": 8.142593964238092e-06,
"loss": 0.2243,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23461788892745972,
"step": 5800
},
{
"epoch": 5.131625441696113,
"grad_norm": 0.6775685548782349,
"learning_rate": 8.107145012926909e-06,
"loss": 0.2261,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20733845233917236,
"step": 5805
},
{
"epoch": 5.136042402826855,
"grad_norm": 0.6828117966651917,
"learning_rate": 8.071753768128299e-06,
"loss": 0.2198,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19489261507987976,
"step": 5810
},
{
"epoch": 5.140459363957597,
"grad_norm": 0.6730659604072571,
"learning_rate": 8.036420401567662e-06,
"loss": 0.236,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2441500872373581,
"step": 5815
},
{
"epoch": 5.144876325088339,
"grad_norm": 0.6313605904579163,
"learning_rate": 8.001145084689563e-06,
"loss": 0.2842,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26540714502334595,
"step": 5820
},
{
"epoch": 5.1492932862190814,
"grad_norm": 0.6368901133537292,
"learning_rate": 7.965927988656903e-06,
"loss": 0.2422,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2015400528907776,
"step": 5825
},
{
"epoch": 5.1537102473498235,
"grad_norm": 0.7787892818450928,
"learning_rate": 7.930769284350084e-06,
"loss": 0.2294,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21253234148025513,
"step": 5830
},
{
"epoch": 5.158127208480566,
"grad_norm": 0.7237471342086792,
"learning_rate": 7.895669142366159e-06,
"loss": 0.2181,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29186874628067017,
"step": 5835
},
{
"epoch": 5.162544169611308,
"grad_norm": 0.677948534488678,
"learning_rate": 7.860627733018065e-06,
"loss": 0.2292,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22216159105300903,
"step": 5840
},
{
"epoch": 5.16696113074205,
"grad_norm": 0.6752915382385254,
"learning_rate": 7.825645226333714e-06,
"loss": 0.219,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18272480368614197,
"step": 5845
},
{
"epoch": 5.171378091872792,
"grad_norm": 0.6883508563041687,
"learning_rate": 7.79072179205523e-06,
"loss": 0.2321,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3108995854854584,
"step": 5850
},
{
"epoch": 5.175795053003534,
"grad_norm": 0.6497143507003784,
"learning_rate": 7.755857599638124e-06,
"loss": 0.2032,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24391454458236694,
"step": 5855
},
{
"epoch": 5.180212014134276,
"grad_norm": 0.6400555372238159,
"learning_rate": 7.721052818250419e-06,
"loss": 0.2759,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.252552330493927,
"step": 5860
},
{
"epoch": 5.184628975265018,
"grad_norm": 0.624398946762085,
"learning_rate": 7.686307616771883e-06,
"loss": 0.2458,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2461286336183548,
"step": 5865
},
{
"epoch": 5.189045936395759,
"grad_norm": 0.6464707255363464,
"learning_rate": 7.651622163793189e-06,
"loss": 0.247,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23976178467273712,
"step": 5870
},
{
"epoch": 5.193462897526501,
"grad_norm": 0.6472598314285278,
"learning_rate": 7.616996627615103e-06,
"loss": 0.2295,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2011461853981018,
"step": 5875
},
{
"epoch": 5.1978798586572434,
"grad_norm": 0.7136189341545105,
"learning_rate": 7.582431176247642e-06,
"loss": 0.2714,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19472795724868774,
"step": 5880
},
{
"epoch": 5.2022968197879855,
"grad_norm": 0.6750161051750183,
"learning_rate": 7.547925977409301e-06,
"loss": 0.2119,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20839814841747284,
"step": 5885
},
{
"epoch": 5.206713780918728,
"grad_norm": 0.6782904267311096,
"learning_rate": 7.5134811985262115e-06,
"loss": 0.2842,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19967156648635864,
"step": 5890
},
{
"epoch": 5.21113074204947,
"grad_norm": 0.5999016761779785,
"learning_rate": 7.479097006731333e-06,
"loss": 0.2569,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2858957052230835,
"step": 5895
},
{
"epoch": 5.215547703180212,
"grad_norm": 0.7441349625587463,
"learning_rate": 7.444773568863646e-06,
"loss": 0.2368,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2590683400630951,
"step": 5900
},
{
"epoch": 5.219964664310954,
"grad_norm": 0.7110154032707214,
"learning_rate": 7.410511051467339e-06,
"loss": 0.247,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17946434020996094,
"step": 5905
},
{
"epoch": 5.224381625441696,
"grad_norm": 0.6729068756103516,
"learning_rate": 7.376309620791016e-06,
"loss": 0.1984,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24539992213249207,
"step": 5910
},
{
"epoch": 5.228798586572438,
"grad_norm": 0.7076547145843506,
"learning_rate": 7.342169442786835e-06,
"loss": 0.2352,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22367143630981445,
"step": 5915
},
{
"epoch": 5.23321554770318,
"grad_norm": 0.6758275032043457,
"learning_rate": 7.308090683109803e-06,
"loss": 0.2557,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.222344771027565,
"step": 5920
},
{
"epoch": 5.237632508833922,
"grad_norm": 0.5967568755149841,
"learning_rate": 7.274073507116865e-06,
"loss": 0.2537,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29815196990966797,
"step": 5925
},
{
"epoch": 5.242049469964664,
"grad_norm": 0.5832816958427429,
"learning_rate": 7.240118079866163e-06,
"loss": 0.2208,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2399141788482666,
"step": 5930
},
{
"epoch": 5.246466431095406,
"grad_norm": 0.6074890494346619,
"learning_rate": 7.206224566116247e-06,
"loss": 0.2618,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19510237872600555,
"step": 5935
},
{
"epoch": 5.250883392226148,
"grad_norm": 0.6020046472549438,
"learning_rate": 7.172393130325208e-06,
"loss": 0.2298,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22199654579162598,
"step": 5940
},
{
"epoch": 5.2553003533568905,
"grad_norm": 0.6055787801742554,
"learning_rate": 7.138623936649951e-06,
"loss": 0.2273,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2283298373222351,
"step": 5945
},
{
"epoch": 5.259717314487633,
"grad_norm": 0.6854278445243835,
"learning_rate": 7.104917148945363e-06,
"loss": 0.2597,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23151074349880219,
"step": 5950
},
{
"epoch": 5.264134275618375,
"grad_norm": 0.6667275428771973,
"learning_rate": 7.0712729307635284e-06,
"loss": 0.2464,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24305643141269684,
"step": 5955
},
{
"epoch": 5.268551236749117,
"grad_norm": 0.6479029655456543,
"learning_rate": 7.037691445352917e-06,
"loss": 0.2446,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2675766944885254,
"step": 5960
},
{
"epoch": 5.272968197879859,
"grad_norm": 0.6230666041374207,
"learning_rate": 7.00417285565762e-06,
"loss": 0.2521,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20548516511917114,
"step": 5965
},
{
"epoch": 5.277385159010601,
"grad_norm": 0.6456372737884521,
"learning_rate": 6.970717324316545e-06,
"loss": 0.2666,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2712244689464569,
"step": 5970
},
{
"epoch": 5.281802120141343,
"grad_norm": 0.6794562339782715,
"learning_rate": 6.937325013662623e-06,
"loss": 0.2322,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29143932461738586,
"step": 5975
},
{
"epoch": 5.286219081272085,
"grad_norm": 0.6608504056930542,
"learning_rate": 6.903996085722033e-06,
"loss": 0.233,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20360919833183289,
"step": 5980
},
{
"epoch": 5.290636042402827,
"grad_norm": 0.6932647824287415,
"learning_rate": 6.8707307022134e-06,
"loss": 0.2401,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2070590704679489,
"step": 5985
},
{
"epoch": 5.295053003533569,
"grad_norm": 0.631776750087738,
"learning_rate": 6.8375290245470296e-06,
"loss": 0.2582,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2469078004360199,
"step": 5990
},
{
"epoch": 5.299469964664311,
"grad_norm": 0.6876675486564636,
"learning_rate": 6.804391213824087e-06,
"loss": 0.2549,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2884756028652191,
"step": 5995
},
{
"epoch": 5.303886925795053,
"grad_norm": 0.6492973566055298,
"learning_rate": 6.771317430835888e-06,
"loss": 0.2322,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1926867663860321,
"step": 6000
},
{
"epoch": 5.3083038869257955,
"grad_norm": 0.6198075413703918,
"learning_rate": 6.73830783606303e-06,
"loss": 0.2568,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3009505867958069,
"step": 6005
},
{
"epoch": 5.3127208480565375,
"grad_norm": 0.6509351134300232,
"learning_rate": 6.705362589674667e-06,
"loss": 0.2372,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22769448161125183,
"step": 6010
},
{
"epoch": 5.317137809187279,
"grad_norm": 0.6019710302352905,
"learning_rate": 6.6724818515277544e-06,
"loss": 0.2299,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2702226936817169,
"step": 6015
},
{
"epoch": 5.321554770318021,
"grad_norm": 0.6558071970939636,
"learning_rate": 6.639665781166189e-06,
"loss": 0.2595,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2750241160392761,
"step": 6020
},
{
"epoch": 5.325971731448763,
"grad_norm": 0.5591468811035156,
"learning_rate": 6.606914537820122e-06,
"loss": 0.2572,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2233850359916687,
"step": 6025
},
{
"epoch": 5.330388692579505,
"grad_norm": 0.6942284107208252,
"learning_rate": 6.574228280405139e-06,
"loss": 0.2525,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19524750113487244,
"step": 6030
},
{
"epoch": 5.334805653710247,
"grad_norm": 0.6840284466743469,
"learning_rate": 6.5416071675215136e-06,
"loss": 0.233,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2810319662094116,
"step": 6035
},
{
"epoch": 5.339222614840989,
"grad_norm": 0.6279734373092651,
"learning_rate": 6.509051357453393e-06,
"loss": 0.2251,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24808743596076965,
"step": 6040
},
{
"epoch": 5.343639575971731,
"grad_norm": 0.6400632262229919,
"learning_rate": 6.476561008168096e-06,
"loss": 0.2538,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21455033123493195,
"step": 6045
},
{
"epoch": 5.348056537102473,
"grad_norm": 0.6099967360496521,
"learning_rate": 6.444136277315296e-06,
"loss": 0.2336,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21934688091278076,
"step": 6050
},
{
"epoch": 5.352473498233215,
"grad_norm": 0.9150162935256958,
"learning_rate": 6.4117773222262805e-06,
"loss": 0.2358,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17225447297096252,
"step": 6055
},
{
"epoch": 5.3568904593639575,
"grad_norm": 0.631752073764801,
"learning_rate": 6.379484299913172e-06,
"loss": 0.2533,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23191042244434357,
"step": 6060
},
{
"epoch": 5.3613074204946995,
"grad_norm": 0.6716257929801941,
"learning_rate": 6.3472573670681805e-06,
"loss": 0.2593,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3306858539581299,
"step": 6065
},
{
"epoch": 5.365724381625442,
"grad_norm": 0.6031466126441956,
"learning_rate": 6.315096680062838e-06,
"loss": 0.2525,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2807907164096832,
"step": 6070
},
{
"epoch": 5.370141342756184,
"grad_norm": 0.6290670037269592,
"learning_rate": 6.283002394947216e-06,
"loss": 0.2355,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22744987905025482,
"step": 6075
},
{
"epoch": 5.374558303886926,
"grad_norm": 0.6004354357719421,
"learning_rate": 6.2509746674492346e-06,
"loss": 0.2552,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23005065321922302,
"step": 6080
},
{
"epoch": 5.378975265017668,
"grad_norm": 0.6632710099220276,
"learning_rate": 6.21901365297382e-06,
"loss": 0.2316,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17470410466194153,
"step": 6085
},
{
"epoch": 5.38339222614841,
"grad_norm": 0.6784233450889587,
"learning_rate": 6.187119506602215e-06,
"loss": 0.2751,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2649574875831604,
"step": 6090
},
{
"epoch": 5.387809187279152,
"grad_norm": 0.6669394969940186,
"learning_rate": 6.1552923830912e-06,
"loss": 0.2403,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3093627393245697,
"step": 6095
},
{
"epoch": 5.392226148409894,
"grad_norm": 0.6822030544281006,
"learning_rate": 6.123532436872353e-06,
"loss": 0.2475,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2727193236351013,
"step": 6100
},
{
"epoch": 5.396643109540636,
"grad_norm": 0.6640558838844299,
"learning_rate": 6.091839822051284e-06,
"loss": 0.2906,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3642868995666504,
"step": 6105
},
{
"epoch": 5.401060070671378,
"grad_norm": 0.6346195936203003,
"learning_rate": 6.060214692406905e-06,
"loss": 0.2527,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32739025354385376,
"step": 6110
},
{
"epoch": 5.40547703180212,
"grad_norm": 0.662929356098175,
"learning_rate": 6.028657201390682e-06,
"loss": 0.224,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2002319097518921,
"step": 6115
},
{
"epoch": 5.409893992932862,
"grad_norm": 0.6548178195953369,
"learning_rate": 5.99716750212586e-06,
"loss": 0.2456,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2806054651737213,
"step": 6120
},
{
"epoch": 5.4143109540636045,
"grad_norm": 0.6379974484443665,
"learning_rate": 5.965745747406775e-06,
"loss": 0.2818,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3060104548931122,
"step": 6125
},
{
"epoch": 5.418727915194347,
"grad_norm": 0.6715204119682312,
"learning_rate": 5.934392089698064e-06,
"loss": 0.2124,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16398535668849945,
"step": 6130
},
{
"epoch": 5.423144876325089,
"grad_norm": 0.6135415434837341,
"learning_rate": 5.903106681133952e-06,
"loss": 0.2021,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19793318212032318,
"step": 6135
},
{
"epoch": 5.427561837455831,
"grad_norm": 0.7642617225646973,
"learning_rate": 5.871889673517501e-06,
"loss": 0.2439,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24503038823604584,
"step": 6140
},
{
"epoch": 5.431978798586573,
"grad_norm": 0.6476492881774902,
"learning_rate": 5.840741218319881e-06,
"loss": 0.2563,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23620697855949402,
"step": 6145
},
{
"epoch": 5.436395759717314,
"grad_norm": 0.6400974988937378,
"learning_rate": 5.809661466679635e-06,
"loss": 0.246,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21778175234794617,
"step": 6150
},
{
"epoch": 5.440812720848056,
"grad_norm": 0.637679398059845,
"learning_rate": 5.778650569401922e-06,
"loss": 0.2374,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2865186929702759,
"step": 6155
},
{
"epoch": 5.445229681978798,
"grad_norm": 0.6278441548347473,
"learning_rate": 5.747708676957844e-06,
"loss": 0.2198,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22370074689388275,
"step": 6160
},
{
"epoch": 5.44964664310954,
"grad_norm": 0.6846042275428772,
"learning_rate": 5.716835939483641e-06,
"loss": 0.2423,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18103906512260437,
"step": 6165
},
{
"epoch": 5.454063604240282,
"grad_norm": 0.765556812286377,
"learning_rate": 5.686032506780015e-06,
"loss": 0.2228,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20441153645515442,
"step": 6170
},
{
"epoch": 5.458480565371024,
"grad_norm": 0.7388072609901428,
"learning_rate": 5.655298528311388e-06,
"loss": 0.243,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.210384339094162,
"step": 6175
},
{
"epoch": 5.4628975265017665,
"grad_norm": 0.6503342390060425,
"learning_rate": 5.624634153205178e-06,
"loss": 0.2485,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23481428623199463,
"step": 6180
},
{
"epoch": 5.467314487632509,
"grad_norm": 0.6476783752441406,
"learning_rate": 5.594039530251065e-06,
"loss": 0.2386,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2961850166320801,
"step": 6185
},
{
"epoch": 5.471731448763251,
"grad_norm": 0.7560742497444153,
"learning_rate": 5.563514807900285e-06,
"loss": 0.2508,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2309841811656952,
"step": 6190
},
{
"epoch": 5.476148409893993,
"grad_norm": 0.5617563128471375,
"learning_rate": 5.533060134264907e-06,
"loss": 0.2262,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1815638542175293,
"step": 6195
},
{
"epoch": 5.480565371024735,
"grad_norm": 0.7148119211196899,
"learning_rate": 5.5026756571170896e-06,
"loss": 0.2373,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2046487033367157,
"step": 6200
},
{
"epoch": 5.484982332155477,
"grad_norm": 1.1636927127838135,
"learning_rate": 5.472361523888401e-06,
"loss": 0.2203,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16061300039291382,
"step": 6205
},
{
"epoch": 5.489399293286219,
"grad_norm": 0.682188093662262,
"learning_rate": 5.442117881669085e-06,
"loss": 0.2265,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19478756189346313,
"step": 6210
},
{
"epoch": 5.493816254416961,
"grad_norm": 0.6677638292312622,
"learning_rate": 5.411944877207347e-06,
"loss": 0.2506,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2209673374891281,
"step": 6215
},
{
"epoch": 5.498233215547703,
"grad_norm": 0.6800921559333801,
"learning_rate": 5.38184265690864e-06,
"loss": 0.2008,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1875065267086029,
"step": 6220
},
{
"epoch": 5.502650176678445,
"grad_norm": 0.7293074131011963,
"learning_rate": 5.3518113668349645e-06,
"loss": 0.2397,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31664156913757324,
"step": 6225
},
{
"epoch": 5.507067137809187,
"grad_norm": 0.6156737208366394,
"learning_rate": 5.321851152704154e-06,
"loss": 0.2621,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17080801725387573,
"step": 6230
},
{
"epoch": 5.511484098939929,
"grad_norm": 0.6778889298439026,
"learning_rate": 5.291962159889148e-06,
"loss": 0.2454,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24503864347934723,
"step": 6235
},
{
"epoch": 5.5159010600706715,
"grad_norm": 0.7297868728637695,
"learning_rate": 5.262144533417344e-06,
"loss": 0.2354,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2277633249759674,
"step": 6240
},
{
"epoch": 5.520318021201414,
"grad_norm": 0.6664875745773315,
"learning_rate": 5.232398417969815e-06,
"loss": 0.2233,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21163435280323029,
"step": 6245
},
{
"epoch": 5.524734982332156,
"grad_norm": 0.6079632639884949,
"learning_rate": 5.2027239578806734e-06,
"loss": 0.2322,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2564762830734253,
"step": 6250
},
{
"epoch": 5.529151943462898,
"grad_norm": 0.6662228107452393,
"learning_rate": 5.173121297136337e-06,
"loss": 0.201,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21402853727340698,
"step": 6255
},
{
"epoch": 5.53356890459364,
"grad_norm": 0.6768613457679749,
"learning_rate": 5.14359057937484e-06,
"loss": 0.2965,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2002478837966919,
"step": 6260
},
{
"epoch": 5.537985865724382,
"grad_norm": 0.6485766172409058,
"learning_rate": 5.114131947885137e-06,
"loss": 0.2873,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3273354470729828,
"step": 6265
},
{
"epoch": 5.542402826855124,
"grad_norm": 0.6236052513122559,
"learning_rate": 5.084745545606402e-06,
"loss": 0.2543,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17392995953559875,
"step": 6270
},
{
"epoch": 5.546819787985866,
"grad_norm": 0.613959789276123,
"learning_rate": 5.055431515127349e-06,
"loss": 0.2463,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2359083592891693,
"step": 6275
},
{
"epoch": 5.551236749116608,
"grad_norm": 0.6324638724327087,
"learning_rate": 5.026189998685504e-06,
"loss": 0.2449,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24177028238773346,
"step": 6280
},
{
"epoch": 5.55565371024735,
"grad_norm": 0.6312211155891418,
"learning_rate": 4.9970211381665665e-06,
"loss": 0.275,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22367584705352783,
"step": 6285
},
{
"epoch": 5.560070671378092,
"grad_norm": 0.9709349274635315,
"learning_rate": 4.967925075103685e-06,
"loss": 0.2554,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32471510767936707,
"step": 6290
},
{
"epoch": 5.564487632508834,
"grad_norm": 0.6403810977935791,
"learning_rate": 4.93890195067678e-06,
"loss": 0.2563,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2835839092731476,
"step": 6295
},
{
"epoch": 5.5689045936395765,
"grad_norm": 0.6615350246429443,
"learning_rate": 4.909951905711858e-06,
"loss": 0.2297,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1782667636871338,
"step": 6300
},
{
"epoch": 5.573321554770318,
"grad_norm": 0.7057219743728638,
"learning_rate": 4.881075080680335e-06,
"loss": 0.2317,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28443485498428345,
"step": 6305
},
{
"epoch": 5.57773851590106,
"grad_norm": 0.6871824264526367,
"learning_rate": 4.852271615698349e-06,
"loss": 0.2183,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28328755497932434,
"step": 6310
},
{
"epoch": 5.582155477031802,
"grad_norm": 0.6732246279716492,
"learning_rate": 4.823541650526058e-06,
"loss": 0.2652,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24324092268943787,
"step": 6315
},
{
"epoch": 5.586572438162544,
"grad_norm": 0.6424944400787354,
"learning_rate": 4.7948853245670294e-06,
"loss": 0.2128,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17290586233139038,
"step": 6320
},
{
"epoch": 5.590989399293286,
"grad_norm": 0.659040093421936,
"learning_rate": 4.7663027768674705e-06,
"loss": 0.2051,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15980809926986694,
"step": 6325
},
{
"epoch": 5.595406360424028,
"grad_norm": 0.6853231191635132,
"learning_rate": 4.737794146115633e-06,
"loss": 0.2225,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19948209822177887,
"step": 6330
},
{
"epoch": 5.59982332155477,
"grad_norm": 0.651915431022644,
"learning_rate": 4.7093595706410945e-06,
"loss": 0.2627,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.36641934514045715,
"step": 6335
},
{
"epoch": 5.604240282685512,
"grad_norm": 0.6209613084793091,
"learning_rate": 4.680999188414108e-06,
"loss": 0.24,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20791277289390564,
"step": 6340
},
{
"epoch": 5.608657243816254,
"grad_norm": 0.7123491764068604,
"learning_rate": 4.652713137044927e-06,
"loss": 0.2189,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25710418820381165,
"step": 6345
},
{
"epoch": 5.613074204946996,
"grad_norm": 0.7246097922325134,
"learning_rate": 4.624501553783127e-06,
"loss": 0.2295,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23944400250911713,
"step": 6350
},
{
"epoch": 5.6174911660777385,
"grad_norm": 0.7621732354164124,
"learning_rate": 4.596364575516969e-06,
"loss": 0.2256,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26072970032691956,
"step": 6355
},
{
"epoch": 5.6219081272084805,
"grad_norm": 0.6371416449546814,
"learning_rate": 4.568302338772688e-06,
"loss": 0.2364,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21054017543792725,
"step": 6360
},
{
"epoch": 5.626325088339223,
"grad_norm": 0.6732529401779175,
"learning_rate": 4.540314979713876e-06,
"loss": 0.2299,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19560235738754272,
"step": 6365
},
{
"epoch": 5.630742049469965,
"grad_norm": 0.6369305849075317,
"learning_rate": 4.512402634140804e-06,
"loss": 0.2584,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2767673134803772,
"step": 6370
},
{
"epoch": 5.635159010600707,
"grad_norm": 0.6594879627227783,
"learning_rate": 4.484565437489759e-06,
"loss": 0.2553,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27891114354133606,
"step": 6375
},
{
"epoch": 5.639575971731449,
"grad_norm": 0.6730323433876038,
"learning_rate": 4.456803524832389e-06,
"loss": 0.2364,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2509545385837555,
"step": 6380
},
{
"epoch": 5.643992932862191,
"grad_norm": 0.6865994334220886,
"learning_rate": 4.429117030875052e-06,
"loss": 0.2052,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1903909146785736,
"step": 6385
},
{
"epoch": 5.648409893992933,
"grad_norm": 0.6434392333030701,
"learning_rate": 4.401506089958161e-06,
"loss": 0.2322,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29577142000198364,
"step": 6390
},
{
"epoch": 5.652826855123675,
"grad_norm": 0.6849035620689392,
"learning_rate": 4.37397083605551e-06,
"loss": 0.2162,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2336483895778656,
"step": 6395
},
{
"epoch": 5.657243816254417,
"grad_norm": 0.6565053462982178,
"learning_rate": 4.346511402773688e-06,
"loss": 0.2306,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23053717613220215,
"step": 6400
},
{
"epoch": 5.661660777385159,
"grad_norm": 0.7413392066955566,
"learning_rate": 4.319127923351339e-06,
"loss": 0.2713,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27822649478912354,
"step": 6405
},
{
"epoch": 5.666077738515901,
"grad_norm": 0.613976776599884,
"learning_rate": 4.291820530658595e-06,
"loss": 0.2549,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2500184178352356,
"step": 6410
},
{
"epoch": 5.670494699646643,
"grad_norm": 0.6709519028663635,
"learning_rate": 4.264589357196389e-06,
"loss": 0.2342,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2182079702615738,
"step": 6415
},
{
"epoch": 5.6749116607773855,
"grad_norm": 0.618645966053009,
"learning_rate": 4.2374345350958256e-06,
"loss": 0.2329,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16464778780937195,
"step": 6420
},
{
"epoch": 5.679328621908128,
"grad_norm": 0.6145651936531067,
"learning_rate": 4.2103561961175354e-06,
"loss": 0.2168,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25252848863601685,
"step": 6425
},
{
"epoch": 5.683745583038869,
"grad_norm": 0.7100237607955933,
"learning_rate": 4.183354471651037e-06,
"loss": 0.2357,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21529585123062134,
"step": 6430
},
{
"epoch": 5.688162544169611,
"grad_norm": 0.6366299986839294,
"learning_rate": 4.156429492714109e-06,
"loss": 0.2213,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22616274654865265,
"step": 6435
},
{
"epoch": 5.692579505300353,
"grad_norm": 0.6993550658226013,
"learning_rate": 4.129581389952129e-06,
"loss": 0.2259,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16478785872459412,
"step": 6440
},
{
"epoch": 5.696996466431095,
"grad_norm": 0.6645349860191345,
"learning_rate": 4.102810293637465e-06,
"loss": 0.2262,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22474364936351776,
"step": 6445
},
{
"epoch": 5.701413427561837,
"grad_norm": 0.6140891313552856,
"learning_rate": 4.076116333668838e-06,
"loss": 0.2337,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23928777873516083,
"step": 6450
},
{
"epoch": 5.705830388692579,
"grad_norm": 0.6906691193580627,
"learning_rate": 4.049499639570682e-06,
"loss": 0.2503,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.206419438123703,
"step": 6455
},
{
"epoch": 5.710247349823321,
"grad_norm": 0.8086975812911987,
"learning_rate": 4.022960340492525e-06,
"loss": 0.2277,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2215789258480072,
"step": 6460
},
{
"epoch": 5.714664310954063,
"grad_norm": 0.6342015266418457,
"learning_rate": 3.996498565208358e-06,
"loss": 0.2277,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20739787817001343,
"step": 6465
},
{
"epoch": 5.719081272084805,
"grad_norm": 0.6628281474113464,
"learning_rate": 3.970114442116013e-06,
"loss": 0.1905,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18304401636123657,
"step": 6470
},
{
"epoch": 5.7234982332155475,
"grad_norm": 0.6513307690620422,
"learning_rate": 3.943808099236524e-06,
"loss": 0.2257,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21980533003807068,
"step": 6475
},
{
"epoch": 5.72791519434629,
"grad_norm": 0.6264829635620117,
"learning_rate": 3.917579664213549e-06,
"loss": 0.2471,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3289884328842163,
"step": 6480
},
{
"epoch": 5.732332155477032,
"grad_norm": 0.6363914012908936,
"learning_rate": 3.8914292643126915e-06,
"loss": 0.225,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2795798182487488,
"step": 6485
},
{
"epoch": 5.736749116607774,
"grad_norm": 0.693565309047699,
"learning_rate": 3.865357026420926e-06,
"loss": 0.2457,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2867644131183624,
"step": 6490
},
{
"epoch": 5.741166077738516,
"grad_norm": 0.6560927629470825,
"learning_rate": 3.839363077045974e-06,
"loss": 0.229,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2569573223590851,
"step": 6495
},
{
"epoch": 5.745583038869258,
"grad_norm": 0.6350461840629578,
"learning_rate": 3.8134475423156757e-06,
"loss": 0.2428,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2617478370666504,
"step": 6500
},
{
"epoch": 5.75,
"grad_norm": 0.7265409827232361,
"learning_rate": 3.787610547977396e-06,
"loss": 0.2413,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18774326145648956,
"step": 6505
},
{
"epoch": 5.754416961130742,
"grad_norm": 0.6530236005783081,
"learning_rate": 3.7618522193973994e-06,
"loss": 0.2527,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.277637243270874,
"step": 6510
},
{
"epoch": 5.758833922261484,
"grad_norm": 0.684080958366394,
"learning_rate": 3.7361726815602596e-06,
"loss": 0.2515,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2156839370727539,
"step": 6515
},
{
"epoch": 5.763250883392226,
"grad_norm": 0.6697428822517395,
"learning_rate": 3.710572059068218e-06,
"loss": 0.2364,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26217517256736755,
"step": 6520
},
{
"epoch": 5.767667844522968,
"grad_norm": 0.6842355132102966,
"learning_rate": 3.6850504761406282e-06,
"loss": 0.291,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34840089082717896,
"step": 6525
},
{
"epoch": 5.77208480565371,
"grad_norm": 0.7177510261535645,
"learning_rate": 3.6596080566133176e-06,
"loss": 0.2566,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25881335139274597,
"step": 6530
},
{
"epoch": 5.7765017667844525,
"grad_norm": 0.6908156275749207,
"learning_rate": 3.6342449239379974e-06,
"loss": 0.2514,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25273647904396057,
"step": 6535
},
{
"epoch": 5.780918727915195,
"grad_norm": 0.6992905139923096,
"learning_rate": 3.608961201181662e-06,
"loss": 0.2722,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27057141065597534,
"step": 6540
},
{
"epoch": 5.785335689045937,
"grad_norm": 0.708998441696167,
"learning_rate": 3.5837570110259945e-06,
"loss": 0.2274,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2736978530883789,
"step": 6545
},
{
"epoch": 5.789752650176679,
"grad_norm": 0.5654464364051819,
"learning_rate": 3.558632475766777e-06,
"loss": 0.237,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23132240772247314,
"step": 6550
},
{
"epoch": 5.794169611307421,
"grad_norm": 0.7154576182365417,
"learning_rate": 3.5335877173132672e-06,
"loss": 0.2623,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29507291316986084,
"step": 6555
},
{
"epoch": 5.798586572438163,
"grad_norm": 0.6882935762405396,
"learning_rate": 3.5086228571876622e-06,
"loss": 0.2782,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2825637757778168,
"step": 6560
},
{
"epoch": 5.803003533568905,
"grad_norm": 0.6030979156494141,
"learning_rate": 3.4837380165244494e-06,
"loss": 0.2596,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29193153977394104,
"step": 6565
},
{
"epoch": 5.807420494699647,
"grad_norm": 0.6919524073600769,
"learning_rate": 3.4589333160698592e-06,
"loss": 0.2359,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2584839463233948,
"step": 6570
},
{
"epoch": 5.811837455830389,
"grad_norm": 0.6700971722602844,
"learning_rate": 3.434208876181262e-06,
"loss": 0.244,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2972269654273987,
"step": 6575
},
{
"epoch": 5.816254416961131,
"grad_norm": 0.6191151142120361,
"learning_rate": 3.409564816826587e-06,
"loss": 0.2667,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31485503911972046,
"step": 6580
},
{
"epoch": 5.820671378091872,
"grad_norm": 0.627770185470581,
"learning_rate": 3.385001257583744e-06,
"loss": 0.2082,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27181103825569153,
"step": 6585
},
{
"epoch": 5.8250883392226145,
"grad_norm": 0.6451102495193481,
"learning_rate": 3.3605183176400402e-06,
"loss": 0.2312,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3111085295677185,
"step": 6590
},
{
"epoch": 5.829505300353357,
"grad_norm": 0.6402949094772339,
"learning_rate": 3.3361161157916012e-06,
"loss": 0.2148,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17799827456474304,
"step": 6595
},
{
"epoch": 5.833922261484099,
"grad_norm": 0.6498696804046631,
"learning_rate": 3.3117947704427866e-06,
"loss": 0.2404,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2313275933265686,
"step": 6600
},
{
"epoch": 5.838339222614841,
"grad_norm": 0.6034268736839294,
"learning_rate": 3.287554399605637e-06,
"loss": 0.2114,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18082204461097717,
"step": 6605
},
{
"epoch": 5.842756183745583,
"grad_norm": 0.6362025141716003,
"learning_rate": 3.2633951208992797e-06,
"loss": 0.2358,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22728809714317322,
"step": 6610
},
{
"epoch": 5.847173144876325,
"grad_norm": 0.6251878142356873,
"learning_rate": 3.2393170515493756e-06,
"loss": 0.2204,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24366095662117004,
"step": 6615
},
{
"epoch": 5.851590106007067,
"grad_norm": 0.6515399813652039,
"learning_rate": 3.2153203083875306e-06,
"loss": 0.2544,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21778994798660278,
"step": 6620
},
{
"epoch": 5.856007067137809,
"grad_norm": 0.8303614854812622,
"learning_rate": 3.19140500785075e-06,
"loss": 0.2382,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14264947175979614,
"step": 6625
},
{
"epoch": 5.860424028268551,
"grad_norm": 0.6612645387649536,
"learning_rate": 3.1675712659808576e-06,
"loss": 0.2356,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22965016961097717,
"step": 6630
},
{
"epoch": 5.864840989399293,
"grad_norm": 0.6375709176063538,
"learning_rate": 3.1438191984239297e-06,
"loss": 0.226,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24031785130500793,
"step": 6635
},
{
"epoch": 5.869257950530035,
"grad_norm": 0.6527304649353027,
"learning_rate": 3.1201489204297663e-06,
"loss": 0.2465,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18436655402183533,
"step": 6640
},
{
"epoch": 5.873674911660777,
"grad_norm": 0.6133697628974915,
"learning_rate": 3.0965605468512837e-06,
"loss": 0.2645,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2529999315738678,
"step": 6645
},
{
"epoch": 5.8780918727915195,
"grad_norm": 0.6855648756027222,
"learning_rate": 3.0730541921439936e-06,
"loss": 0.2243,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2065194845199585,
"step": 6650
},
{
"epoch": 5.8825088339222615,
"grad_norm": 0.7099367380142212,
"learning_rate": 3.049629970365433e-06,
"loss": 0.2309,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20094117522239685,
"step": 6655
},
{
"epoch": 5.886925795053004,
"grad_norm": 0.6442746520042419,
"learning_rate": 3.026287995174615e-06,
"loss": 0.2356,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22805841267108917,
"step": 6660
},
{
"epoch": 5.891342756183746,
"grad_norm": 0.6667559146881104,
"learning_rate": 3.0030283798314785e-06,
"loss": 0.2445,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19497643411159515,
"step": 6665
},
{
"epoch": 5.895759717314488,
"grad_norm": 0.6932128071784973,
"learning_rate": 2.9798512371963207e-06,
"loss": 0.2535,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2080661952495575,
"step": 6670
},
{
"epoch": 5.90017667844523,
"grad_norm": 0.6642143130302429,
"learning_rate": 2.9567566797292914e-06,
"loss": 0.2605,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26065370440483093,
"step": 6675
},
{
"epoch": 5.904593639575972,
"grad_norm": 0.7001204490661621,
"learning_rate": 2.9337448194897943e-06,
"loss": 0.2583,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25018325448036194,
"step": 6680
},
{
"epoch": 5.909010600706714,
"grad_norm": 0.6896331310272217,
"learning_rate": 2.9108157681359837e-06,
"loss": 0.2428,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.317201167345047,
"step": 6685
},
{
"epoch": 5.913427561837456,
"grad_norm": 0.6316036581993103,
"learning_rate": 2.8879696369242062e-06,
"loss": 0.2704,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2903493642807007,
"step": 6690
},
{
"epoch": 5.917844522968198,
"grad_norm": 0.5617479681968689,
"learning_rate": 2.8652065367084627e-06,
"loss": 0.2479,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25086894631385803,
"step": 6695
},
{
"epoch": 5.92226148409894,
"grad_norm": 0.6145947575569153,
"learning_rate": 2.8425265779398704e-06,
"loss": 0.251,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2544384002685547,
"step": 6700
},
{
"epoch": 5.926678445229682,
"grad_norm": 0.718999981880188,
"learning_rate": 2.819929870666129e-06,
"loss": 0.2146,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.155266672372818,
"step": 6705
},
{
"epoch": 5.9310954063604235,
"grad_norm": 0.8380730748176575,
"learning_rate": 2.7974165245309913e-06,
"loss": 0.2275,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1611081063747406,
"step": 6710
},
{
"epoch": 5.935512367491166,
"grad_norm": 0.7612493634223938,
"learning_rate": 2.774986648773701e-06,
"loss": 0.2487,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23470436036586761,
"step": 6715
},
{
"epoch": 5.939929328621908,
"grad_norm": 0.6034536361694336,
"learning_rate": 2.752640352228524e-06,
"loss": 0.254,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2653493583202362,
"step": 6720
},
{
"epoch": 5.94434628975265,
"grad_norm": 0.6654173731803894,
"learning_rate": 2.7303777433241506e-06,
"loss": 0.2321,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25490570068359375,
"step": 6725
},
{
"epoch": 5.948763250883392,
"grad_norm": 0.6058556437492371,
"learning_rate": 2.708198930083219e-06,
"loss": 0.2504,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2901003956794739,
"step": 6730
},
{
"epoch": 5.953180212014134,
"grad_norm": 0.6939293146133423,
"learning_rate": 2.6861040201217692e-06,
"loss": 0.2246,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.260385125875473,
"step": 6735
},
{
"epoch": 5.957597173144876,
"grad_norm": 0.6245858669281006,
"learning_rate": 2.6640931206487252e-06,
"loss": 0.2539,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25670942664146423,
"step": 6740
},
{
"epoch": 5.962014134275618,
"grad_norm": 0.6986921429634094,
"learning_rate": 2.642166338465384e-06,
"loss": 0.2416,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21145933866500854,
"step": 6745
},
{
"epoch": 5.96643109540636,
"grad_norm": 0.6841897964477539,
"learning_rate": 2.6203237799648663e-06,
"loss": 0.213,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18843623995780945,
"step": 6750
},
{
"epoch": 5.970848056537102,
"grad_norm": 0.6969452500343323,
"learning_rate": 2.598565551131653e-06,
"loss": 0.2295,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3311152458190918,
"step": 6755
},
{
"epoch": 5.975265017667844,
"grad_norm": 0.6374683380126953,
"learning_rate": 2.5768917575410134e-06,
"loss": 0.2332,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2040780782699585,
"step": 6760
},
{
"epoch": 5.979681978798586,
"grad_norm": 0.7229990363121033,
"learning_rate": 2.555302504358537e-06,
"loss": 0.2225,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21656137704849243,
"step": 6765
},
{
"epoch": 5.9840989399293285,
"grad_norm": 0.7268469333648682,
"learning_rate": 2.5337978963396003e-06,
"loss": 0.227,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1876240074634552,
"step": 6770
},
{
"epoch": 5.988515901060071,
"grad_norm": 0.642284631729126,
"learning_rate": 2.5123780378288642e-06,
"loss": 0.2404,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24313439428806305,
"step": 6775
},
{
"epoch": 5.992932862190813,
"grad_norm": 0.6617910265922546,
"learning_rate": 2.49104303275977e-06,
"loss": 0.2628,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25242412090301514,
"step": 6780
},
{
"epoch": 5.997349823321555,
"grad_norm": 0.6622885465621948,
"learning_rate": 2.4697929846540335e-06,
"loss": 0.2371,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28016042709350586,
"step": 6785
},
{
"epoch": 6.002650176678445,
"grad_norm": 0.6871181726455688,
"learning_rate": 2.4486279966211425e-06,
"loss": 0.2211,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21785372495651245,
"step": 6790
},
{
"epoch": 6.007067137809187,
"grad_norm": 0.634680986404419,
"learning_rate": 2.427548171357843e-06,
"loss": 0.2488,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23458047211170197,
"step": 6795
},
{
"epoch": 6.011484098939929,
"grad_norm": 0.7124053835868835,
"learning_rate": 2.406553611147684e-06,
"loss": 0.2631,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1963014453649521,
"step": 6800
},
{
"epoch": 6.0159010600706715,
"grad_norm": 0.6288027763366699,
"learning_rate": 2.38564441786046e-06,
"loss": 0.2463,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22201281785964966,
"step": 6805
},
{
"epoch": 6.020318021201414,
"grad_norm": 0.8096534013748169,
"learning_rate": 2.364820692951766e-06,
"loss": 0.2418,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2866779565811157,
"step": 6810
},
{
"epoch": 6.024734982332156,
"grad_norm": 0.7279224395751953,
"learning_rate": 2.3440825374624798e-06,
"loss": 0.2504,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1597205102443695,
"step": 6815
},
{
"epoch": 6.029151943462898,
"grad_norm": 0.6755273938179016,
"learning_rate": 2.3234300520182873e-06,
"loss": 0.2535,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2171267867088318,
"step": 6820
},
{
"epoch": 6.03356890459364,
"grad_norm": 0.6738324761390686,
"learning_rate": 2.3028633368291843e-06,
"loss": 0.2408,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31061723828315735,
"step": 6825
},
{
"epoch": 6.037985865724382,
"grad_norm": 0.6739371418952942,
"learning_rate": 2.2823824916889724e-06,
"loss": 0.2133,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19004075229167938,
"step": 6830
},
{
"epoch": 6.042402826855124,
"grad_norm": 0.7035517692565918,
"learning_rate": 2.261987615974832e-06,
"loss": 0.2345,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29125821590423584,
"step": 6835
},
{
"epoch": 6.046819787985866,
"grad_norm": 0.718480110168457,
"learning_rate": 2.241678808646768e-06,
"loss": 0.227,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21620148420333862,
"step": 6840
},
{
"epoch": 6.051236749116608,
"grad_norm": 0.6759348511695862,
"learning_rate": 2.2214561682471825e-06,
"loss": 0.2263,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22231999039649963,
"step": 6845
},
{
"epoch": 6.05565371024735,
"grad_norm": 0.6698014140129089,
"learning_rate": 2.201319792900374e-06,
"loss": 0.2305,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20235413312911987,
"step": 6850
},
{
"epoch": 6.060070671378092,
"grad_norm": 0.6121541261672974,
"learning_rate": 2.181269780312063e-06,
"loss": 0.2177,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20862017571926117,
"step": 6855
},
{
"epoch": 6.0644876325088335,
"grad_norm": 0.6756522059440613,
"learning_rate": 2.1613062277689266e-06,
"loss": 0.2205,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17809954285621643,
"step": 6860
},
{
"epoch": 6.068904593639576,
"grad_norm": 0.7342451810836792,
"learning_rate": 2.141429232138117e-06,
"loss": 0.2207,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2191966474056244,
"step": 6865
},
{
"epoch": 6.073321554770318,
"grad_norm": 0.7150316834449768,
"learning_rate": 2.1216388898667973e-06,
"loss": 0.2352,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23332220315933228,
"step": 6870
},
{
"epoch": 6.07773851590106,
"grad_norm": 0.6240957379341125,
"learning_rate": 2.1019352969816585e-06,
"loss": 0.265,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24282801151275635,
"step": 6875
},
{
"epoch": 6.082155477031802,
"grad_norm": 0.6066805720329285,
"learning_rate": 2.082318549088491e-06,
"loss": 0.2542,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26603370904922485,
"step": 6880
},
{
"epoch": 6.086572438162544,
"grad_norm": 0.7509016990661621,
"learning_rate": 2.062788741371673e-06,
"loss": 0.2227,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3612693250179291,
"step": 6885
},
{
"epoch": 6.090989399293286,
"grad_norm": 0.701538622379303,
"learning_rate": 2.0433459685937395e-06,
"loss": 0.2593,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.318620890378952,
"step": 6890
},
{
"epoch": 6.095406360424028,
"grad_norm": 0.6263982653617859,
"learning_rate": 2.0239903250949176e-06,
"loss": 0.2221,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23397664725780487,
"step": 6895
},
{
"epoch": 6.09982332155477,
"grad_norm": 0.644121527671814,
"learning_rate": 2.0047219047926614e-06,
"loss": 0.2221,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22818706929683685,
"step": 6900
},
{
"epoch": 6.104240282685512,
"grad_norm": 0.6542277336120605,
"learning_rate": 1.9855408011812117e-06,
"loss": 0.2456,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2627973258495331,
"step": 6905
},
{
"epoch": 6.108657243816254,
"grad_norm": 0.6295666694641113,
"learning_rate": 1.966447107331104e-06,
"loss": 0.2238,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27365830540657043,
"step": 6910
},
{
"epoch": 6.113074204946996,
"grad_norm": 0.6320794224739075,
"learning_rate": 1.9474409158887807e-06,
"loss": 0.233,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23829936981201172,
"step": 6915
},
{
"epoch": 6.1174911660777385,
"grad_norm": 0.7099300622940063,
"learning_rate": 1.9285223190760737e-06,
"loss": 0.2178,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16763125360012054,
"step": 6920
},
{
"epoch": 6.1219081272084805,
"grad_norm": 0.5849335193634033,
"learning_rate": 1.9096914086898087e-06,
"loss": 0.2098,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2682895064353943,
"step": 6925
},
{
"epoch": 6.126325088339223,
"grad_norm": 0.5837305784225464,
"learning_rate": 1.8909482761013254e-06,
"loss": 0.2236,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20228040218353271,
"step": 6930
},
{
"epoch": 6.130742049469965,
"grad_norm": 0.7193915247917175,
"learning_rate": 1.872293012256059e-06,
"loss": 0.2034,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24561667442321777,
"step": 6935
},
{
"epoch": 6.135159010600707,
"grad_norm": 0.7574843168258667,
"learning_rate": 1.853725707673082e-06,
"loss": 0.2399,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26278358697891235,
"step": 6940
},
{
"epoch": 6.139575971731449,
"grad_norm": 0.7833778262138367,
"learning_rate": 1.8352464524446724e-06,
"loss": 0.2054,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18190684914588928,
"step": 6945
},
{
"epoch": 6.143992932862191,
"grad_norm": 0.6389424204826355,
"learning_rate": 1.8168553362358787e-06,
"loss": 0.2165,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.228424072265625,
"step": 6950
},
{
"epoch": 6.148409893992933,
"grad_norm": 0.7503507733345032,
"learning_rate": 1.7985524482840676e-06,
"loss": 0.2483,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3514091968536377,
"step": 6955
},
{
"epoch": 6.152826855123675,
"grad_norm": 0.6644186973571777,
"learning_rate": 1.7803378773985214e-06,
"loss": 0.2445,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23452533781528473,
"step": 6960
},
{
"epoch": 6.157243816254417,
"grad_norm": 0.6842535138130188,
"learning_rate": 1.7622117119599802e-06,
"loss": 0.2313,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2150776982307434,
"step": 6965
},
{
"epoch": 6.161660777385159,
"grad_norm": 0.6151363849639893,
"learning_rate": 1.74417403992023e-06,
"loss": 0.2473,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25320905447006226,
"step": 6970
},
{
"epoch": 6.166077738515901,
"grad_norm": 0.8280779719352722,
"learning_rate": 1.7262249488016648e-06,
"loss": 0.276,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3232913315296173,
"step": 6975
},
{
"epoch": 6.170494699646643,
"grad_norm": 0.6878020167350769,
"learning_rate": 1.708364525696864e-06,
"loss": 0.2731,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3126068711280823,
"step": 6980
},
{
"epoch": 6.1749116607773855,
"grad_norm": 0.7097288966178894,
"learning_rate": 1.6905928572681806e-06,
"loss": 0.2509,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3269100785255432,
"step": 6985
},
{
"epoch": 6.179328621908128,
"grad_norm": 0.6415253281593323,
"learning_rate": 1.6729100297472967e-06,
"loss": 0.2125,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2540552020072937,
"step": 6990
},
{
"epoch": 6.18374558303887,
"grad_norm": 0.6544218063354492,
"learning_rate": 1.6553161289348429e-06,
"loss": 0.2182,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21726730465888977,
"step": 6995
},
{
"epoch": 6.188162544169611,
"grad_norm": 0.6272726655006409,
"learning_rate": 1.637811240199938e-06,
"loss": 0.2144,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27637243270874023,
"step": 7000
},
{
"epoch": 6.192579505300353,
"grad_norm": 0.651709258556366,
"learning_rate": 1.620395448479808e-06,
"loss": 0.2198,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15154890716075897,
"step": 7005
},
{
"epoch": 6.196996466431095,
"grad_norm": 0.6224712133407593,
"learning_rate": 1.603068838279358e-06,
"loss": 0.2222,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2673448920249939,
"step": 7010
},
{
"epoch": 6.201413427561837,
"grad_norm": 0.6794725060462952,
"learning_rate": 1.5858314936707731e-06,
"loss": 0.2144,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26523715257644653,
"step": 7015
},
{
"epoch": 6.205830388692579,
"grad_norm": 0.6605962514877319,
"learning_rate": 1.5686834982930954e-06,
"loss": 0.2489,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24990308284759521,
"step": 7020
},
{
"epoch": 6.210247349823321,
"grad_norm": 0.6467421054840088,
"learning_rate": 1.551624935351832e-06,
"loss": 0.2235,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21919971704483032,
"step": 7025
},
{
"epoch": 6.214664310954063,
"grad_norm": 0.6687191724777222,
"learning_rate": 1.5346558876185459e-06,
"loss": 0.2176,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19604460895061493,
"step": 7030
},
{
"epoch": 6.219081272084805,
"grad_norm": 0.6332302689552307,
"learning_rate": 1.5177764374304493e-06,
"loss": 0.2437,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2982211112976074,
"step": 7035
},
{
"epoch": 6.2234982332155475,
"grad_norm": 0.6573311686515808,
"learning_rate": 1.500986666690012e-06,
"loss": 0.241,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2335302233695984,
"step": 7040
},
{
"epoch": 6.22791519434629,
"grad_norm": 0.6834275126457214,
"learning_rate": 1.4842866568645642e-06,
"loss": 0.2073,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2586025595664978,
"step": 7045
},
{
"epoch": 6.232332155477032,
"grad_norm": 0.6670604944229126,
"learning_rate": 1.4676764889858964e-06,
"loss": 0.2075,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15618737041950226,
"step": 7050
},
{
"epoch": 6.236749116607774,
"grad_norm": 0.6588166356086731,
"learning_rate": 1.4511562436498671e-06,
"loss": 0.2369,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2003191113471985,
"step": 7055
},
{
"epoch": 6.241166077738516,
"grad_norm": 0.6686882972717285,
"learning_rate": 1.4347260010160112e-06,
"loss": 0.2507,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3045847713947296,
"step": 7060
},
{
"epoch": 6.245583038869258,
"grad_norm": 0.8178129196166992,
"learning_rate": 1.418385840807157e-06,
"loss": 0.2332,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.307248055934906,
"step": 7065
},
{
"epoch": 6.25,
"grad_norm": 0.7019538283348083,
"learning_rate": 1.402135842309027e-06,
"loss": 0.2203,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20041900873184204,
"step": 7070
},
{
"epoch": 6.254416961130742,
"grad_norm": 0.6783828735351562,
"learning_rate": 1.3859760843698733e-06,
"loss": 0.2481,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20858903229236603,
"step": 7075
},
{
"epoch": 6.258833922261484,
"grad_norm": 0.7637086510658264,
"learning_rate": 1.3699066454000698e-06,
"loss": 0.2303,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2005981206893921,
"step": 7080
},
{
"epoch": 6.263250883392226,
"grad_norm": 0.7075777649879456,
"learning_rate": 1.353927603371754e-06,
"loss": 0.2606,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28827959299087524,
"step": 7085
},
{
"epoch": 6.267667844522968,
"grad_norm": 0.6434727311134338,
"learning_rate": 1.3380390358184324e-06,
"loss": 0.2137,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19321520626544952,
"step": 7090
},
{
"epoch": 6.27208480565371,
"grad_norm": 0.7264281511306763,
"learning_rate": 1.322241019834616e-06,
"loss": 0.2577,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30105140805244446,
"step": 7095
},
{
"epoch": 6.2765017667844525,
"grad_norm": 0.6986357569694519,
"learning_rate": 1.3065336320754418e-06,
"loss": 0.232,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2179412990808487,
"step": 7100
},
{
"epoch": 6.280918727915195,
"grad_norm": 0.6690407991409302,
"learning_rate": 1.2909169487562978e-06,
"loss": 0.2367,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27286529541015625,
"step": 7105
},
{
"epoch": 6.285335689045937,
"grad_norm": 0.6591483354568481,
"learning_rate": 1.2753910456524588e-06,
"loss": 0.2241,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22066080570220947,
"step": 7110
},
{
"epoch": 6.289752650176679,
"grad_norm": 0.825552225112915,
"learning_rate": 1.2599559980987076e-06,
"loss": 0.2708,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22613100707530975,
"step": 7115
},
{
"epoch": 6.294169611307421,
"grad_norm": 0.6642948985099792,
"learning_rate": 1.2446118809889906e-06,
"loss": 0.2394,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1839137226343155,
"step": 7120
},
{
"epoch": 6.298586572438163,
"grad_norm": 0.7041632533073425,
"learning_rate": 1.22935876877603e-06,
"loss": 0.2177,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19626665115356445,
"step": 7125
},
{
"epoch": 6.303003533568905,
"grad_norm": 0.7177551984786987,
"learning_rate": 1.214196735470985e-06,
"loss": 0.2486,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.276611328125,
"step": 7130
},
{
"epoch": 6.307420494699647,
"grad_norm": 0.6658399701118469,
"learning_rate": 1.1991258546430683e-06,
"loss": 0.2368,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27585369348526,
"step": 7135
},
{
"epoch": 6.311837455830389,
"grad_norm": 0.6692659258842468,
"learning_rate": 1.184146199419216e-06,
"loss": 0.2728,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2560042142868042,
"step": 7140
},
{
"epoch": 6.316254416961131,
"grad_norm": 0.6785102486610413,
"learning_rate": 1.1692578424837131e-06,
"loss": 0.232,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3469223976135254,
"step": 7145
},
{
"epoch": 6.320671378091872,
"grad_norm": 0.6885204911231995,
"learning_rate": 1.1544608560778392e-06,
"loss": 0.198,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1730516254901886,
"step": 7150
},
{
"epoch": 6.3250883392226145,
"grad_norm": 0.7053970694541931,
"learning_rate": 1.139755311999544e-06,
"loss": 0.2155,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22853153944015503,
"step": 7155
},
{
"epoch": 6.329505300353357,
"grad_norm": 0.6633711457252502,
"learning_rate": 1.1251412816030637e-06,
"loss": 0.2489,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18317949771881104,
"step": 7160
},
{
"epoch": 6.333922261484099,
"grad_norm": 0.6200980544090271,
"learning_rate": 1.1106188357986003e-06,
"loss": 0.2302,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28622856736183167,
"step": 7165
},
{
"epoch": 6.338339222614841,
"grad_norm": 0.7030071020126343,
"learning_rate": 1.096188045051969e-06,
"loss": 0.2349,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20836327970027924,
"step": 7170
},
{
"epoch": 6.342756183745583,
"grad_norm": 0.6545681953430176,
"learning_rate": 1.0818489793842523e-06,
"loss": 0.2205,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21316681802272797,
"step": 7175
},
{
"epoch": 6.347173144876325,
"grad_norm": 0.626930296421051,
"learning_rate": 1.0676017083714684e-06,
"loss": 0.2523,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33314627408981323,
"step": 7180
},
{
"epoch": 6.351590106007067,
"grad_norm": 0.6591570973396301,
"learning_rate": 1.0534463011442276e-06,
"loss": 0.2251,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2925190329551697,
"step": 7185
},
{
"epoch": 6.356007067137809,
"grad_norm": 0.6768887639045715,
"learning_rate": 1.0393828263873985e-06,
"loss": 0.2692,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20045700669288635,
"step": 7190
},
{
"epoch": 6.360424028268551,
"grad_norm": 0.6835484504699707,
"learning_rate": 1.0254113523397736e-06,
"loss": 0.2218,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1818992793560028,
"step": 7195
},
{
"epoch": 6.364840989399293,
"grad_norm": 0.6533926725387573,
"learning_rate": 1.0115319467937402e-06,
"loss": 0.242,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2500065565109253,
"step": 7200
},
{
"epoch": 6.369257950530035,
"grad_norm": 0.6944401860237122,
"learning_rate": 9.977446770949562e-07,
"loss": 0.2294,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23091307282447815,
"step": 7205
},
{
"epoch": 6.373674911660777,
"grad_norm": 0.6814969778060913,
"learning_rate": 9.840496101420106e-07,
"loss": 0.2407,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20970004796981812,
"step": 7210
},
{
"epoch": 6.3780918727915195,
"grad_norm": 0.6713853478431702,
"learning_rate": 9.704468123861077e-07,
"loss": 0.2477,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24036778509616852,
"step": 7215
},
{
"epoch": 6.3825088339222615,
"grad_norm": 0.6768038868904114,
"learning_rate": 9.569363498307482e-07,
"loss": 0.239,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2124941051006317,
"step": 7220
},
{
"epoch": 6.386925795053004,
"grad_norm": 0.7332238554954529,
"learning_rate": 9.43518288031402e-07,
"loss": 0.2365,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25171586871147156,
"step": 7225
},
{
"epoch": 6.391342756183746,
"grad_norm": 0.7168375849723816,
"learning_rate": 9.301926920951798e-07,
"loss": 0.192,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18527813255786896,
"step": 7230
},
{
"epoch": 6.395759717314488,
"grad_norm": 0.7241307497024536,
"learning_rate": 9.169596266805536e-07,
"loss": 0.2508,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25133055448532104,
"step": 7235
},
{
"epoch": 6.40017667844523,
"grad_norm": 0.6914727091789246,
"learning_rate": 9.038191559969967e-07,
"loss": 0.2597,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2576597332954407,
"step": 7240
},
{
"epoch": 6.404593639575972,
"grad_norm": 0.6564986705780029,
"learning_rate": 8.907713438047039e-07,
"loss": 0.2343,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28039878606796265,
"step": 7245
},
{
"epoch": 6.409010600706714,
"grad_norm": 0.690380334854126,
"learning_rate": 8.77816253414272e-07,
"loss": 0.2845,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19468779861927032,
"step": 7250
},
{
"epoch": 6.413427561837456,
"grad_norm": 0.6827090978622437,
"learning_rate": 8.649539476863933e-07,
"loss": 0.1907,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1751917153596878,
"step": 7255
},
{
"epoch": 6.417844522968198,
"grad_norm": 0.7905651926994324,
"learning_rate": 8.521844890315489e-07,
"loss": 0.2369,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22601011395454407,
"step": 7260
},
{
"epoch": 6.42226148409894,
"grad_norm": 0.7389162182807922,
"learning_rate": 8.395079394097072e-07,
"loss": 0.2287,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18410909175872803,
"step": 7265
},
{
"epoch": 6.426678445229682,
"grad_norm": 0.7305995225906372,
"learning_rate": 8.269243603300259e-07,
"loss": 0.2387,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22303178906440735,
"step": 7270
},
{
"epoch": 6.431095406360424,
"grad_norm": 0.7176210284233093,
"learning_rate": 8.144338128505458e-07,
"loss": 0.2216,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2397802472114563,
"step": 7275
},
{
"epoch": 6.435512367491166,
"grad_norm": 0.6935552954673767,
"learning_rate": 8.020363575779044e-07,
"loss": 0.2516,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2560634911060333,
"step": 7280
},
{
"epoch": 6.439929328621908,
"grad_norm": 0.6029765009880066,
"learning_rate": 7.897320546670362e-07,
"loss": 0.2133,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1827746331691742,
"step": 7285
},
{
"epoch": 6.44434628975265,
"grad_norm": 0.6512175798416138,
"learning_rate": 7.775209638208814e-07,
"loss": 0.2498,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3437669277191162,
"step": 7290
},
{
"epoch": 6.448763250883392,
"grad_norm": 0.8887504935264587,
"learning_rate": 7.654031442900978e-07,
"loss": 0.1744,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12437689304351807,
"step": 7295
},
{
"epoch": 6.453180212014134,
"grad_norm": 0.776211678981781,
"learning_rate": 7.533786548727695e-07,
"loss": 0.2215,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26376765966415405,
"step": 7300
},
{
"epoch": 6.457597173144876,
"grad_norm": 0.6631959080696106,
"learning_rate": 7.414475539141275e-07,
"loss": 0.2707,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23870348930358887,
"step": 7305
},
{
"epoch": 6.462014134275618,
"grad_norm": 0.6867117285728455,
"learning_rate": 7.296098993062562e-07,
"loss": 0.1844,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15025901794433594,
"step": 7310
},
{
"epoch": 6.46643109540636,
"grad_norm": 0.6691762208938599,
"learning_rate": 7.178657484878338e-07,
"loss": 0.257,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25004538893699646,
"step": 7315
},
{
"epoch": 6.470848056537102,
"grad_norm": 0.5934985876083374,
"learning_rate": 7.062151584438215e-07,
"loss": 0.2236,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2773967385292053,
"step": 7320
},
{
"epoch": 6.475265017667844,
"grad_norm": 0.6796112656593323,
"learning_rate": 6.946581857052192e-07,
"loss": 0.2281,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2649118900299072,
"step": 7325
},
{
"epoch": 6.479681978798586,
"grad_norm": 0.6687401533126831,
"learning_rate": 6.831948863487703e-07,
"loss": 0.2464,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24808388948440552,
"step": 7330
},
{
"epoch": 6.4840989399293285,
"grad_norm": 0.7123255133628845,
"learning_rate": 6.71825315996697e-07,
"loss": 0.225,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28177204728126526,
"step": 7335
},
{
"epoch": 6.488515901060071,
"grad_norm": 0.6031168103218079,
"learning_rate": 6.605495298164299e-07,
"loss": 0.2491,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21055057644844055,
"step": 7340
},
{
"epoch": 6.492932862190813,
"grad_norm": 0.5988543033599854,
"learning_rate": 6.493675825203416e-07,
"loss": 0.1915,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18854458630084991,
"step": 7345
},
{
"epoch": 6.497349823321555,
"grad_norm": 0.6946279406547546,
"learning_rate": 6.382795283654796e-07,
"loss": 0.2077,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2246820479631424,
"step": 7350
},
{
"epoch": 6.501766784452297,
"grad_norm": 0.7605702877044678,
"learning_rate": 6.272854211532964e-07,
"loss": 0.2322,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16712304949760437,
"step": 7355
},
{
"epoch": 6.506183745583039,
"grad_norm": 0.7368548512458801,
"learning_rate": 6.163853142294041e-07,
"loss": 0.2214,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1958334743976593,
"step": 7360
},
{
"epoch": 6.510600706713781,
"grad_norm": 0.6829209923744202,
"learning_rate": 6.055792604833022e-07,
"loss": 0.2615,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2502579689025879,
"step": 7365
},
{
"epoch": 6.515017667844523,
"grad_norm": 0.6418865323066711,
"learning_rate": 5.948673123481286e-07,
"loss": 0.249,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24650785326957703,
"step": 7370
},
{
"epoch": 6.519434628975265,
"grad_norm": 0.6992244124412537,
"learning_rate": 5.842495218003952e-07,
"loss": 0.2199,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25637686252593994,
"step": 7375
},
{
"epoch": 6.523851590106007,
"grad_norm": 0.6708112955093384,
"learning_rate": 5.737259403597484e-07,
"loss": 0.2334,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.274458646774292,
"step": 7380
},
{
"epoch": 6.528268551236749,
"grad_norm": 0.6802352070808411,
"learning_rate": 5.632966190887157e-07,
"loss": 0.2038,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2165306955575943,
"step": 7385
},
{
"epoch": 6.532685512367491,
"grad_norm": 0.6572979688644409,
"learning_rate": 5.529616085924439e-07,
"loss": 0.233,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2785220146179199,
"step": 7390
},
{
"epoch": 6.5371024734982335,
"grad_norm": 0.702512264251709,
"learning_rate": 5.42720959018479e-07,
"loss": 0.268,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32956579327583313,
"step": 7395
},
{
"epoch": 6.541519434628976,
"grad_norm": 0.6240583658218384,
"learning_rate": 5.325747200564979e-07,
"loss": 0.2227,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28135302662849426,
"step": 7400
},
{
"epoch": 6.545936395759718,
"grad_norm": 0.7530537247657776,
"learning_rate": 5.225229409380839e-07,
"loss": 0.2127,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2570284605026245,
"step": 7405
},
{
"epoch": 6.55035335689046,
"grad_norm": 0.6967406868934631,
"learning_rate": 5.125656704364801e-07,
"loss": 0.2077,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1789890080690384,
"step": 7410
},
{
"epoch": 6.554770318021202,
"grad_norm": 0.6931060552597046,
"learning_rate": 5.027029568663566e-07,
"loss": 0.2762,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2260131686925888,
"step": 7415
},
{
"epoch": 6.559187279151944,
"grad_norm": 0.7548974752426147,
"learning_rate": 4.929348480835749e-07,
"loss": 0.212,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18322241306304932,
"step": 7420
},
{
"epoch": 6.563604240282686,
"grad_norm": 0.6624125242233276,
"learning_rate": 4.832613914849504e-07,
"loss": 0.2257,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20556357502937317,
"step": 7425
},
{
"epoch": 6.568021201413428,
"grad_norm": 0.7263450622558594,
"learning_rate": 4.7368263400803693e-07,
"loss": 0.2179,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22019252181053162,
"step": 7430
},
{
"epoch": 6.572438162544169,
"grad_norm": 0.6411981582641602,
"learning_rate": 4.6419862213087365e-07,
"loss": 0.221,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2418275624513626,
"step": 7435
},
{
"epoch": 6.576855123674911,
"grad_norm": 0.652529776096344,
"learning_rate": 4.548094018717919e-07,
"loss": 0.2338,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24279262125492096,
"step": 7440
},
{
"epoch": 6.581272084805653,
"grad_norm": 0.7010001540184021,
"learning_rate": 4.4551501878916214e-07,
"loss": 0.2489,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.38378775119781494,
"step": 7445
},
{
"epoch": 6.5856890459363955,
"grad_norm": 0.6368486881256104,
"learning_rate": 4.363155179811962e-07,
"loss": 0.1945,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18857476115226746,
"step": 7450
},
{
"epoch": 6.590106007067138,
"grad_norm": 0.7825067043304443,
"learning_rate": 4.2721094408570974e-07,
"loss": 0.2113,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2131972312927246,
"step": 7455
},
{
"epoch": 6.59452296819788,
"grad_norm": 0.7557323575019836,
"learning_rate": 4.1820134127991794e-07,
"loss": 0.2024,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18626517057418823,
"step": 7460
},
{
"epoch": 6.598939929328622,
"grad_norm": 0.6673694849014282,
"learning_rate": 4.0928675328022027e-07,
"loss": 0.2569,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25416773557662964,
"step": 7465
},
{
"epoch": 6.603356890459364,
"grad_norm": 0.7011893391609192,
"learning_rate": 4.0046722334197375e-07,
"loss": 0.2523,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3195936977863312,
"step": 7470
},
{
"epoch": 6.607773851590106,
"grad_norm": 0.8319687247276306,
"learning_rate": 3.9174279425931105e-07,
"loss": 0.2493,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3487233519554138,
"step": 7475
},
{
"epoch": 6.612190812720848,
"grad_norm": 0.7716171145439148,
"learning_rate": 3.8311350836490514e-07,
"loss": 0.2072,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17440339922904968,
"step": 7480
},
{
"epoch": 6.61660777385159,
"grad_norm": 0.7053916454315186,
"learning_rate": 3.7457940752977594e-07,
"loss": 0.2245,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21849893033504486,
"step": 7485
},
{
"epoch": 6.621024734982332,
"grad_norm": 0.7948006987571716,
"learning_rate": 3.6614053316309074e-07,
"loss": 0.2484,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26136571168899536,
"step": 7490
},
{
"epoch": 6.625441696113074,
"grad_norm": 0.6417169570922852,
"learning_rate": 3.577969262119574e-07,
"loss": 0.2471,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26517075300216675,
"step": 7495
},
{
"epoch": 6.629858657243816,
"grad_norm": 0.7573156952857971,
"learning_rate": 3.4954862716122473e-07,
"loss": 0.245,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16177725791931152,
"step": 7500
},
{
"epoch": 6.634275618374558,
"grad_norm": 0.6673710942268372,
"learning_rate": 3.413956760332937e-07,
"loss": 0.2486,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28934532403945923,
"step": 7505
},
{
"epoch": 6.6386925795053005,
"grad_norm": 0.7154965400695801,
"learning_rate": 3.3333811238791316e-07,
"loss": 0.2397,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19894251227378845,
"step": 7510
},
{
"epoch": 6.6431095406360425,
"grad_norm": 0.8006401658058167,
"learning_rate": 3.2537597532199315e-07,
"loss": 0.2319,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23368564248085022,
"step": 7515
},
{
"epoch": 6.647526501766785,
"grad_norm": 0.6231130361557007,
"learning_rate": 3.175093034694188e-07,
"loss": 0.2462,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19901973009109497,
"step": 7520
},
{
"epoch": 6.651943462897527,
"grad_norm": 0.6645359396934509,
"learning_rate": 3.0973813500085215e-07,
"loss": 0.2524,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23790386319160461,
"step": 7525
},
{
"epoch": 6.656360424028269,
"grad_norm": 0.7502444386482239,
"learning_rate": 3.0206250762356393e-07,
"loss": 0.2229,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22129979729652405,
"step": 7530
},
{
"epoch": 6.660777385159011,
"grad_norm": 0.6512811183929443,
"learning_rate": 2.944824585812289e-07,
"loss": 0.253,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24692480266094208,
"step": 7535
},
{
"epoch": 6.665194346289753,
"grad_norm": 0.6338093876838684,
"learning_rate": 2.86998024653764e-07,
"loss": 0.2605,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21317501366138458,
"step": 7540
},
{
"epoch": 6.669611307420495,
"grad_norm": 0.7195187211036682,
"learning_rate": 2.7960924215714394e-07,
"loss": 0.2675,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21621374785900116,
"step": 7545
},
{
"epoch": 6.674028268551237,
"grad_norm": 0.750506579875946,
"learning_rate": 2.723161469432123e-07,
"loss": 0.1965,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15070626139640808,
"step": 7550
},
{
"epoch": 6.678445229681979,
"grad_norm": 0.6777002811431885,
"learning_rate": 2.6511877439953536e-07,
"loss": 0.2459,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2585234045982361,
"step": 7555
},
{
"epoch": 6.68286219081272,
"grad_norm": 0.6883443593978882,
"learning_rate": 2.5801715944919983e-07,
"loss": 0.2413,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20979022979736328,
"step": 7560
},
{
"epoch": 6.6872791519434625,
"grad_norm": 1.0622401237487793,
"learning_rate": 2.510113365506639e-07,
"loss": 0.2139,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2046893984079361,
"step": 7565
},
{
"epoch": 6.6916961130742045,
"grad_norm": 0.6978748440742493,
"learning_rate": 2.441013396975822e-07,
"loss": 0.2604,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24113841354846954,
"step": 7570
},
{
"epoch": 6.696113074204947,
"grad_norm": 0.7417298555374146,
"learning_rate": 2.3728720241864123e-07,
"loss": 0.2582,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2803192436695099,
"step": 7575
},
{
"epoch": 6.700530035335689,
"grad_norm": 0.6066814661026001,
"learning_rate": 2.3056895777740174e-07,
"loss": 0.2462,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2779116630554199,
"step": 7580
},
{
"epoch": 6.704946996466431,
"grad_norm": 0.7119974493980408,
"learning_rate": 2.2394663837213005e-07,
"loss": 0.226,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18332818150520325,
"step": 7585
},
{
"epoch": 6.709363957597173,
"grad_norm": 0.6575401425361633,
"learning_rate": 2.1742027633564477e-07,
"loss": 0.2058,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22709758579730988,
"step": 7590
},
{
"epoch": 6.713780918727915,
"grad_norm": 0.6911765933036804,
"learning_rate": 2.1098990333516144e-07,
"loss": 0.2245,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1879327893257141,
"step": 7595
},
{
"epoch": 6.718197879858657,
"grad_norm": 0.6780998110771179,
"learning_rate": 2.0465555057213705e-07,
"loss": 0.2762,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2876776456832886,
"step": 7600
},
{
"epoch": 6.722614840989399,
"grad_norm": 0.6587400436401367,
"learning_rate": 1.9841724878211676e-07,
"loss": 0.2409,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2115318775177002,
"step": 7605
},
{
"epoch": 6.727031802120141,
"grad_norm": 0.7173461318016052,
"learning_rate": 1.9227502823458976e-07,
"loss": 0.2223,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23369640111923218,
"step": 7610
},
{
"epoch": 6.731448763250883,
"grad_norm": 1.5791888236999512,
"learning_rate": 1.8622891873284254e-07,
"loss": 0.2322,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1805000901222229,
"step": 7615
},
{
"epoch": 6.735865724381625,
"grad_norm": 0.720446765422821,
"learning_rate": 1.8027894961380353e-07,
"loss": 0.237,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25251534581184387,
"step": 7620
},
{
"epoch": 6.740282685512367,
"grad_norm": 0.8006609082221985,
"learning_rate": 1.7442514974792103e-07,
"loss": 0.2704,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23466309905052185,
"step": 7625
},
{
"epoch": 6.7446996466431095,
"grad_norm": 0.6387749910354614,
"learning_rate": 1.6866754753899429e-07,
"loss": 0.2293,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29380500316619873,
"step": 7630
},
{
"epoch": 6.749116607773852,
"grad_norm": 0.6837402582168579,
"learning_rate": 1.6300617092406933e-07,
"loss": 0.2852,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24347926676273346,
"step": 7635
},
{
"epoch": 6.753533568904594,
"grad_norm": 0.6359254717826843,
"learning_rate": 1.5744104737327458e-07,
"loss": 0.2428,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2921152710914612,
"step": 7640
},
{
"epoch": 6.757950530035336,
"grad_norm": 0.7265500426292419,
"learning_rate": 1.5197220388970313e-07,
"loss": 0.2053,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1532514989376068,
"step": 7645
},
{
"epoch": 6.762367491166078,
"grad_norm": 0.6616162061691284,
"learning_rate": 1.4659966700927952e-07,
"loss": 0.2316,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23041373491287231,
"step": 7650
},
{
"epoch": 6.76678445229682,
"grad_norm": 0.6630392670631409,
"learning_rate": 1.413234628006288e-07,
"loss": 0.2372,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2294270396232605,
"step": 7655
},
{
"epoch": 6.771201413427562,
"grad_norm": 0.6096079349517822,
"learning_rate": 1.3614361686494549e-07,
"loss": 0.2375,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2563217282295227,
"step": 7660
},
{
"epoch": 6.775618374558304,
"grad_norm": 0.6756962537765503,
"learning_rate": 1.310601543358847e-07,
"loss": 0.2402,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18047744035720825,
"step": 7665
},
{
"epoch": 6.780035335689046,
"grad_norm": 0.855846107006073,
"learning_rate": 1.260730998794202e-07,
"loss": 0.2261,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21389582753181458,
"step": 7670
},
{
"epoch": 6.784452296819788,
"grad_norm": 0.64863520860672,
"learning_rate": 1.2118247769373758e-07,
"loss": 0.2558,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3245598375797272,
"step": 7675
},
{
"epoch": 6.78886925795053,
"grad_norm": 0.6336214542388916,
"learning_rate": 1.163883115091169e-07,
"loss": 0.2173,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2048630714416504,
"step": 7680
},
{
"epoch": 6.793286219081272,
"grad_norm": 0.7539443373680115,
"learning_rate": 1.1169062458781022e-07,
"loss": 0.2131,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18259122967720032,
"step": 7685
},
{
"epoch": 6.7977031802120145,
"grad_norm": 0.7678613662719727,
"learning_rate": 1.0708943972393748e-07,
"loss": 0.2261,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23812338709831238,
"step": 7690
},
{
"epoch": 6.8021201413427566,
"grad_norm": 0.6185624003410339,
"learning_rate": 1.025847792433643e-07,
"loss": 0.2611,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2555660903453827,
"step": 7695
},
{
"epoch": 6.806537102473499,
"grad_norm": 0.6969472169876099,
"learning_rate": 9.817666500360867e-08,
"loss": 0.2362,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20245115458965302,
"step": 7700
},
{
"epoch": 6.810954063604241,
"grad_norm": 0.678084135055542,
"learning_rate": 9.386511839372114e-08,
"loss": 0.2119,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21555975079536438,
"step": 7705
},
{
"epoch": 6.815371024734983,
"grad_norm": 0.6462898850440979,
"learning_rate": 8.965016033418705e-08,
"loss": 0.2257,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20813468098640442,
"step": 7710
},
{
"epoch": 6.819787985865725,
"grad_norm": 0.6597393155097961,
"learning_rate": 8.553181127683108e-08,
"loss": 0.2653,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23082324862480164,
"step": 7715
},
{
"epoch": 6.824204946996466,
"grad_norm": 0.6514454483985901,
"learning_rate": 8.1510091204704e-08,
"loss": 0.2372,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23393850028514862,
"step": 7720
},
{
"epoch": 6.828621908127208,
"grad_norm": 0.7454639077186584,
"learning_rate": 7.758501963199605e-08,
"loss": 0.2673,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26989006996154785,
"step": 7725
},
{
"epoch": 6.83303886925795,
"grad_norm": 0.6669009327888489,
"learning_rate": 7.375661560394154e-08,
"loss": 0.2844,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2976769208908081,
"step": 7730
},
{
"epoch": 6.837455830388692,
"grad_norm": 0.7060198783874512,
"learning_rate": 7.002489769672105e-08,
"loss": 0.2247,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.265421986579895,
"step": 7735
},
{
"epoch": 6.841872791519434,
"grad_norm": 0.7038555145263672,
"learning_rate": 6.638988401737933e-08,
"loss": 0.2383,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2049991488456726,
"step": 7740
},
{
"epoch": 6.8462897526501765,
"grad_norm": 0.7277244925498962,
"learning_rate": 6.285159220372982e-08,
"loss": 0.2929,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20731091499328613,
"step": 7745
},
{
"epoch": 6.8507067137809186,
"grad_norm": 0.6668598651885986,
"learning_rate": 5.941003942427026e-08,
"loss": 0.2092,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23843859136104584,
"step": 7750
},
{
"epoch": 6.855123674911661,
"grad_norm": 0.6584317684173584,
"learning_rate": 5.6065242378104957e-08,
"loss": 0.2357,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24361775815486908,
"step": 7755
},
{
"epoch": 6.859540636042403,
"grad_norm": 0.6468645334243774,
"learning_rate": 5.281721729486044e-08,
"loss": 0.2196,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2681673467159271,
"step": 7760
},
{
"epoch": 6.863957597173145,
"grad_norm": 0.5975419282913208,
"learning_rate": 4.966597993460109e-08,
"loss": 0.2458,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2951905131340027,
"step": 7765
},
{
"epoch": 6.868374558303887,
"grad_norm": 0.7025241255760193,
"learning_rate": 4.6611545587762486e-08,
"loss": 0.1947,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22322210669517517,
"step": 7770
},
{
"epoch": 6.872791519434629,
"grad_norm": 0.7678356170654297,
"learning_rate": 4.365392907507149e-08,
"loss": 0.1886,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15035316348075867,
"step": 7775
},
{
"epoch": 6.877208480565371,
"grad_norm": 0.6374990344047546,
"learning_rate": 4.079314474747742e-08,
"loss": 0.2243,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24768120050430298,
"step": 7780
},
{
"epoch": 6.881625441696113,
"grad_norm": 0.6044369339942932,
"learning_rate": 3.802920648607433e-08,
"loss": 0.2157,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22181479632854462,
"step": 7785
},
{
"epoch": 6.886042402826855,
"grad_norm": 0.6744352579116821,
"learning_rate": 3.536212770204772e-08,
"loss": 0.2375,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29637134075164795,
"step": 7790
},
{
"epoch": 6.890459363957597,
"grad_norm": 0.6962430477142334,
"learning_rate": 3.279192133659459e-08,
"loss": 0.2164,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2750842869281769,
"step": 7795
},
{
"epoch": 6.894876325088339,
"grad_norm": 0.6851283311843872,
"learning_rate": 3.0318599860872377e-08,
"loss": 0.2213,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29073143005371094,
"step": 7800
},
{
"epoch": 6.8992932862190814,
"grad_norm": 0.7150095701217651,
"learning_rate": 2.7942175275932347e-08,
"loss": 0.2358,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22673749923706055,
"step": 7805
},
{
"epoch": 6.9037102473498235,
"grad_norm": 0.7254000306129456,
"learning_rate": 2.5662659112659637e-08,
"loss": 0.2383,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2616110146045685,
"step": 7810
},
{
"epoch": 6.908127208480566,
"grad_norm": 0.613750696182251,
"learning_rate": 2.3480062431724404e-08,
"loss": 0.2359,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17015941441059113,
"step": 7815
},
{
"epoch": 6.912544169611308,
"grad_norm": 0.6614463925361633,
"learning_rate": 2.1394395823524093e-08,
"loss": 0.198,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.182722270488739,
"step": 7820
},
{
"epoch": 6.91696113074205,
"grad_norm": 0.7142705917358398,
"learning_rate": 1.9405669408127935e-08,
"loss": 0.2227,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23320986330509186,
"step": 7825
},
{
"epoch": 6.921378091872792,
"grad_norm": 0.701187252998352,
"learning_rate": 1.7513892835236967e-08,
"loss": 0.2251,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23433920741081238,
"step": 7830
},
{
"epoch": 6.925795053003534,
"grad_norm": 0.6869240999221802,
"learning_rate": 1.5719075284126307e-08,
"loss": 0.2325,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24257785081863403,
"step": 7835
},
{
"epoch": 6.930212014134275,
"grad_norm": 0.656295657157898,
"learning_rate": 1.4021225463614063e-08,
"loss": 0.2759,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3004852533340454,
"step": 7840
},
{
"epoch": 6.934628975265017,
"grad_norm": 0.6697101593017578,
"learning_rate": 1.2420351612003611e-08,
"loss": 0.2478,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24285230040550232,
"step": 7845
},
{
"epoch": 6.939045936395759,
"grad_norm": 0.6110602617263794,
"learning_rate": 1.0916461497059161e-08,
"loss": 0.1851,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1842736005783081,
"step": 7850
},
{
"epoch": 6.943462897526501,
"grad_norm": 0.7281805276870728,
"learning_rate": 9.509562415952468e-09,
"loss": 0.2303,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19850802421569824,
"step": 7855
},
{
"epoch": 6.9478798586572434,
"grad_norm": 0.6979923248291016,
"learning_rate": 8.199661195240626e-09,
"loss": 0.2786,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3593960404396057,
"step": 7860
},
{
"epoch": 6.9522968197879855,
"grad_norm": 0.7403237223625183,
"learning_rate": 6.9867641908305524e-09,
"loss": 0.2273,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32437562942504883,
"step": 7865
},
{
"epoch": 6.956713780918728,
"grad_norm": 0.6508459448814392,
"learning_rate": 5.870877287934562e-09,
"loss": 0.2254,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27816981077194214,
"step": 7870
},
{
"epoch": 6.96113074204947,
"grad_norm": 0.8912989497184753,
"learning_rate": 4.852005901063717e-09,
"loss": 0.207,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17091882228851318,
"step": 7875
},
{
"epoch": 6.965547703180212,
"grad_norm": 0.6817123889923096,
"learning_rate": 3.930154973985634e-09,
"loss": 0.2314,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20704488456249237,
"step": 7880
},
{
"epoch": 6.969964664310954,
"grad_norm": 0.6807764172554016,
"learning_rate": 3.1053289797022825e-09,
"loss": 0.2361,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19280683994293213,
"step": 7885
},
{
"epoch": 6.974381625441696,
"grad_norm": 0.7215080261230469,
"learning_rate": 2.37753192043888e-09,
"loss": 0.2094,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20111405849456787,
"step": 7890
},
{
"epoch": 6.978798586572438,
"grad_norm": 0.7553413510322571,
"learning_rate": 1.746767327610588e-09,
"loss": 0.2498,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21126879751682281,
"step": 7895
},
{
"epoch": 6.98321554770318,
"grad_norm": 0.6956353187561035,
"learning_rate": 1.2130382618114057e-09,
"loss": 0.2267,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26631930470466614,
"step": 7900
},
{
"epoch": 6.987632508833922,
"grad_norm": 0.9583781361579895,
"learning_rate": 7.763473128052923e-10,
"loss": 0.2211,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20020142197608948,
"step": 7905
},
{
"epoch": 6.992049469964664,
"grad_norm": 0.7983410358428955,
"learning_rate": 4.366965994995198e-10,
"loss": 0.2046,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17487777769565582,
"step": 7910
},
{
"epoch": 6.996466431095406,
"grad_norm": 0.7559086084365845,
"learning_rate": 1.9408776995355483e-10,
"loss": 0.1849,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1718912273645401,
"step": 7915
},
{
"epoch": 6.999116607773852,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2685817778110504,
"step": 7918,
"total_flos": 7677204649476096.0,
"train_loss": 0.2887290850240313,
"train_runtime": 73070.9484,
"train_samples_per_second": 0.108,
"train_steps_per_second": 0.108
}
],
"logging_steps": 5,
"max_steps": 7924,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 1500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7677204649476096.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}