Files
glm46-swesmith-maxeps-131k-lc/trainer_state.json
ModelHub XC 826bd6ab23 初始化项目,由ModelHub XC社区提供模型
Model: laion/glm46-swesmith-maxeps-131k-lc
Source: Original Platform
2026-05-26 11:24:19 +08:00

24076 lines
637 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.999475890985325,
"eval_steps": 500,
"global_step": 13350,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002620545073375262,
"grad_norm": 11.724857330322266,
"learning_rate": 1.197604790419162e-07,
"loss": 0.6087,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.6668875217437744,
"step": 5
},
{
"epoch": 0.005241090146750524,
"grad_norm": 14.185229301452637,
"learning_rate": 2.694610778443114e-07,
"loss": 0.6416,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.6689453125,
"step": 10
},
{
"epoch": 0.007861635220125786,
"grad_norm": 14.250484466552734,
"learning_rate": 4.191616766467066e-07,
"loss": 0.6312,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.6533203125,
"step": 15
},
{
"epoch": 0.010482180293501049,
"grad_norm": 12.26307487487793,
"learning_rate": 5.688622754491019e-07,
"loss": 0.6082,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5997869968414307,
"step": 20
},
{
"epoch": 0.01310272536687631,
"grad_norm": 11.114072799682617,
"learning_rate": 7.18562874251497e-07,
"loss": 0.6011,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5580273866653442,
"step": 25
},
{
"epoch": 0.015723270440251572,
"grad_norm": 10.276555061340332,
"learning_rate": 8.682634730538923e-07,
"loss": 0.6239,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5910165309906006,
"step": 30
},
{
"epoch": 0.018343815513626835,
"grad_norm": 7.9180121421813965,
"learning_rate": 1.0179640718562875e-06,
"loss": 0.6123,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5606018900871277,
"step": 35
},
{
"epoch": 0.020964360587002098,
"grad_norm": 7.015914440155029,
"learning_rate": 1.1676646706586827e-06,
"loss": 0.5562,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.572616457939148,
"step": 40
},
{
"epoch": 0.02358490566037736,
"grad_norm": 6.32131290435791,
"learning_rate": 1.3173652694610781e-06,
"loss": 0.5463,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5819799900054932,
"step": 45
},
{
"epoch": 0.02620545073375262,
"grad_norm": 4.320750713348389,
"learning_rate": 1.4670658682634732e-06,
"loss": 0.5135,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.44384765625,
"step": 50
},
{
"epoch": 0.028825995807127882,
"grad_norm": 3.183272361755371,
"learning_rate": 1.6167664670658684e-06,
"loss": 0.5003,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5493164658546448,
"step": 55
},
{
"epoch": 0.031446540880503145,
"grad_norm": 2.9379472732543945,
"learning_rate": 1.7664670658682636e-06,
"loss": 0.5153,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.5202266573905945,
"step": 60
},
{
"epoch": 0.034067085953878404,
"grad_norm": 2.233424425125122,
"learning_rate": 1.916167664670659e-06,
"loss": 0.4874,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.48310330510139465,
"step": 65
},
{
"epoch": 0.03668763102725367,
"grad_norm": 1.9633489847183228,
"learning_rate": 2.065868263473054e-06,
"loss": 0.462,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4763810634613037,
"step": 70
},
{
"epoch": 0.03930817610062893,
"grad_norm": 2.0084543228149414,
"learning_rate": 2.215568862275449e-06,
"loss": 0.4679,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4606221914291382,
"step": 75
},
{
"epoch": 0.041928721174004195,
"grad_norm": 1.543745517730713,
"learning_rate": 2.3652694610778446e-06,
"loss": 0.4551,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4795137643814087,
"step": 80
},
{
"epoch": 0.044549266247379454,
"grad_norm": 1.4478498697280884,
"learning_rate": 2.5149700598802396e-06,
"loss": 0.4271,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4189101457595825,
"step": 85
},
{
"epoch": 0.04716981132075472,
"grad_norm": 1.2258864641189575,
"learning_rate": 2.664670658682635e-06,
"loss": 0.3946,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3954263925552368,
"step": 90
},
{
"epoch": 0.04979035639412998,
"grad_norm": 1.826331615447998,
"learning_rate": 2.81437125748503e-06,
"loss": 0.4298,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4531334638595581,
"step": 95
},
{
"epoch": 0.05241090146750524,
"grad_norm": 1.2551552057266235,
"learning_rate": 2.9640718562874255e-06,
"loss": 0.4199,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.360871821641922,
"step": 100
},
{
"epoch": 0.055031446540880505,
"grad_norm": 1.3189074993133545,
"learning_rate": 3.113772455089821e-06,
"loss": 0.4025,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.36634361743927,
"step": 105
},
{
"epoch": 0.057651991614255764,
"grad_norm": 1.0882036685943604,
"learning_rate": 3.263473053892216e-06,
"loss": 0.3986,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3869917094707489,
"step": 110
},
{
"epoch": 0.06027253668763103,
"grad_norm": 1.296404242515564,
"learning_rate": 3.4131736526946114e-06,
"loss": 0.4135,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4592534005641937,
"step": 115
},
{
"epoch": 0.06289308176100629,
"grad_norm": 1.1988800764083862,
"learning_rate": 3.562874251497006e-06,
"loss": 0.3877,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.358315646648407,
"step": 120
},
{
"epoch": 0.06551362683438156,
"grad_norm": 1.1711173057556152,
"learning_rate": 3.7125748502994014e-06,
"loss": 0.3901,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4149848222732544,
"step": 125
},
{
"epoch": 0.06813417190775681,
"grad_norm": 1.0720573663711548,
"learning_rate": 3.862275449101797e-06,
"loss": 0.3836,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.4089677929878235,
"step": 130
},
{
"epoch": 0.07075471698113207,
"grad_norm": 1.0855176448822021,
"learning_rate": 4.011976047904192e-06,
"loss": 0.3835,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.36083984375,
"step": 135
},
{
"epoch": 0.07337526205450734,
"grad_norm": 1.0493624210357666,
"learning_rate": 4.161676646706587e-06,
"loss": 0.3664,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3388671875,
"step": 140
},
{
"epoch": 0.0759958071278826,
"grad_norm": 1.1222777366638184,
"learning_rate": 4.311377245508982e-06,
"loss": 0.3917,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.37147727608680725,
"step": 145
},
{
"epoch": 0.07861635220125786,
"grad_norm": 0.912915825843811,
"learning_rate": 4.461077844311378e-06,
"loss": 0.3112,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2651228904724121,
"step": 150
},
{
"epoch": 0.08123689727463312,
"grad_norm": 0.7332404851913452,
"learning_rate": 4.610778443113773e-06,
"loss": 0.3405,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3047122061252594,
"step": 155
},
{
"epoch": 0.08385744234800839,
"grad_norm": 0.8747320175170898,
"learning_rate": 4.760479041916168e-06,
"loss": 0.3459,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.351220965385437,
"step": 160
},
{
"epoch": 0.08647798742138364,
"grad_norm": 1.0423736572265625,
"learning_rate": 4.910179640718563e-06,
"loss": 0.3364,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3234074115753174,
"step": 165
},
{
"epoch": 0.08909853249475891,
"grad_norm": 1.1128911972045898,
"learning_rate": 5.059880239520959e-06,
"loss": 0.341,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.37532955408096313,
"step": 170
},
{
"epoch": 0.09171907756813417,
"grad_norm": 0.9953895807266235,
"learning_rate": 5.209580838323353e-06,
"loss": 0.3598,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3447265625,
"step": 175
},
{
"epoch": 0.09433962264150944,
"grad_norm": 1.1297733783721924,
"learning_rate": 5.359281437125749e-06,
"loss": 0.333,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3620813190937042,
"step": 180
},
{
"epoch": 0.09696016771488469,
"grad_norm": 0.9839518070220947,
"learning_rate": 5.508982035928144e-06,
"loss": 0.3589,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3644392788410187,
"step": 185
},
{
"epoch": 0.09958071278825996,
"grad_norm": 0.9680199027061462,
"learning_rate": 5.658682634730539e-06,
"loss": 0.3303,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3452174961566925,
"step": 190
},
{
"epoch": 0.10220125786163523,
"grad_norm": 1.0094338655471802,
"learning_rate": 5.808383233532935e-06,
"loss": 0.3228,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3340649902820587,
"step": 195
},
{
"epoch": 0.10482180293501048,
"grad_norm": 0.9142029881477356,
"learning_rate": 5.95808383233533e-06,
"loss": 0.334,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2673777937889099,
"step": 200
},
{
"epoch": 0.10744234800838574,
"grad_norm": 2.28641939163208,
"learning_rate": 6.107784431137725e-06,
"loss": 0.3214,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3424881398677826,
"step": 205
},
{
"epoch": 0.11006289308176101,
"grad_norm": 1.3070180416107178,
"learning_rate": 6.25748502994012e-06,
"loss": 0.317,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33521905541419983,
"step": 210
},
{
"epoch": 0.11268343815513626,
"grad_norm": 1.0307612419128418,
"learning_rate": 6.407185628742516e-06,
"loss": 0.351,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33849790692329407,
"step": 215
},
{
"epoch": 0.11530398322851153,
"grad_norm": 0.7611124515533447,
"learning_rate": 6.556886227544911e-06,
"loss": 0.33,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31713446974754333,
"step": 220
},
{
"epoch": 0.1179245283018868,
"grad_norm": 1.1252394914627075,
"learning_rate": 6.706586826347305e-06,
"loss": 0.3151,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30296391248703003,
"step": 225
},
{
"epoch": 0.12054507337526206,
"grad_norm": 1.2389367818832397,
"learning_rate": 6.8562874251497016e-06,
"loss": 0.3268,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3277322053909302,
"step": 230
},
{
"epoch": 0.12316561844863731,
"grad_norm": 1.3844778537750244,
"learning_rate": 7.005988023952096e-06,
"loss": 0.3227,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28340739011764526,
"step": 235
},
{
"epoch": 0.12578616352201258,
"grad_norm": 1.3721177577972412,
"learning_rate": 7.155688622754492e-06,
"loss": 0.3369,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32868561148643494,
"step": 240
},
{
"epoch": 0.12840670859538783,
"grad_norm": 0.9161558747291565,
"learning_rate": 7.305389221556887e-06,
"loss": 0.3311,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3234233856201172,
"step": 245
},
{
"epoch": 0.1310272536687631,
"grad_norm": 0.9054167866706848,
"learning_rate": 7.4550898203592825e-06,
"loss": 0.3368,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.35056421160697937,
"step": 250
},
{
"epoch": 0.13364779874213836,
"grad_norm": 0.8199090957641602,
"learning_rate": 7.604790419161677e-06,
"loss": 0.3229,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3535444140434265,
"step": 255
},
{
"epoch": 0.13626834381551362,
"grad_norm": 0.8128832578659058,
"learning_rate": 7.754491017964072e-06,
"loss": 0.3265,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3213815689086914,
"step": 260
},
{
"epoch": 0.1388888888888889,
"grad_norm": 1.2015630006790161,
"learning_rate": 7.904191616766468e-06,
"loss": 0.3267,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3715681731700897,
"step": 265
},
{
"epoch": 0.14150943396226415,
"grad_norm": 0.9504061937332153,
"learning_rate": 8.053892215568863e-06,
"loss": 0.3377,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32142454385757446,
"step": 270
},
{
"epoch": 0.1441299790356394,
"grad_norm": 1.0159927606582642,
"learning_rate": 8.203592814371259e-06,
"loss": 0.3108,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3076171875,
"step": 275
},
{
"epoch": 0.14675052410901468,
"grad_norm": 0.9303966164588928,
"learning_rate": 8.353293413173653e-06,
"loss": 0.3086,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2712244391441345,
"step": 280
},
{
"epoch": 0.14937106918238993,
"grad_norm": 2.135909080505371,
"learning_rate": 8.50299401197605e-06,
"loss": 0.3045,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3287564814090729,
"step": 285
},
{
"epoch": 0.1519916142557652,
"grad_norm": 1.1756415367126465,
"learning_rate": 8.652694610778444e-06,
"loss": 0.3074,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3280114233493805,
"step": 290
},
{
"epoch": 0.15461215932914046,
"grad_norm": 0.899340808391571,
"learning_rate": 8.802395209580839e-06,
"loss": 0.3026,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2846500277519226,
"step": 295
},
{
"epoch": 0.15723270440251572,
"grad_norm": 0.9873744249343872,
"learning_rate": 8.952095808383234e-06,
"loss": 0.3079,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.33438584208488464,
"step": 300
},
{
"epoch": 0.159853249475891,
"grad_norm": 1.1683752536773682,
"learning_rate": 9.10179640718563e-06,
"loss": 0.2949,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.283935546875,
"step": 305
},
{
"epoch": 0.16247379454926625,
"grad_norm": 0.8605905175209045,
"learning_rate": 9.251497005988024e-06,
"loss": 0.3146,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3393379747867584,
"step": 310
},
{
"epoch": 0.1650943396226415,
"grad_norm": 1.0230729579925537,
"learning_rate": 9.401197604790419e-06,
"loss": 0.2774,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30908203125,
"step": 315
},
{
"epoch": 0.16771488469601678,
"grad_norm": 1.3808443546295166,
"learning_rate": 9.550898203592815e-06,
"loss": 0.2966,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2983994781970978,
"step": 320
},
{
"epoch": 0.17033542976939203,
"grad_norm": 1.0363587141036987,
"learning_rate": 9.70059880239521e-06,
"loss": 0.2853,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27548953890800476,
"step": 325
},
{
"epoch": 0.17295597484276728,
"grad_norm": 0.9016790390014648,
"learning_rate": 9.850299401197606e-06,
"loss": 0.3172,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2579153776168823,
"step": 330
},
{
"epoch": 0.17557651991614256,
"grad_norm": 1.2287031412124634,
"learning_rate": 1e-05,
"loss": 0.3169,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3329937756061554,
"step": 335
},
{
"epoch": 0.17819706498951782,
"grad_norm": 0.9567996263504028,
"learning_rate": 1.0149700598802397e-05,
"loss": 0.3022,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3393104672431946,
"step": 340
},
{
"epoch": 0.18081761006289307,
"grad_norm": 1.061110258102417,
"learning_rate": 1.029940119760479e-05,
"loss": 0.3101,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3320685923099518,
"step": 345
},
{
"epoch": 0.18343815513626835,
"grad_norm": 0.909108579158783,
"learning_rate": 1.0449101796407186e-05,
"loss": 0.281,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26276031136512756,
"step": 350
},
{
"epoch": 0.1860587002096436,
"grad_norm": 1.0113543272018433,
"learning_rate": 1.0598802395209583e-05,
"loss": 0.3055,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34771761298179626,
"step": 355
},
{
"epoch": 0.18867924528301888,
"grad_norm": 0.7303060293197632,
"learning_rate": 1.0748502994011977e-05,
"loss": 0.3203,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3523019850254059,
"step": 360
},
{
"epoch": 0.19129979035639413,
"grad_norm": 1.0004197359085083,
"learning_rate": 1.0898203592814372e-05,
"loss": 0.2992,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2729937434196472,
"step": 365
},
{
"epoch": 0.19392033542976939,
"grad_norm": 1.0297443866729736,
"learning_rate": 1.1047904191616768e-05,
"loss": 0.2786,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27099609375,
"step": 370
},
{
"epoch": 0.19654088050314467,
"grad_norm": 2.403956651687622,
"learning_rate": 1.1197604790419163e-05,
"loss": 0.2917,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.249267578125,
"step": 375
},
{
"epoch": 0.19916142557651992,
"grad_norm": 2.3001322746276855,
"learning_rate": 1.1347305389221557e-05,
"loss": 0.2929,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3072666525840759,
"step": 380
},
{
"epoch": 0.20178197064989517,
"grad_norm": 0.873443603515625,
"learning_rate": 1.1497005988023952e-05,
"loss": 0.2645,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27001953125,
"step": 385
},
{
"epoch": 0.20440251572327045,
"grad_norm": 1.0583703517913818,
"learning_rate": 1.1646706586826348e-05,
"loss": 0.2894,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.259521484375,
"step": 390
},
{
"epoch": 0.2070230607966457,
"grad_norm": 0.9356868267059326,
"learning_rate": 1.1796407185628744e-05,
"loss": 0.2728,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27055835723876953,
"step": 395
},
{
"epoch": 0.20964360587002095,
"grad_norm": 0.9089909195899963,
"learning_rate": 1.1946107784431137e-05,
"loss": 0.3039,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3303355276584625,
"step": 400
},
{
"epoch": 0.21226415094339623,
"grad_norm": 1.0864156484603882,
"learning_rate": 1.2095808383233534e-05,
"loss": 0.2842,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2611331045627594,
"step": 405
},
{
"epoch": 0.2148846960167715,
"grad_norm": 0.903243362903595,
"learning_rate": 1.224550898203593e-05,
"loss": 0.3116,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31614750623703003,
"step": 410
},
{
"epoch": 0.21750524109014674,
"grad_norm": 0.9702324271202087,
"learning_rate": 1.2395209580838323e-05,
"loss": 0.2772,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.278564453125,
"step": 415
},
{
"epoch": 0.22012578616352202,
"grad_norm": 0.8921561241149902,
"learning_rate": 1.2544910179640719e-05,
"loss": 0.282,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28161734342575073,
"step": 420
},
{
"epoch": 0.22274633123689727,
"grad_norm": 1.4527266025543213,
"learning_rate": 1.2694610778443115e-05,
"loss": 0.2858,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3099859654903412,
"step": 425
},
{
"epoch": 0.22536687631027252,
"grad_norm": 1.3680222034454346,
"learning_rate": 1.284431137724551e-05,
"loss": 0.3097,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3171539008617401,
"step": 430
},
{
"epoch": 0.2279874213836478,
"grad_norm": 1.5904136896133423,
"learning_rate": 1.2994011976047905e-05,
"loss": 0.2802,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23265689611434937,
"step": 435
},
{
"epoch": 0.23060796645702306,
"grad_norm": 0.776222825050354,
"learning_rate": 1.3143712574850301e-05,
"loss": 0.2874,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2671012878417969,
"step": 440
},
{
"epoch": 0.23322851153039834,
"grad_norm": 1.0297017097473145,
"learning_rate": 1.3293413173652696e-05,
"loss": 0.255,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24852225184440613,
"step": 445
},
{
"epoch": 0.2358490566037736,
"grad_norm": 0.9424745440483093,
"learning_rate": 1.3443113772455092e-05,
"loss": 0.2994,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3500230610370636,
"step": 450
},
{
"epoch": 0.23846960167714884,
"grad_norm": 1.6893078088760376,
"learning_rate": 1.3592814371257486e-05,
"loss": 0.2994,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28594040870666504,
"step": 455
},
{
"epoch": 0.24109014675052412,
"grad_norm": 0.9330493211746216,
"learning_rate": 1.3742514970059881e-05,
"loss": 0.2906,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.34921205043792725,
"step": 460
},
{
"epoch": 0.24371069182389937,
"grad_norm": 0.854236364364624,
"learning_rate": 1.3892215568862277e-05,
"loss": 0.2652,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27588099241256714,
"step": 465
},
{
"epoch": 0.24633123689727462,
"grad_norm": 1.0205166339874268,
"learning_rate": 1.404191616766467e-05,
"loss": 0.2785,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2535867989063263,
"step": 470
},
{
"epoch": 0.2489517819706499,
"grad_norm": 0.9875480532646179,
"learning_rate": 1.4191616766467067e-05,
"loss": 0.283,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.291537880897522,
"step": 475
},
{
"epoch": 0.25157232704402516,
"grad_norm": 0.8087443113327026,
"learning_rate": 1.4341317365269463e-05,
"loss": 0.2834,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31447193026542664,
"step": 480
},
{
"epoch": 0.25419287211740044,
"grad_norm": 0.8693152666091919,
"learning_rate": 1.4491017964071859e-05,
"loss": 0.2815,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28755509853363037,
"step": 485
},
{
"epoch": 0.25681341719077566,
"grad_norm": 0.8461484909057617,
"learning_rate": 1.4640718562874252e-05,
"loss": 0.279,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30435264110565186,
"step": 490
},
{
"epoch": 0.25943396226415094,
"grad_norm": 0.9464346170425415,
"learning_rate": 1.4790419161676648e-05,
"loss": 0.2957,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31394729018211365,
"step": 495
},
{
"epoch": 0.2620545073375262,
"grad_norm": 0.9506782293319702,
"learning_rate": 1.4940119760479045e-05,
"loss": 0.2702,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2537984848022461,
"step": 500
},
{
"epoch": 0.26467505241090145,
"grad_norm": 0.8543886542320251,
"learning_rate": 1.5089820359281437e-05,
"loss": 0.2885,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2579079866409302,
"step": 505
},
{
"epoch": 0.2672955974842767,
"grad_norm": 1.0954477787017822,
"learning_rate": 1.5239520958083834e-05,
"loss": 0.2722,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25832486152648926,
"step": 510
},
{
"epoch": 0.269916142557652,
"grad_norm": 1.5733797550201416,
"learning_rate": 1.538922155688623e-05,
"loss": 0.2545,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2169189453125,
"step": 515
},
{
"epoch": 0.27253668763102723,
"grad_norm": 0.9599918723106384,
"learning_rate": 1.5538922155688625e-05,
"loss": 0.269,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27587890625,
"step": 520
},
{
"epoch": 0.2751572327044025,
"grad_norm": 0.7465035319328308,
"learning_rate": 1.5688622754491018e-05,
"loss": 0.303,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30516883730888367,
"step": 525
},
{
"epoch": 0.2777777777777778,
"grad_norm": 1.8104451894760132,
"learning_rate": 1.5838323353293414e-05,
"loss": 0.2497,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2503080368041992,
"step": 530
},
{
"epoch": 0.280398322851153,
"grad_norm": 0.8634847402572632,
"learning_rate": 1.598802395209581e-05,
"loss": 0.2873,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30565452575683594,
"step": 535
},
{
"epoch": 0.2830188679245283,
"grad_norm": 1.3197232484817505,
"learning_rate": 1.6137724550898203e-05,
"loss": 0.2649,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2626909613609314,
"step": 540
},
{
"epoch": 0.2856394129979036,
"grad_norm": 1.1044285297393799,
"learning_rate": 1.62874251497006e-05,
"loss": 0.2716,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2565639019012451,
"step": 545
},
{
"epoch": 0.2882599580712788,
"grad_norm": 2.0272889137268066,
"learning_rate": 1.6437125748502996e-05,
"loss": 0.2676,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2064208984375,
"step": 550
},
{
"epoch": 0.2908805031446541,
"grad_norm": 1.0531353950500488,
"learning_rate": 1.6586826347305392e-05,
"loss": 0.2784,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.32692980766296387,
"step": 555
},
{
"epoch": 0.29350104821802936,
"grad_norm": 1.0479260683059692,
"learning_rate": 1.6736526946107785e-05,
"loss": 0.2805,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27001953125,
"step": 560
},
{
"epoch": 0.29612159329140464,
"grad_norm": 1.0311917066574097,
"learning_rate": 1.688622754491018e-05,
"loss": 0.271,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2805914878845215,
"step": 565
},
{
"epoch": 0.29874213836477986,
"grad_norm": 0.7885255217552185,
"learning_rate": 1.7035928143712577e-05,
"loss": 0.2821,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3129787743091583,
"step": 570
},
{
"epoch": 0.30136268343815514,
"grad_norm": 0.96186763048172,
"learning_rate": 1.718562874251497e-05,
"loss": 0.2695,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26282820105552673,
"step": 575
},
{
"epoch": 0.3039832285115304,
"grad_norm": 0.9567992687225342,
"learning_rate": 1.7335329341317367e-05,
"loss": 0.272,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27879998087882996,
"step": 580
},
{
"epoch": 0.30660377358490565,
"grad_norm": 1.8625792264938354,
"learning_rate": 1.7485029940119763e-05,
"loss": 0.2574,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29016339778900146,
"step": 585
},
{
"epoch": 0.30922431865828093,
"grad_norm": 1.0454092025756836,
"learning_rate": 1.763473053892216e-05,
"loss": 0.2617,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2546542286872864,
"step": 590
},
{
"epoch": 0.3118448637316562,
"grad_norm": 1.0812140703201294,
"learning_rate": 1.7784431137724552e-05,
"loss": 0.2361,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23095703125,
"step": 595
},
{
"epoch": 0.31446540880503143,
"grad_norm": 0.9106873273849487,
"learning_rate": 1.793413173652695e-05,
"loss": 0.2618,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25149843096733093,
"step": 600
},
{
"epoch": 0.3170859538784067,
"grad_norm": 0.7510280609130859,
"learning_rate": 1.8083832335329345e-05,
"loss": 0.2822,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25785890221595764,
"step": 605
},
{
"epoch": 0.319706498951782,
"grad_norm": 1.044481635093689,
"learning_rate": 1.8233532934131738e-05,
"loss": 0.2627,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2672847807407379,
"step": 610
},
{
"epoch": 0.3223270440251572,
"grad_norm": 0.9411810636520386,
"learning_rate": 1.8383233532934134e-05,
"loss": 0.2691,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30929991602897644,
"step": 615
},
{
"epoch": 0.3249475890985325,
"grad_norm": 1.0617573261260986,
"learning_rate": 1.853293413173653e-05,
"loss": 0.2738,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22705078125,
"step": 620
},
{
"epoch": 0.3275681341719078,
"grad_norm": 0.8805792331695557,
"learning_rate": 1.8682634730538923e-05,
"loss": 0.2639,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.239990234375,
"step": 625
},
{
"epoch": 0.330188679245283,
"grad_norm": 1.3362305164337158,
"learning_rate": 1.883233532934132e-05,
"loss": 0.2726,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23389750719070435,
"step": 630
},
{
"epoch": 0.3328092243186583,
"grad_norm": 1.0816547870635986,
"learning_rate": 1.8982035928143712e-05,
"loss": 0.2577,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.216552734375,
"step": 635
},
{
"epoch": 0.33542976939203356,
"grad_norm": 1.0076228380203247,
"learning_rate": 1.913173652694611e-05,
"loss": 0.2518,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23681640625,
"step": 640
},
{
"epoch": 0.3380503144654088,
"grad_norm": 1.2804813385009766,
"learning_rate": 1.9281437125748505e-05,
"loss": 0.2531,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.239990234375,
"step": 645
},
{
"epoch": 0.34067085953878407,
"grad_norm": 1.0749441385269165,
"learning_rate": 1.9431137724550898e-05,
"loss": 0.2791,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28917208313941956,
"step": 650
},
{
"epoch": 0.34329140461215935,
"grad_norm": 0.7442736029624939,
"learning_rate": 1.9580838323353294e-05,
"loss": 0.2784,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25295543670654297,
"step": 655
},
{
"epoch": 0.34591194968553457,
"grad_norm": 0.8936218023300171,
"learning_rate": 1.973053892215569e-05,
"loss": 0.2635,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27409684658050537,
"step": 660
},
{
"epoch": 0.34853249475890985,
"grad_norm": 0.8553372621536255,
"learning_rate": 1.9880239520958083e-05,
"loss": 0.2542,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2538270652294159,
"step": 665
},
{
"epoch": 0.35115303983228513,
"grad_norm": 0.8348433375358582,
"learning_rate": 2.002994011976048e-05,
"loss": 0.2731,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26132816076278687,
"step": 670
},
{
"epoch": 0.35377358490566035,
"grad_norm": 0.8847088813781738,
"learning_rate": 2.0179640718562872e-05,
"loss": 0.277,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.274321973323822,
"step": 675
},
{
"epoch": 0.35639412997903563,
"grad_norm": 0.8822789788246155,
"learning_rate": 2.0329341317365272e-05,
"loss": 0.2708,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.272810161113739,
"step": 680
},
{
"epoch": 0.3590146750524109,
"grad_norm": 1.0013290643692017,
"learning_rate": 2.0479041916167665e-05,
"loss": 0.2665,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2875209450721741,
"step": 685
},
{
"epoch": 0.36163522012578614,
"grad_norm": 1.2342756986618042,
"learning_rate": 2.0628742514970065e-05,
"loss": 0.248,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22775958478450775,
"step": 690
},
{
"epoch": 0.3642557651991614,
"grad_norm": 1.2740610837936401,
"learning_rate": 2.0778443113772458e-05,
"loss": 0.2633,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21473848819732666,
"step": 695
},
{
"epoch": 0.3668763102725367,
"grad_norm": 0.8021836280822754,
"learning_rate": 2.092814371257485e-05,
"loss": 0.2546,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26980459690093994,
"step": 700
},
{
"epoch": 0.3694968553459119,
"grad_norm": 1.2527188062667847,
"learning_rate": 2.107784431137725e-05,
"loss": 0.2689,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25510236620903015,
"step": 705
},
{
"epoch": 0.3721174004192872,
"grad_norm": 0.7445383071899414,
"learning_rate": 2.1227544910179643e-05,
"loss": 0.2668,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27235108613967896,
"step": 710
},
{
"epoch": 0.3747379454926625,
"grad_norm": 0.8911032676696777,
"learning_rate": 2.1377245508982036e-05,
"loss": 0.2808,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3051337003707886,
"step": 715
},
{
"epoch": 0.37735849056603776,
"grad_norm": 0.9463456273078918,
"learning_rate": 2.1526946107784436e-05,
"loss": 0.2538,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2794130742549896,
"step": 720
},
{
"epoch": 0.379979035639413,
"grad_norm": 1.009350299835205,
"learning_rate": 2.167664670658683e-05,
"loss": 0.2627,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.31796935200691223,
"step": 725
},
{
"epoch": 0.38259958071278827,
"grad_norm": 0.9498868584632874,
"learning_rate": 2.182634730538922e-05,
"loss": 0.2463,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.214111328125,
"step": 730
},
{
"epoch": 0.38522012578616355,
"grad_norm": 1.2612749338150024,
"learning_rate": 2.197604790419162e-05,
"loss": 0.2796,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28517019748687744,
"step": 735
},
{
"epoch": 0.38784067085953877,
"grad_norm": 1.0274648666381836,
"learning_rate": 2.2125748502994014e-05,
"loss": 0.2598,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3108254373073578,
"step": 740
},
{
"epoch": 0.39046121593291405,
"grad_norm": 1.821379542350769,
"learning_rate": 2.2275449101796407e-05,
"loss": 0.2419,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22509765625,
"step": 745
},
{
"epoch": 0.39308176100628933,
"grad_norm": 0.9160019755363464,
"learning_rate": 2.2425149700598807e-05,
"loss": 0.2681,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2566436231136322,
"step": 750
},
{
"epoch": 0.39570230607966456,
"grad_norm": 1.0037506818771362,
"learning_rate": 2.25748502994012e-05,
"loss": 0.2433,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23486328125,
"step": 755
},
{
"epoch": 0.39832285115303984,
"grad_norm": 1.2298099994659424,
"learning_rate": 2.2724550898203596e-05,
"loss": 0.277,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.255581796169281,
"step": 760
},
{
"epoch": 0.4009433962264151,
"grad_norm": 1.3319263458251953,
"learning_rate": 2.287425149700599e-05,
"loss": 0.2404,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.211669921875,
"step": 765
},
{
"epoch": 0.40356394129979034,
"grad_norm": 4.390848159790039,
"learning_rate": 2.3023952095808385e-05,
"loss": 0.2599,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.284297913312912,
"step": 770
},
{
"epoch": 0.4061844863731656,
"grad_norm": 2.03829288482666,
"learning_rate": 2.317365269461078e-05,
"loss": 0.2554,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26315489411354065,
"step": 775
},
{
"epoch": 0.4088050314465409,
"grad_norm": 1.0062929391860962,
"learning_rate": 2.3323353293413174e-05,
"loss": 0.2458,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.224853515625,
"step": 780
},
{
"epoch": 0.4114255765199161,
"grad_norm": 1.4772664308547974,
"learning_rate": 2.347305389221557e-05,
"loss": 0.2523,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23134742677211761,
"step": 785
},
{
"epoch": 0.4140461215932914,
"grad_norm": 1.4864003658294678,
"learning_rate": 2.3622754491017967e-05,
"loss": 0.2502,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.210205078125,
"step": 790
},
{
"epoch": 0.4166666666666667,
"grad_norm": 0.9668591022491455,
"learning_rate": 2.377245508982036e-05,
"loss": 0.264,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2741929590702057,
"step": 795
},
{
"epoch": 0.4192872117400419,
"grad_norm": 0.9752196669578552,
"learning_rate": 2.3922155688622756e-05,
"loss": 0.2687,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2708283066749573,
"step": 800
},
{
"epoch": 0.4219077568134172,
"grad_norm": 1.0176275968551636,
"learning_rate": 2.4071856287425152e-05,
"loss": 0.2484,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.253340482711792,
"step": 805
},
{
"epoch": 0.42452830188679247,
"grad_norm": 1.0713645219802856,
"learning_rate": 2.4221556886227545e-05,
"loss": 0.2452,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2933781147003174,
"step": 810
},
{
"epoch": 0.4271488469601677,
"grad_norm": 0.956200122833252,
"learning_rate": 2.437125748502994e-05,
"loss": 0.2385,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26098963618278503,
"step": 815
},
{
"epoch": 0.429769392033543,
"grad_norm": 0.943717896938324,
"learning_rate": 2.4520958083832338e-05,
"loss": 0.2529,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24085500836372375,
"step": 820
},
{
"epoch": 0.43238993710691825,
"grad_norm": 1.687644362449646,
"learning_rate": 2.467065868263473e-05,
"loss": 0.2317,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22789129614830017,
"step": 825
},
{
"epoch": 0.4350104821802935,
"grad_norm": 0.9256353378295898,
"learning_rate": 2.482035928143713e-05,
"loss": 0.2504,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2247050702571869,
"step": 830
},
{
"epoch": 0.43763102725366876,
"grad_norm": 0.8353980779647827,
"learning_rate": 2.4970059880239523e-05,
"loss": 0.2453,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2708541750907898,
"step": 835
},
{
"epoch": 0.44025157232704404,
"grad_norm": 0.9708998799324036,
"learning_rate": 2.5119760479041916e-05,
"loss": 0.2494,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.217529296875,
"step": 840
},
{
"epoch": 0.44287211740041926,
"grad_norm": 1.3114147186279297,
"learning_rate": 2.5269461077844316e-05,
"loss": 0.2517,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.224853515625,
"step": 845
},
{
"epoch": 0.44549266247379454,
"grad_norm": 1.0196064710617065,
"learning_rate": 2.541916167664671e-05,
"loss": 0.2538,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25506791472435,
"step": 850
},
{
"epoch": 0.4481132075471698,
"grad_norm": 1.0026663541793823,
"learning_rate": 2.55688622754491e-05,
"loss": 0.2364,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22676116228103638,
"step": 855
},
{
"epoch": 0.45073375262054505,
"grad_norm": 1.156481385231018,
"learning_rate": 2.57185628742515e-05,
"loss": 0.2621,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27671852707862854,
"step": 860
},
{
"epoch": 0.4533542976939203,
"grad_norm": 0.8996175527572632,
"learning_rate": 2.5868263473053894e-05,
"loss": 0.2538,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24104604125022888,
"step": 865
},
{
"epoch": 0.4559748427672956,
"grad_norm": 0.9830065369606018,
"learning_rate": 2.6017964071856287e-05,
"loss": 0.2569,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25553032755851746,
"step": 870
},
{
"epoch": 0.4585953878406709,
"grad_norm": 0.9731114506721497,
"learning_rate": 2.6167664670658687e-05,
"loss": 0.2313,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23193359375,
"step": 875
},
{
"epoch": 0.4612159329140461,
"grad_norm": 1.0539577007293701,
"learning_rate": 2.631736526946108e-05,
"loss": 0.2376,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22667427361011505,
"step": 880
},
{
"epoch": 0.4638364779874214,
"grad_norm": 0.9352318644523621,
"learning_rate": 2.6467065868263476e-05,
"loss": 0.2485,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2803516983985901,
"step": 885
},
{
"epoch": 0.46645702306079667,
"grad_norm": 0.9622676968574524,
"learning_rate": 2.6616766467065872e-05,
"loss": 0.2235,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.205322265625,
"step": 890
},
{
"epoch": 0.4690775681341719,
"grad_norm": 0.9191893935203552,
"learning_rate": 2.6766467065868265e-05,
"loss": 0.263,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28229767084121704,
"step": 895
},
{
"epoch": 0.4716981132075472,
"grad_norm": 0.8149584531784058,
"learning_rate": 2.691616766467066e-05,
"loss": 0.2596,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2862365245819092,
"step": 900
},
{
"epoch": 0.47431865828092246,
"grad_norm": 0.9685344696044922,
"learning_rate": 2.7065868263473058e-05,
"loss": 0.2554,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22389689087867737,
"step": 905
},
{
"epoch": 0.4769392033542977,
"grad_norm": 0.9538320302963257,
"learning_rate": 2.721556886227545e-05,
"loss": 0.2529,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23646777868270874,
"step": 910
},
{
"epoch": 0.47955974842767296,
"grad_norm": 1.021291971206665,
"learning_rate": 2.7365269461077847e-05,
"loss": 0.2358,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.223388671875,
"step": 915
},
{
"epoch": 0.48218029350104824,
"grad_norm": 0.7586928009986877,
"learning_rate": 2.751497005988024e-05,
"loss": 0.2377,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23621279001235962,
"step": 920
},
{
"epoch": 0.48480083857442346,
"grad_norm": 0.8782613277435303,
"learning_rate": 2.7664670658682636e-05,
"loss": 0.2339,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25007951259613037,
"step": 925
},
{
"epoch": 0.48742138364779874,
"grad_norm": 0.8057980537414551,
"learning_rate": 2.7814371257485033e-05,
"loss": 0.2394,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23260241746902466,
"step": 930
},
{
"epoch": 0.490041928721174,
"grad_norm": 0.9458929300308228,
"learning_rate": 2.7964071856287425e-05,
"loss": 0.2524,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.30464041233062744,
"step": 935
},
{
"epoch": 0.49266247379454925,
"grad_norm": 1.1031662225723267,
"learning_rate": 2.8113772455089822e-05,
"loss": 0.2508,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21631650626659393,
"step": 940
},
{
"epoch": 0.49528301886792453,
"grad_norm": 1.1147128343582153,
"learning_rate": 2.8263473053892218e-05,
"loss": 0.2207,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1950717717409134,
"step": 945
},
{
"epoch": 0.4979035639412998,
"grad_norm": 0.8075921535491943,
"learning_rate": 2.841317365269461e-05,
"loss": 0.2521,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2745627760887146,
"step": 950
},
{
"epoch": 0.500524109014675,
"grad_norm": 0.7695682644844055,
"learning_rate": 2.856287425149701e-05,
"loss": 0.2421,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2553519606590271,
"step": 955
},
{
"epoch": 0.5031446540880503,
"grad_norm": 0.9481866359710693,
"learning_rate": 2.8712574850299403e-05,
"loss": 0.2309,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2060546875,
"step": 960
},
{
"epoch": 0.5057651991614256,
"grad_norm": 3.3165810108184814,
"learning_rate": 2.8862275449101796e-05,
"loss": 0.2392,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25720077753067017,
"step": 965
},
{
"epoch": 0.5083857442348009,
"grad_norm": 0.8951301574707031,
"learning_rate": 2.9011976047904196e-05,
"loss": 0.2587,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29488053917884827,
"step": 970
},
{
"epoch": 0.5110062893081762,
"grad_norm": 0.8743593096733093,
"learning_rate": 2.916167664670659e-05,
"loss": 0.2309,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.212537944316864,
"step": 975
},
{
"epoch": 0.5136268343815513,
"grad_norm": 0.9430364370346069,
"learning_rate": 2.9311377245508982e-05,
"loss": 0.2448,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24036072194576263,
"step": 980
},
{
"epoch": 0.5162473794549266,
"grad_norm": 1.2850607633590698,
"learning_rate": 2.946107784431138e-05,
"loss": 0.2396,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22472365200519562,
"step": 985
},
{
"epoch": 0.5188679245283019,
"grad_norm": 0.9026057124137878,
"learning_rate": 2.9610778443113774e-05,
"loss": 0.2532,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2544049918651581,
"step": 990
},
{
"epoch": 0.5214884696016772,
"grad_norm": 1.04141366481781,
"learning_rate": 2.9760479041916167e-05,
"loss": 0.2214,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17437362670898438,
"step": 995
},
{
"epoch": 0.5241090146750524,
"grad_norm": 0.9281846880912781,
"learning_rate": 2.9910179640718567e-05,
"loss": 0.2439,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23749575018882751,
"step": 1000
},
{
"epoch": 0.5267295597484277,
"grad_norm": 1.122997760772705,
"learning_rate": 3.005988023952096e-05,
"loss": 0.24,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.254638671875,
"step": 1005
},
{
"epoch": 0.5293501048218029,
"grad_norm": 0.7896718382835388,
"learning_rate": 3.020958083832336e-05,
"loss": 0.2474,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24245308339595795,
"step": 1010
},
{
"epoch": 0.5319706498951782,
"grad_norm": 1.19742751121521,
"learning_rate": 3.0359281437125753e-05,
"loss": 0.2397,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21044921875,
"step": 1015
},
{
"epoch": 0.5345911949685535,
"grad_norm": 0.7850223183631897,
"learning_rate": 3.0508982035928145e-05,
"loss": 0.2639,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29461705684661865,
"step": 1020
},
{
"epoch": 0.5372117400419287,
"grad_norm": 0.9502226710319519,
"learning_rate": 3.0658682634730545e-05,
"loss": 0.2455,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28391456604003906,
"step": 1025
},
{
"epoch": 0.539832285115304,
"grad_norm": 0.8148714900016785,
"learning_rate": 3.0808383233532935e-05,
"loss": 0.2493,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28964686393737793,
"step": 1030
},
{
"epoch": 0.5424528301886793,
"grad_norm": 0.768586277961731,
"learning_rate": 3.095808383233533e-05,
"loss": 0.2438,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23348668217658997,
"step": 1035
},
{
"epoch": 0.5450733752620545,
"grad_norm": 1.006455898284912,
"learning_rate": 3.110778443113773e-05,
"loss": 0.2286,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21520912647247314,
"step": 1040
},
{
"epoch": 0.5476939203354297,
"grad_norm": 0.782268226146698,
"learning_rate": 3.1257485029940124e-05,
"loss": 0.2594,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27343398332595825,
"step": 1045
},
{
"epoch": 0.550314465408805,
"grad_norm": 0.7622452974319458,
"learning_rate": 3.140718562874251e-05,
"loss": 0.2368,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2747539281845093,
"step": 1050
},
{
"epoch": 0.5529350104821803,
"grad_norm": 1.9257678985595703,
"learning_rate": 3.1556886227544916e-05,
"loss": 0.2404,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24580204486846924,
"step": 1055
},
{
"epoch": 0.5555555555555556,
"grad_norm": 1.0492233037948608,
"learning_rate": 3.1706586826347306e-05,
"loss": 0.2609,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2686328887939453,
"step": 1060
},
{
"epoch": 0.5581761006289309,
"grad_norm": 0.8673623204231262,
"learning_rate": 3.18562874251497e-05,
"loss": 0.2446,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23184172809123993,
"step": 1065
},
{
"epoch": 0.560796645702306,
"grad_norm": 0.86591637134552,
"learning_rate": 3.20059880239521e-05,
"loss": 0.2449,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2521081864833832,
"step": 1070
},
{
"epoch": 0.5634171907756813,
"grad_norm": 1.0577750205993652,
"learning_rate": 3.2155688622754494e-05,
"loss": 0.2352,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22219401597976685,
"step": 1075
},
{
"epoch": 0.5660377358490566,
"grad_norm": 0.9919980764389038,
"learning_rate": 3.230538922155689e-05,
"loss": 0.2428,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22773528099060059,
"step": 1080
},
{
"epoch": 0.5686582809224319,
"grad_norm": 0.9499963521957397,
"learning_rate": 3.245508982035929e-05,
"loss": 0.2269,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.208740234375,
"step": 1085
},
{
"epoch": 0.5712788259958071,
"grad_norm": 1.0068999528884888,
"learning_rate": 3.2604790419161677e-05,
"loss": 0.249,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25735247135162354,
"step": 1090
},
{
"epoch": 0.5738993710691824,
"grad_norm": 0.8172402381896973,
"learning_rate": 3.275449101796407e-05,
"loss": 0.2448,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2182607352733612,
"step": 1095
},
{
"epoch": 0.5765199161425576,
"grad_norm": 0.7479739785194397,
"learning_rate": 3.290419161676647e-05,
"loss": 0.2394,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22888004779815674,
"step": 1100
},
{
"epoch": 0.5791404612159329,
"grad_norm": 0.775371789932251,
"learning_rate": 3.3053892215568865e-05,
"loss": 0.2586,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3027857542037964,
"step": 1105
},
{
"epoch": 0.5817610062893082,
"grad_norm": 1.130009651184082,
"learning_rate": 3.320359281437126e-05,
"loss": 0.2359,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.207763671875,
"step": 1110
},
{
"epoch": 0.5843815513626834,
"grad_norm": 0.8480820655822754,
"learning_rate": 3.335329341317366e-05,
"loss": 0.2289,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.211669921875,
"step": 1115
},
{
"epoch": 0.5870020964360587,
"grad_norm": 1.0589594841003418,
"learning_rate": 3.350299401197605e-05,
"loss": 0.2468,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2513768672943115,
"step": 1120
},
{
"epoch": 0.589622641509434,
"grad_norm": 57.0751953125,
"learning_rate": 3.3652694610778444e-05,
"loss": 0.3119,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.6061429381370544,
"step": 1125
},
{
"epoch": 0.5922431865828093,
"grad_norm": 0.8701210618019104,
"learning_rate": 3.380239520958084e-05,
"loss": 0.2347,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2277510166168213,
"step": 1130
},
{
"epoch": 0.5948637316561844,
"grad_norm": 0.867240846157074,
"learning_rate": 3.3952095808383236e-05,
"loss": 0.2345,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22389352321624756,
"step": 1135
},
{
"epoch": 0.5974842767295597,
"grad_norm": 1.079352855682373,
"learning_rate": 3.410179640718563e-05,
"loss": 0.2404,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.211181640625,
"step": 1140
},
{
"epoch": 0.600104821802935,
"grad_norm": 0.8751652836799622,
"learning_rate": 3.425149700598803e-05,
"loss": 0.2282,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24226129055023193,
"step": 1145
},
{
"epoch": 0.6027253668763103,
"grad_norm": 0.9897102117538452,
"learning_rate": 3.4401197604790425e-05,
"loss": 0.2278,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1945013552904129,
"step": 1150
},
{
"epoch": 0.6053459119496856,
"grad_norm": 1.09795081615448,
"learning_rate": 3.4550898203592815e-05,
"loss": 0.2389,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2258596420288086,
"step": 1155
},
{
"epoch": 0.6079664570230608,
"grad_norm": 0.9403954148292542,
"learning_rate": 3.470059880239521e-05,
"loss": 0.2329,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27871084213256836,
"step": 1160
},
{
"epoch": 0.610587002096436,
"grad_norm": 1.3040416240692139,
"learning_rate": 3.485029940119761e-05,
"loss": 0.2263,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20994171500205994,
"step": 1165
},
{
"epoch": 0.6132075471698113,
"grad_norm": 0.9638862013816833,
"learning_rate": 3.5000000000000004e-05,
"loss": 0.2396,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20459549129009247,
"step": 1170
},
{
"epoch": 0.6158280922431866,
"grad_norm": 0.9757469296455383,
"learning_rate": 3.514970059880239e-05,
"loss": 0.2414,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.260408878326416,
"step": 1175
},
{
"epoch": 0.6184486373165619,
"grad_norm": 0.8674975633621216,
"learning_rate": 3.5299401197604796e-05,
"loss": 0.2617,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2864234447479248,
"step": 1180
},
{
"epoch": 0.6210691823899371,
"grad_norm": 0.890983521938324,
"learning_rate": 3.5449101796407186e-05,
"loss": 0.2542,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.259521484375,
"step": 1185
},
{
"epoch": 0.6236897274633124,
"grad_norm": 0.9182384610176086,
"learning_rate": 3.559880239520958e-05,
"loss": 0.2487,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24978932738304138,
"step": 1190
},
{
"epoch": 0.6263102725366876,
"grad_norm": 0.9169577956199646,
"learning_rate": 3.574850299401198e-05,
"loss": 0.2319,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29184383153915405,
"step": 1195
},
{
"epoch": 0.6289308176100629,
"grad_norm": 2.9035658836364746,
"learning_rate": 3.5898203592814375e-05,
"loss": 0.2276,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23210959136486053,
"step": 1200
},
{
"epoch": 0.6315513626834381,
"grad_norm": 0.7462544441223145,
"learning_rate": 3.604790419161677e-05,
"loss": 0.258,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24800057709217072,
"step": 1205
},
{
"epoch": 0.6341719077568134,
"grad_norm": 0.9018626809120178,
"learning_rate": 3.619760479041917e-05,
"loss": 0.2266,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25354671478271484,
"step": 1210
},
{
"epoch": 0.6367924528301887,
"grad_norm": 0.7574321031570435,
"learning_rate": 3.634730538922156e-05,
"loss": 0.2402,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2568485736846924,
"step": 1215
},
{
"epoch": 0.639412997903564,
"grad_norm": 0.8622102737426758,
"learning_rate": 3.649700598802396e-05,
"loss": 0.2502,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2628772258758545,
"step": 1220
},
{
"epoch": 0.6420335429769392,
"grad_norm": 0.910557746887207,
"learning_rate": 3.664670658682635e-05,
"loss": 0.2517,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2687186300754547,
"step": 1225
},
{
"epoch": 0.6446540880503144,
"grad_norm": 0.9514092803001404,
"learning_rate": 3.6796407185628746e-05,
"loss": 0.2377,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23730027675628662,
"step": 1230
},
{
"epoch": 0.6472746331236897,
"grad_norm": 0.9319348335266113,
"learning_rate": 3.694610778443114e-05,
"loss": 0.238,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2547975778579712,
"step": 1235
},
{
"epoch": 0.649895178197065,
"grad_norm": 1.4198458194732666,
"learning_rate": 3.709580838323354e-05,
"loss": 0.2091,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22032234072685242,
"step": 1240
},
{
"epoch": 0.6525157232704403,
"grad_norm": 1.0361565351486206,
"learning_rate": 3.724550898203593e-05,
"loss": 0.2435,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19445861876010895,
"step": 1245
},
{
"epoch": 0.6551362683438156,
"grad_norm": 0.8487932085990906,
"learning_rate": 3.739520958083833e-05,
"loss": 0.2424,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24062740802764893,
"step": 1250
},
{
"epoch": 0.6577568134171907,
"grad_norm": 0.8280177116394043,
"learning_rate": 3.754491017964072e-05,
"loss": 0.2507,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2391597330570221,
"step": 1255
},
{
"epoch": 0.660377358490566,
"grad_norm": 0.7015854120254517,
"learning_rate": 3.769461077844312e-05,
"loss": 0.2299,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23055067658424377,
"step": 1260
},
{
"epoch": 0.6629979035639413,
"grad_norm": 1.7529536485671997,
"learning_rate": 3.784431137724551e-05,
"loss": 0.2136,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1573486328125,
"step": 1265
},
{
"epoch": 0.6656184486373166,
"grad_norm": 1.1261061429977417,
"learning_rate": 3.799401197604791e-05,
"loss": 0.2385,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26623547077178955,
"step": 1270
},
{
"epoch": 0.6682389937106918,
"grad_norm": 0.6807898283004761,
"learning_rate": 3.8143712574850306e-05,
"loss": 0.2382,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2307087481021881,
"step": 1275
},
{
"epoch": 0.6708595387840671,
"grad_norm": 0.6805742979049683,
"learning_rate": 3.8293413173652695e-05,
"loss": 0.2352,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2753327488899231,
"step": 1280
},
{
"epoch": 0.6734800838574424,
"grad_norm": 0.8867018818855286,
"learning_rate": 3.844311377245509e-05,
"loss": 0.2382,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2446499615907669,
"step": 1285
},
{
"epoch": 0.6761006289308176,
"grad_norm": 1.0850870609283447,
"learning_rate": 3.859281437125749e-05,
"loss": 0.2176,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.203857421875,
"step": 1290
},
{
"epoch": 0.6787211740041929,
"grad_norm": 0.7444595694541931,
"learning_rate": 3.8742514970059884e-05,
"loss": 0.2376,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2482675313949585,
"step": 1295
},
{
"epoch": 0.6813417190775681,
"grad_norm": 0.9599826335906982,
"learning_rate": 3.889221556886228e-05,
"loss": 0.2217,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1557583212852478,
"step": 1300
},
{
"epoch": 0.6839622641509434,
"grad_norm": 0.9627584218978882,
"learning_rate": 3.9041916167664676e-05,
"loss": 0.2399,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.198858380317688,
"step": 1305
},
{
"epoch": 0.6865828092243187,
"grad_norm": 0.8851011991500854,
"learning_rate": 3.9191616766467066e-05,
"loss": 0.242,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2949499189853668,
"step": 1310
},
{
"epoch": 0.689203354297694,
"grad_norm": 0.8310570120811462,
"learning_rate": 3.934131736526946e-05,
"loss": 0.22,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24201838672161102,
"step": 1315
},
{
"epoch": 0.6918238993710691,
"grad_norm": 1.0353339910507202,
"learning_rate": 3.949101796407186e-05,
"loss": 0.2231,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2144128680229187,
"step": 1320
},
{
"epoch": 0.6944444444444444,
"grad_norm": 0.6979736685752869,
"learning_rate": 3.9640718562874255e-05,
"loss": 0.2445,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27705004811286926,
"step": 1325
},
{
"epoch": 0.6970649895178197,
"grad_norm": 0.9733805060386658,
"learning_rate": 3.9790419161676644e-05,
"loss": 0.2357,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20027337968349457,
"step": 1330
},
{
"epoch": 0.699685534591195,
"grad_norm": 1.0096243619918823,
"learning_rate": 3.994011976047905e-05,
"loss": 0.2304,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26803430914878845,
"step": 1335
},
{
"epoch": 0.7023060796645703,
"grad_norm": 0.980590283870697,
"learning_rate": 3.999999385200795e-05,
"loss": 0.2404,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.202880859375,
"step": 1340
},
{
"epoch": 0.7049266247379455,
"grad_norm": 0.80577552318573,
"learning_rate": 3.999995628095911e-05,
"loss": 0.2315,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2268478274345398,
"step": 1345
},
{
"epoch": 0.7075471698113207,
"grad_norm": 1.0306898355484009,
"learning_rate": 3.999988455447666e-05,
"loss": 0.2237,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22385285794734955,
"step": 1350
},
{
"epoch": 0.710167714884696,
"grad_norm": 0.8197321891784668,
"learning_rate": 3.9999778672683076e-05,
"loss": 0.2352,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24937386810779572,
"step": 1355
},
{
"epoch": 0.7127882599580713,
"grad_norm": 0.8815612196922302,
"learning_rate": 3.99996386357592e-05,
"loss": 0.2275,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26051896810531616,
"step": 1360
},
{
"epoch": 0.7154088050314465,
"grad_norm": 0.8673317432403564,
"learning_rate": 3.999946444394417e-05,
"loss": 0.2468,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2650872766971588,
"step": 1365
},
{
"epoch": 0.7180293501048218,
"grad_norm": 0.8530757427215576,
"learning_rate": 3.9999256097535466e-05,
"loss": 0.2257,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2251889407634735,
"step": 1370
},
{
"epoch": 0.7206498951781971,
"grad_norm": 0.7286533713340759,
"learning_rate": 3.999901359688891e-05,
"loss": 0.2184,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17423991858959198,
"step": 1375
},
{
"epoch": 0.7232704402515723,
"grad_norm": 1.0834404230117798,
"learning_rate": 3.999873694241863e-05,
"loss": 0.2337,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2013612985610962,
"step": 1380
},
{
"epoch": 0.7258909853249476,
"grad_norm": 0.9604234099388123,
"learning_rate": 3.999842613459709e-05,
"loss": 0.2242,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22143641114234924,
"step": 1385
},
{
"epoch": 0.7285115303983228,
"grad_norm": 1.0743560791015625,
"learning_rate": 3.9998081173955076e-05,
"loss": 0.2271,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1707763671875,
"step": 1390
},
{
"epoch": 0.7311320754716981,
"grad_norm": 0.7798816561698914,
"learning_rate": 3.999770206108172e-05,
"loss": 0.2222,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21589408814907074,
"step": 1395
},
{
"epoch": 0.7337526205450734,
"grad_norm": 0.8090185523033142,
"learning_rate": 3.999728879662443e-05,
"loss": 0.2375,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2567808926105499,
"step": 1400
},
{
"epoch": 0.7363731656184487,
"grad_norm": 0.828900158405304,
"learning_rate": 3.9996841381289e-05,
"loss": 0.2277,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27614784240722656,
"step": 1405
},
{
"epoch": 0.7389937106918238,
"grad_norm": 0.9465203285217285,
"learning_rate": 3.99963598158395e-05,
"loss": 0.2562,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23473942279815674,
"step": 1410
},
{
"epoch": 0.7416142557651991,
"grad_norm": 0.7388080358505249,
"learning_rate": 3.999584410109834e-05,
"loss": 0.2467,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23388850688934326,
"step": 1415
},
{
"epoch": 0.7442348008385744,
"grad_norm": 0.9057663679122925,
"learning_rate": 3.999529423794624e-05,
"loss": 0.2112,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23402154445648193,
"step": 1420
},
{
"epoch": 0.7468553459119497,
"grad_norm": 0.8521957397460938,
"learning_rate": 3.9994710227322256e-05,
"loss": 0.2233,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.219970703125,
"step": 1425
},
{
"epoch": 0.749475890985325,
"grad_norm": 0.7135666012763977,
"learning_rate": 3.999409207022373e-05,
"loss": 0.2209,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16821300983428955,
"step": 1430
},
{
"epoch": 0.7520964360587002,
"grad_norm": 0.7875863909721375,
"learning_rate": 3.999343976770635e-05,
"loss": 0.2493,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26239144802093506,
"step": 1435
},
{
"epoch": 0.7547169811320755,
"grad_norm": 0.8010939359664917,
"learning_rate": 3.9992753320884086e-05,
"loss": 0.2293,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18904992938041687,
"step": 1440
},
{
"epoch": 0.7573375262054507,
"grad_norm": 0.7313894629478455,
"learning_rate": 3.9992032730929254e-05,
"loss": 0.2268,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2471504658460617,
"step": 1445
},
{
"epoch": 0.759958071278826,
"grad_norm": 0.9344741106033325,
"learning_rate": 3.9991277999072436e-05,
"loss": 0.2198,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21785646677017212,
"step": 1450
},
{
"epoch": 0.7625786163522013,
"grad_norm": 1.380718469619751,
"learning_rate": 3.9990489126602565e-05,
"loss": 0.2266,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26836979389190674,
"step": 1455
},
{
"epoch": 0.7651991614255765,
"grad_norm": 0.7142491340637207,
"learning_rate": 3.998966611486686e-05,
"loss": 0.2525,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27713894844055176,
"step": 1460
},
{
"epoch": 0.7678197064989518,
"grad_norm": 0.8345506191253662,
"learning_rate": 3.998880896527082e-05,
"loss": 0.2471,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23274078965187073,
"step": 1465
},
{
"epoch": 0.7704402515723271,
"grad_norm": 0.8071177005767822,
"learning_rate": 3.998791767927828e-05,
"loss": 0.2227,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2065439373254776,
"step": 1470
},
{
"epoch": 0.7730607966457023,
"grad_norm": 0.9820889830589294,
"learning_rate": 3.9986992258411355e-05,
"loss": 0.2318,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2510470747947693,
"step": 1475
},
{
"epoch": 0.7756813417190775,
"grad_norm": 0.664943277835846,
"learning_rate": 3.998603270425045e-05,
"loss": 0.2298,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2430516481399536,
"step": 1480
},
{
"epoch": 0.7783018867924528,
"grad_norm": 0.6739146113395691,
"learning_rate": 3.998503901843427e-05,
"loss": 0.2431,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27743998169898987,
"step": 1485
},
{
"epoch": 0.7809224318658281,
"grad_norm": 0.7599772810935974,
"learning_rate": 3.998401120265981e-05,
"loss": 0.219,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2510407567024231,
"step": 1490
},
{
"epoch": 0.7835429769392034,
"grad_norm": 0.8791052103042603,
"learning_rate": 3.9982949258682345e-05,
"loss": 0.2426,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23215271532535553,
"step": 1495
},
{
"epoch": 0.7861635220125787,
"grad_norm": 0.9418508410453796,
"learning_rate": 3.9981853188315444e-05,
"loss": 0.2438,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.224778950214386,
"step": 1500
},
{
"epoch": 0.7887840670859538,
"grad_norm": 1.0868874788284302,
"learning_rate": 3.998072299343093e-05,
"loss": 0.2197,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16823123395442963,
"step": 1505
},
{
"epoch": 0.7914046121593291,
"grad_norm": 0.6775776743888855,
"learning_rate": 3.997955867595895e-05,
"loss": 0.2339,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21675719320774078,
"step": 1510
},
{
"epoch": 0.7940251572327044,
"grad_norm": 0.9414287805557251,
"learning_rate": 3.9978360237887876e-05,
"loss": 0.2325,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25559812784194946,
"step": 1515
},
{
"epoch": 0.7966457023060797,
"grad_norm": 0.964307963848114,
"learning_rate": 3.997712768126438e-05,
"loss": 0.2232,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23508593440055847,
"step": 1520
},
{
"epoch": 0.799266247379455,
"grad_norm": 0.6535694003105164,
"learning_rate": 3.997586100819338e-05,
"loss": 0.2457,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2811439335346222,
"step": 1525
},
{
"epoch": 0.8018867924528302,
"grad_norm": 0.822663426399231,
"learning_rate": 3.99745602208381e-05,
"loss": 0.2246,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22198763489723206,
"step": 1530
},
{
"epoch": 0.8045073375262054,
"grad_norm": 0.8911979794502258,
"learning_rate": 3.997322532141995e-05,
"loss": 0.2447,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27367645502090454,
"step": 1535
},
{
"epoch": 0.8071278825995807,
"grad_norm": 1.597509503364563,
"learning_rate": 3.9971856312218664e-05,
"loss": 0.2121,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1748046875,
"step": 1540
},
{
"epoch": 0.809748427672956,
"grad_norm": 0.6583138108253479,
"learning_rate": 3.99704531955722e-05,
"loss": 0.2247,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21396388113498688,
"step": 1545
},
{
"epoch": 0.8123689727463312,
"grad_norm": 0.7443684339523315,
"learning_rate": 3.9969015973876765e-05,
"loss": 0.2338,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22403575479984283,
"step": 1550
},
{
"epoch": 0.8149895178197065,
"grad_norm": 0.7412883639335632,
"learning_rate": 3.996754464958681e-05,
"loss": 0.2335,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24360555410385132,
"step": 1555
},
{
"epoch": 0.8176100628930818,
"grad_norm": 0.635653018951416,
"learning_rate": 3.9966039225215025e-05,
"loss": 0.2313,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2305343598127365,
"step": 1560
},
{
"epoch": 0.820230607966457,
"grad_norm": 0.6679280996322632,
"learning_rate": 3.9964499703332334e-05,
"loss": 0.2309,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20315735042095184,
"step": 1565
},
{
"epoch": 0.8228511530398323,
"grad_norm": 0.7374495267868042,
"learning_rate": 3.996292608656791e-05,
"loss": 0.2125,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21739450097084045,
"step": 1570
},
{
"epoch": 0.8254716981132075,
"grad_norm": 0.7598218321800232,
"learning_rate": 3.996131837760912e-05,
"loss": 0.2328,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.186279296875,
"step": 1575
},
{
"epoch": 0.8280922431865828,
"grad_norm": 0.6665475368499756,
"learning_rate": 3.9959676579201574e-05,
"loss": 0.2134,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21755671501159668,
"step": 1580
},
{
"epoch": 0.8307127882599581,
"grad_norm": 0.7411794662475586,
"learning_rate": 3.995800069414909e-05,
"loss": 0.2175,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2294345200061798,
"step": 1585
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.864759087562561,
"learning_rate": 3.995629072531372e-05,
"loss": 0.2273,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24587363004684448,
"step": 1590
},
{
"epoch": 0.8359538784067087,
"grad_norm": 0.7389988303184509,
"learning_rate": 3.995454667561569e-05,
"loss": 0.2241,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2165318727493286,
"step": 1595
},
{
"epoch": 0.8385744234800838,
"grad_norm": 0.7519676685333252,
"learning_rate": 3.9952768548033455e-05,
"loss": 0.2535,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26860612630844116,
"step": 1600
},
{
"epoch": 0.8411949685534591,
"grad_norm": 0.8388245701789856,
"learning_rate": 3.995095634560365e-05,
"loss": 0.2347,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25020381808280945,
"step": 1605
},
{
"epoch": 0.8438155136268344,
"grad_norm": 0.8016890287399292,
"learning_rate": 3.994911007142112e-05,
"loss": 0.2426,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23435023427009583,
"step": 1610
},
{
"epoch": 0.8464360587002097,
"grad_norm": 1.0219448804855347,
"learning_rate": 3.994722972863888e-05,
"loss": 0.2278,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2231299877166748,
"step": 1615
},
{
"epoch": 0.8490566037735849,
"grad_norm": 0.8658933639526367,
"learning_rate": 3.9945315320468125e-05,
"loss": 0.2143,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22396323084831238,
"step": 1620
},
{
"epoch": 0.8516771488469602,
"grad_norm": 0.8496825098991394,
"learning_rate": 3.994336685017825e-05,
"loss": 0.2234,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18982970714569092,
"step": 1625
},
{
"epoch": 0.8542976939203354,
"grad_norm": 0.9361600875854492,
"learning_rate": 3.994138432109679e-05,
"loss": 0.2219,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.202392578125,
"step": 1630
},
{
"epoch": 0.8569182389937107,
"grad_norm": 0.958289623260498,
"learning_rate": 3.993936773660948e-05,
"loss": 0.2324,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21435546875,
"step": 1635
},
{
"epoch": 0.859538784067086,
"grad_norm": 0.7888805866241455,
"learning_rate": 3.993731710016018e-05,
"loss": 0.2294,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20953628420829773,
"step": 1640
},
{
"epoch": 0.8621593291404612,
"grad_norm": 0.8121855854988098,
"learning_rate": 3.993523241525091e-05,
"loss": 0.2185,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.201904296875,
"step": 1645
},
{
"epoch": 0.8647798742138365,
"grad_norm": 0.8300842046737671,
"learning_rate": 3.9933113685441844e-05,
"loss": 0.2146,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22567909955978394,
"step": 1650
},
{
"epoch": 0.8674004192872118,
"grad_norm": 0.8639819622039795,
"learning_rate": 3.9930960914351316e-05,
"loss": 0.2177,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22009161114692688,
"step": 1655
},
{
"epoch": 0.870020964360587,
"grad_norm": 0.7249926924705505,
"learning_rate": 3.992877410565576e-05,
"loss": 0.2372,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23612543940544128,
"step": 1660
},
{
"epoch": 0.8726415094339622,
"grad_norm": 0.7727657556533813,
"learning_rate": 3.992655326308975e-05,
"loss": 0.2077,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25405246019363403,
"step": 1665
},
{
"epoch": 0.8752620545073375,
"grad_norm": 0.7392059564590454,
"learning_rate": 3.9924298390446e-05,
"loss": 0.2251,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2551038861274719,
"step": 1670
},
{
"epoch": 0.8778825995807128,
"grad_norm": 0.7574449181556702,
"learning_rate": 3.992200949157531e-05,
"loss": 0.2532,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2989419996738434,
"step": 1675
},
{
"epoch": 0.8805031446540881,
"grad_norm": 1.003849744796753,
"learning_rate": 3.991968657038663e-05,
"loss": 0.2327,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21192163228988647,
"step": 1680
},
{
"epoch": 0.8831236897274634,
"grad_norm": 0.7698060274124146,
"learning_rate": 3.9917329630846955e-05,
"loss": 0.222,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23520153760910034,
"step": 1685
},
{
"epoch": 0.8857442348008385,
"grad_norm": 0.8432544469833374,
"learning_rate": 3.991493867698144e-05,
"loss": 0.2253,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22624069452285767,
"step": 1690
},
{
"epoch": 0.8883647798742138,
"grad_norm": 1.2525861263275146,
"learning_rate": 3.991251371287327e-05,
"loss": 0.2269,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25401657819747925,
"step": 1695
},
{
"epoch": 0.8909853249475891,
"grad_norm": 0.7080795168876648,
"learning_rate": 3.991005474266377e-05,
"loss": 0.2328,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22854046523571014,
"step": 1700
},
{
"epoch": 0.8936058700209644,
"grad_norm": 0.7806035876274109,
"learning_rate": 3.990756177055228e-05,
"loss": 0.2291,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24327123165130615,
"step": 1705
},
{
"epoch": 0.8962264150943396,
"grad_norm": 0.8708102703094482,
"learning_rate": 3.990503480079624e-05,
"loss": 0.2572,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28605660796165466,
"step": 1710
},
{
"epoch": 0.8988469601677149,
"grad_norm": 0.745387613773346,
"learning_rate": 3.9902473837711166e-05,
"loss": 0.2443,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24579143524169922,
"step": 1715
},
{
"epoch": 0.9014675052410901,
"grad_norm": 0.7314765453338623,
"learning_rate": 3.9899878885670586e-05,
"loss": 0.2166,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22977255284786224,
"step": 1720
},
{
"epoch": 0.9040880503144654,
"grad_norm": 0.8882567286491394,
"learning_rate": 3.989724994910611e-05,
"loss": 0.2615,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2808782756328583,
"step": 1725
},
{
"epoch": 0.9067085953878407,
"grad_norm": 0.9777259826660156,
"learning_rate": 3.989458703250737e-05,
"loss": 0.2397,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19820043444633484,
"step": 1730
},
{
"epoch": 0.9093291404612159,
"grad_norm": 0.6205902695655823,
"learning_rate": 3.989189014042202e-05,
"loss": 0.2199,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24654075503349304,
"step": 1735
},
{
"epoch": 0.9119496855345912,
"grad_norm": 0.7724601030349731,
"learning_rate": 3.988915927745576e-05,
"loss": 0.2001,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18580566346645355,
"step": 1740
},
{
"epoch": 0.9145702306079665,
"grad_norm": 0.7739741206169128,
"learning_rate": 3.9886394448272274e-05,
"loss": 0.2371,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26846638321876526,
"step": 1745
},
{
"epoch": 0.9171907756813418,
"grad_norm": 0.8521780371665955,
"learning_rate": 3.988359565759328e-05,
"loss": 0.2126,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1624755859375,
"step": 1750
},
{
"epoch": 0.9198113207547169,
"grad_norm": 1.187021017074585,
"learning_rate": 3.988076291019849e-05,
"loss": 0.2214,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22318154573440552,
"step": 1755
},
{
"epoch": 0.9224318658280922,
"grad_norm": 0.84868323802948,
"learning_rate": 3.987789621092558e-05,
"loss": 0.2532,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23538881540298462,
"step": 1760
},
{
"epoch": 0.9250524109014675,
"grad_norm": 0.8934110403060913,
"learning_rate": 3.9874995564670245e-05,
"loss": 0.2231,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20849609375,
"step": 1765
},
{
"epoch": 0.9276729559748428,
"grad_norm": 1.0175093412399292,
"learning_rate": 3.987206097638614e-05,
"loss": 0.257,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21352073550224304,
"step": 1770
},
{
"epoch": 0.9302935010482181,
"grad_norm": 0.7733330726623535,
"learning_rate": 3.986909245108487e-05,
"loss": 0.2424,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2377007156610489,
"step": 1775
},
{
"epoch": 0.9329140461215933,
"grad_norm": 0.9361969828605652,
"learning_rate": 3.9866089993836006e-05,
"loss": 0.2168,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22012554109096527,
"step": 1780
},
{
"epoch": 0.9355345911949685,
"grad_norm": 0.7702367305755615,
"learning_rate": 3.986305360976709e-05,
"loss": 0.2432,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2497783899307251,
"step": 1785
},
{
"epoch": 0.9381551362683438,
"grad_norm": 0.7354511618614197,
"learning_rate": 3.985998330406357e-05,
"loss": 0.2343,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19119471311569214,
"step": 1790
},
{
"epoch": 0.9407756813417191,
"grad_norm": 3.6996049880981445,
"learning_rate": 3.9856879081968846e-05,
"loss": 0.2216,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.255986750125885,
"step": 1795
},
{
"epoch": 0.9433962264150944,
"grad_norm": 0.7672015428543091,
"learning_rate": 3.985374094878423e-05,
"loss": 0.2202,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22132608294487,
"step": 1800
},
{
"epoch": 0.9460167714884696,
"grad_norm": 0.9519333839416504,
"learning_rate": 3.985056890986895e-05,
"loss": 0.2166,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2124580442905426,
"step": 1805
},
{
"epoch": 0.9486373165618449,
"grad_norm": 0.7499129772186279,
"learning_rate": 3.984736297064012e-05,
"loss": 0.2235,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22277656197547913,
"step": 1810
},
{
"epoch": 0.9512578616352201,
"grad_norm": 0.7869206070899963,
"learning_rate": 3.984412313657279e-05,
"loss": 0.2222,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18107184767723083,
"step": 1815
},
{
"epoch": 0.9538784067085954,
"grad_norm": 0.6203080415725708,
"learning_rate": 3.984084941319985e-05,
"loss": 0.2191,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.29906970262527466,
"step": 1820
},
{
"epoch": 0.9564989517819706,
"grad_norm": 0.7308595776557922,
"learning_rate": 3.983754180611209e-05,
"loss": 0.2391,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2216796875,
"step": 1825
},
{
"epoch": 0.9591194968553459,
"grad_norm": 0.686708390712738,
"learning_rate": 3.983420032095817e-05,
"loss": 0.2421,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23726975917816162,
"step": 1830
},
{
"epoch": 0.9617400419287212,
"grad_norm": 0.633300244808197,
"learning_rate": 3.983082496344458e-05,
"loss": 0.2228,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22657258808612823,
"step": 1835
},
{
"epoch": 0.9643605870020965,
"grad_norm": 0.6701763868331909,
"learning_rate": 3.982741573933568e-05,
"loss": 0.2363,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22205190360546112,
"step": 1840
},
{
"epoch": 0.9669811320754716,
"grad_norm": 0.749075174331665,
"learning_rate": 3.9823972654453664e-05,
"loss": 0.2261,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22223912179470062,
"step": 1845
},
{
"epoch": 0.9696016771488469,
"grad_norm": 0.9106373190879822,
"learning_rate": 3.9820495714678536e-05,
"loss": 0.2036,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20068359375,
"step": 1850
},
{
"epoch": 0.9722222222222222,
"grad_norm": 2.0793375968933105,
"learning_rate": 3.981698492594814e-05,
"loss": 0.2129,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19463402032852173,
"step": 1855
},
{
"epoch": 0.9748427672955975,
"grad_norm": 0.995448112487793,
"learning_rate": 3.981344029425811e-05,
"loss": 0.2196,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20898227393627167,
"step": 1860
},
{
"epoch": 0.9774633123689728,
"grad_norm": 0.8250730633735657,
"learning_rate": 3.980986182566188e-05,
"loss": 0.2307,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2784325182437897,
"step": 1865
},
{
"epoch": 0.980083857442348,
"grad_norm": 0.7888976335525513,
"learning_rate": 3.980624952627067e-05,
"loss": 0.232,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1858631819486618,
"step": 1870
},
{
"epoch": 0.9827044025157232,
"grad_norm": 0.6838705539703369,
"learning_rate": 3.980260340225347e-05,
"loss": 0.2289,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22918249666690826,
"step": 1875
},
{
"epoch": 0.9853249475890985,
"grad_norm": 0.737946093082428,
"learning_rate": 3.979892345983706e-05,
"loss": 0.2264,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24606642127037048,
"step": 1880
},
{
"epoch": 0.9879454926624738,
"grad_norm": 0.649776816368103,
"learning_rate": 3.979520970530594e-05,
"loss": 0.2278,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22287237644195557,
"step": 1885
},
{
"epoch": 0.9905660377358491,
"grad_norm": 0.8000211715698242,
"learning_rate": 3.979146214500237e-05,
"loss": 0.2398,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2615301012992859,
"step": 1890
},
{
"epoch": 0.9931865828092243,
"grad_norm": 0.6818714141845703,
"learning_rate": 3.9787680785326343e-05,
"loss": 0.2103,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20701850950717926,
"step": 1895
},
{
"epoch": 0.9958071278825996,
"grad_norm": 0.8771920204162598,
"learning_rate": 3.978386563273557e-05,
"loss": 0.2325,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2713809609413147,
"step": 1900
},
{
"epoch": 0.9984276729559748,
"grad_norm": 0.6226481795310974,
"learning_rate": 3.978001669374548e-05,
"loss": 0.2313,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24410386383533478,
"step": 1905
},
{
"epoch": 1.00104821802935,
"grad_norm": 0.7404968738555908,
"learning_rate": 3.9776133974929193e-05,
"loss": 0.201,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20939648151397705,
"step": 1910
},
{
"epoch": 1.0036687631027255,
"grad_norm": 0.8718746900558472,
"learning_rate": 3.9772217482917524e-05,
"loss": 0.1977,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18115234375,
"step": 1915
},
{
"epoch": 1.0062893081761006,
"grad_norm": 0.7572813034057617,
"learning_rate": 3.9768267224398956e-05,
"loss": 0.2204,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21850603818893433,
"step": 1920
},
{
"epoch": 1.0089098532494758,
"grad_norm": 0.7003916501998901,
"learning_rate": 3.976428320611965e-05,
"loss": 0.2008,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14007088541984558,
"step": 1925
},
{
"epoch": 1.0115303983228512,
"grad_norm": 0.808007001876831,
"learning_rate": 3.976026543488341e-05,
"loss": 0.2156,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20655229687690735,
"step": 1930
},
{
"epoch": 1.0141509433962264,
"grad_norm": 0.7994524240493774,
"learning_rate": 3.9756213917551685e-05,
"loss": 0.2088,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21075238287448883,
"step": 1935
},
{
"epoch": 1.0167714884696017,
"grad_norm": 0.7276920676231384,
"learning_rate": 3.975212866104356e-05,
"loss": 0.2034,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20128029584884644,
"step": 1940
},
{
"epoch": 1.019392033542977,
"grad_norm": 0.628477156162262,
"learning_rate": 3.974800967233574e-05,
"loss": 0.2324,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26335054636001587,
"step": 1945
},
{
"epoch": 1.0220125786163523,
"grad_norm": 0.784572958946228,
"learning_rate": 3.974385695846252e-05,
"loss": 0.2007,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21776393055915833,
"step": 1950
},
{
"epoch": 1.0246331236897275,
"grad_norm": 0.7957406044006348,
"learning_rate": 3.9739670526515815e-05,
"loss": 0.2092,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21917438507080078,
"step": 1955
},
{
"epoch": 1.0272536687631026,
"grad_norm": 0.7487369179725647,
"learning_rate": 3.9735450383645104e-05,
"loss": 0.1986,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1875,
"step": 1960
},
{
"epoch": 1.029874213836478,
"grad_norm": 0.7331587672233582,
"learning_rate": 3.9731196537057445e-05,
"loss": 0.2249,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19073855876922607,
"step": 1965
},
{
"epoch": 1.0324947589098532,
"grad_norm": 0.827370285987854,
"learning_rate": 3.972690899401745e-05,
"loss": 0.2076,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20354562997817993,
"step": 1970
},
{
"epoch": 1.0351153039832286,
"grad_norm": 0.6845994591712952,
"learning_rate": 3.9722587761847294e-05,
"loss": 0.2144,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24186748266220093,
"step": 1975
},
{
"epoch": 1.0377358490566038,
"grad_norm": 0.7059889435768127,
"learning_rate": 3.971823284792665e-05,
"loss": 0.2165,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22607070207595825,
"step": 1980
},
{
"epoch": 1.040356394129979,
"grad_norm": 0.6191701889038086,
"learning_rate": 3.9713844259692746e-05,
"loss": 0.2152,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19120293855667114,
"step": 1985
},
{
"epoch": 1.0429769392033543,
"grad_norm": 0.6356191039085388,
"learning_rate": 3.970942200464031e-05,
"loss": 0.2068,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21611721813678741,
"step": 1990
},
{
"epoch": 1.0455974842767295,
"grad_norm": 0.6623996496200562,
"learning_rate": 3.9704966090321536e-05,
"loss": 0.2161,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2316063493490219,
"step": 1995
},
{
"epoch": 1.0482180293501049,
"grad_norm": 0.7194094061851501,
"learning_rate": 3.970047652434615e-05,
"loss": 0.2052,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2116880565881729,
"step": 2000
},
{
"epoch": 1.05083857442348,
"grad_norm": 0.9472671151161194,
"learning_rate": 3.9695953314381305e-05,
"loss": 0.2152,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20667138695716858,
"step": 2005
},
{
"epoch": 1.0534591194968554,
"grad_norm": 0.9666203260421753,
"learning_rate": 3.969139646815165e-05,
"loss": 0.2201,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2363303005695343,
"step": 2010
},
{
"epoch": 1.0560796645702306,
"grad_norm": 0.8482215404510498,
"learning_rate": 3.9686805993439226e-05,
"loss": 0.2039,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21171987056732178,
"step": 2015
},
{
"epoch": 1.0587002096436058,
"grad_norm": 1.468690276145935,
"learning_rate": 3.968218189808356e-05,
"loss": 0.1857,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.190673828125,
"step": 2020
},
{
"epoch": 1.0613207547169812,
"grad_norm": 0.7537864446640015,
"learning_rate": 3.967752418998155e-05,
"loss": 0.2281,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2312135249376297,
"step": 2025
},
{
"epoch": 1.0639412997903563,
"grad_norm": 0.7670570015907288,
"learning_rate": 3.9672832877087524e-05,
"loss": 0.2033,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24082399904727936,
"step": 2030
},
{
"epoch": 1.0665618448637317,
"grad_norm": 0.7302185893058777,
"learning_rate": 3.966810796741318e-05,
"loss": 0.2164,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2091417908668518,
"step": 2035
},
{
"epoch": 1.069182389937107,
"grad_norm": 0.7771731615066528,
"learning_rate": 3.9663349469027626e-05,
"loss": 0.2248,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2181556224822998,
"step": 2040
},
{
"epoch": 1.0718029350104823,
"grad_norm": 0.7886734008789062,
"learning_rate": 3.9658557390057286e-05,
"loss": 0.2197,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23269076645374298,
"step": 2045
},
{
"epoch": 1.0744234800838575,
"grad_norm": 1.0671067237854004,
"learning_rate": 3.965373173868596e-05,
"loss": 0.211,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.178466796875,
"step": 2050
},
{
"epoch": 1.0770440251572326,
"grad_norm": 0.7055492401123047,
"learning_rate": 3.9648872523154785e-05,
"loss": 0.1981,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18505859375,
"step": 2055
},
{
"epoch": 1.079664570230608,
"grad_norm": 0.7833647131919861,
"learning_rate": 3.96439797517622e-05,
"loss": 0.2074,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21262788772583008,
"step": 2060
},
{
"epoch": 1.0822851153039832,
"grad_norm": 0.8312519192695618,
"learning_rate": 3.963905343286396e-05,
"loss": 0.2008,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2077818363904953,
"step": 2065
},
{
"epoch": 1.0849056603773586,
"grad_norm": 0.7561509609222412,
"learning_rate": 3.963409357487312e-05,
"loss": 0.2079,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23984187841415405,
"step": 2070
},
{
"epoch": 1.0875262054507338,
"grad_norm": 0.6779179573059082,
"learning_rate": 3.9629100186259994e-05,
"loss": 0.2279,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21529117226600647,
"step": 2075
},
{
"epoch": 1.090146750524109,
"grad_norm": 0.7629055976867676,
"learning_rate": 3.9624073275552176e-05,
"loss": 0.2046,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19189453125,
"step": 2080
},
{
"epoch": 1.0927672955974843,
"grad_norm": 0.7624127864837646,
"learning_rate": 3.96190128513345e-05,
"loss": 0.2097,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21261179447174072,
"step": 2085
},
{
"epoch": 1.0953878406708595,
"grad_norm": 0.9637182950973511,
"learning_rate": 3.9613918922249025e-05,
"loss": 0.1901,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1895751953125,
"step": 2090
},
{
"epoch": 1.0980083857442349,
"grad_norm": 0.6332825422286987,
"learning_rate": 3.960879149699505e-05,
"loss": 0.196,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19877511262893677,
"step": 2095
},
{
"epoch": 1.10062893081761,
"grad_norm": 0.7005939483642578,
"learning_rate": 3.960363058432906e-05,
"loss": 0.2077,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.171142578125,
"step": 2100
},
{
"epoch": 1.1032494758909852,
"grad_norm": 0.8345542550086975,
"learning_rate": 3.959843619306472e-05,
"loss": 0.2043,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22179073095321655,
"step": 2105
},
{
"epoch": 1.1058700209643606,
"grad_norm": 0.6734275221824646,
"learning_rate": 3.959320833207292e-05,
"loss": 0.2301,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24089553952217102,
"step": 2110
},
{
"epoch": 1.1084905660377358,
"grad_norm": 0.883843183517456,
"learning_rate": 3.958794701028164e-05,
"loss": 0.2037,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.231886625289917,
"step": 2115
},
{
"epoch": 1.1111111111111112,
"grad_norm": 1.0141050815582275,
"learning_rate": 3.958265223667605e-05,
"loss": 0.2103,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2082097977399826,
"step": 2120
},
{
"epoch": 1.1137316561844863,
"grad_norm": 0.9002329111099243,
"learning_rate": 3.957732402029842e-05,
"loss": 0.2059,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.166748046875,
"step": 2125
},
{
"epoch": 1.1163522012578617,
"grad_norm": 0.7092249989509583,
"learning_rate": 3.957196237024817e-05,
"loss": 0.2313,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.27344369888305664,
"step": 2130
},
{
"epoch": 1.118972746331237,
"grad_norm": 0.601761519908905,
"learning_rate": 3.956656729568178e-05,
"loss": 0.2101,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22603391110897064,
"step": 2135
},
{
"epoch": 1.121593291404612,
"grad_norm": 0.7879045605659485,
"learning_rate": 3.956113880581282e-05,
"loss": 0.2111,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20690859854221344,
"step": 2140
},
{
"epoch": 1.1242138364779874,
"grad_norm": 0.6645146608352661,
"learning_rate": 3.955567690991195e-05,
"loss": 0.223,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21006427705287933,
"step": 2145
},
{
"epoch": 1.1268343815513626,
"grad_norm": 0.6369447112083435,
"learning_rate": 3.9550181617306845e-05,
"loss": 0.2318,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23548580706119537,
"step": 2150
},
{
"epoch": 1.129454926624738,
"grad_norm": 0.7624822854995728,
"learning_rate": 3.9544652937382235e-05,
"loss": 0.1966,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.175537109375,
"step": 2155
},
{
"epoch": 1.1320754716981132,
"grad_norm": 0.8087216019630432,
"learning_rate": 3.953909087957987e-05,
"loss": 0.1976,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16578355431556702,
"step": 2160
},
{
"epoch": 1.1346960167714886,
"grad_norm": 1.0704227685928345,
"learning_rate": 3.9533495453398485e-05,
"loss": 0.2171,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1859961748123169,
"step": 2165
},
{
"epoch": 1.1373165618448637,
"grad_norm": 0.8290188908576965,
"learning_rate": 3.952786666839382e-05,
"loss": 0.2155,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22795447707176208,
"step": 2170
},
{
"epoch": 1.139937106918239,
"grad_norm": 0.7330989837646484,
"learning_rate": 3.9522204534178574e-05,
"loss": 0.1959,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20861802995204926,
"step": 2175
},
{
"epoch": 1.1425576519916143,
"grad_norm": 0.7780997157096863,
"learning_rate": 3.9516509060422395e-05,
"loss": 0.2103,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21568194031715393,
"step": 2180
},
{
"epoch": 1.1451781970649895,
"grad_norm": 0.7849951982498169,
"learning_rate": 3.9510780256851886e-05,
"loss": 0.2216,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2479293793439865,
"step": 2185
},
{
"epoch": 1.1477987421383649,
"grad_norm": 0.9373758435249329,
"learning_rate": 3.950501813325054e-05,
"loss": 0.2058,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19455841183662415,
"step": 2190
},
{
"epoch": 1.15041928721174,
"grad_norm": 0.7577291131019592,
"learning_rate": 3.949922269945878e-05,
"loss": 0.2258,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19436845183372498,
"step": 2195
},
{
"epoch": 1.1530398322851152,
"grad_norm": 0.7371213436126709,
"learning_rate": 3.9493393965373904e-05,
"loss": 0.219,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2209862470626831,
"step": 2200
},
{
"epoch": 1.1556603773584906,
"grad_norm": 0.632513701915741,
"learning_rate": 3.948753194095008e-05,
"loss": 0.2091,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2065919041633606,
"step": 2205
},
{
"epoch": 1.1582809224318658,
"grad_norm": 0.8861731886863708,
"learning_rate": 3.9481636636198325e-05,
"loss": 0.2027,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19612561166286469,
"step": 2210
},
{
"epoch": 1.1609014675052411,
"grad_norm": 1.6322046518325806,
"learning_rate": 3.94757080611865e-05,
"loss": 0.1947,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.185546875,
"step": 2215
},
{
"epoch": 1.1635220125786163,
"grad_norm": 0.7297495603561401,
"learning_rate": 3.9469746226039285e-05,
"loss": 0.2012,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1977044939994812,
"step": 2220
},
{
"epoch": 1.1661425576519917,
"grad_norm": 0.7956055402755737,
"learning_rate": 3.946375114093816e-05,
"loss": 0.1991,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19768553972244263,
"step": 2225
},
{
"epoch": 1.1687631027253669,
"grad_norm": 0.6843878626823425,
"learning_rate": 3.9457722816121354e-05,
"loss": 0.2115,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23284509778022766,
"step": 2230
},
{
"epoch": 1.171383647798742,
"grad_norm": 0.8009164333343506,
"learning_rate": 3.945166126188392e-05,
"loss": 0.1903,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.176334410905838,
"step": 2235
},
{
"epoch": 1.1740041928721174,
"grad_norm": 0.8924102187156677,
"learning_rate": 3.9445566488577624e-05,
"loss": 0.201,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16902442276477814,
"step": 2240
},
{
"epoch": 1.1766247379454926,
"grad_norm": 0.6214645504951477,
"learning_rate": 3.943943850661097e-05,
"loss": 0.2011,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19259414076805115,
"step": 2245
},
{
"epoch": 1.179245283018868,
"grad_norm": 1.5890018939971924,
"learning_rate": 3.943327732644917e-05,
"loss": 0.2003,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23410651087760925,
"step": 2250
},
{
"epoch": 1.1818658280922432,
"grad_norm": 0.6572086215019226,
"learning_rate": 3.942708295861415e-05,
"loss": 0.2193,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2210836112499237,
"step": 2255
},
{
"epoch": 1.1844863731656186,
"grad_norm": 0.9096604585647583,
"learning_rate": 3.942085541368448e-05,
"loss": 0.2123,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1952608823776245,
"step": 2260
},
{
"epoch": 1.1871069182389937,
"grad_norm": 0.7038206458091736,
"learning_rate": 3.941459470229542e-05,
"loss": 0.1928,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19516688585281372,
"step": 2265
},
{
"epoch": 1.189727463312369,
"grad_norm": 0.8303375840187073,
"learning_rate": 3.940830083513885e-05,
"loss": 0.2058,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22259192168712616,
"step": 2270
},
{
"epoch": 1.1923480083857443,
"grad_norm": 0.708311140537262,
"learning_rate": 3.940197382296329e-05,
"loss": 0.214,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23684288561344147,
"step": 2275
},
{
"epoch": 1.1949685534591195,
"grad_norm": 0.6626346707344055,
"learning_rate": 3.9395613676573863e-05,
"loss": 0.2181,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.245641827583313,
"step": 2280
},
{
"epoch": 1.1975890985324948,
"grad_norm": 0.7180384397506714,
"learning_rate": 3.9389220406832256e-05,
"loss": 0.2016,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19217683374881744,
"step": 2285
},
{
"epoch": 1.20020964360587,
"grad_norm": 0.6733828186988831,
"learning_rate": 3.938279402465674e-05,
"loss": 0.1839,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17612981796264648,
"step": 2290
},
{
"epoch": 1.2028301886792452,
"grad_norm": 0.5625250339508057,
"learning_rate": 3.937633454102214e-05,
"loss": 0.1945,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1847318410873413,
"step": 2295
},
{
"epoch": 1.2054507337526206,
"grad_norm": 0.5799747705459595,
"learning_rate": 3.93698419669598e-05,
"loss": 0.2065,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21696288883686066,
"step": 2300
},
{
"epoch": 1.2080712788259957,
"grad_norm": 0.6593776345252991,
"learning_rate": 3.936331631355757e-05,
"loss": 0.2048,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18715828657150269,
"step": 2305
},
{
"epoch": 1.2106918238993711,
"grad_norm": 0.7460365295410156,
"learning_rate": 3.9356757591959815e-05,
"loss": 0.2206,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21599701046943665,
"step": 2310
},
{
"epoch": 1.2133123689727463,
"grad_norm": 0.6883477568626404,
"learning_rate": 3.9350165813367344e-05,
"loss": 0.2114,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1934875249862671,
"step": 2315
},
{
"epoch": 1.2159329140461215,
"grad_norm": 0.7474843263626099,
"learning_rate": 3.9343540989037455e-05,
"loss": 0.2111,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21142578125,
"step": 2320
},
{
"epoch": 1.2185534591194969,
"grad_norm": 0.7178785800933838,
"learning_rate": 3.933688313028384e-05,
"loss": 0.2146,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2500786781311035,
"step": 2325
},
{
"epoch": 1.221174004192872,
"grad_norm": 0.7264668941497803,
"learning_rate": 3.933019224847663e-05,
"loss": 0.2121,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22011686861515045,
"step": 2330
},
{
"epoch": 1.2237945492662474,
"grad_norm": 0.6529995203018188,
"learning_rate": 3.9323468355042354e-05,
"loss": 0.2178,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19955116510391235,
"step": 2335
},
{
"epoch": 1.2264150943396226,
"grad_norm": 0.7731424570083618,
"learning_rate": 3.931671146146391e-05,
"loss": 0.1926,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.201904296875,
"step": 2340
},
{
"epoch": 1.229035639412998,
"grad_norm": 0.6317564845085144,
"learning_rate": 3.930992157928056e-05,
"loss": 0.1962,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21055123209953308,
"step": 2345
},
{
"epoch": 1.2316561844863732,
"grad_norm": 0.7882787585258484,
"learning_rate": 3.930309872008788e-05,
"loss": 0.2244,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21452301740646362,
"step": 2350
},
{
"epoch": 1.2342767295597485,
"grad_norm": 0.6930122971534729,
"learning_rate": 3.92962428955378e-05,
"loss": 0.2011,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19605226814746857,
"step": 2355
},
{
"epoch": 1.2368972746331237,
"grad_norm": 0.7814037799835205,
"learning_rate": 3.928935411733852e-05,
"loss": 0.2076,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17131970822811127,
"step": 2360
},
{
"epoch": 1.2395178197064989,
"grad_norm": 0.6624770760536194,
"learning_rate": 3.928243239725453e-05,
"loss": 0.2002,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22540496289730072,
"step": 2365
},
{
"epoch": 1.2421383647798743,
"grad_norm": 0.8233559727668762,
"learning_rate": 3.927547774710658e-05,
"loss": 0.2039,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2119140625,
"step": 2370
},
{
"epoch": 1.2447589098532494,
"grad_norm": 0.7578710317611694,
"learning_rate": 3.926849017877163e-05,
"loss": 0.1919,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20740604400634766,
"step": 2375
},
{
"epoch": 1.2473794549266248,
"grad_norm": 0.786601722240448,
"learning_rate": 3.926146970418289e-05,
"loss": 0.2168,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24346387386322021,
"step": 2380
},
{
"epoch": 1.25,
"grad_norm": 0.7464667558670044,
"learning_rate": 3.925441633532976e-05,
"loss": 0.2058,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.185876727104187,
"step": 2385
},
{
"epoch": 1.2526205450733752,
"grad_norm": 0.6592541337013245,
"learning_rate": 3.92473300842578e-05,
"loss": 0.1942,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18294116854667664,
"step": 2390
},
{
"epoch": 1.2552410901467506,
"grad_norm": 0.6724171042442322,
"learning_rate": 3.9240210963068734e-05,
"loss": 0.2013,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21190500259399414,
"step": 2395
},
{
"epoch": 1.2578616352201257,
"grad_norm": 0.8747267127037048,
"learning_rate": 3.923305898392043e-05,
"loss": 0.213,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.25538158416748047,
"step": 2400
},
{
"epoch": 1.2604821802935011,
"grad_norm": 0.7435500621795654,
"learning_rate": 3.922587415902686e-05,
"loss": 0.2191,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2292519509792328,
"step": 2405
},
{
"epoch": 1.2631027253668763,
"grad_norm": 0.790035605430603,
"learning_rate": 3.921865650065809e-05,
"loss": 0.209,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21131622791290283,
"step": 2410
},
{
"epoch": 1.2657232704402515,
"grad_norm": 0.7259888648986816,
"learning_rate": 3.921140602114026e-05,
"loss": 0.2129,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22094830870628357,
"step": 2415
},
{
"epoch": 1.2683438155136268,
"grad_norm": 0.5739971399307251,
"learning_rate": 3.920412273285556e-05,
"loss": 0.1962,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18913546204566956,
"step": 2420
},
{
"epoch": 1.270964360587002,
"grad_norm": 0.8419296145439148,
"learning_rate": 3.9196806648242216e-05,
"loss": 0.2064,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.203369140625,
"step": 2425
},
{
"epoch": 1.2735849056603774,
"grad_norm": 0.6042289137840271,
"learning_rate": 3.9189457779794446e-05,
"loss": 0.2043,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2455616444349289,
"step": 2430
},
{
"epoch": 1.2762054507337526,
"grad_norm": 0.6255780458450317,
"learning_rate": 3.9182076140062475e-05,
"loss": 0.2042,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22539886832237244,
"step": 2435
},
{
"epoch": 1.2788259958071277,
"grad_norm": 0.6214666366577148,
"learning_rate": 3.9174661741652483e-05,
"loss": 0.2057,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20764382183551788,
"step": 2440
},
{
"epoch": 1.2814465408805031,
"grad_norm": 0.8051807880401611,
"learning_rate": 3.91672145972266e-05,
"loss": 0.1999,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19399192929267883,
"step": 2445
},
{
"epoch": 1.2840670859538785,
"grad_norm": 0.6633710265159607,
"learning_rate": 3.915973471950287e-05,
"loss": 0.225,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22795894742012024,
"step": 2450
},
{
"epoch": 1.2866876310272537,
"grad_norm": 0.6695340871810913,
"learning_rate": 3.915222212125526e-05,
"loss": 0.2077,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.167012557387352,
"step": 2455
},
{
"epoch": 1.2893081761006289,
"grad_norm": 14.448149681091309,
"learning_rate": 3.914467681531358e-05,
"loss": 0.2177,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23487228155136108,
"step": 2460
},
{
"epoch": 1.2919287211740043,
"grad_norm": 0.696930468082428,
"learning_rate": 3.9137098814563535e-05,
"loss": 0.2161,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2489500641822815,
"step": 2465
},
{
"epoch": 1.2945492662473794,
"grad_norm": 0.7310530543327332,
"learning_rate": 3.912948813194663e-05,
"loss": 0.2189,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20653784275054932,
"step": 2470
},
{
"epoch": 1.2971698113207548,
"grad_norm": 0.6279557943344116,
"learning_rate": 3.9121844780460226e-05,
"loss": 0.1909,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19262602925300598,
"step": 2475
},
{
"epoch": 1.29979035639413,
"grad_norm": 0.8946002125740051,
"learning_rate": 3.911416877315743e-05,
"loss": 0.2144,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.237060546875,
"step": 2480
},
{
"epoch": 1.3024109014675052,
"grad_norm": 0.701214075088501,
"learning_rate": 3.9106460123147145e-05,
"loss": 0.2005,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.182861328125,
"step": 2485
},
{
"epoch": 1.3050314465408805,
"grad_norm": 0.718437671661377,
"learning_rate": 3.909871884359401e-05,
"loss": 0.2309,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2242903858423233,
"step": 2490
},
{
"epoch": 1.3076519916142557,
"grad_norm": 0.6559104323387146,
"learning_rate": 3.90909449477184e-05,
"loss": 0.2131,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1921052485704422,
"step": 2495
},
{
"epoch": 1.310272536687631,
"grad_norm": 0.8101383447647095,
"learning_rate": 3.9083138448796385e-05,
"loss": 0.1957,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1594591736793518,
"step": 2500
},
{
"epoch": 1.3128930817610063,
"grad_norm": 0.7414583563804626,
"learning_rate": 3.907529936015971e-05,
"loss": 0.2013,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21024464070796967,
"step": 2505
},
{
"epoch": 1.3155136268343814,
"grad_norm": 0.6516440510749817,
"learning_rate": 3.9067427695195764e-05,
"loss": 0.1987,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2437392771244049,
"step": 2510
},
{
"epoch": 1.3181341719077568,
"grad_norm": 0.8260552883148193,
"learning_rate": 3.905952346734759e-05,
"loss": 0.2162,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21447709202766418,
"step": 2515
},
{
"epoch": 1.320754716981132,
"grad_norm": 0.8752504587173462,
"learning_rate": 3.905158669011385e-05,
"loss": 0.2065,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16129451990127563,
"step": 2520
},
{
"epoch": 1.3233752620545074,
"grad_norm": 0.7147773504257202,
"learning_rate": 3.904361737704876e-05,
"loss": 0.1988,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21069785952568054,
"step": 2525
},
{
"epoch": 1.3259958071278826,
"grad_norm": 0.6199226379394531,
"learning_rate": 3.903561554176213e-05,
"loss": 0.1971,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2516867518424988,
"step": 2530
},
{
"epoch": 1.3286163522012577,
"grad_norm": 0.8225551843643188,
"learning_rate": 3.902758119791928e-05,
"loss": 0.2136,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1904865801334381,
"step": 2535
},
{
"epoch": 1.3312368972746331,
"grad_norm": 0.6185528039932251,
"learning_rate": 3.901951435924107e-05,
"loss": 0.2117,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23471373319625854,
"step": 2540
},
{
"epoch": 1.3338574423480085,
"grad_norm": 0.7478176355361938,
"learning_rate": 3.901141503950386e-05,
"loss": 0.2252,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20767131447792053,
"step": 2545
},
{
"epoch": 1.3364779874213837,
"grad_norm": 1.0327668190002441,
"learning_rate": 3.900328325253946e-05,
"loss": 0.1844,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18234822154045105,
"step": 2550
},
{
"epoch": 1.3390985324947589,
"grad_norm": 0.6867483854293823,
"learning_rate": 3.8995119012235134e-05,
"loss": 0.1999,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1513671875,
"step": 2555
},
{
"epoch": 1.3417190775681342,
"grad_norm": 2.9362030029296875,
"learning_rate": 3.898692233253358e-05,
"loss": 0.2128,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22732852399349213,
"step": 2560
},
{
"epoch": 1.3443396226415094,
"grad_norm": 0.7277422547340393,
"learning_rate": 3.8978693227432874e-05,
"loss": 0.2124,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21256864070892334,
"step": 2565
},
{
"epoch": 1.3469601677148848,
"grad_norm": 0.7678915858268738,
"learning_rate": 3.897043171098649e-05,
"loss": 0.2056,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21991431713104248,
"step": 2570
},
{
"epoch": 1.34958071278826,
"grad_norm": 0.6507485508918762,
"learning_rate": 3.8962137797303235e-05,
"loss": 0.2197,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23043885827064514,
"step": 2575
},
{
"epoch": 1.3522012578616351,
"grad_norm": 0.8911487460136414,
"learning_rate": 3.8953811500547266e-05,
"loss": 0.1867,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1748046875,
"step": 2580
},
{
"epoch": 1.3548218029350105,
"grad_norm": 0.6960018277168274,
"learning_rate": 3.8945452834938005e-05,
"loss": 0.182,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1771518588066101,
"step": 2585
},
{
"epoch": 1.3574423480083857,
"grad_norm": 0.6837010979652405,
"learning_rate": 3.8937061814750194e-05,
"loss": 0.1966,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23394589126110077,
"step": 2590
},
{
"epoch": 1.360062893081761,
"grad_norm": 0.6866763234138489,
"learning_rate": 3.8928638454313795e-05,
"loss": 0.2259,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2412635087966919,
"step": 2595
},
{
"epoch": 1.3626834381551363,
"grad_norm": 0.8366163372993469,
"learning_rate": 3.8920182768014034e-05,
"loss": 0.2101,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22028440237045288,
"step": 2600
},
{
"epoch": 1.3653039832285114,
"grad_norm": 0.7486369013786316,
"learning_rate": 3.891169477029131e-05,
"loss": 0.1983,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20773348212242126,
"step": 2605
},
{
"epoch": 1.3679245283018868,
"grad_norm": 0.6341990232467651,
"learning_rate": 3.890317447564123e-05,
"loss": 0.2024,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1899259388446808,
"step": 2610
},
{
"epoch": 1.370545073375262,
"grad_norm": 0.6667135953903198,
"learning_rate": 3.889462189861452e-05,
"loss": 0.2089,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1973501443862915,
"step": 2615
},
{
"epoch": 1.3731656184486374,
"grad_norm": 0.7557380199432373,
"learning_rate": 3.888603705381709e-05,
"loss": 0.1967,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2168242335319519,
"step": 2620
},
{
"epoch": 1.3757861635220126,
"grad_norm": 0.731421172618866,
"learning_rate": 3.8877419955909905e-05,
"loss": 0.2162,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20348013937473297,
"step": 2625
},
{
"epoch": 1.3784067085953877,
"grad_norm": 0.6229634284973145,
"learning_rate": 3.886877061960905e-05,
"loss": 0.1907,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2174549698829651,
"step": 2630
},
{
"epoch": 1.381027253668763,
"grad_norm": 0.8051416277885437,
"learning_rate": 3.886008905968563e-05,
"loss": 0.2412,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22031381726264954,
"step": 2635
},
{
"epoch": 1.3836477987421385,
"grad_norm": 0.6593210101127625,
"learning_rate": 3.8851375290965816e-05,
"loss": 0.1999,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17712393403053284,
"step": 2640
},
{
"epoch": 1.3862683438155137,
"grad_norm": 0.6895683407783508,
"learning_rate": 3.884262932833076e-05,
"loss": 0.2126,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1672450453042984,
"step": 2645
},
{
"epoch": 1.3888888888888888,
"grad_norm": 0.8762910962104797,
"learning_rate": 3.88338511867166e-05,
"loss": 0.1984,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16539700329303741,
"step": 2650
},
{
"epoch": 1.3915094339622642,
"grad_norm": 0.6993823647499084,
"learning_rate": 3.882504088111444e-05,
"loss": 0.2219,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2100042849779129,
"step": 2655
},
{
"epoch": 1.3941299790356394,
"grad_norm": 0.7604408860206604,
"learning_rate": 3.8816198426570296e-05,
"loss": 0.2167,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2532285749912262,
"step": 2660
},
{
"epoch": 1.3967505241090148,
"grad_norm": 0.6632134318351746,
"learning_rate": 3.880732383818509e-05,
"loss": 0.2115,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2341681867837906,
"step": 2665
},
{
"epoch": 1.39937106918239,
"grad_norm": 0.7571194767951965,
"learning_rate": 3.879841713111463e-05,
"loss": 0.1962,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19808389246463776,
"step": 2670
},
{
"epoch": 1.4019916142557651,
"grad_norm": 0.985468864440918,
"learning_rate": 3.8789478320569585e-05,
"loss": 0.208,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21600691974163055,
"step": 2675
},
{
"epoch": 1.4046121593291405,
"grad_norm": 0.7668294906616211,
"learning_rate": 3.878050742181542e-05,
"loss": 0.2127,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22027012705802917,
"step": 2680
},
{
"epoch": 1.4072327044025157,
"grad_norm": 0.7187151312828064,
"learning_rate": 3.8771504450172415e-05,
"loss": 0.2065,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1777723878622055,
"step": 2685
},
{
"epoch": 1.409853249475891,
"grad_norm": 0.6384240388870239,
"learning_rate": 3.876246942101563e-05,
"loss": 0.193,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21956053376197815,
"step": 2690
},
{
"epoch": 1.4124737945492662,
"grad_norm": 0.8472657799720764,
"learning_rate": 3.875340234977486e-05,
"loss": 0.174,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17662659287452698,
"step": 2695
},
{
"epoch": 1.4150943396226414,
"grad_norm": 0.7782993912696838,
"learning_rate": 3.874430325193464e-05,
"loss": 0.2001,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20671376585960388,
"step": 2700
},
{
"epoch": 1.4177148846960168,
"grad_norm": 1.1483628749847412,
"learning_rate": 3.873517214303417e-05,
"loss": 0.1982,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1874844878911972,
"step": 2705
},
{
"epoch": 1.420335429769392,
"grad_norm": 0.936142086982727,
"learning_rate": 3.872600903866733e-05,
"loss": 0.2218,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16953758895397186,
"step": 2710
},
{
"epoch": 1.4229559748427674,
"grad_norm": 0.9390134811401367,
"learning_rate": 3.871681395448266e-05,
"loss": 0.1916,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19737115502357483,
"step": 2715
},
{
"epoch": 1.4255765199161425,
"grad_norm": 0.762726366519928,
"learning_rate": 3.8707586906183294e-05,
"loss": 0.2084,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2282119244337082,
"step": 2720
},
{
"epoch": 1.4281970649895177,
"grad_norm": 0.6033948063850403,
"learning_rate": 3.869832790952695e-05,
"loss": 0.1972,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20079247653484344,
"step": 2725
},
{
"epoch": 1.430817610062893,
"grad_norm": 0.8036638498306274,
"learning_rate": 3.868903698032593e-05,
"loss": 0.2063,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.216064453125,
"step": 2730
},
{
"epoch": 1.4334381551362683,
"grad_norm": 0.6354225277900696,
"learning_rate": 3.867971413444704e-05,
"loss": 0.198,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2076335847377777,
"step": 2735
},
{
"epoch": 1.4360587002096437,
"grad_norm": 0.6323251724243164,
"learning_rate": 3.867035938781161e-05,
"loss": 0.2135,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19645678997039795,
"step": 2740
},
{
"epoch": 1.4386792452830188,
"grad_norm": 0.6425936222076416,
"learning_rate": 3.866097275639545e-05,
"loss": 0.202,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19408687949180603,
"step": 2745
},
{
"epoch": 1.441299790356394,
"grad_norm": 1.0641465187072754,
"learning_rate": 3.865155425622882e-05,
"loss": 0.1956,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16387057304382324,
"step": 2750
},
{
"epoch": 1.4439203354297694,
"grad_norm": 0.6685094833374023,
"learning_rate": 3.86421039033964e-05,
"loss": 0.2016,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15119390189647675,
"step": 2755
},
{
"epoch": 1.4465408805031448,
"grad_norm": 0.7995813488960266,
"learning_rate": 3.8632621714037266e-05,
"loss": 0.1831,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23123614490032196,
"step": 2760
},
{
"epoch": 1.44916142557652,
"grad_norm": 0.6669260263442993,
"learning_rate": 3.862310770434487e-05,
"loss": 0.2173,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2532452344894409,
"step": 2765
},
{
"epoch": 1.4517819706498951,
"grad_norm": 0.7086100578308105,
"learning_rate": 3.861356189056701e-05,
"loss": 0.2179,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2617954611778259,
"step": 2770
},
{
"epoch": 1.4544025157232705,
"grad_norm": 0.638028085231781,
"learning_rate": 3.8603984289005786e-05,
"loss": 0.2173,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26442041993141174,
"step": 2775
},
{
"epoch": 1.4570230607966457,
"grad_norm": 0.5652633309364319,
"learning_rate": 3.85943749160176e-05,
"loss": 0.1881,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19999033212661743,
"step": 2780
},
{
"epoch": 1.459643605870021,
"grad_norm": 0.6860530972480774,
"learning_rate": 3.858473378801309e-05,
"loss": 0.1957,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20773279666900635,
"step": 2785
},
{
"epoch": 1.4622641509433962,
"grad_norm": 0.677933394908905,
"learning_rate": 3.857506092145714e-05,
"loss": 0.1997,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21120098233222961,
"step": 2790
},
{
"epoch": 1.4648846960167714,
"grad_norm": 0.7018347978591919,
"learning_rate": 3.856535633286884e-05,
"loss": 0.1804,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1769612580537796,
"step": 2795
},
{
"epoch": 1.4675052410901468,
"grad_norm": 0.6802659630775452,
"learning_rate": 3.855562003882144e-05,
"loss": 0.2116,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22609364986419678,
"step": 2800
},
{
"epoch": 1.470125786163522,
"grad_norm": 0.6637560129165649,
"learning_rate": 3.854585205594235e-05,
"loss": 0.2103,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.173095703125,
"step": 2805
},
{
"epoch": 1.4727463312368974,
"grad_norm": 0.560377836227417,
"learning_rate": 3.853605240091309e-05,
"loss": 0.2141,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2323237955570221,
"step": 2810
},
{
"epoch": 1.4753668763102725,
"grad_norm": 0.7306016087532043,
"learning_rate": 3.8526221090469266e-05,
"loss": 0.2135,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1875,
"step": 2815
},
{
"epoch": 1.4779874213836477,
"grad_norm": 0.7161574959754944,
"learning_rate": 3.851635814140055e-05,
"loss": 0.2107,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20000721514225006,
"step": 2820
},
{
"epoch": 1.480607966457023,
"grad_norm": 0.617828369140625,
"learning_rate": 3.850646357055065e-05,
"loss": 0.1961,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16674214601516724,
"step": 2825
},
{
"epoch": 1.4832285115303983,
"grad_norm": 0.5676167607307434,
"learning_rate": 3.8496537394817264e-05,
"loss": 0.2158,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20249316096305847,
"step": 2830
},
{
"epoch": 1.4858490566037736,
"grad_norm": 1.0004444122314453,
"learning_rate": 3.8486579631152067e-05,
"loss": 0.2021,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19540755450725555,
"step": 2835
},
{
"epoch": 1.4884696016771488,
"grad_norm": 0.8708783388137817,
"learning_rate": 3.84765902965607e-05,
"loss": 0.1794,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1630859375,
"step": 2840
},
{
"epoch": 1.491090146750524,
"grad_norm": 1.0635863542556763,
"learning_rate": 3.846656940810269e-05,
"loss": 0.2061,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1687188297510147,
"step": 2845
},
{
"epoch": 1.4937106918238994,
"grad_norm": 0.7047156095504761,
"learning_rate": 3.845651698289145e-05,
"loss": 0.1936,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.171891450881958,
"step": 2850
},
{
"epoch": 1.4963312368972748,
"grad_norm": 1.9242242574691772,
"learning_rate": 3.844643303809429e-05,
"loss": 0.2316,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19759413599967957,
"step": 2855
},
{
"epoch": 1.49895178197065,
"grad_norm": 0.6161275506019592,
"learning_rate": 3.8436317590932315e-05,
"loss": 0.2087,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.213210791349411,
"step": 2860
},
{
"epoch": 1.501572327044025,
"grad_norm": 0.6815282702445984,
"learning_rate": 3.842617065868043e-05,
"loss": 0.2058,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18007223308086395,
"step": 2865
},
{
"epoch": 1.5041928721174003,
"grad_norm": 0.7069063782691956,
"learning_rate": 3.841599225866733e-05,
"loss": 0.2074,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15446972846984863,
"step": 2870
},
{
"epoch": 1.5068134171907757,
"grad_norm": 0.6494333744049072,
"learning_rate": 3.8405782408275425e-05,
"loss": 0.1953,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21661195158958435,
"step": 2875
},
{
"epoch": 1.509433962264151,
"grad_norm": 0.6672247648239136,
"learning_rate": 3.8395541124940843e-05,
"loss": 0.1981,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21872758865356445,
"step": 2880
},
{
"epoch": 1.5120545073375262,
"grad_norm": 0.6205090880393982,
"learning_rate": 3.8385268426153415e-05,
"loss": 0.2226,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24721340835094452,
"step": 2885
},
{
"epoch": 1.5146750524109014,
"grad_norm": 0.8091459274291992,
"learning_rate": 3.8374964329456574e-05,
"loss": 0.1934,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21637308597564697,
"step": 2890
},
{
"epoch": 1.5172955974842768,
"grad_norm": 0.7390249967575073,
"learning_rate": 3.8364628852447424e-05,
"loss": 0.2159,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16475994884967804,
"step": 2895
},
{
"epoch": 1.519916142557652,
"grad_norm": 0.7167327404022217,
"learning_rate": 3.835426201277664e-05,
"loss": 0.2099,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21945439279079437,
"step": 2900
},
{
"epoch": 1.5225366876310273,
"grad_norm": 0.6998141407966614,
"learning_rate": 3.834386382814845e-05,
"loss": 0.1926,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.205071821808815,
"step": 2905
},
{
"epoch": 1.5251572327044025,
"grad_norm": 0.7944632768630981,
"learning_rate": 3.833343431632062e-05,
"loss": 0.2031,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18502473831176758,
"step": 2910
},
{
"epoch": 1.5277777777777777,
"grad_norm": 0.795866072177887,
"learning_rate": 3.83229734951044e-05,
"loss": 0.2106,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.177978515625,
"step": 2915
},
{
"epoch": 1.530398322851153,
"grad_norm": 0.6461061835289001,
"learning_rate": 3.831248138236455e-05,
"loss": 0.2041,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21445702016353607,
"step": 2920
},
{
"epoch": 1.5330188679245285,
"grad_norm": 0.6679595708847046,
"learning_rate": 3.830195799601922e-05,
"loss": 0.2058,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19536687433719635,
"step": 2925
},
{
"epoch": 1.5356394129979036,
"grad_norm": 0.7739956974983215,
"learning_rate": 3.829140335404e-05,
"loss": 0.2032,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20849609375,
"step": 2930
},
{
"epoch": 1.5382599580712788,
"grad_norm": 0.6194927096366882,
"learning_rate": 3.8280817474451845e-05,
"loss": 0.2061,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20353049039840698,
"step": 2935
},
{
"epoch": 1.540880503144654,
"grad_norm": 0.8548099398612976,
"learning_rate": 3.827020037533306e-05,
"loss": 0.2187,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2137005627155304,
"step": 2940
},
{
"epoch": 1.5435010482180294,
"grad_norm": 0.6645764112472534,
"learning_rate": 3.825955207481527e-05,
"loss": 0.2019,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2067786604166031,
"step": 2945
},
{
"epoch": 1.5461215932914047,
"grad_norm": 0.7723944187164307,
"learning_rate": 3.824887259108337e-05,
"loss": 0.1972,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2169840931892395,
"step": 2950
},
{
"epoch": 1.54874213836478,
"grad_norm": 0.6538282632827759,
"learning_rate": 3.8238161942375534e-05,
"loss": 0.208,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19555526971817017,
"step": 2955
},
{
"epoch": 1.551362683438155,
"grad_norm": 0.682479977607727,
"learning_rate": 3.8227420146983134e-05,
"loss": 0.2061,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.197683647274971,
"step": 2960
},
{
"epoch": 1.5539832285115303,
"grad_norm": 0.8361935019493103,
"learning_rate": 3.821664722325075e-05,
"loss": 0.1957,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2167164385318756,
"step": 2965
},
{
"epoch": 1.5566037735849056,
"grad_norm": 0.7050533294677734,
"learning_rate": 3.820584318957611e-05,
"loss": 0.1982,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22836749255657196,
"step": 2970
},
{
"epoch": 1.559224318658281,
"grad_norm": 0.6500424146652222,
"learning_rate": 3.819500806441009e-05,
"loss": 0.212,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2061050683259964,
"step": 2975
},
{
"epoch": 1.5618448637316562,
"grad_norm": 0.6116467714309692,
"learning_rate": 3.8184141866256636e-05,
"loss": 0.1931,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19401168823242188,
"step": 2980
},
{
"epoch": 1.5644654088050314,
"grad_norm": 0.6403232216835022,
"learning_rate": 3.8173244613672785e-05,
"loss": 0.2197,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22676880657672882,
"step": 2985
},
{
"epoch": 1.5670859538784065,
"grad_norm": 0.6228274703025818,
"learning_rate": 3.816231632526858e-05,
"loss": 0.1936,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19302821159362793,
"step": 2990
},
{
"epoch": 1.569706498951782,
"grad_norm": 0.888628363609314,
"learning_rate": 3.815135701970711e-05,
"loss": 0.2239,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2211262583732605,
"step": 2995
},
{
"epoch": 1.5723270440251573,
"grad_norm": 0.7154015302658081,
"learning_rate": 3.814036671570438e-05,
"loss": 0.2115,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20973986387252808,
"step": 3000
},
{
"epoch": 1.5749475890985325,
"grad_norm": 0.6683697700500488,
"learning_rate": 3.8129345432029376e-05,
"loss": 0.206,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24227727949619293,
"step": 3005
},
{
"epoch": 1.5775681341719077,
"grad_norm": 0.817297101020813,
"learning_rate": 3.8118293187503975e-05,
"loss": 0.2006,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22506284713745117,
"step": 3010
},
{
"epoch": 1.580188679245283,
"grad_norm": 1.3425382375717163,
"learning_rate": 3.810721000100293e-05,
"loss": 0.2164,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23591598868370056,
"step": 3015
},
{
"epoch": 1.5828092243186582,
"grad_norm": 0.6170550584793091,
"learning_rate": 3.8096095891453824e-05,
"loss": 0.1934,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19402346014976501,
"step": 3020
},
{
"epoch": 1.5854297693920336,
"grad_norm": 0.6385833621025085,
"learning_rate": 3.808495087783707e-05,
"loss": 0.1951,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19327062368392944,
"step": 3025
},
{
"epoch": 1.5880503144654088,
"grad_norm": 0.5318775177001953,
"learning_rate": 3.8073774979185845e-05,
"loss": 0.1849,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19630634784698486,
"step": 3030
},
{
"epoch": 1.590670859538784,
"grad_norm": 0.7159228324890137,
"learning_rate": 3.8062568214586076e-05,
"loss": 0.1948,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23486372828483582,
"step": 3035
},
{
"epoch": 1.5932914046121593,
"grad_norm": 0.5865247845649719,
"learning_rate": 3.80513306031764e-05,
"loss": 0.2026,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19960322976112366,
"step": 3040
},
{
"epoch": 1.5959119496855347,
"grad_norm": 0.7836723327636719,
"learning_rate": 3.804006216414812e-05,
"loss": 0.1981,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2001953125,
"step": 3045
},
{
"epoch": 1.59853249475891,
"grad_norm": 0.6605933904647827,
"learning_rate": 3.8028762916745216e-05,
"loss": 0.2052,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22346261143684387,
"step": 3050
},
{
"epoch": 1.601153039832285,
"grad_norm": 0.6677698493003845,
"learning_rate": 3.801743288026426e-05,
"loss": 0.1956,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19307313859462738,
"step": 3055
},
{
"epoch": 1.6037735849056602,
"grad_norm": 0.7019439339637756,
"learning_rate": 3.8006072074054415e-05,
"loss": 0.2077,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2159917950630188,
"step": 3060
},
{
"epoch": 1.6063941299790356,
"grad_norm": 0.6048671007156372,
"learning_rate": 3.7994680517517374e-05,
"loss": 0.2109,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21212241053581238,
"step": 3065
},
{
"epoch": 1.609014675052411,
"grad_norm": 0.5906047821044922,
"learning_rate": 3.798325823010737e-05,
"loss": 0.2181,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18436996638774872,
"step": 3070
},
{
"epoch": 1.6116352201257862,
"grad_norm": 1.061728596687317,
"learning_rate": 3.7971805231331096e-05,
"loss": 0.1971,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17529296875,
"step": 3075
},
{
"epoch": 1.6142557651991614,
"grad_norm": 0.6743407845497131,
"learning_rate": 3.79603215407477e-05,
"loss": 0.2033,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19340994954109192,
"step": 3080
},
{
"epoch": 1.6168763102725365,
"grad_norm": 0.8559533953666687,
"learning_rate": 3.7948807177968755e-05,
"loss": 0.2104,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22995486855506897,
"step": 3085
},
{
"epoch": 1.619496855345912,
"grad_norm": 0.66473788022995,
"learning_rate": 3.79372621626582e-05,
"loss": 0.2091,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20205003023147583,
"step": 3090
},
{
"epoch": 1.6221174004192873,
"grad_norm": 0.6741588115692139,
"learning_rate": 3.792568651453233e-05,
"loss": 0.1963,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2298995554447174,
"step": 3095
},
{
"epoch": 1.6247379454926625,
"grad_norm": 0.7121372818946838,
"learning_rate": 3.7914080253359754e-05,
"loss": 0.2077,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23662368953227997,
"step": 3100
},
{
"epoch": 1.6273584905660377,
"grad_norm": 0.7736591100692749,
"learning_rate": 3.790244339896136e-05,
"loss": 0.2117,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19614163041114807,
"step": 3105
},
{
"epoch": 1.629979035639413,
"grad_norm": 0.7388415932655334,
"learning_rate": 3.7890775971210286e-05,
"loss": 0.1973,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19440676271915436,
"step": 3110
},
{
"epoch": 1.6325995807127882,
"grad_norm": 0.7736803889274597,
"learning_rate": 3.787907799003186e-05,
"loss": 0.1976,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18566857278347015,
"step": 3115
},
{
"epoch": 1.6352201257861636,
"grad_norm": 0.6769443154335022,
"learning_rate": 3.786734947540363e-05,
"loss": 0.1983,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19205525517463684,
"step": 3120
},
{
"epoch": 1.6378406708595388,
"grad_norm": 0.6849276423454285,
"learning_rate": 3.7855590447355243e-05,
"loss": 0.1969,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20867285132408142,
"step": 3125
},
{
"epoch": 1.640461215932914,
"grad_norm": 0.7193086743354797,
"learning_rate": 3.7843800925968495e-05,
"loss": 0.2015,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20997187495231628,
"step": 3130
},
{
"epoch": 1.6430817610062893,
"grad_norm": 0.642825722694397,
"learning_rate": 3.7831980931377234e-05,
"loss": 0.206,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17490190267562866,
"step": 3135
},
{
"epoch": 1.6457023060796647,
"grad_norm": 1.0223510265350342,
"learning_rate": 3.782013048376736e-05,
"loss": 0.1954,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1529541015625,
"step": 3140
},
{
"epoch": 1.64832285115304,
"grad_norm": 0.7299026250839233,
"learning_rate": 3.7808249603376773e-05,
"loss": 0.2199,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23865553736686707,
"step": 3145
},
{
"epoch": 1.650943396226415,
"grad_norm": 0.6869032382965088,
"learning_rate": 3.779633831049535e-05,
"loss": 0.1996,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18986861407756805,
"step": 3150
},
{
"epoch": 1.6535639412997902,
"grad_norm": 0.6794070601463318,
"learning_rate": 3.7784396625464896e-05,
"loss": 0.1862,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18808311223983765,
"step": 3155
},
{
"epoch": 1.6561844863731656,
"grad_norm": 0.6022070646286011,
"learning_rate": 3.777242456867914e-05,
"loss": 0.1893,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.221364825963974,
"step": 3160
},
{
"epoch": 1.658805031446541,
"grad_norm": 0.7784854173660278,
"learning_rate": 3.776042216058365e-05,
"loss": 0.2058,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.222900390625,
"step": 3165
},
{
"epoch": 1.6614255765199162,
"grad_norm": 0.6530669331550598,
"learning_rate": 3.774838942167587e-05,
"loss": 0.2133,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2069305181503296,
"step": 3170
},
{
"epoch": 1.6640461215932913,
"grad_norm": 0.654613733291626,
"learning_rate": 3.773632637250498e-05,
"loss": 0.196,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24302370846271515,
"step": 3175
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.9669556617736816,
"learning_rate": 3.772423303367199e-05,
"loss": 0.199,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20167219638824463,
"step": 3180
},
{
"epoch": 1.669287211740042,
"grad_norm": 0.7367469668388367,
"learning_rate": 3.77121094258296e-05,
"loss": 0.2186,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18835796415805817,
"step": 3185
},
{
"epoch": 1.6719077568134173,
"grad_norm": 0.7441819310188293,
"learning_rate": 3.7699955569682185e-05,
"loss": 0.2425,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.3099658787250519,
"step": 3190
},
{
"epoch": 1.6745283018867925,
"grad_norm": 0.7120652198791504,
"learning_rate": 3.7687771485985834e-05,
"loss": 0.1976,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1901395171880722,
"step": 3195
},
{
"epoch": 1.6771488469601676,
"grad_norm": 0.8141222596168518,
"learning_rate": 3.767555719554821e-05,
"loss": 0.2126,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24998538196086884,
"step": 3200
},
{
"epoch": 1.679769392033543,
"grad_norm": 0.7448544502258301,
"learning_rate": 3.766331271922858e-05,
"loss": 0.2117,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.203841432929039,
"step": 3205
},
{
"epoch": 1.6823899371069182,
"grad_norm": 0.5663997530937195,
"learning_rate": 3.765103807793776e-05,
"loss": 0.173,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1611168533563614,
"step": 3210
},
{
"epoch": 1.6850104821802936,
"grad_norm": 0.6563591361045837,
"learning_rate": 3.763873329263808e-05,
"loss": 0.2195,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1935395896434784,
"step": 3215
},
{
"epoch": 1.6876310272536688,
"grad_norm": 0.7698992490768433,
"learning_rate": 3.762639838434335e-05,
"loss": 0.1918,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15444648265838623,
"step": 3220
},
{
"epoch": 1.690251572327044,
"grad_norm": 0.8108476996421814,
"learning_rate": 3.7614033374118826e-05,
"loss": 0.2056,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2104421854019165,
"step": 3225
},
{
"epoch": 1.6928721174004193,
"grad_norm": 0.7241305708885193,
"learning_rate": 3.760163828308116e-05,
"loss": 0.1915,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19673112034797668,
"step": 3230
},
{
"epoch": 1.6954926624737947,
"grad_norm": 0.6991493105888367,
"learning_rate": 3.75892131323984e-05,
"loss": 0.2169,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2223394215106964,
"step": 3235
},
{
"epoch": 1.6981132075471699,
"grad_norm": 0.6248032450675964,
"learning_rate": 3.757675794328989e-05,
"loss": 0.2,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2107730656862259,
"step": 3240
},
{
"epoch": 1.700733752620545,
"grad_norm": 0.6475279331207275,
"learning_rate": 3.756427273702632e-05,
"loss": 0.2194,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21362945437431335,
"step": 3245
},
{
"epoch": 1.7033542976939202,
"grad_norm": 0.7653929591178894,
"learning_rate": 3.75517575349296e-05,
"loss": 0.2126,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2135382890701294,
"step": 3250
},
{
"epoch": 1.7059748427672956,
"grad_norm": 0.6877974271774292,
"learning_rate": 3.7539212358372885e-05,
"loss": 0.1971,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2015942931175232,
"step": 3255
},
{
"epoch": 1.708595387840671,
"grad_norm": 0.5155228972434998,
"learning_rate": 3.752663722878053e-05,
"loss": 0.1953,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1829644739627838,
"step": 3260
},
{
"epoch": 1.7112159329140462,
"grad_norm": 0.6721019148826599,
"learning_rate": 3.751403216762803e-05,
"loss": 0.2138,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21119743585586548,
"step": 3265
},
{
"epoch": 1.7138364779874213,
"grad_norm": 0.6383962035179138,
"learning_rate": 3.7501397196441996e-05,
"loss": 0.2119,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19863849878311157,
"step": 3270
},
{
"epoch": 1.7164570230607965,
"grad_norm": 0.53727126121521,
"learning_rate": 3.748873233680012e-05,
"loss": 0.2058,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18619093298912048,
"step": 3275
},
{
"epoch": 1.719077568134172,
"grad_norm": 0.6815948486328125,
"learning_rate": 3.7476037610331135e-05,
"loss": 0.2131,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20206259191036224,
"step": 3280
},
{
"epoch": 1.7216981132075473,
"grad_norm": 0.7351505160331726,
"learning_rate": 3.746331303871479e-05,
"loss": 0.1897,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21092607080936432,
"step": 3285
},
{
"epoch": 1.7243186582809225,
"grad_norm": 0.7017987966537476,
"learning_rate": 3.745055864368179e-05,
"loss": 0.2144,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23522168397903442,
"step": 3290
},
{
"epoch": 1.7269392033542976,
"grad_norm": 0.7407639622688293,
"learning_rate": 3.743777444701378e-05,
"loss": 0.1942,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2146407663822174,
"step": 3295
},
{
"epoch": 1.7295597484276728,
"grad_norm": 0.8630208373069763,
"learning_rate": 3.7424960470543294e-05,
"loss": 0.2071,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18362826108932495,
"step": 3300
},
{
"epoch": 1.7321802935010482,
"grad_norm": 0.662527859210968,
"learning_rate": 3.741211673615374e-05,
"loss": 0.2256,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21645821630954742,
"step": 3305
},
{
"epoch": 1.7348008385744236,
"grad_norm": 0.7605148553848267,
"learning_rate": 3.7399243265779305e-05,
"loss": 0.1955,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19093088805675507,
"step": 3310
},
{
"epoch": 1.7374213836477987,
"grad_norm": 0.6030600070953369,
"learning_rate": 3.7386340081405004e-05,
"loss": 0.2298,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2234543412923813,
"step": 3315
},
{
"epoch": 1.740041928721174,
"grad_norm": 0.6327378153800964,
"learning_rate": 3.737340720506657e-05,
"loss": 0.1985,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2092643678188324,
"step": 3320
},
{
"epoch": 1.7426624737945493,
"grad_norm": 0.6156671047210693,
"learning_rate": 3.736044465885046e-05,
"loss": 0.1898,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.220514714717865,
"step": 3325
},
{
"epoch": 1.7452830188679245,
"grad_norm": 0.643059253692627,
"learning_rate": 3.734745246489379e-05,
"loss": 0.2048,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2512093186378479,
"step": 3330
},
{
"epoch": 1.7479035639412999,
"grad_norm": 0.7039141654968262,
"learning_rate": 3.73344306453843e-05,
"loss": 0.2092,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22273048758506775,
"step": 3335
},
{
"epoch": 1.750524109014675,
"grad_norm": 0.6144287586212158,
"learning_rate": 3.732137922256035e-05,
"loss": 0.2167,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17802879214286804,
"step": 3340
},
{
"epoch": 1.7531446540880502,
"grad_norm": 0.7636461853981018,
"learning_rate": 3.7308298218710816e-05,
"loss": 0.1947,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.186279296875,
"step": 3345
},
{
"epoch": 1.7557651991614256,
"grad_norm": 0.6746559739112854,
"learning_rate": 3.729518765617513e-05,
"loss": 0.2199,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20318138599395752,
"step": 3350
},
{
"epoch": 1.758385744234801,
"grad_norm": 0.6666935682296753,
"learning_rate": 3.7282047557343195e-05,
"loss": 0.2148,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1769717037677765,
"step": 3355
},
{
"epoch": 1.7610062893081762,
"grad_norm": 0.5784667134284973,
"learning_rate": 3.726887794465533e-05,
"loss": 0.1945,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18660691380500793,
"step": 3360
},
{
"epoch": 1.7636268343815513,
"grad_norm": 0.7593470811843872,
"learning_rate": 3.725567884060229e-05,
"loss": 0.2122,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2277555763721466,
"step": 3365
},
{
"epoch": 1.7662473794549265,
"grad_norm": 0.6410986185073853,
"learning_rate": 3.724245026772518e-05,
"loss": 0.2203,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19018101692199707,
"step": 3370
},
{
"epoch": 1.7688679245283019,
"grad_norm": 0.7130261659622192,
"learning_rate": 3.7229192248615416e-05,
"loss": 0.2081,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2189968228340149,
"step": 3375
},
{
"epoch": 1.7714884696016773,
"grad_norm": 0.7862639427185059,
"learning_rate": 3.721590480591474e-05,
"loss": 0.1946,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20198023319244385,
"step": 3380
},
{
"epoch": 1.7741090146750524,
"grad_norm": 0.6717849969863892,
"learning_rate": 3.72025879623151e-05,
"loss": 0.1959,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20632538199424744,
"step": 3385
},
{
"epoch": 1.7767295597484276,
"grad_norm": 0.6856803297996521,
"learning_rate": 3.718924174055868e-05,
"loss": 0.1926,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18674078583717346,
"step": 3390
},
{
"epoch": 1.7793501048218028,
"grad_norm": 0.6597937941551208,
"learning_rate": 3.717586616343784e-05,
"loss": 0.1913,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2081529200077057,
"step": 3395
},
{
"epoch": 1.7819706498951782,
"grad_norm": 0.548897922039032,
"learning_rate": 3.716246125379504e-05,
"loss": 0.2059,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18970248103141785,
"step": 3400
},
{
"epoch": 1.7845911949685536,
"grad_norm": 0.6445600986480713,
"learning_rate": 3.714902703452288e-05,
"loss": 0.2047,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21960783004760742,
"step": 3405
},
{
"epoch": 1.7872117400419287,
"grad_norm": 0.6185094714164734,
"learning_rate": 3.713556352856398e-05,
"loss": 0.2154,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2151571810245514,
"step": 3410
},
{
"epoch": 1.789832285115304,
"grad_norm": 0.6007769107818604,
"learning_rate": 3.712207075891097e-05,
"loss": 0.1987,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23578466475009918,
"step": 3415
},
{
"epoch": 1.7924528301886793,
"grad_norm": 0.7605302333831787,
"learning_rate": 3.7108548748606496e-05,
"loss": 0.1991,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1687735766172409,
"step": 3420
},
{
"epoch": 1.7950733752620545,
"grad_norm": 0.5006226897239685,
"learning_rate": 3.70949975207431e-05,
"loss": 0.1915,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.208736851811409,
"step": 3425
},
{
"epoch": 1.7976939203354299,
"grad_norm": 0.5228962302207947,
"learning_rate": 3.708141709846323e-05,
"loss": 0.2108,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17655402421951294,
"step": 3430
},
{
"epoch": 1.800314465408805,
"grad_norm": 0.6512285470962524,
"learning_rate": 3.70678075049592e-05,
"loss": 0.1878,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21836760640144348,
"step": 3435
},
{
"epoch": 1.8029350104821802,
"grad_norm": 1.5839763879776,
"learning_rate": 3.7054168763473155e-05,
"loss": 0.1929,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.177001953125,
"step": 3440
},
{
"epoch": 1.8055555555555556,
"grad_norm": 0.6855888962745667,
"learning_rate": 3.704050089729699e-05,
"loss": 0.1745,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19496369361877441,
"step": 3445
},
{
"epoch": 1.808176100628931,
"grad_norm": 0.6062113046646118,
"learning_rate": 3.702680392977235e-05,
"loss": 0.198,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16827154159545898,
"step": 3450
},
{
"epoch": 1.8107966457023061,
"grad_norm": 1.070608377456665,
"learning_rate": 3.7013077884290576e-05,
"loss": 0.2063,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20059406757354736,
"step": 3455
},
{
"epoch": 1.8134171907756813,
"grad_norm": 0.5718883872032166,
"learning_rate": 3.699932278429268e-05,
"loss": 0.2137,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22236788272857666,
"step": 3460
},
{
"epoch": 1.8160377358490565,
"grad_norm": 0.6839277148246765,
"learning_rate": 3.698553865326928e-05,
"loss": 0.2077,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.221435546875,
"step": 3465
},
{
"epoch": 1.8186582809224319,
"grad_norm": 0.7797523736953735,
"learning_rate": 3.6971725514760576e-05,
"loss": 0.1978,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20486325025558472,
"step": 3470
},
{
"epoch": 1.8212788259958073,
"grad_norm": 0.7781976461410522,
"learning_rate": 3.69578833923563e-05,
"loss": 0.2032,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18268337845802307,
"step": 3475
},
{
"epoch": 1.8238993710691824,
"grad_norm": 0.5963297486305237,
"learning_rate": 3.6944012309695707e-05,
"loss": 0.1952,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17451441287994385,
"step": 3480
},
{
"epoch": 1.8265199161425576,
"grad_norm": 0.7559292316436768,
"learning_rate": 3.693011229046747e-05,
"loss": 0.2074,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20954449474811554,
"step": 3485
},
{
"epoch": 1.8291404612159328,
"grad_norm": 0.5257536172866821,
"learning_rate": 3.691618335840972e-05,
"loss": 0.2107,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21420666575431824,
"step": 3490
},
{
"epoch": 1.8317610062893082,
"grad_norm": 0.7904267311096191,
"learning_rate": 3.690222553730992e-05,
"loss": 0.2208,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23457945883274078,
"step": 3495
},
{
"epoch": 1.8343815513626835,
"grad_norm": 0.6557732820510864,
"learning_rate": 3.688823885100491e-05,
"loss": 0.1912,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20230236649513245,
"step": 3500
},
{
"epoch": 1.8370020964360587,
"grad_norm": 0.7868777513504028,
"learning_rate": 3.6874223323380804e-05,
"loss": 0.1962,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.212323397397995,
"step": 3505
},
{
"epoch": 1.8396226415094339,
"grad_norm": 0.6201764941215515,
"learning_rate": 3.686017897837298e-05,
"loss": 0.1904,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19490529596805573,
"step": 3510
},
{
"epoch": 1.8422431865828093,
"grad_norm": 0.8452920317649841,
"learning_rate": 3.684610583996602e-05,
"loss": 0.1993,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1875,
"step": 3515
},
{
"epoch": 1.8448637316561844,
"grad_norm": 0.7257571816444397,
"learning_rate": 3.683200393219369e-05,
"loss": 0.1929,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19361209869384766,
"step": 3520
},
{
"epoch": 1.8474842767295598,
"grad_norm": 0.5986731052398682,
"learning_rate": 3.681787327913888e-05,
"loss": 0.1942,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.197706937789917,
"step": 3525
},
{
"epoch": 1.850104821802935,
"grad_norm": 0.8025481104850769,
"learning_rate": 3.680371390493356e-05,
"loss": 0.1956,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21123269200325012,
"step": 3530
},
{
"epoch": 1.8527253668763102,
"grad_norm": 0.7123976349830627,
"learning_rate": 3.678952583375878e-05,
"loss": 0.1891,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.166748046875,
"step": 3535
},
{
"epoch": 1.8553459119496856,
"grad_norm": 0.6865976452827454,
"learning_rate": 3.6775309089844566e-05,
"loss": 0.1934,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1982835829257965,
"step": 3540
},
{
"epoch": 1.857966457023061,
"grad_norm": 0.6656506061553955,
"learning_rate": 3.676106369746993e-05,
"loss": 0.1954,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2171439528465271,
"step": 3545
},
{
"epoch": 1.8605870020964361,
"grad_norm": 0.6462177634239197,
"learning_rate": 3.67467896809628e-05,
"loss": 0.1928,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18315325677394867,
"step": 3550
},
{
"epoch": 1.8632075471698113,
"grad_norm": 0.6657865643501282,
"learning_rate": 3.673248706469999e-05,
"loss": 0.2062,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.207036554813385,
"step": 3555
},
{
"epoch": 1.8658280922431865,
"grad_norm": 0.7065151929855347,
"learning_rate": 3.6718155873107156e-05,
"loss": 0.1859,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18963903188705444,
"step": 3560
},
{
"epoch": 1.8684486373165619,
"grad_norm": 1.0115692615509033,
"learning_rate": 3.670379613065875e-05,
"loss": 0.1891,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.174276202917099,
"step": 3565
},
{
"epoch": 1.8710691823899372,
"grad_norm": 0.6386069655418396,
"learning_rate": 3.668940786187801e-05,
"loss": 0.2028,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24545133113861084,
"step": 3570
},
{
"epoch": 1.8736897274633124,
"grad_norm": 0.6867290139198303,
"learning_rate": 3.667499109133683e-05,
"loss": 0.2071,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2165694534778595,
"step": 3575
},
{
"epoch": 1.8763102725366876,
"grad_norm": 2.2979297637939453,
"learning_rate": 3.6660545843655856e-05,
"loss": 0.2067,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18144424259662628,
"step": 3580
},
{
"epoch": 1.8789308176100628,
"grad_norm": 0.6569699645042419,
"learning_rate": 3.664607214350429e-05,
"loss": 0.2059,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21263781189918518,
"step": 3585
},
{
"epoch": 1.8815513626834381,
"grad_norm": 0.6411572098731995,
"learning_rate": 3.66315700156e-05,
"loss": 0.1996,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18590950965881348,
"step": 3590
},
{
"epoch": 1.8841719077568135,
"grad_norm": 0.6736159324645996,
"learning_rate": 3.6617039484709324e-05,
"loss": 0.2142,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20901615917682648,
"step": 3595
},
{
"epoch": 1.8867924528301887,
"grad_norm": 0.7786797285079956,
"learning_rate": 3.660248057564717e-05,
"loss": 0.2083,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21553653478622437,
"step": 3600
},
{
"epoch": 1.8894129979035639,
"grad_norm": 0.6000698208808899,
"learning_rate": 3.658789331327688e-05,
"loss": 0.1961,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18407447636127472,
"step": 3605
},
{
"epoch": 1.892033542976939,
"grad_norm": 0.5410313010215759,
"learning_rate": 3.657327772251022e-05,
"loss": 0.2014,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19665025174617767,
"step": 3610
},
{
"epoch": 1.8946540880503144,
"grad_norm": 0.6245362758636475,
"learning_rate": 3.6558633828307335e-05,
"loss": 0.2012,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21915487945079803,
"step": 3615
},
{
"epoch": 1.8972746331236898,
"grad_norm": 0.6884681582450867,
"learning_rate": 3.654396165567671e-05,
"loss": 0.1811,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22231367230415344,
"step": 3620
},
{
"epoch": 1.899895178197065,
"grad_norm": 0.8241666555404663,
"learning_rate": 3.6529261229675134e-05,
"loss": 0.2135,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.179931640625,
"step": 3625
},
{
"epoch": 1.9025157232704402,
"grad_norm": 0.6740656495094299,
"learning_rate": 3.6514532575407606e-05,
"loss": 0.1868,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19351506233215332,
"step": 3630
},
{
"epoch": 1.9051362683438156,
"grad_norm": 1.3846477270126343,
"learning_rate": 3.6499775718027374e-05,
"loss": 0.1935,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21881192922592163,
"step": 3635
},
{
"epoch": 1.9077568134171907,
"grad_norm": 0.6799472570419312,
"learning_rate": 3.648499068273584e-05,
"loss": 0.1874,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1958981156349182,
"step": 3640
},
{
"epoch": 1.9103773584905661,
"grad_norm": 0.7112404108047485,
"learning_rate": 3.6470177494782525e-05,
"loss": 0.2124,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2296728789806366,
"step": 3645
},
{
"epoch": 1.9129979035639413,
"grad_norm": 0.6902780532836914,
"learning_rate": 3.6455336179465006e-05,
"loss": 0.218,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2503230571746826,
"step": 3650
},
{
"epoch": 1.9156184486373165,
"grad_norm": 0.6022567749023438,
"learning_rate": 3.6440466762128945e-05,
"loss": 0.1969,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17072564363479614,
"step": 3655
},
{
"epoch": 1.9182389937106918,
"grad_norm": 0.618741512298584,
"learning_rate": 3.642556926816795e-05,
"loss": 0.2026,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21050477027893066,
"step": 3660
},
{
"epoch": 1.9208595387840672,
"grad_norm": 0.7455676794052124,
"learning_rate": 3.64106437230236e-05,
"loss": 0.1904,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1719132959842682,
"step": 3665
},
{
"epoch": 1.9234800838574424,
"grad_norm": 0.6303054690361023,
"learning_rate": 3.639569015218537e-05,
"loss": 0.1935,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17934748530387878,
"step": 3670
},
{
"epoch": 1.9261006289308176,
"grad_norm": 0.6130335330963135,
"learning_rate": 3.638070858119061e-05,
"loss": 0.2027,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.154541015625,
"step": 3675
},
{
"epoch": 1.9287211740041927,
"grad_norm": 0.6253183484077454,
"learning_rate": 3.6365699035624465e-05,
"loss": 0.2027,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19237437844276428,
"step": 3680
},
{
"epoch": 1.9313417190775681,
"grad_norm": 0.7445136308670044,
"learning_rate": 3.635066154111989e-05,
"loss": 0.2132,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2229272425174713,
"step": 3685
},
{
"epoch": 1.9339622641509435,
"grad_norm": 0.6628245115280151,
"learning_rate": 3.6335596123357515e-05,
"loss": 0.1794,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18328478932380676,
"step": 3690
},
{
"epoch": 1.9365828092243187,
"grad_norm": 0.667185366153717,
"learning_rate": 3.6320502808065716e-05,
"loss": 0.2002,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20585229992866516,
"step": 3695
},
{
"epoch": 1.9392033542976939,
"grad_norm": 0.5516197085380554,
"learning_rate": 3.630538162102048e-05,
"loss": 0.2059,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18774524331092834,
"step": 3700
},
{
"epoch": 1.941823899371069,
"grad_norm": 0.6480304002761841,
"learning_rate": 3.62902325880454e-05,
"loss": 0.2146,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.26353558897972107,
"step": 3705
},
{
"epoch": 1.9444444444444444,
"grad_norm": 0.5691676735877991,
"learning_rate": 3.627505573501162e-05,
"loss": 0.2207,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2253093421459198,
"step": 3710
},
{
"epoch": 1.9470649895178198,
"grad_norm": 0.651718020439148,
"learning_rate": 3.6259851087837785e-05,
"loss": 0.1969,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18974995613098145,
"step": 3715
},
{
"epoch": 1.949685534591195,
"grad_norm": 0.7716585397720337,
"learning_rate": 3.6244618672490036e-05,
"loss": 0.2007,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2197091281414032,
"step": 3720
},
{
"epoch": 1.9523060796645701,
"grad_norm": 0.6229172348976135,
"learning_rate": 3.622935851498191e-05,
"loss": 0.1959,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.197265625,
"step": 3725
},
{
"epoch": 1.9549266247379455,
"grad_norm": 0.5668113231658936,
"learning_rate": 3.621407064137433e-05,
"loss": 0.2107,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24369171261787415,
"step": 3730
},
{
"epoch": 1.9575471698113207,
"grad_norm": 0.588336706161499,
"learning_rate": 3.619875507777555e-05,
"loss": 0.1989,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21228131651878357,
"step": 3735
},
{
"epoch": 1.960167714884696,
"grad_norm": 0.6574887633323669,
"learning_rate": 3.6183411850341106e-05,
"loss": 0.1961,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18895301222801208,
"step": 3740
},
{
"epoch": 1.9627882599580713,
"grad_norm": 0.7427954077720642,
"learning_rate": 3.616804098527379e-05,
"loss": 0.202,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18273983895778656,
"step": 3745
},
{
"epoch": 1.9654088050314464,
"grad_norm": 1.0722060203552246,
"learning_rate": 3.615264250882359e-05,
"loss": 0.183,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.151123046875,
"step": 3750
},
{
"epoch": 1.9680293501048218,
"grad_norm": 0.590573251247406,
"learning_rate": 3.613721644728765e-05,
"loss": 0.1934,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17236328125,
"step": 3755
},
{
"epoch": 1.9706498951781972,
"grad_norm": 0.7106091976165771,
"learning_rate": 3.6121762827010206e-05,
"loss": 0.2109,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22868366539478302,
"step": 3760
},
{
"epoch": 1.9732704402515724,
"grad_norm": 0.6222302913665771,
"learning_rate": 3.610628167438258e-05,
"loss": 0.2062,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21538963913917542,
"step": 3765
},
{
"epoch": 1.9758909853249476,
"grad_norm": 0.6666224002838135,
"learning_rate": 3.60907730158431e-05,
"loss": 0.198,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1969054937362671,
"step": 3770
},
{
"epoch": 1.9785115303983227,
"grad_norm": 0.7065848112106323,
"learning_rate": 3.607523687787707e-05,
"loss": 0.2255,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.24432584643363953,
"step": 3775
},
{
"epoch": 1.9811320754716981,
"grad_norm": 0.6347010135650635,
"learning_rate": 3.605967328701673e-05,
"loss": 0.2109,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19049328565597534,
"step": 3780
},
{
"epoch": 1.9837526205450735,
"grad_norm": 0.9422894716262817,
"learning_rate": 3.60440822698412e-05,
"loss": 0.1974,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17729416489601135,
"step": 3785
},
{
"epoch": 1.9863731656184487,
"grad_norm": 0.738005518913269,
"learning_rate": 3.602846385297642e-05,
"loss": 0.2035,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21540319919586182,
"step": 3790
},
{
"epoch": 1.9889937106918238,
"grad_norm": 0.6134724617004395,
"learning_rate": 3.601281806309516e-05,
"loss": 0.1867,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16434405744075775,
"step": 3795
},
{
"epoch": 1.991614255765199,
"grad_norm": 0.7037052512168884,
"learning_rate": 3.599714492691689e-05,
"loss": 0.1896,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21146099269390106,
"step": 3800
},
{
"epoch": 1.9942348008385744,
"grad_norm": 0.5308715105056763,
"learning_rate": 3.598144447120783e-05,
"loss": 0.2222,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.235158771276474,
"step": 3805
},
{
"epoch": 1.9968553459119498,
"grad_norm": 0.7394761443138123,
"learning_rate": 3.596571672278083e-05,
"loss": 0.1989,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2079474925994873,
"step": 3810
},
{
"epoch": 1.999475890985325,
"grad_norm": 0.594095766544342,
"learning_rate": 3.5949961708495335e-05,
"loss": 0.1817,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18311980366706848,
"step": 3815
},
{
"epoch": 2.002620545073375,
"grad_norm": 0.7084780931472778,
"learning_rate": 3.593417945525739e-05,
"loss": 0.1562,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1945737898349762,
"step": 3820
},
{
"epoch": 2.0052410901467503,
"grad_norm": 0.6283460259437561,
"learning_rate": 3.591836999001952e-05,
"loss": 0.1817,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.203782320022583,
"step": 3825
},
{
"epoch": 2.007861635220126,
"grad_norm": 0.7033699154853821,
"learning_rate": 3.5902533339780756e-05,
"loss": 0.1565,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.154052734375,
"step": 3830
},
{
"epoch": 2.010482180293501,
"grad_norm": 0.7841500043869019,
"learning_rate": 3.588666953158653e-05,
"loss": 0.1972,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18807217478752136,
"step": 3835
},
{
"epoch": 2.0131027253668763,
"grad_norm": 0.6844927668571472,
"learning_rate": 3.587077859252868e-05,
"loss": 0.1767,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18625213205814362,
"step": 3840
},
{
"epoch": 2.0157232704402515,
"grad_norm": 0.7283322215080261,
"learning_rate": 3.585486054974535e-05,
"loss": 0.1643,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20896628499031067,
"step": 3845
},
{
"epoch": 2.0183438155136266,
"grad_norm": 0.8837510943412781,
"learning_rate": 3.583891543042097e-05,
"loss": 0.1938,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17919921875,
"step": 3850
},
{
"epoch": 2.0209643605870022,
"grad_norm": 0.7254173755645752,
"learning_rate": 3.582294326178624e-05,
"loss": 0.1861,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17642877995967865,
"step": 3855
},
{
"epoch": 2.0235849056603774,
"grad_norm": 0.5675681233406067,
"learning_rate": 3.5806944071118036e-05,
"loss": 0.1695,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17149925231933594,
"step": 3860
},
{
"epoch": 2.0262054507337526,
"grad_norm": 0.5747088193893433,
"learning_rate": 3.579091788573938e-05,
"loss": 0.1863,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16926203668117523,
"step": 3865
},
{
"epoch": 2.0288259958071277,
"grad_norm": 0.6015620231628418,
"learning_rate": 3.577486473301939e-05,
"loss": 0.1848,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1605832278728485,
"step": 3870
},
{
"epoch": 2.0314465408805034,
"grad_norm": 0.9510326385498047,
"learning_rate": 3.575878464037325e-05,
"loss": 0.1608,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16941693425178528,
"step": 3875
},
{
"epoch": 2.0340670859538785,
"grad_norm": 0.6109377145767212,
"learning_rate": 3.574267763526215e-05,
"loss": 0.1656,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15363606810569763,
"step": 3880
},
{
"epoch": 2.0366876310272537,
"grad_norm": 0.5519078969955444,
"learning_rate": 3.5726543745193236e-05,
"loss": 0.1764,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19953450560569763,
"step": 3885
},
{
"epoch": 2.039308176100629,
"grad_norm": 0.7259485721588135,
"learning_rate": 3.571038299771957e-05,
"loss": 0.1949,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19868190586566925,
"step": 3890
},
{
"epoch": 2.041928721174004,
"grad_norm": 0.7612906694412231,
"learning_rate": 3.569419542044008e-05,
"loss": 0.1569,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.172119140625,
"step": 3895
},
{
"epoch": 2.0445492662473796,
"grad_norm": 0.7879012823104858,
"learning_rate": 3.56779810409995e-05,
"loss": 0.1616,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1760033667087555,
"step": 3900
},
{
"epoch": 2.047169811320755,
"grad_norm": 0.6542850136756897,
"learning_rate": 3.566173988708836e-05,
"loss": 0.1752,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2122265100479126,
"step": 3905
},
{
"epoch": 2.04979035639413,
"grad_norm": 0.6596713066101074,
"learning_rate": 3.5645471986442905e-05,
"loss": 0.1859,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18576733767986298,
"step": 3910
},
{
"epoch": 2.052410901467505,
"grad_norm": 0.7132096290588379,
"learning_rate": 3.5629177366845055e-05,
"loss": 0.1656,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1605224609375,
"step": 3915
},
{
"epoch": 2.0550314465408803,
"grad_norm": 0.6687309741973877,
"learning_rate": 3.561285605612236e-05,
"loss": 0.1921,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21139320731163025,
"step": 3920
},
{
"epoch": 2.057651991614256,
"grad_norm": 0.6709274053573608,
"learning_rate": 3.5596508082147944e-05,
"loss": 0.1641,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1706489622592926,
"step": 3925
},
{
"epoch": 2.060272536687631,
"grad_norm": 0.6589915156364441,
"learning_rate": 3.558013347284049e-05,
"loss": 0.1727,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17125006020069122,
"step": 3930
},
{
"epoch": 2.0628930817610063,
"grad_norm": 0.6327504515647888,
"learning_rate": 3.5563732256164136e-05,
"loss": 0.1861,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1846763789653778,
"step": 3935
},
{
"epoch": 2.0655136268343814,
"grad_norm": 0.6399828791618347,
"learning_rate": 3.554730446012849e-05,
"loss": 0.1818,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14534403383731842,
"step": 3940
},
{
"epoch": 2.0681341719077566,
"grad_norm": 0.5941177010536194,
"learning_rate": 3.553085011278854e-05,
"loss": 0.1811,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2060641646385193,
"step": 3945
},
{
"epoch": 2.0707547169811322,
"grad_norm": 0.7381300926208496,
"learning_rate": 3.551436924224461e-05,
"loss": 0.1802,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1723439246416092,
"step": 3950
},
{
"epoch": 2.0733752620545074,
"grad_norm": 1.0024296045303345,
"learning_rate": 3.549786187664231e-05,
"loss": 0.1814,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21678271889686584,
"step": 3955
},
{
"epoch": 2.0759958071278826,
"grad_norm": 0.6965764164924622,
"learning_rate": 3.548132804417255e-05,
"loss": 0.1823,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1474609375,
"step": 3960
},
{
"epoch": 2.0786163522012577,
"grad_norm": 0.6827486753463745,
"learning_rate": 3.546476777307137e-05,
"loss": 0.1682,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17864082753658295,
"step": 3965
},
{
"epoch": 2.081236897274633,
"grad_norm": 0.6874611377716064,
"learning_rate": 3.544818109162e-05,
"loss": 0.1697,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18936476111412048,
"step": 3970
},
{
"epoch": 2.0838574423480085,
"grad_norm": 0.6896753311157227,
"learning_rate": 3.543156802814478e-05,
"loss": 0.1934,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20261545479297638,
"step": 3975
},
{
"epoch": 2.0864779874213837,
"grad_norm": 0.719116747379303,
"learning_rate": 3.5414928611017085e-05,
"loss": 0.1565,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1512451171875,
"step": 3980
},
{
"epoch": 2.089098532494759,
"grad_norm": 0.6998587846755981,
"learning_rate": 3.53982628686533e-05,
"loss": 0.1615,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.170166015625,
"step": 3985
},
{
"epoch": 2.091719077568134,
"grad_norm": 0.801823616027832,
"learning_rate": 3.538157082951477e-05,
"loss": 0.1749,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.144775390625,
"step": 3990
},
{
"epoch": 2.0943396226415096,
"grad_norm": 0.6100899577140808,
"learning_rate": 3.536485252210775e-05,
"loss": 0.1884,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16361545026302338,
"step": 3995
},
{
"epoch": 2.096960167714885,
"grad_norm": 0.7097408771514893,
"learning_rate": 3.534810797498335e-05,
"loss": 0.1948,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15076175332069397,
"step": 4000
},
{
"epoch": 2.09958071278826,
"grad_norm": 0.5789570212364197,
"learning_rate": 3.533133721673751e-05,
"loss": 0.1774,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1568412184715271,
"step": 4005
},
{
"epoch": 2.102201257861635,
"grad_norm": 0.6784456968307495,
"learning_rate": 3.5314540276010895e-05,
"loss": 0.1762,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2062300741672516,
"step": 4010
},
{
"epoch": 2.1048218029350103,
"grad_norm": 0.5736457109451294,
"learning_rate": 3.529771718148893e-05,
"loss": 0.1567,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17741183936595917,
"step": 4015
},
{
"epoch": 2.107442348008386,
"grad_norm": 0.6271811127662659,
"learning_rate": 3.528086796190167e-05,
"loss": 0.1985,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19951367378234863,
"step": 4020
},
{
"epoch": 2.110062893081761,
"grad_norm": 1.0117782354354858,
"learning_rate": 3.526399264602381e-05,
"loss": 0.1669,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14599609375,
"step": 4025
},
{
"epoch": 2.1126834381551363,
"grad_norm": 0.6283865571022034,
"learning_rate": 3.524709126267458e-05,
"loss": 0.1947,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18431368470191956,
"step": 4030
},
{
"epoch": 2.1153039832285114,
"grad_norm": 0.7869764566421509,
"learning_rate": 3.523016384071777e-05,
"loss": 0.1799,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17630180716514587,
"step": 4035
},
{
"epoch": 2.1179245283018866,
"grad_norm": 0.6120043992996216,
"learning_rate": 3.521321040906159e-05,
"loss": 0.1684,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16492129862308502,
"step": 4040
},
{
"epoch": 2.120545073375262,
"grad_norm": 0.6537008881568909,
"learning_rate": 3.5196230996658704e-05,
"loss": 0.1653,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19163097441196442,
"step": 4045
},
{
"epoch": 2.1231656184486374,
"grad_norm": 0.6703867316246033,
"learning_rate": 3.517922563250615e-05,
"loss": 0.1657,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2131059169769287,
"step": 4050
},
{
"epoch": 2.1257861635220126,
"grad_norm": 0.6623901128768921,
"learning_rate": 3.5162194345645256e-05,
"loss": 0.1715,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18095040321350098,
"step": 4055
},
{
"epoch": 2.1284067085953877,
"grad_norm": 0.7682641744613647,
"learning_rate": 3.514513716516164e-05,
"loss": 0.1871,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15569807589054108,
"step": 4060
},
{
"epoch": 2.131027253668763,
"grad_norm": 0.6966356635093689,
"learning_rate": 3.512805412018512e-05,
"loss": 0.174,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14013671875,
"step": 4065
},
{
"epoch": 2.1336477987421385,
"grad_norm": 0.6977257132530212,
"learning_rate": 3.5110945239889725e-05,
"loss": 0.1674,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13693030178546906,
"step": 4070
},
{
"epoch": 2.1362683438155137,
"grad_norm": 0.5598815679550171,
"learning_rate": 3.509381055349357e-05,
"loss": 0.1678,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18437781929969788,
"step": 4075
},
{
"epoch": 2.138888888888889,
"grad_norm": 0.5758344531059265,
"learning_rate": 3.507665009025885e-05,
"loss": 0.1977,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2025020569562912,
"step": 4080
},
{
"epoch": 2.141509433962264,
"grad_norm": 0.5952780246734619,
"learning_rate": 3.505946387949177e-05,
"loss": 0.1588,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1604464054107666,
"step": 4085
},
{
"epoch": 2.1441299790356396,
"grad_norm": 0.6980146169662476,
"learning_rate": 3.5042251950542536e-05,
"loss": 0.1744,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17276695370674133,
"step": 4090
},
{
"epoch": 2.146750524109015,
"grad_norm": 0.5248650908470154,
"learning_rate": 3.502501433280525e-05,
"loss": 0.1729,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17749759554862976,
"step": 4095
},
{
"epoch": 2.14937106918239,
"grad_norm": 0.6207015514373779,
"learning_rate": 3.5007751055717895e-05,
"loss": 0.1886,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21023103594779968,
"step": 4100
},
{
"epoch": 2.151991614255765,
"grad_norm": 0.6230864524841309,
"learning_rate": 3.499046214876227e-05,
"loss": 0.1804,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16375795006752014,
"step": 4105
},
{
"epoch": 2.1546121593291403,
"grad_norm": 0.6781209111213684,
"learning_rate": 3.497314764146394e-05,
"loss": 0.1666,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1437818706035614,
"step": 4110
},
{
"epoch": 2.157232704402516,
"grad_norm": 0.5978240966796875,
"learning_rate": 3.49558075633922e-05,
"loss": 0.1889,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20453524589538574,
"step": 4115
},
{
"epoch": 2.159853249475891,
"grad_norm": 0.8283042907714844,
"learning_rate": 3.493844194416001e-05,
"loss": 0.1694,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15572591125965118,
"step": 4120
},
{
"epoch": 2.1624737945492662,
"grad_norm": 0.6346971988677979,
"learning_rate": 3.4921050813423944e-05,
"loss": 0.2003,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.167236328125,
"step": 4125
},
{
"epoch": 2.1650943396226414,
"grad_norm": 0.7900070548057556,
"learning_rate": 3.490363420088415e-05,
"loss": 0.1688,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16584350168704987,
"step": 4130
},
{
"epoch": 2.1677148846960166,
"grad_norm": 0.9483361840248108,
"learning_rate": 3.488619213628429e-05,
"loss": 0.1778,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20415958762168884,
"step": 4135
},
{
"epoch": 2.170335429769392,
"grad_norm": 0.6414875984191895,
"learning_rate": 3.4868724649411486e-05,
"loss": 0.1903,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1789773851633072,
"step": 4140
},
{
"epoch": 2.1729559748427674,
"grad_norm": 0.6929765343666077,
"learning_rate": 3.48512317700963e-05,
"loss": 0.1942,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21046538650989532,
"step": 4145
},
{
"epoch": 2.1755765199161425,
"grad_norm": 0.6449279189109802,
"learning_rate": 3.483371352821263e-05,
"loss": 0.1766,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16827432811260223,
"step": 4150
},
{
"epoch": 2.1781970649895177,
"grad_norm": 0.606066107749939,
"learning_rate": 3.48161699536777e-05,
"loss": 0.1789,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16351351141929626,
"step": 4155
},
{
"epoch": 2.180817610062893,
"grad_norm": 0.7101812362670898,
"learning_rate": 3.4798601076451986e-05,
"loss": 0.1637,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19216609001159668,
"step": 4160
},
{
"epoch": 2.1834381551362685,
"grad_norm": 1.1365573406219482,
"learning_rate": 3.47810069265392e-05,
"loss": 0.1704,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18184298276901245,
"step": 4165
},
{
"epoch": 2.1860587002096437,
"grad_norm": 0.6491653919219971,
"learning_rate": 3.476338753398618e-05,
"loss": 0.1813,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18884286284446716,
"step": 4170
},
{
"epoch": 2.188679245283019,
"grad_norm": 0.6283389329910278,
"learning_rate": 3.474574292888292e-05,
"loss": 0.1889,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19282126426696777,
"step": 4175
},
{
"epoch": 2.191299790356394,
"grad_norm": 0.7391582727432251,
"learning_rate": 3.472807314136242e-05,
"loss": 0.1874,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1503807157278061,
"step": 4180
},
{
"epoch": 2.1939203354297696,
"grad_norm": 0.5754171013832092,
"learning_rate": 3.471037820160072e-05,
"loss": 0.1783,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18280450999736786,
"step": 4185
},
{
"epoch": 2.1965408805031448,
"grad_norm": 0.6991333961486816,
"learning_rate": 3.469265813981679e-05,
"loss": 0.1833,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20395539700984955,
"step": 4190
},
{
"epoch": 2.19916142557652,
"grad_norm": 0.7103485465049744,
"learning_rate": 3.467491298627252e-05,
"loss": 0.1648,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1837499439716339,
"step": 4195
},
{
"epoch": 2.201781970649895,
"grad_norm": 0.6320298314094543,
"learning_rate": 3.465714277127266e-05,
"loss": 0.1764,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16248497366905212,
"step": 4200
},
{
"epoch": 2.2044025157232703,
"grad_norm": 0.7922936677932739,
"learning_rate": 3.463934752516474e-05,
"loss": 0.1824,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17559239268302917,
"step": 4205
},
{
"epoch": 2.207023060796646,
"grad_norm": 0.5327322483062744,
"learning_rate": 3.4621527278339025e-05,
"loss": 0.1808,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18227726221084595,
"step": 4210
},
{
"epoch": 2.209643605870021,
"grad_norm": 0.688612163066864,
"learning_rate": 3.460368206122852e-05,
"loss": 0.184,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17920489609241486,
"step": 4215
},
{
"epoch": 2.2122641509433962,
"grad_norm": 0.5882250070571899,
"learning_rate": 3.458581190430884e-05,
"loss": 0.1766,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16983014345169067,
"step": 4220
},
{
"epoch": 2.2148846960167714,
"grad_norm": 0.6443110108375549,
"learning_rate": 3.4567916838098195e-05,
"loss": 0.1695,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1739184558391571,
"step": 4225
},
{
"epoch": 2.2175052410901466,
"grad_norm": 0.8445901870727539,
"learning_rate": 3.454999689315734e-05,
"loss": 0.166,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18260544538497925,
"step": 4230
},
{
"epoch": 2.220125786163522,
"grad_norm": 0.6660024523735046,
"learning_rate": 3.453205210008952e-05,
"loss": 0.1714,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.173113614320755,
"step": 4235
},
{
"epoch": 2.2227463312368974,
"grad_norm": 0.6402644515037537,
"learning_rate": 3.4514082489540415e-05,
"loss": 0.1779,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16839855909347534,
"step": 4240
},
{
"epoch": 2.2253668763102725,
"grad_norm": 0.6560665369033813,
"learning_rate": 3.4496088092198076e-05,
"loss": 0.188,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1756685972213745,
"step": 4245
},
{
"epoch": 2.2279874213836477,
"grad_norm": 0.8085160255432129,
"learning_rate": 3.44780689387929e-05,
"loss": 0.1656,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14646513760089874,
"step": 4250
},
{
"epoch": 2.230607966457023,
"grad_norm": 0.5747509002685547,
"learning_rate": 3.446002506009754e-05,
"loss": 0.2022,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2267492413520813,
"step": 4255
},
{
"epoch": 2.2332285115303985,
"grad_norm": 1.1871565580368042,
"learning_rate": 3.44419564869269e-05,
"loss": 0.1893,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20584087073802948,
"step": 4260
},
{
"epoch": 2.2358490566037736,
"grad_norm": 0.6333910822868347,
"learning_rate": 3.4423863250138045e-05,
"loss": 0.1925,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22530904412269592,
"step": 4265
},
{
"epoch": 2.238469601677149,
"grad_norm": 0.6745782494544983,
"learning_rate": 3.440574538063016e-05,
"loss": 0.1719,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1735236644744873,
"step": 4270
},
{
"epoch": 2.241090146750524,
"grad_norm": 0.6441916823387146,
"learning_rate": 3.438760290934449e-05,
"loss": 0.1873,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21418049931526184,
"step": 4275
},
{
"epoch": 2.2437106918238996,
"grad_norm": 0.6465288400650024,
"learning_rate": 3.436943586726431e-05,
"loss": 0.1842,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1877419799566269,
"step": 4280
},
{
"epoch": 2.2463312368972748,
"grad_norm": 0.6912493109703064,
"learning_rate": 3.435124428541484e-05,
"loss": 0.1759,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11175537109375,
"step": 4285
},
{
"epoch": 2.24895178197065,
"grad_norm": 0.670586347579956,
"learning_rate": 3.433302819486322e-05,
"loss": 0.1896,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15557770431041718,
"step": 4290
},
{
"epoch": 2.251572327044025,
"grad_norm": 0.6130393147468567,
"learning_rate": 3.431478762671844e-05,
"loss": 0.1829,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1899413913488388,
"step": 4295
},
{
"epoch": 2.2541928721174003,
"grad_norm": 0.609475314617157,
"learning_rate": 3.42965226121313e-05,
"loss": 0.1755,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15828870236873627,
"step": 4300
},
{
"epoch": 2.2568134171907754,
"grad_norm": 0.6548188328742981,
"learning_rate": 3.4278233182294335e-05,
"loss": 0.1743,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20844203233718872,
"step": 4305
},
{
"epoch": 2.259433962264151,
"grad_norm": 0.7024995684623718,
"learning_rate": 3.4259919368441794e-05,
"loss": 0.1871,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2090778946876526,
"step": 4310
},
{
"epoch": 2.262054507337526,
"grad_norm": 0.7198363542556763,
"learning_rate": 3.424158120184955e-05,
"loss": 0.1699,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1468142420053482,
"step": 4315
},
{
"epoch": 2.2646750524109014,
"grad_norm": 0.600344717502594,
"learning_rate": 3.422321871383507e-05,
"loss": 0.1984,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16410166025161743,
"step": 4320
},
{
"epoch": 2.2672955974842766,
"grad_norm": 0.6233639717102051,
"learning_rate": 3.4204831935757365e-05,
"loss": 0.1848,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20671750605106354,
"step": 4325
},
{
"epoch": 2.269916142557652,
"grad_norm": 0.8082700967788696,
"learning_rate": 3.418642089901692e-05,
"loss": 0.1604,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15148866176605225,
"step": 4330
},
{
"epoch": 2.2725366876310273,
"grad_norm": 0.6904008984565735,
"learning_rate": 3.4167985635055655e-05,
"loss": 0.1875,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17664828896522522,
"step": 4335
},
{
"epoch": 2.2751572327044025,
"grad_norm": 0.5504633188247681,
"learning_rate": 3.4149526175356854e-05,
"loss": 0.1996,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19843214750289917,
"step": 4340
},
{
"epoch": 2.2777777777777777,
"grad_norm": 0.60943603515625,
"learning_rate": 3.413104255144514e-05,
"loss": 0.1808,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20119717717170715,
"step": 4345
},
{
"epoch": 2.280398322851153,
"grad_norm": 0.668181836605072,
"learning_rate": 3.4112534794886376e-05,
"loss": 0.1682,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19905316829681396,
"step": 4350
},
{
"epoch": 2.2830188679245285,
"grad_norm": 0.7229729294776917,
"learning_rate": 3.409400293728767e-05,
"loss": 0.1764,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1654052734375,
"step": 4355
},
{
"epoch": 2.2856394129979036,
"grad_norm": 0.5337548851966858,
"learning_rate": 3.407544701029725e-05,
"loss": 0.1838,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21104368567466736,
"step": 4360
},
{
"epoch": 2.288259958071279,
"grad_norm": 0.6288163661956787,
"learning_rate": 3.40568670456045e-05,
"loss": 0.1871,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1397732049226761,
"step": 4365
},
{
"epoch": 2.290880503144654,
"grad_norm": 0.6795221567153931,
"learning_rate": 3.4038263074939805e-05,
"loss": 0.1685,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17288152873516083,
"step": 4370
},
{
"epoch": 2.2935010482180296,
"grad_norm": 0.6144188046455383,
"learning_rate": 3.401963513007458e-05,
"loss": 0.1591,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1681046187877655,
"step": 4375
},
{
"epoch": 2.2961215932914047,
"grad_norm": 0.6623589396476746,
"learning_rate": 3.400098324282116e-05,
"loss": 0.1726,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17139089107513428,
"step": 4380
},
{
"epoch": 2.29874213836478,
"grad_norm": 0.7196406126022339,
"learning_rate": 3.39823074450328e-05,
"loss": 0.1684,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1435546875,
"step": 4385
},
{
"epoch": 2.301362683438155,
"grad_norm": 0.7317269444465637,
"learning_rate": 3.3963607768603545e-05,
"loss": 0.1755,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1617431640625,
"step": 4390
},
{
"epoch": 2.3039832285115303,
"grad_norm": 0.6227354407310486,
"learning_rate": 3.3944884245468255e-05,
"loss": 0.1767,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1522216796875,
"step": 4395
},
{
"epoch": 2.3066037735849054,
"grad_norm": 0.7358293533325195,
"learning_rate": 3.3926136907602503e-05,
"loss": 0.1931,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1415230929851532,
"step": 4400
},
{
"epoch": 2.309224318658281,
"grad_norm": 0.6808892488479614,
"learning_rate": 3.390736578702253e-05,
"loss": 0.1744,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1495361328125,
"step": 4405
},
{
"epoch": 2.311844863731656,
"grad_norm": 0.5879157781600952,
"learning_rate": 3.388857091578519e-05,
"loss": 0.185,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21268212795257568,
"step": 4410
},
{
"epoch": 2.3144654088050314,
"grad_norm": 0.6618671417236328,
"learning_rate": 3.3869752325987915e-05,
"loss": 0.17,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17198902368545532,
"step": 4415
},
{
"epoch": 2.3170859538784065,
"grad_norm": 0.6409109234809875,
"learning_rate": 3.385091004976861e-05,
"loss": 0.1916,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18544016778469086,
"step": 4420
},
{
"epoch": 2.319706498951782,
"grad_norm": 0.5145808458328247,
"learning_rate": 3.3832044119305666e-05,
"loss": 0.1731,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17380250990390778,
"step": 4425
},
{
"epoch": 2.3223270440251573,
"grad_norm": 0.5956165790557861,
"learning_rate": 3.381315456681785e-05,
"loss": 0.18,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14881974458694458,
"step": 4430
},
{
"epoch": 2.3249475890985325,
"grad_norm": 0.7136877179145813,
"learning_rate": 3.3794241424564275e-05,
"loss": 0.1901,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17421871423721313,
"step": 4435
},
{
"epoch": 2.3275681341719077,
"grad_norm": 0.6358838081359863,
"learning_rate": 3.377530472484435e-05,
"loss": 0.1824,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21158553659915924,
"step": 4440
},
{
"epoch": 2.330188679245283,
"grad_norm": 0.5762045383453369,
"learning_rate": 3.375634449999769e-05,
"loss": 0.1819,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19911928474903107,
"step": 4445
},
{
"epoch": 2.3328092243186584,
"grad_norm": 0.6769942045211792,
"learning_rate": 3.373736078240411e-05,
"loss": 0.1786,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16046220064163208,
"step": 4450
},
{
"epoch": 2.3354297693920336,
"grad_norm": 0.6089766025543213,
"learning_rate": 3.371835360448353e-05,
"loss": 0.1738,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19723621010780334,
"step": 4455
},
{
"epoch": 2.338050314465409,
"grad_norm": 0.780342698097229,
"learning_rate": 3.369932299869594e-05,
"loss": 0.1798,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1385498046875,
"step": 4460
},
{
"epoch": 2.340670859538784,
"grad_norm": 0.6272355318069458,
"learning_rate": 3.368026899754136e-05,
"loss": 0.1776,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16443294286727905,
"step": 4465
},
{
"epoch": 2.3432914046121596,
"grad_norm": 0.6277251243591309,
"learning_rate": 3.366119163355972e-05,
"loss": 0.1727,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17139068245887756,
"step": 4470
},
{
"epoch": 2.3459119496855347,
"grad_norm": 0.6159740090370178,
"learning_rate": 3.364209093933088e-05,
"loss": 0.1861,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15588770806789398,
"step": 4475
},
{
"epoch": 2.34853249475891,
"grad_norm": 0.6222087740898132,
"learning_rate": 3.362296694747455e-05,
"loss": 0.1718,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16325759887695312,
"step": 4480
},
{
"epoch": 2.351153039832285,
"grad_norm": 0.6561034917831421,
"learning_rate": 3.36038196906502e-05,
"loss": 0.1975,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2421393245458603,
"step": 4485
},
{
"epoch": 2.3537735849056602,
"grad_norm": 0.6397557258605957,
"learning_rate": 3.358464920155704e-05,
"loss": 0.1722,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14547546207904816,
"step": 4490
},
{
"epoch": 2.3563941299790354,
"grad_norm": 0.5537540912628174,
"learning_rate": 3.3565455512933974e-05,
"loss": 0.1929,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19911175966262817,
"step": 4495
},
{
"epoch": 2.359014675052411,
"grad_norm": 0.6250444054603577,
"learning_rate": 3.35462386575595e-05,
"loss": 0.1836,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1976991593837738,
"step": 4500
},
{
"epoch": 2.361635220125786,
"grad_norm": 0.5753092765808105,
"learning_rate": 3.3526998668251696e-05,
"loss": 0.1819,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19621963798999786,
"step": 4505
},
{
"epoch": 2.3642557651991614,
"grad_norm": 0.6889363527297974,
"learning_rate": 3.3507735577868144e-05,
"loss": 0.1878,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17461004853248596,
"step": 4510
},
{
"epoch": 2.3668763102725365,
"grad_norm": 0.5696137547492981,
"learning_rate": 3.3488449419305876e-05,
"loss": 0.174,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18081456422805786,
"step": 4515
},
{
"epoch": 2.369496855345912,
"grad_norm": 0.6583285331726074,
"learning_rate": 3.3469140225501316e-05,
"loss": 0.176,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20351186394691467,
"step": 4520
},
{
"epoch": 2.3721174004192873,
"grad_norm": 0.7312102317810059,
"learning_rate": 3.344980802943023e-05,
"loss": 0.1638,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18066838383674622,
"step": 4525
},
{
"epoch": 2.3747379454926625,
"grad_norm": 0.8290305733680725,
"learning_rate": 3.3430452864107674e-05,
"loss": 0.178,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18665027618408203,
"step": 4530
},
{
"epoch": 2.3773584905660377,
"grad_norm": 0.5993841290473938,
"learning_rate": 3.341107476258792e-05,
"loss": 0.1706,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12851500511169434,
"step": 4535
},
{
"epoch": 2.379979035639413,
"grad_norm": 0.6254639029502869,
"learning_rate": 3.3391673757964404e-05,
"loss": 0.185,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18954774737358093,
"step": 4540
},
{
"epoch": 2.3825995807127884,
"grad_norm": 0.678989589214325,
"learning_rate": 3.33722498833697e-05,
"loss": 0.162,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.153564453125,
"step": 4545
},
{
"epoch": 2.3852201257861636,
"grad_norm": 0.6548760533332825,
"learning_rate": 3.3352803171975415e-05,
"loss": 0.1615,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19703298807144165,
"step": 4550
},
{
"epoch": 2.3878406708595388,
"grad_norm": 0.5945609211921692,
"learning_rate": 3.3333333656992166e-05,
"loss": 0.1868,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17958563566207886,
"step": 4555
},
{
"epoch": 2.390461215932914,
"grad_norm": 0.5302448868751526,
"learning_rate": 3.331384137166951e-05,
"loss": 0.1787,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16719529032707214,
"step": 4560
},
{
"epoch": 2.3930817610062896,
"grad_norm": 0.5767799019813538,
"learning_rate": 3.32943263492959e-05,
"loss": 0.1985,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19227570295333862,
"step": 4565
},
{
"epoch": 2.3957023060796647,
"grad_norm": 0.8207547664642334,
"learning_rate": 3.32747886231986e-05,
"loss": 0.1711,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1671919822692871,
"step": 4570
},
{
"epoch": 2.39832285115304,
"grad_norm": 0.8117937445640564,
"learning_rate": 3.325522822674366e-05,
"loss": 0.1726,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19688598811626434,
"step": 4575
},
{
"epoch": 2.400943396226415,
"grad_norm": 0.8320883512496948,
"learning_rate": 3.323564519333586e-05,
"loss": 0.1831,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20656168460845947,
"step": 4580
},
{
"epoch": 2.4035639412997902,
"grad_norm": 0.6701602339744568,
"learning_rate": 3.321603955641861e-05,
"loss": 0.1677,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16803507506847382,
"step": 4585
},
{
"epoch": 2.4061844863731654,
"grad_norm": 0.7060126662254333,
"learning_rate": 3.319641134947393e-05,
"loss": 0.1774,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17268584668636322,
"step": 4590
},
{
"epoch": 2.408805031446541,
"grad_norm": 0.6497982740402222,
"learning_rate": 3.31767606060224e-05,
"loss": 0.1638,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16267868876457214,
"step": 4595
},
{
"epoch": 2.411425576519916,
"grad_norm": 0.5415063500404358,
"learning_rate": 3.315708735962307e-05,
"loss": 0.1727,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1775253713130951,
"step": 4600
},
{
"epoch": 2.4140461215932913,
"grad_norm": 0.6065043807029724,
"learning_rate": 3.313739164387343e-05,
"loss": 0.1805,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1512911468744278,
"step": 4605
},
{
"epoch": 2.4166666666666665,
"grad_norm": 0.6144443154335022,
"learning_rate": 3.311767349240934e-05,
"loss": 0.1715,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19169479608535767,
"step": 4610
},
{
"epoch": 2.419287211740042,
"grad_norm": 0.6852019429206848,
"learning_rate": 3.309793293890497e-05,
"loss": 0.1777,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1656094640493393,
"step": 4615
},
{
"epoch": 2.4219077568134173,
"grad_norm": 0.5891123414039612,
"learning_rate": 3.3078170017072744e-05,
"loss": 0.1746,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.180824413895607,
"step": 4620
},
{
"epoch": 2.4245283018867925,
"grad_norm": 0.618079423904419,
"learning_rate": 3.305838476066331e-05,
"loss": 0.1792,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17430126667022705,
"step": 4625
},
{
"epoch": 2.4271488469601676,
"grad_norm": 0.6806156635284424,
"learning_rate": 3.303857720346544e-05,
"loss": 0.1528,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16445207595825195,
"step": 4630
},
{
"epoch": 2.429769392033543,
"grad_norm": 0.7231004238128662,
"learning_rate": 3.3018747379305994e-05,
"loss": 0.1815,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.153076171875,
"step": 4635
},
{
"epoch": 2.4323899371069184,
"grad_norm": 0.669330358505249,
"learning_rate": 3.299889532204985e-05,
"loss": 0.1649,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1876327395439148,
"step": 4640
},
{
"epoch": 2.4350104821802936,
"grad_norm": 0.603354811668396,
"learning_rate": 3.2979021065599864e-05,
"loss": 0.1595,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16586905717849731,
"step": 4645
},
{
"epoch": 2.4376310272536688,
"grad_norm": 0.6138980984687805,
"learning_rate": 3.29591246438968e-05,
"loss": 0.1655,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17583346366882324,
"step": 4650
},
{
"epoch": 2.440251572327044,
"grad_norm": 0.6296398639678955,
"learning_rate": 3.293920609091929e-05,
"loss": 0.176,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16585686802864075,
"step": 4655
},
{
"epoch": 2.442872117400419,
"grad_norm": 0.7336766719818115,
"learning_rate": 3.291926544068375e-05,
"loss": 0.1761,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14485588669776917,
"step": 4660
},
{
"epoch": 2.4454926624737947,
"grad_norm": 0.5738866925239563,
"learning_rate": 3.289930272724431e-05,
"loss": 0.1731,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21165721118450165,
"step": 4665
},
{
"epoch": 2.44811320754717,
"grad_norm": 0.575883686542511,
"learning_rate": 3.2879317984692825e-05,
"loss": 0.1609,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1656447947025299,
"step": 4670
},
{
"epoch": 2.450733752620545,
"grad_norm": 0.5860551595687866,
"learning_rate": 3.2859311247158734e-05,
"loss": 0.1738,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17280437052249908,
"step": 4675
},
{
"epoch": 2.45335429769392,
"grad_norm": 1.0741225481033325,
"learning_rate": 3.283928254880906e-05,
"loss": 0.1765,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13873291015625,
"step": 4680
},
{
"epoch": 2.4559748427672954,
"grad_norm": 0.599844753742218,
"learning_rate": 3.2819231923848316e-05,
"loss": 0.166,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14130263030529022,
"step": 4685
},
{
"epoch": 2.458595387840671,
"grad_norm": 0.6118952035903931,
"learning_rate": 3.2799159406518464e-05,
"loss": 0.1817,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1787024289369583,
"step": 4690
},
{
"epoch": 2.461215932914046,
"grad_norm": 0.6382375955581665,
"learning_rate": 3.277906503109885e-05,
"loss": 0.1542,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17545664310455322,
"step": 4695
},
{
"epoch": 2.4638364779874213,
"grad_norm": 0.7123702764511108,
"learning_rate": 3.275894883190618e-05,
"loss": 0.1869,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2054811716079712,
"step": 4700
},
{
"epoch": 2.4664570230607965,
"grad_norm": 0.630284309387207,
"learning_rate": 3.273881084329438e-05,
"loss": 0.1611,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1761714667081833,
"step": 4705
},
{
"epoch": 2.469077568134172,
"grad_norm": 0.6742514371871948,
"learning_rate": 3.271865109965462e-05,
"loss": 0.1876,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18644127249717712,
"step": 4710
},
{
"epoch": 2.4716981132075473,
"grad_norm": 0.7128267288208008,
"learning_rate": 3.269846963541521e-05,
"loss": 0.1915,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16147366166114807,
"step": 4715
},
{
"epoch": 2.4743186582809225,
"grad_norm": 0.5690448880195618,
"learning_rate": 3.267826648504157e-05,
"loss": 0.1735,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.169921875,
"step": 4720
},
{
"epoch": 2.4769392033542976,
"grad_norm": 0.6697767376899719,
"learning_rate": 3.2658041683036124e-05,
"loss": 0.18,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1703624427318573,
"step": 4725
},
{
"epoch": 2.479559748427673,
"grad_norm": 0.6302558183670044,
"learning_rate": 3.263779526393831e-05,
"loss": 0.1729,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17672468721866608,
"step": 4730
},
{
"epoch": 2.4821802935010484,
"grad_norm": 0.6719659566879272,
"learning_rate": 3.261752726232446e-05,
"loss": 0.1776,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15950210392475128,
"step": 4735
},
{
"epoch": 2.4848008385744236,
"grad_norm": 0.6178978085517883,
"learning_rate": 3.2597237712807764e-05,
"loss": 0.1775,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19610846042633057,
"step": 4740
},
{
"epoch": 2.4874213836477987,
"grad_norm": 0.6141926050186157,
"learning_rate": 3.2576926650038225e-05,
"loss": 0.1719,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14078906178474426,
"step": 4745
},
{
"epoch": 2.490041928721174,
"grad_norm": 0.5836093425750732,
"learning_rate": 3.255659410870257e-05,
"loss": 0.1773,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17038002610206604,
"step": 4750
},
{
"epoch": 2.492662473794549,
"grad_norm": 0.6279257535934448,
"learning_rate": 3.253624012352421e-05,
"loss": 0.1893,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22794798016548157,
"step": 4755
},
{
"epoch": 2.4952830188679247,
"grad_norm": 0.6302879452705383,
"learning_rate": 3.251586472926317e-05,
"loss": 0.181,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1786530762910843,
"step": 4760
},
{
"epoch": 2.4979035639413,
"grad_norm": 0.6581209301948547,
"learning_rate": 3.249546796071608e-05,
"loss": 0.1679,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1811736524105072,
"step": 4765
},
{
"epoch": 2.500524109014675,
"grad_norm": 0.6320407390594482,
"learning_rate": 3.2475049852716014e-05,
"loss": 0.1704,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17114879190921783,
"step": 4770
},
{
"epoch": 2.50314465408805,
"grad_norm": 0.7551243305206299,
"learning_rate": 3.245461044013253e-05,
"loss": 0.177,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18058930337429047,
"step": 4775
},
{
"epoch": 2.5057651991614254,
"grad_norm": 0.6153421998023987,
"learning_rate": 3.243414975787154e-05,
"loss": 0.1892,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1871110498905182,
"step": 4780
},
{
"epoch": 2.508385744234801,
"grad_norm": 0.6175753474235535,
"learning_rate": 3.24136678408753e-05,
"loss": 0.1781,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19896206259727478,
"step": 4785
},
{
"epoch": 2.511006289308176,
"grad_norm": 0.6540406346321106,
"learning_rate": 3.239316472412233e-05,
"loss": 0.1667,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1719357818365097,
"step": 4790
},
{
"epoch": 2.5136268343815513,
"grad_norm": 0.6842586398124695,
"learning_rate": 3.237264044262734e-05,
"loss": 0.1893,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23990176618099213,
"step": 4795
},
{
"epoch": 2.5162473794549265,
"grad_norm": 0.5659977197647095,
"learning_rate": 3.23520950314412e-05,
"loss": 0.1691,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19691407680511475,
"step": 4800
},
{
"epoch": 2.518867924528302,
"grad_norm": 0.736707866191864,
"learning_rate": 3.233152852565085e-05,
"loss": 0.1854,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16525039076805115,
"step": 4805
},
{
"epoch": 2.5214884696016773,
"grad_norm": 0.6787170171737671,
"learning_rate": 3.231094096037927e-05,
"loss": 0.1847,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17906887829303741,
"step": 4810
},
{
"epoch": 2.5241090146750524,
"grad_norm": 0.6288094520568848,
"learning_rate": 3.22903323707854e-05,
"loss": 0.1665,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18031513690948486,
"step": 4815
},
{
"epoch": 2.5267295597484276,
"grad_norm": 0.630490243434906,
"learning_rate": 3.2269702792064066e-05,
"loss": 0.1532,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1419677734375,
"step": 4820
},
{
"epoch": 2.529350104821803,
"grad_norm": 0.5428565740585327,
"learning_rate": 3.224905225944598e-05,
"loss": 0.1846,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2248590588569641,
"step": 4825
},
{
"epoch": 2.531970649895178,
"grad_norm": 0.6126314401626587,
"learning_rate": 3.2228380808197594e-05,
"loss": 0.1842,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2059944123029709,
"step": 4830
},
{
"epoch": 2.5345911949685536,
"grad_norm": 0.6274798512458801,
"learning_rate": 3.2207688473621116e-05,
"loss": 0.1706,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19701413810253143,
"step": 4835
},
{
"epoch": 2.5372117400419287,
"grad_norm": 0.5996156334877014,
"learning_rate": 3.2186975291054406e-05,
"loss": 0.1693,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14583313465118408,
"step": 4840
},
{
"epoch": 2.539832285115304,
"grad_norm": 0.9609743356704712,
"learning_rate": 3.2166241295870915e-05,
"loss": 0.1756,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13916015625,
"step": 4845
},
{
"epoch": 2.5424528301886795,
"grad_norm": 0.645316481590271,
"learning_rate": 3.2145486523479664e-05,
"loss": 0.1883,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19266776740550995,
"step": 4850
},
{
"epoch": 2.5450733752620547,
"grad_norm": 0.7192069292068481,
"learning_rate": 3.212471100932513e-05,
"loss": 0.1727,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.152587890625,
"step": 4855
},
{
"epoch": 2.54769392033543,
"grad_norm": 0.6661515235900879,
"learning_rate": 3.210391478888725e-05,
"loss": 0.1878,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22219903767108917,
"step": 4860
},
{
"epoch": 2.550314465408805,
"grad_norm": 0.5181549191474915,
"learning_rate": 3.208309789768127e-05,
"loss": 0.1611,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14796018600463867,
"step": 4865
},
{
"epoch": 2.55293501048218,
"grad_norm": 0.6805993914604187,
"learning_rate": 3.206226037125778e-05,
"loss": 0.1704,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1424269825220108,
"step": 4870
},
{
"epoch": 2.5555555555555554,
"grad_norm": 0.6539928317070007,
"learning_rate": 3.204140224520259e-05,
"loss": 0.1805,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17243647575378418,
"step": 4875
},
{
"epoch": 2.558176100628931,
"grad_norm": 0.7448137998580933,
"learning_rate": 3.20205235551367e-05,
"loss": 0.1734,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1661716103553772,
"step": 4880
},
{
"epoch": 2.560796645702306,
"grad_norm": 0.6820452213287354,
"learning_rate": 3.1999624336716207e-05,
"loss": 0.2037,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.174968421459198,
"step": 4885
},
{
"epoch": 2.5634171907756813,
"grad_norm": 0.5573581457138062,
"learning_rate": 3.197870462563231e-05,
"loss": 0.167,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1981693059206009,
"step": 4890
},
{
"epoch": 2.5660377358490565,
"grad_norm": 0.7350602149963379,
"learning_rate": 3.195776445761116e-05,
"loss": 0.1739,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.163818359375,
"step": 4895
},
{
"epoch": 2.568658280922432,
"grad_norm": 0.6017246842384338,
"learning_rate": 3.1936803868413865e-05,
"loss": 0.1815,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18484187126159668,
"step": 4900
},
{
"epoch": 2.5712788259958073,
"grad_norm": 0.5709414482116699,
"learning_rate": 3.1915822893836394e-05,
"loss": 0.1902,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1708984375,
"step": 4905
},
{
"epoch": 2.5738993710691824,
"grad_norm": 0.5710614323616028,
"learning_rate": 3.189482156970956e-05,
"loss": 0.181,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2219957709312439,
"step": 4910
},
{
"epoch": 2.5765199161425576,
"grad_norm": 0.7193589210510254,
"learning_rate": 3.187379993189889e-05,
"loss": 0.1828,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2016737461090088,
"step": 4915
},
{
"epoch": 2.5791404612159328,
"grad_norm": 0.6822991967201233,
"learning_rate": 3.1852758016304625e-05,
"loss": 0.1698,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1361376941204071,
"step": 4920
},
{
"epoch": 2.581761006289308,
"grad_norm": 0.6136971116065979,
"learning_rate": 3.1831695858861635e-05,
"loss": 0.1793,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18536555767059326,
"step": 4925
},
{
"epoch": 2.5843815513626835,
"grad_norm": 0.7393411993980408,
"learning_rate": 3.181061349553935e-05,
"loss": 0.1828,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18716517090797424,
"step": 4930
},
{
"epoch": 2.5870020964360587,
"grad_norm": 0.7095828056335449,
"learning_rate": 3.178951096234172e-05,
"loss": 0.1614,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.158935546875,
"step": 4935
},
{
"epoch": 2.589622641509434,
"grad_norm": 0.6684853434562683,
"learning_rate": 3.176838829530712e-05,
"loss": 0.1751,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14074695110321045,
"step": 4940
},
{
"epoch": 2.5922431865828095,
"grad_norm": 0.6725984811782837,
"learning_rate": 3.174724553050833e-05,
"loss": 0.186,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19328799843788147,
"step": 4945
},
{
"epoch": 2.5948637316561847,
"grad_norm": 0.6634368896484375,
"learning_rate": 3.172608270405244e-05,
"loss": 0.1759,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18889939785003662,
"step": 4950
},
{
"epoch": 2.59748427672956,
"grad_norm": 0.7839432954788208,
"learning_rate": 3.1704899852080816e-05,
"loss": 0.1771,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15167236328125,
"step": 4955
},
{
"epoch": 2.600104821802935,
"grad_norm": 0.7503156065940857,
"learning_rate": 3.1683697010768995e-05,
"loss": 0.1758,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.146728515625,
"step": 4960
},
{
"epoch": 2.60272536687631,
"grad_norm": 0.5571033954620361,
"learning_rate": 3.166247421632668e-05,
"loss": 0.1674,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17306920886039734,
"step": 4965
},
{
"epoch": 2.6053459119496853,
"grad_norm": 0.6313229203224182,
"learning_rate": 3.1641231504997624e-05,
"loss": 0.1832,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20780566334724426,
"step": 4970
},
{
"epoch": 2.607966457023061,
"grad_norm": 0.689773678779602,
"learning_rate": 3.161996891305962e-05,
"loss": 0.2011,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20171569287776947,
"step": 4975
},
{
"epoch": 2.610587002096436,
"grad_norm": 0.5588131546974182,
"learning_rate": 3.15986864768244e-05,
"loss": 0.1743,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21974435448646545,
"step": 4980
},
{
"epoch": 2.6132075471698113,
"grad_norm": 0.5979903340339661,
"learning_rate": 3.1577384232637575e-05,
"loss": 0.1797,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16760723292827606,
"step": 4985
},
{
"epoch": 2.6158280922431865,
"grad_norm": 0.6243706345558167,
"learning_rate": 3.15560622168786e-05,
"loss": 0.1713,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16010913252830505,
"step": 4990
},
{
"epoch": 2.618448637316562,
"grad_norm": 0.7762710452079773,
"learning_rate": 3.1534720465960694e-05,
"loss": 0.1827,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20435717701911926,
"step": 4995
},
{
"epoch": 2.6210691823899372,
"grad_norm": 0.7093287706375122,
"learning_rate": 3.151335901633077e-05,
"loss": 0.1817,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20002923905849457,
"step": 5000
},
{
"epoch": 2.6236897274633124,
"grad_norm": 0.7017514705657959,
"learning_rate": 3.1491977904469384e-05,
"loss": 0.1895,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17623648047447205,
"step": 5005
},
{
"epoch": 2.6263102725366876,
"grad_norm": 0.5963422060012817,
"learning_rate": 3.147057716689068e-05,
"loss": 0.1818,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18271183967590332,
"step": 5010
},
{
"epoch": 2.6289308176100628,
"grad_norm": 0.5557655096054077,
"learning_rate": 3.14491568401423e-05,
"loss": 0.1917,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16214272379875183,
"step": 5015
},
{
"epoch": 2.631551362683438,
"grad_norm": 0.7692015171051025,
"learning_rate": 3.142771696080536e-05,
"loss": 0.179,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16943359375,
"step": 5020
},
{
"epoch": 2.6341719077568135,
"grad_norm": 0.6690882444381714,
"learning_rate": 3.140625756549436e-05,
"loss": 0.1832,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18332576751708984,
"step": 5025
},
{
"epoch": 2.6367924528301887,
"grad_norm": 0.733025074005127,
"learning_rate": 3.138477869085712e-05,
"loss": 0.1763,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.197998046875,
"step": 5030
},
{
"epoch": 2.639412997903564,
"grad_norm": 0.6765258312225342,
"learning_rate": 3.1363280373574744e-05,
"loss": 0.1795,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19180743396282196,
"step": 5035
},
{
"epoch": 2.642033542976939,
"grad_norm": 0.6777477264404297,
"learning_rate": 3.134176265036153e-05,
"loss": 0.1869,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1671317219734192,
"step": 5040
},
{
"epoch": 2.6446540880503147,
"grad_norm": 0.8515976071357727,
"learning_rate": 3.1320225557964896e-05,
"loss": 0.183,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.161865234375,
"step": 5045
},
{
"epoch": 2.64727463312369,
"grad_norm": 0.5167196393013,
"learning_rate": 3.129866913316538e-05,
"loss": 0.184,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21703512966632843,
"step": 5050
},
{
"epoch": 2.649895178197065,
"grad_norm": 0.6473361253738403,
"learning_rate": 3.127709341277651e-05,
"loss": 0.1802,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18052998185157776,
"step": 5055
},
{
"epoch": 2.65251572327044,
"grad_norm": 0.5987588167190552,
"learning_rate": 3.125549843364477e-05,
"loss": 0.1859,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20642319321632385,
"step": 5060
},
{
"epoch": 2.6551362683438153,
"grad_norm": 0.5620844960212708,
"learning_rate": 3.1233884232649534e-05,
"loss": 0.1877,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19808673858642578,
"step": 5065
},
{
"epoch": 2.6577568134171905,
"grad_norm": 0.7065621018409729,
"learning_rate": 3.1212250846703e-05,
"loss": 0.1898,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.155029296875,
"step": 5070
},
{
"epoch": 2.660377358490566,
"grad_norm": 0.6492888927459717,
"learning_rate": 3.1190598312750145e-05,
"loss": 0.1729,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2139112949371338,
"step": 5075
},
{
"epoch": 2.6629979035639413,
"grad_norm": 0.7838486433029175,
"learning_rate": 3.116892666776861e-05,
"loss": 0.1805,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18071314692497253,
"step": 5080
},
{
"epoch": 2.6656184486373165,
"grad_norm": 0.6746211647987366,
"learning_rate": 3.114723594876872e-05,
"loss": 0.1799,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16943359375,
"step": 5085
},
{
"epoch": 2.668238993710692,
"grad_norm": 0.5902780294418335,
"learning_rate": 3.112552619279335e-05,
"loss": 0.1627,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1680593490600586,
"step": 5090
},
{
"epoch": 2.6708595387840672,
"grad_norm": 0.6151570677757263,
"learning_rate": 3.1103797436917874e-05,
"loss": 0.1832,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2051677107810974,
"step": 5095
},
{
"epoch": 2.6734800838574424,
"grad_norm": 0.6031655669212341,
"learning_rate": 3.108204971825013e-05,
"loss": 0.1811,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.202290877699852,
"step": 5100
},
{
"epoch": 2.6761006289308176,
"grad_norm": 0.7880269289016724,
"learning_rate": 3.106028307393034e-05,
"loss": 0.1813,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19555190205574036,
"step": 5105
},
{
"epoch": 2.6787211740041927,
"grad_norm": 0.5152765512466431,
"learning_rate": 3.103849754113106e-05,
"loss": 0.1666,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1540350317955017,
"step": 5110
},
{
"epoch": 2.681341719077568,
"grad_norm": 0.5880343914031982,
"learning_rate": 3.101669315705706e-05,
"loss": 0.1736,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18420341610908508,
"step": 5115
},
{
"epoch": 2.6839622641509435,
"grad_norm": 0.7327151894569397,
"learning_rate": 3.099486995894535e-05,
"loss": 0.1902,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21202674508094788,
"step": 5120
},
{
"epoch": 2.6865828092243187,
"grad_norm": 0.6346898078918457,
"learning_rate": 3.097302798406504e-05,
"loss": 0.1915,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19018851220607758,
"step": 5125
},
{
"epoch": 2.689203354297694,
"grad_norm": 0.5346789360046387,
"learning_rate": 3.0951167269717326e-05,
"loss": 0.1935,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19498604536056519,
"step": 5130
},
{
"epoch": 2.691823899371069,
"grad_norm": 0.6258835792541504,
"learning_rate": 3.092928785323539e-05,
"loss": 0.1711,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18919895589351654,
"step": 5135
},
{
"epoch": 2.6944444444444446,
"grad_norm": 0.6863317489624023,
"learning_rate": 3.090738977198437e-05,
"loss": 0.168,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16847869753837585,
"step": 5140
},
{
"epoch": 2.69706498951782,
"grad_norm": 0.5800849199295044,
"learning_rate": 3.088547306336126e-05,
"loss": 0.179,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17609632015228271,
"step": 5145
},
{
"epoch": 2.699685534591195,
"grad_norm": 0.636333167552948,
"learning_rate": 3.08635377647949e-05,
"loss": 0.1696,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19171679019927979,
"step": 5150
},
{
"epoch": 2.70230607966457,
"grad_norm": 0.6417024731636047,
"learning_rate": 3.084158391374583e-05,
"loss": 0.1658,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18236128985881805,
"step": 5155
},
{
"epoch": 2.7049266247379453,
"grad_norm": 0.5931425094604492,
"learning_rate": 3.08196115477063e-05,
"loss": 0.1753,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17998279631137848,
"step": 5160
},
{
"epoch": 2.7075471698113205,
"grad_norm": 0.5689296126365662,
"learning_rate": 3.0797620704200186e-05,
"loss": 0.159,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15252912044525146,
"step": 5165
},
{
"epoch": 2.710167714884696,
"grad_norm": 0.7568827271461487,
"learning_rate": 3.07756114207829e-05,
"loss": 0.1622,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.157958984375,
"step": 5170
},
{
"epoch": 2.7127882599580713,
"grad_norm": 0.6383182406425476,
"learning_rate": 3.0753583735041365e-05,
"loss": 0.1955,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22475168108940125,
"step": 5175
},
{
"epoch": 2.7154088050314464,
"grad_norm": 0.7478203773498535,
"learning_rate": 3.073153768459391e-05,
"loss": 0.1759,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1389104276895523,
"step": 5180
},
{
"epoch": 2.718029350104822,
"grad_norm": 0.8033480644226074,
"learning_rate": 3.0709473307090244e-05,
"loss": 0.1814,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17884859442710876,
"step": 5185
},
{
"epoch": 2.720649895178197,
"grad_norm": 0.6497260332107544,
"learning_rate": 3.0687390640211374e-05,
"loss": 0.1926,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20827674865722656,
"step": 5190
},
{
"epoch": 2.7232704402515724,
"grad_norm": 0.5410135388374329,
"learning_rate": 3.0665289721669526e-05,
"loss": 0.1703,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1410600244998932,
"step": 5195
},
{
"epoch": 2.7258909853249476,
"grad_norm": 0.661300003528595,
"learning_rate": 3.064317058920811e-05,
"loss": 0.1626,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15847226977348328,
"step": 5200
},
{
"epoch": 2.7285115303983227,
"grad_norm": 0.6560903787612915,
"learning_rate": 3.062103328060164e-05,
"loss": 0.1691,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16378501057624817,
"step": 5205
},
{
"epoch": 2.731132075471698,
"grad_norm": 0.5811970233917236,
"learning_rate": 3.0598877833655654e-05,
"loss": 0.1674,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17363318800926208,
"step": 5210
},
{
"epoch": 2.7337526205450735,
"grad_norm": 0.5654155015945435,
"learning_rate": 3.057670428620669e-05,
"loss": 0.1885,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16175119578838348,
"step": 5215
},
{
"epoch": 2.7363731656184487,
"grad_norm": 0.6081898212432861,
"learning_rate": 3.0554512676122196e-05,
"loss": 0.1812,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16766831278800964,
"step": 5220
},
{
"epoch": 2.738993710691824,
"grad_norm": 0.6194442510604858,
"learning_rate": 3.053230304130043e-05,
"loss": 0.1618,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15402266383171082,
"step": 5225
},
{
"epoch": 2.741614255765199,
"grad_norm": 0.6389869451522827,
"learning_rate": 3.0510075419670496e-05,
"loss": 0.1867,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1833145022392273,
"step": 5230
},
{
"epoch": 2.7442348008385746,
"grad_norm": 0.6671640276908875,
"learning_rate": 3.048782984919215e-05,
"loss": 0.167,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1536865234375,
"step": 5235
},
{
"epoch": 2.74685534591195,
"grad_norm": 0.6398336887359619,
"learning_rate": 3.0465566367855847e-05,
"loss": 0.1835,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20122002065181732,
"step": 5240
},
{
"epoch": 2.749475890985325,
"grad_norm": 0.6063717007637024,
"learning_rate": 3.044328501368261e-05,
"loss": 0.1724,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22785818576812744,
"step": 5245
},
{
"epoch": 2.7520964360587,
"grad_norm": 0.6103911399841309,
"learning_rate": 3.0420985824723984e-05,
"loss": 0.162,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18639829754829407,
"step": 5250
},
{
"epoch": 2.7547169811320753,
"grad_norm": 0.6719363927841187,
"learning_rate": 3.0398668839061978e-05,
"loss": 0.1801,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18195107579231262,
"step": 5255
},
{
"epoch": 2.7573375262054505,
"grad_norm": 0.6338904500007629,
"learning_rate": 3.037633409480899e-05,
"loss": 0.1782,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17371147871017456,
"step": 5260
},
{
"epoch": 2.759958071278826,
"grad_norm": 0.6469314098358154,
"learning_rate": 3.0353981630107748e-05,
"loss": 0.1709,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13950802385807037,
"step": 5265
},
{
"epoch": 2.7625786163522013,
"grad_norm": 0.6446741819381714,
"learning_rate": 3.0331611483131245e-05,
"loss": 0.1882,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1679340898990631,
"step": 5270
},
{
"epoch": 2.7651991614255764,
"grad_norm": 0.7273684144020081,
"learning_rate": 3.0309223692082663e-05,
"loss": 0.1651,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13474009931087494,
"step": 5275
},
{
"epoch": 2.767819706498952,
"grad_norm": 0.6342905759811401,
"learning_rate": 3.028681829519532e-05,
"loss": 0.1864,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21595671772956848,
"step": 5280
},
{
"epoch": 2.770440251572327,
"grad_norm": 0.7033205032348633,
"learning_rate": 3.026439533073261e-05,
"loss": 0.1768,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20636935532093048,
"step": 5285
},
{
"epoch": 2.7730607966457024,
"grad_norm": 0.6942194700241089,
"learning_rate": 3.0241954836987916e-05,
"loss": 0.1695,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1712348610162735,
"step": 5290
},
{
"epoch": 2.7756813417190775,
"grad_norm": 0.5782949924468994,
"learning_rate": 3.0219496852284558e-05,
"loss": 0.1783,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18249915540218353,
"step": 5295
},
{
"epoch": 2.7783018867924527,
"grad_norm": 0.6877314448356628,
"learning_rate": 3.0197021414975735e-05,
"loss": 0.1678,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.166259765625,
"step": 5300
},
{
"epoch": 2.780922431865828,
"grad_norm": 0.6748788952827454,
"learning_rate": 3.0174528563444447e-05,
"loss": 0.1734,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14892578125,
"step": 5305
},
{
"epoch": 2.7835429769392035,
"grad_norm": 0.8148460984230042,
"learning_rate": 3.0152018336103427e-05,
"loss": 0.1756,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14063332974910736,
"step": 5310
},
{
"epoch": 2.7861635220125787,
"grad_norm": 1.9282485246658325,
"learning_rate": 3.0129490771395086e-05,
"loss": 0.1776,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19029554724693298,
"step": 5315
},
{
"epoch": 2.788784067085954,
"grad_norm": 0.6980833411216736,
"learning_rate": 3.0106945907791455e-05,
"loss": 0.1888,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.147705078125,
"step": 5320
},
{
"epoch": 2.791404612159329,
"grad_norm": 0.5489820241928101,
"learning_rate": 3.0084383783794094e-05,
"loss": 0.1751,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1689077913761139,
"step": 5325
},
{
"epoch": 2.7940251572327046,
"grad_norm": 0.6219632029533386,
"learning_rate": 3.0061804437934037e-05,
"loss": 0.1762,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18127012252807617,
"step": 5330
},
{
"epoch": 2.79664570230608,
"grad_norm": 0.5704904794692993,
"learning_rate": 3.0039207908771747e-05,
"loss": 0.1701,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17348316311836243,
"step": 5335
},
{
"epoch": 2.799266247379455,
"grad_norm": 0.6238038539886475,
"learning_rate": 3.0016594234897015e-05,
"loss": 0.1823,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.156494140625,
"step": 5340
},
{
"epoch": 2.80188679245283,
"grad_norm": 0.6026485562324524,
"learning_rate": 2.9993963454928914e-05,
"loss": 0.1852,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18142390251159668,
"step": 5345
},
{
"epoch": 2.8045073375262053,
"grad_norm": 0.7556951642036438,
"learning_rate": 2.997131560751574e-05,
"loss": 0.164,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1475830078125,
"step": 5350
},
{
"epoch": 2.8071278825995805,
"grad_norm": 0.6020495295524597,
"learning_rate": 2.994865073133492e-05,
"loss": 0.1864,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20331259071826935,
"step": 5355
},
{
"epoch": 2.809748427672956,
"grad_norm": 0.582724928855896,
"learning_rate": 2.9925968865092994e-05,
"loss": 0.1591,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16749607026576996,
"step": 5360
},
{
"epoch": 2.8123689727463312,
"grad_norm": 0.5949836373329163,
"learning_rate": 2.9903270047525467e-05,
"loss": 0.1806,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19134676456451416,
"step": 5365
},
{
"epoch": 2.8149895178197064,
"grad_norm": 0.5697270035743713,
"learning_rate": 2.9880554317396843e-05,
"loss": 0.1871,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21274207532405853,
"step": 5370
},
{
"epoch": 2.817610062893082,
"grad_norm": 0.5818173885345459,
"learning_rate": 2.985782171350048e-05,
"loss": 0.1716,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16496126353740692,
"step": 5375
},
{
"epoch": 2.820230607966457,
"grad_norm": 0.9107900261878967,
"learning_rate": 2.9835072274658556e-05,
"loss": 0.1782,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1845887154340744,
"step": 5380
},
{
"epoch": 2.8228511530398324,
"grad_norm": 0.6681698560714722,
"learning_rate": 2.9812306039722016e-05,
"loss": 0.171,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14111310243606567,
"step": 5385
},
{
"epoch": 2.8254716981132075,
"grad_norm": 0.58545982837677,
"learning_rate": 2.978952304757045e-05,
"loss": 0.1884,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17585648596286774,
"step": 5390
},
{
"epoch": 2.8280922431865827,
"grad_norm": 0.5710182189941406,
"learning_rate": 2.9766723337112124e-05,
"loss": 0.1856,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1905125230550766,
"step": 5395
},
{
"epoch": 2.830712788259958,
"grad_norm": 0.6644915342330933,
"learning_rate": 2.974390694728381e-05,
"loss": 0.1737,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18250641226768494,
"step": 5400
},
{
"epoch": 2.8333333333333335,
"grad_norm": 0.7214245200157166,
"learning_rate": 2.972107391705077e-05,
"loss": 0.1482,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1199951171875,
"step": 5405
},
{
"epoch": 2.8359538784067087,
"grad_norm": 0.5463569164276123,
"learning_rate": 2.9698224285406697e-05,
"loss": 0.1874,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22129079699516296,
"step": 5410
},
{
"epoch": 2.838574423480084,
"grad_norm": 0.6725033521652222,
"learning_rate": 2.9675358091373634e-05,
"loss": 0.1566,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16396062076091766,
"step": 5415
},
{
"epoch": 2.841194968553459,
"grad_norm": 0.7897777557373047,
"learning_rate": 2.9652475374001898e-05,
"loss": 0.1646,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1701788604259491,
"step": 5420
},
{
"epoch": 2.8438155136268346,
"grad_norm": 0.6413025856018066,
"learning_rate": 2.9629576172370035e-05,
"loss": 0.1797,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18991181254386902,
"step": 5425
},
{
"epoch": 2.8464360587002098,
"grad_norm": 0.5760217905044556,
"learning_rate": 2.960666052558474e-05,
"loss": 0.1713,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16620710492134094,
"step": 5430
},
{
"epoch": 2.849056603773585,
"grad_norm": 0.5823150873184204,
"learning_rate": 2.9583728472780787e-05,
"loss": 0.1664,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1885368973016739,
"step": 5435
},
{
"epoch": 2.85167714884696,
"grad_norm": 0.5929847359657288,
"learning_rate": 2.9560780053120982e-05,
"loss": 0.1579,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1745993196964264,
"step": 5440
},
{
"epoch": 2.8542976939203353,
"grad_norm": 0.5878052711486816,
"learning_rate": 2.9537815305796056e-05,
"loss": 0.1917,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2162623107433319,
"step": 5445
},
{
"epoch": 2.8569182389937104,
"grad_norm": 0.595242977142334,
"learning_rate": 2.951483427002465e-05,
"loss": 0.1645,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16339543461799622,
"step": 5450
},
{
"epoch": 2.859538784067086,
"grad_norm": 0.6756539940834045,
"learning_rate": 2.9491836985053215e-05,
"loss": 0.1804,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19334867596626282,
"step": 5455
},
{
"epoch": 2.8621593291404612,
"grad_norm": 0.6055642366409302,
"learning_rate": 2.946882349015594e-05,
"loss": 0.1744,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.22528602182865143,
"step": 5460
},
{
"epoch": 2.8647798742138364,
"grad_norm": 0.6377682089805603,
"learning_rate": 2.9445793824634715e-05,
"loss": 0.199,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17041015625,
"step": 5465
},
{
"epoch": 2.867400419287212,
"grad_norm": 0.6229009032249451,
"learning_rate": 2.9422748027819025e-05,
"loss": 0.1665,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.162109375,
"step": 5470
},
{
"epoch": 2.870020964360587,
"grad_norm": 0.6600066423416138,
"learning_rate": 2.9399686139065924e-05,
"loss": 0.1772,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2039642333984375,
"step": 5475
},
{
"epoch": 2.8726415094339623,
"grad_norm": 0.5950238704681396,
"learning_rate": 2.9376608197759934e-05,
"loss": 0.1907,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16557833552360535,
"step": 5480
},
{
"epoch": 2.8752620545073375,
"grad_norm": 0.7862025499343872,
"learning_rate": 2.9353514243313004e-05,
"loss": 0.1623,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14168497920036316,
"step": 5485
},
{
"epoch": 2.8778825995807127,
"grad_norm": 0.6496447324752808,
"learning_rate": 2.9330404315164413e-05,
"loss": 0.1996,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2142472267150879,
"step": 5490
},
{
"epoch": 2.880503144654088,
"grad_norm": 0.6302199959754944,
"learning_rate": 2.9307278452780726e-05,
"loss": 0.1654,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13268312811851501,
"step": 5495
},
{
"epoch": 2.8831236897274635,
"grad_norm": 0.8944174647331238,
"learning_rate": 2.928413669565573e-05,
"loss": 0.1755,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16154518723487854,
"step": 5500
},
{
"epoch": 2.8857442348008386,
"grad_norm": 0.6077420711517334,
"learning_rate": 2.9260979083310345e-05,
"loss": 0.1724,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16899073123931885,
"step": 5505
},
{
"epoch": 2.888364779874214,
"grad_norm": 0.7525168061256409,
"learning_rate": 2.9237805655292572e-05,
"loss": 0.1757,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14932957291603088,
"step": 5510
},
{
"epoch": 2.890985324947589,
"grad_norm": 0.7273001074790955,
"learning_rate": 2.921461645117743e-05,
"loss": 0.1752,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17291836440563202,
"step": 5515
},
{
"epoch": 2.8936058700209646,
"grad_norm": 0.6908173561096191,
"learning_rate": 2.9191411510566852e-05,
"loss": 0.1871,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1522650569677353,
"step": 5520
},
{
"epoch": 2.8962264150943398,
"grad_norm": 0.6004059910774231,
"learning_rate": 2.9168190873089685e-05,
"loss": 0.1668,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18317842483520508,
"step": 5525
},
{
"epoch": 2.898846960167715,
"grad_norm": 0.7183316349983215,
"learning_rate": 2.9144954578401558e-05,
"loss": 0.1937,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.158203125,
"step": 5530
},
{
"epoch": 2.90146750524109,
"grad_norm": 0.6961800456047058,
"learning_rate": 2.912170266618483e-05,
"loss": 0.1768,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18650424480438232,
"step": 5535
},
{
"epoch": 2.9040880503144653,
"grad_norm": 0.9425960183143616,
"learning_rate": 2.9098435176148567e-05,
"loss": 0.1725,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20058107376098633,
"step": 5540
},
{
"epoch": 2.9067085953878404,
"grad_norm": 0.5319423675537109,
"learning_rate": 2.9075152148028394e-05,
"loss": 0.1646,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13040751218795776,
"step": 5545
},
{
"epoch": 2.909329140461216,
"grad_norm": 0.6470221877098083,
"learning_rate": 2.9051853621586513e-05,
"loss": 0.1841,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20197980105876923,
"step": 5550
},
{
"epoch": 2.911949685534591,
"grad_norm": 0.7041011452674866,
"learning_rate": 2.9028539636611567e-05,
"loss": 0.1671,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18308019638061523,
"step": 5555
},
{
"epoch": 2.9145702306079664,
"grad_norm": 0.6379269957542419,
"learning_rate": 2.9005210232918596e-05,
"loss": 0.1714,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1962052583694458,
"step": 5560
},
{
"epoch": 2.917190775681342,
"grad_norm": 0.6516327261924744,
"learning_rate": 2.8981865450349006e-05,
"loss": 0.1909,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18401208519935608,
"step": 5565
},
{
"epoch": 2.919811320754717,
"grad_norm": 0.5459520220756531,
"learning_rate": 2.8958505328770415e-05,
"loss": 0.2012,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23481181263923645,
"step": 5570
},
{
"epoch": 2.9224318658280923,
"grad_norm": 0.7334117889404297,
"learning_rate": 2.893512990807669e-05,
"loss": 0.1936,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19439546763896942,
"step": 5575
},
{
"epoch": 2.9250524109014675,
"grad_norm": 0.6233519315719604,
"learning_rate": 2.8911739228187782e-05,
"loss": 0.1503,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1621095985174179,
"step": 5580
},
{
"epoch": 2.9276729559748427,
"grad_norm": 0.5482489466667175,
"learning_rate": 2.8888333329049728e-05,
"loss": 0.1725,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19045138359069824,
"step": 5585
},
{
"epoch": 2.930293501048218,
"grad_norm": 0.6794843077659607,
"learning_rate": 2.8864912250634543e-05,
"loss": 0.1766,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16787664592266083,
"step": 5590
},
{
"epoch": 2.9329140461215935,
"grad_norm": 0.5271611213684082,
"learning_rate": 2.8841476032940162e-05,
"loss": 0.1553,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15002946555614471,
"step": 5595
},
{
"epoch": 2.9355345911949686,
"grad_norm": 0.5626645684242249,
"learning_rate": 2.88180247159904e-05,
"loss": 0.1659,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15474967658519745,
"step": 5600
},
{
"epoch": 2.938155136268344,
"grad_norm": 0.5897443294525146,
"learning_rate": 2.8794558339834825e-05,
"loss": 0.1713,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16423803567886353,
"step": 5605
},
{
"epoch": 2.940775681341719,
"grad_norm": 0.5693492889404297,
"learning_rate": 2.877107694454874e-05,
"loss": 0.1895,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1666092872619629,
"step": 5610
},
{
"epoch": 2.9433962264150946,
"grad_norm": 0.5590540170669556,
"learning_rate": 2.8747580570233098e-05,
"loss": 0.1765,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1851804554462433,
"step": 5615
},
{
"epoch": 2.9460167714884697,
"grad_norm": 0.5822227001190186,
"learning_rate": 2.8724069257014425e-05,
"loss": 0.1761,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.174712672829628,
"step": 5620
},
{
"epoch": 2.948637316561845,
"grad_norm": 0.5724350214004517,
"learning_rate": 2.8700543045044767e-05,
"loss": 0.1769,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2074946165084839,
"step": 5625
},
{
"epoch": 2.95125786163522,
"grad_norm": 0.6528369188308716,
"learning_rate": 2.8677001974501607e-05,
"loss": 0.1674,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12680822610855103,
"step": 5630
},
{
"epoch": 2.9538784067085953,
"grad_norm": 0.5912206172943115,
"learning_rate": 2.865344608558781e-05,
"loss": 0.186,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1962258368730545,
"step": 5635
},
{
"epoch": 2.9564989517819704,
"grad_norm": 0.9319872856140137,
"learning_rate": 2.8629875418531542e-05,
"loss": 0.1814,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.151123046875,
"step": 5640
},
{
"epoch": 2.959119496855346,
"grad_norm": 0.5565921664237976,
"learning_rate": 2.860629001358621e-05,
"loss": 0.1837,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20103786885738373,
"step": 5645
},
{
"epoch": 2.961740041928721,
"grad_norm": 0.7086861729621887,
"learning_rate": 2.8582689911030383e-05,
"loss": 0.1653,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2012900412082672,
"step": 5650
},
{
"epoch": 2.9643605870020964,
"grad_norm": 0.508179247379303,
"learning_rate": 2.8559075151167745e-05,
"loss": 0.1869,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1763106882572174,
"step": 5655
},
{
"epoch": 2.9669811320754715,
"grad_norm": 0.5380913019180298,
"learning_rate": 2.8535445774326994e-05,
"loss": 0.1653,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20369559526443481,
"step": 5660
},
{
"epoch": 2.969601677148847,
"grad_norm": 0.6445121169090271,
"learning_rate": 2.8511801820861807e-05,
"loss": 0.1866,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1454983353614807,
"step": 5665
},
{
"epoch": 2.9722222222222223,
"grad_norm": 0.6975541114807129,
"learning_rate": 2.8488143331150743e-05,
"loss": 0.1544,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15515579283237457,
"step": 5670
},
{
"epoch": 2.9748427672955975,
"grad_norm": 0.6473163366317749,
"learning_rate": 2.8464470345597184e-05,
"loss": 0.1895,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20755667984485626,
"step": 5675
},
{
"epoch": 2.9774633123689727,
"grad_norm": 0.6191983222961426,
"learning_rate": 2.844078290462928e-05,
"loss": 0.1623,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17519129812717438,
"step": 5680
},
{
"epoch": 2.980083857442348,
"grad_norm": 0.6182118654251099,
"learning_rate": 2.8417081048699855e-05,
"loss": 0.1912,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17867231369018555,
"step": 5685
},
{
"epoch": 2.982704402515723,
"grad_norm": 0.6826756596565247,
"learning_rate": 2.8393364818286363e-05,
"loss": 0.1789,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.159912109375,
"step": 5690
},
{
"epoch": 2.9853249475890986,
"grad_norm": 0.5628302097320557,
"learning_rate": 2.8369634253890797e-05,
"loss": 0.1651,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1419781893491745,
"step": 5695
},
{
"epoch": 2.987945492662474,
"grad_norm": 0.6937417387962341,
"learning_rate": 2.8345889396039615e-05,
"loss": 0.1657,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15064573287963867,
"step": 5700
},
{
"epoch": 2.990566037735849,
"grad_norm": 0.5172299742698669,
"learning_rate": 2.8322130285283725e-05,
"loss": 0.1806,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2054455578327179,
"step": 5705
},
{
"epoch": 2.9931865828092246,
"grad_norm": 0.6245455741882324,
"learning_rate": 2.829835696219834e-05,
"loss": 0.1897,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1823698729276657,
"step": 5710
},
{
"epoch": 2.9958071278825997,
"grad_norm": 0.6723186373710632,
"learning_rate": 2.8274569467382962e-05,
"loss": 0.1753,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17229504883289337,
"step": 5715
},
{
"epoch": 2.998427672955975,
"grad_norm": 0.87140291929245,
"learning_rate": 2.8250767841461283e-05,
"loss": 0.1684,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17452089488506317,
"step": 5720
},
{
"epoch": 3.001572327044025,
"grad_norm": 0.6093711853027344,
"learning_rate": 2.822695212508114e-05,
"loss": 0.1416,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15838420391082764,
"step": 5725
},
{
"epoch": 3.0041928721174003,
"grad_norm": 0.6168466210365295,
"learning_rate": 2.820312235891443e-05,
"loss": 0.1788,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1453549861907959,
"step": 5730
},
{
"epoch": 3.006813417190776,
"grad_norm": 0.6319136619567871,
"learning_rate": 2.8179278583657034e-05,
"loss": 0.1445,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.165153369307518,
"step": 5735
},
{
"epoch": 3.009433962264151,
"grad_norm": 0.6942062973976135,
"learning_rate": 2.8155420840028767e-05,
"loss": 0.1551,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.125732421875,
"step": 5740
},
{
"epoch": 3.012054507337526,
"grad_norm": 0.7496764659881592,
"learning_rate": 2.81315491687733e-05,
"loss": 0.1497,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1387939453125,
"step": 5745
},
{
"epoch": 3.0146750524109014,
"grad_norm": 0.7578096985816956,
"learning_rate": 2.8107663610658087e-05,
"loss": 0.1531,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1143798828125,
"step": 5750
},
{
"epoch": 3.0172955974842766,
"grad_norm": 0.5947731137275696,
"learning_rate": 2.80837642064743e-05,
"loss": 0.1244,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13372820615768433,
"step": 5755
},
{
"epoch": 3.019916142557652,
"grad_norm": 0.7277220487594604,
"learning_rate": 2.8059850997036745e-05,
"loss": 0.1628,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15905359387397766,
"step": 5760
},
{
"epoch": 3.0225366876310273,
"grad_norm": 0.558275580406189,
"learning_rate": 2.8035924023183816e-05,
"loss": 0.1528,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15842102468013763,
"step": 5765
},
{
"epoch": 3.0251572327044025,
"grad_norm": 0.7598258852958679,
"learning_rate": 2.8011983325777415e-05,
"loss": 0.1481,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1642048954963684,
"step": 5770
},
{
"epoch": 3.0277777777777777,
"grad_norm": 0.7024770379066467,
"learning_rate": 2.7988028945702874e-05,
"loss": 0.1359,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12566912174224854,
"step": 5775
},
{
"epoch": 3.030398322851153,
"grad_norm": 0.6844687461853027,
"learning_rate": 2.7964060923868888e-05,
"loss": 0.159,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14308127760887146,
"step": 5780
},
{
"epoch": 3.0330188679245285,
"grad_norm": 0.7333658337593079,
"learning_rate": 2.794007930120747e-05,
"loss": 0.1508,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13852116465568542,
"step": 5785
},
{
"epoch": 3.0356394129979036,
"grad_norm": 0.7345966100692749,
"learning_rate": 2.791608411867383e-05,
"loss": 0.1697,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1815803349018097,
"step": 5790
},
{
"epoch": 3.038259958071279,
"grad_norm": 0.5574893355369568,
"learning_rate": 2.789207541724636e-05,
"loss": 0.1569,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17574036121368408,
"step": 5795
},
{
"epoch": 3.040880503144654,
"grad_norm": 0.7342729568481445,
"learning_rate": 2.7868053237926527e-05,
"loss": 0.1498,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1334531009197235,
"step": 5800
},
{
"epoch": 3.043501048218029,
"grad_norm": 0.8768147826194763,
"learning_rate": 2.784401762173882e-05,
"loss": 0.13,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12249755859375,
"step": 5805
},
{
"epoch": 3.0461215932914047,
"grad_norm": 0.7460582256317139,
"learning_rate": 2.7819968609730677e-05,
"loss": 0.1546,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1651483029127121,
"step": 5810
},
{
"epoch": 3.04874213836478,
"grad_norm": 0.6598678827285767,
"learning_rate": 2.7795906242972396e-05,
"loss": 0.1402,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13518714904785156,
"step": 5815
},
{
"epoch": 3.051362683438155,
"grad_norm": 0.8546496033668518,
"learning_rate": 2.7771830562557104e-05,
"loss": 0.1446,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11572265625,
"step": 5820
},
{
"epoch": 3.0539832285115303,
"grad_norm": 0.6218474507331848,
"learning_rate": 2.774774160960066e-05,
"loss": 0.1598,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17280012369155884,
"step": 5825
},
{
"epoch": 3.056603773584906,
"grad_norm": 0.9477034211158752,
"learning_rate": 2.7723639425241585e-05,
"loss": 0.1439,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1212158203125,
"step": 5830
},
{
"epoch": 3.059224318658281,
"grad_norm": 0.7466812133789062,
"learning_rate": 2.769952405064099e-05,
"loss": 0.15,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14890992641448975,
"step": 5835
},
{
"epoch": 3.061844863731656,
"grad_norm": 0.7301759719848633,
"learning_rate": 2.767539552698252e-05,
"loss": 0.1327,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15387779474258423,
"step": 5840
},
{
"epoch": 3.0644654088050314,
"grad_norm": 0.6939190030097961,
"learning_rate": 2.7651253895472284e-05,
"loss": 0.1638,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12840767204761505,
"step": 5845
},
{
"epoch": 3.0670859538784065,
"grad_norm": 0.5036671757698059,
"learning_rate": 2.7627099197338757e-05,
"loss": 0.1701,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19116505980491638,
"step": 5850
},
{
"epoch": 3.069706498951782,
"grad_norm": 0.6274353861808777,
"learning_rate": 2.7602931473832736e-05,
"loss": 0.1669,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1835746318101883,
"step": 5855
},
{
"epoch": 3.0723270440251573,
"grad_norm": 0.7170372605323792,
"learning_rate": 2.7578750766227272e-05,
"loss": 0.1611,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1607617437839508,
"step": 5860
},
{
"epoch": 3.0749475890985325,
"grad_norm": 0.5228092074394226,
"learning_rate": 2.7554557115817588e-05,
"loss": 0.1605,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15039002895355225,
"step": 5865
},
{
"epoch": 3.0775681341719077,
"grad_norm": 0.6270290017127991,
"learning_rate": 2.753035056392099e-05,
"loss": 0.1513,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14429956674575806,
"step": 5870
},
{
"epoch": 3.080188679245283,
"grad_norm": 0.7377024292945862,
"learning_rate": 2.750613115187685e-05,
"loss": 0.1174,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1204833984375,
"step": 5875
},
{
"epoch": 3.0828092243186584,
"grad_norm": 0.7889018654823303,
"learning_rate": 2.7481898921046462e-05,
"loss": 0.1323,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10128215700387955,
"step": 5880
},
{
"epoch": 3.0854297693920336,
"grad_norm": 0.8123045563697815,
"learning_rate": 2.745765391281306e-05,
"loss": 0.1408,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1146240234375,
"step": 5885
},
{
"epoch": 3.088050314465409,
"grad_norm": 0.5776121616363525,
"learning_rate": 2.7433396168581654e-05,
"loss": 0.1621,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17960049211978912,
"step": 5890
},
{
"epoch": 3.090670859538784,
"grad_norm": 0.72197026014328,
"learning_rate": 2.740912572977903e-05,
"loss": 0.1402,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14819848537445068,
"step": 5895
},
{
"epoch": 3.093291404612159,
"grad_norm": 0.6737906336784363,
"learning_rate": 2.738484263785365e-05,
"loss": 0.1372,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1411348134279251,
"step": 5900
},
{
"epoch": 3.0959119496855347,
"grad_norm": 0.6078870892524719,
"learning_rate": 2.736054693427557e-05,
"loss": 0.1614,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18547850847244263,
"step": 5905
},
{
"epoch": 3.09853249475891,
"grad_norm": 0.6489272713661194,
"learning_rate": 2.7336238660536413e-05,
"loss": 0.15,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14268571138381958,
"step": 5910
},
{
"epoch": 3.101153039832285,
"grad_norm": 0.6875291466712952,
"learning_rate": 2.7311917858149226e-05,
"loss": 0.1626,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17005501687526703,
"step": 5915
},
{
"epoch": 3.1037735849056602,
"grad_norm": 0.7525482773780823,
"learning_rate": 2.7287584568648507e-05,
"loss": 0.1463,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15657764673233032,
"step": 5920
},
{
"epoch": 3.1063941299790354,
"grad_norm": 0.6066417694091797,
"learning_rate": 2.726323883359003e-05,
"loss": 0.1491,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15884283185005188,
"step": 5925
},
{
"epoch": 3.109014675052411,
"grad_norm": 0.6516925096511841,
"learning_rate": 2.723888069455084e-05,
"loss": 0.1502,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12646484375,
"step": 5930
},
{
"epoch": 3.111635220125786,
"grad_norm": 0.6193258166313171,
"learning_rate": 2.7214510193129186e-05,
"loss": 0.1591,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18081727623939514,
"step": 5935
},
{
"epoch": 3.1142557651991614,
"grad_norm": 0.6331426501274109,
"learning_rate": 2.719012737094439e-05,
"loss": 0.1487,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14574037492275238,
"step": 5940
},
{
"epoch": 3.1168763102725365,
"grad_norm": 0.801331639289856,
"learning_rate": 2.7165732269636863e-05,
"loss": 0.1364,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.109619140625,
"step": 5945
},
{
"epoch": 3.119496855345912,
"grad_norm": 0.7135635614395142,
"learning_rate": 2.714132493086793e-05,
"loss": 0.1685,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16051837801933289,
"step": 5950
},
{
"epoch": 3.1221174004192873,
"grad_norm": 0.6845545172691345,
"learning_rate": 2.7116905396319863e-05,
"loss": 0.1537,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18284091353416443,
"step": 5955
},
{
"epoch": 3.1247379454926625,
"grad_norm": 0.8440403342247009,
"learning_rate": 2.7092473707695737e-05,
"loss": 0.1544,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16937442123889923,
"step": 5960
},
{
"epoch": 3.1273584905660377,
"grad_norm": 0.6862518787384033,
"learning_rate": 2.706802990671939e-05,
"loss": 0.1583,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1287841796875,
"step": 5965
},
{
"epoch": 3.129979035639413,
"grad_norm": 0.8747920393943787,
"learning_rate": 2.704357403513534e-05,
"loss": 0.1488,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14729902148246765,
"step": 5970
},
{
"epoch": 3.1325995807127884,
"grad_norm": 0.8272216320037842,
"learning_rate": 2.701910613470873e-05,
"loss": 0.1534,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1635628640651703,
"step": 5975
},
{
"epoch": 3.1352201257861636,
"grad_norm": 0.6622633337974548,
"learning_rate": 2.699462624722523e-05,
"loss": 0.1574,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16519419848918915,
"step": 5980
},
{
"epoch": 3.1378406708595388,
"grad_norm": 0.6107540726661682,
"learning_rate": 2.6970134414491e-05,
"loss": 0.155,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17579969763755798,
"step": 5985
},
{
"epoch": 3.140461215932914,
"grad_norm": 0.734446108341217,
"learning_rate": 2.6945630678332584e-05,
"loss": 0.1342,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15087890625,
"step": 5990
},
{
"epoch": 3.143081761006289,
"grad_norm": 0.7071894407272339,
"learning_rate": 2.692111508059686e-05,
"loss": 0.1557,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15364445745944977,
"step": 5995
},
{
"epoch": 3.1457023060796647,
"grad_norm": 0.5757654905319214,
"learning_rate": 2.6896587663150965e-05,
"loss": 0.1566,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16788052022457123,
"step": 6000
},
{
"epoch": 3.14832285115304,
"grad_norm": 0.6631264090538025,
"learning_rate": 2.687204846788222e-05,
"loss": 0.1484,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13493452966213226,
"step": 6005
},
{
"epoch": 3.150943396226415,
"grad_norm": 0.7087613344192505,
"learning_rate": 2.6847497536698058e-05,
"loss": 0.1547,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16542352735996246,
"step": 6010
},
{
"epoch": 3.1535639412997902,
"grad_norm": 0.6962686777114868,
"learning_rate": 2.6822934911525958e-05,
"loss": 0.1514,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1555195450782776,
"step": 6015
},
{
"epoch": 3.1561844863731654,
"grad_norm": 0.6705347895622253,
"learning_rate": 2.679836063431336e-05,
"loss": 0.1678,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1522950977087021,
"step": 6020
},
{
"epoch": 3.158805031446541,
"grad_norm": 0.5783252716064453,
"learning_rate": 2.677377474702762e-05,
"loss": 0.1593,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17791618406772614,
"step": 6025
},
{
"epoch": 3.161425576519916,
"grad_norm": 0.6777276396751404,
"learning_rate": 2.6749177291655905e-05,
"loss": 0.161,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15510901808738708,
"step": 6030
},
{
"epoch": 3.1640461215932913,
"grad_norm": 0.6445097327232361,
"learning_rate": 2.6724568310205153e-05,
"loss": 0.1342,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18770700693130493,
"step": 6035
},
{
"epoch": 3.1666666666666665,
"grad_norm": 0.6930022835731506,
"learning_rate": 2.6699947844701967e-05,
"loss": 0.1444,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17430582642555237,
"step": 6040
},
{
"epoch": 3.169287211740042,
"grad_norm": 0.6261948943138123,
"learning_rate": 2.6675315937192574e-05,
"loss": 0.1619,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1623680591583252,
"step": 6045
},
{
"epoch": 3.1719077568134173,
"grad_norm": 0.7149227261543274,
"learning_rate": 2.665067262974275e-05,
"loss": 0.1442,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1636398881673813,
"step": 6050
},
{
"epoch": 3.1745283018867925,
"grad_norm": 0.6243942379951477,
"learning_rate": 2.6626017964437726e-05,
"loss": 0.1384,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13991810381412506,
"step": 6055
},
{
"epoch": 3.1771488469601676,
"grad_norm": 0.649437665939331,
"learning_rate": 2.6601351983382123e-05,
"loss": 0.1553,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1520819365978241,
"step": 6060
},
{
"epoch": 3.179769392033543,
"grad_norm": 0.6788381338119507,
"learning_rate": 2.6576674728699905e-05,
"loss": 0.1423,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16024520993232727,
"step": 6065
},
{
"epoch": 3.1823899371069184,
"grad_norm": 0.7072592377662659,
"learning_rate": 2.655198624253428e-05,
"loss": 0.1478,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1642773151397705,
"step": 6070
},
{
"epoch": 3.1850104821802936,
"grad_norm": 0.616181492805481,
"learning_rate": 2.6527286567047634e-05,
"loss": 0.1567,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.177655428647995,
"step": 6075
},
{
"epoch": 3.1876310272536688,
"grad_norm": 0.6885983943939209,
"learning_rate": 2.6502575744421473e-05,
"loss": 0.1589,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14435414969921112,
"step": 6080
},
{
"epoch": 3.190251572327044,
"grad_norm": 0.7557274699211121,
"learning_rate": 2.647785381685633e-05,
"loss": 0.1676,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1525026261806488,
"step": 6085
},
{
"epoch": 3.192872117400419,
"grad_norm": 0.5805322527885437,
"learning_rate": 2.6453120826571705e-05,
"loss": 0.1587,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13723093271255493,
"step": 6090
},
{
"epoch": 3.1954926624737947,
"grad_norm": 0.5989310145378113,
"learning_rate": 2.6428376815805984e-05,
"loss": 0.1775,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18171349167823792,
"step": 6095
},
{
"epoch": 3.19811320754717,
"grad_norm": 0.6322781443595886,
"learning_rate": 2.6403621826816385e-05,
"loss": 0.1446,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13691356778144836,
"step": 6100
},
{
"epoch": 3.200733752620545,
"grad_norm": 0.6193546056747437,
"learning_rate": 2.637885590187888e-05,
"loss": 0.1678,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15641272068023682,
"step": 6105
},
{
"epoch": 3.20335429769392,
"grad_norm": 0.6977561116218567,
"learning_rate": 2.6354079083288087e-05,
"loss": 0.1477,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17221683263778687,
"step": 6110
},
{
"epoch": 3.2059748427672954,
"grad_norm": 0.6117526888847351,
"learning_rate": 2.6329291413357263e-05,
"loss": 0.1449,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18370969593524933,
"step": 6115
},
{
"epoch": 3.208595387840671,
"grad_norm": 0.6177181601524353,
"learning_rate": 2.630449293441818e-05,
"loss": 0.1675,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14681090414524078,
"step": 6120
},
{
"epoch": 3.211215932914046,
"grad_norm": 0.7336227297782898,
"learning_rate": 2.6279683688821056e-05,
"loss": 0.1809,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17036855220794678,
"step": 6125
},
{
"epoch": 3.2138364779874213,
"grad_norm": 0.6561317443847656,
"learning_rate": 2.6254863718934525e-05,
"loss": 0.149,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15689235925674438,
"step": 6130
},
{
"epoch": 3.2164570230607965,
"grad_norm": 0.5736268162727356,
"learning_rate": 2.6230033067145516e-05,
"loss": 0.1472,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17615295946598053,
"step": 6135
},
{
"epoch": 3.219077568134172,
"grad_norm": 0.5321448445320129,
"learning_rate": 2.620519177585921e-05,
"loss": 0.1582,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20982366800308228,
"step": 6140
},
{
"epoch": 3.2216981132075473,
"grad_norm": 0.6963729858398438,
"learning_rate": 2.618033988749895e-05,
"loss": 0.1672,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15652288496494293,
"step": 6145
},
{
"epoch": 3.2243186582809225,
"grad_norm": 0.7338701486587524,
"learning_rate": 2.615547744450618e-05,
"loss": 0.1603,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.184444397687912,
"step": 6150
},
{
"epoch": 3.2269392033542976,
"grad_norm": 0.6008429527282715,
"learning_rate": 2.6130604489340367e-05,
"loss": 0.1651,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17205990850925446,
"step": 6155
},
{
"epoch": 3.229559748427673,
"grad_norm": 0.677862823009491,
"learning_rate": 2.610572106447894e-05,
"loss": 0.1605,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19454920291900635,
"step": 6160
},
{
"epoch": 3.2321802935010484,
"grad_norm": 0.6206879615783691,
"learning_rate": 2.608082721241719e-05,
"loss": 0.1636,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19852474331855774,
"step": 6165
},
{
"epoch": 3.2348008385744236,
"grad_norm": 0.6645183563232422,
"learning_rate": 2.6055922975668235e-05,
"loss": 0.157,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14958696067333221,
"step": 6170
},
{
"epoch": 3.2374213836477987,
"grad_norm": 0.7822976112365723,
"learning_rate": 2.6031008396762908e-05,
"loss": 0.1566,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14029009640216827,
"step": 6175
},
{
"epoch": 3.240041928721174,
"grad_norm": 0.6289648413658142,
"learning_rate": 2.6006083518249724e-05,
"loss": 0.1766,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2056703120470047,
"step": 6180
},
{
"epoch": 3.242662473794549,
"grad_norm": 0.5854620933532715,
"learning_rate": 2.5981148382694773e-05,
"loss": 0.1331,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1278085708618164,
"step": 6185
},
{
"epoch": 3.2452830188679247,
"grad_norm": 0.7568283081054688,
"learning_rate": 2.5956203032681667e-05,
"loss": 0.1524,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12183448672294617,
"step": 6190
},
{
"epoch": 3.2479035639413,
"grad_norm": 0.6330800652503967,
"learning_rate": 2.5931247510811464e-05,
"loss": 0.1484,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1322021484375,
"step": 6195
},
{
"epoch": 3.250524109014675,
"grad_norm": 0.6357014179229736,
"learning_rate": 2.5906281859702582e-05,
"loss": 0.1583,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17333781719207764,
"step": 6200
},
{
"epoch": 3.25314465408805,
"grad_norm": 0.7124892473220825,
"learning_rate": 2.5881306121990758e-05,
"loss": 0.162,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15561464428901672,
"step": 6205
},
{
"epoch": 3.2557651991614254,
"grad_norm": 0.717791736125946,
"learning_rate": 2.5856320340328934e-05,
"loss": 0.1622,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15265150368213654,
"step": 6210
},
{
"epoch": 3.258385744234801,
"grad_norm": 0.7117369771003723,
"learning_rate": 2.5831324557387216e-05,
"loss": 0.1658,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1384900063276291,
"step": 6215
},
{
"epoch": 3.261006289308176,
"grad_norm": 0.6919196844100952,
"learning_rate": 2.580631881585279e-05,
"loss": 0.1444,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1550731062889099,
"step": 6220
},
{
"epoch": 3.2636268343815513,
"grad_norm": 0.7333115339279175,
"learning_rate": 2.5781303158429844e-05,
"loss": 0.1526,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12255859375,
"step": 6225
},
{
"epoch": 3.2662473794549265,
"grad_norm": 0.6055130362510681,
"learning_rate": 2.575627762783951e-05,
"loss": 0.1606,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23269812762737274,
"step": 6230
},
{
"epoch": 3.268867924528302,
"grad_norm": 0.6821181178092957,
"learning_rate": 2.573124226681976e-05,
"loss": 0.1522,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13218367099761963,
"step": 6235
},
{
"epoch": 3.2714884696016773,
"grad_norm": 0.7422898411750793,
"learning_rate": 2.5706197118125375e-05,
"loss": 0.1479,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15026451647281647,
"step": 6240
},
{
"epoch": 3.2741090146750524,
"grad_norm": 0.66852205991745,
"learning_rate": 2.568114222452785e-05,
"loss": 0.1579,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1492151916027069,
"step": 6245
},
{
"epoch": 3.2767295597484276,
"grad_norm": 0.6432539820671082,
"learning_rate": 2.5656077628815305e-05,
"loss": 0.163,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15138231217861176,
"step": 6250
},
{
"epoch": 3.279350104821803,
"grad_norm": 0.6414669156074524,
"learning_rate": 2.5631003373792452e-05,
"loss": 0.1558,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19593289494514465,
"step": 6255
},
{
"epoch": 3.281970649895178,
"grad_norm": 0.7199597954750061,
"learning_rate": 2.5605919502280482e-05,
"loss": 0.1368,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1262434870004654,
"step": 6260
},
{
"epoch": 3.2845911949685536,
"grad_norm": 0.7075192928314209,
"learning_rate": 2.5580826057117002e-05,
"loss": 0.1455,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14794921875,
"step": 6265
},
{
"epoch": 3.2872117400419287,
"grad_norm": 0.6006430983543396,
"learning_rate": 2.5555723081156005e-05,
"loss": 0.1533,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11506500095129013,
"step": 6270
},
{
"epoch": 3.289832285115304,
"grad_norm": 0.5886109471321106,
"learning_rate": 2.5530610617267718e-05,
"loss": 0.1624,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14662334322929382,
"step": 6275
},
{
"epoch": 3.292452830188679,
"grad_norm": 0.6336468458175659,
"learning_rate": 2.5505488708338596e-05,
"loss": 0.1612,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16814139485359192,
"step": 6280
},
{
"epoch": 3.2950733752620547,
"grad_norm": 0.6334772706031799,
"learning_rate": 2.5480357397271222e-05,
"loss": 0.1519,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12603622674942017,
"step": 6285
},
{
"epoch": 3.29769392033543,
"grad_norm": 0.779043436050415,
"learning_rate": 2.5455216726984215e-05,
"loss": 0.1492,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1370849609375,
"step": 6290
},
{
"epoch": 3.300314465408805,
"grad_norm": 0.795427143573761,
"learning_rate": 2.5430066740412214e-05,
"loss": 0.1554,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11929628252983093,
"step": 6295
},
{
"epoch": 3.30293501048218,
"grad_norm": 0.6863541603088379,
"learning_rate": 2.5404907480505735e-05,
"loss": 0.1562,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15934820473194122,
"step": 6300
},
{
"epoch": 3.3055555555555554,
"grad_norm": 0.7187139391899109,
"learning_rate": 2.537973899023114e-05,
"loss": 0.1516,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15310056507587433,
"step": 6305
},
{
"epoch": 3.308176100628931,
"grad_norm": 0.7239200472831726,
"learning_rate": 2.535456131257057e-05,
"loss": 0.1542,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15294574201107025,
"step": 6310
},
{
"epoch": 3.310796645702306,
"grad_norm": 0.5413670539855957,
"learning_rate": 2.5329374490521836e-05,
"loss": 0.1567,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.110429547727108,
"step": 6315
},
{
"epoch": 3.3134171907756813,
"grad_norm": 0.6516697406768799,
"learning_rate": 2.5304178567098374e-05,
"loss": 0.1599,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17944911122322083,
"step": 6320
},
{
"epoch": 3.3160377358490565,
"grad_norm": 0.669211745262146,
"learning_rate": 2.5278973585329168e-05,
"loss": 0.1404,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1333250105381012,
"step": 6325
},
{
"epoch": 3.318658280922432,
"grad_norm": 0.7142298221588135,
"learning_rate": 2.525375958825866e-05,
"loss": 0.1642,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15886715054512024,
"step": 6330
},
{
"epoch": 3.3212788259958073,
"grad_norm": 0.8329634666442871,
"learning_rate": 2.52285366189467e-05,
"loss": 0.152,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1321033090353012,
"step": 6335
},
{
"epoch": 3.3238993710691824,
"grad_norm": 0.6707449555397034,
"learning_rate": 2.5203304720468445e-05,
"loss": 0.1566,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15341900289058685,
"step": 6340
},
{
"epoch": 3.3265199161425576,
"grad_norm": 0.5349269509315491,
"learning_rate": 2.5178063935914324e-05,
"loss": 0.1453,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15282392501831055,
"step": 6345
},
{
"epoch": 3.3291404612159328,
"grad_norm": 0.8673381209373474,
"learning_rate": 2.515281430838992e-05,
"loss": 0.169,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1706235706806183,
"step": 6350
},
{
"epoch": 3.331761006289308,
"grad_norm": 0.6685834527015686,
"learning_rate": 2.5127555881015923e-05,
"loss": 0.1757,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2108490765094757,
"step": 6355
},
{
"epoch": 3.3343815513626835,
"grad_norm": 0.7365850210189819,
"learning_rate": 2.5102288696928066e-05,
"loss": 0.15,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.130615234375,
"step": 6360
},
{
"epoch": 3.3370020964360587,
"grad_norm": 0.8285936713218689,
"learning_rate": 2.5077012799277006e-05,
"loss": 0.1495,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1509149670600891,
"step": 6365
},
{
"epoch": 3.339622641509434,
"grad_norm": 0.6469640135765076,
"learning_rate": 2.5051728231228322e-05,
"loss": 0.158,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.135266974568367,
"step": 6370
},
{
"epoch": 3.342243186582809,
"grad_norm": 0.5959151387214661,
"learning_rate": 2.502643503596237e-05,
"loss": 0.1437,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.151039719581604,
"step": 6375
},
{
"epoch": 3.3448637316561847,
"grad_norm": 0.7715269327163696,
"learning_rate": 2.5001133256674233e-05,
"loss": 0.1503,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17917034029960632,
"step": 6380
},
{
"epoch": 3.34748427672956,
"grad_norm": 0.6614984273910522,
"learning_rate": 2.4975822936573684e-05,
"loss": 0.1751,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1380983591079712,
"step": 6385
},
{
"epoch": 3.350104821802935,
"grad_norm": 0.5487889051437378,
"learning_rate": 2.495050411888506e-05,
"loss": 0.1651,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20875106751918793,
"step": 6390
},
{
"epoch": 3.35272536687631,
"grad_norm": 1.8391904830932617,
"learning_rate": 2.4925176846847214e-05,
"loss": 0.1486,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11009234189987183,
"step": 6395
},
{
"epoch": 3.3553459119496853,
"grad_norm": 0.6677514314651489,
"learning_rate": 2.489984116371344e-05,
"loss": 0.1545,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1642158031463623,
"step": 6400
},
{
"epoch": 3.357966457023061,
"grad_norm": 0.6820788383483887,
"learning_rate": 2.4874497112751394e-05,
"loss": 0.1502,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1800456941127777,
"step": 6405
},
{
"epoch": 3.360587002096436,
"grad_norm": 0.6414058804512024,
"learning_rate": 2.4849144737243026e-05,
"loss": 0.1627,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17555800080299377,
"step": 6410
},
{
"epoch": 3.3632075471698113,
"grad_norm": 0.7890809178352356,
"learning_rate": 2.4823784080484495e-05,
"loss": 0.1306,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12165604531764984,
"step": 6415
},
{
"epoch": 3.3658280922431865,
"grad_norm": 0.8478096723556519,
"learning_rate": 2.479841518578611e-05,
"loss": 0.1442,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17640218138694763,
"step": 6420
},
{
"epoch": 3.368448637316562,
"grad_norm": 0.6587736010551453,
"learning_rate": 2.4773038096472247e-05,
"loss": 0.1659,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15387940406799316,
"step": 6425
},
{
"epoch": 3.3710691823899372,
"grad_norm": 0.6497980356216431,
"learning_rate": 2.474765285588127e-05,
"loss": 0.1407,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11815936118364334,
"step": 6430
},
{
"epoch": 3.3736897274633124,
"grad_norm": 0.6267364621162415,
"learning_rate": 2.4722259507365475e-05,
"loss": 0.1585,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16179129481315613,
"step": 6435
},
{
"epoch": 3.3763102725366876,
"grad_norm": 0.7024030089378357,
"learning_rate": 2.4696858094290992e-05,
"loss": 0.1309,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1190185546875,
"step": 6440
},
{
"epoch": 3.3789308176100628,
"grad_norm": 0.7756052017211914,
"learning_rate": 2.4671448660037732e-05,
"loss": 0.1497,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12463849037885666,
"step": 6445
},
{
"epoch": 3.381551362683438,
"grad_norm": 0.7756056189537048,
"learning_rate": 2.464603124799931e-05,
"loss": 0.151,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.177520290017128,
"step": 6450
},
{
"epoch": 3.3841719077568135,
"grad_norm": 0.6775057315826416,
"learning_rate": 2.4620605901582943e-05,
"loss": 0.1485,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14761006832122803,
"step": 6455
},
{
"epoch": 3.3867924528301887,
"grad_norm": 0.6803090572357178,
"learning_rate": 2.4595172664209425e-05,
"loss": 0.1345,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14740537106990814,
"step": 6460
},
{
"epoch": 3.389412997903564,
"grad_norm": 0.6456284523010254,
"learning_rate": 2.4569731579313007e-05,
"loss": 0.162,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15551471710205078,
"step": 6465
},
{
"epoch": 3.392033542976939,
"grad_norm": 0.6624186038970947,
"learning_rate": 2.4544282690341344e-05,
"loss": 0.1647,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17889371514320374,
"step": 6470
},
{
"epoch": 3.3946540880503147,
"grad_norm": 0.6124078631401062,
"learning_rate": 2.4518826040755435e-05,
"loss": 0.1451,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13483217358589172,
"step": 6475
},
{
"epoch": 3.39727463312369,
"grad_norm": 0.6263321042060852,
"learning_rate": 2.4493361674029515e-05,
"loss": 0.1498,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15271490812301636,
"step": 6480
},
{
"epoch": 3.399895178197065,
"grad_norm": 0.648196280002594,
"learning_rate": 2.4467889633650996e-05,
"loss": 0.1419,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13612696528434753,
"step": 6485
},
{
"epoch": 3.40251572327044,
"grad_norm": 0.672197163105011,
"learning_rate": 2.444240996312041e-05,
"loss": 0.1494,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11425255239009857,
"step": 6490
},
{
"epoch": 3.4051362683438153,
"grad_norm": 0.5785332918167114,
"learning_rate": 2.4416922705951312e-05,
"loss": 0.1634,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.21997715532779694,
"step": 6495
},
{
"epoch": 3.407756813417191,
"grad_norm": 0.8766397833824158,
"learning_rate": 2.4391427905670215e-05,
"loss": 0.1456,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1305435299873352,
"step": 6500
},
{
"epoch": 3.410377358490566,
"grad_norm": 0.745764434337616,
"learning_rate": 2.436592560581651e-05,
"loss": 0.1353,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10039962828159332,
"step": 6505
},
{
"epoch": 3.4129979035639413,
"grad_norm": 0.672865629196167,
"learning_rate": 2.4340415849942386e-05,
"loss": 0.1482,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11167111992835999,
"step": 6510
},
{
"epoch": 3.4156184486373165,
"grad_norm": 0.5722970366477966,
"learning_rate": 2.4314898681612794e-05,
"loss": 0.153,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.136262446641922,
"step": 6515
},
{
"epoch": 3.418238993710692,
"grad_norm": 0.7656019926071167,
"learning_rate": 2.4289374144405318e-05,
"loss": 0.1447,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12890625,
"step": 6520
},
{
"epoch": 3.4208595387840672,
"grad_norm": 0.6020995378494263,
"learning_rate": 2.426384228191014e-05,
"loss": 0.1343,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1420569270849228,
"step": 6525
},
{
"epoch": 3.4234800838574424,
"grad_norm": 0.6600314378738403,
"learning_rate": 2.4238303137729945e-05,
"loss": 0.1441,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11346963047981262,
"step": 6530
},
{
"epoch": 3.4261006289308176,
"grad_norm": 0.6210017800331116,
"learning_rate": 2.421275675547985e-05,
"loss": 0.1696,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1867697685956955,
"step": 6535
},
{
"epoch": 3.4287211740041927,
"grad_norm": 0.7550157904624939,
"learning_rate": 2.4187203178787347e-05,
"loss": 0.1615,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16229265928268433,
"step": 6540
},
{
"epoch": 3.431341719077568,
"grad_norm": 0.5855455994606018,
"learning_rate": 2.41616424512922e-05,
"loss": 0.1537,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16221889853477478,
"step": 6545
},
{
"epoch": 3.4339622641509435,
"grad_norm": 0.5946267247200012,
"learning_rate": 2.4136074616646396e-05,
"loss": 0.1532,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16361533105373383,
"step": 6550
},
{
"epoch": 3.4365828092243187,
"grad_norm": 0.7061434984207153,
"learning_rate": 2.411049971851405e-05,
"loss": 0.1546,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15311777591705322,
"step": 6555
},
{
"epoch": 3.439203354297694,
"grad_norm": 0.6284918785095215,
"learning_rate": 2.4084917800571344e-05,
"loss": 0.1611,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1728515625,
"step": 6560
},
{
"epoch": 3.441823899371069,
"grad_norm": 0.5714501142501831,
"learning_rate": 2.405932890650645e-05,
"loss": 0.1425,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1305420696735382,
"step": 6565
},
{
"epoch": 3.4444444444444446,
"grad_norm": 0.67555832862854,
"learning_rate": 2.4033733080019453e-05,
"loss": 0.1564,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16778060793876648,
"step": 6570
},
{
"epoch": 3.44706498951782,
"grad_norm": 0.823415994644165,
"learning_rate": 2.400813036482228e-05,
"loss": 0.1805,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18518805503845215,
"step": 6575
},
{
"epoch": 3.449685534591195,
"grad_norm": 0.7290722727775574,
"learning_rate": 2.398252080463861e-05,
"loss": 0.157,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.145263671875,
"step": 6580
},
{
"epoch": 3.45230607966457,
"grad_norm": 0.6552735567092896,
"learning_rate": 2.3956904443203825e-05,
"loss": 0.138,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1675729751586914,
"step": 6585
},
{
"epoch": 3.4549266247379453,
"grad_norm": 0.8209438323974609,
"learning_rate": 2.3931281324264918e-05,
"loss": 0.1727,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10845947265625,
"step": 6590
},
{
"epoch": 3.457547169811321,
"grad_norm": 0.5601760745048523,
"learning_rate": 2.3905651491580423e-05,
"loss": 0.1579,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1745111048221588,
"step": 6595
},
{
"epoch": 3.460167714884696,
"grad_norm": 0.5397576689720154,
"learning_rate": 2.3880014988920327e-05,
"loss": 0.1584,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2028835266828537,
"step": 6600
},
{
"epoch": 3.4627882599580713,
"grad_norm": 0.6182676553726196,
"learning_rate": 2.3854371860066034e-05,
"loss": 0.1548,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15692049264907837,
"step": 6605
},
{
"epoch": 3.4654088050314464,
"grad_norm": 0.6674803495407104,
"learning_rate": 2.3828722148810236e-05,
"loss": 0.1416,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15361738204956055,
"step": 6610
},
{
"epoch": 3.468029350104822,
"grad_norm": 0.614689826965332,
"learning_rate": 2.380306589895689e-05,
"loss": 0.1576,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1334121972322464,
"step": 6615
},
{
"epoch": 3.470649895178197,
"grad_norm": 0.6762018799781799,
"learning_rate": 2.3777403154321107e-05,
"loss": 0.1456,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12771591544151306,
"step": 6620
},
{
"epoch": 3.4732704402515724,
"grad_norm": 0.6038153171539307,
"learning_rate": 2.3751733958729083e-05,
"loss": 0.1672,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14504867792129517,
"step": 6625
},
{
"epoch": 3.4758909853249476,
"grad_norm": 0.7279376983642578,
"learning_rate": 2.372605835601805e-05,
"loss": 0.1443,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14499513804912567,
"step": 6630
},
{
"epoch": 3.4785115303983227,
"grad_norm": 0.7328532934188843,
"learning_rate": 2.370037639003616e-05,
"loss": 0.1529,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16632357239723206,
"step": 6635
},
{
"epoch": 3.481132075471698,
"grad_norm": 0.6980881690979004,
"learning_rate": 2.3674688104642453e-05,
"loss": 0.1481,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13219040632247925,
"step": 6640
},
{
"epoch": 3.4837526205450735,
"grad_norm": 0.6118700504302979,
"learning_rate": 2.364899354370675e-05,
"loss": 0.1552,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14038139581680298,
"step": 6645
},
{
"epoch": 3.4863731656184487,
"grad_norm": 0.6516825556755066,
"learning_rate": 2.3623292751109582e-05,
"loss": 0.1544,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1734481155872345,
"step": 6650
},
{
"epoch": 3.488993710691824,
"grad_norm": 0.5811892151832581,
"learning_rate": 2.3597585770742138e-05,
"loss": 0.1768,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.197306290268898,
"step": 6655
},
{
"epoch": 3.491614255765199,
"grad_norm": 0.6686034202575684,
"learning_rate": 2.3571872646506165e-05,
"loss": 0.1491,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14100056886672974,
"step": 6660
},
{
"epoch": 3.4942348008385746,
"grad_norm": 0.6539227962493896,
"learning_rate": 2.3546153422313903e-05,
"loss": 0.1453,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1873638778924942,
"step": 6665
},
{
"epoch": 3.49685534591195,
"grad_norm": 0.6214085817337036,
"learning_rate": 2.3520428142088018e-05,
"loss": 0.1656,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2092817723751068,
"step": 6670
},
{
"epoch": 3.499475890985325,
"grad_norm": 0.726628839969635,
"learning_rate": 2.3494696849761497e-05,
"loss": 0.1596,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14948055148124695,
"step": 6675
},
{
"epoch": 3.5020964360587,
"grad_norm": 0.6781795024871826,
"learning_rate": 2.3468959589277623e-05,
"loss": 0.1524,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1506710946559906,
"step": 6680
},
{
"epoch": 3.5047169811320753,
"grad_norm": 0.7744038105010986,
"learning_rate": 2.3443216404589844e-05,
"loss": 0.1555,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15132437646389008,
"step": 6685
},
{
"epoch": 3.5073375262054505,
"grad_norm": 0.6445820331573486,
"learning_rate": 2.3417467339661757e-05,
"loss": 0.1635,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16688039898872375,
"step": 6690
},
{
"epoch": 3.509958071278826,
"grad_norm": 0.5953259468078613,
"learning_rate": 2.3391712438466962e-05,
"loss": 0.1394,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15444037318229675,
"step": 6695
},
{
"epoch": 3.5125786163522013,
"grad_norm": 0.7253164649009705,
"learning_rate": 2.3365951744989054e-05,
"loss": 0.1632,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.102294921875,
"step": 6700
},
{
"epoch": 3.5151991614255764,
"grad_norm": 0.668173611164093,
"learning_rate": 2.334018530322151e-05,
"loss": 0.1522,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12434506416320801,
"step": 6705
},
{
"epoch": 3.517819706498952,
"grad_norm": 0.59187251329422,
"learning_rate": 2.331441315716763e-05,
"loss": 0.1506,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1513979732990265,
"step": 6710
},
{
"epoch": 3.520440251572327,
"grad_norm": 0.6256042122840881,
"learning_rate": 2.3288635350840445e-05,
"loss": 0.1472,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12702324986457825,
"step": 6715
},
{
"epoch": 3.5230607966457024,
"grad_norm": 0.6789478063583374,
"learning_rate": 2.3262851928262665e-05,
"loss": 0.1521,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1434326171875,
"step": 6720
},
{
"epoch": 3.5256813417190775,
"grad_norm": 0.7792441844940186,
"learning_rate": 2.323706293346658e-05,
"loss": 0.1694,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1643582582473755,
"step": 6725
},
{
"epoch": 3.5283018867924527,
"grad_norm": 0.7054676413536072,
"learning_rate": 2.321126841049401e-05,
"loss": 0.1538,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17248061299324036,
"step": 6730
},
{
"epoch": 3.530922431865828,
"grad_norm": 0.7300561666488647,
"learning_rate": 2.3185468403396198e-05,
"loss": 0.1452,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13691771030426025,
"step": 6735
},
{
"epoch": 3.5335429769392035,
"grad_norm": 0.7270481586456299,
"learning_rate": 2.315966295623376e-05,
"loss": 0.1607,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1448974609375,
"step": 6740
},
{
"epoch": 3.5361635220125787,
"grad_norm": 0.6351275444030762,
"learning_rate": 2.3133852113076616e-05,
"loss": 0.1631,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18207350373268127,
"step": 6745
},
{
"epoch": 3.538784067085954,
"grad_norm": 0.7486494779586792,
"learning_rate": 2.3108035918003875e-05,
"loss": 0.1489,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10829313099384308,
"step": 6750
},
{
"epoch": 3.541404612159329,
"grad_norm": 0.7265515327453613,
"learning_rate": 2.308221441510382e-05,
"loss": 0.1397,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12119436264038086,
"step": 6755
},
{
"epoch": 3.5440251572327046,
"grad_norm": 0.5404002666473389,
"learning_rate": 2.3056387648473753e-05,
"loss": 0.1611,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16169658303260803,
"step": 6760
},
{
"epoch": 3.54664570230608,
"grad_norm": 0.7101503014564514,
"learning_rate": 2.303055566222001e-05,
"loss": 0.1422,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16694357991218567,
"step": 6765
},
{
"epoch": 3.549266247379455,
"grad_norm": 0.7440077066421509,
"learning_rate": 2.300471850045782e-05,
"loss": 0.1285,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13593001663684845,
"step": 6770
},
{
"epoch": 3.55188679245283,
"grad_norm": 0.693570613861084,
"learning_rate": 2.297887620731124e-05,
"loss": 0.1623,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14585727453231812,
"step": 6775
},
{
"epoch": 3.5545073375262053,
"grad_norm": 0.6090971231460571,
"learning_rate": 2.295302882691312e-05,
"loss": 0.1649,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1292724609375,
"step": 6780
},
{
"epoch": 3.5571278825995805,
"grad_norm": 0.6697745323181152,
"learning_rate": 2.2927176403404978e-05,
"loss": 0.1774,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.149550199508667,
"step": 6785
},
{
"epoch": 3.559748427672956,
"grad_norm": 0.883256196975708,
"learning_rate": 2.290131898093693e-05,
"loss": 0.1402,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13478536903858185,
"step": 6790
},
{
"epoch": 3.5623689727463312,
"grad_norm": 0.6198585033416748,
"learning_rate": 2.2875456603667664e-05,
"loss": 0.1481,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15345340967178345,
"step": 6795
},
{
"epoch": 3.5649895178197064,
"grad_norm": 0.6998546123504639,
"learning_rate": 2.2849589315764303e-05,
"loss": 0.1718,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17192421853542328,
"step": 6800
},
{
"epoch": 3.567610062893082,
"grad_norm": 0.7005147337913513,
"learning_rate": 2.2823717161402375e-05,
"loss": 0.1624,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16433775424957275,
"step": 6805
},
{
"epoch": 3.570230607966457,
"grad_norm": 0.7533726096153259,
"learning_rate": 2.27978401847657e-05,
"loss": 0.1605,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11318092048168182,
"step": 6810
},
{
"epoch": 3.5728511530398324,
"grad_norm": 0.776470959186554,
"learning_rate": 2.2771958430046342e-05,
"loss": 0.1533,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11972829699516296,
"step": 6815
},
{
"epoch": 3.5754716981132075,
"grad_norm": 0.6570600867271423,
"learning_rate": 2.2746071941444537e-05,
"loss": 0.174,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16138510406017303,
"step": 6820
},
{
"epoch": 3.5780922431865827,
"grad_norm": 0.6985480785369873,
"learning_rate": 2.2720180763168576e-05,
"loss": 0.1497,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12646484375,
"step": 6825
},
{
"epoch": 3.580712788259958,
"grad_norm": 0.5580952763557434,
"learning_rate": 2.269428493943479e-05,
"loss": 0.1564,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18356704711914062,
"step": 6830
},
{
"epoch": 3.5833333333333335,
"grad_norm": 0.6391990780830383,
"learning_rate": 2.2668384514467427e-05,
"loss": 0.1493,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1443229615688324,
"step": 6835
},
{
"epoch": 3.5859538784067087,
"grad_norm": 0.6866182684898376,
"learning_rate": 2.2642479532498597e-05,
"loss": 0.1751,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15328383445739746,
"step": 6840
},
{
"epoch": 3.588574423480084,
"grad_norm": 0.69412761926651,
"learning_rate": 2.2616570037768187e-05,
"loss": 0.1639,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.137939453125,
"step": 6845
},
{
"epoch": 3.591194968553459,
"grad_norm": 0.6370830535888672,
"learning_rate": 2.25906560745238e-05,
"loss": 0.1833,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15124990046024323,
"step": 6850
},
{
"epoch": 3.5938155136268346,
"grad_norm": 0.7414199709892273,
"learning_rate": 2.256473768702066e-05,
"loss": 0.1351,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1309814453125,
"step": 6855
},
{
"epoch": 3.5964360587002098,
"grad_norm": 0.702250599861145,
"learning_rate": 2.2538814919521556e-05,
"loss": 0.1599,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.130854994058609,
"step": 6860
},
{
"epoch": 3.599056603773585,
"grad_norm": 0.6545102000236511,
"learning_rate": 2.2512887816296755e-05,
"loss": 0.1561,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1243896484375,
"step": 6865
},
{
"epoch": 3.60167714884696,
"grad_norm": 0.6784298419952393,
"learning_rate": 2.2486956421623917e-05,
"loss": 0.1549,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15646380186080933,
"step": 6870
},
{
"epoch": 3.6042976939203353,
"grad_norm": 0.6086201071739197,
"learning_rate": 2.2461020779788054e-05,
"loss": 0.14,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14351555705070496,
"step": 6875
},
{
"epoch": 3.6069182389937104,
"grad_norm": 0.7209429144859314,
"learning_rate": 2.2435080935081402e-05,
"loss": 0.156,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1248779296875,
"step": 6880
},
{
"epoch": 3.609538784067086,
"grad_norm": 0.703329861164093,
"learning_rate": 2.240913693180341e-05,
"loss": 0.1317,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13986779749393463,
"step": 6885
},
{
"epoch": 3.6121593291404612,
"grad_norm": 0.6641044616699219,
"learning_rate": 2.2383188814260585e-05,
"loss": 0.1485,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17025399208068848,
"step": 6890
},
{
"epoch": 3.6147798742138364,
"grad_norm": 0.6589853763580322,
"learning_rate": 2.2357236626766504e-05,
"loss": 0.1404,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1534072756767273,
"step": 6895
},
{
"epoch": 3.617400419287212,
"grad_norm": 0.7012056112289429,
"learning_rate": 2.233128041364166e-05,
"loss": 0.148,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1505126953125,
"step": 6900
},
{
"epoch": 3.620020964360587,
"grad_norm": 0.8724771738052368,
"learning_rate": 2.230532021921345e-05,
"loss": 0.1397,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13603194057941437,
"step": 6905
},
{
"epoch": 3.6226415094339623,
"grad_norm": 0.809339702129364,
"learning_rate": 2.2279356087816044e-05,
"loss": 0.15,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14090360701084137,
"step": 6910
},
{
"epoch": 3.6252620545073375,
"grad_norm": 0.6809837222099304,
"learning_rate": 2.2253388063790356e-05,
"loss": 0.1513,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19587504863739014,
"step": 6915
},
{
"epoch": 3.6278825995807127,
"grad_norm": 0.6588430404663086,
"learning_rate": 2.2227416191483928e-05,
"loss": 0.1612,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1605924516916275,
"step": 6920
},
{
"epoch": 3.630503144654088,
"grad_norm": 0.6808624863624573,
"learning_rate": 2.2201440515250897e-05,
"loss": 0.1438,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1401149183511734,
"step": 6925
},
{
"epoch": 3.6331236897274635,
"grad_norm": 0.5970581769943237,
"learning_rate": 2.217546107945188e-05,
"loss": 0.1597,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12778399884700775,
"step": 6930
},
{
"epoch": 3.6357442348008386,
"grad_norm": 0.647731602191925,
"learning_rate": 2.2149477928453914e-05,
"loss": 0.1554,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16550582647323608,
"step": 6935
},
{
"epoch": 3.638364779874214,
"grad_norm": 0.6262145638465881,
"learning_rate": 2.212349110663039e-05,
"loss": 0.1538,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15969672799110413,
"step": 6940
},
{
"epoch": 3.640985324947589,
"grad_norm": 0.7175219655036926,
"learning_rate": 2.209750065836096e-05,
"loss": 0.1564,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1234130859375,
"step": 6945
},
{
"epoch": 3.6436058700209646,
"grad_norm": 0.6249467730522156,
"learning_rate": 2.207150662803148e-05,
"loss": 0.1583,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18427444994449615,
"step": 6950
},
{
"epoch": 3.6462264150943398,
"grad_norm": 0.7114428281784058,
"learning_rate": 2.204550906003391e-05,
"loss": 0.135,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14605392515659332,
"step": 6955
},
{
"epoch": 3.648846960167715,
"grad_norm": 0.7079446315765381,
"learning_rate": 2.2019507998766253e-05,
"loss": 0.1465,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14103522896766663,
"step": 6960
},
{
"epoch": 3.65146750524109,
"grad_norm": 0.6679487228393555,
"learning_rate": 2.199350348863249e-05,
"loss": 0.1482,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16830191016197205,
"step": 6965
},
{
"epoch": 3.6540880503144653,
"grad_norm": 0.7247354388237,
"learning_rate": 2.1967495574042484e-05,
"loss": 0.1617,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15935799479484558,
"step": 6970
},
{
"epoch": 3.6567085953878404,
"grad_norm": 0.7235015630722046,
"learning_rate": 2.194148429941191e-05,
"loss": 0.1639,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1375732421875,
"step": 6975
},
{
"epoch": 3.659329140461216,
"grad_norm": 0.6391201019287109,
"learning_rate": 2.191546970916218e-05,
"loss": 0.1629,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14263559877872467,
"step": 6980
},
{
"epoch": 3.661949685534591,
"grad_norm": 0.6432464718818665,
"learning_rate": 2.1889451847720372e-05,
"loss": 0.1576,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14456170797348022,
"step": 6985
},
{
"epoch": 3.6645702306079664,
"grad_norm": 0.5945658087730408,
"learning_rate": 2.186343075951916e-05,
"loss": 0.137,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12893876433372498,
"step": 6990
},
{
"epoch": 3.667190775681342,
"grad_norm": 0.6155629754066467,
"learning_rate": 2.1837406488996703e-05,
"loss": 0.1493,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19243334233760834,
"step": 6995
},
{
"epoch": 3.669811320754717,
"grad_norm": 0.6677305698394775,
"learning_rate": 2.181137908059663e-05,
"loss": 0.1479,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15798315405845642,
"step": 7000
},
{
"epoch": 3.6724318658280923,
"grad_norm": 0.5484825968742371,
"learning_rate": 2.1785348578767893e-05,
"loss": 0.1683,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14938029646873474,
"step": 7005
},
{
"epoch": 3.6750524109014675,
"grad_norm": 0.6847086548805237,
"learning_rate": 2.1759315027964743e-05,
"loss": 0.1725,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15053415298461914,
"step": 7010
},
{
"epoch": 3.6776729559748427,
"grad_norm": 0.6554835438728333,
"learning_rate": 2.173327847264665e-05,
"loss": 0.1654,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1666020154953003,
"step": 7015
},
{
"epoch": 3.680293501048218,
"grad_norm": 0.5411213636398315,
"learning_rate": 2.170723895727819e-05,
"loss": 0.1455,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10216250270605087,
"step": 7020
},
{
"epoch": 3.6829140461215935,
"grad_norm": 1.39761221408844,
"learning_rate": 2.1681196526329015e-05,
"loss": 0.1503,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15248467028141022,
"step": 7025
},
{
"epoch": 3.6855345911949686,
"grad_norm": 0.6272345781326294,
"learning_rate": 2.1655151224273747e-05,
"loss": 0.1737,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2032470405101776,
"step": 7030
},
{
"epoch": 3.688155136268344,
"grad_norm": 0.6710824966430664,
"learning_rate": 2.162910309559191e-05,
"loss": 0.1628,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1658201515674591,
"step": 7035
},
{
"epoch": 3.690775681341719,
"grad_norm": 0.6399701833724976,
"learning_rate": 2.1603052184767863e-05,
"loss": 0.1395,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.120361328125,
"step": 7040
},
{
"epoch": 3.6933962264150946,
"grad_norm": 0.7688432335853577,
"learning_rate": 2.1576998536290706e-05,
"loss": 0.1565,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11587116122245789,
"step": 7045
},
{
"epoch": 3.6960167714884697,
"grad_norm": 0.6019247174263,
"learning_rate": 2.155094219465422e-05,
"loss": 0.148,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17584499716758728,
"step": 7050
},
{
"epoch": 3.698637316561845,
"grad_norm": 0.6681298613548279,
"learning_rate": 2.1524883204356786e-05,
"loss": 0.1672,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1889631301164627,
"step": 7055
},
{
"epoch": 3.70125786163522,
"grad_norm": 0.6414555311203003,
"learning_rate": 2.1498821609901306e-05,
"loss": 0.1612,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15406014025211334,
"step": 7060
},
{
"epoch": 3.7038784067085953,
"grad_norm": 0.601821780204773,
"learning_rate": 2.1472757455795135e-05,
"loss": 0.1614,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14945435523986816,
"step": 7065
},
{
"epoch": 3.7064989517819704,
"grad_norm": 0.6997553706169128,
"learning_rate": 2.1446690786549986e-05,
"loss": 0.1613,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16845703125,
"step": 7070
},
{
"epoch": 3.709119496855346,
"grad_norm": 0.6811888813972473,
"learning_rate": 2.142062164668188e-05,
"loss": 0.1677,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13721466064453125,
"step": 7075
},
{
"epoch": 3.711740041928721,
"grad_norm": 0.6376758217811584,
"learning_rate": 2.1394550080711056e-05,
"loss": 0.1508,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15672701597213745,
"step": 7080
},
{
"epoch": 3.7143605870020964,
"grad_norm": 0.75472092628479,
"learning_rate": 2.1368476133161885e-05,
"loss": 0.1318,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1363525390625,
"step": 7085
},
{
"epoch": 3.7169811320754715,
"grad_norm": 0.6204655766487122,
"learning_rate": 2.1342399848562826e-05,
"loss": 0.1537,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.160270094871521,
"step": 7090
},
{
"epoch": 3.719601677148847,
"grad_norm": 0.7598617076873779,
"learning_rate": 2.1316321271446306e-05,
"loss": 0.1665,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1729973554611206,
"step": 7095
},
{
"epoch": 3.7222222222222223,
"grad_norm": 0.6129899024963379,
"learning_rate": 2.129024044634868e-05,
"loss": 0.1551,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14653250575065613,
"step": 7100
},
{
"epoch": 3.7248427672955975,
"grad_norm": 0.8236209750175476,
"learning_rate": 2.1264157417810153e-05,
"loss": 0.1477,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09567755460739136,
"step": 7105
},
{
"epoch": 3.7274633123689727,
"grad_norm": 0.7946968078613281,
"learning_rate": 2.1238072230374655e-05,
"loss": 0.1463,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.145263671875,
"step": 7110
},
{
"epoch": 3.730083857442348,
"grad_norm": 0.9173590540885925,
"learning_rate": 2.121198492858985e-05,
"loss": 0.1483,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1373291015625,
"step": 7115
},
{
"epoch": 3.732704402515723,
"grad_norm": 0.6552045345306396,
"learning_rate": 2.1185895557006982e-05,
"loss": 0.1159,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12957391142845154,
"step": 7120
},
{
"epoch": 3.7353249475890986,
"grad_norm": 0.67323899269104,
"learning_rate": 2.1159804160180826e-05,
"loss": 0.1445,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1298828125,
"step": 7125
},
{
"epoch": 3.737945492662474,
"grad_norm": 0.7391826510429382,
"learning_rate": 2.1133710782669653e-05,
"loss": 0.146,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1224365234375,
"step": 7130
},
{
"epoch": 3.740566037735849,
"grad_norm": 0.6076016426086426,
"learning_rate": 2.1107615469035078e-05,
"loss": 0.1649,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1857558786869049,
"step": 7135
},
{
"epoch": 3.7431865828092246,
"grad_norm": 0.6100286245346069,
"learning_rate": 2.1081518263842032e-05,
"loss": 0.1354,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13551019132137299,
"step": 7140
},
{
"epoch": 3.7458071278825997,
"grad_norm": 0.6343952417373657,
"learning_rate": 2.1055419211658687e-05,
"loss": 0.1561,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1612548828125,
"step": 7145
},
{
"epoch": 3.748427672955975,
"grad_norm": 0.6255004405975342,
"learning_rate": 2.102931835705636e-05,
"loss": 0.162,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20365479588508606,
"step": 7150
},
{
"epoch": 3.75104821802935,
"grad_norm": 0.6127068400382996,
"learning_rate": 2.1003215744609452e-05,
"loss": 0.15,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1393146812915802,
"step": 7155
},
{
"epoch": 3.7536687631027252,
"grad_norm": 0.6279597282409668,
"learning_rate": 2.0977111418895363e-05,
"loss": 0.1551,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.151123046875,
"step": 7160
},
{
"epoch": 3.7562893081761004,
"grad_norm": 0.6501970291137695,
"learning_rate": 2.095100542449441e-05,
"loss": 0.1384,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1400650590658188,
"step": 7165
},
{
"epoch": 3.758909853249476,
"grad_norm": 0.6475972533226013,
"learning_rate": 2.0924897805989778e-05,
"loss": 0.1563,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1405632048845291,
"step": 7170
},
{
"epoch": 3.761530398322851,
"grad_norm": 0.6913469433784485,
"learning_rate": 2.089878860796741e-05,
"loss": 0.1457,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15082892775535583,
"step": 7175
},
{
"epoch": 3.7641509433962264,
"grad_norm": 0.6681004762649536,
"learning_rate": 2.087267787501596e-05,
"loss": 0.1561,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1363525390625,
"step": 7180
},
{
"epoch": 3.7667714884696015,
"grad_norm": 0.6305525898933411,
"learning_rate": 2.0846565651726688e-05,
"loss": 0.155,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16507667303085327,
"step": 7185
},
{
"epoch": 3.769392033542977,
"grad_norm": 0.7154058218002319,
"learning_rate": 2.0820451982693406e-05,
"loss": 0.1441,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.118896484375,
"step": 7190
},
{
"epoch": 3.7720125786163523,
"grad_norm": 0.8042317032814026,
"learning_rate": 2.0794336912512403e-05,
"loss": 0.1681,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1513671875,
"step": 7195
},
{
"epoch": 3.7746331236897275,
"grad_norm": 0.7067472338676453,
"learning_rate": 2.076822048578235e-05,
"loss": 0.1472,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1341552734375,
"step": 7200
},
{
"epoch": 3.7772536687631026,
"grad_norm": 0.6020834445953369,
"learning_rate": 2.0742102747104243e-05,
"loss": 0.1564,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18506141006946564,
"step": 7205
},
{
"epoch": 3.779874213836478,
"grad_norm": 0.6437265276908875,
"learning_rate": 2.0715983741081306e-05,
"loss": 0.1585,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15423917770385742,
"step": 7210
},
{
"epoch": 3.782494758909853,
"grad_norm": 0.6600501537322998,
"learning_rate": 2.068986351231894e-05,
"loss": 0.1715,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14562933146953583,
"step": 7215
},
{
"epoch": 3.7851153039832286,
"grad_norm": 0.6256850957870483,
"learning_rate": 2.0663742105424626e-05,
"loss": 0.1437,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12755300104618073,
"step": 7220
},
{
"epoch": 3.7877358490566038,
"grad_norm": 0.622725248336792,
"learning_rate": 2.063761956500786e-05,
"loss": 0.1449,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.167802631855011,
"step": 7225
},
{
"epoch": 3.790356394129979,
"grad_norm": 0.5516610145568848,
"learning_rate": 2.0611495935680085e-05,
"loss": 0.1541,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.177067369222641,
"step": 7230
},
{
"epoch": 3.7929769392033545,
"grad_norm": 0.5736691951751709,
"learning_rate": 2.0585371262054584e-05,
"loss": 0.1458,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17594820261001587,
"step": 7235
},
{
"epoch": 3.7955974842767297,
"grad_norm": 0.8078172206878662,
"learning_rate": 2.0559245588746433e-05,
"loss": 0.155,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1312255859375,
"step": 7240
},
{
"epoch": 3.798218029350105,
"grad_norm": 0.6607158184051514,
"learning_rate": 2.0533118960372418e-05,
"loss": 0.146,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15671448409557343,
"step": 7245
},
{
"epoch": 3.80083857442348,
"grad_norm": 0.6096231341362,
"learning_rate": 2.0506991421550948e-05,
"loss": 0.1362,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1362011730670929,
"step": 7250
},
{
"epoch": 3.8034591194968552,
"grad_norm": 0.5730605125427246,
"learning_rate": 2.0480863016901988e-05,
"loss": 0.1255,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15595975518226624,
"step": 7255
},
{
"epoch": 3.8060796645702304,
"grad_norm": 0.510726809501648,
"learning_rate": 2.0454733791046996e-05,
"loss": 0.1509,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19028598070144653,
"step": 7260
},
{
"epoch": 3.808700209643606,
"grad_norm": 0.6243789792060852,
"learning_rate": 2.042860378860881e-05,
"loss": 0.1503,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1603560745716095,
"step": 7265
},
{
"epoch": 3.811320754716981,
"grad_norm": 0.6757491827011108,
"learning_rate": 2.040247305421162e-05,
"loss": 0.1571,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18410971760749817,
"step": 7270
},
{
"epoch": 3.8139412997903563,
"grad_norm": 0.5957233905792236,
"learning_rate": 2.037634163248084e-05,
"loss": 0.1606,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17781707644462585,
"step": 7275
},
{
"epoch": 3.8165618448637315,
"grad_norm": 0.7109359502792358,
"learning_rate": 2.0350209568043068e-05,
"loss": 0.1502,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1116943359375,
"step": 7280
},
{
"epoch": 3.819182389937107,
"grad_norm": 0.7432882189750671,
"learning_rate": 2.0324076905526012e-05,
"loss": 0.1431,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1364685595035553,
"step": 7285
},
{
"epoch": 3.8218029350104823,
"grad_norm": 0.60526043176651,
"learning_rate": 2.029794368955838e-05,
"loss": 0.1599,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16713152825832367,
"step": 7290
},
{
"epoch": 3.8244234800838575,
"grad_norm": 0.5748690962791443,
"learning_rate": 2.0271809964769842e-05,
"loss": 0.1496,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20099319517612457,
"step": 7295
},
{
"epoch": 3.8270440251572326,
"grad_norm": 0.6210798025131226,
"learning_rate": 2.0245675775790934e-05,
"loss": 0.1686,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17095521092414856,
"step": 7300
},
{
"epoch": 3.829664570230608,
"grad_norm": 0.6181237101554871,
"learning_rate": 2.0219541167252968e-05,
"loss": 0.1476,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13480696082115173,
"step": 7305
},
{
"epoch": 3.832285115303983,
"grad_norm": 0.7305977940559387,
"learning_rate": 2.0193406183788e-05,
"loss": 0.1478,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1747744083404541,
"step": 7310
},
{
"epoch": 3.8349056603773586,
"grad_norm": 0.6723249554634094,
"learning_rate": 2.01672708700287e-05,
"loss": 0.1605,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17026358842849731,
"step": 7315
},
{
"epoch": 3.8375262054507338,
"grad_norm": 0.5861697793006897,
"learning_rate": 2.0141135270608326e-05,
"loss": 0.1451,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17438700795173645,
"step": 7320
},
{
"epoch": 3.840146750524109,
"grad_norm": 0.5344928503036499,
"learning_rate": 2.0114999430160607e-05,
"loss": 0.1534,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1679978370666504,
"step": 7325
},
{
"epoch": 3.8427672955974845,
"grad_norm": 0.6318730115890503,
"learning_rate": 2.0088863393319684e-05,
"loss": 0.1438,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19917961955070496,
"step": 7330
},
{
"epoch": 3.8453878406708597,
"grad_norm": 0.7025963664054871,
"learning_rate": 2.006272720472005e-05,
"loss": 0.169,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16618266701698303,
"step": 7335
},
{
"epoch": 3.848008385744235,
"grad_norm": 0.6110601425170898,
"learning_rate": 2.0036590908996433e-05,
"loss": 0.1616,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19544419646263123,
"step": 7340
},
{
"epoch": 3.85062893081761,
"grad_norm": 0.6725945472717285,
"learning_rate": 2.001045455078376e-05,
"loss": 0.1497,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14471367001533508,
"step": 7345
},
{
"epoch": 3.853249475890985,
"grad_norm": 0.7329033017158508,
"learning_rate": 1.9984318174717063e-05,
"loss": 0.1416,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1296122521162033,
"step": 7350
},
{
"epoch": 3.8558700209643604,
"grad_norm": 0.6029754281044006,
"learning_rate": 1.9958181825431408e-05,
"loss": 0.1799,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17006736993789673,
"step": 7355
},
{
"epoch": 3.858490566037736,
"grad_norm": 0.6240707039833069,
"learning_rate": 1.9932045547561794e-05,
"loss": 0.1568,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14021959900856018,
"step": 7360
},
{
"epoch": 3.861111111111111,
"grad_norm": 0.6396682858467102,
"learning_rate": 1.9905909385743127e-05,
"loss": 0.1522,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.136474609375,
"step": 7365
},
{
"epoch": 3.8637316561844863,
"grad_norm": 0.6918826103210449,
"learning_rate": 1.9879773384610097e-05,
"loss": 0.1446,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10507458448410034,
"step": 7370
},
{
"epoch": 3.8663522012578615,
"grad_norm": 0.6699333786964417,
"learning_rate": 1.985363758879713e-05,
"loss": 0.1695,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1435866504907608,
"step": 7375
},
{
"epoch": 3.868972746331237,
"grad_norm": 0.588420033454895,
"learning_rate": 1.9827502042938287e-05,
"loss": 0.1576,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14458289742469788,
"step": 7380
},
{
"epoch": 3.8715932914046123,
"grad_norm": 0.7174352407455444,
"learning_rate": 1.9801366791667208e-05,
"loss": 0.1492,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14362692832946777,
"step": 7385
},
{
"epoch": 3.8742138364779874,
"grad_norm": 0.6377548575401306,
"learning_rate": 1.9775231879617046e-05,
"loss": 0.1611,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17670518159866333,
"step": 7390
},
{
"epoch": 3.8768343815513626,
"grad_norm": 0.6253235936164856,
"learning_rate": 1.9749097351420352e-05,
"loss": 0.1663,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1592959463596344,
"step": 7395
},
{
"epoch": 3.879454926624738,
"grad_norm": 0.5513318777084351,
"learning_rate": 1.9722963251709033e-05,
"loss": 0.1468,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12524718046188354,
"step": 7400
},
{
"epoch": 3.882075471698113,
"grad_norm": 0.6378021240234375,
"learning_rate": 1.9696829625114262e-05,
"loss": 0.1651,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17486140131950378,
"step": 7405
},
{
"epoch": 3.8846960167714886,
"grad_norm": 0.6169103384017944,
"learning_rate": 1.9670696516266402e-05,
"loss": 0.1503,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1326904296875,
"step": 7410
},
{
"epoch": 3.8873165618448637,
"grad_norm": 0.6443046927452087,
"learning_rate": 1.9644563969794937e-05,
"loss": 0.1561,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17288607358932495,
"step": 7415
},
{
"epoch": 3.889937106918239,
"grad_norm": 0.7150650024414062,
"learning_rate": 1.961843203032838e-05,
"loss": 0.1621,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17397406697273254,
"step": 7420
},
{
"epoch": 3.8925576519916145,
"grad_norm": 1.0315990447998047,
"learning_rate": 1.9592300742494227e-05,
"loss": 0.1583,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16239425539970398,
"step": 7425
},
{
"epoch": 3.8951781970649897,
"grad_norm": 0.8368551731109619,
"learning_rate": 1.9566170150918842e-05,
"loss": 0.1383,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1172657310962677,
"step": 7430
},
{
"epoch": 3.897798742138365,
"grad_norm": 0.6009153127670288,
"learning_rate": 1.95400403002274e-05,
"loss": 0.1507,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1353941410779953,
"step": 7435
},
{
"epoch": 3.90041928721174,
"grad_norm": 0.5876301527023315,
"learning_rate": 1.9513911235043833e-05,
"loss": 0.1464,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1405186504125595,
"step": 7440
},
{
"epoch": 3.903039832285115,
"grad_norm": 0.6061398386955261,
"learning_rate": 1.9487782999990707e-05,
"loss": 0.1625,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1707381010055542,
"step": 7445
},
{
"epoch": 3.9056603773584904,
"grad_norm": 0.682398796081543,
"learning_rate": 1.9461655639689176e-05,
"loss": 0.1542,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10355198383331299,
"step": 7450
},
{
"epoch": 3.908280922431866,
"grad_norm": 0.5958063006401062,
"learning_rate": 1.943552919875891e-05,
"loss": 0.1439,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14010480046272278,
"step": 7455
},
{
"epoch": 3.910901467505241,
"grad_norm": 0.6603404879570007,
"learning_rate": 1.9409403721817997e-05,
"loss": 0.1493,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1476980298757553,
"step": 7460
},
{
"epoch": 3.9135220125786163,
"grad_norm": 0.6378297805786133,
"learning_rate": 1.938327925348289e-05,
"loss": 0.15,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19074776768684387,
"step": 7465
},
{
"epoch": 3.9161425576519915,
"grad_norm": 0.531608521938324,
"learning_rate": 1.9357155838368314e-05,
"loss": 0.1593,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1572052240371704,
"step": 7470
},
{
"epoch": 3.918763102725367,
"grad_norm": 0.619661271572113,
"learning_rate": 1.9331033521087187e-05,
"loss": 0.1634,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17789630591869354,
"step": 7475
},
{
"epoch": 3.9213836477987423,
"grad_norm": 0.6324253678321838,
"learning_rate": 1.9304912346250567e-05,
"loss": 0.152,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15719234943389893,
"step": 7480
},
{
"epoch": 3.9240041928721174,
"grad_norm": 0.6691383123397827,
"learning_rate": 1.9278792358467552e-05,
"loss": 0.1669,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17446158826351166,
"step": 7485
},
{
"epoch": 3.9266247379454926,
"grad_norm": 0.6252411007881165,
"learning_rate": 1.925267360234522e-05,
"loss": 0.1534,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1358642578125,
"step": 7490
},
{
"epoch": 3.9292452830188678,
"grad_norm": 0.5381175875663757,
"learning_rate": 1.9226556122488533e-05,
"loss": 0.1516,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1207832396030426,
"step": 7495
},
{
"epoch": 3.931865828092243,
"grad_norm": 0.5609357953071594,
"learning_rate": 1.9200439963500282e-05,
"loss": 0.1411,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1350494772195816,
"step": 7500
},
{
"epoch": 3.9344863731656186,
"grad_norm": 0.6950834393501282,
"learning_rate": 1.917432516998101e-05,
"loss": 0.1378,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14841407537460327,
"step": 7505
},
{
"epoch": 3.9371069182389937,
"grad_norm": 0.6106055378913879,
"learning_rate": 1.9148211786528904e-05,
"loss": 0.1737,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.2322182059288025,
"step": 7510
},
{
"epoch": 3.939727463312369,
"grad_norm": 0.641797661781311,
"learning_rate": 1.912209985773977e-05,
"loss": 0.1572,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16424429416656494,
"step": 7515
},
{
"epoch": 3.9423480083857445,
"grad_norm": 0.7005577087402344,
"learning_rate": 1.9095989428206917e-05,
"loss": 0.1415,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1828462779521942,
"step": 7520
},
{
"epoch": 3.9449685534591197,
"grad_norm": 0.7762574553489685,
"learning_rate": 1.906988054252109e-05,
"loss": 0.1454,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11207491159439087,
"step": 7525
},
{
"epoch": 3.947589098532495,
"grad_norm": 0.6599898934364319,
"learning_rate": 1.9043773245270406e-05,
"loss": 0.1456,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1344425529241562,
"step": 7530
},
{
"epoch": 3.95020964360587,
"grad_norm": 0.5022367238998413,
"learning_rate": 1.9017667581040264e-05,
"loss": 0.1435,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11625401675701141,
"step": 7535
},
{
"epoch": 3.952830188679245,
"grad_norm": 0.687852680683136,
"learning_rate": 1.8991563594413274e-05,
"loss": 0.15,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16913428902626038,
"step": 7540
},
{
"epoch": 3.9554507337526204,
"grad_norm": 0.636171817779541,
"learning_rate": 1.8965461329969186e-05,
"loss": 0.1457,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14577579498291016,
"step": 7545
},
{
"epoch": 3.958071278825996,
"grad_norm": 0.7088593244552612,
"learning_rate": 1.89393608322848e-05,
"loss": 0.1567,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18438458442687988,
"step": 7550
},
{
"epoch": 3.960691823899371,
"grad_norm": 0.8053141236305237,
"learning_rate": 1.891326214593391e-05,
"loss": 0.1409,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1474609375,
"step": 7555
},
{
"epoch": 3.9633123689727463,
"grad_norm": 0.5420475006103516,
"learning_rate": 1.888716531548721e-05,
"loss": 0.1469,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1517164409160614,
"step": 7560
},
{
"epoch": 3.9659329140461215,
"grad_norm": 0.6235396265983582,
"learning_rate": 1.8861070385512222e-05,
"loss": 0.1712,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15240763127803802,
"step": 7565
},
{
"epoch": 3.968553459119497,
"grad_norm": 0.6375654339790344,
"learning_rate": 1.883497740057323e-05,
"loss": 0.1439,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14083002507686615,
"step": 7570
},
{
"epoch": 3.9711740041928723,
"grad_norm": 0.6710426807403564,
"learning_rate": 1.8808886405231184e-05,
"loss": 0.1404,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15685389935970306,
"step": 7575
},
{
"epoch": 3.9737945492662474,
"grad_norm": 0.7832931876182556,
"learning_rate": 1.8782797444043657e-05,
"loss": 0.155,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18243786692619324,
"step": 7580
},
{
"epoch": 3.9764150943396226,
"grad_norm": 0.5275224447250366,
"learning_rate": 1.8756710561564728e-05,
"loss": 0.1409,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09319180250167847,
"step": 7585
},
{
"epoch": 3.9790356394129978,
"grad_norm": 0.7512395977973938,
"learning_rate": 1.8730625802344927e-05,
"loss": 0.1651,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16206032037734985,
"step": 7590
},
{
"epoch": 3.981656184486373,
"grad_norm": 0.7400315999984741,
"learning_rate": 1.870454321093118e-05,
"loss": 0.1516,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1293114572763443,
"step": 7595
},
{
"epoch": 3.9842767295597485,
"grad_norm": 0.6416282057762146,
"learning_rate": 1.8678462831866684e-05,
"loss": 0.1424,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12756970524787903,
"step": 7600
},
{
"epoch": 3.9868972746331237,
"grad_norm": 1.3672630786895752,
"learning_rate": 1.8652384709690875e-05,
"loss": 0.1557,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1692837029695511,
"step": 7605
},
{
"epoch": 3.989517819706499,
"grad_norm": 0.670573890209198,
"learning_rate": 1.8626308888939323e-05,
"loss": 0.1431,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1169547587633133,
"step": 7610
},
{
"epoch": 3.992138364779874,
"grad_norm": 0.7186034321784973,
"learning_rate": 1.8600235414143676e-05,
"loss": 0.1531,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15219153463840485,
"step": 7615
},
{
"epoch": 3.9947589098532497,
"grad_norm": 0.5585160851478577,
"learning_rate": 1.8574164329831578e-05,
"loss": 0.1543,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18344330787658691,
"step": 7620
},
{
"epoch": 3.997379454926625,
"grad_norm": 0.6685236692428589,
"learning_rate": 1.8548095680526577e-05,
"loss": 0.1516,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18843969702720642,
"step": 7625
},
{
"epoch": 4.000524109014675,
"grad_norm": 0.5740066170692444,
"learning_rate": 1.852202951074808e-05,
"loss": 0.158,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14020760357379913,
"step": 7630
},
{
"epoch": 4.00314465408805,
"grad_norm": 0.6754254102706909,
"learning_rate": 1.8495965865011247e-05,
"loss": 0.1491,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1560601145029068,
"step": 7635
},
{
"epoch": 4.005765199161425,
"grad_norm": 0.7221053838729858,
"learning_rate": 1.8469904787826928e-05,
"loss": 0.1332,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.137377068400383,
"step": 7640
},
{
"epoch": 4.0083857442348005,
"grad_norm": 0.5754213929176331,
"learning_rate": 1.8443846323701596e-05,
"loss": 0.1319,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1294102668762207,
"step": 7645
},
{
"epoch": 4.011006289308176,
"grad_norm": 0.7324739694595337,
"learning_rate": 1.841779051713725e-05,
"loss": 0.1341,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10472945868968964,
"step": 7650
},
{
"epoch": 4.013626834381552,
"grad_norm": 0.6781483888626099,
"learning_rate": 1.839173741263136e-05,
"loss": 0.1378,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11547206342220306,
"step": 7655
},
{
"epoch": 4.016247379454927,
"grad_norm": 0.7400633692741394,
"learning_rate": 1.836568705467678e-05,
"loss": 0.1368,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18429158627986908,
"step": 7660
},
{
"epoch": 4.018867924528302,
"grad_norm": 0.7450043559074402,
"learning_rate": 1.8339639487761663e-05,
"loss": 0.1047,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11984512209892273,
"step": 7665
},
{
"epoch": 4.021488469601677,
"grad_norm": 0.570132851600647,
"learning_rate": 1.8313594756369407e-05,
"loss": 0.1428,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.157229483127594,
"step": 7670
},
{
"epoch": 4.024109014675052,
"grad_norm": 0.6760542988777161,
"learning_rate": 1.8287552904978566e-05,
"loss": 0.127,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1246337890625,
"step": 7675
},
{
"epoch": 4.026729559748428,
"grad_norm": 0.833230197429657,
"learning_rate": 1.8261513978062768e-05,
"loss": 0.1374,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12040312588214874,
"step": 7680
},
{
"epoch": 4.029350104821803,
"grad_norm": 0.6724702715873718,
"learning_rate": 1.8235478020090658e-05,
"loss": 0.1364,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13720819354057312,
"step": 7685
},
{
"epoch": 4.031970649895178,
"grad_norm": 0.6574586033821106,
"learning_rate": 1.82094450755258e-05,
"loss": 0.1323,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1236087828874588,
"step": 7690
},
{
"epoch": 4.034591194968553,
"grad_norm": 0.6379922032356262,
"learning_rate": 1.8183415188826623e-05,
"loss": 0.1386,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15040497481822968,
"step": 7695
},
{
"epoch": 4.037211740041928,
"grad_norm": 0.7528687715530396,
"learning_rate": 1.8157388404446324e-05,
"loss": 0.1287,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11081687361001968,
"step": 7700
},
{
"epoch": 4.039832285115304,
"grad_norm": 0.6273317337036133,
"learning_rate": 1.81313647668328e-05,
"loss": 0.1485,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15230341255664825,
"step": 7705
},
{
"epoch": 4.0424528301886795,
"grad_norm": 0.7650076150894165,
"learning_rate": 1.810534432042859e-05,
"loss": 0.1303,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13308845460414886,
"step": 7710
},
{
"epoch": 4.045073375262055,
"grad_norm": 0.8710198998451233,
"learning_rate": 1.8079327109670762e-05,
"loss": 0.1127,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10908554494380951,
"step": 7715
},
{
"epoch": 4.04769392033543,
"grad_norm": 0.6835393905639648,
"learning_rate": 1.805331317899088e-05,
"loss": 0.1233,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14658352732658386,
"step": 7720
},
{
"epoch": 4.050314465408805,
"grad_norm": 0.6922931671142578,
"learning_rate": 1.802730257281489e-05,
"loss": 0.1291,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15485787391662598,
"step": 7725
},
{
"epoch": 4.05293501048218,
"grad_norm": 0.6863040328025818,
"learning_rate": 1.800129533556306e-05,
"loss": 0.1353,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11109860241413116,
"step": 7730
},
{
"epoch": 4.055555555555555,
"grad_norm": 0.7296552062034607,
"learning_rate": 1.797529151164992e-05,
"loss": 0.1286,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12409099191427231,
"step": 7735
},
{
"epoch": 4.0581761006289305,
"grad_norm": 0.6822858452796936,
"learning_rate": 1.7949291145484153e-05,
"loss": 0.1251,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09227552264928818,
"step": 7740
},
{
"epoch": 4.060796645702306,
"grad_norm": 0.6392486095428467,
"learning_rate": 1.7923294281468552e-05,
"loss": 0.1305,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10040283203125,
"step": 7745
},
{
"epoch": 4.063417190775682,
"grad_norm": 0.5952988862991333,
"learning_rate": 1.789730096399992e-05,
"loss": 0.1339,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15841393172740936,
"step": 7750
},
{
"epoch": 4.066037735849057,
"grad_norm": 0.6478767991065979,
"learning_rate": 1.7871311237468997e-05,
"loss": 0.1508,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17274433374404907,
"step": 7755
},
{
"epoch": 4.068658280922432,
"grad_norm": 0.7296462059020996,
"learning_rate": 1.7845325146260416e-05,
"loss": 0.1223,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11344051361083984,
"step": 7760
},
{
"epoch": 4.071278825995807,
"grad_norm": 0.7088618874549866,
"learning_rate": 1.7819342734752573e-05,
"loss": 0.1368,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13282841444015503,
"step": 7765
},
{
"epoch": 4.073899371069182,
"grad_norm": 0.6578029990196228,
"learning_rate": 1.7793364047317588e-05,
"loss": 0.1177,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1344364732503891,
"step": 7770
},
{
"epoch": 4.076519916142558,
"grad_norm": 0.6128595471382141,
"learning_rate": 1.7767389128321235e-05,
"loss": 0.1414,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13483229279518127,
"step": 7775
},
{
"epoch": 4.079140461215933,
"grad_norm": 0.7561476826667786,
"learning_rate": 1.7741418022122835e-05,
"loss": 0.1144,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1260986328125,
"step": 7780
},
{
"epoch": 4.081761006289308,
"grad_norm": 0.8561640381813049,
"learning_rate": 1.771545077307521e-05,
"loss": 0.1358,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14445775747299194,
"step": 7785
},
{
"epoch": 4.084381551362683,
"grad_norm": 0.6901652216911316,
"learning_rate": 1.7689487425524587e-05,
"loss": 0.1344,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.105224609375,
"step": 7790
},
{
"epoch": 4.087002096436058,
"grad_norm": 0.6184242963790894,
"learning_rate": 1.7663528023810528e-05,
"loss": 0.1376,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16451911628246307,
"step": 7795
},
{
"epoch": 4.089622641509434,
"grad_norm": 0.5835419297218323,
"learning_rate": 1.763757261226587e-05,
"loss": 0.1423,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12152544409036636,
"step": 7800
},
{
"epoch": 4.0922431865828095,
"grad_norm": 0.7047265768051147,
"learning_rate": 1.7611621235216614e-05,
"loss": 0.1317,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15300840139389038,
"step": 7805
},
{
"epoch": 4.094863731656185,
"grad_norm": 0.7085900902748108,
"learning_rate": 1.7585673936981903e-05,
"loss": 0.1267,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13223685324192047,
"step": 7810
},
{
"epoch": 4.09748427672956,
"grad_norm": 0.7397329211235046,
"learning_rate": 1.755973076187388e-05,
"loss": 0.1283,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11137672513723373,
"step": 7815
},
{
"epoch": 4.100104821802935,
"grad_norm": 0.7091107964515686,
"learning_rate": 1.753379175419766e-05,
"loss": 0.1518,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14884835481643677,
"step": 7820
},
{
"epoch": 4.10272536687631,
"grad_norm": 0.8747248649597168,
"learning_rate": 1.750785695825125e-05,
"loss": 0.1462,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14700660109519958,
"step": 7825
},
{
"epoch": 4.105345911949685,
"grad_norm": 0.7035658955574036,
"learning_rate": 1.7481926418325453e-05,
"loss": 0.1209,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1455896496772766,
"step": 7830
},
{
"epoch": 4.1079664570230605,
"grad_norm": 0.6393303275108337,
"learning_rate": 1.7456000178703816e-05,
"loss": 0.1478,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12629064917564392,
"step": 7835
},
{
"epoch": 4.110587002096436,
"grad_norm": 0.6895425915718079,
"learning_rate": 1.7430078283662522e-05,
"loss": 0.1489,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14216358959674835,
"step": 7840
},
{
"epoch": 4.113207547169812,
"grad_norm": 0.7577569484710693,
"learning_rate": 1.7404160777470352e-05,
"loss": 0.1231,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11018848419189453,
"step": 7845
},
{
"epoch": 4.115828092243187,
"grad_norm": 0.7650940418243408,
"learning_rate": 1.7378247704388585e-05,
"loss": 0.1098,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11372706294059753,
"step": 7850
},
{
"epoch": 4.118448637316562,
"grad_norm": 0.5672314763069153,
"learning_rate": 1.7352339108670925e-05,
"loss": 0.1268,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12074200809001923,
"step": 7855
},
{
"epoch": 4.121069182389937,
"grad_norm": 0.6222376227378845,
"learning_rate": 1.7326435034563447e-05,
"loss": 0.1243,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1330033838748932,
"step": 7860
},
{
"epoch": 4.123689727463312,
"grad_norm": 0.6126108169555664,
"learning_rate": 1.730053552630448e-05,
"loss": 0.131,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12257729470729828,
"step": 7865
},
{
"epoch": 4.126310272536688,
"grad_norm": 0.7355426549911499,
"learning_rate": 1.727464062812457e-05,
"loss": 0.1332,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11887337267398834,
"step": 7870
},
{
"epoch": 4.128930817610063,
"grad_norm": 0.653061032295227,
"learning_rate": 1.7248750384246396e-05,
"loss": 0.1411,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14558589458465576,
"step": 7875
},
{
"epoch": 4.131551362683438,
"grad_norm": 0.6871939897537231,
"learning_rate": 1.7222864838884672e-05,
"loss": 0.1166,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.103271484375,
"step": 7880
},
{
"epoch": 4.134171907756813,
"grad_norm": 0.6735332608222961,
"learning_rate": 1.7196984036246093e-05,
"loss": 0.1372,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1126708984375,
"step": 7885
},
{
"epoch": 4.136792452830188,
"grad_norm": 0.7757828831672668,
"learning_rate": 1.7171108020529267e-05,
"loss": 0.1272,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09894275665283203,
"step": 7890
},
{
"epoch": 4.139412997903564,
"grad_norm": 0.658591091632843,
"learning_rate": 1.7145236835924603e-05,
"loss": 0.1348,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18379972875118256,
"step": 7895
},
{
"epoch": 4.1420335429769395,
"grad_norm": 0.6948657631874084,
"learning_rate": 1.711937052661429e-05,
"loss": 0.131,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15797369182109833,
"step": 7900
},
{
"epoch": 4.144654088050315,
"grad_norm": 0.670413613319397,
"learning_rate": 1.709350913677217e-05,
"loss": 0.1251,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13112309575080872,
"step": 7905
},
{
"epoch": 4.14727463312369,
"grad_norm": 0.8161856532096863,
"learning_rate": 1.7067652710563682e-05,
"loss": 0.1369,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15099668502807617,
"step": 7910
},
{
"epoch": 4.149895178197065,
"grad_norm": 0.557365894317627,
"learning_rate": 1.7041801292145807e-05,
"loss": 0.1201,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1425701081752777,
"step": 7915
},
{
"epoch": 4.15251572327044,
"grad_norm": 0.69334876537323,
"learning_rate": 1.7015954925666945e-05,
"loss": 0.1261,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.149521604180336,
"step": 7920
},
{
"epoch": 4.155136268343815,
"grad_norm": 0.6496643424034119,
"learning_rate": 1.69901136552669e-05,
"loss": 0.1308,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1431989073753357,
"step": 7925
},
{
"epoch": 4.1577568134171905,
"grad_norm": 0.7024996876716614,
"learning_rate": 1.6964277525076757e-05,
"loss": 0.1239,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0966796875,
"step": 7930
},
{
"epoch": 4.160377358490566,
"grad_norm": 0.5853867530822754,
"learning_rate": 1.6938446579218815e-05,
"loss": 0.1344,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12605339288711548,
"step": 7935
},
{
"epoch": 4.162997903563942,
"grad_norm": 0.5790950059890747,
"learning_rate": 1.6912620861806536e-05,
"loss": 0.1245,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12532863020896912,
"step": 7940
},
{
"epoch": 4.165618448637317,
"grad_norm": 0.6582181453704834,
"learning_rate": 1.688680041694444e-05,
"loss": 0.1588,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1869627833366394,
"step": 7945
},
{
"epoch": 4.168238993710692,
"grad_norm": 0.7480989098548889,
"learning_rate": 1.6860985288728052e-05,
"loss": 0.1326,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1243896484375,
"step": 7950
},
{
"epoch": 4.170859538784067,
"grad_norm": 0.7857028245925903,
"learning_rate": 1.683517552124381e-05,
"loss": 0.1315,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13845235109329224,
"step": 7955
},
{
"epoch": 4.173480083857442,
"grad_norm": 0.7153283357620239,
"learning_rate": 1.6809371158569002e-05,
"loss": 0.1169,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10776656866073608,
"step": 7960
},
{
"epoch": 4.176100628930818,
"grad_norm": 0.6991490125656128,
"learning_rate": 1.678357224477169e-05,
"loss": 0.1456,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15616081655025482,
"step": 7965
},
{
"epoch": 4.178721174004193,
"grad_norm": 0.6320271492004395,
"learning_rate": 1.6757778823910612e-05,
"loss": 0.1542,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15206626057624817,
"step": 7970
},
{
"epoch": 4.181341719077568,
"grad_norm": 0.6700145602226257,
"learning_rate": 1.673199094003515e-05,
"loss": 0.1251,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.111328125,
"step": 7975
},
{
"epoch": 4.183962264150943,
"grad_norm": 0.7927061915397644,
"learning_rate": 1.670620863718521e-05,
"loss": 0.1168,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.114990234375,
"step": 7980
},
{
"epoch": 4.186582809224318,
"grad_norm": 0.6464593410491943,
"learning_rate": 1.668043195939118e-05,
"loss": 0.1485,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11941881477832794,
"step": 7985
},
{
"epoch": 4.189203354297694,
"grad_norm": 0.6977652311325073,
"learning_rate": 1.6654660950673834e-05,
"loss": 0.1219,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10890907049179077,
"step": 7990
},
{
"epoch": 4.1918238993710695,
"grad_norm": 0.7073503732681274,
"learning_rate": 1.6628895655044272e-05,
"loss": 0.1468,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15555106103420258,
"step": 7995
},
{
"epoch": 4.194444444444445,
"grad_norm": 0.6783562302589417,
"learning_rate": 1.660313611650382e-05,
"loss": 0.1229,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13440021872520447,
"step": 8000
},
{
"epoch": 4.19706498951782,
"grad_norm": 0.6996365189552307,
"learning_rate": 1.6577382379043997e-05,
"loss": 0.1238,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16365362703800201,
"step": 8005
},
{
"epoch": 4.199685534591195,
"grad_norm": 0.7659716010093689,
"learning_rate": 1.6551634486646394e-05,
"loss": 0.1161,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11299479007720947,
"step": 8010
},
{
"epoch": 4.20230607966457,
"grad_norm": 0.7514837980270386,
"learning_rate": 1.652589248328264e-05,
"loss": 0.127,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13047926127910614,
"step": 8015
},
{
"epoch": 4.204926624737945,
"grad_norm": 0.8097921013832092,
"learning_rate": 1.6500156412914286e-05,
"loss": 0.1252,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12737253308296204,
"step": 8020
},
{
"epoch": 4.2075471698113205,
"grad_norm": 0.9040654301643372,
"learning_rate": 1.6474426319492758e-05,
"loss": 0.135,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13547413051128387,
"step": 8025
},
{
"epoch": 4.210167714884696,
"grad_norm": 0.6679673790931702,
"learning_rate": 1.6448702246959286e-05,
"loss": 0.1303,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11428596079349518,
"step": 8030
},
{
"epoch": 4.212788259958071,
"grad_norm": 0.6575073599815369,
"learning_rate": 1.6422984239244803e-05,
"loss": 0.1332,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09771472215652466,
"step": 8035
},
{
"epoch": 4.215408805031447,
"grad_norm": 0.8515417575836182,
"learning_rate": 1.6397272340269892e-05,
"loss": 0.1305,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0955810546875,
"step": 8040
},
{
"epoch": 4.218029350104822,
"grad_norm": 0.6153324842453003,
"learning_rate": 1.63715665939447e-05,
"loss": 0.1209,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15846960246562958,
"step": 8045
},
{
"epoch": 4.220649895178197,
"grad_norm": 0.7781662344932556,
"learning_rate": 1.6345867044168867e-05,
"loss": 0.1344,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1522197723388672,
"step": 8050
},
{
"epoch": 4.223270440251572,
"grad_norm": 0.7323370575904846,
"learning_rate": 1.6320173734831463e-05,
"loss": 0.151,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14353422820568085,
"step": 8055
},
{
"epoch": 4.225890985324948,
"grad_norm": 0.6446478962898254,
"learning_rate": 1.6294486709810875e-05,
"loss": 0.1264,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11535608768463135,
"step": 8060
},
{
"epoch": 4.228511530398323,
"grad_norm": 0.8349292874336243,
"learning_rate": 1.6268806012974785e-05,
"loss": 0.1177,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09714643657207489,
"step": 8065
},
{
"epoch": 4.231132075471698,
"grad_norm": 0.6118739247322083,
"learning_rate": 1.6243131688180048e-05,
"loss": 0.1475,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11294543743133545,
"step": 8070
},
{
"epoch": 4.233752620545073,
"grad_norm": 0.790520966053009,
"learning_rate": 1.6217463779272647e-05,
"loss": 0.1275,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0958251953125,
"step": 8075
},
{
"epoch": 4.236373165618448,
"grad_norm": 0.7248908281326294,
"learning_rate": 1.6191802330087606e-05,
"loss": 0.1226,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1535649597644806,
"step": 8080
},
{
"epoch": 4.238993710691824,
"grad_norm": 0.7249651551246643,
"learning_rate": 1.6166147384448915e-05,
"loss": 0.1464,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13027583062648773,
"step": 8085
},
{
"epoch": 4.2416142557651995,
"grad_norm": 0.8868975639343262,
"learning_rate": 1.614049898616947e-05,
"loss": 0.1238,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0872802734375,
"step": 8090
},
{
"epoch": 4.244234800838575,
"grad_norm": 0.6978241801261902,
"learning_rate": 1.611485717905096e-05,
"loss": 0.1422,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1411801278591156,
"step": 8095
},
{
"epoch": 4.24685534591195,
"grad_norm": 0.6798672676086426,
"learning_rate": 1.6089222006883835e-05,
"loss": 0.1306,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10818228125572205,
"step": 8100
},
{
"epoch": 4.249475890985325,
"grad_norm": 0.6767073273658752,
"learning_rate": 1.6063593513447223e-05,
"loss": 0.1477,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17610739171504974,
"step": 8105
},
{
"epoch": 4.2520964360587,
"grad_norm": 0.6395707726478577,
"learning_rate": 1.6037971742508826e-05,
"loss": 0.1433,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1640520840883255,
"step": 8110
},
{
"epoch": 4.254716981132075,
"grad_norm": 0.7487215399742126,
"learning_rate": 1.6012356737824873e-05,
"loss": 0.1339,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11843766272068024,
"step": 8115
},
{
"epoch": 4.2573375262054505,
"grad_norm": 0.698691725730896,
"learning_rate": 1.598674854314005e-05,
"loss": 0.1273,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13690641522407532,
"step": 8120
},
{
"epoch": 4.259958071278826,
"grad_norm": 0.6861567497253418,
"learning_rate": 1.5961147202187385e-05,
"loss": 0.1326,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14201557636260986,
"step": 8125
},
{
"epoch": 4.262578616352201,
"grad_norm": 0.639648973941803,
"learning_rate": 1.5935552758688237e-05,
"loss": 0.1236,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14524777233600616,
"step": 8130
},
{
"epoch": 4.265199161425577,
"grad_norm": 0.7146438360214233,
"learning_rate": 1.5909965256352156e-05,
"loss": 0.1466,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1595865935087204,
"step": 8135
},
{
"epoch": 4.267819706498952,
"grad_norm": 0.6025714874267578,
"learning_rate": 1.588438473887685e-05,
"loss": 0.1268,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12222793698310852,
"step": 8140
},
{
"epoch": 4.270440251572327,
"grad_norm": 0.6809830665588379,
"learning_rate": 1.5858811249948104e-05,
"loss": 0.1303,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1384069174528122,
"step": 8145
},
{
"epoch": 4.273060796645702,
"grad_norm": 0.8114305734634399,
"learning_rate": 1.5833244833239686e-05,
"loss": 0.1554,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15316900610923767,
"step": 8150
},
{
"epoch": 4.2756813417190775,
"grad_norm": 0.6200181841850281,
"learning_rate": 1.58076855324133e-05,
"loss": 0.1261,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14065788686275482,
"step": 8155
},
{
"epoch": 4.278301886792453,
"grad_norm": 0.8265540599822998,
"learning_rate": 1.578213339111849e-05,
"loss": 0.1271,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1082763671875,
"step": 8160
},
{
"epoch": 4.280922431865828,
"grad_norm": 0.6733942627906799,
"learning_rate": 1.575658845299257e-05,
"loss": 0.1283,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13398826122283936,
"step": 8165
},
{
"epoch": 4.283542976939203,
"grad_norm": 0.5992264151573181,
"learning_rate": 1.5731050761660563e-05,
"loss": 0.1376,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16435667872428894,
"step": 8170
},
{
"epoch": 4.286163522012578,
"grad_norm": 0.6579468846321106,
"learning_rate": 1.570552036073511e-05,
"loss": 0.1274,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13947761058807373,
"step": 8175
},
{
"epoch": 4.288784067085954,
"grad_norm": 0.689167857170105,
"learning_rate": 1.5679997293816397e-05,
"loss": 0.1327,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.119140625,
"step": 8180
},
{
"epoch": 4.2914046121593294,
"grad_norm": 0.6589269042015076,
"learning_rate": 1.56544816044921e-05,
"loss": 0.1325,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1422399878501892,
"step": 8185
},
{
"epoch": 4.294025157232705,
"grad_norm": 0.8908357620239258,
"learning_rate": 1.5628973336337273e-05,
"loss": 0.1415,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12804053723812103,
"step": 8190
},
{
"epoch": 4.29664570230608,
"grad_norm": 0.7792320847511292,
"learning_rate": 1.560347253291432e-05,
"loss": 0.1274,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1163330078125,
"step": 8195
},
{
"epoch": 4.299266247379455,
"grad_norm": 0.6766254305839539,
"learning_rate": 1.557797923777288e-05,
"loss": 0.1242,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15499071776866913,
"step": 8200
},
{
"epoch": 4.30188679245283,
"grad_norm": 0.7019739747047424,
"learning_rate": 1.5552493494449775e-05,
"loss": 0.1366,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1348932981491089,
"step": 8205
},
{
"epoch": 4.304507337526205,
"grad_norm": 0.8038340210914612,
"learning_rate": 1.552701534646894e-05,
"loss": 0.1193,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11774717271327972,
"step": 8210
},
{
"epoch": 4.3071278825995805,
"grad_norm": 0.7018369436264038,
"learning_rate": 1.5501544837341316e-05,
"loss": 0.1406,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14359250664710999,
"step": 8215
},
{
"epoch": 4.309748427672956,
"grad_norm": 0.5905582308769226,
"learning_rate": 1.5476082010564825e-05,
"loss": 0.1418,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14024695754051208,
"step": 8220
},
{
"epoch": 4.312368972746331,
"grad_norm": 0.6606282591819763,
"learning_rate": 1.545062690962425e-05,
"loss": 0.1323,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13133929669857025,
"step": 8225
},
{
"epoch": 4.314989517819707,
"grad_norm": 0.6196896433830261,
"learning_rate": 1.5425179577991182e-05,
"loss": 0.1191,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12420549988746643,
"step": 8230
},
{
"epoch": 4.317610062893082,
"grad_norm": 0.7207083106040955,
"learning_rate": 1.539974005912396e-05,
"loss": 0.1318,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12050847709178925,
"step": 8235
},
{
"epoch": 4.320230607966457,
"grad_norm": 0.6700676679611206,
"learning_rate": 1.5374308396467555e-05,
"loss": 0.1432,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13546261191368103,
"step": 8240
},
{
"epoch": 4.322851153039832,
"grad_norm": 0.7328806519508362,
"learning_rate": 1.534888463345355e-05,
"loss": 0.1341,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1011962890625,
"step": 8245
},
{
"epoch": 4.3254716981132075,
"grad_norm": 0.6616674065589905,
"learning_rate": 1.5323468813500016e-05,
"loss": 0.1167,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13191799819469452,
"step": 8250
},
{
"epoch": 4.328092243186583,
"grad_norm": 0.9578312039375305,
"learning_rate": 1.529806098001146e-05,
"loss": 0.1403,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1744530200958252,
"step": 8255
},
{
"epoch": 4.330712788259958,
"grad_norm": 0.591022253036499,
"learning_rate": 1.5272661176378765e-05,
"loss": 0.1331,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14783424139022827,
"step": 8260
},
{
"epoch": 4.333333333333333,
"grad_norm": 0.7101303935050964,
"learning_rate": 1.524726944597908e-05,
"loss": 0.1455,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11623962223529816,
"step": 8265
},
{
"epoch": 4.335953878406708,
"grad_norm": 0.6866541504859924,
"learning_rate": 1.5221885832175791e-05,
"loss": 0.1284,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12282105535268784,
"step": 8270
},
{
"epoch": 4.338574423480084,
"grad_norm": 0.827079713344574,
"learning_rate": 1.51965103783184e-05,
"loss": 0.1276,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.133544921875,
"step": 8275
},
{
"epoch": 4.341194968553459,
"grad_norm": 0.7082659006118774,
"learning_rate": 1.5171143127742483e-05,
"loss": 0.1504,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13507716357707977,
"step": 8280
},
{
"epoch": 4.343815513626835,
"grad_norm": 0.7135539054870605,
"learning_rate": 1.5145784123769614e-05,
"loss": 0.1308,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11422479152679443,
"step": 8285
},
{
"epoch": 4.34643605870021,
"grad_norm": 0.6408969759941101,
"learning_rate": 1.5120433409707267e-05,
"loss": 0.1253,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13356246054172516,
"step": 8290
},
{
"epoch": 4.349056603773585,
"grad_norm": 0.7403706908226013,
"learning_rate": 1.5095091028848778e-05,
"loss": 0.1166,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11962890625,
"step": 8295
},
{
"epoch": 4.35167714884696,
"grad_norm": 0.7667035460472107,
"learning_rate": 1.506975702447324e-05,
"loss": 0.1386,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15348896384239197,
"step": 8300
},
{
"epoch": 4.354297693920335,
"grad_norm": 0.61168372631073,
"learning_rate": 1.5044431439845433e-05,
"loss": 0.1311,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1137775182723999,
"step": 8305
},
{
"epoch": 4.3569182389937104,
"grad_norm": 0.699834942817688,
"learning_rate": 1.5019114318215779e-05,
"loss": 0.138,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14548508822917938,
"step": 8310
},
{
"epoch": 4.359538784067086,
"grad_norm": 0.744339108467102,
"learning_rate": 1.4993805702820234e-05,
"loss": 0.117,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08885983377695084,
"step": 8315
},
{
"epoch": 4.362159329140461,
"grad_norm": 0.6417985558509827,
"learning_rate": 1.496850563688022e-05,
"loss": 0.1414,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1308688521385193,
"step": 8320
},
{
"epoch": 4.364779874213837,
"grad_norm": 0.7600770592689514,
"learning_rate": 1.4943214163602582e-05,
"loss": 0.1155,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11033851653337479,
"step": 8325
},
{
"epoch": 4.367400419287212,
"grad_norm": 0.7353156805038452,
"learning_rate": 1.4917931326179462e-05,
"loss": 0.1276,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15977749228477478,
"step": 8330
},
{
"epoch": 4.370020964360587,
"grad_norm": 0.7362678050994873,
"learning_rate": 1.489265716778828e-05,
"loss": 0.1316,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16832853853702545,
"step": 8335
},
{
"epoch": 4.372641509433962,
"grad_norm": 0.8924562335014343,
"learning_rate": 1.4867391731591618e-05,
"loss": 0.107,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.081787109375,
"step": 8340
},
{
"epoch": 4.3752620545073375,
"grad_norm": 0.8289515972137451,
"learning_rate": 1.4842135060737162e-05,
"loss": 0.1248,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09356689453125,
"step": 8345
},
{
"epoch": 4.377882599580713,
"grad_norm": 0.7817913293838501,
"learning_rate": 1.4816887198357642e-05,
"loss": 0.1404,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15546870231628418,
"step": 8350
},
{
"epoch": 4.380503144654088,
"grad_norm": 0.7873523235321045,
"learning_rate": 1.4791648187570727e-05,
"loss": 0.1471,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12130207568407059,
"step": 8355
},
{
"epoch": 4.383123689727463,
"grad_norm": 0.8174839019775391,
"learning_rate": 1.4766418071478987e-05,
"loss": 0.1245,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0906982421875,
"step": 8360
},
{
"epoch": 4.385744234800838,
"grad_norm": 0.6824502348899841,
"learning_rate": 1.4741196893169793e-05,
"loss": 0.1358,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1260560303926468,
"step": 8365
},
{
"epoch": 4.388364779874214,
"grad_norm": 0.7154617309570312,
"learning_rate": 1.4715984695715247e-05,
"loss": 0.1342,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13355013728141785,
"step": 8370
},
{
"epoch": 4.390985324947589,
"grad_norm": 0.7102477550506592,
"learning_rate": 1.4690781522172129e-05,
"loss": 0.125,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1316867172718048,
"step": 8375
},
{
"epoch": 4.393605870020965,
"grad_norm": 0.6073978543281555,
"learning_rate": 1.4665587415581791e-05,
"loss": 0.1393,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18074709177017212,
"step": 8380
},
{
"epoch": 4.39622641509434,
"grad_norm": 0.7010768055915833,
"learning_rate": 1.4640402418970116e-05,
"loss": 0.1313,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15058743953704834,
"step": 8385
},
{
"epoch": 4.398846960167715,
"grad_norm": 0.7449144721031189,
"learning_rate": 1.4615226575347419e-05,
"loss": 0.139,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14759394526481628,
"step": 8390
},
{
"epoch": 4.40146750524109,
"grad_norm": 0.8424119353294373,
"learning_rate": 1.4590059927708379e-05,
"loss": 0.1351,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1319580078125,
"step": 8395
},
{
"epoch": 4.404088050314465,
"grad_norm": 0.7126275300979614,
"learning_rate": 1.4564902519031992e-05,
"loss": 0.1356,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09475725889205933,
"step": 8400
},
{
"epoch": 4.40670859538784,
"grad_norm": 0.6241462230682373,
"learning_rate": 1.453975439228145e-05,
"loss": 0.1341,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10678377747535706,
"step": 8405
},
{
"epoch": 4.409329140461216,
"grad_norm": 0.6343846917152405,
"learning_rate": 1.4514615590404115e-05,
"loss": 0.1486,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16542157530784607,
"step": 8410
},
{
"epoch": 4.411949685534591,
"grad_norm": 0.7402783036231995,
"learning_rate": 1.4489486156331412e-05,
"loss": 0.1455,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13801810145378113,
"step": 8415
},
{
"epoch": 4.414570230607967,
"grad_norm": 0.7390091419219971,
"learning_rate": 1.4464366132978764e-05,
"loss": 0.1332,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13943634927272797,
"step": 8420
},
{
"epoch": 4.417190775681342,
"grad_norm": 0.6415871381759644,
"learning_rate": 1.4439255563245539e-05,
"loss": 0.1362,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12398558855056763,
"step": 8425
},
{
"epoch": 4.419811320754717,
"grad_norm": 2.288583278656006,
"learning_rate": 1.4414154490014944e-05,
"loss": 0.1488,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18700379133224487,
"step": 8430
},
{
"epoch": 4.422431865828092,
"grad_norm": 0.659214437007904,
"learning_rate": 1.4389062956153974e-05,
"loss": 0.1365,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13834397494792938,
"step": 8435
},
{
"epoch": 4.4250524109014675,
"grad_norm": 0.7340996265411377,
"learning_rate": 1.436398100451334e-05,
"loss": 0.1238,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11822886765003204,
"step": 8440
},
{
"epoch": 4.427672955974843,
"grad_norm": 0.7849664688110352,
"learning_rate": 1.4338908677927377e-05,
"loss": 0.1481,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13987746834754944,
"step": 8445
},
{
"epoch": 4.430293501048218,
"grad_norm": 0.7479705214500427,
"learning_rate": 1.4313846019213995e-05,
"loss": 0.1333,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.129150390625,
"step": 8450
},
{
"epoch": 4.432914046121593,
"grad_norm": 0.6910154223442078,
"learning_rate": 1.4288793071174578e-05,
"loss": 0.1515,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14056730270385742,
"step": 8455
},
{
"epoch": 4.435534591194968,
"grad_norm": 0.748752772808075,
"learning_rate": 1.4263749876593936e-05,
"loss": 0.1242,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1345258355140686,
"step": 8460
},
{
"epoch": 4.438155136268344,
"grad_norm": 0.7462594509124756,
"learning_rate": 1.4238716478240225e-05,
"loss": 0.1303,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10595703125,
"step": 8465
},
{
"epoch": 4.440775681341719,
"grad_norm": 0.6870795488357544,
"learning_rate": 1.421369291886486e-05,
"loss": 0.1448,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15087157487869263,
"step": 8470
},
{
"epoch": 4.443396226415095,
"grad_norm": 0.8474176526069641,
"learning_rate": 1.4188679241202472e-05,
"loss": 0.1355,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1037876084446907,
"step": 8475
},
{
"epoch": 4.44601677148847,
"grad_norm": 0.7224940061569214,
"learning_rate": 1.4163675487970796e-05,
"loss": 0.1211,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11664362996816635,
"step": 8480
},
{
"epoch": 4.448637316561845,
"grad_norm": 0.625144898891449,
"learning_rate": 1.4138681701870626e-05,
"loss": 0.1501,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17151805758476257,
"step": 8485
},
{
"epoch": 4.45125786163522,
"grad_norm": 0.701992392539978,
"learning_rate": 1.4113697925585745e-05,
"loss": 0.1443,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11887902021408081,
"step": 8490
},
{
"epoch": 4.453878406708595,
"grad_norm": 0.6782894730567932,
"learning_rate": 1.408872420178282e-05,
"loss": 0.1193,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13557225465774536,
"step": 8495
},
{
"epoch": 4.45649895178197,
"grad_norm": 0.7478619813919067,
"learning_rate": 1.4063760573111372e-05,
"loss": 0.1427,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14224380254745483,
"step": 8500
},
{
"epoch": 4.459119496855346,
"grad_norm": 0.8462481498718262,
"learning_rate": 1.4038807082203668e-05,
"loss": 0.1311,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13301105797290802,
"step": 8505
},
{
"epoch": 4.461740041928721,
"grad_norm": 0.7129939794540405,
"learning_rate": 1.4013863771674662e-05,
"loss": 0.1282,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11456423997879028,
"step": 8510
},
{
"epoch": 4.464360587002097,
"grad_norm": 0.5032153725624084,
"learning_rate": 1.3988930684121935e-05,
"loss": 0.1153,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1306523084640503,
"step": 8515
},
{
"epoch": 4.466981132075472,
"grad_norm": 0.6775069236755371,
"learning_rate": 1.3964007862125595e-05,
"loss": 0.1307,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12584161758422852,
"step": 8520
},
{
"epoch": 4.469601677148847,
"grad_norm": 0.6101483702659607,
"learning_rate": 1.3939095348248231e-05,
"loss": 0.1407,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1297893524169922,
"step": 8525
},
{
"epoch": 4.472222222222222,
"grad_norm": 0.8307243585586548,
"learning_rate": 1.3914193185034814e-05,
"loss": 0.1525,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1431855857372284,
"step": 8530
},
{
"epoch": 4.4748427672955975,
"grad_norm": 0.7188790440559387,
"learning_rate": 1.3889301415012648e-05,
"loss": 0.1405,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1144997626543045,
"step": 8535
},
{
"epoch": 4.477463312368973,
"grad_norm": 0.6485366225242615,
"learning_rate": 1.386442008069129e-05,
"loss": 0.1412,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1250472068786621,
"step": 8540
},
{
"epoch": 4.480083857442348,
"grad_norm": 0.7434397339820862,
"learning_rate": 1.3839549224562469e-05,
"loss": 0.1304,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13347342610359192,
"step": 8545
},
{
"epoch": 4.482704402515723,
"grad_norm": 0.6738256812095642,
"learning_rate": 1.3814688889100016e-05,
"loss": 0.1245,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14393022656440735,
"step": 8550
},
{
"epoch": 4.485324947589098,
"grad_norm": 0.6852513551712036,
"learning_rate": 1.3789839116759812e-05,
"loss": 0.1226,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1534811109304428,
"step": 8555
},
{
"epoch": 4.487945492662474,
"grad_norm": 0.8099626302719116,
"learning_rate": 1.3764999949979677e-05,
"loss": 0.1303,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11964830011129379,
"step": 8560
},
{
"epoch": 4.490566037735849,
"grad_norm": 0.7180936336517334,
"learning_rate": 1.3740171431179335e-05,
"loss": 0.1397,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1126708984375,
"step": 8565
},
{
"epoch": 4.493186582809225,
"grad_norm": 0.6698822975158691,
"learning_rate": 1.3715353602760318e-05,
"loss": 0.1274,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12569712102413177,
"step": 8570
},
{
"epoch": 4.4958071278826,
"grad_norm": 0.7475360035896301,
"learning_rate": 1.3690546507105898e-05,
"loss": 0.1267,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14391563832759857,
"step": 8575
},
{
"epoch": 4.498427672955975,
"grad_norm": 0.6875959634780884,
"learning_rate": 1.3665750186581035e-05,
"loss": 0.134,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12100572884082794,
"step": 8580
},
{
"epoch": 4.50104821802935,
"grad_norm": 0.574510931968689,
"learning_rate": 1.3640964683532265e-05,
"loss": 0.1378,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15553082525730133,
"step": 8585
},
{
"epoch": 4.503668763102725,
"grad_norm": 0.641708254814148,
"learning_rate": 1.361619004028767e-05,
"loss": 0.1406,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14186792075634003,
"step": 8590
},
{
"epoch": 4.5062893081761,
"grad_norm": 0.697419285774231,
"learning_rate": 1.3591426299156766e-05,
"loss": 0.1329,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11964790523052216,
"step": 8595
},
{
"epoch": 4.508909853249476,
"grad_norm": 0.6248086094856262,
"learning_rate": 1.3566673502430465e-05,
"loss": 0.1265,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1105770617723465,
"step": 8600
},
{
"epoch": 4.511530398322851,
"grad_norm": 0.6745949387550354,
"learning_rate": 1.3541931692380992e-05,
"loss": 0.1315,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1352331042289734,
"step": 8605
},
{
"epoch": 4.514150943396227,
"grad_norm": 0.7878817319869995,
"learning_rate": 1.3517200911261792e-05,
"loss": 0.1362,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15772098302841187,
"step": 8610
},
{
"epoch": 4.516771488469602,
"grad_norm": 0.7717771530151367,
"learning_rate": 1.3492481201307493e-05,
"loss": 0.1361,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15128159523010254,
"step": 8615
},
{
"epoch": 4.519392033542977,
"grad_norm": 0.6273295283317566,
"learning_rate": 1.3467772604733803e-05,
"loss": 0.142,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17118710279464722,
"step": 8620
},
{
"epoch": 4.522012578616352,
"grad_norm": 0.5775997638702393,
"learning_rate": 1.3443075163737454e-05,
"loss": 0.1148,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1448180377483368,
"step": 8625
},
{
"epoch": 4.5246331236897275,
"grad_norm": 0.7417553663253784,
"learning_rate": 1.3418388920496132e-05,
"loss": 0.1089,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1005859375,
"step": 8630
},
{
"epoch": 4.527253668763103,
"grad_norm": 1.3467016220092773,
"learning_rate": 1.3393713917168398e-05,
"loss": 0.1439,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13034547865390778,
"step": 8635
},
{
"epoch": 4.529874213836478,
"grad_norm": 0.7181364297866821,
"learning_rate": 1.336905019589361e-05,
"loss": 0.1552,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12785106897354126,
"step": 8640
},
{
"epoch": 4.532494758909853,
"grad_norm": 0.6840164661407471,
"learning_rate": 1.3344397798791872e-05,
"loss": 0.1242,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1290283203125,
"step": 8645
},
{
"epoch": 4.535115303983228,
"grad_norm": 0.6312596797943115,
"learning_rate": 1.3319756767963931e-05,
"loss": 0.1397,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15783429145812988,
"step": 8650
},
{
"epoch": 4.537735849056604,
"grad_norm": 0.5777861475944519,
"learning_rate": 1.329512714549115e-05,
"loss": 0.1187,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11967919766902924,
"step": 8655
},
{
"epoch": 4.540356394129979,
"grad_norm": 0.715965986251831,
"learning_rate": 1.327050897343538e-05,
"loss": 0.1261,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15220537781715393,
"step": 8660
},
{
"epoch": 4.5429769392033545,
"grad_norm": 0.6857814788818359,
"learning_rate": 1.324590229383893e-05,
"loss": 0.1579,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12025237083435059,
"step": 8665
},
{
"epoch": 4.54559748427673,
"grad_norm": 0.6896271705627441,
"learning_rate": 1.3221307148724488e-05,
"loss": 0.1533,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1713636815547943,
"step": 8670
},
{
"epoch": 4.548218029350105,
"grad_norm": 0.7923415899276733,
"learning_rate": 1.3196723580095037e-05,
"loss": 0.1269,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12427599728107452,
"step": 8675
},
{
"epoch": 4.55083857442348,
"grad_norm": 0.6951085925102234,
"learning_rate": 1.317215162993379e-05,
"loss": 0.1268,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13826394081115723,
"step": 8680
},
{
"epoch": 4.553459119496855,
"grad_norm": 0.7655824422836304,
"learning_rate": 1.3147591340204118e-05,
"loss": 0.129,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.107421875,
"step": 8685
},
{
"epoch": 4.55607966457023,
"grad_norm": 0.7229164242744446,
"learning_rate": 1.312304275284948e-05,
"loss": 0.1236,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11998310685157776,
"step": 8690
},
{
"epoch": 4.558700209643606,
"grad_norm": 0.9815998077392578,
"learning_rate": 1.3098505909793356e-05,
"loss": 0.1317,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10400390625,
"step": 8695
},
{
"epoch": 4.561320754716981,
"grad_norm": 0.7889739871025085,
"learning_rate": 1.3073980852939148e-05,
"loss": 0.1304,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1103515625,
"step": 8700
},
{
"epoch": 4.563941299790356,
"grad_norm": 0.7040817737579346,
"learning_rate": 1.304946762417016e-05,
"loss": 0.1401,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13906672596931458,
"step": 8705
},
{
"epoch": 4.566561844863732,
"grad_norm": 0.6110599040985107,
"learning_rate": 1.3024966265349481e-05,
"loss": 0.1582,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15552054345607758,
"step": 8710
},
{
"epoch": 4.569182389937107,
"grad_norm": 0.8028501868247986,
"learning_rate": 1.3000476818319928e-05,
"loss": 0.1492,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12405084073543549,
"step": 8715
},
{
"epoch": 4.571802935010482,
"grad_norm": 0.5874795913696289,
"learning_rate": 1.2975999324903968e-05,
"loss": 0.1452,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1393960416316986,
"step": 8720
},
{
"epoch": 4.5744234800838575,
"grad_norm": 0.5880939364433289,
"learning_rate": 1.295153382690367e-05,
"loss": 0.1352,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11091320216655731,
"step": 8725
},
{
"epoch": 4.577044025157233,
"grad_norm": 0.7161071300506592,
"learning_rate": 1.292708036610061e-05,
"loss": 0.1234,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12063398957252502,
"step": 8730
},
{
"epoch": 4.579664570230608,
"grad_norm": 0.7140787243843079,
"learning_rate": 1.2902638984255801e-05,
"loss": 0.142,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09417724609375,
"step": 8735
},
{
"epoch": 4.582285115303983,
"grad_norm": 0.6972057819366455,
"learning_rate": 1.2878209723109645e-05,
"loss": 0.1538,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1201431006193161,
"step": 8740
},
{
"epoch": 4.584905660377358,
"grad_norm": 0.7499229907989502,
"learning_rate": 1.2853792624381823e-05,
"loss": 0.1475,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1542520821094513,
"step": 8745
},
{
"epoch": 4.587526205450734,
"grad_norm": 0.7541307806968689,
"learning_rate": 1.2829387729771262e-05,
"loss": 0.1259,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15811032056808472,
"step": 8750
},
{
"epoch": 4.590146750524109,
"grad_norm": 0.7339674830436707,
"learning_rate": 1.2804995080956038e-05,
"loss": 0.135,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12139548361301422,
"step": 8755
},
{
"epoch": 4.5927672955974845,
"grad_norm": 0.6182024478912354,
"learning_rate": 1.2780614719593312e-05,
"loss": 0.1391,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16165617108345032,
"step": 8760
},
{
"epoch": 4.59538784067086,
"grad_norm": 0.6684214472770691,
"learning_rate": 1.2756246687319278e-05,
"loss": 0.1475,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13279467821121216,
"step": 8765
},
{
"epoch": 4.598008385744235,
"grad_norm": 0.6575222015380859,
"learning_rate": 1.273189102574905e-05,
"loss": 0.1388,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.100341796875,
"step": 8770
},
{
"epoch": 4.60062893081761,
"grad_norm": 0.6948766708374023,
"learning_rate": 1.2707547776476641e-05,
"loss": 0.1257,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08551025390625,
"step": 8775
},
{
"epoch": 4.603249475890985,
"grad_norm": 0.7402828335762024,
"learning_rate": 1.2683216981074847e-05,
"loss": 0.1264,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.071685791015625,
"step": 8780
},
{
"epoch": 4.60587002096436,
"grad_norm": 0.6999821066856384,
"learning_rate": 1.2658898681095193e-05,
"loss": 0.1327,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10844863206148148,
"step": 8785
},
{
"epoch": 4.6084905660377355,
"grad_norm": 0.6915682554244995,
"learning_rate": 1.2634592918067889e-05,
"loss": 0.1165,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11777526885271072,
"step": 8790
},
{
"epoch": 4.611111111111111,
"grad_norm": 0.775738537311554,
"learning_rate": 1.261029973350171e-05,
"loss": 0.1202,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1317407190799713,
"step": 8795
},
{
"epoch": 4.613731656184486,
"grad_norm": 0.7596877217292786,
"learning_rate": 1.2586019168883965e-05,
"loss": 0.1515,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1843775510787964,
"step": 8800
},
{
"epoch": 4.616352201257862,
"grad_norm": 0.6865528225898743,
"learning_rate": 1.2561751265680405e-05,
"loss": 0.1374,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1653093695640564,
"step": 8805
},
{
"epoch": 4.618972746331237,
"grad_norm": 0.6690707802772522,
"learning_rate": 1.2537496065335148e-05,
"loss": 0.1333,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15160250663757324,
"step": 8810
},
{
"epoch": 4.621593291404612,
"grad_norm": 0.5920203924179077,
"learning_rate": 1.2513253609270644e-05,
"loss": 0.1368,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.138197124004364,
"step": 8815
},
{
"epoch": 4.6242138364779874,
"grad_norm": 0.733187198638916,
"learning_rate": 1.248902393888755e-05,
"loss": 0.1471,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1322871744632721,
"step": 8820
},
{
"epoch": 4.626834381551363,
"grad_norm": 0.7857728600502014,
"learning_rate": 1.2464807095564712e-05,
"loss": 0.1362,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12924149632453918,
"step": 8825
},
{
"epoch": 4.629454926624738,
"grad_norm": 0.6490778923034668,
"learning_rate": 1.2440603120659058e-05,
"loss": 0.1274,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12592902779579163,
"step": 8830
},
{
"epoch": 4.632075471698113,
"grad_norm": 0.6917393803596497,
"learning_rate": 1.2416412055505532e-05,
"loss": 0.1212,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12949758768081665,
"step": 8835
},
{
"epoch": 4.634696016771488,
"grad_norm": 0.7808963656425476,
"learning_rate": 1.2392233941417051e-05,
"loss": 0.1447,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1295260488986969,
"step": 8840
},
{
"epoch": 4.637316561844864,
"grad_norm": 0.6719533801078796,
"learning_rate": 1.2368068819684402e-05,
"loss": 0.1206,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10296630859375,
"step": 8845
},
{
"epoch": 4.639937106918239,
"grad_norm": 0.6666312217712402,
"learning_rate": 1.2343916731576178e-05,
"loss": 0.1284,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13879592716693878,
"step": 8850
},
{
"epoch": 4.6425576519916145,
"grad_norm": 0.7610325217247009,
"learning_rate": 1.231977771833873e-05,
"loss": 0.1306,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0919189453125,
"step": 8855
},
{
"epoch": 4.64517819706499,
"grad_norm": 0.6327428221702576,
"learning_rate": 1.2295651821196061e-05,
"loss": 0.1262,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14302411675453186,
"step": 8860
},
{
"epoch": 4.647798742138365,
"grad_norm": 0.6388610005378723,
"learning_rate": 1.22715390813498e-05,
"loss": 0.1259,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15068340301513672,
"step": 8865
},
{
"epoch": 4.65041928721174,
"grad_norm": 0.7151187658309937,
"learning_rate": 1.2247439539979085e-05,
"loss": 0.1454,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11161041259765625,
"step": 8870
},
{
"epoch": 4.653039832285115,
"grad_norm": 0.8098499774932861,
"learning_rate": 1.2223353238240512e-05,
"loss": 0.1168,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.098876953125,
"step": 8875
},
{
"epoch": 4.65566037735849,
"grad_norm": 0.6818023920059204,
"learning_rate": 1.2199280217268085e-05,
"loss": 0.1217,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13234014809131622,
"step": 8880
},
{
"epoch": 4.6582809224318655,
"grad_norm": 0.9082877039909363,
"learning_rate": 1.2175220518173112e-05,
"loss": 0.1437,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13379597663879395,
"step": 8885
},
{
"epoch": 4.660901467505241,
"grad_norm": 0.7240698337554932,
"learning_rate": 1.2151174182044159e-05,
"loss": 0.1297,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14722000062465668,
"step": 8890
},
{
"epoch": 4.663522012578616,
"grad_norm": 0.5974624752998352,
"learning_rate": 1.2127141249946966e-05,
"loss": 0.1389,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11771177500486374,
"step": 8895
},
{
"epoch": 4.666142557651992,
"grad_norm": 0.7507408261299133,
"learning_rate": 1.2103121762924382e-05,
"loss": 0.1476,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.155793696641922,
"step": 8900
},
{
"epoch": 4.668763102725367,
"grad_norm": 0.6103518009185791,
"learning_rate": 1.2079115761996298e-05,
"loss": 0.1276,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14163264632225037,
"step": 8905
},
{
"epoch": 4.671383647798742,
"grad_norm": 0.6876264810562134,
"learning_rate": 1.205512328815957e-05,
"loss": 0.1416,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14926040172576904,
"step": 8910
},
{
"epoch": 4.674004192872117,
"grad_norm": 0.6459044218063354,
"learning_rate": 1.2031144382387963e-05,
"loss": 0.1443,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15989676117897034,
"step": 8915
},
{
"epoch": 4.676624737945493,
"grad_norm": 0.7647832036018372,
"learning_rate": 1.2007179085632055e-05,
"loss": 0.1132,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.088623046875,
"step": 8920
},
{
"epoch": 4.679245283018868,
"grad_norm": 0.773857057094574,
"learning_rate": 1.1983227438819189e-05,
"loss": 0.125,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12303590774536133,
"step": 8925
},
{
"epoch": 4.681865828092243,
"grad_norm": 0.6240308880805969,
"learning_rate": 1.1959289482853404e-05,
"loss": 0.1479,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14185336232185364,
"step": 8930
},
{
"epoch": 4.684486373165618,
"grad_norm": 0.6923026442527771,
"learning_rate": 1.1935365258615347e-05,
"loss": 0.1281,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10647977888584137,
"step": 8935
},
{
"epoch": 4.687106918238994,
"grad_norm": 0.6889780759811401,
"learning_rate": 1.1911454806962231e-05,
"loss": 0.1448,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14253661036491394,
"step": 8940
},
{
"epoch": 4.689727463312369,
"grad_norm": 0.8454015851020813,
"learning_rate": 1.1887558168727726e-05,
"loss": 0.1263,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12030220776796341,
"step": 8945
},
{
"epoch": 4.6923480083857445,
"grad_norm": 0.6042851209640503,
"learning_rate": 1.1863675384721927e-05,
"loss": 0.1425,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13166247308254242,
"step": 8950
},
{
"epoch": 4.69496855345912,
"grad_norm": 0.7077848315238953,
"learning_rate": 1.1839806495731265e-05,
"loss": 0.1171,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12613347172737122,
"step": 8955
},
{
"epoch": 4.697589098532495,
"grad_norm": 0.6560515761375427,
"learning_rate": 1.1815951542518447e-05,
"loss": 0.1216,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14886903762817383,
"step": 8960
},
{
"epoch": 4.70020964360587,
"grad_norm": 0.834663987159729,
"learning_rate": 1.1792110565822363e-05,
"loss": 0.1458,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1405927985906601,
"step": 8965
},
{
"epoch": 4.702830188679245,
"grad_norm": 0.6441571712493896,
"learning_rate": 1.1768283606358062e-05,
"loss": 0.1383,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17624220252037048,
"step": 8970
},
{
"epoch": 4.70545073375262,
"grad_norm": 0.6325451135635376,
"learning_rate": 1.1744470704816626e-05,
"loss": 0.1326,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11755526810884476,
"step": 8975
},
{
"epoch": 4.7080712788259955,
"grad_norm": 0.7124117612838745,
"learning_rate": 1.1720671901865158e-05,
"loss": 0.1238,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08987025916576385,
"step": 8980
},
{
"epoch": 4.710691823899371,
"grad_norm": 0.7094039916992188,
"learning_rate": 1.1696887238146655e-05,
"loss": 0.133,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14215922355651855,
"step": 8985
},
{
"epoch": 4.713312368972746,
"grad_norm": 0.6898460388183594,
"learning_rate": 1.1673116754279982e-05,
"loss": 0.1452,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15627765655517578,
"step": 8990
},
{
"epoch": 4.715932914046122,
"grad_norm": 0.7450935244560242,
"learning_rate": 1.1649360490859794e-05,
"loss": 0.1368,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14466890692710876,
"step": 8995
},
{
"epoch": 4.718553459119497,
"grad_norm": 0.7478159070014954,
"learning_rate": 1.1625618488456452e-05,
"loss": 0.1239,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1301785111427307,
"step": 9000
},
{
"epoch": 4.721174004192872,
"grad_norm": 0.666782796382904,
"learning_rate": 1.1601890787615962e-05,
"loss": 0.1317,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1365184485912323,
"step": 9005
},
{
"epoch": 4.723794549266247,
"grad_norm": 0.7336409687995911,
"learning_rate": 1.1578177428859899e-05,
"loss": 0.1411,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11938644200563431,
"step": 9010
},
{
"epoch": 4.726415094339623,
"grad_norm": 0.6312332153320312,
"learning_rate": 1.1554478452685372e-05,
"loss": 0.1234,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13300299644470215,
"step": 9015
},
{
"epoch": 4.729035639412998,
"grad_norm": 0.7238254547119141,
"learning_rate": 1.1530793899564903e-05,
"loss": 0.1191,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10100212693214417,
"step": 9020
},
{
"epoch": 4.731656184486373,
"grad_norm": 0.6394257545471191,
"learning_rate": 1.1507123809946385e-05,
"loss": 0.1231,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1652788668870926,
"step": 9025
},
{
"epoch": 4.734276729559748,
"grad_norm": 0.6746800541877747,
"learning_rate": 1.1483468224253018e-05,
"loss": 0.1246,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16844013333320618,
"step": 9030
},
{
"epoch": 4.736897274633124,
"grad_norm": 0.6455603837966919,
"learning_rate": 1.1459827182883223e-05,
"loss": 0.1432,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12587332725524902,
"step": 9035
},
{
"epoch": 4.739517819706499,
"grad_norm": 0.7118901014328003,
"learning_rate": 1.1436200726210603e-05,
"loss": 0.1363,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1436968594789505,
"step": 9040
},
{
"epoch": 4.7421383647798745,
"grad_norm": 0.6509990096092224,
"learning_rate": 1.1412588894583832e-05,
"loss": 0.1466,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17771729826927185,
"step": 9045
},
{
"epoch": 4.74475890985325,
"grad_norm": 0.71531742811203,
"learning_rate": 1.1388991728326615e-05,
"loss": 0.1525,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13924604654312134,
"step": 9050
},
{
"epoch": 4.747379454926625,
"grad_norm": 0.7064411640167236,
"learning_rate": 1.1365409267737615e-05,
"loss": 0.1102,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07723115384578705,
"step": 9055
},
{
"epoch": 4.75,
"grad_norm": 0.6295397877693176,
"learning_rate": 1.1341841553090369e-05,
"loss": 0.1221,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1593475043773651,
"step": 9060
},
{
"epoch": 4.752620545073375,
"grad_norm": 0.6610961556434631,
"learning_rate": 1.1318288624633258e-05,
"loss": 0.1627,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14274698495864868,
"step": 9065
},
{
"epoch": 4.75524109014675,
"grad_norm": 0.7972453236579895,
"learning_rate": 1.129475052258938e-05,
"loss": 0.1557,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13769108057022095,
"step": 9070
},
{
"epoch": 4.7578616352201255,
"grad_norm": 0.657624363899231,
"learning_rate": 1.1271227287156536e-05,
"loss": 0.1289,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13159124553203583,
"step": 9075
},
{
"epoch": 4.760482180293501,
"grad_norm": 0.7180423140525818,
"learning_rate": 1.1247718958507121e-05,
"loss": 0.1235,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13836832344532013,
"step": 9080
},
{
"epoch": 4.763102725366876,
"grad_norm": 0.7991257309913635,
"learning_rate": 1.122422557678808e-05,
"loss": 0.1346,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1132948100566864,
"step": 9085
},
{
"epoch": 4.765723270440252,
"grad_norm": 0.71286940574646,
"learning_rate": 1.1200747182120842e-05,
"loss": 0.1339,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15693451464176178,
"step": 9090
},
{
"epoch": 4.768343815513627,
"grad_norm": 0.6712046265602112,
"learning_rate": 1.1177283814601227e-05,
"loss": 0.1249,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.135392963886261,
"step": 9095
},
{
"epoch": 4.770964360587002,
"grad_norm": 0.6733860969543457,
"learning_rate": 1.11538355142994e-05,
"loss": 0.1383,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13899490237236023,
"step": 9100
},
{
"epoch": 4.773584905660377,
"grad_norm": 0.6214482188224792,
"learning_rate": 1.1130402321259788e-05,
"loss": 0.1392,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18730586767196655,
"step": 9105
},
{
"epoch": 4.776205450733753,
"grad_norm": 0.7401185631752014,
"learning_rate": 1.1106984275501014e-05,
"loss": 0.1418,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1090477854013443,
"step": 9110
},
{
"epoch": 4.778825995807128,
"grad_norm": 0.6611148118972778,
"learning_rate": 1.1083581417015858e-05,
"loss": 0.1579,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15509769320487976,
"step": 9115
},
{
"epoch": 4.781446540880503,
"grad_norm": 0.7677807211875916,
"learning_rate": 1.1060193785771139e-05,
"loss": 0.1167,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09381103515625,
"step": 9120
},
{
"epoch": 4.784067085953878,
"grad_norm": 0.8474097847938538,
"learning_rate": 1.1036821421707677e-05,
"loss": 0.1044,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1416015625,
"step": 9125
},
{
"epoch": 4.786687631027254,
"grad_norm": 0.6468724608421326,
"learning_rate": 1.1013464364740223e-05,
"loss": 0.1401,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15126362442970276,
"step": 9130
},
{
"epoch": 4.789308176100629,
"grad_norm": 0.6623623967170715,
"learning_rate": 1.0990122654757373e-05,
"loss": 0.1327,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14331893622875214,
"step": 9135
},
{
"epoch": 4.7919287211740045,
"grad_norm": 0.6035583019256592,
"learning_rate": 1.0966796331621546e-05,
"loss": 0.1316,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13826671242713928,
"step": 9140
},
{
"epoch": 4.79454926624738,
"grad_norm": 0.661192774772644,
"learning_rate": 1.094348543516885e-05,
"loss": 0.1335,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12409496307373047,
"step": 9145
},
{
"epoch": 4.797169811320755,
"grad_norm": 0.6580976247787476,
"learning_rate": 1.0920190005209066e-05,
"loss": 0.1552,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19092583656311035,
"step": 9150
},
{
"epoch": 4.79979035639413,
"grad_norm": 0.684177041053772,
"learning_rate": 1.0896910081525554e-05,
"loss": 0.138,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15298570692539215,
"step": 9155
},
{
"epoch": 4.802410901467505,
"grad_norm": 0.6210129857063293,
"learning_rate": 1.0873645703875186e-05,
"loss": 0.1275,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13470567762851715,
"step": 9160
},
{
"epoch": 4.80503144654088,
"grad_norm": 0.7088992595672607,
"learning_rate": 1.0850396911988312e-05,
"loss": 0.1267,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16002580523490906,
"step": 9165
},
{
"epoch": 4.8076519916142555,
"grad_norm": 0.7904854416847229,
"learning_rate": 1.0827163745568638e-05,
"loss": 0.1169,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1083984375,
"step": 9170
},
{
"epoch": 4.810272536687631,
"grad_norm": 0.7116056680679321,
"learning_rate": 1.08039462442932e-05,
"loss": 0.1408,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14277006685733795,
"step": 9175
},
{
"epoch": 4.812893081761006,
"grad_norm": 0.7433311343193054,
"learning_rate": 1.0780744447812266e-05,
"loss": 0.1097,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10260009765625,
"step": 9180
},
{
"epoch": 4.815513626834382,
"grad_norm": 0.9626566171646118,
"learning_rate": 1.0757558395749292e-05,
"loss": 0.141,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1836649775505066,
"step": 9185
},
{
"epoch": 4.818134171907757,
"grad_norm": 0.7340207695960999,
"learning_rate": 1.0734388127700863e-05,
"loss": 0.1154,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12539434432983398,
"step": 9190
},
{
"epoch": 4.820754716981132,
"grad_norm": 0.8621208667755127,
"learning_rate": 1.0711233683236584e-05,
"loss": 0.1297,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1251404583454132,
"step": 9195
},
{
"epoch": 4.823375262054507,
"grad_norm": 0.69330894947052,
"learning_rate": 1.0688095101899046e-05,
"loss": 0.1113,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08990478515625,
"step": 9200
},
{
"epoch": 4.825995807127883,
"grad_norm": 0.7332335710525513,
"learning_rate": 1.0664972423203748e-05,
"loss": 0.1268,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12357576936483383,
"step": 9205
},
{
"epoch": 4.828616352201258,
"grad_norm": 0.7066367864608765,
"learning_rate": 1.0641865686639025e-05,
"loss": 0.1308,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14807681739330292,
"step": 9210
},
{
"epoch": 4.831236897274633,
"grad_norm": 0.6947424411773682,
"learning_rate": 1.0618774931666014e-05,
"loss": 0.1304,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11925715208053589,
"step": 9215
},
{
"epoch": 4.833857442348008,
"grad_norm": 0.7224918603897095,
"learning_rate": 1.0595700197718526e-05,
"loss": 0.12,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10990692675113678,
"step": 9220
},
{
"epoch": 4.836477987421384,
"grad_norm": 0.6890212297439575,
"learning_rate": 1.0572641524203028e-05,
"loss": 0.1456,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15643475949764252,
"step": 9225
},
{
"epoch": 4.839098532494759,
"grad_norm": 0.7679932117462158,
"learning_rate": 1.054959895049855e-05,
"loss": 0.1387,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1330183744430542,
"step": 9230
},
{
"epoch": 4.8417190775681345,
"grad_norm": 0.6915727853775024,
"learning_rate": 1.0526572515956635e-05,
"loss": 0.1298,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1190517470240593,
"step": 9235
},
{
"epoch": 4.84433962264151,
"grad_norm": 0.7194444537162781,
"learning_rate": 1.0503562259901257e-05,
"loss": 0.1151,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1297692060470581,
"step": 9240
},
{
"epoch": 4.846960167714885,
"grad_norm": 0.7311253547668457,
"learning_rate": 1.0480568221628778e-05,
"loss": 0.1024,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10064697265625,
"step": 9245
},
{
"epoch": 4.84958071278826,
"grad_norm": 0.7731415033340454,
"learning_rate": 1.0457590440407848e-05,
"loss": 0.1332,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13664931058883667,
"step": 9250
},
{
"epoch": 4.852201257861635,
"grad_norm": 0.6909032464027405,
"learning_rate": 1.043462895547935e-05,
"loss": 0.1157,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13590867817401886,
"step": 9255
},
{
"epoch": 4.85482180293501,
"grad_norm": 0.622092068195343,
"learning_rate": 1.0411683806056345e-05,
"loss": 0.14,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1474238932132721,
"step": 9260
},
{
"epoch": 4.8574423480083855,
"grad_norm": 0.7725269198417664,
"learning_rate": 1.0388755031323993e-05,
"loss": 0.1103,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13723696768283844,
"step": 9265
},
{
"epoch": 4.860062893081761,
"grad_norm": 0.6898708343505859,
"learning_rate": 1.0365842670439502e-05,
"loss": 0.1253,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12042510509490967,
"step": 9270
},
{
"epoch": 4.862683438155136,
"grad_norm": 0.719258725643158,
"learning_rate": 1.034294676253203e-05,
"loss": 0.1155,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0797119140625,
"step": 9275
},
{
"epoch": 4.865303983228512,
"grad_norm": 0.63101726770401,
"learning_rate": 1.0320067346702652e-05,
"loss": 0.1285,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1552150547504425,
"step": 9280
},
{
"epoch": 4.867924528301887,
"grad_norm": 0.7382326722145081,
"learning_rate": 1.0297204462024265e-05,
"loss": 0.1382,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1240234375,
"step": 9285
},
{
"epoch": 4.870545073375262,
"grad_norm": 0.7246947288513184,
"learning_rate": 1.0274358147541536e-05,
"loss": 0.1245,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10537716001272202,
"step": 9290
},
{
"epoch": 4.873165618448637,
"grad_norm": 0.6631788611412048,
"learning_rate": 1.0251528442270855e-05,
"loss": 0.1251,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14953824877738953,
"step": 9295
},
{
"epoch": 4.8757861635220126,
"grad_norm": 0.7744284272193909,
"learning_rate": 1.0228715385200224e-05,
"loss": 0.1342,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1300048828125,
"step": 9300
},
{
"epoch": 4.878406708595388,
"grad_norm": 0.6991315484046936,
"learning_rate": 1.0205919015289221e-05,
"loss": 0.1412,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13060849905014038,
"step": 9305
},
{
"epoch": 4.881027253668763,
"grad_norm": 0.6638711094856262,
"learning_rate": 1.0183139371468926e-05,
"loss": 0.1523,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16712617874145508,
"step": 9310
},
{
"epoch": 4.883647798742138,
"grad_norm": 0.7683344483375549,
"learning_rate": 1.0160376492641846e-05,
"loss": 0.1387,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11959155648946762,
"step": 9315
},
{
"epoch": 4.886268343815514,
"grad_norm": 0.6923864483833313,
"learning_rate": 1.013763041768188e-05,
"loss": 0.129,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15276679396629333,
"step": 9320
},
{
"epoch": 4.888888888888889,
"grad_norm": 1.0996633768081665,
"learning_rate": 1.0114901185434211e-05,
"loss": 0.1312,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15459886193275452,
"step": 9325
},
{
"epoch": 4.8915094339622645,
"grad_norm": 0.7718807458877563,
"learning_rate": 1.0092188834715262e-05,
"loss": 0.15,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1363377869129181,
"step": 9330
},
{
"epoch": 4.89412997903564,
"grad_norm": 0.7277114391326904,
"learning_rate": 1.0069493404312627e-05,
"loss": 0.1174,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15835943818092346,
"step": 9335
},
{
"epoch": 4.896750524109015,
"grad_norm": 0.802894115447998,
"learning_rate": 1.0046814932984996e-05,
"loss": 0.1247,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1007080078125,
"step": 9340
},
{
"epoch": 4.89937106918239,
"grad_norm": 0.700408399105072,
"learning_rate": 1.0024153459462119e-05,
"loss": 0.1238,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.114013671875,
"step": 9345
},
{
"epoch": 4.901991614255765,
"grad_norm": 0.838503360748291,
"learning_rate": 1.0001509022444698e-05,
"loss": 0.1162,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12203644216060638,
"step": 9350
},
{
"epoch": 4.90461215932914,
"grad_norm": 0.5777682662010193,
"learning_rate": 9.978881660604345e-06,
"loss": 0.1509,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15224018692970276,
"step": 9355
},
{
"epoch": 4.9072327044025155,
"grad_norm": 0.9380687475204468,
"learning_rate": 9.956271412583512e-06,
"loss": 0.1245,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09423828125,
"step": 9360
},
{
"epoch": 4.909853249475891,
"grad_norm": 0.70067298412323,
"learning_rate": 9.933678316995414e-06,
"loss": 0.1293,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1251220703125,
"step": 9365
},
{
"epoch": 4.912473794549266,
"grad_norm": 0.6209483742713928,
"learning_rate": 9.911102412424006e-06,
"loss": 0.138,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14406529068946838,
"step": 9370
},
{
"epoch": 4.915094339622642,
"grad_norm": 0.6728774309158325,
"learning_rate": 9.88854373742385e-06,
"loss": 0.1212,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1072998046875,
"step": 9375
},
{
"epoch": 4.917714884696017,
"grad_norm": 0.6713111996650696,
"learning_rate": 9.866002330520098e-06,
"loss": 0.1416,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12103331089019775,
"step": 9380
},
{
"epoch": 4.920335429769392,
"grad_norm": 0.6407737135887146,
"learning_rate": 9.843478230208411e-06,
"loss": 0.1268,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13770996034145355,
"step": 9385
},
{
"epoch": 4.922955974842767,
"grad_norm": 0.7303878664970398,
"learning_rate": 9.820971474954887e-06,
"loss": 0.1269,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12143614143133163,
"step": 9390
},
{
"epoch": 4.9255765199161425,
"grad_norm": 0.6689024567604065,
"learning_rate": 9.798482103196023e-06,
"loss": 0.1258,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12794151902198792,
"step": 9395
},
{
"epoch": 4.928197064989518,
"grad_norm": 0.750069260597229,
"learning_rate": 9.776010153338606e-06,
"loss": 0.1226,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11221548169851303,
"step": 9400
},
{
"epoch": 4.930817610062893,
"grad_norm": 0.7729977369308472,
"learning_rate": 9.753555663759683e-06,
"loss": 0.1399,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.143976092338562,
"step": 9405
},
{
"epoch": 4.933438155136268,
"grad_norm": 0.6980475187301636,
"learning_rate": 9.731118672806476e-06,
"loss": 0.1387,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1155320480465889,
"step": 9410
},
{
"epoch": 4.936058700209644,
"grad_norm": 0.7056296467781067,
"learning_rate": 9.70869921879632e-06,
"loss": 0.1063,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11625757813453674,
"step": 9415
},
{
"epoch": 4.938679245283019,
"grad_norm": 0.9438043236732483,
"learning_rate": 9.686297340016624e-06,
"loss": 0.1404,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1055908203125,
"step": 9420
},
{
"epoch": 4.941299790356394,
"grad_norm": 0.6738733053207397,
"learning_rate": 9.663913074724758e-06,
"loss": 0.1327,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16575837135314941,
"step": 9425
},
{
"epoch": 4.94392033542977,
"grad_norm": 0.6602095365524292,
"learning_rate": 9.641546461148016e-06,
"loss": 0.1319,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13226762413978577,
"step": 9430
},
{
"epoch": 4.946540880503145,
"grad_norm": 0.8503509163856506,
"learning_rate": 9.619197537483558e-06,
"loss": 0.142,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10953769087791443,
"step": 9435
},
{
"epoch": 4.94916142557652,
"grad_norm": 0.6526133418083191,
"learning_rate": 9.596866341898318e-06,
"loss": 0.1219,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11833551526069641,
"step": 9440
},
{
"epoch": 4.951781970649895,
"grad_norm": 0.6714123487472534,
"learning_rate": 9.574552912528962e-06,
"loss": 0.1407,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14650098979473114,
"step": 9445
},
{
"epoch": 4.95440251572327,
"grad_norm": 0.6884752511978149,
"learning_rate": 9.55225728748183e-06,
"loss": 0.1433,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11290435492992401,
"step": 9450
},
{
"epoch": 4.9570230607966455,
"grad_norm": 0.7303773760795593,
"learning_rate": 9.529979504832832e-06,
"loss": 0.1117,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1060909554362297,
"step": 9455
},
{
"epoch": 4.959643605870021,
"grad_norm": 0.6748950481414795,
"learning_rate": 9.507719602627417e-06,
"loss": 0.1331,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1291377991437912,
"step": 9460
},
{
"epoch": 4.962264150943396,
"grad_norm": 0.678915798664093,
"learning_rate": 9.485477618880501e-06,
"loss": 0.1416,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14954330027103424,
"step": 9465
},
{
"epoch": 4.964884696016772,
"grad_norm": 0.6271790266036987,
"learning_rate": 9.463253591576392e-06,
"loss": 0.1313,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15000781416893005,
"step": 9470
},
{
"epoch": 4.967505241090147,
"grad_norm": 0.6814647912979126,
"learning_rate": 9.441047558668746e-06,
"loss": 0.1375,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14967915415763855,
"step": 9475
},
{
"epoch": 4.970125786163522,
"grad_norm": 0.6391023397445679,
"learning_rate": 9.418859558080478e-06,
"loss": 0.1309,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16454076766967773,
"step": 9480
},
{
"epoch": 4.972746331236897,
"grad_norm": 0.6564544439315796,
"learning_rate": 9.396689627703706e-06,
"loss": 0.1416,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14007678627967834,
"step": 9485
},
{
"epoch": 4.9753668763102725,
"grad_norm": 0.6617010235786438,
"learning_rate": 9.374537805399695e-06,
"loss": 0.1338,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17006102204322815,
"step": 9490
},
{
"epoch": 4.977987421383648,
"grad_norm": 0.7234999537467957,
"learning_rate": 9.352404128998774e-06,
"loss": 0.1515,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11756335198879242,
"step": 9495
},
{
"epoch": 4.980607966457023,
"grad_norm": 0.6654846668243408,
"learning_rate": 9.330288636300306e-06,
"loss": 0.1231,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08251953125,
"step": 9500
},
{
"epoch": 4.983228511530398,
"grad_norm": 0.7360222935676575,
"learning_rate": 9.308191365072578e-06,
"loss": 0.1359,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1528562754392624,
"step": 9505
},
{
"epoch": 4.985849056603773,
"grad_norm": 0.8071749806404114,
"learning_rate": 9.28611235305277e-06,
"loss": 0.1197,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09523770958185196,
"step": 9510
},
{
"epoch": 4.988469601677149,
"grad_norm": 0.8710860013961792,
"learning_rate": 9.26405163794687e-06,
"loss": 0.1439,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13526538014411926,
"step": 9515
},
{
"epoch": 4.991090146750524,
"grad_norm": 0.7235217094421387,
"learning_rate": 9.24200925742962e-06,
"loss": 0.1351,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1304163634777069,
"step": 9520
},
{
"epoch": 4.9937106918239,
"grad_norm": 0.6458355784416199,
"learning_rate": 9.219985249144472e-06,
"loss": 0.1345,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16256138682365417,
"step": 9525
},
{
"epoch": 4.996331236897275,
"grad_norm": 0.7006630897521973,
"learning_rate": 9.197979650703476e-06,
"loss": 0.135,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11299961060285568,
"step": 9530
},
{
"epoch": 4.99895178197065,
"grad_norm": 0.7289224863052368,
"learning_rate": 9.175992499687254e-06,
"loss": 0.123,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12888827919960022,
"step": 9535
},
{
"epoch": 5.0020964360587,
"grad_norm": 0.6883781552314758,
"learning_rate": 9.154023833644923e-06,
"loss": 0.1171,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13848619163036346,
"step": 9540
},
{
"epoch": 5.004716981132075,
"grad_norm": 0.6228224635124207,
"learning_rate": 9.132073690094018e-06,
"loss": 0.1454,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14985814690589905,
"step": 9545
},
{
"epoch": 5.0073375262054505,
"grad_norm": 0.7366200089454651,
"learning_rate": 9.110142106520474e-06,
"loss": 0.121,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10620991885662079,
"step": 9550
},
{
"epoch": 5.009958071278826,
"grad_norm": 0.7490249276161194,
"learning_rate": 9.088229120378503e-06,
"loss": 0.1206,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13648374378681183,
"step": 9555
},
{
"epoch": 5.012578616352202,
"grad_norm": 0.6735389232635498,
"learning_rate": 9.066334769090559e-06,
"loss": 0.1307,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.19621829688549042,
"step": 9560
},
{
"epoch": 5.015199161425577,
"grad_norm": 0.7313094735145569,
"learning_rate": 9.044459090047284e-06,
"loss": 0.1182,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14157983660697937,
"step": 9565
},
{
"epoch": 5.017819706498952,
"grad_norm": 0.8101787567138672,
"learning_rate": 9.022602120607411e-06,
"loss": 0.119,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11646367609500885,
"step": 9570
},
{
"epoch": 5.020440251572327,
"grad_norm": 0.6893128156661987,
"learning_rate": 9.000763898097756e-06,
"loss": 0.1123,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13469628989696503,
"step": 9575
},
{
"epoch": 5.023060796645702,
"grad_norm": 0.8310616612434387,
"learning_rate": 8.978944459813084e-06,
"loss": 0.1142,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11655493825674057,
"step": 9580
},
{
"epoch": 5.0256813417190775,
"grad_norm": 0.7457146644592285,
"learning_rate": 8.9571438430161e-06,
"loss": 0.1122,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12168244272470474,
"step": 9585
},
{
"epoch": 5.028301886792453,
"grad_norm": 0.7926444411277771,
"learning_rate": 8.93536208493736e-06,
"loss": 0.1226,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12764409184455872,
"step": 9590
},
{
"epoch": 5.030922431865828,
"grad_norm": 0.6953998804092407,
"learning_rate": 8.91359922277521e-06,
"loss": 0.1281,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08046749979257584,
"step": 9595
},
{
"epoch": 5.033542976939203,
"grad_norm": 0.6795186400413513,
"learning_rate": 8.891855293695741e-06,
"loss": 0.1202,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1052897572517395,
"step": 9600
},
{
"epoch": 5.036163522012578,
"grad_norm": 0.7303555607795715,
"learning_rate": 8.870130334832695e-06,
"loss": 0.1153,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08300430327653885,
"step": 9605
},
{
"epoch": 5.038784067085954,
"grad_norm": 0.851283073425293,
"learning_rate": 8.848424383287427e-06,
"loss": 0.1156,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.074462890625,
"step": 9610
},
{
"epoch": 5.0414046121593294,
"grad_norm": 0.6680898666381836,
"learning_rate": 8.826737476128822e-06,
"loss": 0.123,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1081383228302002,
"step": 9615
},
{
"epoch": 5.044025157232705,
"grad_norm": 0.7580778002738953,
"learning_rate": 8.805069650393239e-06,
"loss": 0.1132,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1034734845161438,
"step": 9620
},
{
"epoch": 5.04664570230608,
"grad_norm": 0.7712807059288025,
"learning_rate": 8.783420943084477e-06,
"loss": 0.0895,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07745361328125,
"step": 9625
},
{
"epoch": 5.049266247379455,
"grad_norm": 0.750026524066925,
"learning_rate": 8.761791391173656e-06,
"loss": 0.103,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11945658922195435,
"step": 9630
},
{
"epoch": 5.05188679245283,
"grad_norm": 0.6891661882400513,
"learning_rate": 8.740181031599194e-06,
"loss": 0.1311,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12674002349376678,
"step": 9635
},
{
"epoch": 5.054507337526205,
"grad_norm": 0.7564114332199097,
"learning_rate": 8.71858990126673e-06,
"loss": 0.1147,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14253893494606018,
"step": 9640
},
{
"epoch": 5.0571278825995805,
"grad_norm": 0.8268321752548218,
"learning_rate": 8.697018037049061e-06,
"loss": 0.1068,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07177734375,
"step": 9645
},
{
"epoch": 5.059748427672956,
"grad_norm": 0.6657952070236206,
"learning_rate": 8.6754654757861e-06,
"loss": 0.1259,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07985062152147293,
"step": 9650
},
{
"epoch": 5.062368972746331,
"grad_norm": 0.7740989923477173,
"learning_rate": 8.653932254284772e-06,
"loss": 0.1137,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09762389957904816,
"step": 9655
},
{
"epoch": 5.064989517819707,
"grad_norm": 0.6569227576255798,
"learning_rate": 8.632418409318985e-06,
"loss": 0.1091,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11397574096918106,
"step": 9660
},
{
"epoch": 5.067610062893082,
"grad_norm": 0.7708141803741455,
"learning_rate": 8.610923977629555e-06,
"loss": 0.1188,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.082061767578125,
"step": 9665
},
{
"epoch": 5.070230607966457,
"grad_norm": 0.6552038788795471,
"learning_rate": 8.589448995924144e-06,
"loss": 0.1144,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08863822370767593,
"step": 9670
},
{
"epoch": 5.072851153039832,
"grad_norm": 0.7292214035987854,
"learning_rate": 8.567993500877188e-06,
"loss": 0.1182,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11741748452186584,
"step": 9675
},
{
"epoch": 5.0754716981132075,
"grad_norm": 0.7536949515342712,
"learning_rate": 8.54655752912987e-06,
"loss": 0.0983,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08876580744981766,
"step": 9680
},
{
"epoch": 5.078092243186583,
"grad_norm": 0.7254672050476074,
"learning_rate": 8.52514111729001e-06,
"loss": 0.1312,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1279667466878891,
"step": 9685
},
{
"epoch": 5.080712788259958,
"grad_norm": 0.6217379570007324,
"learning_rate": 8.503744301932026e-06,
"loss": 0.1263,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15234941244125366,
"step": 9690
},
{
"epoch": 5.083333333333333,
"grad_norm": 0.6784071922302246,
"learning_rate": 8.482367119596876e-06,
"loss": 0.1214,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1081690639257431,
"step": 9695
},
{
"epoch": 5.085953878406708,
"grad_norm": 0.7554691433906555,
"learning_rate": 8.46100960679198e-06,
"loss": 0.122,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0892043262720108,
"step": 9700
},
{
"epoch": 5.088574423480084,
"grad_norm": 0.7500377297401428,
"learning_rate": 8.439671799991184e-06,
"loss": 0.1165,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1136789470911026,
"step": 9705
},
{
"epoch": 5.091194968553459,
"grad_norm": 0.645708441734314,
"learning_rate": 8.418353735634666e-06,
"loss": 0.1256,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13962142169475555,
"step": 9710
},
{
"epoch": 5.093815513626835,
"grad_norm": 0.6366642117500305,
"learning_rate": 8.39705545012889e-06,
"loss": 0.1193,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10192590206861496,
"step": 9715
},
{
"epoch": 5.09643605870021,
"grad_norm": 0.6908004879951477,
"learning_rate": 8.375776979846546e-06,
"loss": 0.1141,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13654020428657532,
"step": 9720
},
{
"epoch": 5.099056603773585,
"grad_norm": 0.7502008080482483,
"learning_rate": 8.354518361126475e-06,
"loss": 0.107,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09822264313697815,
"step": 9725
},
{
"epoch": 5.10167714884696,
"grad_norm": 0.7582405805587769,
"learning_rate": 8.333279630273636e-06,
"loss": 0.1012,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0853373259305954,
"step": 9730
},
{
"epoch": 5.104297693920335,
"grad_norm": 0.7920611500740051,
"learning_rate": 8.312060823559006e-06,
"loss": 0.1254,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13328997790813446,
"step": 9735
},
{
"epoch": 5.1069182389937104,
"grad_norm": 0.8097739219665527,
"learning_rate": 8.290861977219542e-06,
"loss": 0.1136,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0924072265625,
"step": 9740
},
{
"epoch": 5.109538784067086,
"grad_norm": 0.7220619916915894,
"learning_rate": 8.26968312745811e-06,
"loss": 0.1171,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12379848957061768,
"step": 9745
},
{
"epoch": 5.112159329140461,
"grad_norm": 0.9506103992462158,
"learning_rate": 8.248524310443424e-06,
"loss": 0.1119,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10959365218877792,
"step": 9750
},
{
"epoch": 5.114779874213837,
"grad_norm": 0.6837508678436279,
"learning_rate": 8.227385562310004e-06,
"loss": 0.1236,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1349945366382599,
"step": 9755
},
{
"epoch": 5.117400419287212,
"grad_norm": 0.9021229147911072,
"learning_rate": 8.206266919158079e-06,
"loss": 0.0918,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0799560546875,
"step": 9760
},
{
"epoch": 5.120020964360587,
"grad_norm": 0.738277018070221,
"learning_rate": 8.185168417053548e-06,
"loss": 0.1285,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1264088898897171,
"step": 9765
},
{
"epoch": 5.122641509433962,
"grad_norm": 0.633439838886261,
"learning_rate": 8.164090092027914e-06,
"loss": 0.1013,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10584621131420135,
"step": 9770
},
{
"epoch": 5.1252620545073375,
"grad_norm": 0.5706722736358643,
"learning_rate": 8.143031980078213e-06,
"loss": 0.1156,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10877234488725662,
"step": 9775
},
{
"epoch": 5.127882599580713,
"grad_norm": 0.7367931604385376,
"learning_rate": 8.12199411716699e-06,
"loss": 0.121,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10262522101402283,
"step": 9780
},
{
"epoch": 5.130503144654088,
"grad_norm": 0.6430312991142273,
"learning_rate": 8.100976539222179e-06,
"loss": 0.1093,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1129990965127945,
"step": 9785
},
{
"epoch": 5.133123689727463,
"grad_norm": 0.8123562335968018,
"learning_rate": 8.079979282137083e-06,
"loss": 0.1176,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08623982965946198,
"step": 9790
},
{
"epoch": 5.135744234800838,
"grad_norm": 0.8307282328605652,
"learning_rate": 8.059002381770303e-06,
"loss": 0.1304,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11143720149993896,
"step": 9795
},
{
"epoch": 5.138364779874214,
"grad_norm": 0.8005284070968628,
"learning_rate": 8.038045873945664e-06,
"loss": 0.114,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10873573273420334,
"step": 9800
},
{
"epoch": 5.140985324947589,
"grad_norm": 0.7001315355300903,
"learning_rate": 8.017109794452194e-06,
"loss": 0.1005,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1235947459936142,
"step": 9805
},
{
"epoch": 5.143605870020965,
"grad_norm": 0.7119617462158203,
"learning_rate": 7.996194179044003e-06,
"loss": 0.1262,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13383308053016663,
"step": 9810
},
{
"epoch": 5.14622641509434,
"grad_norm": 0.678630530834198,
"learning_rate": 7.975299063440268e-06,
"loss": 0.1119,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13731247186660767,
"step": 9815
},
{
"epoch": 5.148846960167715,
"grad_norm": 0.8533686995506287,
"learning_rate": 7.95442448332515e-06,
"loss": 0.1258,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1360068917274475,
"step": 9820
},
{
"epoch": 5.15146750524109,
"grad_norm": 0.8401104211807251,
"learning_rate": 7.933570474347738e-06,
"loss": 0.1267,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08184814453125,
"step": 9825
},
{
"epoch": 5.154088050314465,
"grad_norm": 0.8351700305938721,
"learning_rate": 7.912737072122012e-06,
"loss": 0.133,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12352661788463593,
"step": 9830
},
{
"epoch": 5.15670859538784,
"grad_norm": 0.8089619874954224,
"learning_rate": 7.891924312226738e-06,
"loss": 0.1134,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12224520742893219,
"step": 9835
},
{
"epoch": 5.159329140461216,
"grad_norm": 0.7358208298683167,
"learning_rate": 7.87113223020543e-06,
"loss": 0.1097,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09696923196315765,
"step": 9840
},
{
"epoch": 5.161949685534591,
"grad_norm": 0.6873857378959656,
"learning_rate": 7.8503608615663e-06,
"loss": 0.1111,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10394269227981567,
"step": 9845
},
{
"epoch": 5.164570230607967,
"grad_norm": 0.7390537858009338,
"learning_rate": 7.829610241782171e-06,
"loss": 0.1224,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1274482011795044,
"step": 9850
},
{
"epoch": 5.167190775681342,
"grad_norm": 0.6764417290687561,
"learning_rate": 7.808880406290455e-06,
"loss": 0.126,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11439989507198334,
"step": 9855
},
{
"epoch": 5.169811320754717,
"grad_norm": 0.8547785878181458,
"learning_rate": 7.78817139049305e-06,
"loss": 0.1115,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10105232149362564,
"step": 9860
},
{
"epoch": 5.172431865828092,
"grad_norm": 0.7551053762435913,
"learning_rate": 7.767483229756303e-06,
"loss": 0.0992,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13016045093536377,
"step": 9865
},
{
"epoch": 5.1750524109014675,
"grad_norm": 0.7922313213348389,
"learning_rate": 7.746815959410947e-06,
"loss": 0.1517,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1217041015625,
"step": 9870
},
{
"epoch": 5.177672955974843,
"grad_norm": 0.7648522257804871,
"learning_rate": 7.726169614752036e-06,
"loss": 0.1115,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09638641774654388,
"step": 9875
},
{
"epoch": 5.180293501048218,
"grad_norm": 0.6685067415237427,
"learning_rate": 7.705544231038887e-06,
"loss": 0.1164,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10683226585388184,
"step": 9880
},
{
"epoch": 5.182914046121593,
"grad_norm": 0.7491695284843445,
"learning_rate": 7.684939843495035e-06,
"loss": 0.1084,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10484476387500763,
"step": 9885
},
{
"epoch": 5.185534591194968,
"grad_norm": 0.7718347907066345,
"learning_rate": 7.664356487308136e-06,
"loss": 0.1181,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0891261026263237,
"step": 9890
},
{
"epoch": 5.188155136268344,
"grad_norm": 0.7778302431106567,
"learning_rate": 7.643794197629946e-06,
"loss": 0.1072,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10242325812578201,
"step": 9895
},
{
"epoch": 5.190775681341719,
"grad_norm": 0.7481498122215271,
"learning_rate": 7.623253009576233e-06,
"loss": 0.1067,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10735617578029633,
"step": 9900
},
{
"epoch": 5.193396226415095,
"grad_norm": 0.6880243420600891,
"learning_rate": 7.6027329582267266e-06,
"loss": 0.1129,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.093168243765831,
"step": 9905
},
{
"epoch": 5.19601677148847,
"grad_norm": 0.7669791579246521,
"learning_rate": 7.582234078625082e-06,
"loss": 0.1221,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13155482709407806,
"step": 9910
},
{
"epoch": 5.198637316561845,
"grad_norm": 0.7284143567085266,
"learning_rate": 7.561756405778773e-06,
"loss": 0.1393,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16921260952949524,
"step": 9915
},
{
"epoch": 5.20125786163522,
"grad_norm": 0.7484710812568665,
"learning_rate": 7.541299974659066e-06,
"loss": 0.1146,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09271240234375,
"step": 9920
},
{
"epoch": 5.203878406708595,
"grad_norm": 0.7038311958312988,
"learning_rate": 7.520864820200953e-06,
"loss": 0.1162,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10301141440868378,
"step": 9925
},
{
"epoch": 5.20649895178197,
"grad_norm": 0.7804470658302307,
"learning_rate": 7.50045097730308e-06,
"loss": 0.1243,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12956282496452332,
"step": 9930
},
{
"epoch": 5.209119496855346,
"grad_norm": 0.7377161979675293,
"learning_rate": 7.480058480827719e-06,
"loss": 0.127,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11771019548177719,
"step": 9935
},
{
"epoch": 5.211740041928721,
"grad_norm": 0.6337024569511414,
"learning_rate": 7.45968736560067e-06,
"loss": 0.1126,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1121087297797203,
"step": 9940
},
{
"epoch": 5.214360587002097,
"grad_norm": 0.6332011818885803,
"learning_rate": 7.439337666411219e-06,
"loss": 0.1256,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08941650390625,
"step": 9945
},
{
"epoch": 5.216981132075472,
"grad_norm": 0.9817304611206055,
"learning_rate": 7.419009418012084e-06,
"loss": 0.1091,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09293422847986221,
"step": 9950
},
{
"epoch": 5.219601677148847,
"grad_norm": 0.6849780082702637,
"learning_rate": 7.398702655119341e-06,
"loss": 0.1176,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15425044298171997,
"step": 9955
},
{
"epoch": 5.222222222222222,
"grad_norm": 0.7723156809806824,
"learning_rate": 7.378417412412393e-06,
"loss": 0.1191,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08868657052516937,
"step": 9960
},
{
"epoch": 5.2248427672955975,
"grad_norm": 0.645289957523346,
"learning_rate": 7.358153724533874e-06,
"loss": 0.1124,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12060213088989258,
"step": 9965
},
{
"epoch": 5.227463312368973,
"grad_norm": 0.7643070220947266,
"learning_rate": 7.337911626089611e-06,
"loss": 0.1226,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10711387544870377,
"step": 9970
},
{
"epoch": 5.230083857442348,
"grad_norm": 0.6850946545600891,
"learning_rate": 7.3176911516485605e-06,
"loss": 0.1297,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12889599800109863,
"step": 9975
},
{
"epoch": 5.232704402515723,
"grad_norm": 0.6873239874839783,
"learning_rate": 7.297492335742746e-06,
"loss": 0.1221,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11314275115728378,
"step": 9980
},
{
"epoch": 5.235324947589098,
"grad_norm": 0.9334907531738281,
"learning_rate": 7.277315212867224e-06,
"loss": 0.109,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1055908203125,
"step": 9985
},
{
"epoch": 5.237945492662474,
"grad_norm": 0.7646591067314148,
"learning_rate": 7.25715981747998e-06,
"loss": 0.1135,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11272291839122772,
"step": 9990
},
{
"epoch": 5.240566037735849,
"grad_norm": 0.6887595057487488,
"learning_rate": 7.2370261840019e-06,
"loss": 0.1328,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1418076455593109,
"step": 9995
},
{
"epoch": 5.243186582809225,
"grad_norm": 0.7327706813812256,
"learning_rate": 7.216914346816715e-06,
"loss": 0.1287,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08446444571018219,
"step": 10000
},
{
"epoch": 5.2458071278826,
"grad_norm": 0.7251383662223816,
"learning_rate": 7.196824340270916e-06,
"loss": 0.1055,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09525515139102936,
"step": 10005
},
{
"epoch": 5.248427672955975,
"grad_norm": 0.8185909390449524,
"learning_rate": 7.176756198673734e-06,
"loss": 0.1123,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14663901925086975,
"step": 10010
},
{
"epoch": 5.25104821802935,
"grad_norm": 0.7505481839179993,
"learning_rate": 7.156709956297041e-06,
"loss": 0.1298,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11667823791503906,
"step": 10015
},
{
"epoch": 5.253668763102725,
"grad_norm": 0.8053024411201477,
"learning_rate": 7.136685647375321e-06,
"loss": 0.1149,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11640357971191406,
"step": 10020
},
{
"epoch": 5.2562893081761,
"grad_norm": 0.8202130198478699,
"learning_rate": 7.116683306105592e-06,
"loss": 0.107,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08791359513998032,
"step": 10025
},
{
"epoch": 5.258909853249476,
"grad_norm": 0.6813123226165771,
"learning_rate": 7.096702966647358e-06,
"loss": 0.1226,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1360882818698883,
"step": 10030
},
{
"epoch": 5.261530398322851,
"grad_norm": 0.7559633255004883,
"learning_rate": 7.076744663122561e-06,
"loss": 0.1186,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09497682005167007,
"step": 10035
},
{
"epoch": 5.264150943396227,
"grad_norm": 0.6545113921165466,
"learning_rate": 7.0568084296154955e-06,
"loss": 0.1066,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1339862048625946,
"step": 10040
},
{
"epoch": 5.266771488469602,
"grad_norm": 0.7296022772789001,
"learning_rate": 7.036894300172774e-06,
"loss": 0.1054,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.099609375,
"step": 10045
},
{
"epoch": 5.269392033542977,
"grad_norm": 0.7355965971946716,
"learning_rate": 7.0170023088032534e-06,
"loss": 0.1334,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1306404173374176,
"step": 10050
},
{
"epoch": 5.272012578616352,
"grad_norm": 0.771010160446167,
"learning_rate": 6.997132489477981e-06,
"loss": 0.1078,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08247143030166626,
"step": 10055
},
{
"epoch": 5.2746331236897275,
"grad_norm": 0.7686759829521179,
"learning_rate": 6.977284876130162e-06,
"loss": 0.1238,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11266212165355682,
"step": 10060
},
{
"epoch": 5.277253668763103,
"grad_norm": 0.7104469537734985,
"learning_rate": 6.957459502655053e-06,
"loss": 0.1079,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1234317421913147,
"step": 10065
},
{
"epoch": 5.279874213836478,
"grad_norm": 0.7212462425231934,
"learning_rate": 6.937656402909938e-06,
"loss": 0.1101,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1201360747218132,
"step": 10070
},
{
"epoch": 5.282494758909853,
"grad_norm": 0.6866129636764526,
"learning_rate": 6.917875610714069e-06,
"loss": 0.1351,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.20779114961624146,
"step": 10075
},
{
"epoch": 5.285115303983228,
"grad_norm": 0.8186706304550171,
"learning_rate": 6.898117159848594e-06,
"loss": 0.1092,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07537841796875,
"step": 10080
},
{
"epoch": 5.287735849056604,
"grad_norm": 0.835132896900177,
"learning_rate": 6.878381084056503e-06,
"loss": 0.1245,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12258610129356384,
"step": 10085
},
{
"epoch": 5.290356394129979,
"grad_norm": 0.7027671337127686,
"learning_rate": 6.858667417042593e-06,
"loss": 0.1188,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13058139383792877,
"step": 10090
},
{
"epoch": 5.2929769392033545,
"grad_norm": 0.7041785717010498,
"learning_rate": 6.838976192473372e-06,
"loss": 0.1051,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1019287109375,
"step": 10095
},
{
"epoch": 5.29559748427673,
"grad_norm": 0.7192649841308594,
"learning_rate": 6.819307443977035e-06,
"loss": 0.0987,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09088463336229324,
"step": 10100
},
{
"epoch": 5.298218029350105,
"grad_norm": 0.6497074365615845,
"learning_rate": 6.799661205143382e-06,
"loss": 0.13,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17226599156856537,
"step": 10105
},
{
"epoch": 5.30083857442348,
"grad_norm": 0.6994401812553406,
"learning_rate": 6.780037509523771e-06,
"loss": 0.1304,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10001536458730698,
"step": 10110
},
{
"epoch": 5.303459119496855,
"grad_norm": 0.7387732863426208,
"learning_rate": 6.7604363906310825e-06,
"loss": 0.1224,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18090754747390747,
"step": 10115
},
{
"epoch": 5.30607966457023,
"grad_norm": 0.7848963737487793,
"learning_rate": 6.7408578819396155e-06,
"loss": 0.121,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11017556488513947,
"step": 10120
},
{
"epoch": 5.308700209643606,
"grad_norm": 0.7292885184288025,
"learning_rate": 6.721302016885067e-06,
"loss": 0.11,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09594656527042389,
"step": 10125
},
{
"epoch": 5.311320754716981,
"grad_norm": 0.6747096180915833,
"learning_rate": 6.701768828864466e-06,
"loss": 0.1258,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12736985087394714,
"step": 10130
},
{
"epoch": 5.313941299790357,
"grad_norm": 0.817677915096283,
"learning_rate": 6.6822583512360975e-06,
"loss": 0.1137,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10613100230693817,
"step": 10135
},
{
"epoch": 5.316561844863732,
"grad_norm": 0.7237040996551514,
"learning_rate": 6.662770617319494e-06,
"loss": 0.1203,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14210447669029236,
"step": 10140
},
{
"epoch": 5.319182389937107,
"grad_norm": 0.6779884099960327,
"learning_rate": 6.643305660395318e-06,
"loss": 0.1289,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14578291773796082,
"step": 10145
},
{
"epoch": 5.321802935010482,
"grad_norm": 0.766578733921051,
"learning_rate": 6.623863513705348e-06,
"loss": 0.1179,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1254599392414093,
"step": 10150
},
{
"epoch": 5.3244234800838575,
"grad_norm": 0.7032365798950195,
"learning_rate": 6.604444210452403e-06,
"loss": 0.0913,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09285767376422882,
"step": 10155
},
{
"epoch": 5.327044025157233,
"grad_norm": 0.5468205809593201,
"learning_rate": 6.585047783800285e-06,
"loss": 0.1162,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09434093534946442,
"step": 10160
},
{
"epoch": 5.329664570230608,
"grad_norm": 0.7430211901664734,
"learning_rate": 6.565674266873745e-06,
"loss": 0.1124,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10109928250312805,
"step": 10165
},
{
"epoch": 5.332285115303983,
"grad_norm": 0.6913556456565857,
"learning_rate": 6.546323692758396e-06,
"loss": 0.1145,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11161768436431885,
"step": 10170
},
{
"epoch": 5.334905660377358,
"grad_norm": 0.7440565824508667,
"learning_rate": 6.52699609450067e-06,
"loss": 0.1282,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1132674366235733,
"step": 10175
},
{
"epoch": 5.337526205450734,
"grad_norm": 0.5860890746116638,
"learning_rate": 6.5076915051077675e-06,
"loss": 0.1343,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13795891404151917,
"step": 10180
},
{
"epoch": 5.340146750524109,
"grad_norm": 0.6547601222991943,
"learning_rate": 6.488409957547581e-06,
"loss": 0.1227,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15738970041275024,
"step": 10185
},
{
"epoch": 5.3427672955974845,
"grad_norm": 0.8251211643218994,
"learning_rate": 6.469151484748679e-06,
"loss": 0.1297,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1241455078125,
"step": 10190
},
{
"epoch": 5.34538784067086,
"grad_norm": 0.6415407657623291,
"learning_rate": 6.449916119600201e-06,
"loss": 0.1169,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12454849481582642,
"step": 10195
},
{
"epoch": 5.348008385744235,
"grad_norm": 0.6995936632156372,
"learning_rate": 6.4307038949518305e-06,
"loss": 0.1332,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1687215268611908,
"step": 10200
},
{
"epoch": 5.35062893081761,
"grad_norm": 0.7267442345619202,
"learning_rate": 6.411514843613725e-06,
"loss": 0.122,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11219580471515656,
"step": 10205
},
{
"epoch": 5.353249475890985,
"grad_norm": 0.7010683417320251,
"learning_rate": 6.39234899835649e-06,
"loss": 0.1134,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12867935001850128,
"step": 10210
},
{
"epoch": 5.35587002096436,
"grad_norm": 0.6663080453872681,
"learning_rate": 6.373206391911069e-06,
"loss": 0.1043,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11025137454271317,
"step": 10215
},
{
"epoch": 5.3584905660377355,
"grad_norm": 0.732987105846405,
"learning_rate": 6.354087056968748e-06,
"loss": 0.107,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10928837954998016,
"step": 10220
},
{
"epoch": 5.361111111111111,
"grad_norm": 0.9579931497573853,
"learning_rate": 6.334991026181052e-06,
"loss": 0.1174,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11715349555015564,
"step": 10225
},
{
"epoch": 5.363731656184487,
"grad_norm": 0.728870689868927,
"learning_rate": 6.315918332159714e-06,
"loss": 0.1265,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13661688566207886,
"step": 10230
},
{
"epoch": 5.366352201257862,
"grad_norm": 0.7422712445259094,
"learning_rate": 6.296869007476609e-06,
"loss": 0.119,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.143565371632576,
"step": 10235
},
{
"epoch": 5.368972746331237,
"grad_norm": 0.7388545870780945,
"learning_rate": 6.277843084663701e-06,
"loss": 0.1104,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1201258972287178,
"step": 10240
},
{
"epoch": 5.371593291404612,
"grad_norm": 0.8039642572402954,
"learning_rate": 6.258840596213005e-06,
"loss": 0.1198,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10702769458293915,
"step": 10245
},
{
"epoch": 5.3742138364779874,
"grad_norm": 0.6975395679473877,
"learning_rate": 6.239861574576498e-06,
"loss": 0.1127,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11157487332820892,
"step": 10250
},
{
"epoch": 5.376834381551363,
"grad_norm": 0.6503974199295044,
"learning_rate": 6.220906052166085e-06,
"loss": 0.109,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11010022461414337,
"step": 10255
},
{
"epoch": 5.379454926624738,
"grad_norm": 0.7469361424446106,
"learning_rate": 6.201974061353542e-06,
"loss": 0.1236,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09031128138303757,
"step": 10260
},
{
"epoch": 5.382075471698113,
"grad_norm": 0.6768584847450256,
"learning_rate": 6.183065634470453e-06,
"loss": 0.1116,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09780827164649963,
"step": 10265
},
{
"epoch": 5.384696016771488,
"grad_norm": 0.6973583102226257,
"learning_rate": 6.164180803808173e-06,
"loss": 0.1359,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15749692916870117,
"step": 10270
},
{
"epoch": 5.387316561844864,
"grad_norm": 0.6480283737182617,
"learning_rate": 6.145319601617749e-06,
"loss": 0.1148,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14943397045135498,
"step": 10275
},
{
"epoch": 5.389937106918239,
"grad_norm": 0.7254815101623535,
"learning_rate": 6.126482060109877e-06,
"loss": 0.1082,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08257943391799927,
"step": 10280
},
{
"epoch": 5.3925576519916145,
"grad_norm": 0.7758432030677795,
"learning_rate": 6.1076682114548465e-06,
"loss": 0.1277,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12551628053188324,
"step": 10285
},
{
"epoch": 5.39517819706499,
"grad_norm": 0.7967376708984375,
"learning_rate": 6.088878087782488e-06,
"loss": 0.1206,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11224541068077087,
"step": 10290
},
{
"epoch": 5.397798742138365,
"grad_norm": 0.7282092571258545,
"learning_rate": 6.070111721182104e-06,
"loss": 0.114,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12284990400075912,
"step": 10295
},
{
"epoch": 5.40041928721174,
"grad_norm": 0.7961103320121765,
"learning_rate": 6.051369143702446e-06,
"loss": 0.115,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.075592041015625,
"step": 10300
},
{
"epoch": 5.403039832285115,
"grad_norm": 0.6946623921394348,
"learning_rate": 6.032650387351624e-06,
"loss": 0.1156,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09668971598148346,
"step": 10305
},
{
"epoch": 5.40566037735849,
"grad_norm": 0.6397780179977417,
"learning_rate": 6.013955484097067e-06,
"loss": 0.1289,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18690338730812073,
"step": 10310
},
{
"epoch": 5.4082809224318655,
"grad_norm": 0.6478099822998047,
"learning_rate": 5.9952844658654744e-06,
"loss": 0.1214,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14790332317352295,
"step": 10315
},
{
"epoch": 5.410901467505241,
"grad_norm": 0.7488334774971008,
"learning_rate": 5.9766373645427415e-06,
"loss": 0.1162,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11961181461811066,
"step": 10320
},
{
"epoch": 5.413522012578617,
"grad_norm": 0.8306141495704651,
"learning_rate": 5.958014211973943e-06,
"loss": 0.1195,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1055527925491333,
"step": 10325
},
{
"epoch": 5.416142557651992,
"grad_norm": 0.7220061421394348,
"learning_rate": 5.9394150399632385e-06,
"loss": 0.1405,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16748718917369843,
"step": 10330
},
{
"epoch": 5.418763102725367,
"grad_norm": 0.6414279937744141,
"learning_rate": 5.920839880273832e-06,
"loss": 0.1092,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09563075006008148,
"step": 10335
},
{
"epoch": 5.421383647798742,
"grad_norm": 0.7283735871315002,
"learning_rate": 5.902288764627928e-06,
"loss": 0.1274,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11764983832836151,
"step": 10340
},
{
"epoch": 5.424004192872117,
"grad_norm": 0.6365904211997986,
"learning_rate": 5.883761724706656e-06,
"loss": 0.1285,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13443568348884583,
"step": 10345
},
{
"epoch": 5.426624737945493,
"grad_norm": 0.7272641658782959,
"learning_rate": 5.8652587921500544e-06,
"loss": 0.1148,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08130456507205963,
"step": 10350
},
{
"epoch": 5.429245283018868,
"grad_norm": 0.6290727853775024,
"learning_rate": 5.846779998556971e-06,
"loss": 0.1188,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1324918270111084,
"step": 10355
},
{
"epoch": 5.431865828092243,
"grad_norm": 0.6942636966705322,
"learning_rate": 5.828325375485033e-06,
"loss": 0.1261,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10782308876514435,
"step": 10360
},
{
"epoch": 5.434486373165618,
"grad_norm": 0.856620192527771,
"learning_rate": 5.809894954450592e-06,
"loss": 0.1264,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11584212630987167,
"step": 10365
},
{
"epoch": 5.437106918238994,
"grad_norm": 0.6576579809188843,
"learning_rate": 5.791488766928664e-06,
"loss": 0.1114,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1250620186328888,
"step": 10370
},
{
"epoch": 5.439727463312369,
"grad_norm": 0.6821559071540833,
"learning_rate": 5.773106844352894e-06,
"loss": 0.1307,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11624176800251007,
"step": 10375
},
{
"epoch": 5.4423480083857445,
"grad_norm": 0.6944690346717834,
"learning_rate": 5.7547492181154715e-06,
"loss": 0.1139,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11554088443517685,
"step": 10380
},
{
"epoch": 5.44496855345912,
"grad_norm": 0.6521162390708923,
"learning_rate": 5.7364159195670975e-06,
"loss": 0.1103,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11223713308572769,
"step": 10385
},
{
"epoch": 5.447589098532495,
"grad_norm": 0.844800591468811,
"learning_rate": 5.718106980016933e-06,
"loss": 0.1011,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10950352251529694,
"step": 10390
},
{
"epoch": 5.45020964360587,
"grad_norm": 0.6249414086341858,
"learning_rate": 5.6998224307325246e-06,
"loss": 0.1355,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17765092849731445,
"step": 10395
},
{
"epoch": 5.452830188679245,
"grad_norm": 0.7176016569137573,
"learning_rate": 5.68156230293979e-06,
"loss": 0.1104,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10899336636066437,
"step": 10400
},
{
"epoch": 5.45545073375262,
"grad_norm": 0.7706964015960693,
"learning_rate": 5.66332662782292e-06,
"loss": 0.0888,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.074981689453125,
"step": 10405
},
{
"epoch": 5.4580712788259955,
"grad_norm": 0.8711444139480591,
"learning_rate": 5.645115436524353e-06,
"loss": 0.0983,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11633491516113281,
"step": 10410
},
{
"epoch": 5.460691823899371,
"grad_norm": 0.7096289396286011,
"learning_rate": 5.626928760144712e-06,
"loss": 0.1167,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10099075734615326,
"step": 10415
},
{
"epoch": 5.463312368972747,
"grad_norm": 0.6985383629798889,
"learning_rate": 5.6087666297427526e-06,
"loss": 0.1129,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08926206827163696,
"step": 10420
},
{
"epoch": 5.465932914046122,
"grad_norm": 0.7082505822181702,
"learning_rate": 5.590629076335323e-06,
"loss": 0.1181,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11006351560354233,
"step": 10425
},
{
"epoch": 5.468553459119497,
"grad_norm": 0.7185156345367432,
"learning_rate": 5.572516130897288e-06,
"loss": 0.111,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13114425539970398,
"step": 10430
},
{
"epoch": 5.471174004192872,
"grad_norm": 0.7408719658851624,
"learning_rate": 5.554427824361488e-06,
"loss": 0.1102,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13163840770721436,
"step": 10435
},
{
"epoch": 5.473794549266247,
"grad_norm": 1.1774944067001343,
"learning_rate": 5.5363641876186905e-06,
"loss": 0.1239,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15403783321380615,
"step": 10440
},
{
"epoch": 5.476415094339623,
"grad_norm": 0.6841291785240173,
"learning_rate": 5.518325251517522e-06,
"loss": 0.1325,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1528058797121048,
"step": 10445
},
{
"epoch": 5.479035639412998,
"grad_norm": 0.7858748435974121,
"learning_rate": 5.500311046864448e-06,
"loss": 0.1127,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12168926000595093,
"step": 10450
},
{
"epoch": 5.481656184486373,
"grad_norm": 0.8195909261703491,
"learning_rate": 5.482321604423679e-06,
"loss": 0.1234,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13854841887950897,
"step": 10455
},
{
"epoch": 5.484276729559748,
"grad_norm": 0.7601556181907654,
"learning_rate": 5.4643569549171385e-06,
"loss": 0.1214,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13748139142990112,
"step": 10460
},
{
"epoch": 5.486897274633124,
"grad_norm": 0.8059583306312561,
"learning_rate": 5.446417129024417e-06,
"loss": 0.111,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15006282925605774,
"step": 10465
},
{
"epoch": 5.489517819706499,
"grad_norm": 0.7704102396965027,
"learning_rate": 5.428502157382702e-06,
"loss": 0.1394,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10409608483314514,
"step": 10470
},
{
"epoch": 5.4921383647798745,
"grad_norm": 0.8262134194374084,
"learning_rate": 5.410612070586752e-06,
"loss": 0.1104,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10782438516616821,
"step": 10475
},
{
"epoch": 5.49475890985325,
"grad_norm": 0.6677366495132446,
"learning_rate": 5.39274689918881e-06,
"loss": 0.106,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1179676502943039,
"step": 10480
},
{
"epoch": 5.497379454926625,
"grad_norm": 0.9120731353759766,
"learning_rate": 5.374906673698581e-06,
"loss": 0.1039,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0929182767868042,
"step": 10485
},
{
"epoch": 5.5,
"grad_norm": 0.6679109930992126,
"learning_rate": 5.357091424583159e-06,
"loss": 0.1193,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10127764195203781,
"step": 10490
},
{
"epoch": 5.502620545073375,
"grad_norm": 0.6787980198860168,
"learning_rate": 5.339301182266985e-06,
"loss": 0.11,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1122751459479332,
"step": 10495
},
{
"epoch": 5.50524109014675,
"grad_norm": 0.7567312121391296,
"learning_rate": 5.321535977131809e-06,
"loss": 0.0949,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08058898150920868,
"step": 10500
},
{
"epoch": 5.5078616352201255,
"grad_norm": 0.8747363090515137,
"learning_rate": 5.303795839516606e-06,
"loss": 0.1096,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0567626953125,
"step": 10505
},
{
"epoch": 5.510482180293501,
"grad_norm": 0.8775766491889954,
"learning_rate": 5.286080799717543e-06,
"loss": 0.1133,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1152627170085907,
"step": 10510
},
{
"epoch": 5.513102725366876,
"grad_norm": 0.6846725940704346,
"learning_rate": 5.268390887987935e-06,
"loss": 0.1278,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1599602997303009,
"step": 10515
},
{
"epoch": 5.515723270440252,
"grad_norm": 0.6617890000343323,
"learning_rate": 5.250726134538177e-06,
"loss": 0.1324,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14858025312423706,
"step": 10520
},
{
"epoch": 5.518343815513627,
"grad_norm": 0.701453447341919,
"learning_rate": 5.233086569535692e-06,
"loss": 0.1162,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12819528579711914,
"step": 10525
},
{
"epoch": 5.520964360587002,
"grad_norm": 0.8231136202812195,
"learning_rate": 5.215472223104909e-06,
"loss": 0.102,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14212852716445923,
"step": 10530
},
{
"epoch": 5.523584905660377,
"grad_norm": 0.7562174797058105,
"learning_rate": 5.19788312532717e-06,
"loss": 0.1151,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1172027587890625,
"step": 10535
},
{
"epoch": 5.526205450733753,
"grad_norm": 0.668741762638092,
"learning_rate": 5.180319306240702e-06,
"loss": 0.1247,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09932608157396317,
"step": 10540
},
{
"epoch": 5.528825995807128,
"grad_norm": 0.7404559254646301,
"learning_rate": 5.162780795840567e-06,
"loss": 0.1096,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11360190808773041,
"step": 10545
},
{
"epoch": 5.531446540880503,
"grad_norm": 0.6470763683319092,
"learning_rate": 5.145267624078594e-06,
"loss": 0.1069,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10094386339187622,
"step": 10550
},
{
"epoch": 5.534067085953878,
"grad_norm": 0.6381714344024658,
"learning_rate": 5.1277798208633565e-06,
"loss": 0.1298,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12636935710906982,
"step": 10555
},
{
"epoch": 5.536687631027254,
"grad_norm": 1.1980273723602295,
"learning_rate": 5.110317416060093e-06,
"loss": 0.1098,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09641356766223907,
"step": 10560
},
{
"epoch": 5.539308176100629,
"grad_norm": 0.6636906862258911,
"learning_rate": 5.092880439490666e-06,
"loss": 0.1299,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11983007192611694,
"step": 10565
},
{
"epoch": 5.5419287211740045,
"grad_norm": 0.7041227221488953,
"learning_rate": 5.075468920933517e-06,
"loss": 0.1233,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11246918141841888,
"step": 10570
},
{
"epoch": 5.54454926624738,
"grad_norm": 0.6758451461791992,
"learning_rate": 5.058082890123605e-06,
"loss": 0.1149,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.131456658244133,
"step": 10575
},
{
"epoch": 5.547169811320755,
"grad_norm": 0.7600293159484863,
"learning_rate": 5.040722376752374e-06,
"loss": 0.1128,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11408248543739319,
"step": 10580
},
{
"epoch": 5.54979035639413,
"grad_norm": 0.7650055885314941,
"learning_rate": 5.02338741046768e-06,
"loss": 0.0888,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08961398899555206,
"step": 10585
},
{
"epoch": 5.552410901467505,
"grad_norm": 0.6169334053993225,
"learning_rate": 5.006078020873748e-06,
"loss": 0.1246,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1591559648513794,
"step": 10590
},
{
"epoch": 5.55503144654088,
"grad_norm": 0.7118397355079651,
"learning_rate": 4.988794237531129e-06,
"loss": 0.1172,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09753826260566711,
"step": 10595
},
{
"epoch": 5.5576519916142555,
"grad_norm": 0.7271219491958618,
"learning_rate": 4.971536089956641e-06,
"loss": 0.1066,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12421578168869019,
"step": 10600
},
{
"epoch": 5.560272536687631,
"grad_norm": 0.6494109630584717,
"learning_rate": 4.954303607623332e-06,
"loss": 0.1185,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0917762741446495,
"step": 10605
},
{
"epoch": 5.562893081761006,
"grad_norm": 0.8092367053031921,
"learning_rate": 4.937096819960408e-06,
"loss": 0.1077,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09707881510257721,
"step": 10610
},
{
"epoch": 5.565513626834382,
"grad_norm": 0.9010205864906311,
"learning_rate": 4.919915756353198e-06,
"loss": 0.1247,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10653796792030334,
"step": 10615
},
{
"epoch": 5.568134171907757,
"grad_norm": 0.7276707887649536,
"learning_rate": 4.902760446143096e-06,
"loss": 0.1152,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11297433078289032,
"step": 10620
},
{
"epoch": 5.570754716981132,
"grad_norm": 0.8087003231048584,
"learning_rate": 4.885630918627518e-06,
"loss": 0.1053,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1004638671875,
"step": 10625
},
{
"epoch": 5.573375262054507,
"grad_norm": 0.7698782086372375,
"learning_rate": 4.86852720305986e-06,
"loss": 0.1012,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1025627851486206,
"step": 10630
},
{
"epoch": 5.575995807127883,
"grad_norm": 0.7195867300033569,
"learning_rate": 4.85144932864942e-06,
"loss": 0.0957,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0869738757610321,
"step": 10635
},
{
"epoch": 5.578616352201258,
"grad_norm": 0.7677379846572876,
"learning_rate": 4.834397324561375e-06,
"loss": 0.1042,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0792236328125,
"step": 10640
},
{
"epoch": 5.581236897274633,
"grad_norm": 0.7311434149742126,
"learning_rate": 4.817371219916713e-06,
"loss": 0.098,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10641766339540482,
"step": 10645
},
{
"epoch": 5.583857442348008,
"grad_norm": 0.6872872710227966,
"learning_rate": 4.800371043792198e-06,
"loss": 0.1121,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09992322325706482,
"step": 10650
},
{
"epoch": 5.586477987421384,
"grad_norm": 0.7587729692459106,
"learning_rate": 4.783396825220319e-06,
"loss": 0.0983,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0865052118897438,
"step": 10655
},
{
"epoch": 5.589098532494759,
"grad_norm": 0.8601624369621277,
"learning_rate": 4.766448593189226e-06,
"loss": 0.1104,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0723876953125,
"step": 10660
},
{
"epoch": 5.5917190775681345,
"grad_norm": 0.6991299390792847,
"learning_rate": 4.7495263766426905e-06,
"loss": 0.1246,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12729085981845856,
"step": 10665
},
{
"epoch": 5.59433962264151,
"grad_norm": 0.7156105637550354,
"learning_rate": 4.732630204480059e-06,
"loss": 0.1214,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11206556856632233,
"step": 10670
},
{
"epoch": 5.596960167714885,
"grad_norm": 0.7052958011627197,
"learning_rate": 4.715760105556198e-06,
"loss": 0.1139,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10625350475311279,
"step": 10675
},
{
"epoch": 5.59958071278826,
"grad_norm": 0.8378977179527283,
"learning_rate": 4.69891610868145e-06,
"loss": 0.1212,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13235339522361755,
"step": 10680
},
{
"epoch": 5.602201257861635,
"grad_norm": 0.8395311236381531,
"learning_rate": 4.68209824262158e-06,
"loss": 0.1233,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0919189453125,
"step": 10685
},
{
"epoch": 5.60482180293501,
"grad_norm": 0.702622652053833,
"learning_rate": 4.665306536097725e-06,
"loss": 0.1078,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11648667603731155,
"step": 10690
},
{
"epoch": 5.6074423480083855,
"grad_norm": 0.8039387464523315,
"learning_rate": 4.648541017786345e-06,
"loss": 0.1117,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11662253737449646,
"step": 10695
},
{
"epoch": 5.610062893081761,
"grad_norm": 0.7079611420631409,
"learning_rate": 4.63180171631918e-06,
"loss": 0.1347,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12504009902477264,
"step": 10700
},
{
"epoch": 5.612683438155136,
"grad_norm": 0.7281865477561951,
"learning_rate": 4.615088660283202e-06,
"loss": 0.1232,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12699584662914276,
"step": 10705
},
{
"epoch": 5.615303983228512,
"grad_norm": 0.7949917316436768,
"learning_rate": 4.598401878220557e-06,
"loss": 0.1132,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08386878669261932,
"step": 10710
},
{
"epoch": 5.617924528301887,
"grad_norm": 0.768836259841919,
"learning_rate": 4.581741398628521e-06,
"loss": 0.1057,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10535287111997604,
"step": 10715
},
{
"epoch": 5.620545073375262,
"grad_norm": 0.6575037240982056,
"learning_rate": 4.565107249959449e-06,
"loss": 0.1075,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.06976318359375,
"step": 10720
},
{
"epoch": 5.623165618448637,
"grad_norm": 0.6681073904037476,
"learning_rate": 4.54849946062073e-06,
"loss": 0.1108,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1465725600719452,
"step": 10725
},
{
"epoch": 5.6257861635220126,
"grad_norm": 0.7189939022064209,
"learning_rate": 4.531918058974736e-06,
"loss": 0.118,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13257884979248047,
"step": 10730
},
{
"epoch": 5.628406708595388,
"grad_norm": 0.7120517492294312,
"learning_rate": 4.515363073338788e-06,
"loss": 0.1044,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07698134332895279,
"step": 10735
},
{
"epoch": 5.631027253668763,
"grad_norm": 0.7609550952911377,
"learning_rate": 4.498834531985075e-06,
"loss": 0.1312,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11576057970523834,
"step": 10740
},
{
"epoch": 5.633647798742138,
"grad_norm": 0.6521153450012207,
"learning_rate": 4.482332463140635e-06,
"loss": 0.141,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.129469633102417,
"step": 10745
},
{
"epoch": 5.636268343815514,
"grad_norm": 0.7928845882415771,
"learning_rate": 4.465856894987297e-06,
"loss": 0.1304,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1300087422132492,
"step": 10750
},
{
"epoch": 5.638888888888889,
"grad_norm": 0.7226012349128723,
"learning_rate": 4.4494078556616246e-06,
"loss": 0.1184,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13354867696762085,
"step": 10755
},
{
"epoch": 5.6415094339622645,
"grad_norm": 0.7212560176849365,
"learning_rate": 4.4329853732548925e-06,
"loss": 0.1117,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08856201171875,
"step": 10760
},
{
"epoch": 5.64412997903564,
"grad_norm": 0.7965877652168274,
"learning_rate": 4.416589475813009e-06,
"loss": 0.1241,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0980224609375,
"step": 10765
},
{
"epoch": 5.646750524109015,
"grad_norm": 0.6735566854476929,
"learning_rate": 4.400220191336484e-06,
"loss": 0.1381,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1463538408279419,
"step": 10770
},
{
"epoch": 5.64937106918239,
"grad_norm": 0.7604423761367798,
"learning_rate": 4.383877547780378e-06,
"loss": 0.1159,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1259516477584839,
"step": 10775
},
{
"epoch": 5.651991614255765,
"grad_norm": 0.7322911024093628,
"learning_rate": 4.3675615730542505e-06,
"loss": 0.1254,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.125588059425354,
"step": 10780
},
{
"epoch": 5.65461215932914,
"grad_norm": 0.9131481647491455,
"learning_rate": 4.351272295022133e-06,
"loss": 0.1448,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1126449927687645,
"step": 10785
},
{
"epoch": 5.6572327044025155,
"grad_norm": 0.6480017304420471,
"learning_rate": 4.335009741502452e-06,
"loss": 0.1121,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09486915916204453,
"step": 10790
},
{
"epoch": 5.659853249475891,
"grad_norm": 0.8091151714324951,
"learning_rate": 4.318773940267991e-06,
"loss": 0.107,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.096101313829422,
"step": 10795
},
{
"epoch": 5.662473794549266,
"grad_norm": 0.6447314023971558,
"learning_rate": 4.302564919045855e-06,
"loss": 0.111,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09380515664815903,
"step": 10800
},
{
"epoch": 5.665094339622642,
"grad_norm": 0.6767293810844421,
"learning_rate": 4.286382705517407e-06,
"loss": 0.1232,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11664150655269623,
"step": 10805
},
{
"epoch": 5.667714884696017,
"grad_norm": 0.9022296667098999,
"learning_rate": 4.270227327318244e-06,
"loss": 0.1116,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0869140625,
"step": 10810
},
{
"epoch": 5.670335429769392,
"grad_norm": 0.6763343811035156,
"learning_rate": 4.2540988120381165e-06,
"loss": 0.1316,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12650880217552185,
"step": 10815
},
{
"epoch": 5.672955974842767,
"grad_norm": 0.7594171166419983,
"learning_rate": 4.23799718722091e-06,
"loss": 0.1104,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10680019855499268,
"step": 10820
},
{
"epoch": 5.6755765199161425,
"grad_norm": 0.694604754447937,
"learning_rate": 4.2219224803645795e-06,
"loss": 0.1084,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0924072265625,
"step": 10825
},
{
"epoch": 5.678197064989518,
"grad_norm": 0.752474308013916,
"learning_rate": 4.20587471892111e-06,
"loss": 0.141,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1477525681257248,
"step": 10830
},
{
"epoch": 5.680817610062893,
"grad_norm": 0.7720589637756348,
"learning_rate": 4.189853930296486e-06,
"loss": 0.1147,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08408753573894501,
"step": 10835
},
{
"epoch": 5.683438155136268,
"grad_norm": 0.7506592273712158,
"learning_rate": 4.173860141850612e-06,
"loss": 0.1201,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10575760155916214,
"step": 10840
},
{
"epoch": 5.686058700209644,
"grad_norm": 0.7803653478622437,
"learning_rate": 4.157893380897282e-06,
"loss": 0.1344,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12382243573665619,
"step": 10845
},
{
"epoch": 5.688679245283019,
"grad_norm": 0.7440863847732544,
"learning_rate": 4.1419536747041425e-06,
"loss": 0.124,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10115906596183777,
"step": 10850
},
{
"epoch": 5.691299790356394,
"grad_norm": 0.7986370325088501,
"learning_rate": 4.126041050492624e-06,
"loss": 0.1184,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0830078125,
"step": 10855
},
{
"epoch": 5.69392033542977,
"grad_norm": 0.6744484901428223,
"learning_rate": 4.110155535437927e-06,
"loss": 0.1042,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11421257257461548,
"step": 10860
},
{
"epoch": 5.696540880503145,
"grad_norm": 0.8917695879936218,
"learning_rate": 4.094297156668936e-06,
"loss": 0.1081,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10523484647274017,
"step": 10865
},
{
"epoch": 5.69916142557652,
"grad_norm": 0.6464312076568604,
"learning_rate": 4.078465941268204e-06,
"loss": 0.1231,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14706212282180786,
"step": 10870
},
{
"epoch": 5.701781970649895,
"grad_norm": 0.7350417375564575,
"learning_rate": 4.062661916271889e-06,
"loss": 0.1126,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08206360787153244,
"step": 10875
},
{
"epoch": 5.70440251572327,
"grad_norm": 0.7763562202453613,
"learning_rate": 4.046885108669709e-06,
"loss": 0.1184,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12558457255363464,
"step": 10880
},
{
"epoch": 5.7070230607966455,
"grad_norm": 0.7393556237220764,
"learning_rate": 4.031135545404923e-06,
"loss": 0.1203,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1080063134431839,
"step": 10885
},
{
"epoch": 5.709643605870021,
"grad_norm": 0.697335422039032,
"learning_rate": 4.015413253374239e-06,
"loss": 0.1077,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09124629944562912,
"step": 10890
},
{
"epoch": 5.712264150943396,
"grad_norm": 0.7845637202262878,
"learning_rate": 3.999718259427805e-06,
"loss": 0.1115,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08410429209470749,
"step": 10895
},
{
"epoch": 5.714884696016772,
"grad_norm": 0.8138260841369629,
"learning_rate": 3.9840505903691414e-06,
"loss": 0.1092,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.124274343252182,
"step": 10900
},
{
"epoch": 5.717505241090147,
"grad_norm": 12.967086791992188,
"learning_rate": 3.968410272955106e-06,
"loss": 0.1189,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14421498775482178,
"step": 10905
},
{
"epoch": 5.720125786163522,
"grad_norm": 0.7884721159934998,
"learning_rate": 3.952797333895855e-06,
"loss": 0.1322,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08462636172771454,
"step": 10910
},
{
"epoch": 5.722746331236897,
"grad_norm": 0.754406750202179,
"learning_rate": 3.937211799854781e-06,
"loss": 0.1356,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14049619436264038,
"step": 10915
},
{
"epoch": 5.7253668763102725,
"grad_norm": 0.7726715803146362,
"learning_rate": 3.921653697448475e-06,
"loss": 0.0934,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07763671875,
"step": 10920
},
{
"epoch": 5.727987421383648,
"grad_norm": 0.7438585162162781,
"learning_rate": 3.90612305324668e-06,
"loss": 0.1021,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.090576171875,
"step": 10925
},
{
"epoch": 5.730607966457023,
"grad_norm": 0.776539146900177,
"learning_rate": 3.890619893772245e-06,
"loss": 0.1162,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12244457006454468,
"step": 10930
},
{
"epoch": 5.733228511530398,
"grad_norm": 0.7650336623191833,
"learning_rate": 3.875144245501093e-06,
"loss": 0.1151,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08882482349872589,
"step": 10935
},
{
"epoch": 5.735849056603773,
"grad_norm": 0.7434911131858826,
"learning_rate": 3.859696134862152e-06,
"loss": 0.1162,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09627208113670349,
"step": 10940
},
{
"epoch": 5.738469601677149,
"grad_norm": 0.7924013137817383,
"learning_rate": 3.844275588237325e-06,
"loss": 0.1037,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12574627995491028,
"step": 10945
},
{
"epoch": 5.741090146750524,
"grad_norm": 0.806307315826416,
"learning_rate": 3.828882631961442e-06,
"loss": 0.1078,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10893553495407104,
"step": 10950
},
{
"epoch": 5.7437106918239,
"grad_norm": 0.7530555725097656,
"learning_rate": 3.813517292322215e-06,
"loss": 0.1162,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13084349036216736,
"step": 10955
},
{
"epoch": 5.746331236897275,
"grad_norm": 0.7580758929252625,
"learning_rate": 3.7981795955601896e-06,
"loss": 0.0995,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10415025055408478,
"step": 10960
},
{
"epoch": 5.74895178197065,
"grad_norm": 0.6946631669998169,
"learning_rate": 3.7828695678687166e-06,
"loss": 0.116,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11199931800365448,
"step": 10965
},
{
"epoch": 5.751572327044025,
"grad_norm": 0.7735135555267334,
"learning_rate": 3.7675872353938814e-06,
"loss": 0.1215,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12082436680793762,
"step": 10970
},
{
"epoch": 5.7541928721174,
"grad_norm": 0.8254667520523071,
"learning_rate": 3.7523326242344717e-06,
"loss": 0.1167,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11641869693994522,
"step": 10975
},
{
"epoch": 5.756813417190775,
"grad_norm": 0.7062658071517944,
"learning_rate": 3.7371057604419415e-06,
"loss": 0.13,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13770698010921478,
"step": 10980
},
{
"epoch": 5.759433962264151,
"grad_norm": 0.6900745034217834,
"learning_rate": 3.7219066700203455e-06,
"loss": 0.1072,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1039891242980957,
"step": 10985
},
{
"epoch": 5.762054507337526,
"grad_norm": 0.7733207941055298,
"learning_rate": 3.7067353789263294e-06,
"loss": 0.1303,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12287549674510956,
"step": 10990
},
{
"epoch": 5.764675052410902,
"grad_norm": 0.9098268151283264,
"learning_rate": 3.691591913069048e-06,
"loss": 0.1093,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08826844394207001,
"step": 10995
},
{
"epoch": 5.767295597484277,
"grad_norm": 0.7530337572097778,
"learning_rate": 3.6764762983101344e-06,
"loss": 0.1079,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12231530994176865,
"step": 11000
},
{
"epoch": 5.769916142557652,
"grad_norm": 0.669246256351471,
"learning_rate": 3.6613885604636703e-06,
"loss": 0.1088,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13573946058750153,
"step": 11005
},
{
"epoch": 5.772536687631027,
"grad_norm": 0.7240214347839355,
"learning_rate": 3.6463287252961134e-06,
"loss": 0.1233,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14697837829589844,
"step": 11010
},
{
"epoch": 5.7751572327044025,
"grad_norm": 0.6674785614013672,
"learning_rate": 3.6312968185262908e-06,
"loss": 0.1284,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13692863285541534,
"step": 11015
},
{
"epoch": 5.777777777777778,
"grad_norm": 0.7756078839302063,
"learning_rate": 3.6162928658253195e-06,
"loss": 0.1187,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12126202881336212,
"step": 11020
},
{
"epoch": 5.780398322851153,
"grad_norm": 0.7499447464942932,
"learning_rate": 3.601316892816582e-06,
"loss": 0.1173,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14638075232505798,
"step": 11025
},
{
"epoch": 5.783018867924528,
"grad_norm": 1.003929853439331,
"learning_rate": 3.586368925075674e-06,
"loss": 0.1114,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16575685143470764,
"step": 11030
},
{
"epoch": 5.785639412997903,
"grad_norm": 0.722449541091919,
"learning_rate": 3.571448988130364e-06,
"loss": 0.1231,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11033296585083008,
"step": 11035
},
{
"epoch": 5.788259958071279,
"grad_norm": 0.8108361959457397,
"learning_rate": 3.556557107460565e-06,
"loss": 0.1049,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.084716796875,
"step": 11040
},
{
"epoch": 5.790880503144654,
"grad_norm": 0.7148273587226868,
"learning_rate": 3.5416933084982576e-06,
"loss": 0.1154,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14857742190361023,
"step": 11045
},
{
"epoch": 5.79350104821803,
"grad_norm": 0.7880802750587463,
"learning_rate": 3.52685761662747e-06,
"loss": 0.1209,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08220598101615906,
"step": 11050
},
{
"epoch": 5.796121593291405,
"grad_norm": 0.7853556275367737,
"learning_rate": 3.5120500571842375e-06,
"loss": 0.1079,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09490779042243958,
"step": 11055
},
{
"epoch": 5.79874213836478,
"grad_norm": 0.7573359608650208,
"learning_rate": 3.497270655456537e-06,
"loss": 0.1134,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12122859060764313,
"step": 11060
},
{
"epoch": 5.801362683438155,
"grad_norm": 0.69629967212677,
"learning_rate": 3.4825194366842797e-06,
"loss": 0.1126,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1040906012058258,
"step": 11065
},
{
"epoch": 5.80398322851153,
"grad_norm": 0.7572780251502991,
"learning_rate": 3.4677964260592267e-06,
"loss": 0.1278,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1337701380252838,
"step": 11070
},
{
"epoch": 5.806603773584905,
"grad_norm": 0.8365534543991089,
"learning_rate": 3.4531016487249747e-06,
"loss": 0.1145,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07947821915149689,
"step": 11075
},
{
"epoch": 5.809224318658281,
"grad_norm": 0.771355152130127,
"learning_rate": 3.438435129776905e-06,
"loss": 0.1152,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10697413980960846,
"step": 11080
},
{
"epoch": 5.811844863731656,
"grad_norm": 0.7912852764129639,
"learning_rate": 3.42379689426213e-06,
"loss": 0.1272,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13008055090904236,
"step": 11085
},
{
"epoch": 5.814465408805032,
"grad_norm": 0.7045580744743347,
"learning_rate": 3.409186967179483e-06,
"loss": 0.1188,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1028495579957962,
"step": 11090
},
{
"epoch": 5.817085953878407,
"grad_norm": 0.7422047853469849,
"learning_rate": 3.394605373479427e-06,
"loss": 0.107,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10188481211662292,
"step": 11095
},
{
"epoch": 5.819706498951782,
"grad_norm": 0.7149353623390198,
"learning_rate": 3.3800521380640538e-06,
"loss": 0.116,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12615402042865753,
"step": 11100
},
{
"epoch": 5.822327044025157,
"grad_norm": 0.7516360282897949,
"learning_rate": 3.3655272857870202e-06,
"loss": 0.1117,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1172906905412674,
"step": 11105
},
{
"epoch": 5.8249475890985325,
"grad_norm": 0.718627393245697,
"learning_rate": 3.3510308414535062e-06,
"loss": 0.1053,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12420696020126343,
"step": 11110
},
{
"epoch": 5.827568134171908,
"grad_norm": 0.6898729205131531,
"learning_rate": 3.3365628298201935e-06,
"loss": 0.1181,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11398045718669891,
"step": 11115
},
{
"epoch": 5.830188679245283,
"grad_norm": 0.7775851488113403,
"learning_rate": 3.3221232755951903e-06,
"loss": 0.1196,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1542092263698578,
"step": 11120
},
{
"epoch": 5.832809224318658,
"grad_norm": 0.7814466953277588,
"learning_rate": 3.307712203438014e-06,
"loss": 0.129,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10209138691425323,
"step": 11125
},
{
"epoch": 5.835429769392033,
"grad_norm": 0.851301908493042,
"learning_rate": 3.2933296379595394e-06,
"loss": 0.1073,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0756775438785553,
"step": 11130
},
{
"epoch": 5.838050314465409,
"grad_norm": 0.8173316717147827,
"learning_rate": 3.2789756037219524e-06,
"loss": 0.1013,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1055392324924469,
"step": 11135
},
{
"epoch": 5.840670859538784,
"grad_norm": 0.7540410757064819,
"learning_rate": 3.2646501252387287e-06,
"loss": 0.1248,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09263626486063004,
"step": 11140
},
{
"epoch": 5.84329140461216,
"grad_norm": 0.8481481671333313,
"learning_rate": 3.2503532269745654e-06,
"loss": 0.1139,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13721612095832825,
"step": 11145
},
{
"epoch": 5.845911949685535,
"grad_norm": 0.734102725982666,
"learning_rate": 3.2360849333453515e-06,
"loss": 0.1061,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11949004232883453,
"step": 11150
},
{
"epoch": 5.84853249475891,
"grad_norm": 0.8791471123695374,
"learning_rate": 3.221845268718129e-06,
"loss": 0.1198,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11480453610420227,
"step": 11155
},
{
"epoch": 5.851153039832285,
"grad_norm": 0.7812302708625793,
"learning_rate": 3.20763425741105e-06,
"loss": 0.0916,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.04547119140625,
"step": 11160
},
{
"epoch": 5.85377358490566,
"grad_norm": 0.8186652064323425,
"learning_rate": 3.1934519236933204e-06,
"loss": 0.1066,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10640643537044525,
"step": 11165
},
{
"epoch": 5.856394129979035,
"grad_norm": 0.802610456943512,
"learning_rate": 3.1792982917851932e-06,
"loss": 0.1217,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08154296875,
"step": 11170
},
{
"epoch": 5.859014675052411,
"grad_norm": 0.7426859140396118,
"learning_rate": 3.165173385857889e-06,
"loss": 0.1099,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0936279296875,
"step": 11175
},
{
"epoch": 5.861635220125786,
"grad_norm": 0.7227411866188049,
"learning_rate": 3.1510772300335747e-06,
"loss": 0.1246,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12074403464794159,
"step": 11180
},
{
"epoch": 5.864255765199162,
"grad_norm": 0.9096740484237671,
"learning_rate": 3.1370098483853173e-06,
"loss": 0.1039,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09936092048883438,
"step": 11185
},
{
"epoch": 5.866876310272537,
"grad_norm": 0.6973651647567749,
"learning_rate": 3.1229712649370403e-06,
"loss": 0.1027,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1065673828125,
"step": 11190
},
{
"epoch": 5.869496855345912,
"grad_norm": 0.6310497522354126,
"learning_rate": 3.1089615036635034e-06,
"loss": 0.129,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1379322111606598,
"step": 11195
},
{
"epoch": 5.872117400419287,
"grad_norm": 0.6595391035079956,
"learning_rate": 3.094980588490224e-06,
"loss": 0.1342,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13927483558654785,
"step": 11200
},
{
"epoch": 5.8747379454926625,
"grad_norm": 0.6636078357696533,
"learning_rate": 3.0810285432934695e-06,
"loss": 0.1142,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08349460363388062,
"step": 11205
},
{
"epoch": 5.877358490566038,
"grad_norm": 0.7393668293952942,
"learning_rate": 3.0671053919001957e-06,
"loss": 0.101,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07989126443862915,
"step": 11210
},
{
"epoch": 5.879979035639413,
"grad_norm": 0.6790786981582642,
"learning_rate": 3.0532111580880163e-06,
"loss": 0.1229,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10980883240699768,
"step": 11215
},
{
"epoch": 5.882599580712788,
"grad_norm": 0.7920456528663635,
"learning_rate": 3.039345865585168e-06,
"loss": 0.121,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0809326171875,
"step": 11220
},
{
"epoch": 5.885220125786163,
"grad_norm": 0.8068638443946838,
"learning_rate": 3.025509538070457e-06,
"loss": 0.1089,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11082607507705688,
"step": 11225
},
{
"epoch": 5.887840670859539,
"grad_norm": 0.6608523726463318,
"learning_rate": 3.011702199173221e-06,
"loss": 0.1042,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12623870372772217,
"step": 11230
},
{
"epoch": 5.890461215932914,
"grad_norm": 0.68641197681427,
"learning_rate": 2.997923872473292e-06,
"loss": 0.1031,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11337362229824066,
"step": 11235
},
{
"epoch": 5.8930817610062896,
"grad_norm": 0.8305394053459167,
"learning_rate": 2.9841745815009558e-06,
"loss": 0.1139,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.109619140625,
"step": 11240
},
{
"epoch": 5.895702306079665,
"grad_norm": 0.6855704188346863,
"learning_rate": 2.97045434973692e-06,
"loss": 0.1058,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.073974609375,
"step": 11245
},
{
"epoch": 5.89832285115304,
"grad_norm": 0.7410557866096497,
"learning_rate": 2.956763200612256e-06,
"loss": 0.1075,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10438671708106995,
"step": 11250
},
{
"epoch": 5.900943396226415,
"grad_norm": 0.8150331974029541,
"learning_rate": 2.9431011575083723e-06,
"loss": 0.1139,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0777587890625,
"step": 11255
},
{
"epoch": 5.90356394129979,
"grad_norm": 0.6317834854125977,
"learning_rate": 2.92946824375697e-06,
"loss": 0.1081,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12193545699119568,
"step": 11260
},
{
"epoch": 5.906184486373165,
"grad_norm": 0.7798405289649963,
"learning_rate": 2.9158644826399986e-06,
"loss": 0.1347,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11343325674533844,
"step": 11265
},
{
"epoch": 5.908805031446541,
"grad_norm": 0.7575414180755615,
"learning_rate": 2.902289897389634e-06,
"loss": 0.1362,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14311093091964722,
"step": 11270
},
{
"epoch": 5.911425576519916,
"grad_norm": 0.9065014123916626,
"learning_rate": 2.8887445111882194e-06,
"loss": 0.1096,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08463536202907562,
"step": 11275
},
{
"epoch": 5.914046121593291,
"grad_norm": 0.8347724080085754,
"learning_rate": 2.8752283471682284e-06,
"loss": 0.1131,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08719807118177414,
"step": 11280
},
{
"epoch": 5.916666666666667,
"grad_norm": 0.7125123739242554,
"learning_rate": 2.861741428412237e-06,
"loss": 0.1446,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1454043686389923,
"step": 11285
},
{
"epoch": 5.919287211740042,
"grad_norm": 0.8080534338951111,
"learning_rate": 2.848283777952865e-06,
"loss": 0.1146,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11547332257032394,
"step": 11290
},
{
"epoch": 5.921907756813417,
"grad_norm": 0.7206215262413025,
"learning_rate": 2.8348554187727685e-06,
"loss": 0.1172,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13346555829048157,
"step": 11295
},
{
"epoch": 5.9245283018867925,
"grad_norm": 0.6634690165519714,
"learning_rate": 2.821456373804563e-06,
"loss": 0.1191,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09581804275512695,
"step": 11300
},
{
"epoch": 5.927148846960168,
"grad_norm": 0.6794573664665222,
"learning_rate": 2.8080866659308114e-06,
"loss": 0.1303,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13424135744571686,
"step": 11305
},
{
"epoch": 5.929769392033543,
"grad_norm": 0.6448708772659302,
"learning_rate": 2.794746317983967e-06,
"loss": 0.1451,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15221071243286133,
"step": 11310
},
{
"epoch": 5.932389937106918,
"grad_norm": 0.5301897525787354,
"learning_rate": 2.7814353527463488e-06,
"loss": 0.1203,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17727480828762054,
"step": 11315
},
{
"epoch": 5.935010482180293,
"grad_norm": 0.7591878175735474,
"learning_rate": 2.7681537929501034e-06,
"loss": 0.1077,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11045671999454498,
"step": 11320
},
{
"epoch": 5.937631027253669,
"grad_norm": 0.7652134895324707,
"learning_rate": 2.754901661277145e-06,
"loss": 0.1388,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.135330468416214,
"step": 11325
},
{
"epoch": 5.940251572327044,
"grad_norm": 0.7571619153022766,
"learning_rate": 2.7416789803591394e-06,
"loss": 0.112,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12743711471557617,
"step": 11330
},
{
"epoch": 5.9428721174004195,
"grad_norm": 0.8233155012130737,
"learning_rate": 2.7284857727774584e-06,
"loss": 0.1181,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08775623887777328,
"step": 11335
},
{
"epoch": 5.945492662473795,
"grad_norm": 0.7032680511474609,
"learning_rate": 2.71532206106313e-06,
"loss": 0.1151,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15328362584114075,
"step": 11340
},
{
"epoch": 5.94811320754717,
"grad_norm": 0.749940812587738,
"learning_rate": 2.7021878676968285e-06,
"loss": 0.1085,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.118592768907547,
"step": 11345
},
{
"epoch": 5.950733752620545,
"grad_norm": 0.8226772546768188,
"learning_rate": 2.689083215108799e-06,
"loss": 0.1318,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12129893153905869,
"step": 11350
},
{
"epoch": 5.95335429769392,
"grad_norm": 0.6892474293708801,
"learning_rate": 2.6760081256788482e-06,
"loss": 0.1345,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12827900052070618,
"step": 11355
},
{
"epoch": 5.955974842767295,
"grad_norm": 0.7389814257621765,
"learning_rate": 2.6629626217362914e-06,
"loss": 0.118,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1389472335577011,
"step": 11360
},
{
"epoch": 5.9585953878406706,
"grad_norm": 0.7197535037994385,
"learning_rate": 2.6499467255599153e-06,
"loss": 0.09,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11740542948246002,
"step": 11365
},
{
"epoch": 5.961215932914046,
"grad_norm": 0.6489915251731873,
"learning_rate": 2.636960459377955e-06,
"loss": 0.1172,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1260216236114502,
"step": 11370
},
{
"epoch": 5.963836477987421,
"grad_norm": 0.7277437448501587,
"learning_rate": 2.624003845368035e-06,
"loss": 0.1164,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10765425115823746,
"step": 11375
},
{
"epoch": 5.966457023060797,
"grad_norm": 0.6941485404968262,
"learning_rate": 2.6110769056571394e-06,
"loss": 0.1109,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0910947397351265,
"step": 11380
},
{
"epoch": 5.969077568134172,
"grad_norm": 0.6762799024581909,
"learning_rate": 2.598179662321576e-06,
"loss": 0.1256,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1689760833978653,
"step": 11385
},
{
"epoch": 5.971698113207547,
"grad_norm": 0.7307757139205933,
"learning_rate": 2.585312137386946e-06,
"loss": 0.1161,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0966796875,
"step": 11390
},
{
"epoch": 5.9743186582809225,
"grad_norm": 0.7055436372756958,
"learning_rate": 2.5724743528280828e-06,
"loss": 0.1224,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14848220348358154,
"step": 11395
},
{
"epoch": 5.976939203354298,
"grad_norm": 0.7922831773757935,
"learning_rate": 2.5596663305690506e-06,
"loss": 0.1132,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11176641285419464,
"step": 11400
},
{
"epoch": 5.979559748427673,
"grad_norm": 0.6653242707252502,
"learning_rate": 2.5468880924830685e-06,
"loss": 0.1065,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09289851784706116,
"step": 11405
},
{
"epoch": 5.982180293501048,
"grad_norm": 0.6594162583351135,
"learning_rate": 2.5341396603924984e-06,
"loss": 0.12,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12634675204753876,
"step": 11410
},
{
"epoch": 5.984800838574423,
"grad_norm": 0.7897002696990967,
"learning_rate": 2.5214210560688e-06,
"loss": 0.1205,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1091359332203865,
"step": 11415
},
{
"epoch": 5.987421383647799,
"grad_norm": 0.8033177852630615,
"learning_rate": 2.508732301232486e-06,
"loss": 0.1163,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12310639023780823,
"step": 11420
},
{
"epoch": 5.990041928721174,
"grad_norm": 0.7952110171318054,
"learning_rate": 2.496073417553113e-06,
"loss": 0.1052,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09496569633483887,
"step": 11425
},
{
"epoch": 5.9926624737945495,
"grad_norm": 0.8437222838401794,
"learning_rate": 2.483444426649202e-06,
"loss": 0.1054,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0804443359375,
"step": 11430
},
{
"epoch": 5.995283018867925,
"grad_norm": 0.7011773586273193,
"learning_rate": 2.470845350088238e-06,
"loss": 0.1297,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14462977647781372,
"step": 11435
},
{
"epoch": 5.9979035639413,
"grad_norm": 0.7636061906814575,
"learning_rate": 2.45827620938661e-06,
"loss": 0.1086,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0979013666510582,
"step": 11440
},
{
"epoch": 6.00104821802935,
"grad_norm": 0.6727458238601685,
"learning_rate": 2.4457370260095846e-06,
"loss": 0.1008,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12848661839962006,
"step": 11445
},
{
"epoch": 6.003668763102725,
"grad_norm": 0.9353975653648376,
"learning_rate": 2.4332278213712824e-06,
"loss": 0.1281,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11782145500183105,
"step": 11450
},
{
"epoch": 6.0062893081761,
"grad_norm": 0.7893511652946472,
"learning_rate": 2.420748616834607e-06,
"loss": 0.1017,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12847980856895447,
"step": 11455
},
{
"epoch": 6.008909853249476,
"grad_norm": 0.7531543374061584,
"learning_rate": 2.4082994337112386e-06,
"loss": 0.0989,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0888490155339241,
"step": 11460
},
{
"epoch": 6.011530398322851,
"grad_norm": 0.6247485876083374,
"learning_rate": 2.3958802932615856e-06,
"loss": 0.1116,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12102822959423065,
"step": 11465
},
{
"epoch": 6.014150943396227,
"grad_norm": 0.7316791415214539,
"learning_rate": 2.3834912166947487e-06,
"loss": 0.1007,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0911519005894661,
"step": 11470
},
{
"epoch": 6.016771488469602,
"grad_norm": 0.7604213953018188,
"learning_rate": 2.3711322251684956e-06,
"loss": 0.0862,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0772705078125,
"step": 11475
},
{
"epoch": 6.019392033542977,
"grad_norm": 0.8012372851371765,
"learning_rate": 2.358803339789202e-06,
"loss": 0.1082,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11557720601558685,
"step": 11480
},
{
"epoch": 6.022012578616352,
"grad_norm": 0.8371411561965942,
"learning_rate": 2.346504581611837e-06,
"loss": 0.1016,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10546975582838058,
"step": 11485
},
{
"epoch": 6.0246331236897275,
"grad_norm": 0.6822553873062134,
"learning_rate": 2.3342359716399175e-06,
"loss": 0.1137,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15785232186317444,
"step": 11490
},
{
"epoch": 6.027253668763103,
"grad_norm": 0.673607349395752,
"learning_rate": 2.3219975308254684e-06,
"loss": 0.1141,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11714327335357666,
"step": 11495
},
{
"epoch": 6.029874213836478,
"grad_norm": 0.7590088248252869,
"learning_rate": 2.309789280069008e-06,
"loss": 0.1172,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10126854479312897,
"step": 11500
},
{
"epoch": 6.032494758909853,
"grad_norm": 0.73857581615448,
"learning_rate": 2.297611240219484e-06,
"loss": 0.1008,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10918985307216644,
"step": 11505
},
{
"epoch": 6.035115303983228,
"grad_norm": 0.7437694072723389,
"learning_rate": 2.2854634320742487e-06,
"loss": 0.1031,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08391405642032623,
"step": 11510
},
{
"epoch": 6.037735849056604,
"grad_norm": 0.7067047357559204,
"learning_rate": 2.273345876379036e-06,
"loss": 0.11,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.136289581656456,
"step": 11515
},
{
"epoch": 6.040356394129979,
"grad_norm": 0.6687672138214111,
"learning_rate": 2.2612585938278996e-06,
"loss": 0.1113,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12116891145706177,
"step": 11520
},
{
"epoch": 6.0429769392033545,
"grad_norm": 0.6922745704650879,
"learning_rate": 2.249201605063216e-06,
"loss": 0.1183,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1263124644756317,
"step": 11525
},
{
"epoch": 6.04559748427673,
"grad_norm": 0.6233412623405457,
"learning_rate": 2.2371749306756073e-06,
"loss": 0.1169,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14116165041923523,
"step": 11530
},
{
"epoch": 6.048218029350105,
"grad_norm": 0.7557879686355591,
"learning_rate": 2.2251785912039357e-06,
"loss": 0.1279,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14125700294971466,
"step": 11535
},
{
"epoch": 6.05083857442348,
"grad_norm": 0.7540751099586487,
"learning_rate": 2.213212607135251e-06,
"loss": 0.1182,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07376109063625336,
"step": 11540
},
{
"epoch": 6.053459119496855,
"grad_norm": 0.7563124895095825,
"learning_rate": 2.2012769989047665e-06,
"loss": 0.1105,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0994349867105484,
"step": 11545
},
{
"epoch": 6.05607966457023,
"grad_norm": 0.7129912972450256,
"learning_rate": 2.1893717868958243e-06,
"loss": 0.1056,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12413402646780014,
"step": 11550
},
{
"epoch": 6.058700209643606,
"grad_norm": 0.7121222019195557,
"learning_rate": 2.177496991439851e-06,
"loss": 0.1313,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12695494294166565,
"step": 11555
},
{
"epoch": 6.061320754716981,
"grad_norm": 0.7974750995635986,
"learning_rate": 2.165652632816331e-06,
"loss": 0.1082,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.06833033263683319,
"step": 11560
},
{
"epoch": 6.063941299790357,
"grad_norm": 0.676853597164154,
"learning_rate": 2.1538387312527665e-06,
"loss": 0.1165,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15408313274383545,
"step": 11565
},
{
"epoch": 6.066561844863732,
"grad_norm": 0.6463554501533508,
"learning_rate": 2.1420553069246462e-06,
"loss": 0.0979,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08351656794548035,
"step": 11570
},
{
"epoch": 6.069182389937107,
"grad_norm": 0.7661669850349426,
"learning_rate": 2.13030237995542e-06,
"loss": 0.0975,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08501426875591278,
"step": 11575
},
{
"epoch": 6.071802935010482,
"grad_norm": 0.7557024955749512,
"learning_rate": 2.1185799704164432e-06,
"loss": 0.091,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0771484375,
"step": 11580
},
{
"epoch": 6.0744234800838575,
"grad_norm": 0.7830671072006226,
"learning_rate": 2.10688809832696e-06,
"loss": 0.1031,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0934370681643486,
"step": 11585
},
{
"epoch": 6.077044025157233,
"grad_norm": 0.6532689929008484,
"learning_rate": 2.0952267836540608e-06,
"loss": 0.1198,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1707833707332611,
"step": 11590
},
{
"epoch": 6.079664570230608,
"grad_norm": 0.8896797895431519,
"learning_rate": 2.08359604631265e-06,
"loss": 0.1157,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10823449492454529,
"step": 11595
},
{
"epoch": 6.082285115303983,
"grad_norm": 0.7435277700424194,
"learning_rate": 2.0719959061654137e-06,
"loss": 0.1184,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1346738636493683,
"step": 11600
},
{
"epoch": 6.084905660377358,
"grad_norm": 0.6748736500740051,
"learning_rate": 2.0604263830227957e-06,
"loss": 0.1234,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1394091546535492,
"step": 11605
},
{
"epoch": 6.087526205450734,
"grad_norm": 0.8019354343414307,
"learning_rate": 2.0488874966429352e-06,
"loss": 0.0817,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.06707763671875,
"step": 11610
},
{
"epoch": 6.090146750524109,
"grad_norm": 0.8450024724006653,
"learning_rate": 2.0373792667316604e-06,
"loss": 0.0987,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.072906494140625,
"step": 11615
},
{
"epoch": 6.0927672955974845,
"grad_norm": 0.6614810228347778,
"learning_rate": 2.0259017129424417e-06,
"loss": 0.1231,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12428902834653854,
"step": 11620
},
{
"epoch": 6.09538784067086,
"grad_norm": 0.7350912094116211,
"learning_rate": 2.0144548548763643e-06,
"loss": 0.108,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10314503312110901,
"step": 11625
},
{
"epoch": 6.098008385744235,
"grad_norm": 0.6813164353370667,
"learning_rate": 2.0030387120820927e-06,
"loss": 0.0986,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08183074742555618,
"step": 11630
},
{
"epoch": 6.10062893081761,
"grad_norm": 0.7465524077415466,
"learning_rate": 1.991653304055836e-06,
"loss": 0.1131,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10792015492916107,
"step": 11635
},
{
"epoch": 6.103249475890985,
"grad_norm": 0.723408579826355,
"learning_rate": 1.9802986502413103e-06,
"loss": 0.0986,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0633544921875,
"step": 11640
},
{
"epoch": 6.10587002096436,
"grad_norm": 0.7599599361419678,
"learning_rate": 1.9689747700297167e-06,
"loss": 0.1017,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07861328125,
"step": 11645
},
{
"epoch": 6.1084905660377355,
"grad_norm": 0.7111448645591736,
"learning_rate": 1.9576816827596934e-06,
"loss": 0.103,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15611860156059265,
"step": 11650
},
{
"epoch": 6.111111111111111,
"grad_norm": 0.7046041488647461,
"learning_rate": 1.946419407717308e-06,
"loss": 0.1103,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12058594822883606,
"step": 11655
},
{
"epoch": 6.113731656184487,
"grad_norm": 0.7179438471794128,
"learning_rate": 1.9351879641359895e-06,
"loss": 0.0852,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0791943222284317,
"step": 11660
},
{
"epoch": 6.116352201257862,
"grad_norm": 0.7104761600494385,
"learning_rate": 1.923987371196523e-06,
"loss": 0.1172,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11489333212375641,
"step": 11665
},
{
"epoch": 6.118972746331237,
"grad_norm": 0.7625128626823425,
"learning_rate": 1.9128176480270057e-06,
"loss": 0.1134,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1123046875,
"step": 11670
},
{
"epoch": 6.121593291404612,
"grad_norm": 0.7586326599121094,
"learning_rate": 1.9016788137028142e-06,
"loss": 0.0924,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10534888505935669,
"step": 11675
},
{
"epoch": 6.1242138364779874,
"grad_norm": 0.77974933385849,
"learning_rate": 1.8905708872465788e-06,
"loss": 0.1094,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.06809628754854202,
"step": 11680
},
{
"epoch": 6.126834381551363,
"grad_norm": 0.8171315789222717,
"learning_rate": 1.8794938876281432e-06,
"loss": 0.1131,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08233642578125,
"step": 11685
},
{
"epoch": 6.129454926624738,
"grad_norm": 0.8048850893974304,
"learning_rate": 1.868447833764535e-06,
"loss": 0.1052,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12104252725839615,
"step": 11690
},
{
"epoch": 6.132075471698113,
"grad_norm": 0.7302783727645874,
"learning_rate": 1.8574327445199315e-06,
"loss": 0.1192,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13073480129241943,
"step": 11695
},
{
"epoch": 6.134696016771488,
"grad_norm": 0.7510926127433777,
"learning_rate": 1.8464486387056291e-06,
"loss": 0.1074,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11252527683973312,
"step": 11700
},
{
"epoch": 6.137316561844864,
"grad_norm": 0.6938128471374512,
"learning_rate": 1.8354955350800163e-06,
"loss": 0.1319,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17961004376411438,
"step": 11705
},
{
"epoch": 6.139937106918239,
"grad_norm": 0.7121496200561523,
"learning_rate": 1.824573452348537e-06,
"loss": 0.119,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11327329277992249,
"step": 11710
},
{
"epoch": 6.1425576519916145,
"grad_norm": 0.7868003249168396,
"learning_rate": 1.8136824091636552e-06,
"loss": 0.1068,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11851361393928528,
"step": 11715
},
{
"epoch": 6.14517819706499,
"grad_norm": 0.7592499256134033,
"learning_rate": 1.8028224241248238e-06,
"loss": 0.1218,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12001194059848785,
"step": 11720
},
{
"epoch": 6.147798742138365,
"grad_norm": 0.7448607087135315,
"learning_rate": 1.7919935157784585e-06,
"loss": 0.1018,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14102524518966675,
"step": 11725
},
{
"epoch": 6.15041928721174,
"grad_norm": 0.7317970395088196,
"learning_rate": 1.7811957026179017e-06,
"loss": 0.1073,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1257852464914322,
"step": 11730
},
{
"epoch": 6.153039832285115,
"grad_norm": 0.7459965348243713,
"learning_rate": 1.770429003083396e-06,
"loss": 0.1186,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12738224864006042,
"step": 11735
},
{
"epoch": 6.15566037735849,
"grad_norm": 0.7520008683204651,
"learning_rate": 1.7596934355620465e-06,
"loss": 0.1376,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15343382954597473,
"step": 11740
},
{
"epoch": 6.1582809224318655,
"grad_norm": 0.7318786382675171,
"learning_rate": 1.74898901838779e-06,
"loss": 0.1016,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.061004638671875,
"step": 11745
},
{
"epoch": 6.160901467505241,
"grad_norm": 0.7465088367462158,
"learning_rate": 1.7383157698413676e-06,
"loss": 0.1032,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11167745292186737,
"step": 11750
},
{
"epoch": 6.163522012578617,
"grad_norm": 0.6921051740646362,
"learning_rate": 1.727673708150286e-06,
"loss": 0.1165,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15981853008270264,
"step": 11755
},
{
"epoch": 6.166142557651992,
"grad_norm": 0.7283172607421875,
"learning_rate": 1.7170628514888044e-06,
"loss": 0.1056,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.052154541015625,
"step": 11760
},
{
"epoch": 6.168763102725367,
"grad_norm": 0.7145721316337585,
"learning_rate": 1.7064832179778768e-06,
"loss": 0.1035,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10321943461894989,
"step": 11765
},
{
"epoch": 6.171383647798742,
"grad_norm": 0.7393687963485718,
"learning_rate": 1.695934825685146e-06,
"loss": 0.1071,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10480769723653793,
"step": 11770
},
{
"epoch": 6.174004192872117,
"grad_norm": 0.7510984539985657,
"learning_rate": 1.6854176926248956e-06,
"loss": 0.1122,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12445969879627228,
"step": 11775
},
{
"epoch": 6.176624737945493,
"grad_norm": 0.7398217916488647,
"learning_rate": 1.6749318367580203e-06,
"loss": 0.1281,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10730582475662231,
"step": 11780
},
{
"epoch": 6.179245283018868,
"grad_norm": 0.6224637627601624,
"learning_rate": 1.6644772759920157e-06,
"loss": 0.1002,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1414836049079895,
"step": 11785
},
{
"epoch": 6.181865828092243,
"grad_norm": 0.8317475318908691,
"learning_rate": 1.6540540281809225e-06,
"loss": 0.1098,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0701904296875,
"step": 11790
},
{
"epoch": 6.184486373165618,
"grad_norm": 0.7141923904418945,
"learning_rate": 1.6436621111253036e-06,
"loss": 0.1059,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1531287431716919,
"step": 11795
},
{
"epoch": 6.187106918238993,
"grad_norm": 0.7269362807273865,
"learning_rate": 1.633301542572221e-06,
"loss": 0.1121,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12649017572402954,
"step": 11800
},
{
"epoch": 6.189727463312369,
"grad_norm": 0.7622941136360168,
"learning_rate": 1.6229723402151987e-06,
"loss": 0.1087,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12106990069150925,
"step": 11805
},
{
"epoch": 6.1923480083857445,
"grad_norm": 0.8292533159255981,
"learning_rate": 1.6126745216941908e-06,
"loss": 0.1017,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0821533203125,
"step": 11810
},
{
"epoch": 6.19496855345912,
"grad_norm": 0.8477041721343994,
"learning_rate": 1.6024081045955652e-06,
"loss": 0.0963,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09291965514421463,
"step": 11815
},
{
"epoch": 6.197589098532495,
"grad_norm": 0.6917146444320679,
"learning_rate": 1.5921731064520552e-06,
"loss": 0.1021,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13043524324893951,
"step": 11820
},
{
"epoch": 6.20020964360587,
"grad_norm": 0.7436804175376892,
"learning_rate": 1.5819695447427364e-06,
"loss": 0.1305,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13131611049175262,
"step": 11825
},
{
"epoch": 6.202830188679245,
"grad_norm": 0.7311981320381165,
"learning_rate": 1.5717974368930033e-06,
"loss": 0.1249,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10772877931594849,
"step": 11830
},
{
"epoch": 6.20545073375262,
"grad_norm": 0.8186302185058594,
"learning_rate": 1.5616568002745247e-06,
"loss": 0.1019,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0828857421875,
"step": 11835
},
{
"epoch": 6.2080712788259955,
"grad_norm": 0.8164761662483215,
"learning_rate": 1.551547652205241e-06,
"loss": 0.1094,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1388622224330902,
"step": 11840
},
{
"epoch": 6.210691823899371,
"grad_norm": 0.7896164059638977,
"learning_rate": 1.541470009949302e-06,
"loss": 0.1099,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1046142578125,
"step": 11845
},
{
"epoch": 6.213312368972747,
"grad_norm": 0.7347028851509094,
"learning_rate": 1.5314238907170565e-06,
"loss": 0.1133,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09812355786561966,
"step": 11850
},
{
"epoch": 6.215932914046122,
"grad_norm": 0.7074013948440552,
"learning_rate": 1.5214093116650208e-06,
"loss": 0.1052,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07903372496366501,
"step": 11855
},
{
"epoch": 6.218553459119497,
"grad_norm": 0.7754842638969421,
"learning_rate": 1.511426289895841e-06,
"loss": 0.1125,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12341928482055664,
"step": 11860
},
{
"epoch": 6.221174004192872,
"grad_norm": 0.8584949970245361,
"learning_rate": 1.5014748424582859e-06,
"loss": 0.0933,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07411631941795349,
"step": 11865
},
{
"epoch": 6.223794549266247,
"grad_norm": 0.7787871956825256,
"learning_rate": 1.4915549863471901e-06,
"loss": 0.1041,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10432527214288712,
"step": 11870
},
{
"epoch": 6.226415094339623,
"grad_norm": 0.7362038493156433,
"learning_rate": 1.4816667385034378e-06,
"loss": 0.0907,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12271437793970108,
"step": 11875
},
{
"epoch": 6.229035639412998,
"grad_norm": 0.7616882920265198,
"learning_rate": 1.4718101158139343e-06,
"loss": 0.1054,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08734153211116791,
"step": 11880
},
{
"epoch": 6.231656184486373,
"grad_norm": 0.6927313804626465,
"learning_rate": 1.4619851351115787e-06,
"loss": 0.1097,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0926610603928566,
"step": 11885
},
{
"epoch": 6.234276729559748,
"grad_norm": 0.78810054063797,
"learning_rate": 1.4521918131752345e-06,
"loss": 0.1093,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0959969088435173,
"step": 11890
},
{
"epoch": 6.236897274633123,
"grad_norm": 0.8270073533058167,
"learning_rate": 1.4424301667296936e-06,
"loss": 0.1047,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07994962483644485,
"step": 11895
},
{
"epoch": 6.239517819706499,
"grad_norm": 0.7329707741737366,
"learning_rate": 1.4327002124456545e-06,
"loss": 0.1167,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11037370562553406,
"step": 11900
},
{
"epoch": 6.2421383647798745,
"grad_norm": 0.7574793696403503,
"learning_rate": 1.4230019669396966e-06,
"loss": 0.1232,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13014623522758484,
"step": 11905
},
{
"epoch": 6.24475890985325,
"grad_norm": 0.7613834738731384,
"learning_rate": 1.4133354467742422e-06,
"loss": 0.1094,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09843027591705322,
"step": 11910
},
{
"epoch": 6.247379454926625,
"grad_norm": 0.7687358260154724,
"learning_rate": 1.4037006684575393e-06,
"loss": 0.1007,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09907668828964233,
"step": 11915
},
{
"epoch": 6.25,
"grad_norm": 0.7559594511985779,
"learning_rate": 1.3940976484436264e-06,
"loss": 0.1068,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10573053359985352,
"step": 11920
},
{
"epoch": 6.252620545073375,
"grad_norm": 0.6567190289497375,
"learning_rate": 1.3845264031323025e-06,
"loss": 0.132,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1519894301891327,
"step": 11925
},
{
"epoch": 6.25524109014675,
"grad_norm": 0.6811748743057251,
"learning_rate": 1.3749869488691037e-06,
"loss": 0.1237,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13222289085388184,
"step": 11930
},
{
"epoch": 6.2578616352201255,
"grad_norm": 0.7532901763916016,
"learning_rate": 1.3654793019452761e-06,
"loss": 0.0992,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09068848937749863,
"step": 11935
},
{
"epoch": 6.260482180293501,
"grad_norm": 0.7820981740951538,
"learning_rate": 1.3560034785977515e-06,
"loss": 0.1049,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09503139555454254,
"step": 11940
},
{
"epoch": 6.263102725366876,
"grad_norm": 0.7701623439788818,
"learning_rate": 1.346559495009101e-06,
"loss": 0.0999,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11630750447511673,
"step": 11945
},
{
"epoch": 6.265723270440252,
"grad_norm": 0.7650212049484253,
"learning_rate": 1.3371473673075298e-06,
"loss": 0.1123,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09112050384283066,
"step": 11950
},
{
"epoch": 6.268343815513627,
"grad_norm": 0.7648396492004395,
"learning_rate": 1.32776711156684e-06,
"loss": 0.1104,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11140650510787964,
"step": 11955
},
{
"epoch": 6.270964360587002,
"grad_norm": 0.5840007066726685,
"learning_rate": 1.3184187438063956e-06,
"loss": 0.1036,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.06048639118671417,
"step": 11960
},
{
"epoch": 6.273584905660377,
"grad_norm": 0.9658522605895996,
"learning_rate": 1.3091022799911168e-06,
"loss": 0.1148,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.102294921875,
"step": 11965
},
{
"epoch": 6.276205450733753,
"grad_norm": 0.7145773768424988,
"learning_rate": 1.2998177360314279e-06,
"loss": 0.1029,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10112839937210083,
"step": 11970
},
{
"epoch": 6.278825995807128,
"grad_norm": 0.6984833478927612,
"learning_rate": 1.2905651277832454e-06,
"loss": 0.1059,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10083213448524475,
"step": 11975
},
{
"epoch": 6.281446540880503,
"grad_norm": 0.7433158159255981,
"learning_rate": 1.281344471047945e-06,
"loss": 0.1192,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13840986788272858,
"step": 11980
},
{
"epoch": 6.284067085953878,
"grad_norm": 0.6915749907493591,
"learning_rate": 1.2721557815723373e-06,
"loss": 0.1155,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11917416751384735,
"step": 11985
},
{
"epoch": 6.286687631027253,
"grad_norm": 0.8083169460296631,
"learning_rate": 1.2629990750486431e-06,
"loss": 0.0931,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0660400390625,
"step": 11990
},
{
"epoch": 6.289308176100629,
"grad_norm": 0.6940762400627136,
"learning_rate": 1.2538743671144605e-06,
"loss": 0.1013,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09669125825166702,
"step": 11995
},
{
"epoch": 6.2919287211740045,
"grad_norm": 0.8445092439651489,
"learning_rate": 1.2447816733527374e-06,
"loss": 0.1009,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09276933968067169,
"step": 12000
},
{
"epoch": 6.29454926624738,
"grad_norm": 0.7265570163726807,
"learning_rate": 1.235721009291757e-06,
"loss": 0.1011,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11874869465827942,
"step": 12005
},
{
"epoch": 6.297169811320755,
"grad_norm": 0.7973197102546692,
"learning_rate": 1.2266923904050954e-06,
"loss": 0.1101,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12004204094409943,
"step": 12010
},
{
"epoch": 6.29979035639413,
"grad_norm": 0.798222005367279,
"learning_rate": 1.2176958321116073e-06,
"loss": 0.1287,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1184849962592125,
"step": 12015
},
{
"epoch": 6.302410901467505,
"grad_norm": 0.736455500125885,
"learning_rate": 1.208731349775394e-06,
"loss": 0.1181,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12855270504951477,
"step": 12020
},
{
"epoch": 6.30503144654088,
"grad_norm": 0.7713168859481812,
"learning_rate": 1.1997989587057779e-06,
"loss": 0.1143,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09117522835731506,
"step": 12025
},
{
"epoch": 6.3076519916142555,
"grad_norm": 0.7758849263191223,
"learning_rate": 1.190898674157277e-06,
"loss": 0.1067,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08564572036266327,
"step": 12030
},
{
"epoch": 6.310272536687631,
"grad_norm": 0.802696704864502,
"learning_rate": 1.1820305113295794e-06,
"loss": 0.1209,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14391744136810303,
"step": 12035
},
{
"epoch": 6.312893081761006,
"grad_norm": 0.8025701642036438,
"learning_rate": 1.1731944853675103e-06,
"loss": 0.0951,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09395985305309296,
"step": 12040
},
{
"epoch": 6.315513626834382,
"grad_norm": 0.7337117791175842,
"learning_rate": 1.164390611361026e-06,
"loss": 0.108,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.111670583486557,
"step": 12045
},
{
"epoch": 6.318134171907757,
"grad_norm": 0.8311132192611694,
"learning_rate": 1.1556189043451593e-06,
"loss": 0.1209,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12455315887928009,
"step": 12050
},
{
"epoch": 6.320754716981132,
"grad_norm": 0.7437119483947754,
"learning_rate": 1.1468793793000189e-06,
"loss": 0.1154,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11051569133996964,
"step": 12055
},
{
"epoch": 6.323375262054507,
"grad_norm": 0.7538794875144958,
"learning_rate": 1.138172051150752e-06,
"loss": 0.0883,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08596548438072205,
"step": 12060
},
{
"epoch": 6.325995807127883,
"grad_norm": 0.7586971521377563,
"learning_rate": 1.1294969347675133e-06,
"loss": 0.1255,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10219697654247284,
"step": 12065
},
{
"epoch": 6.328616352201258,
"grad_norm": 0.8248292207717896,
"learning_rate": 1.1208540449654603e-06,
"loss": 0.1081,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10271692276000977,
"step": 12070
},
{
"epoch": 6.331236897274633,
"grad_norm": 0.8947778940200806,
"learning_rate": 1.1122433965047063e-06,
"loss": 0.1082,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.089599609375,
"step": 12075
},
{
"epoch": 6.333857442348008,
"grad_norm": 0.8684989213943481,
"learning_rate": 1.1036650040903018e-06,
"loss": 0.1089,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10005056113004684,
"step": 12080
},
{
"epoch": 6.336477987421383,
"grad_norm": 0.7829846143722534,
"learning_rate": 1.095118882372217e-06,
"loss": 0.1073,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.089599609375,
"step": 12085
},
{
"epoch": 6.339098532494759,
"grad_norm": 0.6981558799743652,
"learning_rate": 1.086605045945306e-06,
"loss": 0.1166,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1020331084728241,
"step": 12090
},
{
"epoch": 6.3417190775681345,
"grad_norm": 0.6989213824272156,
"learning_rate": 1.0781235093492937e-06,
"loss": 0.1106,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0916748046875,
"step": 12095
},
{
"epoch": 6.34433962264151,
"grad_norm": 0.7455583810806274,
"learning_rate": 1.069674287068736e-06,
"loss": 0.1024,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0858154296875,
"step": 12100
},
{
"epoch": 6.346960167714885,
"grad_norm": 0.6839850544929504,
"learning_rate": 1.0612573935330084e-06,
"loss": 0.1126,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10747925192117691,
"step": 12105
},
{
"epoch": 6.34958071278826,
"grad_norm": 0.9391524195671082,
"learning_rate": 1.052872843116277e-06,
"loss": 0.1024,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09704363346099854,
"step": 12110
},
{
"epoch": 6.352201257861635,
"grad_norm": 0.7337355613708496,
"learning_rate": 1.0445206501374638e-06,
"loss": 0.0974,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09076334536075592,
"step": 12115
},
{
"epoch": 6.35482180293501,
"grad_norm": 0.7760245203971863,
"learning_rate": 1.0362008288602454e-06,
"loss": 0.1143,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09361493587493896,
"step": 12120
},
{
"epoch": 6.3574423480083855,
"grad_norm": 0.8386433124542236,
"learning_rate": 1.0279133934930074e-06,
"loss": 0.1114,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13330651819705963,
"step": 12125
},
{
"epoch": 6.360062893081761,
"grad_norm": 0.7231929302215576,
"learning_rate": 1.0196583581888264e-06,
"loss": 0.1103,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10863436013460159,
"step": 12130
},
{
"epoch": 6.362683438155136,
"grad_norm": 0.8203999996185303,
"learning_rate": 1.0114357370454475e-06,
"loss": 0.1133,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09395897388458252,
"step": 12135
},
{
"epoch": 6.365303983228512,
"grad_norm": 0.6534098982810974,
"learning_rate": 1.0032455441052602e-06,
"loss": 0.1131,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08841674029827118,
"step": 12140
},
{
"epoch": 6.367924528301887,
"grad_norm": 0.7433450222015381,
"learning_rate": 9.950877933552804e-07,
"loss": 0.0976,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09493722021579742,
"step": 12145
},
{
"epoch": 6.370545073375262,
"grad_norm": 0.6835120916366577,
"learning_rate": 9.869624987271108e-07,
"loss": 0.0972,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07848665118217468,
"step": 12150
},
{
"epoch": 6.373165618448637,
"grad_norm": 0.7295222878456116,
"learning_rate": 9.788696740969295e-07,
"loss": 0.1217,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15375928580760956,
"step": 12155
},
{
"epoch": 6.3757861635220126,
"grad_norm": 0.6435827016830444,
"learning_rate": 9.70809333285463e-07,
"loss": 0.105,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11237955093383789,
"step": 12160
},
{
"epoch": 6.378406708595388,
"grad_norm": 0.6830844879150391,
"learning_rate": 9.627814900579624e-07,
"loss": 0.1062,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12887975573539734,
"step": 12165
},
{
"epoch": 6.381027253668763,
"grad_norm": 0.7148608565330505,
"learning_rate": 9.547861581241834e-07,
"loss": 0.1243,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09197840094566345,
"step": 12170
},
{
"epoch": 6.383647798742138,
"grad_norm": 0.7562742233276367,
"learning_rate": 9.468233511383573e-07,
"loss": 0.0994,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0972900390625,
"step": 12175
},
{
"epoch": 6.386268343815513,
"grad_norm": 0.8822031617164612,
"learning_rate": 9.388930826991682e-07,
"loss": 0.0895,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0692138671875,
"step": 12180
},
{
"epoch": 6.388888888888889,
"grad_norm": 0.671484649181366,
"learning_rate": 9.309953663497362e-07,
"loss": 0.1081,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10735087841749191,
"step": 12185
},
{
"epoch": 6.3915094339622645,
"grad_norm": 0.6452421545982361,
"learning_rate": 9.231302155775812e-07,
"loss": 0.123,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1407989263534546,
"step": 12190
},
{
"epoch": 6.39412997903564,
"grad_norm": 0.8478002548217773,
"learning_rate": 9.152976438146211e-07,
"loss": 0.0896,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11116098612546921,
"step": 12195
},
{
"epoch": 6.396750524109015,
"grad_norm": 0.8632563352584839,
"learning_rate": 9.074976644371269e-07,
"loss": 0.097,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10032831877470016,
"step": 12200
},
{
"epoch": 6.39937106918239,
"grad_norm": 0.7530129551887512,
"learning_rate": 8.997302907657124e-07,
"loss": 0.1005,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11775366961956024,
"step": 12205
},
{
"epoch": 6.401991614255765,
"grad_norm": 0.6896882057189941,
"learning_rate": 8.919955360653066e-07,
"loss": 0.0986,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08039015531539917,
"step": 12210
},
{
"epoch": 6.40461215932914,
"grad_norm": 0.7121017575263977,
"learning_rate": 8.842934135451297e-07,
"loss": 0.1064,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10184863209724426,
"step": 12215
},
{
"epoch": 6.4072327044025155,
"grad_norm": 0.7153964638710022,
"learning_rate": 8.766239363586826e-07,
"loss": 0.1221,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10507543385028839,
"step": 12220
},
{
"epoch": 6.409853249475891,
"grad_norm": 0.6596793532371521,
"learning_rate": 8.689871176037102e-07,
"loss": 0.1182,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0855712890625,
"step": 12225
},
{
"epoch": 6.412473794549266,
"grad_norm": 0.7590340375900269,
"learning_rate": 8.613829703221799e-07,
"loss": 0.1135,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14787140488624573,
"step": 12230
},
{
"epoch": 6.415094339622642,
"grad_norm": 0.6925032138824463,
"learning_rate": 8.538115075002707e-07,
"loss": 0.0817,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07074858248233795,
"step": 12235
},
{
"epoch": 6.417714884696017,
"grad_norm": 0.7162047624588013,
"learning_rate": 8.46272742068337e-07,
"loss": 0.1257,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11975658684968948,
"step": 12240
},
{
"epoch": 6.420335429769392,
"grad_norm": 0.857296884059906,
"learning_rate": 8.387666869008981e-07,
"loss": 0.1075,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.06390380859375,
"step": 12245
},
{
"epoch": 6.422955974842767,
"grad_norm": 0.6864899396896362,
"learning_rate": 8.312933548166136e-07,
"loss": 0.1032,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.089599609375,
"step": 12250
},
{
"epoch": 6.4255765199161425,
"grad_norm": 0.6637594699859619,
"learning_rate": 8.238527585782563e-07,
"loss": 0.1222,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12327419221401215,
"step": 12255
},
{
"epoch": 6.428197064989518,
"grad_norm": 0.7350227236747742,
"learning_rate": 8.164449108926887e-07,
"loss": 0.1149,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13680747151374817,
"step": 12260
},
{
"epoch": 6.430817610062893,
"grad_norm": 0.6388922929763794,
"learning_rate": 8.090698244108553e-07,
"loss": 0.1061,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13326486945152283,
"step": 12265
},
{
"epoch": 6.433438155136268,
"grad_norm": 0.7068225741386414,
"learning_rate": 8.017275117277434e-07,
"loss": 0.1112,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14121992886066437,
"step": 12270
},
{
"epoch": 6.436058700209643,
"grad_norm": 0.7588358521461487,
"learning_rate": 7.944179853823786e-07,
"loss": 0.122,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09675169736146927,
"step": 12275
},
{
"epoch": 6.438679245283019,
"grad_norm": 0.793441653251648,
"learning_rate": 7.871412578577886e-07,
"loss": 0.1011,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13168825209140778,
"step": 12280
},
{
"epoch": 6.441299790356394,
"grad_norm": 0.7840567231178284,
"learning_rate": 7.798973415809885e-07,
"loss": 0.1221,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12841647863388062,
"step": 12285
},
{
"epoch": 6.44392033542977,
"grad_norm": 0.6803513765335083,
"learning_rate": 7.726862489229625e-07,
"loss": 0.107,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11007022857666016,
"step": 12290
},
{
"epoch": 6.446540880503145,
"grad_norm": 0.7925393581390381,
"learning_rate": 7.65507992198633e-07,
"loss": 0.1033,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10711553692817688,
"step": 12295
},
{
"epoch": 6.44916142557652,
"grad_norm": 0.7759236693382263,
"learning_rate": 7.583625836668562e-07,
"loss": 0.0751,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.05218505859375,
"step": 12300
},
{
"epoch": 6.451781970649895,
"grad_norm": 0.7366616129875183,
"learning_rate": 7.51250035530382e-07,
"loss": 0.1242,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10968296229839325,
"step": 12305
},
{
"epoch": 6.45440251572327,
"grad_norm": 0.7767581939697266,
"learning_rate": 7.441703599358474e-07,
"loss": 0.1171,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11889313906431198,
"step": 12310
},
{
"epoch": 6.4570230607966455,
"grad_norm": 0.7596882581710815,
"learning_rate": 7.371235689737455e-07,
"loss": 0.1193,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11869388073682785,
"step": 12315
},
{
"epoch": 6.459643605870021,
"grad_norm": 0.8227205872535706,
"learning_rate": 7.301096746784098e-07,
"loss": 0.1134,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10239917039871216,
"step": 12320
},
{
"epoch": 6.462264150943396,
"grad_norm": 0.7390924096107483,
"learning_rate": 7.231286890280053e-07,
"loss": 0.1133,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13894927501678467,
"step": 12325
},
{
"epoch": 6.464884696016772,
"grad_norm": 0.7605389356613159,
"learning_rate": 7.161806239444824e-07,
"loss": 0.1186,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1234489157795906,
"step": 12330
},
{
"epoch": 6.467505241090147,
"grad_norm": 0.7993542551994324,
"learning_rate": 7.092654912935759e-07,
"loss": 0.1117,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07925504446029663,
"step": 12335
},
{
"epoch": 6.470125786163522,
"grad_norm": 0.7740378379821777,
"learning_rate": 7.023833028847793e-07,
"loss": 0.1023,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12133189290761948,
"step": 12340
},
{
"epoch": 6.472746331236897,
"grad_norm": 0.7178128957748413,
"learning_rate": 6.955340704713243e-07,
"loss": 0.0887,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07630637288093567,
"step": 12345
},
{
"epoch": 6.4753668763102725,
"grad_norm": 0.7819163799285889,
"learning_rate": 6.887178057501632e-07,
"loss": 0.1234,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1381218135356903,
"step": 12350
},
{
"epoch": 6.477987421383648,
"grad_norm": 0.7058833241462708,
"learning_rate": 6.819345203619443e-07,
"loss": 0.1192,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13427695631980896,
"step": 12355
},
{
"epoch": 6.480607966457023,
"grad_norm": 0.7779912352561951,
"learning_rate": 6.751842258909969e-07,
"loss": 0.12,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.103742316365242,
"step": 12360
},
{
"epoch": 6.483228511530398,
"grad_norm": 0.7333693504333496,
"learning_rate": 6.684669338653083e-07,
"loss": 0.1047,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08863117545843124,
"step": 12365
},
{
"epoch": 6.485849056603773,
"grad_norm": 0.7994443774223328,
"learning_rate": 6.617826557564977e-07,
"loss": 0.1077,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12241359055042267,
"step": 12370
},
{
"epoch": 6.488469601677149,
"grad_norm": 0.7210835218429565,
"learning_rate": 6.551314029798206e-07,
"loss": 0.0933,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07509548217058182,
"step": 12375
},
{
"epoch": 6.491090146750524,
"grad_norm": 0.7040566802024841,
"learning_rate": 6.485131868941197e-07,
"loss": 0.098,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07635965943336487,
"step": 12380
},
{
"epoch": 6.4937106918239,
"grad_norm": 0.742750346660614,
"learning_rate": 6.419280188018207e-07,
"loss": 0.099,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0703125,
"step": 12385
},
{
"epoch": 6.496331236897275,
"grad_norm": 0.7622641324996948,
"learning_rate": 6.353759099489121e-07,
"loss": 0.115,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11391450464725494,
"step": 12390
},
{
"epoch": 6.49895178197065,
"grad_norm": 0.7823505997657776,
"learning_rate": 6.28856871524921e-07,
"loss": 0.1001,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08123779296875,
"step": 12395
},
{
"epoch": 6.501572327044025,
"grad_norm": 0.7582354545593262,
"learning_rate": 6.223709146629064e-07,
"loss": 0.1226,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1327885389328003,
"step": 12400
},
{
"epoch": 6.5041928721174,
"grad_norm": 0.756651759147644,
"learning_rate": 6.159180504394236e-07,
"loss": 0.1,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13122040033340454,
"step": 12405
},
{
"epoch": 6.506813417190775,
"grad_norm": 0.7833355069160461,
"learning_rate": 6.09498289874515e-07,
"loss": 0.1101,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1171574741601944,
"step": 12410
},
{
"epoch": 6.509433962264151,
"grad_norm": 0.699315071105957,
"learning_rate": 6.031116439316931e-07,
"loss": 0.12,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12373475730419159,
"step": 12415
},
{
"epoch": 6.512054507337526,
"grad_norm": 0.7405616044998169,
"learning_rate": 5.967581235179065e-07,
"loss": 0.1108,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08771055191755295,
"step": 12420
},
{
"epoch": 6.514675052410902,
"grad_norm": 0.6853896379470825,
"learning_rate": 5.904377394835514e-07,
"loss": 0.1082,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12733067572116852,
"step": 12425
},
{
"epoch": 6.517295597484277,
"grad_norm": 0.5839729905128479,
"learning_rate": 5.841505026224181e-07,
"loss": 0.1167,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14495626091957092,
"step": 12430
},
{
"epoch": 6.519916142557652,
"grad_norm": 0.7660456299781799,
"learning_rate": 5.778964236716977e-07,
"loss": 0.0963,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11170623451471329,
"step": 12435
},
{
"epoch": 6.522536687631027,
"grad_norm": 0.7043365836143494,
"learning_rate": 5.716755133119512e-07,
"loss": 0.1082,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10523848235607147,
"step": 12440
},
{
"epoch": 6.5251572327044025,
"grad_norm": 0.7790372967720032,
"learning_rate": 5.654877821670979e-07,
"loss": 0.107,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10113519430160522,
"step": 12445
},
{
"epoch": 6.527777777777778,
"grad_norm": 0.7878185510635376,
"learning_rate": 5.593332408043872e-07,
"loss": 0.1115,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10966823250055313,
"step": 12450
},
{
"epoch": 6.530398322851153,
"grad_norm": 0.6953631043434143,
"learning_rate": 5.532118997344027e-07,
"loss": 0.1036,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09158299118280411,
"step": 12455
},
{
"epoch": 6.533018867924528,
"grad_norm": 0.7004905939102173,
"learning_rate": 5.471237694110132e-07,
"loss": 0.095,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08766491711139679,
"step": 12460
},
{
"epoch": 6.535639412997903,
"grad_norm": 0.7077845931053162,
"learning_rate": 5.410688602313797e-07,
"loss": 0.1069,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1242421418428421,
"step": 12465
},
{
"epoch": 6.538259958071279,
"grad_norm": 0.8396255373954773,
"learning_rate": 5.350471825359305e-07,
"loss": 0.1207,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15242958068847656,
"step": 12470
},
{
"epoch": 6.540880503144654,
"grad_norm": 0.6612614989280701,
"learning_rate": 5.290587466083308e-07,
"loss": 0.114,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12696123123168945,
"step": 12475
},
{
"epoch": 6.54350104821803,
"grad_norm": 0.7015223503112793,
"learning_rate": 5.231035626754932e-07,
"loss": 0.1,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14191541075706482,
"step": 12480
},
{
"epoch": 6.546121593291405,
"grad_norm": 0.8182874917984009,
"learning_rate": 5.171816409075314e-07,
"loss": 0.1002,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09325539320707321,
"step": 12485
},
{
"epoch": 6.54874213836478,
"grad_norm": 0.8494343161582947,
"learning_rate": 5.112929914177556e-07,
"loss": 0.1066,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.075439453125,
"step": 12490
},
{
"epoch": 6.551362683438155,
"grad_norm": 0.717740535736084,
"learning_rate": 5.054376242626591e-07,
"loss": 0.0841,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.06298828125,
"step": 12495
},
{
"epoch": 6.55398322851153,
"grad_norm": 0.8419182896614075,
"learning_rate": 4.996155494418897e-07,
"loss": 0.102,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10433823615312576,
"step": 12500
},
{
"epoch": 6.556603773584905,
"grad_norm": 0.7867080569267273,
"learning_rate": 4.938267768982496e-07,
"loss": 0.1087,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11582699418067932,
"step": 12505
},
{
"epoch": 6.559224318658281,
"grad_norm": 0.6588146090507507,
"learning_rate": 4.880713165176598e-07,
"loss": 0.1068,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10758035629987717,
"step": 12510
},
{
"epoch": 6.561844863731656,
"grad_norm": 0.7257134914398193,
"learning_rate": 4.823491781291534e-07,
"loss": 0.1042,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10703811049461365,
"step": 12515
},
{
"epoch": 6.564465408805032,
"grad_norm": 0.7567834258079529,
"learning_rate": 4.766603715048557e-07,
"loss": 0.1144,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10694126784801483,
"step": 12520
},
{
"epoch": 6.567085953878407,
"grad_norm": 0.7547696828842163,
"learning_rate": 4.710049063599753e-07,
"loss": 0.1093,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08135071396827698,
"step": 12525
},
{
"epoch": 6.569706498951782,
"grad_norm": 0.7312501072883606,
"learning_rate": 4.6538279235277315e-07,
"loss": 0.1159,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10318690538406372,
"step": 12530
},
{
"epoch": 6.572327044025157,
"grad_norm": 0.7074016332626343,
"learning_rate": 4.597940390845601e-07,
"loss": 0.086,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10734502971172333,
"step": 12535
},
{
"epoch": 6.5749475890985325,
"grad_norm": 0.7671141624450684,
"learning_rate": 4.542386560996681e-07,
"loss": 0.1028,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1200064942240715,
"step": 12540
},
{
"epoch": 6.577568134171908,
"grad_norm": 0.7219176888465881,
"learning_rate": 4.487166528854459e-07,
"loss": 0.1005,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08383356034755707,
"step": 12545
},
{
"epoch": 6.580188679245283,
"grad_norm": 0.878116250038147,
"learning_rate": 4.432280388722343e-07,
"loss": 0.1106,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0889734998345375,
"step": 12550
},
{
"epoch": 6.582809224318658,
"grad_norm": 0.6791242957115173,
"learning_rate": 4.377728234333534e-07,
"loss": 0.1212,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13134799897670746,
"step": 12555
},
{
"epoch": 6.585429769392033,
"grad_norm": 0.6606261730194092,
"learning_rate": 4.3235101588508633e-07,
"loss": 0.1135,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10282780230045319,
"step": 12560
},
{
"epoch": 6.588050314465409,
"grad_norm": 0.9038112163543701,
"learning_rate": 4.269626254866643e-07,
"loss": 0.1017,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09375,
"step": 12565
},
{
"epoch": 6.590670859538784,
"grad_norm": 0.7051216959953308,
"learning_rate": 4.216076614402442e-07,
"loss": 0.121,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0945984348654747,
"step": 12570
},
{
"epoch": 6.59329140461216,
"grad_norm": 0.7136359214782715,
"learning_rate": 4.162861328909018e-07,
"loss": 0.1033,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07963068783283234,
"step": 12575
},
{
"epoch": 6.595911949685535,
"grad_norm": 0.683529794216156,
"learning_rate": 4.1099804892661855e-07,
"loss": 0.1049,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1307920664548874,
"step": 12580
},
{
"epoch": 6.59853249475891,
"grad_norm": 0.7927236557006836,
"learning_rate": 4.0574341857824826e-07,
"loss": 0.1067,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12521082162857056,
"step": 12585
},
{
"epoch": 6.601153039832285,
"grad_norm": 0.7777563333511353,
"learning_rate": 4.005222508195217e-07,
"loss": 0.0964,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.085205078125,
"step": 12590
},
{
"epoch": 6.60377358490566,
"grad_norm": 0.8762742280960083,
"learning_rate": 3.9533455456702173e-07,
"loss": 0.1133,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08815234154462814,
"step": 12595
},
{
"epoch": 6.606394129979035,
"grad_norm": 0.7172031998634338,
"learning_rate": 3.9018033868016616e-07,
"loss": 0.0846,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0655192881822586,
"step": 12600
},
{
"epoch": 6.609014675052411,
"grad_norm": 0.6910161972045898,
"learning_rate": 3.8505961196120044e-07,
"loss": 0.1,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12381216883659363,
"step": 12605
},
{
"epoch": 6.611635220125786,
"grad_norm": 0.7001184821128845,
"learning_rate": 3.7997238315517606e-07,
"loss": 0.1314,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1847023069858551,
"step": 12610
},
{
"epoch": 6.614255765199162,
"grad_norm": 0.7264262437820435,
"learning_rate": 3.7491866094993446e-07,
"loss": 0.1191,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10422031581401825,
"step": 12615
},
{
"epoch": 6.616876310272537,
"grad_norm": 0.7688695192337036,
"learning_rate": 3.698984539761008e-07,
"loss": 0.1028,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07764077186584473,
"step": 12620
},
{
"epoch": 6.619496855345912,
"grad_norm": 0.7013809084892273,
"learning_rate": 3.649117708070571e-07,
"loss": 0.1065,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11546897888183594,
"step": 12625
},
{
"epoch": 6.622117400419287,
"grad_norm": 0.7238118052482605,
"learning_rate": 3.5995861995894444e-07,
"loss": 0.1211,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13196802139282227,
"step": 12630
},
{
"epoch": 6.6247379454926625,
"grad_norm": 0.7912337183952332,
"learning_rate": 3.5503900989062755e-07,
"loss": 0.1153,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12135768681764603,
"step": 12635
},
{
"epoch": 6.627358490566038,
"grad_norm": 0.7144173979759216,
"learning_rate": 3.5015294900369703e-07,
"loss": 0.1241,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11887726187705994,
"step": 12640
},
{
"epoch": 6.629979035639413,
"grad_norm": 0.6379136443138123,
"learning_rate": 3.453004456424491e-07,
"loss": 0.1067,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12516003847122192,
"step": 12645
},
{
"epoch": 6.632599580712788,
"grad_norm": 0.7322754859924316,
"learning_rate": 3.404815080938639e-07,
"loss": 0.1227,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12225642800331116,
"step": 12650
},
{
"epoch": 6.635220125786163,
"grad_norm": 0.7234863638877869,
"learning_rate": 3.356961445876117e-07,
"loss": 0.1203,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11731066554784775,
"step": 12655
},
{
"epoch": 6.637840670859539,
"grad_norm": 0.7147253155708313,
"learning_rate": 3.309443632960152e-07,
"loss": 0.1064,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14796984195709229,
"step": 12660
},
{
"epoch": 6.640461215932914,
"grad_norm": 0.7932282090187073,
"learning_rate": 3.2622617233404985e-07,
"loss": 0.1041,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09972994774580002,
"step": 12665
},
{
"epoch": 6.6430817610062896,
"grad_norm": 0.828298032283783,
"learning_rate": 3.21541579759328e-07,
"loss": 0.1015,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10119494795799255,
"step": 12670
},
{
"epoch": 6.645702306079665,
"grad_norm": 0.6335622668266296,
"learning_rate": 3.1689059357207674e-07,
"loss": 0.1104,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09908124059438705,
"step": 12675
},
{
"epoch": 6.64832285115304,
"grad_norm": 0.7649173736572266,
"learning_rate": 3.122732217151403e-07,
"loss": 0.1135,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10169924795627594,
"step": 12680
},
{
"epoch": 6.650943396226415,
"grad_norm": 0.7193025350570679,
"learning_rate": 3.076894720739532e-07,
"loss": 0.0907,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11542670428752899,
"step": 12685
},
{
"epoch": 6.65356394129979,
"grad_norm": 0.7355470657348633,
"learning_rate": 3.0313935247652695e-07,
"loss": 0.1199,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11160104721784592,
"step": 12690
},
{
"epoch": 6.656184486373165,
"grad_norm": 0.6548497080802917,
"learning_rate": 2.9862287069344575e-07,
"loss": 0.0909,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13360291719436646,
"step": 12695
},
{
"epoch": 6.658805031446541,
"grad_norm": 0.859886646270752,
"learning_rate": 2.9414003443784867e-07,
"loss": 0.0886,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.059539794921875,
"step": 12700
},
{
"epoch": 6.661425576519916,
"grad_norm": 0.8096195459365845,
"learning_rate": 2.896908513654073e-07,
"loss": 0.1051,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12544667720794678,
"step": 12705
},
{
"epoch": 6.664046121593291,
"grad_norm": 0.695798933506012,
"learning_rate": 2.852753290743326e-07,
"loss": 0.1213,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1460527777671814,
"step": 12710
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.7370837330818176,
"learning_rate": 2.808934751053438e-07,
"loss": 0.0949,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0875244140625,
"step": 12715
},
{
"epoch": 6.669287211740042,
"grad_norm": 0.7537276148796082,
"learning_rate": 2.7654529694166157e-07,
"loss": 0.1094,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1236792579293251,
"step": 12720
},
{
"epoch": 6.671907756813417,
"grad_norm": 0.7928598523139954,
"learning_rate": 2.722308020089992e-07,
"loss": 0.104,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0823974609375,
"step": 12725
},
{
"epoch": 6.6745283018867925,
"grad_norm": 0.8015418648719788,
"learning_rate": 2.6794999767554287e-07,
"loss": 0.1022,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.059906005859375,
"step": 12730
},
{
"epoch": 6.677148846960168,
"grad_norm": 0.7586283683776855,
"learning_rate": 2.637028912519468e-07,
"loss": 0.1218,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10995981842279434,
"step": 12735
},
{
"epoch": 6.679769392033543,
"grad_norm": 0.7164087295532227,
"learning_rate": 2.5948948999131585e-07,
"loss": 0.1094,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10501286387443542,
"step": 12740
},
{
"epoch": 6.682389937106918,
"grad_norm": 1.1453492641448975,
"learning_rate": 2.553098010891919e-07,
"loss": 0.1128,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17530810832977295,
"step": 12745
},
{
"epoch": 6.685010482180293,
"grad_norm": 0.7735922932624817,
"learning_rate": 2.511638316835474e-07,
"loss": 0.1226,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10268770903348923,
"step": 12750
},
{
"epoch": 6.687631027253669,
"grad_norm": 0.8022793531417847,
"learning_rate": 2.470515888547609e-07,
"loss": 0.1135,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11690790951251984,
"step": 12755
},
{
"epoch": 6.690251572327044,
"grad_norm": 0.672698974609375,
"learning_rate": 2.429730796256236e-07,
"loss": 0.1013,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08837890625,
"step": 12760
},
{
"epoch": 6.6928721174004195,
"grad_norm": 0.6244108080863953,
"learning_rate": 2.3892831096131494e-07,
"loss": 0.118,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1031256839632988,
"step": 12765
},
{
"epoch": 6.695492662473795,
"grad_norm": 0.7368753552436829,
"learning_rate": 2.3491728976938742e-07,
"loss": 0.111,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12542890012264252,
"step": 12770
},
{
"epoch": 6.69811320754717,
"grad_norm": 0.760115921497345,
"learning_rate": 2.3094002289976824e-07,
"loss": 0.1168,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11905722320079803,
"step": 12775
},
{
"epoch": 6.700733752620545,
"grad_norm": 0.6448925137519836,
"learning_rate": 2.2699651714473302e-07,
"loss": 0.1176,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11876273900270462,
"step": 12780
},
{
"epoch": 6.70335429769392,
"grad_norm": 0.7480875849723816,
"learning_rate": 2.2308677923890576e-07,
"loss": 0.0937,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.080322265625,
"step": 12785
},
{
"epoch": 6.705974842767295,
"grad_norm": 0.7540961503982544,
"learning_rate": 2.1921081585923875e-07,
"loss": 0.1167,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15754400193691254,
"step": 12790
},
{
"epoch": 6.7085953878406706,
"grad_norm": 0.7379507422447205,
"learning_rate": 2.153686336250105e-07,
"loss": 0.1101,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07952894270420074,
"step": 12795
},
{
"epoch": 6.711215932914046,
"grad_norm": 0.7464003562927246,
"learning_rate": 2.1156023909780111e-07,
"loss": 0.1047,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12541179358959198,
"step": 12800
},
{
"epoch": 6.713836477987421,
"grad_norm": 0.7177678346633911,
"learning_rate": 2.0778563878149471e-07,
"loss": 0.1075,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12439662963151932,
"step": 12805
},
{
"epoch": 6.716457023060797,
"grad_norm": 0.7599697113037109,
"learning_rate": 2.0404483912226158e-07,
"loss": 0.1051,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10825493931770325,
"step": 12810
},
{
"epoch": 6.719077568134172,
"grad_norm": 1.1192086935043335,
"learning_rate": 2.0033784650854927e-07,
"loss": 0.117,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17269468307495117,
"step": 12815
},
{
"epoch": 6.721698113207547,
"grad_norm": 0.719208300113678,
"learning_rate": 1.9666466727106481e-07,
"loss": 0.1048,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0932646170258522,
"step": 12820
},
{
"epoch": 6.7243186582809225,
"grad_norm": 0.7651455402374268,
"learning_rate": 1.9302530768277706e-07,
"loss": 0.1014,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12158767879009247,
"step": 12825
},
{
"epoch": 6.726939203354298,
"grad_norm": 0.7352505326271057,
"learning_rate": 1.8941977395888988e-07,
"loss": 0.1102,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11577711254358292,
"step": 12830
},
{
"epoch": 6.729559748427673,
"grad_norm": 0.7424346804618835,
"learning_rate": 1.8584807225684898e-07,
"loss": 0.0986,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08648681640625,
"step": 12835
},
{
"epoch": 6.732180293501048,
"grad_norm": 0.7273895144462585,
"learning_rate": 1.8231020867631955e-07,
"loss": 0.1103,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15292765200138092,
"step": 12840
},
{
"epoch": 6.734800838574423,
"grad_norm": 0.6363976001739502,
"learning_rate": 1.7880618925917526e-07,
"loss": 0.1156,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1068999320268631,
"step": 12845
},
{
"epoch": 6.737421383647799,
"grad_norm": 0.6364636421203613,
"learning_rate": 1.753360199894938e-07,
"loss": 0.111,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14879164099693298,
"step": 12850
},
{
"epoch": 6.740041928721174,
"grad_norm": 0.6741135120391846,
"learning_rate": 1.7189970679354794e-07,
"loss": 0.1166,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13488692045211792,
"step": 12855
},
{
"epoch": 6.7426624737945495,
"grad_norm": 0.6478638648986816,
"learning_rate": 1.6849725553978792e-07,
"loss": 0.1091,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1508275717496872,
"step": 12860
},
{
"epoch": 6.745283018867925,
"grad_norm": 0.7270845770835876,
"learning_rate": 1.6512867203883453e-07,
"loss": 0.0985,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12802273035049438,
"step": 12865
},
{
"epoch": 6.7479035639413,
"grad_norm": 0.716550886631012,
"learning_rate": 1.6179396204347497e-07,
"loss": 0.1195,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1199185699224472,
"step": 12870
},
{
"epoch": 6.750524109014675,
"grad_norm": 0.8040631413459778,
"learning_rate": 1.5849313124864262e-07,
"loss": 0.1147,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10303539782762527,
"step": 12875
},
{
"epoch": 6.75314465408805,
"grad_norm": 0.6849897503852844,
"learning_rate": 1.55226185291415e-07,
"loss": 0.1013,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13410991430282593,
"step": 12880
},
{
"epoch": 6.755765199161425,
"grad_norm": 0.7915083169937134,
"learning_rate": 1.5199312975100243e-07,
"loss": 0.0865,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09747314453125,
"step": 12885
},
{
"epoch": 6.7583857442348005,
"grad_norm": 0.7480253577232361,
"learning_rate": 1.4879397014873954e-07,
"loss": 0.1164,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10575560480356216,
"step": 12890
},
{
"epoch": 6.761006289308176,
"grad_norm": 0.7745283246040344,
"learning_rate": 1.4562871194806926e-07,
"loss": 0.0933,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08332160860300064,
"step": 12895
},
{
"epoch": 6.763626834381551,
"grad_norm": 0.672770082950592,
"learning_rate": 1.4249736055454545e-07,
"loss": 0.1078,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1391640454530716,
"step": 12900
},
{
"epoch": 6.766247379454927,
"grad_norm": 0.83051598072052,
"learning_rate": 1.3939992131581038e-07,
"loss": 0.1181,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1187271699309349,
"step": 12905
},
{
"epoch": 6.768867924528302,
"grad_norm": 0.7149649262428284,
"learning_rate": 1.3633639952159495e-07,
"loss": 0.1183,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11559177190065384,
"step": 12910
},
{
"epoch": 6.771488469601677,
"grad_norm": 0.8395388126373291,
"learning_rate": 1.3330680040370525e-07,
"loss": 0.0976,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0789794921875,
"step": 12915
},
{
"epoch": 6.774109014675052,
"grad_norm": 0.8058507442474365,
"learning_rate": 1.303111291360204e-07,
"loss": 0.1129,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0765380859375,
"step": 12920
},
{
"epoch": 6.776729559748428,
"grad_norm": 0.7542924284934998,
"learning_rate": 1.2734939083447028e-07,
"loss": 0.0907,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12192582339048386,
"step": 12925
},
{
"epoch": 6.779350104821803,
"grad_norm": 0.6925415396690369,
"learning_rate": 1.2442159055703785e-07,
"loss": 0.0985,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11571474373340607,
"step": 12930
},
{
"epoch": 6.781970649895178,
"grad_norm": 0.8176593780517578,
"learning_rate": 1.2152773330375233e-07,
"loss": 0.1079,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.115481436252594,
"step": 12935
},
{
"epoch": 6.784591194968553,
"grad_norm": 0.7937777042388916,
"learning_rate": 1.1866782401666943e-07,
"loss": 0.0997,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11660611629486084,
"step": 12940
},
{
"epoch": 6.787211740041929,
"grad_norm": 0.7690672874450684,
"learning_rate": 1.1584186757987336e-07,
"loss": 0.1024,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10116563737392426,
"step": 12945
},
{
"epoch": 6.789832285115304,
"grad_norm": 0.6998053193092346,
"learning_rate": 1.1304986881946145e-07,
"loss": 0.1104,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15394994616508484,
"step": 12950
},
{
"epoch": 6.7924528301886795,
"grad_norm": 0.876414954662323,
"learning_rate": 1.1029183250354181e-07,
"loss": 0.1151,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09614819288253784,
"step": 12955
},
{
"epoch": 6.795073375262055,
"grad_norm": 0.7758975625038147,
"learning_rate": 1.0756776334222008e-07,
"loss": 0.1035,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07234509289264679,
"step": 12960
},
{
"epoch": 6.79769392033543,
"grad_norm": 0.7724853754043579,
"learning_rate": 1.0487766598759496e-07,
"loss": 0.1176,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12443333864212036,
"step": 12965
},
{
"epoch": 6.800314465408805,
"grad_norm": 0.7516874074935913,
"learning_rate": 1.0222154503374937e-07,
"loss": 0.1127,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12476566433906555,
"step": 12970
},
{
"epoch": 6.80293501048218,
"grad_norm": 0.6587119698524475,
"learning_rate": 9.959940501674148e-08,
"loss": 0.1145,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10120025277137756,
"step": 12975
},
{
"epoch": 6.805555555555555,
"grad_norm": 0.8887156844139099,
"learning_rate": 9.701125041459592e-08,
"loss": 0.1128,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.17159056663513184,
"step": 12980
},
{
"epoch": 6.8081761006289305,
"grad_norm": 0.7278609871864319,
"learning_rate": 9.445708564729927e-08,
"loss": 0.1181,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0933285504579544,
"step": 12985
},
{
"epoch": 6.810796645702306,
"grad_norm": 0.682710587978363,
"learning_rate": 9.193691507679126e-08,
"loss": 0.0947,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.06728173047304153,
"step": 12990
},
{
"epoch": 6.813417190775681,
"grad_norm": 0.7441592216491699,
"learning_rate": 8.945074300696022e-08,
"loss": 0.1128,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08619271218776703,
"step": 12995
},
{
"epoch": 6.816037735849057,
"grad_norm": 0.794715940952301,
"learning_rate": 8.699857368362985e-08,
"loss": 0.1019,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13376472890377045,
"step": 13000
},
{
"epoch": 6.818658280922432,
"grad_norm": 0.804305911064148,
"learning_rate": 8.45804112945503e-08,
"loss": 0.1031,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07986754924058914,
"step": 13005
},
{
"epoch": 6.821278825995807,
"grad_norm": 0.7131937742233276,
"learning_rate": 8.21962599694004e-08,
"loss": 0.1236,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12417107820510864,
"step": 13010
},
{
"epoch": 6.823899371069182,
"grad_norm": 0.7585114240646362,
"learning_rate": 7.984612377977874e-08,
"loss": 0.1005,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12065385282039642,
"step": 13015
},
{
"epoch": 6.826519916142558,
"grad_norm": 0.6928541660308838,
"learning_rate": 7.753000673919042e-08,
"loss": 0.1095,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1194111630320549,
"step": 13020
},
{
"epoch": 6.829140461215933,
"grad_norm": 0.7906724214553833,
"learning_rate": 7.524791280303812e-08,
"loss": 0.1093,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09456080198287964,
"step": 13025
},
{
"epoch": 6.831761006289308,
"grad_norm": 0.7523605227470398,
"learning_rate": 7.299984586862874e-08,
"loss": 0.1006,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09011338651180267,
"step": 13030
},
{
"epoch": 6.834381551362683,
"grad_norm": 0.7620218992233276,
"learning_rate": 7.07858097751557e-08,
"loss": 0.1162,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12259824573993683,
"step": 13035
},
{
"epoch": 6.837002096436059,
"grad_norm": 0.7315376996994019,
"learning_rate": 6.860580830369668e-08,
"loss": 0.094,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.080322265625,
"step": 13040
},
{
"epoch": 6.839622641509434,
"grad_norm": 0.7305793762207031,
"learning_rate": 6.64598451772025e-08,
"loss": 0.0954,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.06353759765625,
"step": 13045
},
{
"epoch": 6.8422431865828095,
"grad_norm": 0.798496425151825,
"learning_rate": 6.434792406049717e-08,
"loss": 0.1067,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0895010381937027,
"step": 13050
},
{
"epoch": 6.844863731656185,
"grad_norm": 2.8543035984039307,
"learning_rate": 6.227004856026897e-08,
"loss": 0.1237,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11394209414720535,
"step": 13055
},
{
"epoch": 6.84748427672956,
"grad_norm": 0.8442890048027039,
"learning_rate": 6.022622222505936e-08,
"loss": 0.0876,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0831298828125,
"step": 13060
},
{
"epoch": 6.850104821802935,
"grad_norm": 0.7585592865943909,
"learning_rate": 5.8216448545265205e-08,
"loss": 0.1171,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09647373855113983,
"step": 13065
},
{
"epoch": 6.85272536687631,
"grad_norm": 0.7444117665290833,
"learning_rate": 5.6240730953132096e-08,
"loss": 0.1108,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11333517730236053,
"step": 13070
},
{
"epoch": 6.855345911949685,
"grad_norm": 0.7710156440734863,
"learning_rate": 5.429907282273883e-08,
"loss": 0.1169,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10573013871908188,
"step": 13075
},
{
"epoch": 6.8579664570230605,
"grad_norm": 0.7597822546958923,
"learning_rate": 5.239147747000406e-08,
"loss": 0.1135,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08099365234375,
"step": 13080
},
{
"epoch": 6.860587002096436,
"grad_norm": 0.6881340146064758,
"learning_rate": 5.051794815266853e-08,
"loss": 0.0996,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10512430965900421,
"step": 13085
},
{
"epoch": 6.863207547169811,
"grad_norm": 0.8049854636192322,
"learning_rate": 4.867848807029951e-08,
"loss": 0.0969,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0690917819738388,
"step": 13090
},
{
"epoch": 6.865828092243187,
"grad_norm": 0.7548791766166687,
"learning_rate": 4.687310036428638e-08,
"loss": 0.0914,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.06378173828125,
"step": 13095
},
{
"epoch": 6.868448637316562,
"grad_norm": 0.7037566304206848,
"learning_rate": 4.510178811782284e-08,
"loss": 0.1019,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11814156919717789,
"step": 13100
},
{
"epoch": 6.871069182389937,
"grad_norm": 0.7759885787963867,
"learning_rate": 4.336455435591358e-08,
"loss": 0.0909,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10180674493312836,
"step": 13105
},
{
"epoch": 6.873689727463312,
"grad_norm": 0.730364978313446,
"learning_rate": 4.166140204536096e-08,
"loss": 0.0902,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.064483642578125,
"step": 13110
},
{
"epoch": 6.876310272536688,
"grad_norm": 0.6661535501480103,
"learning_rate": 3.999233409476943e-08,
"loss": 0.0872,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08388853818178177,
"step": 13115
},
{
"epoch": 6.878930817610063,
"grad_norm": 0.787157416343689,
"learning_rate": 3.835735335453228e-08,
"loss": 0.1164,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14255258440971375,
"step": 13120
},
{
"epoch": 6.881551362683438,
"grad_norm": 0.7189652919769287,
"learning_rate": 3.6756462616827084e-08,
"loss": 0.1029,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08161546289920807,
"step": 13125
},
{
"epoch": 6.884171907756813,
"grad_norm": 0.8704705238342285,
"learning_rate": 3.5189664615615795e-08,
"loss": 0.1195,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12327359616756439,
"step": 13130
},
{
"epoch": 6.886792452830189,
"grad_norm": 0.7432771921157837,
"learning_rate": 3.365696202664026e-08,
"loss": 0.0895,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11050796508789062,
"step": 13135
},
{
"epoch": 6.889412997903564,
"grad_norm": 0.6858586668968201,
"learning_rate": 3.215835746741114e-08,
"loss": 0.0943,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10106977075338364,
"step": 13140
},
{
"epoch": 6.8920335429769395,
"grad_norm": 0.6864309310913086,
"learning_rate": 3.069385349720788e-08,
"loss": 0.1089,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11738134920597076,
"step": 13145
},
{
"epoch": 6.894654088050315,
"grad_norm": 0.7780885696411133,
"learning_rate": 2.9263452617074306e-08,
"loss": 0.1044,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09812843799591064,
"step": 13150
},
{
"epoch": 6.89727463312369,
"grad_norm": 0.7669959664344788,
"learning_rate": 2.7867157269814147e-08,
"loss": 0.1251,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14291706681251526,
"step": 13155
},
{
"epoch": 6.899895178197065,
"grad_norm": 0.7585298418998718,
"learning_rate": 2.6504969839986627e-08,
"loss": 0.1129,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08612832427024841,
"step": 13160
},
{
"epoch": 6.90251572327044,
"grad_norm": 0.7068600654602051,
"learning_rate": 2.5176892653899777e-08,
"loss": 0.1045,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12019000947475433,
"step": 13165
},
{
"epoch": 6.905136268343815,
"grad_norm": 0.7073253393173218,
"learning_rate": 2.3882927979614888e-08,
"loss": 0.1107,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11398753523826599,
"step": 13170
},
{
"epoch": 6.9077568134171905,
"grad_norm": 0.7321997284889221,
"learning_rate": 2.2623078026930978e-08,
"loss": 0.1133,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12146317958831787,
"step": 13175
},
{
"epoch": 6.910377358490566,
"grad_norm": 0.7757676243782043,
"learning_rate": 2.139734494738699e-08,
"loss": 0.1248,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11253353208303452,
"step": 13180
},
{
"epoch": 6.912997903563941,
"grad_norm": 0.783780574798584,
"learning_rate": 2.0205730834264027e-08,
"loss": 0.103,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07848242670297623,
"step": 13185
},
{
"epoch": 6.915618448637317,
"grad_norm": 0.7034590840339661,
"learning_rate": 1.9048237722567586e-08,
"loss": 0.1072,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12093672901391983,
"step": 13190
},
{
"epoch": 6.918238993710692,
"grad_norm": 0.7753239870071411,
"learning_rate": 1.7924867589038663e-08,
"loss": 0.1061,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10527370125055313,
"step": 13195
},
{
"epoch": 6.920859538784067,
"grad_norm": 0.7496992349624634,
"learning_rate": 1.6835622352138203e-08,
"loss": 0.1114,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07733316719532013,
"step": 13200
},
{
"epoch": 6.923480083857442,
"grad_norm": 0.7619433999061584,
"learning_rate": 1.5780503872055986e-08,
"loss": 0.112,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09532599151134491,
"step": 13205
},
{
"epoch": 6.926100628930818,
"grad_norm": 0.7757188677787781,
"learning_rate": 1.475951395069286e-08,
"loss": 0.1092,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0898994505405426,
"step": 13210
},
{
"epoch": 6.928721174004193,
"grad_norm": 0.6868317723274231,
"learning_rate": 1.3772654331674073e-08,
"loss": 0.1064,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08451978862285614,
"step": 13215
},
{
"epoch": 6.931341719077568,
"grad_norm": 0.7334279417991638,
"learning_rate": 1.2819926700333718e-08,
"loss": 0.1152,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10934390872716904,
"step": 13220
},
{
"epoch": 6.933962264150943,
"grad_norm": 0.8165897727012634,
"learning_rate": 1.190133268371696e-08,
"loss": 0.1112,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12218495458364487,
"step": 13225
},
{
"epoch": 6.936582809224319,
"grad_norm": 0.8433283567428589,
"learning_rate": 1.1016873850573372e-08,
"loss": 0.1083,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10986328125,
"step": 13230
},
{
"epoch": 6.939203354297694,
"grad_norm": 0.7000241279602051,
"learning_rate": 1.0166551711363604e-08,
"loss": 0.1102,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0697021484375,
"step": 13235
},
{
"epoch": 6.9418238993710695,
"grad_norm": 0.8155228495597839,
"learning_rate": 9.350367718243825e-09,
"loss": 0.1145,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11756150424480438,
"step": 13240
},
{
"epoch": 6.944444444444445,
"grad_norm": 0.7745731472969055,
"learning_rate": 8.568323265074618e-09,
"loss": 0.0991,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11353258788585663,
"step": 13245
},
{
"epoch": 6.94706498951782,
"grad_norm": 0.9809739589691162,
"learning_rate": 7.820419687409874e-09,
"loss": 0.0959,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09842462837696075,
"step": 13250
},
{
"epoch": 6.949685534591195,
"grad_norm": 0.8062945604324341,
"learning_rate": 7.106658262505672e-09,
"loss": 0.1105,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09640898555517197,
"step": 13255
},
{
"epoch": 6.95230607966457,
"grad_norm": 0.7400102615356445,
"learning_rate": 6.427040209302515e-09,
"loss": 0.1079,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12121498584747314,
"step": 13260
},
{
"epoch": 6.954926624737945,
"grad_norm": 0.7922000885009766,
"learning_rate": 5.781566688436435e-09,
"loss": 0.1174,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12481498718261719,
"step": 13265
},
{
"epoch": 6.9575471698113205,
"grad_norm": 0.8348727822303772,
"learning_rate": 5.17023880223011e-09,
"loss": 0.1,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0753173828125,
"step": 13270
},
{
"epoch": 6.960167714884696,
"grad_norm": 0.6546790599822998,
"learning_rate": 4.593057594697304e-09,
"loss": 0.0947,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1255844235420227,
"step": 13275
},
{
"epoch": 6.962788259958071,
"grad_norm": 0.7211824059486389,
"learning_rate": 4.050024051531765e-09,
"loss": 0.1156,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10128942131996155,
"step": 13280
},
{
"epoch": 6.965408805031447,
"grad_norm": 0.7522569298744202,
"learning_rate": 3.541139100111668e-09,
"loss": 0.1063,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.081298828125,
"step": 13285
},
{
"epoch": 6.968029350104822,
"grad_norm": 0.7041332721710205,
"learning_rate": 3.066403609499613e-09,
"loss": 0.0966,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10874584317207336,
"step": 13290
},
{
"epoch": 6.970649895178197,
"grad_norm": 0.8444633483886719,
"learning_rate": 2.625818390438184e-09,
"loss": 0.1052,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1311793029308319,
"step": 13295
},
{
"epoch": 6.973270440251572,
"grad_norm": 0.7568145394325256,
"learning_rate": 2.219384195345509e-09,
"loss": 0.0964,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08481334149837494,
"step": 13300
},
{
"epoch": 6.975890985324948,
"grad_norm": 0.699336051940918,
"learning_rate": 1.8471017183241401e-09,
"loss": 0.1204,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.18024323880672455,
"step": 13305
},
{
"epoch": 6.978511530398323,
"grad_norm": 0.8261719346046448,
"learning_rate": 1.5089715951432937e-09,
"loss": 0.1143,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09984341263771057,
"step": 13310
},
{
"epoch": 6.981132075471698,
"grad_norm": 0.7103723883628845,
"learning_rate": 1.2049944032566096e-09,
"loss": 0.1313,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09648270159959793,
"step": 13315
},
{
"epoch": 6.983752620545073,
"grad_norm": 0.807367742061615,
"learning_rate": 9.351706617910516e-10,
"loss": 0.1036,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08563232421875,
"step": 13320
},
{
"epoch": 6.986373165618449,
"grad_norm": 0.8853840827941895,
"learning_rate": 6.995008315402451e-10,
"loss": 0.1018,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.069091796875,
"step": 13325
},
{
"epoch": 6.988993710691824,
"grad_norm": 0.7650611996650696,
"learning_rate": 4.979853149755797e-10,
"loss": 0.1116,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15118631720542908,
"step": 13330
},
{
"epoch": 6.9916142557651995,
"grad_norm": 0.7030468583106995,
"learning_rate": 3.3062445624398864e-10,
"loss": 0.1057,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0745849609375,
"step": 13335
},
{
"epoch": 6.994234800838575,
"grad_norm": 0.8280528783798218,
"learning_rate": 1.9741854115906679e-10,
"loss": 0.1018,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0935848206281662,
"step": 13340
},
{
"epoch": 6.99685534591195,
"grad_norm": 0.667475700378418,
"learning_rate": 9.836779720551193e-11,
"loss": 0.1284,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11847852170467377,
"step": 13345
},
{
"epoch": 6.999475890985325,
"grad_norm": 0.6849798560142517,
"learning_rate": 3.347239353912457e-11,
"loss": 0.099,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15633749961853027,
"step": 13350
},
{
"epoch": 6.999475890985325,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15633749961853027,
"step": 13350,
"total_flos": 1.29440113754112e+16,
"train_loss": 0.16645406688197276,
"train_runtime": 120834.4603,
"train_samples_per_second": 0.111,
"train_steps_per_second": 0.111
}
],
"logging_steps": 5,
"max_steps": 13356,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 1500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.29440113754112e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}